office-text-extractor
Advanced tools
Comparing version 3.0.1 to 3.0.2
{ | ||
"name": "office-text-extractor", | ||
"version": "3.0.1", | ||
"version": "3.0.2", | ||
"description": "Yet another library to extract text from MS Office and PDF files", | ||
@@ -43,5 +43,5 @@ "keywords": [ | ||
"dependencies": { | ||
"fflate": "0.8.0", | ||
"file-type": "18.2.1", | ||
"got": "12.6.0", | ||
"fflate": "0.8.1", | ||
"file-type": "18.5.0", | ||
"got": "13.0.0", | ||
"js-yaml": "4.1.0", | ||
@@ -51,17 +51,17 @@ "mammoth": "1.6.0", | ||
"text-encoding": "0.7.0", | ||
"xlsx": "0.18.5", | ||
"xml2js": "0.6.0" | ||
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.0/xlsx-0.20.0.tgz", | ||
"xml2js": "0.6.2" | ||
}, | ||
"devDependencies": { | ||
"@types/js-yaml": "4.0.5", | ||
"@types/node": "18.15.11", | ||
"@types/text-encoding": "0.0.36", | ||
"@types/xml2js": "0.4.11", | ||
"ava": "5.3.0", | ||
"np": "7.7.0", | ||
"@types/js-yaml": "4.0.6", | ||
"@types/node": "20.8.3", | ||
"@types/text-encoding": "0.0.37", | ||
"@types/xml2js": "0.4.12", | ||
"ava": "5.3.1", | ||
"np": "8.0.4", | ||
"npm-run-all": "4.1.5", | ||
"prettier": "2.8.7", | ||
"tsx": "3.12.7", | ||
"typescript": "5.0.3", | ||
"xo": "0.53.1" | ||
"prettier": "3.0.3", | ||
"tsx": "3.13.0", | ||
"typescript": "5.2.2", | ||
"xo": "0.56.0" | ||
}, | ||
@@ -68,0 +68,0 @@ "prettier": { |
128
readme.md
@@ -1,15 +0,12 @@ | ||
# <div align="center"> `office-text-extractor` </div> | ||
# <div align="center"> office-text-extractor </div> | ||
<div align="center"> | ||
<img alt="Github Workflow Status" src="https://img.shields.io/github/actions/workflow/status/gamemaker1/office-text-extractor/ci.yaml"/> | ||
<img alt="GitHub Stars" src="https://img.shields.io/github/stars/gamemaker1/office-text-extractor"/> | ||
yet another library to extract text from docx, pptx, xlsx, and pdf files. | ||
</div> | ||
<br> | ||
> Yet another library to extract text from MS Office (`docx`, `pptx`, `xlsx`) | ||
> and PDF (`pdf`) files. | ||
## similar libraries | ||
## Similar projects | ||
There are other great projects that do the same job and have inspired this | ||
there are other great libraries that do the same job and have inspired this | ||
project, such as: | ||
@@ -21,54 +18,38 @@ | ||
### How is this project different? | ||
however, office-text-extractor has the following differences: | ||
- Parses file based on its mime type, not its file extension. | ||
- Does not spawn a child process to use a tool installed on the device. | ||
- Reads and returns text from the file if it contains plain text. | ||
- parses file based on its **mime type**, not its file extension. | ||
- **does not spawn** a child process to use a tool installed on the device. | ||
- reads and returns text from the file if it contains **plain text**. | ||
## Libraries used | ||
## libraries used | ||
This module uses some amazing existing libraries that perform better than the | ||
this package uses some amazing existing libraries that perform better than the | ||
ones that originally existed in this module, and are therefore used instead: | ||
- [`pdf-parse`](https://www.npmjs.com/package/pdf-parse), for parsing PDF files | ||
- [`xlsx`](https://www.npmjs.com/package/xlsx), for parsing MS Excel files | ||
- [`mammoth`](https://www.npmjs.com/package/mammoth), for parsing MS Word files | ||
- [`pdf-parse`](https://www.npmjs.com/package/pdf-parse), for parsing pdf files | ||
- [`xlsx`](https://www.npmjs.com/package/xlsx), for parsing xlsx files | ||
- [`mammoth`](https://www.npmjs.com/package/mammoth), for parsing docx files | ||
This module also uses: | ||
a big thank you to the contributors of these projects! | ||
- [`xml2js`](https://www.npmjs.com/package/xml2js) - to convert the MS Office | ||
XML files into JSON | ||
- [`js-yaml`](https://www.npmjs.com/package/js-yaml) - to convert JSON into YAML | ||
- [`file-type`](https://www.npmjs.com/package/file-type) - to detect the mime | ||
type of files | ||
- [`fflate`](https://www.npmjs.com/package/fflate) - to unzip files | ||
## installation | ||
A big thank you to the contributors of these projects! | ||
#### node | ||
## Installation | ||
#### NodeJs | ||
> **Note** | ||
> | ||
> This package is now pure ESM (from version 2.0.0 onwards). Please read | ||
> from version 2.0.0 onwards, this package is pure esm. please read | ||
> [this article](https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c) | ||
> for a guide on how to ensure your project can import this library. | ||
To use this in a Node project, install it using `npm`/`pnpm`/`yarn`: | ||
to use office-text-extractor in a Node project, install it using `npm`/`pnpm`/`yarn`: | ||
```sh | ||
# Using npm | ||
> npm install office-text-extractor | ||
# Using pnpm | ||
> pnpm add office-text-extractor | ||
# Using yarn | ||
> yarn add office-text-extractor | ||
``` | ||
#### Browser | ||
#### browser | ||
To use this package in the browser, fetch it using your preferred CDN: | ||
to use this package in the browser, fetch it using your preferred cdn: | ||
@@ -79,24 +60,65 @@ ```tsx | ||
## Usage | ||
## usage | ||
an example of using the library to extract text is as follows: | ||
```ts | ||
import { readFile } from 'node:fs/promises' | ||
import { getTextExtractor } from 'office-text-extractor' | ||
// Create a new instance of the extractor. | ||
// this function returns a new instance of the `TextExtractor` class, with the default | ||
// extraction methods (docx, pptx, xlsx, pdf) registered. | ||
const extractor = getTextExtractor() | ||
// Extract text from a URL, file or buffer. | ||
const location = | ||
'https://raw.githubusercontent.com/gamemaker1/office-text-extractor/rewrite/test/fixtures/docs/pptx.pptx' | ||
const text = await extractor.extractText({ | ||
input: location, // this can be a file path or a buffer | ||
type: 'url', // this can be 'url', 'file' or 'buffer' | ||
}) | ||
// extract text from a url, because that's a neat first example :p | ||
const url = 'https://raw.githubusercontent.com/gamemaker1/office-text-extractor/rewrite/test/fixtures/docs/pptx.pptx' | ||
const text = await extractor.extractText({ input: url, type: 'url' }) | ||
// you can extract text from a file too, like so: | ||
const path = 'stuff/boring.pdf' | ||
const text = await extractor.extractText({ input: path, type: 'file' }) | ||
// if you have a buffer with the file in it, you can pass that too: | ||
const buffer = await readFile(path) | ||
const text = await extractor.extractText({ input: buffer, type: 'buffer' }) | ||
console.log(text) | ||
``` | ||
## License | ||
the following is an example of how to create and use your own text extraction method: | ||
This project is licensed under the ISC license. Please see | ||
[`license.md`](./license.md) for more details. | ||
```ts | ||
import { type Buffer } from 'node:buffer' | ||
import { TextExtractor, type TextExtractionMethod } from 'office-text-extractor' | ||
/** | ||
* Extracts text from images. | ||
*/ | ||
class ImageExtractor implements TextExtractionMethod { | ||
/** | ||
* The mime types of the file that the extractor accepts. | ||
*/ | ||
mimes = ['image/png', 'image/jpeg'] | ||
/** | ||
* Extracts text from the image file passed by the user. | ||
*/ | ||
apply = async (input: Buffer): Promise<string> => { | ||
const text = await processImage(input) | ||
return text | ||
} | ||
} | ||
// create a new extractor and register our extraction method | ||
const extractor = new TextExtractor() | ||
extractor.addMethod(new ImageExtractor()) | ||
// then use it like you would normally | ||
const text = await extractor.extractText({ input: '...', type: '...' }) | ||
console.log(text) | ||
``` | ||
## license | ||
this project is licensed under the ISC license. please see [`license.md`](./license.md) | ||
for more details. |
HTTP dependency
Supply chain risk: Contains a dependency which resolves to a remote HTTP URL, which could be used to inject untrusted code and reduce overall package reliability.
Found 1 instance in 1 package
34155
123
1
+ Added fflate@0.8.1 (transitive)
+ Added file-type@18.5.0 (transitive)
+ Added got@13.0.0 (transitive)
+ Added xml2js@0.6.2 (transitive)
- Removed adler-32@1.3.1 (transitive)
- Removed cfb@1.2.2 (transitive)
- Removed codepage@1.15.0 (transitive)
- Removed crc-32@1.2.2 (transitive)
- Removed fflate@0.8.0 (transitive)
- Removed file-type@18.2.1 (transitive)
- Removed frac@1.1.2 (transitive)
- Removed got@12.6.0 (transitive)
- Removed ssf@0.11.2 (transitive)
- Removed wmf@1.0.2 (transitive)
- Removed word@0.3.0 (transitive)
- Removed xlsx@0.18.5 (transitive)
- Removed xml2js@0.6.0 (transitive)
Updated fflate@0.8.1
Updated file-type@18.5.0
Updated got@13.0.0
Updated xlsx@https://cdn.sheetjs.com/xlsx-0.20.0/xlsx-0.20.0.tgz
Updated xml2js@0.6.2