You're Invited: Meet the Socket Team at BlackHat and DEF CON in Las Vegas, Aug 4-6. RSVP
Socket
Book a DemoInstallSign in
Socket

officeparser

Package Overview
Dependencies
Maintainers
1
Versions
45
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

officeparser - npm Package Compare versions

Comparing version

to
5.1.1

22

officeParser.js

@@ -482,8 +482,8 @@ #!/usr/bin/env node

/** Main async function with callback to execute parseOffice for supported files
* @param {string | Buffer} file File path or file buffers
* @param {function} callback Callback function that returns value or error
* @param {OfficeParserConfig} [config={}] [OPTIONAL]: Config Object for officeParser
* @param {string | Buffer | ArrayBuffer} srcFile File path or file buffers or Javascript ArrayBuffer
* @param {function} callback Callback function that returns value or error
* @param {OfficeParserConfig} [config={}] [OPTIONAL]: Config Object for officeParser
* @returns {void}
*/
function parseOffice(file, callback, config = {}) {
function parseOffice(srcFile, callback, config = {}) {
// Make a clone of the config with default values such that none of the config flags are undefined.

@@ -498,2 +498,8 @@ /** @type {OfficeParserConfig} */

};
// Our internal code can process regular node Buffers or file path.
// So, if the src file was presented as ArrayBuffers, we create Buffers from them.
let file = srcFile instanceof ArrayBuffer ? Buffer.from(srcFile)
: srcFile;
/**

@@ -564,9 +570,9 @@ * Prepare file for processing

/** Main async function that can be used with await to execute parseOffice. Or it can be used with promises.
* @param {string | Buffer} file File path or file buffers
* @param {OfficeParserConfig} [config={}] [OPTIONAL]: Config Object for officeParser
* @param {string | Buffer | ArrayBuffer} srcFile File path or file buffers or Javascript ArrayBuffer
* @param {OfficeParserConfig} [config={}] [OPTIONAL]: Config Object for officeParser
* @returns {Promise<string>}
*/
function parseOfficeAsync(file, config = {}) {
function parseOfficeAsync(srcFile, config = {}) {
return new Promise((res, rej) => {
parseOffice(file, function (data, err) {
parseOffice(srcFile, function (data, err) {
if (err)

@@ -573,0 +579,0 @@ return rej(err);

{
"name": "officeparser",
"version": "5.0.0",
"version": "5.1.1",
"description": "A Node.js library to parse text out of any office file. Currently supports docx, pptx, xlsx, odt, odp, ods, pdf files.",

@@ -5,0 +5,0 @@ "main": "officeParser.js",

@@ -16,2 +16,3 @@ # officeParser

#### Update
* 2024/11/12 - Added ArrayBuffer as a supported type of file input. Bundle files are now generated, exposing the `officeParser` namespace so that `parseOffice` and `parseOfficeAsync` can be accessed directly in the browser. Extracting text from pdf files does not currently work in browser bundles.
* 2024/10/21 - Switched zip file extraction from decompress to yauzl. This means that files are now extracted in memory and no longer need to be written to disk. Removed config flags related to extracted files. Added flags for CLI execution.

@@ -189,6 +190,47 @@ * 2024/10/15 - Fixed erroring out while deleting temp files when multiple worker threads make parallel executions resulting in same file name for multiple files. Fixed erroring out when multiple executions are made without waiting for the previous execution to finish which resulted in deleting the file from other execution. Upgraded dependencies.

## Browser Usage
Download the bundle file available as part of the release assets.
Include this bundle file in your HTML page and access `parseOffice` and `parseOfficeAsync` under the **`officeParser`** namespace.
**Example**
```html
<head>
...
<!-- Include bundle file in the script tag. -->
<script src="officeParserBundle@5.1.0.js"></script>
</head>
<body>
...
<input type="file" id="fileInput" />
...
<script>
document.getElementById('fileInput').addEventListener('change', async function(event) {
const outputDiv = document.getElementById('output');
const file = event.target.files[0];
try {
// Your configuration options for officeParser
const config = {
outputErrorToConsole: false,
newlineDelimiter: '\n',
ignoreNotes: false,
putNotesAtLast: false
};
const arrayBuffer = await file.arrayBuffer();
const result = await officeParser.parseOfficeAsync(arrayBuffer, config);
// result contains the extracted text.
}
catch (error) {
// Handle error
}
});
</script>
</body>
```
## Known Bugs
1. The positioning of footnotes and endnotes is inconsistent and incorrect: in .docx files they end up at the end of the parsed text, whereas in .odt files they are positioned exactly after the referenced word.
2. The chart and object information of .odt files is not accurate and may end up showing a few NaN values in some cases.
3. Extracting text in browser bundles does not work for pdf files.
----------

@@ -195,0 +237,0 @@