Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

word-extractor

Package Overview
Dependencies
Maintainers
1
Versions
13
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

word-extractor - npm Package Compare versions

Comparing version 1.0.2 to 1.0.3

lib/open-office-parser.js

6

CHANGELOG.md
# Change log
### 1.0.3 / 17th June 2021
* Fixes issues with long attribute values (> 65k) in OO XML. See #37
* Propagate errors from XML failures into promise rejections. See #38
* Changed the XML parser dependency for maintenance and fixes. See #39
### 1.0.2 / 28th May 2021

@@ -4,0 +10,0 @@

284

lib/open-office-extractor.js

@@ -20,3 +20,3 @@

const path = require('path');
const SAX = require("sax");
const SAXES = require("saxes");
const yauzl = require('yauzl');

@@ -100,12 +100,10 @@

zipfile.readEntry();
zipfile.on("error", function(error) {
reject(error);
});
zipfile.on("error", reject);
zipfile.on("entry", (entry) => {
if ('[Content_Types].xml' === entry.fileName || this.shouldProcess(entry.fileName)) {
//console.log("entry", entry.fileName);
return this.handleEntry(zipfile, entry)
.then(() => {
zipfile.readEntry();
});
})
.catch((e) => reject(e));
}

@@ -115,6 +113,3 @@

});
zipfile.on("end", () => {
//console.log(this._document);
resolve(this._document);
});
zipfile.on("end", () => resolve(this._document));
});

@@ -134,121 +129,134 @@ })

handleOpenTag(node) {
if (node.name === 'Override') {
const actionFunction = this._streamTypes[node.attributes['ContentType']];
if (actionFunction) {
const partName = node.attributes['PartName'].replace(/^[/]+/, '');
const action = {action: actionFunction, type: node.attributes['ContentType']};
this._actions[partName] = action;
}
} else if (node.name === 'Default') {
const extension = node.attributes['Extension'];
const contentType = node.attributes['ContentType'];
this._defaults[extension] = contentType;
} else if (node.name === 'Relationship') {
// console.log(this._source, node);
this._relationships[node.attributes['Id']] = {
type: node.attributes['Type'],
target: node.attributes['Target'],
};
} else if (node.name === 'w:document' ||
node.name === 'w:footnotes' ||
node.name === 'w:endnotes' ||
node.name === 'w:comments') {
this._context = ['content', 'body'];
this._pieces = [];
} else if (node.name === 'w:hdr' ||
node.name === 'w:ftr') {
this._context = ['content', 'header'];
this._pieces = [];
} else if (node.name === 'w:endnote' || node.name === 'w:footnote') {
const type = (node.attributes['w:type'] || this._context[0]);
this._context.unshift(type);
} else if (node.name === 'w:tab' && this._context[0] === 'content') {
this._pieces.push("\t");
} else if (node.name === 'w:br' && this._context[0] === 'content') {
if ((node.attributes['w:type'] || '') === 'page') {
this._pieces.push("\n");
} else {
this._pieces.push("\n");
}
} else if (node.name === 'w:del' || node.name === 'w:instrText') {
this._context.unshift('deleted');
} else if (node.name === 'w:tabs') {
this._context.unshift('tabs');
} else if (node.name === 'w:tc') {
this._context.unshift('cell');
} else if (node.name === 'w:drawing') {
this._context.unshift('drawing');
} else if (node.name === 'w:txbxContent') {
this._context.unshift(this._pieces);
this._context.unshift('textbox');
this._pieces = [];
}
}
handleCloseTag(node) {
if (node.name === 'w:document') {
this._context = null;
this._document._body = this._pieces.join("");
} else if (node.name === 'w:footnote' || node.name === 'w:endnote') {
this._context.shift();
} else if (node.name === 'w:footnotes') {
this._context = null;
this._document._footnotes = this._pieces.join("");
} else if (node.name === 'w:endnotes') {
this._context = null;
this._document._endnotes = this._pieces.join("");
} else if (node.name === 'w:comments') {
this._context = null;
this._document._annotations = this._pieces.join("");
} else if (node.name === 'w:hdr') {
this._context = null;
this._document._headers = this._document._headers + this._pieces.join("");
} else if (node.name === 'w:ftr') {
this._context = null;
this._document._footers = this._document._footers + this._pieces.join("");
} else if (node.name === 'w:p') {
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') {
this._pieces.push("\n");
}
} else if (node.name === 'w:del' || node.name === 'w:instrText') {
this._context.shift();
} else if (node.name === 'w:tabs') {
this._context.shift();
} else if (node.name === 'w:tc') {
this._pieces.pop();
this._pieces.push("\t");
this._context.shift();
} else if (node.name === 'w:tr') {
this._pieces.push("\n");
} else if (node.name === 'w:drawing') {
this._context.shift();
} else if (node.name === 'w:txbxContent') {
const textBox = this._pieces.join("");
const context = this._context.shift();
if (context !== 'textbox') {
throw new Error("Invalid textbox context");
}
this._pieces = this._context.shift();
// If in drawing context, discard
if (this._context[0] === 'drawing')
return;
if (textBox.length == 0)
return;
const inHeader = this._context.includes('header');
const documentField = (inHeader) ? '_headerTextboxes' : '_textboxes';
if (this._document[documentField]) {
this._document[documentField] = this._document[documentField] + "\n" + textBox;
} else {
this._document[documentField] = textBox;
}
}
}
createXmlParser() {
const strict = true;
const parser = SAX.createStream(strict);
const parser = new SAXES.SaxesParser();
parser.on("opentag", (node) => {
if (node.name === 'Override') {
const actionFunction = this._streamTypes[node.attributes['ContentType']];
if (actionFunction) {
const partName = node.attributes['PartName'].replace(/^[/]+/, '');
const action = {action: actionFunction, type: node.attributes['ContentType']};
this._actions[partName] = action;
}
} else if (node.name === 'Default') {
const extension = node.attributes['Extension'];
const contentType = node.attributes['ContentType'];
this._defaults[extension] = contentType;
} else if (node.name === 'Relationship') {
// console.log(this._source, node);
this._relationships[node.attributes['Id']] = {
type: node.attributes['Type'],
target: node.attributes['Target'],
};
} else if (node.name === 'w:document' ||
node.name === 'w:footnotes' ||
node.name === 'w:endnotes' ||
node.name === 'w:comments') {
this._context = ['content', 'body'];
this._pieces = [];
} else if (node.name === 'w:hdr' ||
node.name === 'w:ftr') {
this._context = ['content', 'header'];
this._pieces = [];
} else if (node.name === 'w:endnote' || node.name === 'w:footnote') {
const type = (node.attributes['w:type'] || this._context[0]);
this._context.unshift(type);
} else if (node.name === 'w:tab' && this._context[0] === 'content') {
this._pieces.push("\t");
} else if (node.name === 'w:br' && this._context[0] === 'content') {
if ((node.attributes['w:type'] || '') === 'page') {
this._pieces.push("\n");
} else {
this._pieces.push("\n");
}
} else if (node.name === 'w:del' || node.name === 'w:instrText') {
this._context.unshift('deleted');
} else if (node.name === 'w:tabs') {
this._context.unshift('tabs');
} else if (node.name === 'w:tc') {
this._context.unshift('cell');
} else if (node.name === 'w:drawing') {
this._context.unshift('drawing');
} else if (node.name === 'w:txbxContent') {
this._context.unshift(this._pieces);
this._context.unshift('textbox');
this._pieces = [];
try {
this.handleOpenTag(node);
} catch (e) {
parser.fail(e.message);
}
});
parser.on('closetag', (node) => {
if (node === 'w:document') {
this._context = null;
this._document._body = this._pieces.join("");
} else if (node === 'w:footnote' || node === 'w:endnote') {
this._context.shift();
} else if (node === 'w:footnotes') {
this._context = null;
this._document._footnotes = this._pieces.join("");
} else if (node === 'w:endnotes') {
this._context = null;
this._document._endnotes = this._pieces.join("");
} else if (node === 'w:comments') {
this._context = null;
this._document._annotations = this._pieces.join("");
} else if (node === 'w:hdr') {
this._context = null;
this._document._headers = this._document._headers + this._pieces.join("");
} else if (node === 'w:ftr') {
this._context = null;
this._document._footers = this._document._footers + this._pieces.join("");
} else if (node === 'w:p') {
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') {
this._pieces.push("\n");
}
} else if (node === 'w:del' || node === 'w:instrText') {
this._context.shift();
} else if (node === 'w:tabs') {
this._context.shift();
} else if (node === 'w:tc') {
this._pieces.pop();
this._pieces.push("\t");
this._context.shift();
} else if (node === 'w:tr') {
this._pieces.push("\n");
} else if (node === 'w:drawing') {
this._context.shift();
} else if (node === 'w:txbxContent') {
const textBox = this._pieces.join("");
const context = this._context.shift();
if (context !== 'textbox') {
throw new Error("Invalid textbox context");
}
this._pieces = this._context.shift();
// If in drawing context, discard
if (this._context[0] === 'drawing')
return;
if (textBox.length == 0)
return;
const inHeader = this._context.includes('header');
const documentField = (inHeader) ? '_headerTextboxes' : '_textboxes';
if (this._document[documentField]) {
this._document[documentField] = this._document[documentField] + "\n" + textBox;
} else {
this._document[documentField] = textBox;
}
try {
this.handleCloseTag(node);
} catch (e) {
parser.fail(e.message);
}

@@ -258,6 +266,10 @@ });

parser.on('text', (string) => {
if (! this._context)
return;
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') {
this._pieces.push(string);
try {
if (! this._context)
return;
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') {
this._pieces.push(string);
}
} catch (e) {
parser.fail(e.message);
}

@@ -278,4 +290,20 @@ });

const parser = this.createXmlParser();
readStream.on("end", resolve);
readStream.pipe(parser);
parser.on("error", (e) => {
readStream.destroy(e);
reject(e);
});
parser.on("end", () => resolve());
readStream.on("end", () => parser.close());
readStream.on("error", (e) => reject(e));
readStream.on("readable", () => {
// eslint-disable-next-line no-constant-condition
while (true) {
const chunk = readStream.read(0x1000);
if (chunk === null) {
return;
}
parser.write(chunk);
}
});
});

@@ -282,0 +310,0 @@ });

{
"name": "word-extractor",
"version": "1.0.2",
"version": "1.0.3",
"description": "Node.js package to read Word .doc files",

@@ -32,3 +32,3 @@ "main": "lib/word.js",

"dependencies": {
"sax": "^1.2.4",
"saxes": "^5.0.1",
"yauzl": "^2.10.0"

@@ -35,0 +35,0 @@ },

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc