word-extractor
Advanced tools
Comparing version 1.0.2 to 1.0.3
# Change log | ||
### 1.0.3 / 17th June 2021 | ||
* Fixes issues with long attribute values (> 65k) in OO XML. See #37 | ||
* Propagate errors from XML failures into promise rejections. See #38 | ||
* Changed the XML parser dependency for maintenance and fixes. See #39 | ||
### 1.0.2 / 28th May 2021 | ||
@@ -4,0 +10,0 @@ |
@@ -20,3 +20,3 @@ | ||
const path = require('path'); | ||
const SAX = require("sax"); | ||
const SAXES = require("saxes"); | ||
const yauzl = require('yauzl'); | ||
@@ -100,12 +100,10 @@ | ||
zipfile.readEntry(); | ||
zipfile.on("error", function(error) { | ||
reject(error); | ||
}); | ||
zipfile.on("error", reject); | ||
zipfile.on("entry", (entry) => { | ||
if ('[Content_Types].xml' === entry.fileName || this.shouldProcess(entry.fileName)) { | ||
//console.log("entry", entry.fileName); | ||
return this.handleEntry(zipfile, entry) | ||
.then(() => { | ||
zipfile.readEntry(); | ||
}); | ||
}) | ||
.catch((e) => reject(e)); | ||
} | ||
@@ -115,6 +113,3 @@ | ||
}); | ||
zipfile.on("end", () => { | ||
//console.log(this._document); | ||
resolve(this._document); | ||
}); | ||
zipfile.on("end", () => resolve(this._document)); | ||
}); | ||
@@ -134,121 +129,134 @@ }) | ||
handleOpenTag(node) { | ||
if (node.name === 'Override') { | ||
const actionFunction = this._streamTypes[node.attributes['ContentType']]; | ||
if (actionFunction) { | ||
const partName = node.attributes['PartName'].replace(/^[/]+/, ''); | ||
const action = {action: actionFunction, type: node.attributes['ContentType']}; | ||
this._actions[partName] = action; | ||
} | ||
} else if (node.name === 'Default') { | ||
const extension = node.attributes['Extension']; | ||
const contentType = node.attributes['ContentType']; | ||
this._defaults[extension] = contentType; | ||
} else if (node.name === 'Relationship') { | ||
// console.log(this._source, node); | ||
this._relationships[node.attributes['Id']] = { | ||
type: node.attributes['Type'], | ||
target: node.attributes['Target'], | ||
}; | ||
} else if (node.name === 'w:document' || | ||
node.name === 'w:footnotes' || | ||
node.name === 'w:endnotes' || | ||
node.name === 'w:comments') { | ||
this._context = ['content', 'body']; | ||
this._pieces = []; | ||
} else if (node.name === 'w:hdr' || | ||
node.name === 'w:ftr') { | ||
this._context = ['content', 'header']; | ||
this._pieces = []; | ||
} else if (node.name === 'w:endnote' || node.name === 'w:footnote') { | ||
const type = (node.attributes['w:type'] || this._context[0]); | ||
this._context.unshift(type); | ||
} else if (node.name === 'w:tab' && this._context[0] === 'content') { | ||
this._pieces.push("\t"); | ||
} else if (node.name === 'w:br' && this._context[0] === 'content') { | ||
if ((node.attributes['w:type'] || '') === 'page') { | ||
this._pieces.push("\n"); | ||
} else { | ||
this._pieces.push("\n"); | ||
} | ||
} else if (node.name === 'w:del' || node.name === 'w:instrText') { | ||
this._context.unshift('deleted'); | ||
} else if (node.name === 'w:tabs') { | ||
this._context.unshift('tabs'); | ||
} else if (node.name === 'w:tc') { | ||
this._context.unshift('cell'); | ||
} else if (node.name === 'w:drawing') { | ||
this._context.unshift('drawing'); | ||
} else if (node.name === 'w:txbxContent') { | ||
this._context.unshift(this._pieces); | ||
this._context.unshift('textbox'); | ||
this._pieces = []; | ||
} | ||
} | ||
handleCloseTag(node) { | ||
if (node.name === 'w:document') { | ||
this._context = null; | ||
this._document._body = this._pieces.join(""); | ||
} else if (node.name === 'w:footnote' || node.name === 'w:endnote') { | ||
this._context.shift(); | ||
} else if (node.name === 'w:footnotes') { | ||
this._context = null; | ||
this._document._footnotes = this._pieces.join(""); | ||
} else if (node.name === 'w:endnotes') { | ||
this._context = null; | ||
this._document._endnotes = this._pieces.join(""); | ||
} else if (node.name === 'w:comments') { | ||
this._context = null; | ||
this._document._annotations = this._pieces.join(""); | ||
} else if (node.name === 'w:hdr') { | ||
this._context = null; | ||
this._document._headers = this._document._headers + this._pieces.join(""); | ||
} else if (node.name === 'w:ftr') { | ||
this._context = null; | ||
this._document._footers = this._document._footers + this._pieces.join(""); | ||
} else if (node.name === 'w:p') { | ||
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') { | ||
this._pieces.push("\n"); | ||
} | ||
} else if (node.name === 'w:del' || node.name === 'w:instrText') { | ||
this._context.shift(); | ||
} else if (node.name === 'w:tabs') { | ||
this._context.shift(); | ||
} else if (node.name === 'w:tc') { | ||
this._pieces.pop(); | ||
this._pieces.push("\t"); | ||
this._context.shift(); | ||
} else if (node.name === 'w:tr') { | ||
this._pieces.push("\n"); | ||
} else if (node.name === 'w:drawing') { | ||
this._context.shift(); | ||
} else if (node.name === 'w:txbxContent') { | ||
const textBox = this._pieces.join(""); | ||
const context = this._context.shift(); | ||
if (context !== 'textbox') { | ||
throw new Error("Invalid textbox context"); | ||
} | ||
this._pieces = this._context.shift(); | ||
// If in drawing context, discard | ||
if (this._context[0] === 'drawing') | ||
return; | ||
if (textBox.length == 0) | ||
return; | ||
const inHeader = this._context.includes('header'); | ||
const documentField = (inHeader) ? '_headerTextboxes' : '_textboxes'; | ||
if (this._document[documentField]) { | ||
this._document[documentField] = this._document[documentField] + "\n" + textBox; | ||
} else { | ||
this._document[documentField] = textBox; | ||
} | ||
} | ||
} | ||
createXmlParser() { | ||
const strict = true; | ||
const parser = SAX.createStream(strict); | ||
const parser = new SAXES.SaxesParser(); | ||
parser.on("opentag", (node) => { | ||
if (node.name === 'Override') { | ||
const actionFunction = this._streamTypes[node.attributes['ContentType']]; | ||
if (actionFunction) { | ||
const partName = node.attributes['PartName'].replace(/^[/]+/, ''); | ||
const action = {action: actionFunction, type: node.attributes['ContentType']}; | ||
this._actions[partName] = action; | ||
} | ||
} else if (node.name === 'Default') { | ||
const extension = node.attributes['Extension']; | ||
const contentType = node.attributes['ContentType']; | ||
this._defaults[extension] = contentType; | ||
} else if (node.name === 'Relationship') { | ||
// console.log(this._source, node); | ||
this._relationships[node.attributes['Id']] = { | ||
type: node.attributes['Type'], | ||
target: node.attributes['Target'], | ||
}; | ||
} else if (node.name === 'w:document' || | ||
node.name === 'w:footnotes' || | ||
node.name === 'w:endnotes' || | ||
node.name === 'w:comments') { | ||
this._context = ['content', 'body']; | ||
this._pieces = []; | ||
} else if (node.name === 'w:hdr' || | ||
node.name === 'w:ftr') { | ||
this._context = ['content', 'header']; | ||
this._pieces = []; | ||
} else if (node.name === 'w:endnote' || node.name === 'w:footnote') { | ||
const type = (node.attributes['w:type'] || this._context[0]); | ||
this._context.unshift(type); | ||
} else if (node.name === 'w:tab' && this._context[0] === 'content') { | ||
this._pieces.push("\t"); | ||
} else if (node.name === 'w:br' && this._context[0] === 'content') { | ||
if ((node.attributes['w:type'] || '') === 'page') { | ||
this._pieces.push("\n"); | ||
} else { | ||
this._pieces.push("\n"); | ||
} | ||
} else if (node.name === 'w:del' || node.name === 'w:instrText') { | ||
this._context.unshift('deleted'); | ||
} else if (node.name === 'w:tabs') { | ||
this._context.unshift('tabs'); | ||
} else if (node.name === 'w:tc') { | ||
this._context.unshift('cell'); | ||
} else if (node.name === 'w:drawing') { | ||
this._context.unshift('drawing'); | ||
} else if (node.name === 'w:txbxContent') { | ||
this._context.unshift(this._pieces); | ||
this._context.unshift('textbox'); | ||
this._pieces = []; | ||
try { | ||
this.handleOpenTag(node); | ||
} catch (e) { | ||
parser.fail(e.message); | ||
} | ||
}); | ||
parser.on('closetag', (node) => { | ||
if (node === 'w:document') { | ||
this._context = null; | ||
this._document._body = this._pieces.join(""); | ||
} else if (node === 'w:footnote' || node === 'w:endnote') { | ||
this._context.shift(); | ||
} else if (node === 'w:footnotes') { | ||
this._context = null; | ||
this._document._footnotes = this._pieces.join(""); | ||
} else if (node === 'w:endnotes') { | ||
this._context = null; | ||
this._document._endnotes = this._pieces.join(""); | ||
} else if (node === 'w:comments') { | ||
this._context = null; | ||
this._document._annotations = this._pieces.join(""); | ||
} else if (node === 'w:hdr') { | ||
this._context = null; | ||
this._document._headers = this._document._headers + this._pieces.join(""); | ||
} else if (node === 'w:ftr') { | ||
this._context = null; | ||
this._document._footers = this._document._footers + this._pieces.join(""); | ||
} else if (node === 'w:p') { | ||
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') { | ||
this._pieces.push("\n"); | ||
} | ||
} else if (node === 'w:del' || node === 'w:instrText') { | ||
this._context.shift(); | ||
} else if (node === 'w:tabs') { | ||
this._context.shift(); | ||
} else if (node === 'w:tc') { | ||
this._pieces.pop(); | ||
this._pieces.push("\t"); | ||
this._context.shift(); | ||
} else if (node === 'w:tr') { | ||
this._pieces.push("\n"); | ||
} else if (node === 'w:drawing') { | ||
this._context.shift(); | ||
} else if (node === 'w:txbxContent') { | ||
const textBox = this._pieces.join(""); | ||
const context = this._context.shift(); | ||
if (context !== 'textbox') { | ||
throw new Error("Invalid textbox context"); | ||
} | ||
this._pieces = this._context.shift(); | ||
// If in drawing context, discard | ||
if (this._context[0] === 'drawing') | ||
return; | ||
if (textBox.length == 0) | ||
return; | ||
const inHeader = this._context.includes('header'); | ||
const documentField = (inHeader) ? '_headerTextboxes' : '_textboxes'; | ||
if (this._document[documentField]) { | ||
this._document[documentField] = this._document[documentField] + "\n" + textBox; | ||
} else { | ||
this._document[documentField] = textBox; | ||
} | ||
try { | ||
this.handleCloseTag(node); | ||
} catch (e) { | ||
parser.fail(e.message); | ||
} | ||
@@ -258,6 +266,10 @@ }); | ||
parser.on('text', (string) => { | ||
if (! this._context) | ||
return; | ||
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') { | ||
this._pieces.push(string); | ||
try { | ||
if (! this._context) | ||
return; | ||
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') { | ||
this._pieces.push(string); | ||
} | ||
} catch (e) { | ||
parser.fail(e.message); | ||
} | ||
@@ -278,4 +290,20 @@ }); | ||
const parser = this.createXmlParser(); | ||
readStream.on("end", resolve); | ||
readStream.pipe(parser); | ||
parser.on("error", (e) => { | ||
readStream.destroy(e); | ||
reject(e); | ||
}); | ||
parser.on("end", () => resolve()); | ||
readStream.on("end", () => parser.close()); | ||
readStream.on("error", (e) => reject(e)); | ||
readStream.on("readable", () => { | ||
// eslint-disable-next-line no-constant-condition | ||
while (true) { | ||
const chunk = readStream.read(0x1000); | ||
if (chunk === null) { | ||
return; | ||
} | ||
parser.write(chunk); | ||
} | ||
}); | ||
}); | ||
@@ -282,0 +310,0 @@ }); |
{ | ||
"name": "word-extractor", | ||
"version": "1.0.2", | ||
"version": "1.0.3", | ||
"description": "Node.js package to read Word .doc files", | ||
@@ -32,3 +32,3 @@ "main": "lib/word.js", | ||
"dependencies": { | ||
"sax": "^1.2.4", | ||
"saxes": "^5.0.1", | ||
"yauzl": "^2.10.0" | ||
@@ -35,0 +35,0 @@ }, |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
72996
19
1692
+ Added saxes@^5.0.1
+ Added saxes@5.0.1 (transitive)
+ Added xmlchars@2.2.0 (transitive)
- Removed sax@^1.2.4
- Removed sax@1.4.1 (transitive)