word-extractor
Advanced tools
Comparing version 1.0.1 to 1.0.2
# Change log | ||
### 1.0.2 / 28th May 2021 | ||
* Added a new method for reading textbox content. See #35 | ||
### 1.0.1 / 24th May 2021 | ||
@@ -4,0 +8,0 @@ |
@@ -112,2 +112,26 @@ /** | ||
} | ||
/** | ||
* Accessor to read the textboxes from a Word file. The text box content is aggregated as a | ||
* single long string. When both the body and header content exists, they will be separated | ||
* by a newline. | ||
* @param {Object} options - options for body data | ||
* @param {boolean} options.filterUnicode - if true (the default), converts common Unicode quotes | ||
* to standard ASCII characters | ||
* @param {boolean} options.includeHeadersAndFooters - if true (the default), includes text box | ||
* content in headers and footers | ||
* @param {boolean} options.includeBody - if true (the default), includes text box | ||
* content in the document body | ||
* @returns a string, containing the Word file text box content | ||
*/ | ||
getTextboxes(options) { | ||
options = options || {}; | ||
const segments = []; | ||
if (options.includeBody != false) | ||
segments.push(this._textboxes); | ||
if (options.includeHeadersAndFooters != false) | ||
segments.push(this._headerTextboxes); | ||
const value = segments.join("\n"); | ||
return (options.filterUnicode == false) ? value : filter(value); | ||
} | ||
} | ||
@@ -114,0 +138,0 @@ |
@@ -118,2 +118,11 @@ | ||
}); | ||
}) | ||
.then((document) => { | ||
if (document._textboxes && document._textboxes.length > 0) { | ||
document._textboxes = document._textboxes + "\n"; | ||
} | ||
if (document._headerTextboxes && document._headerTextboxes.length > 0) { | ||
document._headerTextboxes = document._headerTextboxes + "\n"; | ||
} | ||
return document; | ||
}); | ||
@@ -128,3 +137,2 @@ | ||
parser.on("opentag", (node) => { | ||
if (node.name === 'Override') { | ||
@@ -150,6 +158,8 @@ const actionFunction = this._streamTypes[node.attributes['ContentType']]; | ||
node.name === 'w:endnotes' || | ||
node.name === 'w:comments' || | ||
node.name === 'w:hdr' || | ||
node.name === 'w:comments') { | ||
this._context = ['content', 'body']; | ||
this._pieces = []; | ||
} else if (node.name === 'w:hdr' || | ||
node.name === 'w:ftr') { | ||
this._context = ['content']; | ||
this._context = ['content', 'header']; | ||
this._pieces = []; | ||
@@ -175,2 +185,6 @@ } else if (node.name === 'w:endnote' || node.name === 'w:footnote') { | ||
this._context.unshift('drawing'); | ||
} else if (node.name === 'w:txbxContent') { | ||
this._context.unshift(this._pieces); | ||
this._context.unshift('textbox'); | ||
this._pieces = []; | ||
} | ||
@@ -202,4 +216,5 @@ | ||
} else if (node === 'w:p') { | ||
if (this._context[0] === 'content' || this._context[0] === 'cell') | ||
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') { | ||
this._pieces.push("\n"); | ||
} | ||
} else if (node === 'w:del' || node === 'w:instrText') { | ||
@@ -217,2 +232,25 @@ this._context.shift(); | ||
this._context.shift(); | ||
} else if (node === 'w:txbxContent') { | ||
const textBox = this._pieces.join(""); | ||
const context = this._context.shift(); | ||
if (context !== 'textbox') { | ||
throw new Error("Invalid textbox context"); | ||
} | ||
this._pieces = this._context.shift(); | ||
// If in drawing context, discard | ||
if (this._context[0] === 'drawing') | ||
return; | ||
if (textBox.length == 0) | ||
return; | ||
const inHeader = this._context.includes('header'); | ||
const documentField = (inHeader) ? '_headerTextboxes' : '_textboxes'; | ||
if (this._document[documentField]) { | ||
this._document[documentField] = this._document[documentField] + "\n" + textBox; | ||
} else { | ||
this._document[documentField] = textBox; | ||
} | ||
} | ||
@@ -224,3 +262,3 @@ }); | ||
return; | ||
if (this._context[0] === 'content' || this._context[0] === 'cell') { | ||
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') { | ||
this._pieces.push(string); | ||
@@ -227,0 +265,0 @@ } |
@@ -289,2 +289,12 @@ /** | ||
if (this._boundaries.ccpTxbx) { | ||
document._textboxes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpTxbx - 1)); | ||
start += this._boundaries.ccpTxbx; | ||
} | ||
if (this._boundaries.ccpHdrTxbx) { | ||
document._headerTextboxes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpHdrTxbx - 1)); | ||
start += this._boundaries.ccpHdrTxbx; | ||
} | ||
return document; | ||
@@ -318,2 +328,4 @@ } | ||
this._boundaries.ccpEdn = buffer.readUInt32LE(0x0060); | ||
this._boundaries.ccpTxbx = buffer.readUInt32LE(0x0064); | ||
this._boundaries.ccpHdrTxbx = buffer.readUInt32LE(0x0068); | ||
@@ -320,0 +332,0 @@ this.writeBookmarks(buffer, streamBuffer); |
{ | ||
"name": "word-extractor", | ||
"version": "1.0.1", | ||
"version": "1.0.2", | ||
"description": "Node.js package to read Word .doc files", | ||
@@ -5,0 +5,0 @@ "main": "lib/word.js", |
@@ -73,3 +73,3 @@ ## word-extractor | ||
`Document#getHeaders()` | ||
`Document#getHeaders(options?)` | ||
@@ -98,2 +98,15 @@ Retrieves the header and footer text from a Word document. This will handle | ||
`Document#getTextboxes(options?)` | ||
Retrieves the textbox content from a Word document. This will handle | ||
UNICODE characters correctly, so if there are accented or non-Latin-1 | ||
characters present in the document, they'll show as is in the returned string. | ||
Note that by default, `getTextboxes()` returns one string, containing all | ||
textbox content from both main document and the headers and footers. You | ||
can control what gets included by using the options `includeHeadersAndFooters` | ||
(which defaults to true) and `includeBody` (also defaults to true). So, | ||
as an example, if you only want the body text box content, use: | ||
`doc.getTextboxes({includeHeadersAndFooters: false})`. | ||
### License | ||
@@ -100,0 +113,0 @@ |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
72214
1665
115
18