Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

word-extractor

Package Overview
Dependencies
Maintainers
1
Versions
13
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

word-extractor - npm Package Compare versions

Comparing version 1.0.1 to 1.0.2

4

CHANGELOG.md
# Change log
### 1.0.2 / 28th May 2021
* Added a new method for reading textbox content. See #35
### 1.0.1 / 24th May 2021

@@ -4,0 +8,0 @@

@@ -112,2 +112,26 @@ /**

}
/**
* Accessor to read the textboxes from a Word file. The text box content is aggregated as a
* single long string. When both the body and header content exists, they will be separated
* by a newline.
* @param {Object} options - options for body data
* @param {boolean} options.filterUnicode - if true (the default), converts common Unicode quotes
* to standard ASCII characters
* @param {boolean} options.includeHeadersAndFooters - if true (the default), includes text box
* content in headers and footers
* @param {boolean} options.includeBody - if true (the default), includes text box
* content in the document body
* @returns a string, containing the Word file text box content
*/
getTextboxes(options) {
options = options || {};
const segments = [];
if (options.includeBody != false)
segments.push(this._textboxes);
if (options.includeHeadersAndFooters != false)
segments.push(this._headerTextboxes);
const value = segments.join("\n");
return (options.filterUnicode == false) ? value : filter(value);
}
}

@@ -114,0 +138,0 @@

@@ -118,2 +118,11 @@

});
})
.then((document) => {
if (document._textboxes && document._textboxes.length > 0) {
document._textboxes = document._textboxes + "\n";
}
if (document._headerTextboxes && document._headerTextboxes.length > 0) {
document._headerTextboxes = document._headerTextboxes + "\n";
}
return document;
});

@@ -128,3 +137,2 @@

parser.on("opentag", (node) => {
if (node.name === 'Override') {

@@ -150,6 +158,8 @@ const actionFunction = this._streamTypes[node.attributes['ContentType']];

node.name === 'w:endnotes' ||
node.name === 'w:comments' ||
node.name === 'w:hdr' ||
node.name === 'w:comments') {
this._context = ['content', 'body'];
this._pieces = [];
} else if (node.name === 'w:hdr' ||
node.name === 'w:ftr') {
this._context = ['content'];
this._context = ['content', 'header'];
this._pieces = [];

@@ -175,2 +185,6 @@ } else if (node.name === 'w:endnote' || node.name === 'w:footnote') {

this._context.unshift('drawing');
} else if (node.name === 'w:txbxContent') {
this._context.unshift(this._pieces);
this._context.unshift('textbox');
this._pieces = [];
}

@@ -202,4 +216,5 @@

} else if (node === 'w:p') {
if (this._context[0] === 'content' || this._context[0] === 'cell')
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') {
this._pieces.push("\n");
}
} else if (node === 'w:del' || node === 'w:instrText') {

@@ -217,2 +232,25 @@ this._context.shift();

this._context.shift();
} else if (node === 'w:txbxContent') {
const textBox = this._pieces.join("");
const context = this._context.shift();
if (context !== 'textbox') {
throw new Error("Invalid textbox context");
}
this._pieces = this._context.shift();
// If in drawing context, discard
if (this._context[0] === 'drawing')
return;
if (textBox.length == 0)
return;
const inHeader = this._context.includes('header');
const documentField = (inHeader) ? '_headerTextboxes' : '_textboxes';
if (this._document[documentField]) {
this._document[documentField] = this._document[documentField] + "\n" + textBox;
} else {
this._document[documentField] = textBox;
}
}

@@ -224,3 +262,3 @@ });

return;
if (this._context[0] === 'content' || this._context[0] === 'cell') {
if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') {
this._pieces.push(string);

@@ -227,0 +265,0 @@ }

@@ -289,2 +289,12 @@ /**

if (this._boundaries.ccpTxbx) {
document._textboxes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpTxbx - 1));
start += this._boundaries.ccpTxbx;
}
if (this._boundaries.ccpHdrTxbx) {
document._headerTextboxes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpHdrTxbx - 1));
start += this._boundaries.ccpHdrTxbx;
}
return document;

@@ -318,2 +328,4 @@ }

this._boundaries.ccpEdn = buffer.readUInt32LE(0x0060);
this._boundaries.ccpTxbx = buffer.readUInt32LE(0x0064);
this._boundaries.ccpHdrTxbx = buffer.readUInt32LE(0x0068);

@@ -320,0 +332,0 @@ this.writeBookmarks(buffer, streamBuffer);

2

package.json
{
"name": "word-extractor",
"version": "1.0.1",
"version": "1.0.2",
"description": "Node.js package to read Word .doc files",

@@ -5,0 +5,0 @@ "main": "lib/word.js",

@@ -73,3 +73,3 @@ ## word-extractor

`Document#getHeaders()`
`Document#getHeaders(options?)`

@@ -98,2 +98,15 @@ Retrieves the header and footer text from a Word document. This will handle

`Document#getTextboxes(options?)`
Retrieves the textbox content from a Word document. This will handle
UNICODE characters correctly, so if there are accented or non-Latin-1
characters present in the document, they'll show as is in the returned string.
Note that by default, `getTextboxes()` returns one string, containing all
textbox content from both main document and the headers and footers. You
can control what gets included by using the options `includeHeadersAndFooters`
(which defaults to true) and `includeBody` (also defaults to true). So,
as an example, if you only want the body text box content, use:
`doc.getTextboxes({includeHeadersAndFooters: false})`.
### License

@@ -100,0 +113,0 @@

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc