@@ -112,2 +112,26 @@ /**
		}

		/**
		* Accessor to read the textboxes from a Word file. The text box content is aggregated as a
		* single long string. When both the body and header content exists, they will be separated
		* by a newline.
		* @param {Object} options - options for body data
		* @param {boolean} options.filterUnicode - if true (the default), converts common Unicode quotes
		* to standard ASCII characters
		* @param {boolean} options.includeHeadersAndFooters - if true (the default), includes text box
		* content in headers and footers
		* @param {boolean} options.includeBody - if true (the default), includes text box
		* content in the document body
		* @returns a string, containing the Word file text box content
		*/
		getTextboxes(options) {
		options = options \|\| {};
		const segments = [];
		if (options.includeBody != false)
		segments.push(this._textboxes);
		if (options.includeHeadersAndFooters != false)
		segments.push(this._headerTextboxes);
		const value = segments.join("\n");
		return (options.filterUnicode == false) ? value : filter(value);
		}
		}
		@@ -114,0 +138,0 @@

lib/open-office-extractor.js

		@@ -118,2 +118,11 @@
		});
		})
		.then((document) => {
		// Terminate the body text box content with a newline, but only when there
		// is some content; a missing or empty value is left untouched.
		if (document._textboxes && document._textboxes.length > 0) {
		document._textboxes = document._textboxes + "\n";
		}
		// Same treatment for the text box content collected from headers and footers.
		if (document._headerTextboxes && document._headerTextboxes.length > 0) {
		document._headerTextboxes = document._headerTextboxes + "\n";
		}
		// Pass the document on to the next step of the promise chain.
		return document;
		});
		@@ -128,3 +137,2 @@
		parser.on("opentag", (node) => {

		if (node.name === 'Override') {
		@@ -150,6 +158,8 @@ const actionFunction = this._streamTypes[node.attributes['ContentType']];
		node.name === 'w:endnotes' \|\|
		node.name === 'w:comments' \|\|
		node.name === 'w:hdr' \|\|
		node.name === 'w:comments') {
		this._context = ['content', 'body'];
		this._pieces = [];
		} else if (node.name === 'w:hdr' \|\|
		node.name === 'w:ftr') {
		this._context = ['content'];
		this._context = ['content', 'header'];
		this._pieces = [];
		@@ -175,2 +185,6 @@ } else if (node.name === 'w:endnote' \|\| node.name === 'w:footnote') {
		this._context.unshift('drawing');
		} else if (node.name === 'w:txbxContent') {
		// Entering text box content: park the pieces collected so far on the
		// context stack, put a 'textbox' marker on top of them, and start a fresh
		// pieces array so the text box text is gathered separately.
		this._context.unshift(this._pieces);
		this._context.unshift('textbox');
		this._pieces = [];
		}
		@@ -202,4 +216,5 @@
		} else if (node === 'w:p') {
		if (this._context[0] === 'content' \|\| this._context[0] === 'cell')
		if (this._context[0] === 'content' \|\| this._context[0] === 'cell' \|\| this._context[0] === 'textbox') {
		this._pieces.push("\n");
		}
		} else if (node === 'w:del' \|\| node === 'w:instrText') {
		@@ -217,2 +232,25 @@ this._context.shift();
		this._context.shift();
		} else if (node === 'w:txbxContent') {
		// Leaving text box content: everything collected in _pieces since the
		// opening tag is the text of this text box.
		const textBox = this._pieces.join("");
		// The opening-tag handling put a 'textbox' marker on top of the context
		// stack; finding anything else here means the stack is out of sync.
		const context = this._context.shift();
		if (context !== 'textbox') {
		throw new Error("Invalid textbox context");
		}
		// Restore the pieces array that was parked beneath the 'textbox' marker.
		this._pieces = this._context.shift();

		// If in drawing context, discard
		if (this._context[0] === 'drawing')
		return;

		// An empty text box contributes nothing.
		if (textBox.length == 0)
		return;

		// Text boxes closed while a 'header' entry is on the context stack (set for
		// both w:hdr and w:ftr) are stored separately from the other text boxes.
		const inHeader = this._context.includes('header');
		const documentField = (inHeader) ? '_headerTextboxes' : '_textboxes';
		// Append to existing content with a newline separator, or start the field.
		if (this._document[documentField]) {
		this._document[documentField] = this._document[documentField] + "\n" + textBox;
		} else {
		this._document[documentField] = textBox;
		}

		}
		@@ -224,3 +262,3 @@ });
		return;
		if (this._context[0] === 'content' \|\| this._context[0] === 'cell') {
		if (this._context[0] === 'content' \|\| this._context[0] === 'cell' \|\| this._context[0] === 'textbox') {
		this._pieces.push(string);
		@@ -227,0 +265,0 @@ }

lib/word-ole-extractor.js

		@@ -289,2 +289,12 @@ /**

		// Body text boxes: when ccpTxbx is non-zero it is used as the length of the
		// range beginning at `start` (presumably a character count -- confirm
		// against the file format). The range text goes through clean() before
		// being stored on the document.
		if (this._boundaries.ccpTxbx) {
		document._textboxes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpTxbx - 1));
		// Advance so the next range starts right after this one.
		start += this._boundaries.ccpTxbx;
		}

		// Header text boxes: same pattern, with ccpHdrTxbx as the range length.
		if (this._boundaries.ccpHdrTxbx) {
		document._headerTextboxes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpHdrTxbx - 1));
		start += this._boundaries.ccpHdrTxbx;
		}

		return document;
		@@ -318,2 +328,4 @@ }
		this._boundaries.ccpEdn = buffer.readUInt32LE(0x0060);
		// Values later used as the lengths of the body and header text box ranges;
		// each is read as an unsigned 32-bit little-endian integer from a fixed
		// offset in the buffer.
		this._boundaries.ccpTxbx = buffer.readUInt32LE(0x0064);
		this._boundaries.ccpHdrTxbx = buffer.readUInt32LE(0x0068);

		@@ -320,0 +332,0 @@ this.writeBookmarks(buffer, streamBuffer);

package.json

		{
		"name": "word-extractor",
		"version": "1.0.1",
		"version": "1.0.2",
		"description": "Node.js package to read Word .doc files",
		@@ -5,0 +5,0 @@ "main": "lib/word.js",

README.md

		@@ -73,3 +73,3 @@ ## word-extractor

		`Document#getHeaders()`
		`Document#getHeaders(options?)`

		@@ -98,2 +98,15 @@ Retrieves the header and footer text from a Word document. This will handle

		`Document#getTextboxes(options?)`

		Retrieves the textbox content from a Word document. This will handle
		UNICODE characters correctly, so if there are accented or non-Latin-1
		characters present in the document, they'll show as is in the returned string.

		Note that by default, `getTextboxes()` returns one string, containing all
		textbox content from both the main document and the headers and footers. You
		can control what gets included by using the options `includeHeadersAndFooters`
		(which defaults to true) and `includeBody` (also defaults to true). So,
		as an example, if you only want the body text box content, use:
		`doc.getTextboxes({includeHeadersAndFooters: false})`.

		### License
		@@ -100,0 +113,0 @@

.eslintrc.yml

.github/workflows/main.yml

word-extractor - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics