word-extractor
Advanced tools
Comparing version 0.3.0 to 1.0.0
@@ -1,6 +0,19 @@ | ||
### 0.2.2 / 23rd January 2019 | ||
# Change log | ||
* Fixed [the bad dependency on event-stream](https://github.com/dominictarr/event-stream/issues/116) | ||
### 1.0.0 / 16th May 2021 | ||
* Major refactoring of the OLE code to use promises internally | ||
* Added support for Open Office XML-based (.docx) Word files. See #1 | ||
* Added support for reading direct from a Buffer. See #11 | ||
* Removed event-stream dependency. See #19 | ||
* Fixed an issue with not closing files properly. See #23 | ||
* Corrected handling of extracting files with files. See #31 | ||
* Corrected handling of extracting files with deleted text. See #32 | ||
* Fixed issues with extracting multiple rows of table data. See #33 | ||
This is a major release, and while there are no incompatible API changes, | ||
it seemed best to bump the version so as not to pick up updates automatically. | ||
However, existing applications should not require any code changes to use | ||
this version. | ||
### 0.3.0 / 18th February 2019 | ||
@@ -13,2 +26,7 @@ | ||
### 0.2.2 / 23rd January 2019 | ||
* Fixed [the bad dependency on event-stream](https://github.com/dominictarr/event-stream/issues/116) | ||
### 0.2.1 / 21st January 2019 | ||
@@ -15,0 +33,0 @@ |
@@ -1,107 +0,108 @@ | ||
const filters = require('./filters'); | ||
/** | ||
* @module document | ||
* | ||
* @description | ||
* Implements the main document returned when a Word file has been extracted. This exposes | ||
* methods that allow the body, annotations, headers, footnotes, and endnotes, to be | ||
* read and used. | ||
* | ||
* @author | ||
* Stuart Watt <stuart@morungos.com> | ||
*/ | ||
const getPieceIndex = (pieces, position) => { | ||
for (let i = 0; i < pieces.length; i++) { | ||
const piece = pieces[i]; | ||
if (position <= piece.endPosition) { | ||
return i; | ||
} | ||
} | ||
}; | ||
const { filter } = require('./filters'); | ||
const filter = (text, shouldFilter) => { | ||
class Document { | ||
if (!shouldFilter) { | ||
return text; | ||
constructor() { | ||
this._body = ""; | ||
this._footnotes = ""; | ||
this._endnotes = ""; | ||
this._headers = ""; | ||
this._annotations = ""; | ||
} | ||
const replacer = function(match, ...rest) { | ||
if (match.length === 1) { | ||
const replaced = filters[match.charCodeAt(0)]; | ||
if (replaced === 0) { | ||
return ""; | ||
} else { | ||
return replaced; | ||
} | ||
} else if (rest.length === 2) { | ||
return ""; | ||
} else if (rest.length === 3) { | ||
return rest[0]; | ||
} | ||
}; | ||
/** | ||
* Accessor to read the main body part of a Word file | ||
* @returns a string, containing the Word file body | ||
*/ | ||
getBody(filterUnicode) { | ||
const value = this._body; | ||
return (filterUnicode == false) ? value : filter(value); | ||
} | ||
const matcher = /(?:[\x02\x05\x07\x08\x0a\x0d\u2018\u2019\u201c\u201d\u2002\u2003\u2012\u2013\u2014]|\x13(?:[^\x14]*\x14)?([^\x15]*)\x15)/g; | ||
return text.replace(matcher, replacer); | ||
}; | ||
/** | ||
* Accessor to read the footnotes part of a Word file | ||
* @returns a string, containing the Word file footnotes | ||
*/ | ||
getFootnotes(filterUnicode) { | ||
const value = this._footnotes; | ||
return (filterUnicode == false) ? value : filter(value); | ||
} | ||
class Document { | ||
/** | ||
* Accessor to read the endnotes part of a Word file | ||
* @returns a string, containing the Word file endnotes | ||
*/ | ||
getEndnotes(filterUnicode) { | ||
const value = this._endnotes; | ||
return (filterUnicode == false) ? value : filter(value); | ||
} | ||
constructor() { | ||
this.pieces = []; | ||
this.bookmarks = {}; | ||
this.boundaries = {}; | ||
/** | ||
* Accessor to read the headers part of a Word file | ||
* @returns a string, containing the Word file headers | ||
*/ | ||
getHeaders(filterUnicode) { | ||
const value = this._headers; | ||
return (filterUnicode == false) ? value : filter(value); | ||
} | ||
getTextRange(start, end) { | ||
const { pieces } = this; | ||
const startPiece = getPieceIndex(pieces, start); | ||
const endPiece = getPieceIndex(pieces, end); | ||
const result = []; | ||
for (let i = startPiece, end1 = endPiece; i <= end1; i++) { | ||
const piece = pieces[i]; | ||
const xstart = i === startPiece ? start - piece.position : 0; | ||
const xend = i === endPiece ? end - piece.position : piece.endPosition; | ||
result.push(piece.text.substring(xstart, xend)); | ||
} | ||
return result.join(""); | ||
/** | ||
* Accessor to read the annotations part of a Word file | ||
* @returns a string, containing the Word file annotations | ||
*/ | ||
getAnnotations(filterUnicode) { | ||
const value = this._annotations; | ||
return (filterUnicode == false) ? value : filter(value); | ||
} | ||
getBody(shouldFilter) { | ||
if (shouldFilter == null) { | ||
shouldFilter = true; | ||
} | ||
const start = 0; | ||
const string = this.getTextRange(start, start + this.boundaries.ccpText); | ||
return filter(string, shouldFilter); | ||
/** | ||
* Accessor to set the main body part of a Word file | ||
* @param {*} body the body string | ||
*/ | ||
setBody(body) { | ||
this._body = body; | ||
} | ||
getFootnotes(shouldFilter) { | ||
if (shouldFilter == null) { | ||
shouldFilter = true; | ||
} | ||
const start = this.boundaries.ccpText; | ||
const string = this.getTextRange(start, start + this.boundaries.ccpFtn); | ||
return filter(string, shouldFilter); | ||
/** | ||
* Accessor to set the footnotes part of a Word file | ||
* @param {*} footnotes the footnotes string | ||
*/ | ||
setFootnotes(footnotes) { | ||
this._footnotes = footnotes; | ||
} | ||
getHeaders(shouldFilter) { | ||
if (shouldFilter == null) { | ||
shouldFilter = true; | ||
} | ||
const start = this.boundaries.ccpText + this.boundaries.ccpFtn; | ||
const string = this.getTextRange(start, start + this.boundaries.ccpHdd); | ||
return filter(string, shouldFilter); | ||
/** | ||
* Accessor to set the endnotes part of a Word file | ||
* @param {*} endnotes the endnotes string | ||
*/ | ||
setEndnotes(endnotes) { | ||
this._endnotes = endnotes; | ||
} | ||
getAnnotations(shouldFilter) { | ||
if (shouldFilter == null) { | ||
shouldFilter = true; | ||
} | ||
const start = this.boundaries.ccpText + this.boundaries.ccpFtn + this.boundaries.ccpHdd; | ||
const string = this.getTextRange(start, start + this.boundaries.ccpAtn); | ||
return filter(string, shouldFilter); | ||
/** | ||
* Accessor to set the headers part of a Word file | ||
* @param {*} headers the headers string | ||
*/ | ||
setHeaders(headers) { | ||
this._headers = headers; | ||
} | ||
getEndnotes(shouldFilter) { | ||
if (shouldFilter == null) { | ||
shouldFilter = true; | ||
} | ||
const start = this.boundaries.ccpText + this.boundaries.ccpFtn + this.boundaries.ccpHdd + this.boundaries.ccpAtn; | ||
const string = this.getTextRange(start, start + this.boundaries.ccpAtn + this.boundaries.ccpEdn); | ||
return filter(string, shouldFilter); | ||
/** | ||
* Accessor to set the annotations part of a Word file | ||
* @param {*} annotations the annotations string | ||
*/ | ||
setAnnotations(annotations) { | ||
this._annotations = annotations; | ||
} | ||
@@ -108,0 +109,0 @@ } |
@@ -1,19 +0,98 @@ | ||
const filters = []; | ||
filters[0x0002] = 0; | ||
filters[0x0005] = 0; | ||
filters[0x0008] = 0; | ||
filters[0x2018] = "'"; | ||
filters[0x2019] = "'"; | ||
filters[0x201C] = "\""; | ||
filters[0x201D] = "\""; | ||
filters[0x0007] = "\t"; | ||
filters[0x000D] = "\n"; | ||
filters[0x2002] = " "; | ||
filters[0x2003] = " "; | ||
filters[0x2012] = "-"; | ||
filters[0x2013] = "-"; | ||
filters[0x2014] = "-"; | ||
filters[0x000A] = "\n"; | ||
filters[0x000D] = "\n"; | ||
/** | ||
* @module filters | ||
* | ||
* @description | ||
* Exports several functions that implement various methods for translating | ||
* characters into Unicode, and cleaning up some of the remaining residues from | ||
* Word's odd internal marker character usage. | ||
*/ | ||
module.exports = filters; | ||
/**
 * Replacement table for the reserved control characters that `clean` rewrites,
 * indexed by character code.
 *
 * Entries mapped to '\x00' are placeholders: `clean` strips every character in
 * the range \x00-\x07 as its final step, so these markers vanish from the output.
 * The remaining entries translate Word's break characters into tabs/newlines and
 * its special hyphens into their Unicode counterparts.
 *
 * Every character that `clean` matches must have an entry here; a missing entry
 * makes the replacer return `undefined`, which String#replace renders as the
 * literal text "undefined".
 */
const replaceTable = [];
replaceTable[0x0002] = '\x00';
replaceTable[0x0005] = '\x00';
replaceTable[0x0007] = "\t";
replaceTable[0x0008] = '\x00';
replaceTable[0x000A] = "\n";
replaceTable[0x000B] = "\n";
replaceTable[0x000C] = "\n";
replaceTable[0x000D] = "\n";
// U+2011 is the Unicode NON-BREAKING HYPHEN.
replaceTable[0x001E] = "\u2011";
// `clean` matches \x1f, so it needs an entry. NOTE(review): 0x1F is presumed to be
// Word's optional (non-required) hyphen, mapped here to U+00AD SOFT HYPHEN - confirm.
replaceTable[0x001F] = "\u00AD";
/**
 * Maps character codes in the range 0x80-0x9f to the Unicode characters that
 * Windows-1252 assigns to those byte values. Codes that Windows-1252 leaves
 * unassigned (0x81, 0x8d, 0x8f, 0x90, 0x9d) deliberately have no entry.
 */
const binaryToUnicodeTable = [];
binaryToUnicodeTable[0x0080] = "\u20AC";
binaryToUnicodeTable[0x0082] = "\u201a";
binaryToUnicodeTable[0x0083] = "\u0192";
binaryToUnicodeTable[0x0084] = "\u201e";
binaryToUnicodeTable[0x0085] = "\u2026";
binaryToUnicodeTable[0x0086] = "\u2020";
binaryToUnicodeTable[0x0087] = "\u2021";
binaryToUnicodeTable[0x0088] = "\u02C6";
binaryToUnicodeTable[0x0089] = "\u2030";
binaryToUnicodeTable[0x008a] = "\u0160";
binaryToUnicodeTable[0x008b] = "\u2039";
binaryToUnicodeTable[0x008c] = "\u0152";
binaryToUnicodeTable[0x008e] = "\u017D";
binaryToUnicodeTable[0x0091] = "\u2018";
binaryToUnicodeTable[0x0092] = "\u2019";
binaryToUnicodeTable[0x0093] = "\u201C";
binaryToUnicodeTable[0x0094] = "\u201D";
binaryToUnicodeTable[0x0095] = "\u2022";
binaryToUnicodeTable[0x0096] = "\u2013";
binaryToUnicodeTable[0x0097] = "\u2014";
binaryToUnicodeTable[0x0098] = "\u02DC";
binaryToUnicodeTable[0x0099] = "\u2122";
binaryToUnicodeTable[0x009a] = "\u0161";
binaryToUnicodeTable[0x009b] = "\u203A";
binaryToUnicodeTable[0x009c] = "\u0153";
binaryToUnicodeTable[0x009e] = "\u017E";
binaryToUnicodeTable[0x009f] = "\u0178";

/**
 * Translates every character in the range \x80-\x9f through
 * `binaryToUnicodeTable`, leaving all other characters untouched.
 *
 * @param {string} string text whose characters in \x80-\x9f should be translated
 * @returns {string} the text with mapped characters replaced; characters in the
 *   range that have no table entry are returned unchanged
 */
const binaryToUnicode = (string) => {
  return string.replace(/[\x80-\x9f]/g, (match) => {
    const mapped = binaryToUnicodeTable[match.charCodeAt(0)];
    // Without this guard a missing entry would be stringified as "undefined".
    return mapped === undefined ? match : mapped;
  });
};
/**
 * The main function for cleaning OLE-based text. It runs three passes:
 *
 *  1. reserved control characters are translated through `replaceTable`;
 *  2. fields (\x13 instruction \x14 result \x15) are collapsed to their result
 *     text, or removed entirely when they have no result part;
 *  3. any remaining characters in the range \x00-\x07 are stripped.
 *
 * @param {string} string an input string
 * @returns {string} a cleaned up string
 */
const clean = (string) => {
  // The character class covers every code that `replaceTable` defines, including
  // \x1e (the table's non-breaking hyphen entry), plus \x1f. The lookup is guarded
  // so a matched character without a table entry is kept as-is instead of being
  // replaced by the literal text "undefined".
  let text = string.replace(/[\x02\x05\x07\x08\x0a-\x0d\x1e\x1f]/g, (match) => {
    const replacement = replaceTable[match.charCodeAt(0)];
    return replacement === undefined ? match : replacement;
  });

  // Fields can be nested, which makes this awkward. We use a strict non-nesting model
  // and repeat until we find no substitutions. This is because a second match might
  // start before an earlier one, due to our replacements.
  const fieldPattern = /\x13[^\x13\x14\x15]*\x14?([^\x13\x14\x15]*)\x15/g;
  let replacedAny = true;
  while (replacedAny) {
    replacedAny = false;
    text = text.replace(fieldPattern, (match, result) => {
      replacedAny = true;
      return result;
    });
  }

  // Drop the '\x00' placeholders introduced above and any other low control characters.
  return text.replace(/[\x00-\x07]/g, '');
};
/**
 * Maps Unicode typographic characters (en/em spaces, dashes, curly quotes) to
 * plain ASCII substitutes, indexed by character code.
 */
const filterTable = [];
filterTable[0x2002] = " ";
filterTable[0x2003] = " ";
filterTable[0x2012] = "-";
filterTable[0x2013] = "-";
filterTable[0x2014] = "-";
filterTable[0x2018] = "'";
filterTable[0x2019] = "'";
filterTable[0x201c] = "\"";
filterTable[0x201d] = "\"";

// The character class is built from the table's own keys so the two can never
// drift apart; a character matched without a table entry would otherwise be
// replaced by the literal text "undefined". None of the keys are special inside
// a regular-expression character class, so no escaping is needed.
const filterPattern = new RegExp(
  `[${Object.keys(filterTable).map((code) => String.fromCharCode(Number(code))).join("")}]`,
  "g"
);

/**
 * Replaces every character that has an entry in `filterTable` with its ASCII
 * substitute, leaving all other characters untouched.
 *
 * @param {string} string an input string
 * @returns {string} the string with typographic characters replaced
 */
const filter = (string) => {
  return string.replace(filterPattern, (match) => filterTable[match.charCodeAt(0)]);
};
module.exports = { | ||
clean: clean, | ||
binaryToUnicode: binaryToUnicode, | ||
filter: filter | ||
}; |
277
lib/word.js
@@ -1,237 +0,72 @@ | ||
/* | ||
* decaffeinate suggestions: | ||
* DS102: Remove unnecessary code created because of implicit returns | ||
* DS205: Consider reworking code to avoid use of IIFEs | ||
* DS206: Consider reworking classes to avoid initClass | ||
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md | ||
/** | ||
* @module word | ||
* | ||
* @description | ||
* The main module for the package. This exports an extractor class, which | ||
* provides a single `extract` method that can be called with either a | ||
* string (filename) or a buffer. | ||
*/ | ||
const { Buffer } = require('buffer'); | ||
const oleDoc = require('./ole-doc').OleCompoundDoc; | ||
const { Buffer } = require('buffer'); | ||
const filters = require('./filters'); // eslint-disable-line no-unused-vars | ||
const translations = require('./translations'); // eslint-disable-line no-unused-vars | ||
const WordOleExtractor = require('./word-ole-extractor'); | ||
const OpenOfficeExtractor = require('./open-office-extractor'); | ||
const Document = require('./document'); | ||
const BufferReader = require('./buffer-reader'); | ||
const FileReader = require('./file-reader'); | ||
var WordExtractor = (function() { | ||
let streamBuffer = undefined; | ||
let extractDocument = undefined; | ||
let documentStream = undefined; | ||
let writeBookmarks = undefined; | ||
let writePieces = undefined; | ||
let extractWordDocument = undefined; | ||
let getPiece = undefined; | ||
let addText = undefined; | ||
let addUnicodeText = undefined; | ||
WordExtractor = class WordExtractor { | ||
static initClass() { | ||
//# Given an OLE stream, returns all the data in a buffer, | ||
//# as a promise. | ||
streamBuffer = stream => | ||
new Promise(function(resolve, reject) { | ||
const chunks = []; | ||
stream.on('data', chunk => chunks.push(chunk)); | ||
stream.on('error', error => reject(error)); | ||
return stream.on('end', () => resolve(Buffer.concat(chunks))); | ||
}) | ||
; | ||
extractDocument = filename => | ||
new Promise(function(resolve, reject) { | ||
const document = new oleDoc(filename); | ||
document.on('err', error => { | ||
return reject(error); | ||
}); | ||
document.on('ready', () => { | ||
return resolve(document); | ||
}); | ||
return document.read(); | ||
}) | ||
; | ||
documentStream = (document, stream) => Promise.resolve(document.stream(stream)); | ||
writeBookmarks = function(buffer, tableBuffer, result) { | ||
const fcSttbfBkmk = buffer.readUInt32LE(0x0142); | ||
const lcbSttbfBkmk = buffer.readUInt32LE(0x0146); | ||
const fcPlcfBkf = buffer.readUInt32LE(0x014a); | ||
const lcbPlcfBkf = buffer.readUInt32LE(0x014e); | ||
const fcPlcfBkl = buffer.readUInt32LE(0x0152); | ||
const lcbPlcfBkl = buffer.readUInt32LE(0x0156); | ||
if (lcbSttbfBkmk === 0) { return; } | ||
const sttbfBkmk = tableBuffer.slice(fcSttbfBkmk, fcSttbfBkmk + lcbSttbfBkmk); | ||
const plcfBkf = tableBuffer.slice(fcPlcfBkf, fcPlcfBkf + lcbPlcfBkf); | ||
const plcfBkl = tableBuffer.slice(fcPlcfBkl, fcPlcfBkl + lcbPlcfBkl); | ||
const fcExtend = sttbfBkmk.readUInt16LE(0); | ||
const cData = sttbfBkmk.readUInt16LE(2); // eslint-disable-line no-unused-vars | ||
const cbExtra = sttbfBkmk.readUInt16LE(4); // eslint-disable-line no-unused-vars | ||
if (fcExtend !== 0xffff) { | ||
throw new Error("Internal error: unexpected single-byte bookmark data"); | ||
} | ||
let offset = 6; | ||
const index = 0; | ||
const bookmarks = {}; // eslint-disable-line no-unused-vars | ||
while (offset < lcbSttbfBkmk) { | ||
let length = sttbfBkmk.readUInt16LE(offset); | ||
length = length * 2; | ||
const segment = sttbfBkmk.slice(offset + 2, offset + 2 + length); | ||
const cpStart = plcfBkf.readUInt32LE(index * 4); | ||
const cpEnd = plcfBkl.readUInt32LE(index * 4); | ||
result.bookmarks[segment] = {start: cpStart, end: cpEnd}; | ||
offset = offset + length + 2; | ||
} | ||
}; | ||
writePieces = function(buffer, tableBuffer, result) { | ||
let flag; | ||
let pos = buffer.readUInt32LE(0x01a2); | ||
while (true) { // eslint-disable-line no-constant-condition | ||
flag = tableBuffer.readUInt8(pos); | ||
if (flag !== 1) { break; } | ||
pos = pos + 1; | ||
const skip = tableBuffer.readUInt16LE(pos); | ||
pos = pos + 2 + skip; | ||
} | ||
flag = tableBuffer.readUInt8(pos); | ||
pos = pos + 1; | ||
if (flag !== 2) { | ||
throw new Error("Internal error: ccorrupted Word file"); | ||
} | ||
const pieceTableSize = tableBuffer.readUInt32LE(pos); | ||
pos = pos + 4; | ||
const pieces = (pieceTableSize - 4) / 12; | ||
let start = 0; | ||
let lastPosition = 0; | ||
for (let x = 0, end = pieces - 1; x <= end; x++) { | ||
const offset = pos + ((pieces + 1) * 4) + (x * 8) + 2; | ||
let filePos = tableBuffer.readUInt32LE(offset); | ||
let unicode = false; | ||
if ((filePos & 0x40000000) === 0) { | ||
unicode = true; | ||
} else { | ||
filePos = filePos & ~(0x40000000); | ||
filePos = Math.floor(filePos / 2); | ||
} | ||
const lStart = tableBuffer.readUInt32LE(pos + (x * 4)); | ||
const lEnd = tableBuffer.readUInt32LE(pos + ((x + 1) * 4)); | ||
const totLength = lEnd - lStart; | ||
/** | ||
* The main class for the word extraction package. Typically, people will make | ||
* an instance of this class, and call the {@link #extract} method to transform | ||
* a Word file into a {@link Document} instance, which provides the accessors | ||
* needed to read its body, and so on. | ||
*/ | ||
class WordExtractor { | ||
const piece = { | ||
start, | ||
totLength, | ||
filePos, | ||
unicode | ||
}; | ||
constructor() {} | ||
getPiece(buffer, piece); | ||
piece.length = piece.text.length; | ||
piece.position = lastPosition; | ||
piece.endPosition = lastPosition + piece.length; | ||
result.pieces.push(piece); | ||
/** | ||
* Extracts the main contents of the file. If a Buffer is passed, that | ||
* is used instead. Opens the file, and reads the first block, uses that | ||
* to detect whether this is a .doc file or a .docx file, and then calls | ||
* either {@link WordOleExtractor#extract} or {@link OpenOfficeExtractor#extract} | ||
* accordingly. | ||
* | ||
* @param {*} source either a string filename, or a Buffer containing the file content | ||
* @returns a {@link Document} providing accessors onto the text | ||
*/ | ||
extract(source) { | ||
let reader = null; | ||
if (Buffer.isBuffer(source)) { | ||
reader = new BufferReader(source); | ||
} else if (typeof source === 'string') { | ||
reader = new FileReader(source); | ||
} | ||
const buffer = Buffer.alloc(512); | ||
return reader.open() | ||
.then(() => reader.read(buffer, 0, 512, 0)) | ||
.then((buffer) => { | ||
let extractor = null; | ||
start = start + (unicode ? Math.floor(totLength / 2) : totLength); | ||
lastPosition = lastPosition + piece.length; | ||
} | ||
}; | ||
extractWordDocument = (document, buffer) => | ||
new Promise(function(resolve, reject) { | ||
const magic = buffer.readUInt16LE(0); | ||
if (magic !== 0xa5ec) { | ||
return reject(new Error(`This does not seem to be a Word document: Invalid magic number: ${magic.toString(16)}`)); | ||
if (buffer.readUInt16BE(0) === 0xd0cf) { | ||
extractor = WordOleExtractor; | ||
} else if (buffer.readUInt16BE(0) === 0x504b) { | ||
const next = buffer.readUInt16BE(2); | ||
if ((next === 0x0304) || (next === 0x0506) || (next === 0x0708)) { | ||
extractor = OpenOfficeExtractor; | ||
} | ||
const flags = buffer.readUInt16LE(0xA); | ||
const table = (flags & 0x0200) !== 0 ? "1Table" : "0Table"; | ||
return documentStream(document, table) | ||
.then(stream => streamBuffer(stream)).then(function(tableBuffer) { | ||
const result = new Document(); | ||
result.boundaries.fcMin = buffer.readUInt32LE(0x0018); | ||
result.boundaries.ccpText = buffer.readUInt32LE(0x004c); | ||
result.boundaries.ccpFtn = buffer.readUInt32LE(0x0050); | ||
result.boundaries.ccpHdd = buffer.readUInt32LE(0x0054); | ||
result.boundaries.ccpAtn = buffer.readUInt32LE(0x005c); | ||
result.boundaries.ccpEdn = buffer.readUInt32LE(0x0060); | ||
writeBookmarks(buffer, tableBuffer, result); | ||
writePieces(buffer, tableBuffer, result); | ||
return resolve(result);}).catch(error => reject(error)); | ||
}) | ||
; | ||
getPiece = function(buffer, piece) { | ||
const pstart = piece.start; | ||
const ptotLength = piece.totLength; | ||
const pfilePos = piece.filePos; | ||
const punicode = piece.unicode; | ||
const pend = pstart + ptotLength; | ||
const textStart = pfilePos; | ||
const textEnd = textStart + (pend - pstart); | ||
if (punicode) { | ||
return piece.text = addUnicodeText(buffer, textStart, textEnd); | ||
} else { | ||
return piece.text = addText(buffer, textStart, textEnd); | ||
} | ||
}; | ||
addText = function(buffer, textStart, textEnd) { | ||
const slice = buffer.slice(textStart, textEnd); | ||
return slice.toString('binary'); | ||
}; | ||
addUnicodeText = function(buffer, textStart, textEnd) { | ||
const slice = buffer.slice(textStart, (2*textEnd) - textStart); | ||
const string = slice.toString('ucs2'); | ||
// See the conversion table for FcCompressed structures. Note that these | ||
// should not affect positions, as these are characters now, not bytes | ||
// for i in [0..string.length] | ||
// if | ||
return string; | ||
}; | ||
} | ||
constructor() {} | ||
if (! extractor) { | ||
throw new Error("Unable to read this type of file"); | ||
} | ||
return (new extractor()).extract(reader); | ||
}) | ||
.finally(() => reader.close()); | ||
} | ||
extract(filename) { | ||
return extractDocument(filename) | ||
.then(document => | ||
documentStream(document, 'WordDocument') | ||
.then(stream => streamBuffer(stream)).then(buffer => extractWordDocument(document, buffer)) | ||
); | ||
} | ||
}; | ||
WordExtractor.initClass(); | ||
return WordExtractor; | ||
})(); | ||
} | ||
module.exports = WordExtractor; |
{ | ||
"name": "word-extractor", | ||
"version": "0.3.0", | ||
"version": "1.0.0", | ||
"description": "Node.js package to read Word .doc files", | ||
@@ -8,3 +8,5 @@ "main": "lib/word.js", | ||
"test": "jest", | ||
"test-watch": "jest --watch" | ||
"test-watch": "jest --watch", | ||
"coverage": "jest --coverage", | ||
"jsdoc": "jsdoc lib --configure jsdoc.json" | ||
}, | ||
@@ -25,9 +27,10 @@ "repository": { | ||
"devDependencies": { | ||
"eslint": "^5.8.0", | ||
"jest": "^24.1.0" | ||
"eslint": "^7.25.0", | ||
"jest": "^26.6.0", | ||
"jest-specific-snapshot": "^4.0.0", | ||
"jsdoc": "^3.6.6" | ||
}, | ||
"dependencies": { | ||
"async": "^1.5.2", | ||
"event-stream": "^3.3.4", | ||
"underscore": "^1.8.3" | ||
"sax": "^1.2.4", | ||
"yauzl": "^2.10.0" | ||
}, | ||
@@ -40,3 +43,3 @@ "jest": { | ||
"transformIgnorePatterns": [], | ||
"testRegex": "(/__tests__/.*|(\\.|/)(test))\\.jsx?$", | ||
"testRegex": "(/__tests__/.*?_test)\\.jsx?$", | ||
"collectCoverageFrom": [ | ||
@@ -43,0 +46,0 @@ "lib/**/*.js" |
@@ -1,14 +0,24 @@ | ||
### word-extractor | ||
## word-extractor | ||
Read data from a Word document using node.js | ||
Read data from a Word document (.doc or .docx) using Node.js | ||
### Why use this module? | ||
#### Why use this module? | ||
There are a fair number of npm components which can extract text from Word .doc | ||
files, but they all appear to require some external helper program, and involve | ||
either spawning a process or communicating with a persistent one. That raises | ||
the installation and deployment burden as well as the runtime one. | ||
There are a fair number of npm components which can extract text from Word .doc files, but they all appear to require some external helper program, and involve either spawning a process or communicating with a persistent one. That raises the installation and deployment burden as well as the runtime one. | ||
This module is intended to provide a much faster way of reading the text from a | ||
Word file, without leaving the Node.js environment. | ||
This module is intended to provide a much faster way of reading the text from a Word file, without leaving the node.js environment. | ||
This means you do not need to install Word, Office, or anything else, and the | ||
module will work on all platforms, without any native binary code requirements. | ||
#### How do I install this module? | ||
As of version 1.0, this module supports both traditional, OLE-based, Word files (usually .doc), | ||
and modern, Open Office-style, ECMA-376 Word files (usually .docx). It can be | ||
used both with files and with file contents in a Node.js Buffer. | ||
### How do I install this module? | ||
```bash= | ||
@@ -21,45 +31,58 @@ yarn add word-extractor | ||
#### How do I use this module? | ||
### How do I use this module? | ||
var WordExtractor = require("word-extractor"); | ||
var extractor = new WordExtractor(); | ||
var extracted = extractor.extract("file.doc"); | ||
extracted.then(function(doc) { | ||
console.log(doc.getBody()); | ||
}); | ||
``` | ||
const WordExtractor = require("word-extractor"); | ||
const extractor = new WordExtractor(); | ||
const extracted = extractor.extract("file.doc"); | ||
The object returned from the `extract()` method is a promise that resolves to a document object, which then provides several views onto different parts of the document contents. | ||
extracted.then(function(doc) { console.log(doc.getBody()); }); | ||
``` | ||
The object returned from the `extract()` method is a promise that resolves to a | ||
document object, which then provides several views onto different parts of the | ||
document contents. | ||
#### Methods | ||
### Methods | ||
`WordExtractor#extract(file)` | ||
`WordExtractor#extract(<filename> | <Buffer>)` | ||
Main method to open a Word file and retrieve the data. Returns a promise which resolves to a `Document`. | ||
Main method to open a Word file and retrieve the data. Returns a promise which | ||
resolves to a `Document`. If a Buffer is passed instead of a filename, then | ||
the buffer is used directly, instead of reading a file from the file system. | ||
`Document#getBody()` | ||
Retrieves the content text from a Word document. This will handle UNICODE characters correctly, so if there are accented or non-Latin-1 characters present in the document, they'll show as is in the returned string. | ||
Retrieves the content text from a Word document. This will handle UNICODE | ||
characters correctly, so if there are accented or non-Latin-1 characters | ||
present in the document, they'll show as is in the returned string. | ||
`Document#getFootnotes()` | ||
Retrieves the footnote text from a Word document. This will handle UNICODE characters correctly, so if there are accented or non-Latin-1 characters present in the document, they'll show as is in the returned string. | ||
Retrieves the footnote text from a Word document. This will handle UNICODE | ||
characters correctly, so if there are accented or non-Latin-1 characters | ||
present in the document, they'll show as is in the returned string. | ||
`Document#getEndnotes()` | ||
Retrieves the endnote text from a Word document. This will handle UNICODE characters correctly, so if there are accented or non-Latin-1 characters present in the document, they'll show as is in the returned string. | ||
Retrieves the endnote text from a Word document. This will handle UNICODE | ||
characters correctly, so if there are accented or non-Latin-1 characters | ||
present in the document, they'll show as is in the returned string. | ||
`Document#getHeaders()` | ||
Retrieves the header and footer text from a Word document. This will handle UNICODE characters correctly, so if there are accented or non-Latin-1 characters present in the document, they'll show as is in the returned string. | ||
Retrieves the header and footer text from a Word document. This will handle | ||
UNICODE characters correctly, so if there are accented or non-Latin-1 | ||
characters present in the document, they'll show as is in the returned string. | ||
`Document#getAnnotations()` | ||
Retrieves the comment bubble text from a Word document. This will handle UNICODE characters correctly, so if there are accented or non-Latin-1 characters present in the document, they'll show as is in the returned string. | ||
Retrieves the comment bubble text from a Word document. This will handle | ||
UNICODE characters correctly, so if there are accented or non-Latin-1 | ||
characters present in the document, they'll show as is in the returned string. | ||
### License | ||
#### License | ||
Copyright (c) 2016-2021. Stuart Watt. | ||
Copyright (c) 2016-2019. Stuart Watt. | ||
Licensed under the MIT License. |
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Native code
Supply chain riskContains native code (e.g., compiled binaries or shared libraries). Including native code can obscure malicious behavior.
Found 12 instances in 1 package
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Mixed license
License(Experimental) Package contains multiple licenses.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No v1
QualityPackage is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
1984648
2
95
3170
0
88
1
4
1
13
+ Addedsax@^1.2.4
+ Addedyauzl@^2.10.0
+ Addedbuffer-crc32@0.2.13(transitive)
+ Addedfd-slicer@1.1.0(transitive)
+ Addedpend@1.2.0(transitive)
+ Addedsax@1.4.1(transitive)
+ Addedyauzl@2.10.0(transitive)
- Removedasync@^1.5.2
- Removedevent-stream@^3.3.4
- Removedunderscore@^1.8.3
- Removedasync@1.5.2(transitive)
- Removedduplexer@0.1.2(transitive)
- Removedevent-stream@3.3.5(transitive)
- Removedfrom@0.1.7(transitive)
- Removedmap-stream@0.0.7(transitive)
- Removedpause-stream@0.0.11(transitive)
- Removedsplit@1.0.1(transitive)
- Removedstream-combiner@0.2.2(transitive)
- Removedthrough@2.3.8(transitive)
- Removedunderscore@1.13.7(transitive)