@nosferatu500/textract-lite
Advanced tools
Comparing version 3.0.3 to 3.0.4
{ | ||
"extends": "airbnb", | ||
"extends": [ | ||
"airbnb", | ||
"prettier", | ||
"plugin:prettier/recommended", | ||
"plugin:import/recommended", | ||
"plugin:promise/recommended", | ||
"plugin:unicorn/recommended", | ||
"plugin:sonarjs/recommended" | ||
], | ||
"plugins": ["prettier", "import", "sonarjs", "promise", "unicorn"], | ||
"parserOptions": { | ||
"ecmaVersion": 2021, | ||
"sourceType": "module" | ||
}, | ||
"rules": { | ||
"space-in-parens": ["error", "always", { "exceptions": ["{}"] }], | ||
"comma-style": ["error", "first", { "exceptions": { "ArrayExpression": true, "ObjectExpression": true } }], | ||
"comma-dangle": ["error", "only-multiline"], | ||
"space-before-function-paren": ["error", "never"], | ||
"func-names": 0, | ||
"no-param-reassign": 0, | ||
"one-var-declaration-per-line": ["error", "initializations"], | ||
"no-underscore-dangle": 0, | ||
// es6 | ||
"object-shorthand": 0, | ||
"one-var": ["error", "always"], | ||
"no-var": 0, | ||
"prefer-template": 0, | ||
"prefer-arrow-callback": 0, | ||
"prefer-rest-params": 0 | ||
// Disabled | ||
"no-console": "off", | ||
"no-continue": "off", | ||
"no-plusplus": "off", | ||
"no-case-declarations": "off", | ||
"no-prototype-builtins": "off", | ||
"no-restricted-syntax": "off", | ||
"no-param-reassign": "off", | ||
"import/prefer-default-export": "off", | ||
"import/no-default-export": "off", | ||
"import/no-extraneous-dependencies": "off", | ||
"import/named": "off", | ||
"import/no-named-as-default": "off", | ||
"unicorn/prevent-abbreviations": "off", | ||
"unicorn/filename-case": "off", | ||
"unicorn/no-useless-undefined": "off", | ||
"unicorn/prefer-node-protocol": "off", | ||
"sonarjs/cognitive-complexity": "off", | ||
//Modified | ||
"require-atomic-updates": "warn", | ||
"curly": ["warn", "multi-line", "consistent"], | ||
"no-use-before-define": [ | ||
"error", | ||
{ "functions": false, "classes": true, "variables": true } | ||
], | ||
"sort-imports": [ | ||
"error", | ||
{ | ||
"ignoreDeclarationSort": true | ||
} | ||
], | ||
"import/no-unresolved": ["error", { "commonjs": true, "amd": true }], | ||
"import/export": "error", | ||
"import/order": [ | ||
"error", | ||
{ | ||
"alphabetize": { "order": "asc", "caseInsensitive": true }, | ||
"groups": [ | ||
"builtin", | ||
"external", | ||
"internal", | ||
"parent", | ||
"sibling", | ||
"index", | ||
"object" | ||
], | ||
"newlines-between": "never", | ||
"pathGroups": [ | ||
{ | ||
"pattern": "react", | ||
"group": "builtin", | ||
"position": "before" | ||
} | ||
], | ||
"pathGroupsExcludedImportTypes": ["builtin"] | ||
} | ||
], | ||
"sonarjs/max-switch-cases": ["warn", 50] | ||
}, | ||
"settings": { | ||
"import/extensions": [".js"], | ||
"import/resolver": { | ||
"node": { | ||
"extensions": [".js", ".json"], | ||
"paths": ["lib", "test"] | ||
} | ||
} | ||
}, | ||
"env": { | ||
"mocha": true | ||
"mocha": true, | ||
"browser": true, | ||
"shared-node-browser": true, | ||
"node": true, | ||
"es2021": true | ||
}, | ||
@@ -24,0 +98,0 @@ "globals": { |
@@ -1,22 +0,18 @@ | ||
var path = require( 'path' ) | ||
, textract = require( './index' ); | ||
const path = require("path"); | ||
const textract = require("./index"); | ||
module.exports = function( filePath, flags ) { | ||
filePath = path.resolve( process.cwd(), filePath ); | ||
module.exports = function (filePath, flags) { | ||
filePath = path.resolve(process.cwd(), filePath); | ||
if ( flags.preserveLineBreaks === 'false' ) { | ||
flags.preserveLineBreaks = false; | ||
} else { | ||
flags.preserveLineBreaks = true; | ||
} | ||
flags.preserveLineBreaks = flags.preserveLineBreaks !== "false"; | ||
textract.fromFileWithPath( filePath, flags, function( error, text ) { | ||
if ( error ) { | ||
// eslint-disable-next-line no-console | ||
console.error( error ); | ||
} else { | ||
// eslint-disable-next-line no-console | ||
console.log( text ); | ||
} | ||
}); | ||
textract.fromFileWithPath(filePath, flags, function (error, text) { | ||
if (error) { | ||
// eslint-disable-next-line no-console | ||
console.error(error); | ||
} else { | ||
// eslint-disable-next-line no-console | ||
console.log(text); | ||
} | ||
}); | ||
}; |
@@ -1,161 +0,165 @@ | ||
var fs = require( 'fs' ) | ||
, path = require( 'path' ) | ||
, { decode } = require( 'html-entities' ) | ||
, util = require( './util' ) | ||
, extractorPath = path.join( __dirname, 'extractors' ) | ||
, entitiesDecode = decode | ||
, typeExtractors = {} | ||
, regexExtractors = [] | ||
, failedExtractorTypes = {} | ||
, totalExtractors = 0 | ||
, satisfiedExtractors = 0 | ||
, hasInitialized = false | ||
, STRIP_ONLY_SINGLE_LINEBREAKS = /(^|[^\n])\n(?!\n)/g | ||
, WHITELIST_PRESERVE_LINEBREAKS = /[^A-Za-z\x80-\xFF\x24\u20AC\xA3\xA5 0-9 \u2015\u2116\u2018\u2019\u201C|\u201D\u2026 \uFF0C \u2013 \u2014 \u00C0-\u1FFF \u2C00-\uD7FF \uFB50–\uFDFF \uFE70–\uFEFF \uFF01-\uFFE6 \.,\?'""„«»!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~'-\w\n\r]*/g // eslint-disable-line max-len | ||
, WHITELIST_STRIP_LINEBREAKS = /[^A-Za-z\x80-\xFF\x24\u20AC\xA3\xA5 0-9 \u2015\u2116\u2018\u2019\u201C|\u201D\u2026 \uFF0C \u2013 \u2014 \u00C0-\u1FFF \u2C00-\uD7FF \uFB50–\uFDFF \uFE70–\uFEFF \uFF01-\uFFE6 \.,\?'""„«»!@#\$%\^&\*\(\)-_=\+;:<>\/\\\|\}\{\[\]`~'-\w]*/g // eslint-disable-line max-len | ||
; | ||
const fs = require("fs"); | ||
const path = require("path"); | ||
const { decode } = require("html-entities"); | ||
const util = require("./util"); | ||
function registerExtractor( extractor ) { | ||
if ( extractor.types ) { | ||
extractor.types.forEach( function( type ) { | ||
if ( typeof type === 'string' ) { | ||
type = type.toLowerCase(); | ||
typeExtractors[type] = extractor.extract; | ||
} else if ( type instanceof RegExp ) { | ||
regexExtractors.push({ reg: type, extractor: extractor.extract }); | ||
} | ||
}); | ||
} | ||
const extractorPath = path.join(__dirname, "extractors"); | ||
const entitiesDecode = decode; | ||
const typeExtractors = {}; | ||
const regexExtractors = []; | ||
const failedExtractorTypes = {}; | ||
let totalExtractors = 0; | ||
let satisfiedExtractors = 0; | ||
let hasInitialized = false; | ||
const STRIP_ONLY_SINGLE_LINEBREAKS = /(^|[^\n])\n(?!\n)/g; | ||
const WHITELIST_PRESERVE_LINEBREAKS = | ||
/[^\d\n\r !"#$%&'-\w'()-_`a-z{|}~\u0080-\u1FFF\u2013–\u2014\u2015\u2018\u2019\u201C\u201D„\u2026\u20AC\u2116\u2C00-\uD7FF\uFB50\uFDFF\uFE70\uFEFF\uFF01-\uFFE6]*/g; // eslint-disable-line max-len | ||
const WHITELIST_STRIP_LINEBREAKS = | ||
/[^\d !"#$%&'-\w'()-_`a-z{|}~\u0080-\u1FFF\u2013–\u2014\u2015\u2018\u2019\u201C\u201D„\u2026\u20AC\u2116\u2C00-\uD7FF\uFB50\uFDFF\uFE70\uFEFF\uFF01-\uFFE6]*/g; | ||
// eslint-disable-line max-len | ||
/**
 * Registers an extractor under every type it declares.
 *
 * String types are lower-cased and stored in `typeExtractors`; RegExp types
 * are appended to `regexExtractors` as `{ reg, extractor }`. Entries of any
 * other kind, and extractors without a `types` list, are ignored.
 *
 * @param {{types?: Array<string|RegExp>, extract: Function}} extractor
 */
function registerExtractor(extractor) {
  if (!extractor.types) {
    return;
  }

  for (const type of extractor.types) {
    if (typeof type === "string") {
      typeExtractors[type.toLowerCase()] = extractor.extract;
    } else if (type instanceof RegExp) {
      regexExtractors.push({ reg: type, extractor: extractor.extract });
    }
  }
}
function registerFailedExtractor( extractor, failedMessage ) { | ||
if ( extractor.types ) { | ||
extractor.types.forEach( function( type ) { | ||
failedExtractorTypes[type.toLowerCase()] = failedMessage; | ||
}); | ||
} | ||
/**
 * Records why an extractor could not be initialised, keyed by each of its
 * lower-cased string types in `failedExtractorTypes`.
 *
 * @param {{types?: Array<string|RegExp>}} extractor
 * @param {string} failedMessage message reported by the extractor's test
 */
function registerFailedExtractor(extractor, failedMessage) {
  if (!extractor.types) {
    return;
  }

  for (const type of extractor.types) {
    // `types` may also hold RegExp entries (see registerExtractor); those have
    // no toLowerCase() and cannot serve as a lookup key, so skip them instead
    // of throwing a TypeError in the middle of registration.
    if (typeof type === "string") {
      failedExtractorTypes[type.toLowerCase()] = failedMessage;
    }
  }
}
function testExtractor( extractor, options ) { | ||
extractor.test( options, function( passedTest, failedMessage ) { | ||
satisfiedExtractors++; | ||
if ( passedTest ) { | ||
registerExtractor( extractor ); | ||
} else { | ||
registerFailedExtractor( extractor, failedMessage ); | ||
} | ||
}); | ||
/**
 * Runs an extractor's own `test` probe and registers the extractor as usable
 * or failed depending on the outcome.
 *
 * @param {{test: Function}} extractor extractor exposing `test(options, done)`
 * @param {object} options options forwarded to the probe
 */
function testExtractor(extractor, options) {
  let reported = false;

  extractor.test(options, function (passedTest, failedMessage) {
    // `satisfiedExtractors` is compared for equality against the extractor
    // total, so a probe that calls back more than once must only count once.
    if (reported) {
      return;
    }
    reported = true;

    satisfiedExtractors++;
    if (passedTest) {
      registerExtractor(extractor);
    } else {
      registerFailedExtractor(extractor, failedMessage);
    }
  });
}
// global, all file type, content cleansing | ||
function cleanseText( options, cb ) { | ||
return function( error, text ) { | ||
if ( !error ) { | ||
// clean up text | ||
text = util.replaceBadCharacters( text ); | ||
function cleanseText(options, cb) { | ||
return function (error, text) { | ||
if (!error) { | ||
// clean up text | ||
text = util.replaceBadCharacters(text); | ||
if ( options.preserveLineBreaks || options.preserveOnlyMultipleLineBreaks ) { | ||
if ( options.preserveOnlyMultipleLineBreaks ) { | ||
text = text.replace( STRIP_ONLY_SINGLE_LINEBREAKS, '$1 ' ).trim(); | ||
} | ||
text = text.replace( WHITELIST_PRESERVE_LINEBREAKS, ' ' ); | ||
} else { | ||
text = text.replace( WHITELIST_STRIP_LINEBREAKS, ' ' ); | ||
} | ||
if (options.preserveLineBreaks || options.preserveOnlyMultipleLineBreaks) { | ||
if (options.preserveOnlyMultipleLineBreaks) { | ||
text = text.replace(STRIP_ONLY_SINGLE_LINEBREAKS, "$1 ").trim(); | ||
} | ||
text = text.replace(WHITELIST_PRESERVE_LINEBREAKS, " "); | ||
} else { | ||
text = text.replace(WHITELIST_STRIP_LINEBREAKS, " "); | ||
} | ||
// multiple spaces, tabs, vertical tabs, non-breaking space] | ||
text = text.replace( / (?! )/g, '' ) | ||
.replace( /[ \t\v\u00A0]{2,}/g, ' ' ); | ||
// multiple spaces, tabs, vertical tabs, non-breaking space] | ||
text = text.replace(/ (?! )/g, "").replace(/[\t\v \u00A0]{2,}/g, " "); | ||
text = entitiesDecode( text, { level: 'xml' }); | ||
} | ||
cb( error, text ); | ||
}; | ||
text = entitiesDecode(text, { level: "xml" }); | ||
} | ||
cb(error, text); | ||
}; | ||
} | ||
function initializeExtractors( options ) { | ||
var extractors; | ||
function initializeExtractors(options) { | ||
let extractors; | ||
hasInitialized = true; | ||
hasInitialized = true; | ||
// discover available extractors | ||
extractors = fs.readdirSync( extractorPath ).map( function( item ) { | ||
var fullExtractorPath = path.join( extractorPath, item ); | ||
// get the extractor | ||
// eslint-disable-next-line global-require | ||
return require( fullExtractorPath ); | ||
}); | ||
// discover available extractors | ||
extractors = fs.readdirSync(extractorPath).map(function (item) { | ||
const fullExtractorPath = path.join(extractorPath, item); | ||
// get the extractor | ||
// eslint-disable-next-line global-require | ||
return require(fullExtractorPath); | ||
}); | ||
// perform any binary tests to ensure extractor is possible | ||
// given execution environment | ||
extractors.forEach( function( extractor ) { | ||
if ( extractor.test ) { | ||
testExtractor( extractor, options ); | ||
} else { | ||
satisfiedExtractors++; | ||
registerExtractor( extractor ); | ||
// perform any binary tests to ensure extractor is possible | ||
// given execution environment | ||
for (const extractor of extractors) { | ||
if (extractor.test) { | ||
testExtractor(extractor, options); | ||
} else { | ||
satisfiedExtractors++; | ||
registerExtractor(extractor); | ||
} | ||
} | ||
}); | ||
// need to keep track of how many extractors we have in total | ||
totalExtractors = extractors.length; | ||
// need to keep track of how many extractors we have in total | ||
totalExtractors = extractors.length; | ||
} | ||
function findExtractor( type ) { | ||
var i | ||
, iLen = regexExtractors.length | ||
, extractor | ||
, regexExtractor; | ||
function findExtractor(type) { | ||
let i; | ||
const iLen = regexExtractors.length; | ||
let extractor; | ||
let regexExtractor; | ||
type = type.toLowerCase(); | ||
if ( typeExtractors[type] ) { | ||
extractor = typeExtractors[type]; | ||
} else { | ||
for ( i = 0; i < iLen; i++ ) { | ||
regexExtractor = regexExtractors[i]; | ||
if ( type.match( regexExtractor.reg ) ) { | ||
extractor = regexExtractor.extractor; | ||
} | ||
type = type.toLowerCase(); | ||
if (typeExtractors[type]) { | ||
extractor = typeExtractors[type]; | ||
} else { | ||
for (i = 0; i < iLen; i++) { | ||
regexExtractor = regexExtractors[i]; | ||
if (regexExtractor.reg.test(type)) { | ||
extractor = regexExtractor.extractor; | ||
} | ||
} | ||
} | ||
} | ||
return extractor; | ||
return extractor; | ||
} | ||
function extract( type, filePath, options, cb ) { | ||
var error, msg, theExtractor; | ||
function extract(type, filePath, options, cb) { | ||
let error; | ||
let msg; | ||
let theExtractor; | ||
if ( !hasInitialized ) { | ||
initializeExtractors( options ); | ||
} | ||
if (!hasInitialized) { | ||
initializeExtractors(options); | ||
} | ||
// registration of extractors complete? | ||
if ( totalExtractors === satisfiedExtractors ) { | ||
theExtractor = findExtractor( type ); | ||
// registration of extractors complete? | ||
if (totalExtractors === satisfiedExtractors) { | ||
theExtractor = findExtractor(type); | ||
if ( theExtractor ) { | ||
cb = cleanseText( options, cb ); | ||
theExtractor( filePath, options, cb ); | ||
} else { | ||
// cannot extract this file type | ||
msg = 'Error for type: [[ ' + type + ' ]], file: [[ ' + filePath + ' ]]'; | ||
if (theExtractor) { | ||
cb = cleanseText(options, cb); | ||
theExtractor(filePath, options, cb); | ||
} else { | ||
// cannot extract this file type | ||
msg = `Error for type: [[ ${type} ]], file: [[ ${filePath} ]]`; | ||
// update error message if type is supported but just not configured/installed properly | ||
if ( failedExtractorTypes[type] ) { | ||
msg += ', extractor for type exists, but failed to initialize.' | ||
+ ' Message: ' + failedExtractorTypes[type]; | ||
} | ||
// update error message if type is supported but just not configured/installed properly | ||
if (failedExtractorTypes[type]) { | ||
msg += | ||
`, extractor for type exists, but failed to initialize.` + | ||
` Message: ${failedExtractorTypes[type]}`; | ||
} | ||
error = new Error( msg ); | ||
error.typeNotFound = true; | ||
cb( error, null ); | ||
error = new Error(msg); | ||
error.typeNotFound = true; | ||
cb(error, null); | ||
} | ||
} else { | ||
// async registration has not wrapped up | ||
// try again later | ||
setTimeout(function () { | ||
extract(type, filePath, options, cb); | ||
}, 100); | ||
} | ||
} else { | ||
// async registration has not wrapped up | ||
// try again later | ||
setTimeout( function() { | ||
extract( type, filePath, options, cb ); | ||
}, 100 ); | ||
} | ||
} | ||
module.exports = extract; |
@@ -1,85 +0,82 @@ | ||
var xpath = require( 'xpath' ) | ||
, Dom = require( '@xmldom/xmldom' ).DOMParser | ||
, yauzl = require( 'yauzl' ) | ||
, util = require( '../util' ) | ||
, includeRegex = /.xml$/ | ||
, excludeRegex = /^(word\/media\/|word\/_rels\/)/; | ||
const xpath = require("xpath"); | ||
const Dom = require("@xmldom/xmldom").DOMParser; | ||
const yauzl = require("yauzl"); | ||
const util = require("../util"); | ||
function _calculateExtractedText( inText, preserveLineBreaks ) { | ||
var doc = new Dom().parseFromString( inText ) | ||
, ps = xpath.select( "//*[local-name()='p']", doc ) | ||
, text = ''; | ||
ps.forEach( function( paragraph ) { | ||
var ts | ||
, localText = ''; | ||
paragraph = new Dom().parseFromString( paragraph.toString() ); | ||
ts = xpath.select( "//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph ); | ||
ts.forEach( function( t ) { | ||
if ( t.localName === 't' && t.childNodes.length > 0 ) { | ||
localText += t.childNodes[0].data; | ||
} else if ( t.localName === 'tab' ) { | ||
localText += ' '; | ||
} else if ( t.localName === 'br' ) { | ||
if ( preserveLineBreaks !== true ) { | ||
localText += ' '; | ||
} else { | ||
localText += '\n'; | ||
const includeRegex = /.xml$/; | ||
const excludeRegex = /^(word\/media\/|word\/_rels\/)/; | ||
function _calculateExtractedText(inText, preserveLineBreaks) { | ||
const doc = new Dom().parseFromString(inText); | ||
const ps = xpath.select("//*[local-name()='p']", doc); | ||
let text = ""; | ||
for (let paragraph of ps) { | ||
var ts; | ||
let localText = ""; | ||
paragraph = new Dom().parseFromString(paragraph.toString()); | ||
ts = xpath.select("//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph); | ||
for (const t of ts) { | ||
if (t.localName === "t" && t.childNodes.length > 0) { | ||
localText += t.childNodes[0].data; | ||
} else if (t.localName === "tab") { | ||
localText += " "; | ||
} else if (t.localName === "br") { | ||
localText += preserveLineBreaks !== true ? " " : "\n"; | ||
} | ||
} | ||
} | ||
}); | ||
text += localText + '\n'; | ||
}); | ||
text += `${localText}\n`; | ||
} | ||
return text; | ||
return text; | ||
} | ||
function extractText( filePath, options, cb ) { | ||
var result = ''; | ||
function extractText(filePath, options, cb) { | ||
let result = ""; | ||
yauzl.open( filePath, function( err, zipfile ) { | ||
var processEnd | ||
, processedEntries = 0; | ||
if ( err ) { | ||
util.yauzlError( err, cb ); | ||
return; | ||
} | ||
processEnd = function() { | ||
var text; | ||
if ( zipfile.entryCount === ++processedEntries ) { | ||
if ( result.length ) { | ||
text = _calculateExtractedText( result, options.preserveLineBreaks ); | ||
cb( null, text ); | ||
} else { | ||
cb( | ||
new Error( | ||
'Extraction could not find content in file, are you' | ||
+ ' sure it is the mime type it says it is?' | ||
), | ||
null | ||
); | ||
yauzl.open(filePath, function (err, zipfile) { | ||
let processEnd; | ||
let processedEntries = 0; | ||
if (err) { | ||
util.yauzlError(err, cb); | ||
return; | ||
} | ||
} | ||
}; | ||
zipfile.on( 'entry', function( entry ) { | ||
if ( includeRegex.test( entry.fileName ) && !excludeRegex.test( entry.fileName ) ) { | ||
util.getTextFromZipFile( zipfile, entry, function( err2, text ) { | ||
result += text + '\n'; | ||
processEnd(); | ||
processEnd = function () { | ||
let text; | ||
if (zipfile.entryCount === ++processedEntries) { | ||
if (result.length > 0) { | ||
text = _calculateExtractedText(result, options.preserveLineBreaks); | ||
cb(null, text); | ||
} else { | ||
cb( | ||
new Error( | ||
"Extraction could not find content in file, are you" + | ||
" sure it is the mime type it says it is?" | ||
), | ||
null | ||
); | ||
} | ||
} | ||
}; | ||
zipfile.on("entry", function (entry) { | ||
if (includeRegex.test(entry.fileName) && !excludeRegex.test(entry.fileName)) { | ||
util.getTextFromZipFile(zipfile, entry, function (err2, text) { | ||
result += `${text}\n`; | ||
processEnd(); | ||
}); | ||
} else { | ||
processEnd(); | ||
} | ||
}); | ||
} else { | ||
processEnd(); | ||
} | ||
}); | ||
zipfile.on( 'error', function( err3 ) { | ||
cb( err3 ); | ||
zipfile.on("error", function (err3) { | ||
cb(err3); | ||
}); | ||
}); | ||
}); | ||
} | ||
module.exports = { | ||
types: ['application/vnd.openxmlformats-officedocument.wordprocessingml.document'], | ||
extract: extractText | ||
types: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"], | ||
extract: extractText, | ||
}; |
@@ -1,35 +0,36 @@ | ||
var fs = require( 'fs' ) | ||
, iconv = require( 'iconv-lite' ) | ||
, jschardet = require( 'jschardet' ) | ||
, path = require( 'path' ); | ||
const fs = require("fs"); | ||
const path = require("path"); | ||
const iconv = require("iconv-lite"); | ||
const jschardet = require("jschardet"); | ||
function extractText( filePath, options, cb ) { | ||
fs.readFile( filePath, function( error, data ) { | ||
var encoding, decoded, detectedEncoding; | ||
if ( error ) { | ||
cb( error, null ); | ||
return; | ||
} | ||
try { | ||
detectedEncoding = jschardet.detect( data ).encoding; | ||
if ( !detectedEncoding ) { | ||
error = new Error( 'Could not detect encoding for file named [[ ' | ||
+ path.basename( filePath ) + ' ]]' ); | ||
cb( error, null ); | ||
return; | ||
} | ||
encoding = detectedEncoding.toLowerCase(); | ||
function extractText(filePath, options, cb) { | ||
fs.readFile(filePath, function (error, data) { | ||
let encoding; | ||
let decoded; | ||
let detectedEncoding; | ||
if (error) { | ||
cb(error, null); | ||
return; | ||
} | ||
try { | ||
detectedEncoding = jschardet.detect(data).encoding; | ||
if (!detectedEncoding) { | ||
error = new Error(`Could not detect encoding for file named [[ ${path.basename(filePath)} ]]`); | ||
cb(error, null); | ||
return; | ||
} | ||
encoding = detectedEncoding.toLowerCase(); | ||
decoded = iconv.decode( data, encoding ); | ||
} catch ( e ) { | ||
cb( e ); | ||
return; | ||
} | ||
cb( null, decoded ); | ||
}); | ||
decoded = iconv.decode(data, encoding); | ||
} catch (error_) { | ||
cb(error_); | ||
return; | ||
} | ||
cb(null, decoded); | ||
}); | ||
} | ||
module.exports = { | ||
types: [/text\//, 'application/csv', 'application/javascript'], | ||
extract: extractText | ||
types: [/text\//, "application/csv", "application/javascript"], | ||
extract: extractText, | ||
}; |
195
lib/index.js
@@ -1,124 +0,129 @@ | ||
var fs = require( 'fs' ) | ||
, path = require( 'path' ) | ||
, mime = require( 'mime' ) | ||
, os = require( 'os' ) | ||
, extract = require( './extract' ) | ||
, tmpDir = os.tmpdir(); | ||
const fs = require("fs"); | ||
const os = require("os"); | ||
const path = require("path"); | ||
const mime = require("mime"); | ||
const extract = require("./extract"); | ||
const tmpDir = os.tmpdir(); | ||
function _genRandom() { | ||
return Math.floor( ( Math.random() * 100000000000 ) + 1 ); | ||
return Math.floor(Math.random() * 100_000_000_000 + 1); | ||
} | ||
function _extractWithType( type, filePath, options, cb ) { | ||
fs.exists( filePath, function( exists ) { | ||
if ( exists ) { | ||
extract( type, filePath, options, cb ); | ||
} else { | ||
cb( new Error( 'File at path [[ ' + filePath + ' ]] does not exist.' ), null ); | ||
} | ||
}); | ||
/**
 * Checks that `filePath` exists and, if so, hands off to `extract`;
 * otherwise reports a "does not exist" error through `cb`.
 *
 * Uses `fs.access` rather than the deprecated `fs.exists`.
 *
 * @param {string} type mime type used to pick the extractor
 * @param {string} filePath path of the file to extract
 * @param {object} options extraction options forwarded to `extract`
 * @param {function(?Error, ?string)} cb node-style completion callback
 */
function _extractWithType(type, filePath, options, cb) {
  const reportMissing = function () {
    cb(new Error(`File at path [[ ${filePath} ]] does not exist.`), null);
  };

  try {
    fs.access(filePath, fs.constants.F_OK, function (accessError) {
      if (accessError) {
        reportMissing();
      } else {
        extract(type, filePath, options, cb);
      }
    });
  } catch {
    // fs.access rejects an invalid path argument synchronously; keep
    // reporting that as "does not exist", and still asynchronously.
    process.nextTick(reportMissing);
  }
}
function _returnArgsError( _args ) { | ||
var args = Array.prototype.slice.call( _args ) | ||
, callback; | ||
args.forEach( function( parm ) { | ||
if ( parm && typeof parm === 'function' ) { | ||
callback = parm; | ||
function _returnArgsError(_args) { | ||
const args = Array.prototype.slice.call(_args); | ||
let callback; | ||
for (const parm of args) { | ||
if (parm && typeof parm === "function") { | ||
callback = parm; | ||
} | ||
} | ||
}); | ||
if ( callback ) { | ||
callback( new Error( 'Incorrect parameters passed to textract.' ), null ); | ||
} else { | ||
// eslint-disable-next-line no-console | ||
console.error( 'textract could not find a callback function to execute.' ); | ||
} | ||
if (callback) { | ||
callback(new Error("Incorrect parameters passed to textract."), null); | ||
} else { | ||
// eslint-disable-next-line no-console | ||
console.error("textract could not find a callback function to execute."); | ||
} | ||
} | ||
function _writeBufferToDisk( buff, cb ) { | ||
var fullPath = path.join( tmpDir, 'textract_file_' + _genRandom() ); | ||
function _writeBufferToDisk(buff, cb) { | ||
const fullPath = path.join(tmpDir, `textract_file_${_genRandom()}`); | ||
fs.open( fullPath, 'w', function( err, fd ) { | ||
if ( err ) { | ||
throw new Error( 'error opening temp file: ' + err ); | ||
} else { | ||
fs.write( fd, buff, 0, buff.length, null, function( err2 ) { | ||
if ( err2 ) { | ||
throw new Error( 'error writing temp file: ' + err2 ); | ||
fs.open(fullPath, "w", function (err, fd) { | ||
if (err) { | ||
throw new Error(`error opening temp file: ${err}`); | ||
} else { | ||
fs.close( fd, function() { | ||
cb( fullPath ); | ||
}); | ||
fs.write(fd, buff, 0, buff.length, null, function (err2) { | ||
if (err2) { | ||
throw new Error(`error writing temp file: ${err2}`); | ||
} else { | ||
fs.close(fd, function () { | ||
cb(fullPath); | ||
}); | ||
} | ||
}); | ||
} | ||
}); | ||
} | ||
}); | ||
}); | ||
} | ||
function fromFileWithMimeAndPath( type, filePath, options, cb ) { | ||
var called = false; | ||
function fromFileWithMimeAndPath(type, filePath, options, cb) { | ||
let called = false; | ||
if ( typeof type === 'string' && typeof filePath === 'string' ) { | ||
if ( typeof cb === 'function' && typeof options === 'object' ) { | ||
// (mimeType, filePath, options, callback) | ||
_extractWithType( type, filePath, options, cb ); | ||
called = true; | ||
} else if ( typeof options === 'function' && cb === undefined ) { | ||
// (mimeType, filePath, callback) | ||
_extractWithType( type, filePath, {}, options ); | ||
called = true; | ||
if (typeof type === "string" && typeof filePath === "string") { | ||
if (typeof cb === "function" && typeof options === "object") { | ||
// (mimeType, filePath, options, callback) | ||
_extractWithType(type, filePath, options, cb); | ||
called = true; | ||
} else if (typeof options === "function" && cb === undefined) { | ||
// (mimeType, filePath, callback) | ||
_extractWithType(type, filePath, {}, options); | ||
called = true; | ||
} | ||
} | ||
} | ||
if ( !called ) { | ||
_returnArgsError( arguments ); | ||
} | ||
if (!called) { | ||
_returnArgsError(arguments); | ||
} | ||
} | ||
function fromFileWithPath( filePath, options, cb ) { | ||
var type; | ||
if ( typeof filePath === 'string' | ||
&& ( typeof options === 'function' || typeof cb === 'function' ) ) { | ||
type = ( options && options.typeOverride ) || mime.getType( filePath ); | ||
fromFileWithMimeAndPath( type, filePath, options, cb ); | ||
} else { | ||
_returnArgsError( arguments ); | ||
} | ||
/**
 * Extracts text from a file, deriving the mime type from
 * `options.typeOverride` when set, otherwise from the file name via `mime`.
 *
 * Accepts `(filePath, callback)` or `(filePath, options, callback)`; any
 * other argument shape is routed to `_returnArgsError`.
 *
 * @param {string} filePath path of the file to extract
 * @param {object|function} options extraction options, or the callback
 * @param {function} [cb] completion callback when options are given
 */
function fromFileWithPath(filePath, options, cb) {
  const hasCallback = typeof options === "function" || typeof cb === "function";

  if (typeof filePath !== "string" || !hasCallback) {
    _returnArgsError(arguments);
    return;
  }

  const type = options?.typeOverride || mime.getType(filePath);
  fromFileWithMimeAndPath(type, filePath, options, cb);
}
// eslint-disable-next-line no-unused-vars | ||
function fromBufferWithMime( type, bufferContent, options, cb, withPath ) { | ||
if ( typeof type === 'string' | ||
&& bufferContent | ||
&& bufferContent instanceof Buffer | ||
&& ( typeof options === 'function' || typeof cb === 'function' ) ) { | ||
if ( typeof options === 'function' ) { cb = options; options = {}; } | ||
_writeBufferToDisk( bufferContent, function( newPath ) { | ||
fromFileWithMimeAndPath( type, newPath, options, function( err, text ) { | ||
// Remove temporary file regardless of error, ignore error on unlink | ||
fs.unlink( newPath, function() {}); | ||
if ( cb ) cb( err, text ); | ||
}); | ||
}); | ||
} else { | ||
_returnArgsError( arguments ); | ||
} | ||
/**
 * Extracts text from an in-memory buffer of a known mime type.
 *
 * The buffer is written to a temporary file via `_writeBufferToDisk`,
 * extracted with `fromFileWithMimeAndPath`, and the temporary file is then
 * unlinked whether or not extraction succeeded.
 *
 * Accepts `(type, buffer, callback)` or `(type, buffer, options, callback)`;
 * any other argument shape is routed to `_returnArgsError`.
 *
 * @param {string} type mime type of the buffer content
 * @param {Buffer} bufferContent data to extract text from
 * @param {object|function} options extraction options, or the callback
 * @param {function} [cb] completion callback when options are given
 * @param {boolean} [withPath] accepted for callers that pass it; not read here
 */
// eslint-disable-next-line no-unused-vars
function fromBufferWithMime(type, bufferContent, options, cb, withPath) {
  const hasCallback = typeof options === "function" || typeof cb === "function";

  if (typeof type !== "string" || !Buffer.isBuffer(bufferContent) || !hasCallback) {
    _returnArgsError(arguments);
    return;
  }

  // Normalise the two supported call shapes without touching the parameters.
  const callbackOnly = typeof options === "function";
  const done = callbackOnly ? options : cb;
  const extractOptions = callbackOnly ? {} : options;

  _writeBufferToDisk(bufferContent, function (newPath) {
    fromFileWithMimeAndPath(type, newPath, extractOptions, function (err, text) {
      // Remove temporary file regardless of error, ignore error on unlink
      fs.unlink(newPath, function () {});
      if (done) done(err, text);
    });
  });
}
function fromBufferWithName( filePath, bufferContent, options, cb ) { | ||
var type; | ||
if ( typeof filePath === 'string' ) { | ||
type = mime.getType( filePath ); | ||
fromBufferWithMime( type, bufferContent, options, cb, true ); | ||
} else { | ||
_returnArgsError( arguments ); | ||
} | ||
/**
 * Extracts text from a buffer, using a file name only to look up the mime
 * type via `mime`; the actual work is delegated to `fromBufferWithMime`.
 *
 * @param {string} filePath file name used for mime type lookup
 * @param {Buffer} bufferContent data to extract text from
 * @param {object|function} options extraction options, or the callback
 * @param {function} [cb] completion callback when options are given
 */
function fromBufferWithName(filePath, bufferContent, options, cb) {
  if (typeof filePath !== "string") {
    _returnArgsError(arguments);
    return;
  }

  fromBufferWithMime(mime.getType(filePath), bufferContent, options, cb, true);
}
module.exports = { | ||
fromFileWithPath: fromFileWithPath, | ||
fromFileWithMimeAndPath: fromFileWithMimeAndPath, | ||
fromBufferWithName: fromBufferWithName, | ||
fromBufferWithMime: fromBufferWithMime, | ||
fromFileWithPath, | ||
fromFileWithMimeAndPath, | ||
fromBufferWithName, | ||
fromBufferWithMime, | ||
}; |
227
lib/util.js
@@ -1,85 +0,83 @@ | ||
var { exec } = require( 'child_process' ) | ||
, path = require( 'path' ) | ||
, fs = require( 'fs' ) | ||
, os = require( 'os' ) | ||
, outDir = path.join( os.tmpdir(), 'textract' ) | ||
, replacements = [ | ||
[/[\u201C|\u201D|]|“|â€/g, '"'], // fancy double quotes | ||
[/[\u2018|\u2019]|’|‘]/g, '\''], // fancy single quotes/apostrophes | ||
[/…/g, '…'], // elipses | ||
[/–|—/g, '–'] // long hyphen | ||
] | ||
, rLen = replacements.length | ||
; | ||
const { exec } = require("child_process"); | ||
const fs = require("fs"); | ||
const os = require("os"); | ||
const path = require("path"); | ||
const outDir = path.join(os.tmpdir(), "textract"); | ||
const replacements = [ | ||
[/[|\u201C\u201D]|“|â€/g, '"'], // fancy double quotes | ||
[/[|\u2018\u2019]|’|‘]/g, "'"], // fancy single quotes/apostrophes | ||
[/…/g, "…"], // elipses | ||
[/–|—/g, "–"], // long hyphen | ||
]; | ||
const rLen = replacements.length; | ||
// Up front creation of tmp dir | ||
if ( !fs.existsSync( outDir ) ) { | ||
fs.mkdirSync( outDir ); | ||
if (!fs.existsSync(outDir)) { | ||
fs.mkdirSync(outDir); | ||
} | ||
// replace nasty quotes with simple ones | ||
function replaceBadCharacters( text ) { | ||
var i, repl; | ||
for ( i = 0; i < rLen; i++ ) { | ||
repl = replacements[i]; | ||
text = text.replace( repl[0], repl[1] ); | ||
} | ||
return text; | ||
/**
 * Applies every `[pattern, replacement]` pair in `replacements` to the text,
 * in list order, and returns the result.
 *
 * @param {string} text text to clean
 * @returns {string} text with all replacements applied
 */
function replaceBadCharacters(text) {
  let cleaned = text;
  for (const [pattern, replacement] of replacements) {
    cleaned = cleaned.replace(pattern, replacement);
  }
  return cleaned;
}
function yauzlError( err, cb ) { | ||
var msg = err.message; | ||
if ( msg === 'end of central directory record signature not found' ) { | ||
msg = 'File not correctly recognized as zip file, ' + msg; | ||
} | ||
cb( new Error( msg ), null ); | ||
/**
 * Reports a zip-opening error through `cb`, prefixing the
 * "end of central directory" message with a hint that the file was not
 * recognised as a zip archive.
 *
 * @param {Error} err error whose message is reported
 * @param {function(Error, null)} cb node-style callback
 */
function yauzlError(err, cb) {
  const notAZip = err.message === "end of central directory record signature not found";
  const msg = notAZip ? `File not correctly recognized as zip file, ${err.message}` : err.message;
  cb(new Error(msg), null);
}
function createExecOptions( type, options ) { | ||
var execOptions = {}; | ||
if ( options[type] && options[type].exec ) { | ||
execOptions = options[type].exec; | ||
} else if ( options.exec ) { | ||
execOptions = options.exec; | ||
} | ||
return execOptions; | ||
/**
 * Picks the `exec` options to use for an extractor type.
 *
 * Precedence: `options[type].exec`, then `options.exec`, then a fresh empty
 * object. A missing `options` argument also yields the empty object instead
 * of throwing.
 *
 * @param {string} type key of the per-type options section
 * @param {object} [options] textract options
 * @returns {object} options to pass to `exec`
 */
function createExecOptions(type, options) {
  return options?.[type]?.exec || options?.exec || {};
}
function unzipCheck( type, cb ) { | ||
exec( | ||
'unzip', | ||
function( error /* , stdout, stderr */ ) { | ||
if ( error ) { | ||
// eslint-disable-next-line no-console | ||
console.error( 'textract: \'unzip\' does not appear to be installed, ' | ||
+ 'so textract will be unable to extract ' + type + '.' ); | ||
} | ||
cb( error === null ); | ||
} | ||
); | ||
/**
 * Probes for the `unzip` binary by executing it, logs a hint when the call
 * fails, and reports availability through `cb`.
 *
 * @param {string} type label of the format that depends on `unzip`
 * @param {function(boolean)} cb receives `true` when `exec` reported no error
 */
function unzipCheck(type, cb) {
  exec("unzip", function (error /* , stdout, stderr */) {
    if (error) {
      // eslint-disable-next-line no-console
      console.error(
        `textract: 'unzip' does not appear to be installed, so textract will be unable to extract ${type}.`
      );
    }
    cb(error === null);
  });
}
function getTextFromZipFile( zipfile, entry, cb ) { | ||
zipfile.openReadStream( entry, function( err, readStream ) { | ||
var text = '' | ||
, error = ''; | ||
if ( err ) { | ||
cb( err, null ); | ||
return; | ||
} | ||
/**
 * Reads a single entry of an opened zip file and returns its content as a
 * UTF-8 string.
 *
 * @param {Object} zipfile - Object exposing `openReadStream(entry, cb)`.
 * @param {Object} entry - The entry to read; passed to `openReadStream` as is.
 * @param {function(?Error, ?string): void} cb - Node-style callback, invoked
 *   exactly once: `(null, text)` on success, `(error, null)` on failure.
 */
function getTextFromZipFile(zipfile, entry, cb) {
    zipfile.openReadStream(entry, function (err, readStream) {
        if (err) {
            cb(err, null);
            return;
        }

        const chunks = [];
        let settled = false;

        // Guarantees a single callback invocation even if the stream emits
        // both "error" and "end".
        const finish = function (error, text) {
            if (settled) {
                return;
            }
            settled = true;
            cb(error, text);
        };

        readStream.on("data", function (chunk) {
            // Collect raw bytes and decode once at the end: decoding chunk by
            // chunk corrupts multi-byte characters split across two chunks.
            chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
        });

        readStream.on("end", function () {
            finish(null, Buffer.concat(chunks).toString("utf8"));
        });

        readStream.on("error", function (_err) {
            // Report immediately: a failed stream is not guaranteed to emit
            // "end", which previously left the callback uncalled.
            finish(_err instanceof Error ? _err : new Error(String(_err)), null);
        });
    });
}
@@ -106,56 +104,51 @@ | ||
*/ | ||
function runExecIntoFile( label, filePath, options, execOptions, genCommand, cb ) { | ||
// escape the file paths | ||
var fileTempOutPath = path.join( outDir, path.basename( filePath, path.extname( filePath ) ) ) | ||
, escapedFilePath = filePath.replace( /\s/g, '\\ ' ) | ||
, escapedFileTempOutPath = fileTempOutPath.replace( /\s/g, '\\ ' ) | ||
, cmd = genCommand( options, escapedFilePath, escapedFileTempOutPath ); | ||
exec( | ||
cmd, | ||
execOptions, | ||
function( error /* , stdout, stderr */ ) { | ||
if ( error !== null ) { | ||
error = new Error( 'Error extracting [[ ' | ||
+ path.basename( filePath ) + ' ]], exec error: ' + error.message ); | ||
cb( error, null ); | ||
return; | ||
} | ||
/**
 * Runs an external command that writes its result to a temporary `.txt`
 * file, then reads that file, deletes it and returns its content.
 *
 * The temp file lives in `outDir` and is named after `filePath`'s base name
 * (extension stripped) with `.txt` appended.
 *
 * @param {string} label - Name of the tool, used only in error messages.
 * @param {string} filePath - Path of the input file.
 * @param {Object} options - Passed through to `genCommand`.
 * @param {Object} execOptions - Passed through to `exec`.
 * @param {function(Object, string, string): string} genCommand - Builds the
 *   command line from `(options, escapedInputPath, escapedOutputPathNoExt)`.
 * @param {function(?Error, ?string): void} cb - Node-style callback:
 *   `(null, text)` on success, `(error, null)` on any failure.
 */
function runExecIntoFile(label, filePath, options, execOptions, genCommand, cb) {
    const fileTempOutPath = path.join(outDir, path.basename(filePath, path.extname(filePath)));
    const outputFile = `${fileTempOutPath}.txt`;

    // NOTE(review): only whitespace is escaped here, so other shell
    // metacharacters in the file name reach the shell untouched. Left as is
    // because genCommand implementations depend on this exact escaping;
    // callers must not pass untrusted file names.
    const escapedFilePath = filePath.replace(/\s/g, "\\ ");
    const escapedFileTempOutPath = fileTempOutPath.replace(/\s/g, "\\ ");
    const cmd = genCommand(options, escapedFilePath, escapedFileTempOutPath);

    exec(cmd, execOptions, function (error /* , stdout, stderr */) {
        if (error !== null) {
            cb(new Error(`Error extracting [[ ${path.basename(filePath)} ]], exec error: ${error.message}`), null);
            return;
        }

        // Read directly instead of fs.exists + readFile: fs.exists is
        // deprecated and the check-then-read sequence is racy. A missing
        // file surfaces as ENOENT and keeps its dedicated message.
        fs.readFile(outputFile, "utf8", function (readError, text) {
            if (readError) {
                const reason =
                    readError.code === "ENOENT" ? "file does not exist" : `error: ${readError.message}`;
                cb(new Error(`Error reading ${label} output at [[ ${fileTempOutPath} ]], ${reason}`), null);
                return;
            }

            fs.unlink(outputFile, function (unlinkError) {
                if (unlinkError) {
                    cb(
                        new Error(
                            `Error, ${label} , cleaning up temp file [[ ${fileTempOutPath} ]], error: ${unlinkError.message}`
                        ),
                        null
                    );
                    return;
                }
                cb(null, text.toString());
            });
        });
    });
}
module.exports = { | ||
createExecOptions: createExecOptions, | ||
unzipCheck: unzipCheck, | ||
getTextFromZipFile: getTextFromZipFile, | ||
yauzlError: yauzlError, | ||
runExecIntoFile: runExecIntoFile, | ||
replaceBadCharacters: replaceBadCharacters | ||
createExecOptions, | ||
unzipCheck, | ||
getTextFromZipFile, | ||
yauzlError, | ||
runExecIntoFile, | ||
replaceBadCharacters, | ||
}; |
{ | ||
"name": "@nosferatu500/textract-lite", | ||
"version": "3.0.3", | ||
"version": "3.0.4", | ||
"homepage": "https://github.com/nosferatu500/textract-lite", | ||
@@ -27,4 +27,12 @@ "description": "Extracting text from files of various type including html, pdf, doc, docx, xls, xlsx, csv, pptx, png, jpg, gif, rtf, text/*, and various open office.", | ||
], | ||
"prettier": { | ||
"semi": true, | ||
"trailingComma": "es5", | ||
"singleQuote": false, | ||
"bracketSameLine": false, | ||
"tabWidth": 4, | ||
"printWidth": 120 | ||
}, | ||
"dependencies": { | ||
"@xmldom/xmldom": "^0.7.5", | ||
"@xmldom/xmldom": "^0.8.1", | ||
"html-entities": "2.3.2", | ||
@@ -39,13 +47,20 @@ "iconv-lite": "0.6.3", | ||
"devDependencies": { | ||
"chai": "4.3.4", | ||
"eslint": "^8.3.0", | ||
"eslint-config-airbnb": "^19.0.0", | ||
"eslint-plugin-import": "^2.25.3", | ||
"chai": "^4.3.6", | ||
"eslint": "^8.9.0", | ||
"eslint-config-airbnb": "^19.0.4", | ||
"eslint-config-prettier": "^8.4.0", | ||
"eslint-plugin-import": "^2.25.4", | ||
"eslint-plugin-jsx-a11y": "^6.5.1", | ||
"eslint-plugin-react": "^7.27.1", | ||
"mocha": "^9.1.3" | ||
"eslint-plugin-prettier": "^4.0.0", | ||
"eslint-plugin-promise": "^6.0.0", | ||
"eslint-plugin-react": "^7.28.0", | ||
"eslint-plugin-sonarjs": "^0.12.0", | ||
"eslint-plugin-unicorn": "^41.0.0", | ||
"mocha": "^9.2.1", | ||
"prettier": "^2.5.1" | ||
}, | ||
"scripts": { | ||
"test": "node_modules/.bin/mocha", | ||
"lint": "node_modules/.bin/eslint -c .eslintrc.json lib --fix" | ||
"prettier": "prettier --write '{lib,test}/**/*.js'", | ||
"lint": "yarn prettier && node_modules/.bin/eslint -c .eslintrc.json lib --fix" | ||
}, | ||
@@ -52,0 +67,0 @@ "license": "MIT", |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Shell access
Supply chain risk: This module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Shell access
Supply chain risk: This module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
45679
618
13
2
+ Added: @xmldom/xmldom@0.8.10 (transitive)
- Removed: @xmldom/xmldom@0.7.13 (transitive)
Updated: @xmldom/xmldom@^0.8.1