@nosferatu500/textract-lite
Advanced tools
Comparing version 6.0.2 to 7.0.0
@@ -1,15 +0,10 @@ | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.extract = void 0; | ||
const fs_1 = __importDefault(require("fs")); | ||
const path_1 = __importDefault(require("path")); | ||
const extractorPath = path_1.default.join(__dirname, "extractors"); | ||
import fs from "fs"; | ||
import path from "path"; | ||
import { fileURLToPath } from "node:url"; | ||
const __filename = fileURLToPath(import.meta.url); | ||
const __dirname = path.dirname(__filename); | ||
const extractorPath = path.join(__dirname, "extractors"); | ||
const typeExtractors = {}; | ||
const regexExtractors = []; | ||
const failedExtractorTypes = {}; | ||
let totalExtractors = 0; | ||
let satisfiedExtractors = 0; | ||
let hasInitialized = false; | ||
@@ -29,39 +24,15 @@ function registerExtractor(extractor) { | ||
} | ||
function registerFailedExtractor(extractor, failedMessage) { | ||
if (extractor.types) { | ||
for (const type of extractor.types) { | ||
failedExtractorTypes[type.toLowerCase()] = failedMessage; | ||
} | ||
} | ||
} | ||
function testExtractor(extractor, options) { | ||
extractor.test(options, function (passedTest, failedMessage) { | ||
satisfiedExtractors++; | ||
if (passedTest) { | ||
registerExtractor(extractor.default); | ||
} | ||
else { | ||
registerFailedExtractor(extractor, failedMessage); | ||
} | ||
}); | ||
} | ||
function initializeExtractors(options) { | ||
async function initializeExtractors() { | ||
hasInitialized = true; | ||
// discover available extractors | ||
const extractors = fs_1.default.readdirSync(extractorPath).map(function (item) { | ||
const fullExtractorPath = path_1.default.join(extractorPath, item); | ||
const extractors = await Promise.all(fs.readdirSync(extractorPath).map(async (item) => { | ||
const fullExtractorPath = path.join(extractorPath, item); | ||
// get the extractor | ||
// eslint-disable-next-line global-require | ||
return require(fullExtractorPath); | ||
}); | ||
const { default: extractor } = await import(fullExtractorPath); | ||
return extractor; | ||
})); | ||
// perform any binary tests to ensure extractor is possible | ||
// given execution environment | ||
for (const extractor of extractors) { | ||
if (extractor.test) { | ||
testExtractor(extractor, options); | ||
} | ||
else { | ||
satisfiedExtractors++; | ||
registerExtractor(extractor.default); | ||
} | ||
registerExtractor(extractor); | ||
} | ||
@@ -90,3 +61,3 @@ // need to keep track of how many extractors we have in total | ||
} | ||
async function extract(type, filePath, options) { | ||
export async function extract(type, filePath, options) { | ||
let error; | ||
@@ -96,32 +67,14 @@ let msg; | ||
if (!hasInitialized) { | ||
initializeExtractors(options); | ||
await initializeExtractors(); | ||
} | ||
// registration of extractors complete? | ||
if (totalExtractors === satisfiedExtractors) { | ||
theExtractor = findExtractor(type); | ||
if (theExtractor) { | ||
return theExtractor(filePath, options); | ||
} | ||
else { | ||
// cannot extract this file type | ||
msg = `Error for type: [[ ${type} ]], file: [[ ${filePath} ]]`; | ||
// update error message if type is supported but just not configured/installed properly | ||
if (failedExtractorTypes[type]) { | ||
msg += | ||
`, extractor for type exists, but failed to initialize.` + | ||
` Message: ${failedExtractorTypes[type]}`; | ||
} | ||
error = new Error(msg); | ||
return error; | ||
} | ||
theExtractor = findExtractor(type); | ||
if (theExtractor) { | ||
return theExtractor(filePath, options); | ||
} | ||
else { | ||
// async registration has not wrapped up | ||
// try again later | ||
setTimeout(function () { | ||
extract(type, filePath, options); | ||
}, 100); | ||
// cannot extract this file type | ||
msg = `Error for type: [[ ${type} ]], file: [[ ${filePath} ]]`; | ||
error = new Error(msg); | ||
return error; | ||
} | ||
return new Error("Something went wrong."); | ||
} | ||
exports.extract = extract; |
@@ -1,10 +0,5 @@ | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
const xpath_1 = __importDefault(require("xpath")); | ||
const xmldom_1 = require("@xmldom/xmldom"); | ||
const yauzl_1 = __importDefault(require("yauzl")); | ||
const utils_1 = require("../utils"); | ||
import xpath from "xpath"; | ||
import { DOMParser } from "@xmldom/xmldom"; | ||
import yauzl from "yauzl"; | ||
import { yauzlError, getTextFromZipFile, cleanseText } from "../utils"; | ||
const includeRegex = /.xml$/; | ||
@@ -15,9 +10,9 @@ const excludeRegex = /^(word\/media\/|word\/_rels\/)/; | ||
inText = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' + '<Properties>' + inText + '</Properties>'; | ||
const doc = new xmldom_1.DOMParser().parseFromString(inText); | ||
const ps = xpath_1.default.select("//*[local-name()='p']", doc); | ||
const doc = new DOMParser().parseFromString(inText); | ||
const ps = xpath.select("//*[local-name()='p']", doc); | ||
let text = ""; | ||
for (let paragraph of ps) { | ||
let localText = ""; | ||
paragraph = new xmldom_1.DOMParser().parseFromString(paragraph.toString()); | ||
const ts = xpath_1.default.select("//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph); | ||
paragraph = new DOMParser().parseFromString(paragraph.toString()); | ||
const ts = xpath.select("//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph); | ||
for (const t of ts) { | ||
@@ -41,6 +36,6 @@ if (t.localName === "t" && t.childNodes.length > 0) { | ||
return new Promise((resolve, reject) => { | ||
yauzl_1.default.open(filePath, function (err, zipfile) { | ||
yauzl.open(filePath, function (err, zipfile) { | ||
let processedEntries = 0; | ||
if (err) { | ||
(0, utils_1.yauzlError)(err, resolve); | ||
yauzlError(err, resolve); | ||
return; | ||
@@ -53,3 +48,3 @@ } | ||
text = _calculateExtractedText(result, options.preserveLineBreaks); | ||
text = (0, utils_1.cleanseText)(options, text); | ||
text = cleanseText(options, text); | ||
resolve(text); | ||
@@ -65,3 +60,3 @@ } | ||
if (includeRegex.test(entry.fileName) && !excludeRegex.test(entry.fileName)) { | ||
(0, utils_1.getTextFromZipFile)(zipfile, entry, function (_, text) { | ||
getTextFromZipFile(zipfile, entry, function (_, text) { | ||
// Security workaround for xmldom >= v0.8.4 | ||
@@ -82,5 +77,5 @@ result += `${text}\n`.replace('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\r\n', ''); | ||
} | ||
exports.default = { | ||
export default { | ||
types: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"], | ||
extract: extractText, | ||
}; |
@@ -1,14 +0,9 @@ | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
const fs_1 = __importDefault(require("fs")); | ||
const path_1 = __importDefault(require("path")); | ||
const iconv_lite_1 = __importDefault(require("iconv-lite")); | ||
const jschardet_1 = __importDefault(require("jschardet")); | ||
const utils_1 = require("../utils"); | ||
import fs from "fs"; | ||
import path from "path"; | ||
import iconv from "iconv-lite"; | ||
import jschardet from "jschardet"; | ||
import { cleanseText } from "../utils"; | ||
function extractText(filePath, options) { | ||
return new Promise((resolve, reject) => { | ||
fs_1.default.readFile(filePath, function (error, data) { | ||
fs.readFile(filePath, function (error, data) { | ||
let encoding; | ||
@@ -22,5 +17,5 @@ let decoded; | ||
try { | ||
detectedEncoding = jschardet_1.default.detect(data).encoding; | ||
detectedEncoding = jschardet.detect(data).encoding; | ||
if (!detectedEncoding) { | ||
error = new Error(`Could not detect encoding for file named [[ ${path_1.default.basename(filePath)} ]]`); | ||
error = new Error(`Could not detect encoding for file named [[ ${path.basename(filePath)} ]]`); | ||
resolve(error); | ||
@@ -30,4 +25,4 @@ return; | ||
encoding = detectedEncoding.toLowerCase(); | ||
decoded = iconv_lite_1.default.decode(data, encoding); | ||
decoded = (0, utils_1.cleanseText)(options, decoded); | ||
decoded = iconv.decode(data, encoding); | ||
decoded = cleanseText(options, decoded); | ||
} | ||
@@ -42,5 +37,5 @@ catch (error_) { | ||
} | ||
exports.default = { | ||
export default { | ||
types: [/text\//, "application/csv", "application/javascript"], | ||
extract: extractText, | ||
}; |
@@ -1,21 +0,13 @@ | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.fromFileWithPath = exports.fromFileWithMimeAndPath = void 0; | ||
const fs_1 = __importDefault(require("fs")); | ||
const mime_1 = __importDefault(require("mime")); | ||
const extract_1 = require("./extract"); | ||
async function fromFileWithMimeAndPath(type, filePath, options) { | ||
if (fs_1.default.existsSync(filePath)) { | ||
return (0, extract_1.extract)(type, filePath, options); | ||
import fs from "fs"; | ||
import mime from "mime"; | ||
import { extract } from "./extract"; | ||
export async function fromFileWithMimeAndPath(type, filePath, options) { | ||
if (fs.existsSync(filePath)) { | ||
return extract(type, filePath, options); | ||
} | ||
return new Error(`File at path [[ ${filePath} ]] does not exist.`); | ||
} | ||
exports.fromFileWithMimeAndPath = fromFileWithMimeAndPath; | ||
async function fromFileWithPath(filePath, options) { | ||
const type = (options?.typeOverride) || mime_1.default.getType(filePath); | ||
export async function fromFileWithPath(filePath, options) { | ||
const type = (options?.typeOverride) || mime.getType(filePath); | ||
return fromFileWithMimeAndPath(type, filePath, options); | ||
} | ||
exports.fromFileWithPath = fromFileWithPath; |
@@ -1,19 +0,13 @@ | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.cleanseText = exports.getTextFromZipFile = exports.yauzlError = void 0; | ||
const html_entities_1 = require("html-entities"); | ||
const fs_1 = __importDefault(require("fs")); | ||
const os_1 = __importDefault(require("os")); | ||
const path_1 = __importDefault(require("path")); | ||
import { decode } from "html-entities"; | ||
import fs from "fs"; | ||
import os from "os"; | ||
import path from "path"; | ||
const STRIP_ONLY_SINGLE_LINEBREAKS = /(^|[^\n])\n(?!\n)/g; | ||
const WHITELIST_PRESERVE_LINEBREAKS = /[^\d\n\r !"#$%&'-\w'()-_`a-z{|}~\u0080-\u1FFF\u2013–\u2014\u2015\u2018\u2019\u201C\u201D„\u2026\u20AC\u2116\u2C00-\uD7FF\uFB50\uFDFF\uFE70\uFEFF\uFF01-\uFFE6]*/g; | ||
const WHITELIST_STRIP_LINEBREAKS = /[^\d !"#$%&'-\w'()-_`a-z{|}~\u0080-\u1FFF\u2013–\u2014\u2015\u2018\u2019\u201C\u201D„\u2026\u20AC\u2116\u2C00-\uD7FF\uFB50\uFDFF\uFE70\uFEFF\uFF01-\uFFE6]*/g; | ||
const outDir = path_1.default.join(os_1.default.tmpdir(), "textract"); | ||
const outDir = path.join(os.tmpdir(), "textract"); | ||
const replacements = [ | ||
[/[\u201C\u201D]|“|â€/g, '"'], | ||
[/[\u2018\u2019]|’|‘]/g, "'"], | ||
[/…/g, "…"], | ||
[/[\u201C\u201D]|“|â€/g, '"'], // fancy double quotes | ||
[/[\u2018\u2019]|’|‘]/g, "'"], // fancy single quotes/apostrophes | ||
[/…/g, "…"], // elipses | ||
[/–|—/g, "–"], // long hyphen | ||
@@ -23,4 +17,4 @@ ]; | ||
// Up front creation of tmp dir | ||
if (!fs_1.default.existsSync(outDir)) { | ||
fs_1.default.mkdirSync(outDir); | ||
if (!fs.existsSync(outDir)) { | ||
fs.mkdirSync(outDir); | ||
} | ||
@@ -37,3 +31,3 @@ // replace nasty quotes with simple ones | ||
} | ||
function yauzlError(err, cb) { | ||
export function yauzlError(err, cb) { | ||
let msg = err.message; | ||
@@ -45,4 +39,3 @@ if (msg === "end of central directory record signature not found") { | ||
} | ||
exports.yauzlError = yauzlError; | ||
function getTextFromZipFile(zipfile, entry, cb) { | ||
export function getTextFromZipFile(zipfile, entry, cb) { | ||
zipfile.openReadStream(entry, function (err, readStream) { | ||
@@ -71,4 +64,3 @@ let text = ""; | ||
} | ||
exports.getTextFromZipFile = getTextFromZipFile; | ||
function cleanseText(options, text) { | ||
export function cleanseText(options, text) { | ||
// clean up text | ||
@@ -87,5 +79,4 @@ text = replaceBadCharacters(text); | ||
text = text.replace(/ (?! )/g, "").replace(/[\t\v \u00A0]{2,}/g, " "); | ||
text = (0, html_entities_1.decode)(text, { level: "xml" }); | ||
text = decode(text, { level: "xml" }); | ||
return text; | ||
} | ||
exports.cleanseText = cleanseText; |
{ | ||
"name": "@nosferatu500/textract-lite", | ||
"version": "6.0.2", | ||
"version": "7.0.0", | ||
"type": "module", | ||
"homepage": "https://github.com/nosferatu500/textract-lite", | ||
@@ -24,32 +25,30 @@ "description": "Extracting text from files of various type including txt, doc, docx.", | ||
"lint": "eslint src/ --ext .js,.jsx,.ts,.tsx --cache", | ||
"test": "mocha --require ts-node/register test/**/*.test.ts --exit", | ||
"test": "mocha", | ||
"clean": "rm -rf dist build package", | ||
"docs": "typedoc --entryPoints src/index.ts", | ||
"build": "yarn clean && tsc -p tsconfig.json" | ||
"clean-types": "rimraf dist/extractors/docx.d.ts && rimraf dist/extractors/text.d.ts && rimraf dist/extract.d.ts && rimraf dist/utils.d.ts", | ||
"build": "npm run clean && tsc -p tsconfig.json && npm run clean-types" | ||
}, | ||
"dependencies": { | ||
"@xmldom/xmldom": "^0.8.10", | ||
"html-entities": "^2.4.0", | ||
"html-entities": "^2.5.2", | ||
"iconv-lite": "^0.6.3", | ||
"jschardet": "^3.0.0", | ||
"mime": "^3.0.0", | ||
"xpath": "^0.0.33", | ||
"yauzl": "^2.10.0" | ||
"jschardet": "^3.1.2", | ||
"mime": "^4.0.1", | ||
"xpath": "^0.0.34", | ||
"yauzl": "^3.1.2" | ||
}, | ||
"devDependencies": { | ||
"@types/chai": "^4.3.6", | ||
"@types/chai-as-promised": "^7.1.6", | ||
"@types/mime": "^3.0.2", | ||
"@types/mocha": "^10.0.2", | ||
"@types/node": "^18.18.1", | ||
"@types/yauzl": "^2.10.1", | ||
"@typescript-eslint/eslint-plugin": "^6.7.3", | ||
"@typescript-eslint/parser": "^6.7.3", | ||
"chai": "^4.3.10", | ||
"chai-as-promised": "^7.1.1", | ||
"eslint": "^8.50.0", | ||
"mocha": "^10.2.0", | ||
"ts-node": "^10.9.1", | ||
"typedoc": "^0.25.1", | ||
"typescript": "^5.2.2" | ||
"@types/chai": "^4.3.14", | ||
"@types/mocha": "^10.0.6", | ||
"@types/node": "~20.9.5", | ||
"@types/yauzl": "^2.10.3", | ||
"@typescript-eslint/eslint-plugin": "^7.6.0", | ||
"@typescript-eslint/parser": "^7.6.0", | ||
"chai": "^5.1.0", | ||
"eslint": "^8.57.0", | ||
"mocha": "^10.4.0", | ||
"ts-node": "^10.9.2", | ||
"typedoc": "^0.25.13", | ||
"typescript": "^5.4.5" | ||
}, | ||
@@ -56,0 +55,0 @@ "license": "MIT", |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Dynamic require
Supply chain risk: Dynamic require can indicate that the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package
12
10
4
Yes
20494
292
+ Added mime@4.0.6 (transitive)
+ Added xpath@0.0.34 (transitive)
+ Added yauzl@3.2.0 (transitive)
- Removed fd-slicer@1.1.0 (transitive)
- Removed mime@3.0.0 (transitive)
- Removed xpath@0.0.33 (transitive)
- Removed yauzl@2.10.0 (transitive)
Updated html-entities@^2.5.2
Updated jschardet@^3.1.2
Updated mime@^4.0.1
Updated xpath@^0.0.34
Updated yauzl@^3.1.2