@nosferatu500/textract-lite - npm Package Compare versions

Comparing version 6.0.2 to 7.0.0

.mocharc.json

dist/extract.js

		@@ -1,15 +0,10 @@
		"use strict";
		var __importDefault = (this && this.__importDefault) || function (mod) {
		return (mod && mod.__esModule) ? mod : { "default": mod };
		};
		Object.defineProperty(exports, "__esModule", { value: true });
		exports.extract = void 0;
		const fs_1 = __importDefault(require("fs"));
		const path_1 = __importDefault(require("path"));
		const extractorPath = path_1.default.join(__dirname, "extractors");
		import fs from "fs";
		import path from "path";
		import { fileURLToPath } from "node:url";
		const __filename = fileURLToPath(import.meta.url);
		const __dirname = path.dirname(__filename);
		const extractorPath = path.join(__dirname, "extractors");
		const typeExtractors = {};
		const regexExtractors = [];
		const failedExtractorTypes = {};
		let totalExtractors = 0;
		let satisfiedExtractors = 0;
		let hasInitialized = false;
		@@ -29,39 +24,15 @@ function registerExtractor(extractor) {
		}
		function registerFailedExtractor(extractor, failedMessage) {
		if (extractor.types) {
		for (const type of extractor.types) {
		failedExtractorTypes[type.toLowerCase()] = failedMessage;
		}
		}
		}
		function testExtractor(extractor, options) {
		extractor.test(options, function (passedTest, failedMessage) {
		satisfiedExtractors++;
		if (passedTest) {
		registerExtractor(extractor.default);
		}
		else {
		registerFailedExtractor(extractor, failedMessage);
		}
		});
		}
		function initializeExtractors(options) {
		async function initializeExtractors() {
		hasInitialized = true;
		// discover available extractors
		const extractors = fs_1.default.readdirSync(extractorPath).map(function (item) {
		const fullExtractorPath = path_1.default.join(extractorPath, item);
		const extractors = await Promise.all(fs.readdirSync(extractorPath).map(async (item) => {
		const fullExtractorPath = path.join(extractorPath, item);
		// get the extractor
		// eslint-disable-next-line global-require
		return require(fullExtractorPath);
		});
		const { default: extractor } = await import(fullExtractorPath);
		return extractor;
		}));
		// perform any binary tests to ensure extractor is possible
		// given execution environment
		for (const extractor of extractors) {
		if (extractor.test) {
		testExtractor(extractor, options);
		}
		else {
		satisfiedExtractors++;
		registerExtractor(extractor.default);
		}
		registerExtractor(extractor);
		}
		@@ -90,3 +61,3 @@ // need to keep track of how many extractors we have in total
		}
		async function extract(type, filePath, options) {
		export async function extract(type, filePath, options) {
		let error;
		@@ -96,32 +67,14 @@ let msg;
		if (!hasInitialized) {
		initializeExtractors(options);
		await initializeExtractors();
		}
		// registration of extractors complete?
		if (totalExtractors === satisfiedExtractors) {
		theExtractor = findExtractor(type);
		if (theExtractor) {
		return theExtractor(filePath, options);
		}
		else {
		// cannot extract this file type
		msg = `Error for type: [[ ${type} ]], file: [[ ${filePath} ]]`;
		// update error message if type is supported but just not configured/installed properly
		if (failedExtractorTypes[type]) {
		msg +=
		`, extractor for type exists, but failed to initialize.` +
		` Message: ${failedExtractorTypes[type]}`;
		}
		error = new Error(msg);
		return error;
		}
		theExtractor = findExtractor(type);
		if (theExtractor) {
		return theExtractor(filePath, options);
		}
		else {
		// async registration has not wrapped up
		// try again later
		setTimeout(function () {
		extract(type, filePath, options);
		}, 100);
		// cannot extract this file type
		msg = `Error for type: [[ ${type} ]], file: [[ ${filePath} ]]`;
		error = new Error(msg);
		return error;
		}
		return new Error("Something went wrong.");
		}
		exports.extract = extract;

dist/extractors/docx.js

		@@ -1,10 +0,5 @@
		"use strict";
		var __importDefault = (this && this.__importDefault) || function (mod) {
		return (mod && mod.__esModule) ? mod : { "default": mod };
		};
		Object.defineProperty(exports, "__esModule", { value: true });
		const xpath_1 = __importDefault(require("xpath"));
		const xmldom_1 = require("@xmldom/xmldom");
		const yauzl_1 = __importDefault(require("yauzl"));
		const utils_1 = require("../utils");
		import xpath from "xpath";
		import { DOMParser } from "@xmldom/xmldom";
		import yauzl from "yauzl";
		import { yauzlError, getTextFromZipFile, cleanseText } from "../utils";
		const includeRegex = /.xml$/;
		@@ -15,9 +10,9 @@ const excludeRegex = /^(word\/media\/|word\/_rels\/)/;
		inText = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' + '<Properties>' + inText + '</Properties>';
		const doc = new xmldom_1.DOMParser().parseFromString(inText);
		const ps = xpath_1.default.select("//*[local-name()='p']", doc);
		const doc = new DOMParser().parseFromString(inText);
		const ps = xpath.select("//*[local-name()='p']", doc);
		let text = "";
		for (let paragraph of ps) {
		let localText = "";
		paragraph = new xmldom_1.DOMParser().parseFromString(paragraph.toString());
		const ts = xpath_1.default.select("//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph);
		paragraph = new DOMParser().parseFromString(paragraph.toString());
		const ts = xpath.select("//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph);
		for (const t of ts) {
		@@ -41,6 +36,6 @@ if (t.localName === "t" && t.childNodes.length > 0) {
		return new Promise((resolve, reject) => {
		yauzl_1.default.open(filePath, function (err, zipfile) {
		yauzl.open(filePath, function (err, zipfile) {
		let processedEntries = 0;
		if (err) {
		(0, utils_1.yauzlError)(err, resolve);
		yauzlError(err, resolve);
		return;
		@@ -53,3 +48,3 @@ }
		text = _calculateExtractedText(result, options.preserveLineBreaks);
		text = (0, utils_1.cleanseText)(options, text);
		text = cleanseText(options, text);
		resolve(text);
		@@ -65,3 +60,3 @@ }
		if (includeRegex.test(entry.fileName) && !excludeRegex.test(entry.fileName)) {
		(0, utils_1.getTextFromZipFile)(zipfile, entry, function (_, text) {
		getTextFromZipFile(zipfile, entry, function (_, text) {
		// Security workaround for xmldom >= v0.8.4
		@@ -82,5 +77,5 @@ result += `${text}\n`.replace('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\r\n', '');
		}
		exports.default = {
		export default {
		types: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
		extract: extractText,
		};

dist/extractors/text.js

		@@ -1,14 +0,9 @@
		"use strict";
		var __importDefault = (this && this.__importDefault) || function (mod) {
		return (mod && mod.__esModule) ? mod : { "default": mod };
		};
		Object.defineProperty(exports, "__esModule", { value: true });
		const fs_1 = __importDefault(require("fs"));
		const path_1 = __importDefault(require("path"));
		const iconv_lite_1 = __importDefault(require("iconv-lite"));
		const jschardet_1 = __importDefault(require("jschardet"));
		const utils_1 = require("../utils");
		import fs from "fs";
		import path from "path";
		import iconv from "iconv-lite";
		import jschardet from "jschardet";
		import { cleanseText } from "../utils";
		function extractText(filePath, options) {
		return new Promise((resolve, reject) => {
		fs_1.default.readFile(filePath, function (error, data) {
		fs.readFile(filePath, function (error, data) {
		let encoding;
		@@ -22,5 +17,5 @@ let decoded;
		try {
		detectedEncoding = jschardet_1.default.detect(data).encoding;
		detectedEncoding = jschardet.detect(data).encoding;
		if (!detectedEncoding) {
		error = new Error(`Could not detect encoding for file named [[ ${path_1.default.basename(filePath)} ]]`);
		error = new Error(`Could not detect encoding for file named [[ ${path.basename(filePath)} ]]`);
		resolve(error);
		@@ -30,4 +25,4 @@ return;
		encoding = detectedEncoding.toLowerCase();
		decoded = iconv_lite_1.default.decode(data, encoding);
		decoded = (0, utils_1.cleanseText)(options, decoded);
		decoded = iconv.decode(data, encoding);
		decoded = cleanseText(options, decoded);
		}
		@@ -42,5 +37,5 @@ catch (error_) {
		}
		exports.default = {
		export default {
		types: [/text\//, "application/csv", "application/javascript"],
		extract: extractText,
		};

dist/index.js

		@@ -1,21 +0,13 @@
		"use strict";
		var __importDefault = (this && this.__importDefault) || function (mod) {
		return (mod && mod.__esModule) ? mod : { "default": mod };
		};
		Object.defineProperty(exports, "__esModule", { value: true });
		exports.fromFileWithPath = exports.fromFileWithMimeAndPath = void 0;
		const fs_1 = __importDefault(require("fs"));
		const mime_1 = __importDefault(require("mime"));
		const extract_1 = require("./extract");
		async function fromFileWithMimeAndPath(type, filePath, options) {
		if (fs_1.default.existsSync(filePath)) {
		return (0, extract_1.extract)(type, filePath, options);
		import fs from "fs";
		import mime from "mime";
		import { extract } from "./extract";
		export async function fromFileWithMimeAndPath(type, filePath, options) {
		if (fs.existsSync(filePath)) {
		return extract(type, filePath, options);
		}
		return new Error(`File at path [[ ${filePath} ]] does not exist.`);
		}
		exports.fromFileWithMimeAndPath = fromFileWithMimeAndPath;
		async function fromFileWithPath(filePath, options) {
		const type = (options?.typeOverride) || mime_1.default.getType(filePath);
		export async function fromFileWithPath(filePath, options) {
		const type = (options?.typeOverride) || mime.getType(filePath);
		return fromFileWithMimeAndPath(type, filePath, options);
		}
		exports.fromFileWithPath = fromFileWithPath;

dist/utils.js

		@@ -1,19 +0,13 @@
		"use strict";
		var __importDefault = (this && this.__importDefault) || function (mod) {
		return (mod && mod.__esModule) ? mod : { "default": mod };
		};
		Object.defineProperty(exports, "__esModule", { value: true });
		exports.cleanseText = exports.getTextFromZipFile = exports.yauzlError = void 0;
		const html_entities_1 = require("html-entities");
		const fs_1 = __importDefault(require("fs"));
		const os_1 = __importDefault(require("os"));
		const path_1 = __importDefault(require("path"));
		import { decode } from "html-entities";
		import fs from "fs";
		import os from "os";
		import path from "path";
		const STRIP_ONLY_SINGLE_LINEBREAKS = /(^|[^\n])\n(?!\n)/g;
		const WHITELIST_PRESERVE_LINEBREAKS = /[^\d\n\r !"#$%&'-\w'()-_`a-z{|}~\u0080-\u1FFF\u2013–\u2014\u2015\u2018\u2019\u201C\u201D„\u2026\u20AC\u2116\u2C00-\uD7FF\uFB50\uFDFF\uFE70\uFEFF\uFF01-\uFFE6]*/g;
		const WHITELIST_STRIP_LINEBREAKS = /[^\d !"#$%&'-\w'()-_`a-z{|}~\u0080-\u1FFF\u2013–\u2014\u2015\u2018\u2019\u201C\u201D„\u2026\u20AC\u2116\u2C00-\uD7FF\uFB50\uFDFF\uFE70\uFEFF\uFF01-\uFFE6]*/g;
		const outDir = path_1.default.join(os_1.default.tmpdir(), "textract");
		const outDir = path.join(os.tmpdir(), "textract");
		const replacements = [
		[/[\u201C\u201D]\|â€œ\|â€/g, '"'],
		[/[\u2018\u2019]|â€™|â€˜]/g, "'"],
		[/â€¦/g, "…"],
		[/[\u201C\u201D]\|â€œ\|â€/g, '"'], // fancy double quotes
		[/[\u2018\u2019]|â€™|â€˜]/g, "'"], // fancy single quotes/apostrophes
		[/â€¦/g, "…"], // elipses
		[/â€“|â€”/g, "–"], // long hyphen
		@@ -23,4 +17,4 @@ ];
		// Up front creation of tmp dir
		if (!fs_1.default.existsSync(outDir)) {
		fs_1.default.mkdirSync(outDir);
		if (!fs.existsSync(outDir)) {
		fs.mkdirSync(outDir);
		}
		@@ -37,3 +31,3 @@ // replace nasty quotes with simple ones
		}
		function yauzlError(err, cb) {
		export function yauzlError(err, cb) {
		let msg = err.message;
		@@ -45,4 +39,3 @@ if (msg === "end of central directory record signature not found") {
		}
		exports.yauzlError = yauzlError;
		function getTextFromZipFile(zipfile, entry, cb) {
		export function getTextFromZipFile(zipfile, entry, cb) {
		zipfile.openReadStream(entry, function (err, readStream) {
		@@ -71,4 +64,3 @@ let text = "";
		}
		exports.getTextFromZipFile = getTextFromZipFile;
		function cleanseText(options, text) {
		export function cleanseText(options, text) {
		// clean up text
		@@ -87,5 +79,4 @@ text = replaceBadCharacters(text);
		text = text.replace(/ (?! )/g, "").replace(/[\t\v \u00A0]{2,}/g, " ");
		text = (0, html_entities_1.decode)(text, { level: "xml" });
		text = decode(text, { level: "xml" });
		return text;
		}
		exports.cleanseText = cleanseText;

package.json

		{
		"name": "@nosferatu500/textract-lite",
		"version": "6.0.2",
		"version": "7.0.0",
		"type": "module",
		"homepage": "https://github.com/nosferatu500/textract-lite",
		@@ -24,32 +25,30 @@ "description": "Extracting text from files of various type including txt, doc, docx.",
		"lint": "eslint src/ --ext .js,.jsx,.ts,.tsx --cache",
		"test": "mocha --require ts-node/register test/*/.test.ts --exit",
		"test": "mocha",
		"clean": "rm -rf dist build package",
		"docs": "typedoc --entryPoints src/index.ts",
		"build": "yarn clean && tsc -p tsconfig.json"
		"clean-types": "rimraf dist/extractors/docx.d.ts && rimraf dist/extractors/text.d.ts && rimraf dist/extract.d.ts && rimraf dist/utils.d.ts",
		"build": "npm run clean && tsc -p tsconfig.json && npm run clean-types"
		},
		"dependencies": {
		"@xmldom/xmldom": "^0.8.10",
		"html-entities": "^2.4.0",
		"html-entities": "^2.5.2",
		"iconv-lite": "^0.6.3",
		"jschardet": "^3.0.0",
		"mime": "^3.0.0",
		"xpath": "^0.0.33",
		"yauzl": "^2.10.0"
		"jschardet": "^3.1.2",
		"mime": "^4.0.1",
		"xpath": "^0.0.34",
		"yauzl": "^3.1.2"
		},
		"devDependencies": {
		"@types/chai": "^4.3.6",
		"@types/chai-as-promised": "^7.1.6",
		"@types/mime": "^3.0.2",
		"@types/mocha": "^10.0.2",
		"@types/node": "^18.18.1",
		"@types/yauzl": "^2.10.1",
		"@typescript-eslint/eslint-plugin": "^6.7.3",
		"@typescript-eslint/parser": "^6.7.3",
		"chai": "^4.3.10",
		"chai-as-promised": "^7.1.1",
		"eslint": "^8.50.0",
		"mocha": "^10.2.0",
		"ts-node": "^10.9.1",
		"typedoc": "^0.25.1",
		"typescript": "^5.2.2"
		"@types/chai": "^4.3.14",
		"@types/mocha": "^10.0.6",
		"@types/node": "~20.9.5",
		"@types/yauzl": "^2.10.3",
		"@typescript-eslint/eslint-plugin": "^7.6.0",
		"@typescript-eslint/parser": "^7.6.0",
		"chai": "^5.1.0",
		"eslint": "^8.57.0",
		"mocha": "^10.4.0",
		"ts-node": "^10.9.2",
		"typedoc": "^0.25.13",
		"typescript": "^5.4.5"
		},
		@@ -56,0 +55,0 @@ "license": "MIT",

@nosferatu500/textract-lite - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes