Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

@nosferatu500/textract-lite

Package Overview
Dependencies
Maintainers
1
Versions
36
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@nosferatu500/textract-lite - npm Package Compare versions

Comparing version 6.0.2 to 7.0.0

.mocharc.json

91

dist/extract.js

@@ -1,15 +0,10 @@

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.extract = void 0;
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const extractorPath = path_1.default.join(__dirname, "extractors");
import fs from "fs";
import path from "path";
import { fileURLToPath } from "node:url";
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const extractorPath = path.join(__dirname, "extractors");
const typeExtractors = {};
const regexExtractors = [];
const failedExtractorTypes = {};
let totalExtractors = 0;
let satisfiedExtractors = 0;
let hasInitialized = false;

@@ -29,39 +24,15 @@ function registerExtractor(extractor) {

}
function registerFailedExtractor(extractor, failedMessage) {
if (extractor.types) {
for (const type of extractor.types) {
failedExtractorTypes[type.toLowerCase()] = failedMessage;
}
}
}
function testExtractor(extractor, options) {
extractor.test(options, function (passedTest, failedMessage) {
satisfiedExtractors++;
if (passedTest) {
registerExtractor(extractor.default);
}
else {
registerFailedExtractor(extractor, failedMessage);
}
});
}
function initializeExtractors(options) {
async function initializeExtractors() {
hasInitialized = true;
// discover available extractors
const extractors = fs_1.default.readdirSync(extractorPath).map(function (item) {
const fullExtractorPath = path_1.default.join(extractorPath, item);
const extractors = await Promise.all(fs.readdirSync(extractorPath).map(async (item) => {
const fullExtractorPath = path.join(extractorPath, item);
// get the extractor
// eslint-disable-next-line global-require
return require(fullExtractorPath);
});
const { default: extractor } = await import(fullExtractorPath);
return extractor;
}));
// perform any binary tests to ensure extractor is possible
// given execution environment
for (const extractor of extractors) {
if (extractor.test) {
testExtractor(extractor, options);
}
else {
satisfiedExtractors++;
registerExtractor(extractor.default);
}
registerExtractor(extractor);
}

@@ -90,3 +61,3 @@ // need to keep track of how many extractors we have in total

}
async function extract(type, filePath, options) {
export async function extract(type, filePath, options) {
let error;

@@ -96,32 +67,14 @@ let msg;

if (!hasInitialized) {
initializeExtractors(options);
await initializeExtractors();
}
// registration of extractors complete?
if (totalExtractors === satisfiedExtractors) {
theExtractor = findExtractor(type);
if (theExtractor) {
return theExtractor(filePath, options);
}
else {
// cannot extract this file type
msg = `Error for type: [[ ${type} ]], file: [[ ${filePath} ]]`;
// update error message if type is supported but just not configured/installed properly
if (failedExtractorTypes[type]) {
msg +=
`, extractor for type exists, but failed to initialize.` +
` Message: ${failedExtractorTypes[type]}`;
}
error = new Error(msg);
return error;
}
theExtractor = findExtractor(type);
if (theExtractor) {
return theExtractor(filePath, options);
}
else {
// async registration has not wrapped up
// try again later
setTimeout(function () {
extract(type, filePath, options);
}, 100);
// cannot extract this file type
msg = `Error for type: [[ ${type} ]], file: [[ ${filePath} ]]`;
error = new Error(msg);
return error;
}
return new Error("Something went wrong.");
}
exports.extract = extract;

@@ -1,10 +0,5 @@

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const xpath_1 = __importDefault(require("xpath"));
const xmldom_1 = require("@xmldom/xmldom");
const yauzl_1 = __importDefault(require("yauzl"));
const utils_1 = require("../utils");
import xpath from "xpath";
import { DOMParser } from "@xmldom/xmldom";
import yauzl from "yauzl";
import { yauzlError, getTextFromZipFile, cleanseText } from "../utils";
const includeRegex = /.xml$/;

@@ -15,9 +10,9 @@ const excludeRegex = /^(word\/media\/|word\/_rels\/)/;

inText = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>' + '<Properties>' + inText + '</Properties>';
const doc = new xmldom_1.DOMParser().parseFromString(inText);
const ps = xpath_1.default.select("//*[local-name()='p']", doc);
const doc = new DOMParser().parseFromString(inText);
const ps = xpath.select("//*[local-name()='p']", doc);
let text = "";
for (let paragraph of ps) {
let localText = "";
paragraph = new xmldom_1.DOMParser().parseFromString(paragraph.toString());
const ts = xpath_1.default.select("//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph);
paragraph = new DOMParser().parseFromString(paragraph.toString());
const ts = xpath.select("//*[local-name()='t' or local-name()='tab' or local-name()='br']", paragraph);
for (const t of ts) {

@@ -41,6 +36,6 @@ if (t.localName === "t" && t.childNodes.length > 0) {

return new Promise((resolve, reject) => {
yauzl_1.default.open(filePath, function (err, zipfile) {
yauzl.open(filePath, function (err, zipfile) {
let processedEntries = 0;
if (err) {
(0, utils_1.yauzlError)(err, resolve);
yauzlError(err, resolve);
return;

@@ -53,3 +48,3 @@ }

text = _calculateExtractedText(result, options.preserveLineBreaks);
text = (0, utils_1.cleanseText)(options, text);
text = cleanseText(options, text);
resolve(text);

@@ -65,3 +60,3 @@ }

if (includeRegex.test(entry.fileName) && !excludeRegex.test(entry.fileName)) {
(0, utils_1.getTextFromZipFile)(zipfile, entry, function (_, text) {
getTextFromZipFile(zipfile, entry, function (_, text) {
// Security workaround for xmldom >= v0.8.4

@@ -82,5 +77,5 @@ result += `${text}\n`.replace('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\r\n', '');

}
exports.default = {
export default {
types: ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"],
extract: extractText,
};

@@ -1,14 +0,9 @@

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const iconv_lite_1 = __importDefault(require("iconv-lite"));
const jschardet_1 = __importDefault(require("jschardet"));
const utils_1 = require("../utils");
import fs from "fs";
import path from "path";
import iconv from "iconv-lite";
import jschardet from "jschardet";
import { cleanseText } from "../utils";
function extractText(filePath, options) {
return new Promise((resolve, reject) => {
fs_1.default.readFile(filePath, function (error, data) {
fs.readFile(filePath, function (error, data) {
let encoding;

@@ -22,5 +17,5 @@ let decoded;

try {
detectedEncoding = jschardet_1.default.detect(data).encoding;
detectedEncoding = jschardet.detect(data).encoding;
if (!detectedEncoding) {
error = new Error(`Could not detect encoding for file named [[ ${path_1.default.basename(filePath)} ]]`);
error = new Error(`Could not detect encoding for file named [[ ${path.basename(filePath)} ]]`);
resolve(error);

@@ -30,4 +25,4 @@ return;

encoding = detectedEncoding.toLowerCase();
decoded = iconv_lite_1.default.decode(data, encoding);
decoded = (0, utils_1.cleanseText)(options, decoded);
decoded = iconv.decode(data, encoding);
decoded = cleanseText(options, decoded);
}

@@ -42,5 +37,5 @@ catch (error_) {

}
exports.default = {
export default {
types: [/text\//, "application/csv", "application/javascript"],
extract: extractText,
};

@@ -1,21 +0,13 @@

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fromFileWithPath = exports.fromFileWithMimeAndPath = void 0;
const fs_1 = __importDefault(require("fs"));
const mime_1 = __importDefault(require("mime"));
const extract_1 = require("./extract");
async function fromFileWithMimeAndPath(type, filePath, options) {
if (fs_1.default.existsSync(filePath)) {
return (0, extract_1.extract)(type, filePath, options);
import fs from "fs";
import mime from "mime";
import { extract } from "./extract";
export async function fromFileWithMimeAndPath(type, filePath, options) {
if (fs.existsSync(filePath)) {
return extract(type, filePath, options);
}
return new Error(`File at path [[ ${filePath} ]] does not exist.`);
}
exports.fromFileWithMimeAndPath = fromFileWithMimeAndPath;
async function fromFileWithPath(filePath, options) {
const type = (options?.typeOverride) || mime_1.default.getType(filePath);
export async function fromFileWithPath(filePath, options) {
const type = (options?.typeOverride) || mime.getType(filePath);
return fromFileWithMimeAndPath(type, filePath, options);
}
exports.fromFileWithPath = fromFileWithPath;

@@ -1,19 +0,13 @@

"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.cleanseText = exports.getTextFromZipFile = exports.yauzlError = void 0;
const html_entities_1 = require("html-entities");
const fs_1 = __importDefault(require("fs"));
const os_1 = __importDefault(require("os"));
const path_1 = __importDefault(require("path"));
import { decode } from "html-entities";
import fs from "fs";
import os from "os";
import path from "path";
const STRIP_ONLY_SINGLE_LINEBREAKS = /(^|[^\n])\n(?!\n)/g;
const WHITELIST_PRESERVE_LINEBREAKS = /[^\d\n\r !"#$%&'-\w'()-_`a-z{|}~\u0080-\u1FFF\u2013–\u2014\u2015\u2018\u2019\u201C\u201D„\u2026\u20AC\u2116\u2C00-\uD7FF\uFB50\uFDFF\uFE70\uFEFF\uFF01-\uFFE6]*/g;
const WHITELIST_STRIP_LINEBREAKS = /[^\d !"#$%&'-\w'()-_`a-z{|}~\u0080-\u1FFF\u2013–\u2014\u2015\u2018\u2019\u201C\u201D„\u2026\u20AC\u2116\u2C00-\uD7FF\uFB50\uFDFF\uFE70\uFEFF\uFF01-\uFFE6]*/g;
const outDir = path_1.default.join(os_1.default.tmpdir(), "textract");
const outDir = path.join(os.tmpdir(), "textract");
const replacements = [
[/[\u201C\u201D]|“|â€/g, '"'],
[/[\u2018\u2019]|’|‘]/g, "'"],
[/…/g, "…"],
[/[\u201C\u201D]|“|â€/g, '"'], // fancy double quotes
[/[\u2018\u2019]|’|‘]/g, "'"], // fancy single quotes/apostrophes
[/…/g, "…"], // elipses
[/–|—/g, "–"], // long hyphen

@@ -23,4 +17,4 @@ ];

// Up front creation of tmp dir
if (!fs_1.default.existsSync(outDir)) {
fs_1.default.mkdirSync(outDir);
if (!fs.existsSync(outDir)) {
fs.mkdirSync(outDir);
}

@@ -37,3 +31,3 @@ // replace nasty quotes with simple ones

}
function yauzlError(err, cb) {
export function yauzlError(err, cb) {
let msg = err.message;

@@ -45,4 +39,3 @@ if (msg === "end of central directory record signature not found") {

}
exports.yauzlError = yauzlError;
function getTextFromZipFile(zipfile, entry, cb) {
export function getTextFromZipFile(zipfile, entry, cb) {
zipfile.openReadStream(entry, function (err, readStream) {

@@ -71,4 +64,3 @@ let text = "";

}
exports.getTextFromZipFile = getTextFromZipFile;
function cleanseText(options, text) {
export function cleanseText(options, text) {
// clean up text

@@ -87,5 +79,4 @@ text = replaceBadCharacters(text);

text = text.replace(/ (?! )/g, "").replace(/[\t\v \u00A0]{2,}/g, " ");
text = (0, html_entities_1.decode)(text, { level: "xml" });
text = decode(text, { level: "xml" });
return text;
}
exports.cleanseText = cleanseText;
{
"name": "@nosferatu500/textract-lite",
"version": "6.0.2",
"version": "7.0.0",
"type": "module",
"homepage": "https://github.com/nosferatu500/textract-lite",

@@ -24,32 +25,30 @@ "description": "Extracting text from files of various type including txt, doc, docx.",

"lint": "eslint src/ --ext .js,.jsx,.ts,.tsx --cache",
"test": "mocha --require ts-node/register test/**/*.test.ts --exit",
"test": "mocha",
"clean": "rm -rf dist build package",
"docs": "typedoc --entryPoints src/index.ts",
"build": "yarn clean && tsc -p tsconfig.json"
"clean-types": "rimraf dist/extractors/docx.d.ts && rimraf dist/extractors/text.d.ts && rimraf dist/extract.d.ts && rimraf dist/utils.d.ts",
"build": "npm run clean && tsc -p tsconfig.json && npm run clean-types"
},
"dependencies": {
"@xmldom/xmldom": "^0.8.10",
"html-entities": "^2.4.0",
"html-entities": "^2.5.2",
"iconv-lite": "^0.6.3",
"jschardet": "^3.0.0",
"mime": "^3.0.0",
"xpath": "^0.0.33",
"yauzl": "^2.10.0"
"jschardet": "^3.1.2",
"mime": "^4.0.1",
"xpath": "^0.0.34",
"yauzl": "^3.1.2"
},
"devDependencies": {
"@types/chai": "^4.3.6",
"@types/chai-as-promised": "^7.1.6",
"@types/mime": "^3.0.2",
"@types/mocha": "^10.0.2",
"@types/node": "^18.18.1",
"@types/yauzl": "^2.10.1",
"@typescript-eslint/eslint-plugin": "^6.7.3",
"@typescript-eslint/parser": "^6.7.3",
"chai": "^4.3.10",
"chai-as-promised": "^7.1.1",
"eslint": "^8.50.0",
"mocha": "^10.2.0",
"ts-node": "^10.9.1",
"typedoc": "^0.25.1",
"typescript": "^5.2.2"
"@types/chai": "^4.3.14",
"@types/mocha": "^10.0.6",
"@types/node": "~20.9.5",
"@types/yauzl": "^2.10.3",
"@typescript-eslint/eslint-plugin": "^7.6.0",
"@typescript-eslint/parser": "^7.6.0",
"chai": "^5.1.0",
"eslint": "^8.57.0",
"mocha": "^10.4.0",
"ts-node": "^10.9.2",
"typedoc": "^0.25.13",
"typescript": "^5.4.5"
},

@@ -56,0 +55,0 @@ "license": "MIT",

Socket | Socket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc