office-text-extractor
Advanced tools
Comparing version 3.0.3 to 3.4.0-beta.0
@@ -1,3 +0,2 @@ | ||
/// <reference types="node" resolution-mode="require"/> | ||
import { Buffer } from 'node:buffer'; | ||
import { Buffer } from 'buffer/index.js'; | ||
/** | ||
@@ -4,0 +3,0 @@ * A method of text extraction. |
// source/lib.ts | ||
// The source code for the library. | ||
import { Buffer } from 'node:buffer'; | ||
import { Buffer } from 'buffer/index.js'; | ||
import { fileTypeFromBuffer as getFileType } from 'file-type'; | ||
@@ -5,0 +5,0 @@ import { readFile, fetchUrl } from './util.js'; |
@@ -1,2 +0,2 @@ | ||
import { type Buffer } from 'node:buffer'; | ||
import { type Buffer } from 'buffer/'; | ||
import type { TextExtractionMethod } from '../lib.js'; | ||
@@ -3,0 +3,0 @@ export declare class DocExtractor implements TextExtractionMethod { |
@@ -20,2 +20,3 @@ // source/parsers/docx.ts | ||
// Convert the DOCX to text and return the text. | ||
// @ts-expect-error: see feross/buffer#353, the types are incomplete. | ||
const parsedDocx = await parseWordFile({ buffer: input }); | ||
@@ -22,0 +23,0 @@ return parsedDocx.value; |
@@ -1,2 +0,2 @@ | ||
import { type Buffer } from 'node:buffer'; | ||
import { type Buffer } from 'buffer/'; | ||
import type { TextExtractionMethod } from '../lib.js'; | ||
@@ -3,0 +3,0 @@ export declare class ExcelExtractor implements TextExtractionMethod { |
@@ -1,2 +0,2 @@ | ||
import { type Buffer } from 'node:buffer'; | ||
import { type Buffer } from 'buffer/'; | ||
import type { TextExtractionMethod } from '../lib.js'; | ||
@@ -3,0 +3,0 @@ export declare class PdfExtractor implements TextExtractionMethod { |
@@ -1,2 +0,2 @@ | ||
import { type Buffer } from 'node:buffer'; | ||
import { type Buffer } from 'buffer/'; | ||
import type { TextExtractionMethod } from '../lib.js'; | ||
@@ -3,0 +3,0 @@ export declare class PptExtractor implements TextExtractionMethod { |
@@ -1,3 +0,3 @@ | ||
import { type Buffer } from 'node:buffer'; | ||
import { type Buffer } from 'buffer/'; | ||
export declare const readFile: (filePath: string) => Promise<Buffer>; | ||
export declare const fetchUrl: (url: string) => Promise<Buffer>; |
@@ -5,3 +5,3 @@ // source/util.ts | ||
import { got as fetch } from 'got'; | ||
export const readFile = async (filePath) => read(filePath); | ||
export const fetchUrl = async (url) => fetch(url).buffer(); | ||
export const readFile = async (filePath) => (await read(filePath)); | ||
export const fetchUrl = async (url) => (await fetch(url).buffer()); |
{ | ||
"name": "office-text-extractor", | ||
"version": "3.0.3", | ||
"version": "3.4.0-beta.0", | ||
"description": "Yet another library to extract text from MS Office and PDF files", | ||
@@ -43,24 +43,25 @@ "keywords": [ | ||
"dependencies": { | ||
"fflate": "0.8.1", | ||
"file-type": "18.5.0", | ||
"got": "13.0.0", | ||
"buffer": "6.0.3", | ||
"fflate": "0.8.2", | ||
"file-type": "19.3.0", | ||
"got": "14.4.1", | ||
"js-yaml": "4.1.0", | ||
"mammoth": "1.6.0", | ||
"mammoth": "1.8.0", | ||
"pdf-parse": "1.1.1", | ||
"text-encoding": "0.7.0", | ||
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz", | ||
"xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", | ||
"xml2js": "0.6.2" | ||
}, | ||
"devDependencies": { | ||
"@types/js-yaml": "4.0.6", | ||
"@types/node": "20.8.3", | ||
"@types/text-encoding": "0.0.37", | ||
"@types/xml2js": "0.4.12", | ||
"ava": "5.3.1", | ||
"np": "8.0.4", | ||
"@types/js-yaml": "4.0.9", | ||
"@types/node": "20.14.11", | ||
"@types/text-encoding": "0.0.39", | ||
"@types/xml2js": "0.4.14", | ||
"ava": "6.1.3", | ||
"np": "10.0.7", | ||
"npm-run-all": "4.1.5", | ||
"prettier": "3.0.3", | ||
"tsx": "3.13.0", | ||
"typescript": "5.2.2", | ||
"xo": "0.56.0" | ||
"prettier": "3.3.3", | ||
"tsimp": "2.0.11", | ||
"typescript": "5.5.4", | ||
"xo": "0.59.0" | ||
}, | ||
@@ -90,3 +91,3 @@ "prettier": { | ||
"nodeArguments": [ | ||
"--no-warnings" | ||
"--import=tsimp" | ||
] | ||
@@ -99,4 +100,4 @@ }, | ||
"test:quality": "xo source/ test/", | ||
"test:integration": "NODE_OPTIONS='--loader=tsx' ava" | ||
"test:integration": "TSIMP_DIAG=ignore ava" | ||
} | ||
} |
// source/lib.ts | ||
// The source code for the library. | ||
import { Buffer } from 'node:buffer' | ||
import { Buffer } from 'buffer/index.js' | ||
import { fileTypeFromBuffer as getFileType } from 'file-type' | ||
@@ -6,0 +6,0 @@ import { readFile, fetchUrl } from './util.js' |
// source/parsers/docx.ts | ||
// The text extracter for DOCX files. | ||
import { type Buffer } from 'node:buffer' | ||
import { type Buffer } from 'buffer/' | ||
import { extractRawText as parseWordFile } from 'mammoth' | ||
@@ -25,2 +25,3 @@ | ||
// Convert the DOCX to text and return the text. | ||
// @ts-expect-error: see feross/buffer#353, the types are incomplete. | ||
const parsedDocx = await parseWordFile({ buffer: input }) | ||
@@ -27,0 +28,0 @@ return parsedDocx.value |
// source/parsers/excel.ts | ||
// The text extracter for Excel files. | ||
import { type Buffer } from 'node:buffer' | ||
import { type Buffer } from 'buffer/' | ||
import Xlsx, { utils as sheetUtils } from 'xlsx' | ||
@@ -6,0 +6,0 @@ import { dump as convertToYaml } from 'js-yaml' |
// source/parsers/pdf.ts | ||
// The text extracter for PDF files. | ||
import { type Buffer } from 'node:buffer' | ||
import { type Buffer } from 'buffer/' | ||
// @ts-expect-error There are no types for this package. | ||
@@ -6,0 +6,0 @@ import parsePdf from 'pdf-parse/lib/pdf-parse.js' |
@@ -6,3 +6,3 @@ // source/parsers/ppt.ts | ||
import { type Buffer } from 'node:buffer' | ||
import { type Buffer } from 'buffer/' | ||
import { unzip } from 'fflate' | ||
@@ -9,0 +9,0 @@ import { parseStringPromise as xmlToJson } from 'xml2js' |
// source/util.ts | ||
// Utility functions to help with the handling of input. | ||
import { type Buffer } from 'node:buffer' | ||
import { readFile as read } from 'node:fs/promises' | ||
import { got as fetch } from 'got' | ||
import { type Buffer } from 'buffer/' | ||
export const readFile = async (filePath: string): Promise<Buffer> => | ||
read(filePath) | ||
(await read(filePath)) as unknown as Buffer | ||
export const fetchUrl = async (url: string): Promise<Buffer> => | ||
fetch(url).buffer() | ||
(await fetch(url).buffer()) as unknown as Buffer |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
HTTP dependency
Supply chain risk: Contains a dependency which resolves to a remote HTTP URL, which could be used to inject untrusted code and reduce overall package reliability.
Found 1 instance in 1 package
No v1
Quality: Package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
HTTP dependency
Supply chain risk: Contains a dependency which resolves to a remote HTTP URL, which could be used to inject untrusted code and reduce overall package reliability.
Found 1 instance in 1 package
35001
26
781
10
1
+ Added buffer@6.0.3
+ Added @sec-ant/readable-stream@0.4.1 (transitive)
+ Added @sindresorhus/is@6.3.1 (transitive)
+ Added buffer@6.0.3 (transitive)
+ Added cacheable-request@12.0.1 (transitive)
+ Added fflate@0.8.2 (transitive)
+ Added file-type@19.3.0 (transitive)
+ Added form-data-encoder@4.0.2 (transitive)
+ Added get-stream@8.0.1, 9.0.1 (transitive)
+ Added got@14.4.1 (transitive)
+ Added is-stream@4.0.1 (transitive)
+ Added mammoth@1.8.0 (transitive)
+ Added p-cancelable@4.0.1 (transitive)
+ Added strtok3@8.1.0 (transitive)
+ Added token-types@6.0.0 (transitive)
+ Added type-fest@4.26.1 (transitive)
+ Added uint8array-extras@1.4.0 (transitive)
- Removed @sindresorhus/is@5.6.0 (transitive)
- Removed cacheable-request@10.2.14 (transitive)
- Removed fflate@0.8.1 (transitive)
- Removed file-type@18.5.0 (transitive)
- Removed form-data-encoder@2.1.4 (transitive)
- Removed get-stream@6.0.1 (transitive)
- Removed got@13.0.0 (transitive)
- Removed mammoth@1.6.0 (transitive)
- Removed p-cancelable@3.0.0 (transitive)
- Removed readable-stream@3.6.2 (transitive)
- Removed readable-web-to-node-stream@3.0.2 (transitive)
- Removed strtok3@7.1.1 (transitive)
- Removed token-types@5.0.1 (transitive)
Updated fflate@0.8.2
Updated file-type@19.3.0
Updated got@14.4.1
Updated mammoth@1.8.0
Updated xlsx@https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz