🚀 Big News: Socket Acquires Coana to Bring Reachability Analysis to Every Appsec Team. Learn more →
Socket
DemoInstallSign in
Socket

bulk-files-ocr-search

Package Overview
Dependencies
Maintainers
1
Versions
6
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

bulk-files-ocr-search - npm Package Compare versions

Comparing version

to
0.1.1

dist/a.d.ts

9

dist/index.d.ts
import dirTree from 'directory-tree';
import { Progress, ocr, ScanOptions, WorkerPool } from './utils';
export { Progress, ScanOptions, ocr };
import { Progress, ocr, ScanOptions, WorkerPool, pdfToImages } from './utils';
export { Progress, ScanOptions, ocr, pdfToImages };
/** Internal function */

@@ -13,3 +13,4 @@ export declare const visitDir: (dir: dirTree.DirectoryTree, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig }: {

pool: WorkerPool;
} & Pick<ScanOptions, "words" | "shouldConsoleLog" | "outputLogFile" | "tesseractConfig">) => void;
export declare const scanDir: (scannedDir: string, { words, shouldConsoleLog, progressFile, outputLogFile, workerPoolSize, tesseractConfig }?: ScanOptions) => Promise<Progress>;
} & Pick<ScanOptions, "words" | "shouldConsoleLog" | "outputLogFile" | "tesseractConfig">) => Promise<void>;
export declare const scanDir: (scannedDir: string, { words, shouldConsoleLog, outputLogFile, workerPoolSize, tesseractConfig, progressFile }?: ScanOptions) => Promise<Progress>;
//# sourceMappingURL=index.d.ts.map

@@ -6,3 +6,3 @@ "use strict";

Object.defineProperty(exports, "__esModule", { value: true });
exports.scanDir = exports.visitFile = exports.visitDir = exports.ocr = void 0;
exports.scanDir = exports.visitFile = exports.visitDir = exports.pdfToImages = exports.ocr = void 0;
const os_1 = __importDefault(require("os"));

@@ -13,2 +13,3 @@ const fs_extra_1 = __importDefault(require("fs-extra"));

Object.defineProperty(exports, "ocr", { enumerable: true, get: function () { return utils_1.ocr; } });
Object.defineProperty(exports, "pdfToImages", { enumerable: true, get: function () { return utils_1.pdfToImages; } });
/** Internal function */

@@ -20,5 +21,5 @@ const visitDir = async (dir, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig }) => {

if (child.type === 'file')
(0, exports.visitFile)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig });
await (0, exports.visitFile)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig });
else
await (0, exports.visitDir)(child, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig });
await (0, exports.visitDir)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig, progressFile });
}

@@ -33,5 +34,12 @@ await pool.settled(true);

/** Internal function */
const visitFile = (file, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }) => {
const visitFile = async (file, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }) => {
if (file.name === '.gitkeep')
return;
if (!(0, utils_1.isSupportedExtension)(file.extension)) {
if (shouldConsoleLog)
console.log(`šŸ‘½ Unsupported file ${file.path}`);
// Mark as visited
progress.visited.add(file.path);
return;
}
if (progress.visited.has(file.path)) {

@@ -42,10 +50,24 @@ if (shouldConsoleLog)

}
// Convert PDF pages to images
if (file.extension === '.pdf') {
// return
const images = await (0, utils_1.pdfToImages)(file.path);
if (shouldConsoleLog)
console.log(`✨ Extracted PDF ${file.path}`);
for (const image of images) {
// Convert to directoryTree format
const imageTreeFormat = {
name: image.name,
path: image.path,
size: -1,
type: 'file',
extension: '.webp'
};
await (0, exports.visitFile)(imageTreeFormat, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig });
}
// Mark PDF as visited to not convert it again
progress.visited.add(file.path);
return;
}
pool.queue(async ({ scanFile }) => {
if (!(0, utils_1.isSupportedExtension)(file.extension)) {
if (shouldConsoleLog)
console.log(`šŸ‘½ Unsupported file ${file.path}`);
// Mark as visited
progress.visited.add(file.path);
return;
}
try {

@@ -83,3 +105,3 @@ const scanRes = await scanFile(file, words, tesseractConfig);

exports.visitFile = visitFile;
const scanDir = async (scannedDir, { words = ['MATCH_ALL'], shouldConsoleLog = false, progressFile, outputLogFile, workerPoolSize, tesseractConfig } = {}) => {
const scanDir = async (scannedDir, { words = ['MATCH_ALL'], shouldConsoleLog = false, outputLogFile, workerPoolSize, tesseractConfig, progressFile } = {}) => {
// Do not use all CPU cores as default, it makes the OCR process way slower!

@@ -96,1 +118,2 @@ if (!workerPoolSize)

exports.scanDir = scanDir;
//# sourceMappingURL=index.js.map

@@ -0,1 +1,2 @@

import path from 'path';
import { recognize as tesseractRecognize } from 'node-tesseract-ocr';

@@ -32,3 +33,3 @@ import dirTree from 'directory-tree';

/**
* Tesseract OCR config, will default to english language
* Tesseract OCR config, will default to english language `{ lang: 'eng' }`
*

@@ -63,3 +64,3 @@ * @see https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc

/**
* @param path Path to the image to extract text from
* @param filePath Path to the image to extract text from
* @param tesseractConfig Tesseract configuration

@@ -69,4 +70,13 @@ * @param shouldCleanStr Should the string be normalized (lowercase, accents removed, whitespace removed)

*/
export declare const ocr: (path: string, tesseractConfig?: TesseractConfig, shouldCleanStr?: boolean) => Promise<string>;
export declare const ocr: (filePath: string, tesseractConfig?: TesseractConfig, shouldCleanStr?: boolean) => Promise<string>;
/**
* Extract all the pages of a PDF to images
* @param filePath Path to the PDF to be converted
* @returns List of generated output images
*/
export declare const pdfToImages: (filePath: string) => Promise<Array<{
name: string;
path: string;
}>>;
/**
* Find all words that were matched in text

@@ -88,1 +98,2 @@ *

export declare const getTree: (scannedDir: string) => Promise<dirTree.DirectoryTree>;
//# sourceMappingURL=utils.d.ts.map

@@ -6,5 +6,7 @@ "use strict";

Object.defineProperty(exports, "__esModule", { value: true });
exports.getTree = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.ocr = exports.cleanStr = void 0;
exports.getTree = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.pdfToImages = exports.ocr = exports.cleanStr = void 0;
const fs_extra_1 = __importDefault(require("fs-extra"));
const path_1 = __importDefault(require("path"));
const node_tesseract_ocr_1 = require("node-tesseract-ocr");
const execa_1 = __importDefault(require("execa"));
const directory_tree_1 = __importDefault(require("directory-tree"));

@@ -18,3 +20,3 @@ const cleanStr = (str) => str

/**
* @param path Path to the image to extract text from
* @param filePath Path to the image to extract text from
* @param tesseractConfig Tesseract configuration

@@ -24,3 +26,3 @@ * @param shouldCleanStr Should the string be normalized (lowercase, accents removed, whitespace removed)

*/
const ocr = async (path, tesseractConfig = {}, shouldCleanStr = true) => {
const ocr = async (filePath, tesseractConfig = {}, shouldCleanStr = true) => {
// Apply default options

@@ -31,5 +33,6 @@ if (!tesseractConfig.lang)

tesseractConfig.oem = 1;
// PSM 1 seems to output better result for rotated content
if (!tesseractConfig.psm)
tesseractConfig.psm = 3;
const text = await (0, node_tesseract_ocr_1.recognize)(path, tesseractConfig);
tesseractConfig.psm = 1;
const text = await (0, node_tesseract_ocr_1.recognize)(filePath, tesseractConfig);
return text

@@ -43,2 +46,18 @@ .split('\n')

/**
 * Extract all the pages of a PDF to PNG images with Poppler's `pdftoppm`.
 *
 * The PDF path itself is passed as the output prefix, so the images are
 * written next to the PDF as `<pdf file name>-<page>.png`.
 * The returned promise rejects if the `pdftoppm` invocation or the directory
 * listing fails (neither is caught here).
 * @param filePath Path to the PDF to be converted
 * @returns List of generated output images (file name and absolute path)
 */
const pdfToImages = async (filePath) => {
    const dirName = path_1.default.dirname(filePath);
    const fileName = path_1.default.basename(filePath);
    // pdftoppm -png file.pdf output-images-prefix
    await (0, execa_1.default)('pdftoppm', ['-png', filePath, filePath]);
    // Find the list of created files (we don't know how many pages are in the pdf!)
    const files = await fs_extra_1.default.readdir(dirName);
    // Only keep the page images generated for this PDF (`<name>-<page>.png`),
    // not the PDF itself nor unrelated siblings sharing the same prefix.
    const outputPrefix = `${fileName}-`;
    // Resolve against the PDF's own directory: `dirname(fileName)` is always
    // '.', which would wrongly resolve the images against the process CWD.
    return files
        .filter(x => x.startsWith(outputPrefix) && x.toLowerCase().endsWith('.png'))
        .map(x => ({ name: x, path: path_1.default.resolve(dirName, x) }));
};
exports.pdfToImages = pdfToImages;
/**
* Find all words that were matched in text

@@ -58,4 +77,3 @@ *

exports.findMatches = findMatches;
// TODO: Support PDF
const isSupportedExtension = (ext) => ['.jpg', '.png', '.webp'].some(x => ext.toLowerCase() === x);
/**
 * Check whether a file extension is one this package can scan.
 * The comparison is case-insensitive (`.JPG` and `.jpg` are both accepted).
 * @param ext File extension including the leading dot (e.g. `.png`)
 * @returns `true` for `.jpg`, `.jpeg`, `.png`, `.webp` and `.pdf`, `false` otherwise (including non-string input)
 */
const isSupportedExtension = (ext) => typeof ext === 'string' && ['.jpg', '.jpeg', '.png', '.webp', '.pdf'].includes(ext.toLowerCase());
exports.isSupportedExtension = isSupportedExtension;

@@ -93,1 +111,2 @@ const scanFile = async (file, words, tesseractConfig) => {

exports.getTree = getTree;
//# sourceMappingURL=utils.js.map
export {};
//# sourceMappingURL=worker.d.ts.map

@@ -6,1 +6,2 @@ "use strict";

(0, worker_1.expose)({ scanFile: utils_1.scanFile });
//# sourceMappingURL=worker.js.map
{
"name": "bulk-files-ocr-search",
"version": "0.1.0",
"version": "0.1.1",
"description": "Find files that contain some text with OCR",

@@ -18,4 +18,10 @@ "license": "MIT",

"types": "dist/index.d.ts",
"scripts": {
"build": "tsc",
"test": "tsc && ava",
"test:ava": "ava"
},
"dependencies": {
"directory-tree": "^3.0.1",
"execa": "^5.1.1",
"fs-extra": "^10.0.0",

@@ -51,8 +57,3 @@ "node-tesseract-ocr": "^2.2.1",

"verbose": true
},
"scripts": {
"build": "tsc",
"test": "tsc && ava",
"test:ava": "ava"
}
}
}

@@ -12,3 +12,3 @@ # Bulk Files OCR Text Finder

- Images: JPG, PNG, [WebP](https://en.wikipedia.org/wiki/WebP)
- Images: JPEG, PNG, [WebP](https://en.wikipedia.org/wiki/WebP)
- Documents: PDF

@@ -21,3 +21,3 @@

[Tesseract OCR](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc) is used internally.
[Tesseract OCR](https://github.com/tesseract-ocr/tesseract) is used internally ([Tesseract Documentation](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc)). For PDF to PNG conversion, [Poppler](https://poppler.freedesktop.org/) is used.

@@ -28,3 +28,5 @@ This package uses [worker threads](https://nodejs.org/api/worker_threads.html) to make use of your CPU cores and be faster.

- The OCR will only provide relevant results if your files are in a proper orientation (text is horizontal and not upside-down).
- The OCR will provide bad results for rotated files/non-straight text.
- 90/180 degree rotations seem to output good results
- You may want to pre-process your files somehow to make the text straight!
- Files will be matched if at least 1 of the words is found in the text contained in it.

@@ -34,10 +36,12 @@

No matter how you decide to use this package, you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc) anyway.
No matter how you decide to use this package, you need to install Tesseract OCR anyway. If you have PDF files, Poppler must also be installed to convert them to images:
```sh
# OCR Package (non-linux, see https://github.com/tesseract-ocr/tesseract#installing-tesseract)
sudo apt install tesseract-ocr
# PDF to PNG conversion command-line (for Windows, see https://stackoverflow.com/a/53960829 - MacOS `brew install poppler`)
sudo apt install poppler-utils
```
See [Installing Tesseract](https://github.com/tesseract-ocr/tesseract#installing-tesseract).
### OCR Language

@@ -58,4 +62,3 @@

cd bulk-files-ocr-search
# npm install -D
pnpm install
pnpm install # or npm install -D
pnpm build

@@ -120,3 +123,3 @@ ```

/**
* Tesseract OCR config, will default to english language
* Tesseract OCR config, will default to english language `{ lang: 'eng' }`
*

@@ -149,3 +152,3 @@ * @see https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc

The standalone OCR function is also exported.
The standalone OCR and PDF to images functions are also exported.

@@ -159,9 +162,23 @@ ```ts

// Tesseract configuration
const tesseractConfig: TesseractConfig = {}
const tesseractConfig: TesseractConfig = {
lang: 'eng'
}
// Should the string be normalized (lowercase, accents removed, whitespace removed)
const shouldCleanStr: boolean = true
const shouldCleanStr: boolean | undefined = true
// OCR
const text = await ocr(file, tesseractConfig, shouldCleanStr)
console.log(text)
// ---
const filePdf = path.resolve(__dirname, '..', 'test', '_testFiles', 'sample.pdf')
// Extracted pages output format
const format: string | undefined = 'jpg'
// PDF to images
const res = await pdfToImages(filePdf, format)
console.log(res) // Files are generated on the file system, 1 file per page
```

@@ -168,0 +185,0 @@

Sorry, the diff of this file is not supported yet