import dirTree from 'directory-tree';
		import { Progress, ocr, ScanOptions, WorkerPool } from './utils';
		export { Progress, ScanOptions, ocr };
		import { Progress, ocr, ScanOptions, WorkerPool, pdfToImages } from './utils';
		export { Progress, ScanOptions, ocr, pdfToImages };
		/** Internal function */
		@@ -13,3 +13,4 @@ export declare const visitDir: (dir: dirTree.DirectoryTree, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig }: {
		pool: WorkerPool;
		} & Pick<ScanOptions, "words" \| "shouldConsoleLog" \| "outputLogFile" \| "tesseractConfig">) => void;
		export declare const scanDir: (scannedDir: string, { words, shouldConsoleLog, progressFile, outputLogFile, workerPoolSize, tesseractConfig }?: ScanOptions) => Promise<Progress>;
		} & Pick<ScanOptions, "words" \| "shouldConsoleLog" \| "outputLogFile" \| "tesseractConfig">) => Promise<void>;
		export declare const scanDir: (scannedDir: string, { words, shouldConsoleLog, outputLogFile, workerPoolSize, tesseractConfig, progressFile }?: ScanOptions) => Promise<Progress>;
		//# sourceMappingURL=index.d.ts.map

dist/index.js

		@@ -6,3 +6,3 @@ "use strict";
		Object.defineProperty(exports, "__esModule", { value: true });
		exports.scanDir = exports.visitFile = exports.visitDir = exports.ocr = void 0;
		exports.scanDir = exports.visitFile = exports.visitDir = exports.pdfToImages = exports.ocr = void 0;
		const os_1 = __importDefault(require("os"));
		@@ -13,2 +13,3 @@ const fs_extra_1 = __importDefault(require("fs-extra"));
		Object.defineProperty(exports, "ocr", { enumerable: true, get: function () { return utils_1.ocr; } });
		Object.defineProperty(exports, "pdfToImages", { enumerable: true, get: function () { return utils_1.pdfToImages; } });
		/** Internal function */
		@@ -20,5 +21,5 @@ const visitDir = async (dir, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig }) => {
		if (child.type === 'file')
		(0, exports.visitFile)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig });
		await (0, exports.visitFile)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig });
		else
		await (0, exports.visitDir)(child, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig });
		await (0, exports.visitDir)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig, progressFile });
		}
		@@ -33,5 +34,12 @@ await pool.settled(true);
		/** Internal function */
		const visitFile = (file, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }) => {
		const visitFile = async (file, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }) => {
		if (file.name === '.gitkeep')
		return;
		if (!(0, utils_1.isSupportedExtension)(file.extension)) {
		if (shouldConsoleLog)
		console.log(`👽 Unsupported file ${file.path}`);
		// Mark as visited
		progress.visited.add(file.path);
		return;
		}
		if (progress.visited.has(file.path)) {
		@@ -42,10 +50,24 @@ if (shouldConsoleLog)
		}
		// Convert PDF pages to images
		if (file.extension === '.pdf') {
		// return
		const images = await (0, utils_1.pdfToImages)(file.path);
		if (shouldConsoleLog)
		console.log(`✨ Extracted PDF ${file.path}`);
		for (const image of images) {
		// Convert to directoryTree format
		const imageTreeFormat = {
		name: image.name,
		path: image.path,
		size: -1,
		type: 'file',
		extension: '.webp'
		};
		await (0, exports.visitFile)(imageTreeFormat, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig });
		}
		// Mark PDF as visited to not convert it again
		progress.visited.add(file.path);
		return;
		}
		pool.queue(async ({ scanFile }) => {
		if (!(0, utils_1.isSupportedExtension)(file.extension)) {
		if (shouldConsoleLog)
		console.log(`👽 Unsupported file ${file.path}`);
		// Mark as visited
		progress.visited.add(file.path);
		return;
		}
		try {
		@@ -83,3 +105,3 @@ const scanRes = await scanFile(file, words, tesseractConfig);
		exports.visitFile = visitFile;
		const scanDir = async (scannedDir, { words = ['MATCH_ALL'], shouldConsoleLog = false, progressFile, outputLogFile, workerPoolSize, tesseractConfig } = {}) => {
		const scanDir = async (scannedDir, { words = ['MATCH_ALL'], shouldConsoleLog = false, outputLogFile, workerPoolSize, tesseractConfig, progressFile } = {}) => {
		// Do not use all CPU cores as default, it makes the OCR process way slower!
		@@ -96,1 +118,2 @@ if (!workerPoolSize)
		exports.scanDir = scanDir;
		//# sourceMappingURL=index.js.map

dist/utils.d.ts

		@@ -0,1 +1,2 @@
		import path from 'path';
		import { recognize as tesseractRecognize } from 'node-tesseract-ocr';
		@@ -32,3 +33,3 @@ import dirTree from 'directory-tree';
		/**
		* Tesseract OCR config, will default to english language
		* Tesseract OCR config, will default to english language `{ lang: 'eng' }`
		*
		@@ -63,3 +64,3 @@ * @see https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc
		/**
		* @param path Path to the image to extract text from
		* @param filePath Path to the image to extract text from
		* @param tesseractConfig Tesseract configuration
		@@ -69,4 +70,13 @@ * @param shouldCleanStr Should the string be normalized (lowercase, accents removed, whitespace removed)
		*/
		export declare const ocr: (path: string, tesseractConfig?: TesseractConfig, shouldCleanStr?: boolean) => Promise<string>;
		export declare const ocr: (filePath: string, tesseractConfig?: TesseractConfig, shouldCleanStr?: boolean) => Promise<string>;
		/**
		* Extract all the pages of a PDF to images
		* @param filePath Path to the PDF to be converted
		* @returns List of generated output images
		*/
		export declare const pdfToImages: (filePath: string) => Promise<Array<{
		name: string;
		path: string;
		}>>;
		/**
		* Find all words that were matched in text
		@@ -88,1 +98,2 @@ *
		export declare const getTree: (scannedDir: string) => Promise<dirTree.DirectoryTree>;
		//# sourceMappingURL=utils.d.ts.map

dist/utils.js

		@@ -6,5 +6,7 @@ "use strict";
		Object.defineProperty(exports, "__esModule", { value: true });
		exports.getTree = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.ocr = exports.cleanStr = void 0;
		exports.getTree = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.pdfToImages = exports.ocr = exports.cleanStr = void 0;
		const fs_extra_1 = __importDefault(require("fs-extra"));
		const path_1 = __importDefault(require("path"));
		const node_tesseract_ocr_1 = require("node-tesseract-ocr");
		const execa_1 = __importDefault(require("execa"));
		const directory_tree_1 = __importDefault(require("directory-tree"));
		@@ -18,3 +20,3 @@ const cleanStr = (str) => str
		/**
		* @param path Path to the image to extract text from
		* @param filePath Path to the image to extract text from
		* @param tesseractConfig Tesseract configuration
		@@ -24,3 +26,3 @@ * @param shouldCleanStr Should the string be normalized (lowercase, accents removed, whitespace removed)
		*/
		const ocr = async (path, tesseractConfig = {}, shouldCleanStr = true) => {
		const ocr = async (filePath, tesseractConfig = {}, shouldCleanStr = true) => {
		// Apply default options
		@@ -31,5 +33,6 @@ if (!tesseractConfig.lang)
		tesseractConfig.oem = 1;
		// PSM 1 seems to output better result for rotated content
		if (!tesseractConfig.psm)
		tesseractConfig.psm = 3;
		const text = await (0, node_tesseract_ocr_1.recognize)(path, tesseractConfig);
		tesseractConfig.psm = 1;
		const text = await (0, node_tesseract_ocr_1.recognize)(filePath, tesseractConfig);
		return text
		@@ -43,2 +46,18 @@ .split('\n')
		/**
		 * Extract all the pages of a PDF to PNG images using the `pdftoppm` command-line tool.
		 * The PDF path itself is passed as the output prefix, so the images are written
		 * next to the PDF (e.g. `file.pdf` produces `file.pdf-1.png`, `file.pdf-2.png`, ...).
		 * @param filePath Path to the PDF to be converted
		 * @returns List of generated output images (file name and resolved absolute path)
		 */
		const pdfToImages = async (filePath) => {
			const fileName = path_1.default.basename(filePath);
			const outputDir = path_1.default.dirname(filePath);
			// pdftoppm -png file.pdf output-images-prefix
			await (0, execa_1.default)('pdftoppm', ['-png', filePath, filePath]);
			// Find the list of created files (we don't know how many pages are in the pdf!)
			const files = await fs_extra_1.default.readdir(outputDir);
			return files
				// Keep only the pages generated from this PDF: `<fileName>-<page>.png`
				.filter(x => x.startsWith(`${fileName}-`) && x.endsWith('.png'))
				// Resolve against the PDF's own directory, not the current working directory
				.map(x => ({ name: x, path: path_1.default.resolve(outputDir, x) }));
		};
		exports.pdfToImages = pdfToImages;
		/**
		* Find all words that were matched in text
		@@ -58,4 +77,3 @@ *
		exports.findMatches = findMatches;
		// TODO: Support PDF
		const isSupportedExtension = (ext) => ['.jpg', '.png', '.webp'].some(x => ext.toLowerCase() === x);
		// Extensions are compared case-insensitively so that files such as `photo.JPG` are not reported as unsupported
		const isSupportedExtension = (ext) => ['.jpg', '.jpeg', '.png', '.webp', '.pdf'].includes(ext.toLowerCase());
		exports.isSupportedExtension = isSupportedExtension;
		@@ -93,1 +111,2 @@ const scanFile = async (file, words, tesseractConfig) => {
		exports.getTree = getTree;
		//# sourceMappingURL=utils.js.map

dist/worker.d.ts

		export {};
		//# sourceMappingURL=worker.d.ts.map

dist/worker.js

		@@ -6,1 +6,2 @@ "use strict";
		(0, worker_1.expose)({ scanFile: utils_1.scanFile });
		//# sourceMappingURL=worker.js.map

package.json

		{
		"name": "bulk-files-ocr-search",
		"version": "0.1.0",
		"version": "0.1.1",
		"description": "Find files that contain some text with OCR",
		@@ -18,4 +18,10 @@ "license": "MIT",
		"types": "dist/index.d.ts",
		"scripts": {
		"build": "tsc",
		"test": "tsc && ava",
		"test:ava": "ava"
		},
		"dependencies": {
		"directory-tree": "^3.0.1",
		"execa": "^5.1.1",
		"fs-extra": "^10.0.0",
		@@ -51,8 +57,3 @@ "node-tesseract-ocr": "^2.2.1",
		"verbose": true
		},
		"scripts": {
		"build": "tsc",
		"test": "tsc && ava",
		"test:ava": "ava"
		}
		}
		}

README.md

		@@ -12,3 +12,3 @@ # Bulk Files OCR Text Finder

		- Images: JPG, PNG, [WebP](https://en.wikipedia.org/wiki/WebP)
		- Images: JPEG, PNG, [WebP](https://en.wikipedia.org/wiki/WebP)
		- Documents: PDF
		@@ -21,3 +21,3 @@

		[Tesseract OCR](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc) is used internally.
		[Tesseract OCR](https://github.com/tesseract-ocr/tesseract) is used internally ([Tesseract Documentation](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc)). For PDF to PNG conversion, [Poppler](https://poppler.freedesktop.org/) is used.

		@@ -28,3 +28,5 @@ This package uses [worker threads](https://nodejs.org/api/worker_threads.html) to make use of your CPU cores and be faster.

		- The OCR will only provide relevant results if your files are in a proper orientation (text is horizontal and not upside-down).
		- The OCR will provide bad results for rotated files/non-straight text.
		- 90/180 degree rotations seem to output good results
		- You may want to pre-process your files somehow to make the text straight!
		- Files will be matched if at least 1 of the words is found in the text contained in it.
		@@ -34,10 +36,12 @@

		No matter how you decide to use this package, you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc) anyway.
		No matter how you decide to use this package, you need to install Tesseract OCR anyway. If you have some PDF files, they need to be converted to images first, which requires additional packages:

		```sh
		# OCR Package (non-linux, see https://github.com/tesseract-ocr/tesseract#installing-tesseract)
		sudo apt install tesseract-ocr

		# PDF to JPEG conversion command-line (for Windows, see https://stackoverflow.com/a/53960829 - MacOS `brew install poppler`)
		sudo apt install poppler-utils
		```

		See [Installing Tesseract](https://github.com/tesseract-ocr/tesseract#installing-tesseract).

		### OCR Language
		@@ -58,4 +62,3 @@
		cd bulk-files-ocr-search
		# npm install -D
		pnpm install
		pnpm install # or npm install -D
		pnpm build
		@@ -120,3 +123,3 @@ ```
		/**
		* Tesseract OCR config, will default to english language
		* Tesseract OCR config, will default to english language `{ lang: 'eng' }`
		*
		@@ -149,3 +152,3 @@ * @see https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc

		The standalone OCR function is also exported.
		The standalone OCR and PDF to images functions are also exported.

		@@ -159,9 +162,23 @@ ```ts
		// Tesseract configuration
		const tesseractConfig: TesseractConfig = {}
		const tesseractConfig: TesseractConfig = {
		lang: 'eng'
		}

		// Should the string be normalized (lowercase, accents removed, whitespace removed)
		const shouldCleanStr: boolean = true
		const shouldCleanStr: boolean \| undefined = true

		// OCR
		const text = await ocr(file, tesseractConfig, shouldCleanStr)
		console.log(text)

		// ---

		const filePdf = path.resolve(__dirname, '..', 'test', '_testFiles', 'sample.pdf')

		// Extracted pages output format
		const format: string \| undefined = 'jpg'

		// PDF to images
		const res = await pdfToImages(filePdf, format)
		console.log(res) // Files are generated on the file system, 1 file per page
		```
		@@ -168,0 +185,0 @@

pnpm-lock.yaml

Sorry, the diff of this file is not supported yet

bulk-files-ocr-search - npm Package Compare versions

Improved metrics

Worsened metrics

Dependency changes