@@ -5,12 +5,6 @@ import dirTree from 'directory-tree';
		/** Internal function */
		export declare const visitDir: (dir: dirTree.DirectoryTree, { progress, pool, words, shouldConsoleLog, progressFile, matchesLogFile, tesseractConfig }: {
		progress: Progress;
		pool: WorkerPool;
		} & Pick<ScanOptions, "words" \| "shouldConsoleLog" \| "progressFile" \| "matchesLogFile" \| "tesseractConfig">) => Promise<void>;
		export declare const visitDir: (dir: dirTree.DirectoryTree, progress: Progress, pool: WorkerPool, options: Omit<ScanOptions, 'workerPoolSize'>) => Promise<void>;
		/** Internal function */
		export declare const visitFile: (file: dirTree.DirectoryTree, { progress, pool, words, shouldConsoleLog, progressFile, matchesLogFile, tesseractConfig }: {
		progress: Progress;
		pool: WorkerPool;
		} & Pick<ScanOptions, "words" \| "shouldConsoleLog" \| "progressFile" \| "matchesLogFile" \| "tesseractConfig">) => Promise<void>;
		export declare const scanDir: (scannedDir: string, { words, shouldConsoleLog, matchesLogFile, workerPoolSize, tesseractConfig, progressFile }?: ScanOptions) => Promise<Progress>;
		export declare const visitFile: (file: dirTree.DirectoryTree, progress: Progress, pool: WorkerPool, options: Omit<ScanOptions, 'workerPoolSize'>) => Promise<void>;
		export declare const scanDir: (scannedDir: string, options?: ScanOptions) => Promise<Progress>;
		//# sourceMappingURL=index.d.ts.map

179

dist/index.js

		@@ -14,31 +14,17 @@ "use strict";
		let scannedFilesSinceLastSaveProgressCount = 0;
		let visitedCount = 1;
		let totalFilesCount = 0;
		/** Internal function */
		const visitDir = async (dir, { progress, pool, words, shouldConsoleLog, progressFile, matchesLogFile, tesseractConfig }) => {
		if (shouldConsoleLog)
		console.log(`🔍 Scan directory ${dir.path}`);
		const visitDir = async (dir, progress, pool, options) => {
		if (options.shouldConsoleLog)
		console.log(`${' '.repeat(totalFilesCount.toString().length * 2 + 3)} 🔍 Scan directory ${dir.path}`);
		for (const child of dir.children) {
		if (child.type === 'file')
		await (0, exports.visitFile)(child, {
		progress,
		pool,
		words,
		shouldConsoleLog,
		progressFile,
		matchesLogFile: matchesLogFile,
		tesseractConfig
		});
		else
		await (0, exports.visitDir)(child, {
		progress,
		pool,
		words,
		shouldConsoleLog,
		progressFile,
		matchesLogFile: matchesLogFile,
		tesseractConfig
		});
		await (0, exports.visitFile)(child, progress, pool, options);
		else if (child.type === 'directory')
		await (0, exports.visitDir)(child, progress, pool, options);
		}
		await pool.settled(true);
		if (progressFile)
		await (0, utils_1.saveProgress)(progressFile, progress);
		if (options.progressFile)
		await (0, utils_1.saveProgress)(options.progressFile, progress);
		// We do not mark directories as visited in case the user adds new files
		@@ -49,15 +35,23 @@ // in them in the future!
		/** Internal function */
		const visitFile = async (file, { progress, pool, words, shouldConsoleLog, progressFile, matchesLogFile, tesseractConfig }) => {
		const visitFile = async (file, progress, pool, options) => {
		var _a;
		if (file.name === '.gitkeep')
		return;
		if ((_a = options.ignoreExt) === null \|\| _a === void 0 ? void 0 : _a.has(file.extension)) {
		if (options.shouldConsoleLog)
		(0, utils_1.logProgress)(visitedCount, totalFilesCount, `☢️ Ignored .ext ${file.path}`);
		return;
		}
		if (!(0, utils_1.isSupportedExtension)(file.extension)) {
		if (shouldConsoleLog)
		console.log(`👽 Unsupported file ${file.path}`);
		if (options.shouldConsoleLog)
		(0, utils_1.logProgress)(visitedCount, totalFilesCount, `👽 Unsupported file ${file.path}`);
		// Mark as visited
		progress.visited.add(file.path);
		visitedCount++;
		return;
		}
		if (progress.visited.has(file.path)) {
		if (shouldConsoleLog)
		console.log(`⏩ Skip visited ${file.path}`);
		if (options.shouldConsoleLog)
		(0, utils_1.logProgress)(visitedCount, totalFilesCount, `⏩ Skip visited ${file.path}`);
		visitedCount++;
		return;
		@@ -68,33 +62,45 @@ }
		let images = [];
		if (!(await (0, utils_1.isPdfAlreadyExtractedToImages)(file.path))) {
		images = await (0, utils_1.pdfToImages)(file.path);
		if (shouldConsoleLog)
		console.log(`✨ Extracted PDF ${file.path}`);
		let hasAlreadyExtractedPdf = false;
		try {
		if (!(await (0, utils_1.isPdfAlreadyExtractedToImages)(file.path))) {
		let first = undefined;
		let last = undefined;
		if (options.pdfExtractFirst)
		first = options.pdfExtractFirst;
		if (options.pdfExtractLast)
		last = options.pdfExtractLast;
		images = await (0, utils_1.pdfToImages)(file.path, first, last);
		totalFilesCount += images.length;
		if (options.shouldConsoleLog)
		(0, utils_1.logProgress)(visitedCount, totalFilesCount, `✨ Extracted PDF ${file.path}`);
		}
		else {
		images = await (0, utils_1.getPdfExtractedImages)(file.path);
		hasAlreadyExtractedPdf = true;
		if (options.shouldConsoleLog)
		(0, utils_1.logProgress)(visitedCount, totalFilesCount, `📄 PDF is ready ${file.path}`);
		}
		}
		else {
		images = await (0, utils_1.getPdfExtractedImages)(file.path);
		if (shouldConsoleLog)
		console.log(`📄 PDF is ready ${file.path}`);
		catch (error) {
		{
		console.log('💥 ERROR! PDF FAIL! ', file.path);
		console.error(error);
		}
		}
		for (const image of images) {
		// Convert to directoryTree format
		const imageTreeFormat = {
		name: image.name,
		path: image.path,
		size: -1,
		type: 'file',
		extension: '.png'
		};
		await (0, exports.visitFile)(imageTreeFormat, {
		progress,
		pool,
		words,
		shouldConsoleLog,
		progressFile,
		matchesLogFile: matchesLogFile,
		tesseractConfig
		});
		if (!hasAlreadyExtractedPdf) {
		for (const image of images) {
		// Convert to directoryTree format
		const imageTreeFormat = {
		name: image.name,
		path: image.path,
		size: -1,
		type: 'file',
		extension: '.png'
		};
		await (0, exports.visitFile)(imageTreeFormat, progress, pool, options);
		}
		}
		// Mark PDF as visited to not convert it again
		progress.visited.add(file.path);
		visitedCount++;
		return;
		@@ -104,23 +110,27 @@ }
		try {
		const scanRes = await scanFile(file, words, tesseractConfig);
		const scanRes = await scanFile(file, options.words, options.tesseractConfig);
		if (scanRes && scanRes.matches.length > 0) {
		let str = '';
		str += `\n✅ MATCH! ${file.path}\n`;
		str += `Words: ${scanRes.matches.join()}\n`;
		str += `Text:\n${scanRes.text}\n`;
		if (shouldConsoleLog)
		console.log(str);
		str += `✅ MATCH! ${file.path}`;
		if (options.shouldConsoleLog && !options.shouldConsoleLogMatches) {
		(0, utils_1.logProgress)(visitedCount, totalFilesCount, str);
		}
		str += `\nWords: ${scanRes.matches.join()}\n`;
		str += `Text:\n${scanRes.text}`;
		if (options.shouldConsoleLog && options.shouldConsoleLogMatches) {
		(0, utils_1.logProgress)(visitedCount, totalFilesCount, str);
		}
		// Save in the matched Map
		progress.matched.set(file.path, scanRes);
		if (matchesLogFile) {
		await fs_extra_1.default.promises.writeFile(matchesLogFile, `${str}\n----------------\n`, { flag: 'a' });
		if (options.matchesLogFile) {
		await fs_extra_1.default.promises.writeFile(options.matchesLogFile, `${str}\n----------------\n`, { flag: 'a' });
		}
		}
		else {
		if (shouldConsoleLog)
		console.log(`❌ No words matched ${file.path}`);
		if (options.shouldConsoleLog)
		(0, utils_1.logProgress)(visitedCount, totalFilesCount, `❌ No words matched ${file.path}`);
		}
		}
		catch (error) {
		if (shouldConsoleLog) {
		{
		console.log('💥 ERROR! Scan fail ', file.path);
		@@ -133,7 +143,8 @@ console.error(error);
		progress.visited.add(file.path);
		if (progressFile) {
		visitedCount++;
		if (options.progressFile) {
		scannedFilesSinceLastSaveProgressCount++;
		// Save progress every 5 scans
		if (progressFile && scannedFilesSinceLastSaveProgressCount > 5) {
		await (0, utils_1.saveProgress)(progressFile, progress);
		if (options.progressFile && scannedFilesSinceLastSaveProgressCount > 5) {
		await (0, utils_1.saveProgress)(options.progressFile, progress);
		scannedFilesSinceLastSaveProgressCount = 0;
		@@ -145,18 +156,18 @@ }
		exports.visitFile = visitFile;
		const scanDir = async (scannedDir, { words = ['MATCH_ALL'], shouldConsoleLog = false, matchesLogFile, workerPoolSize, tesseractConfig, progressFile } = {}) => {
		const scanDir = async (scannedDir, options = {}) => {
		if (!options.words)
		options.words = ['MATCH_ALL'];
		if (!options.shouldConsoleLog)
		options.shouldConsoleLog = false;
		// Do not use all CPU cores as default, it makes the OCR process way slower!
		if (!workerPoolSize)
		workerPoolSize = os_1.default.cpus().length > 3 ? os_1.default.cpus().length - 2 : 1;
		const pool = (0, threads_1.Pool)(() => (0, threads_1.spawn)(new threads_1.Worker('./worker')), { size: workerPoolSize });
		const progress = await (0, utils_1.loadProgress)(progressFile);
		const dir = await (0, utils_1.getTree)(scannedDir);
		await (0, exports.visitDir)(dir, {
		words,
		progress,
		pool,
		shouldConsoleLog,
		progressFile,
		matchesLogFile: matchesLogFile,
		tesseractConfig
		});
		if (!options.workerPoolSize)
		options.workerPoolSize = os_1.default.cpus().length > 3 ? os_1.default.cpus().length - 2 : 1;
		const pool = (0, threads_1.Pool)(() => (0, threads_1.spawn)(new threads_1.Worker('./worker')), { size: options.workerPoolSize });
		const progress = await (0, utils_1.loadProgress)(options.progressFile);
		const tree = await (0, utils_1.getTree)(scannedDir);
		totalFilesCount = (0, utils_1.getTreeFilesCount)(tree);
		if (tree.type === 'directory')
		await (0, exports.visitDir)(tree, progress, pool, options);
		else if (tree.type === 'file')
		await (0, exports.visitFile)(tree, progress, pool, options);
		await pool.terminate();
		@@ -163,0 +174,0 @@ return progress;

dist/utils.d.ts

		@@ -11,7 +11,7 @@ import path from 'path';
		*/
		words?: string[] \| ['MATCH_ALL'];
		/**
		* Should the logs be printed to the console? (default = false)
		*/
		words?: string[];
		/** Should the logs be printed to the console? (default = false) */
		shouldConsoleLog?: boolean;
		/** Should the matches file content be printed to the console? (default = true) */
		shouldConsoleLogMatches?: boolean;
		/**
		@@ -23,6 +23,8 @@ * If provided, the progress will be saved to a file
		progressFile?: string;
		/**
		* If provided, every file path and their text content that were matched are logged to this file
		*/
		/** If provided, every file path and their text content that were matched are logged to this file */
		matchesLogFile?: string;
		/** File extensions to ignore when looking for files */
		ignoreExt?: Set<string>;
		pdfExtractFirst?: number;
		pdfExtractLast?: number;
		/**
		@@ -64,2 +66,3 @@ * Amount of worker threads to use (default = your total CPU cores - 2)
		export declare const cleanStr: (str: string) => string;
		export declare const logProgress: (visitedCount: number, totalFilesCount: number, str: string) => void;
		/**
		@@ -93,3 +96,3 @@ * @param filePath Path to the image to extract text from
		*/
		export declare const pdfToImages: (filePath: string) => Promise<Array<{
		export declare const pdfToImages: (filePath: string, firstPage?: number \| undefined, lastPage?: number \| undefined) => Promise<Array<{
		name: string;
		@@ -114,3 +117,4 @@ path: string;
		export declare const saveProgress: (progressFile: string, progress: Progress) => Promise<void>;
		export declare const getTreeFilesCount: (tree: dirTree.DirectoryTree) => number;
		export declare const getTree: (scannedDir: string) => Promise<dirTree.DirectoryTree>;
		//# sourceMappingURL=utils.d.ts.map

dist/utils.js

		@@ -6,3 +6,3 @@ "use strict";
		Object.defineProperty(exports, "__esModule", { value: true });
		exports.getTree = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.pdfToImages = exports.getPdfExtractedImages = exports.isPdfAlreadyExtractedToImages = exports.ocr = exports.cleanStr = void 0;
		exports.getTree = exports.getTreeFilesCount = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.pdfToImages = exports.getPdfExtractedImages = exports.isPdfAlreadyExtractedToImages = exports.ocr = exports.logProgress = exports.cleanStr = void 0;
		const fs_extra_1 = __importDefault(require("fs-extra"));
		@@ -19,2 +19,7 @@ const path_1 = __importDefault(require("path"));
		exports.cleanStr = cleanStr;
		const logProgress = (visitedCount, totalFilesCount, str) => {
		const visitedCountPadded = visitedCount.toString().padStart(totalFilesCount.toString().length);
		console.log(`[${visitedCountPadded}/${totalFilesCount}] ${str}`);
		};
		exports.logProgress = logProgress;
		/**
		@@ -73,5 +78,10 @@ * @param filePath Path to the image to extract text from
		*/
		const pdfToImages = async (filePath) => {
		// pdftoppm -png file.pdf output-images-prefix
		await (0, execa_1.default)('pdftoppm', ['-png', filePath, filePath]);
		const pdfToImages = async (filePath, firstPage, lastPage) => {
		// pdftoppm -f 1 -l 5 -png file.pdf output-images-prefix
		const pageParams = [];
		if (firstPage)
		pageParams.push('-f', `${firstPage}`);
		if (lastPage)
		pageParams.push('-l', `${lastPage}`);
		await (0, execa_1.default)('pdftoppm', [...pageParams, '-png', filePath, filePath]);
		return (0, exports.getPdfExtractedImages)(filePath);
		@@ -122,8 +132,28 @@ };
		exports.saveProgress = saveProgress;
		/**
		 * Count the `file` nodes contained in a directory tree.
		 *
		 * A `file` node counts as 1. A `directory` node with children counts as the
		 * sum of its children, visited recursively. Anything else counts as 0.
		 *
		 * @param tree Root node of the tree to count
		 * @returns Number of `file` nodes found under (and including) `tree`
		 */
		const getTreeFilesCount = (tree) => {
			if (tree.type === 'file')
				return 1;
			// Only directories that actually carry children can contribute files
			if (tree.type !== 'directory' || !tree.children)
				return 0;
			return tree.children.reduce((total, child) => total + (0, exports.getTreeFilesCount)(child), 0);
		};
		exports.getTreeFilesCount = getTreeFilesCount;
		const getTree = async (scannedDir) => {
		if (!(await fs_extra_1.default.pathExists(scannedDir)))
		throw new Error('Directory not found');
		return (0, directory_tree_1.default)(scannedDir, { attributes: ['size', 'type', 'extension'] });
		throw new Error('File or directory not found');
		const tree = (0, directory_tree_1.default)(scannedDir, { attributes: ['type', 'extension'] });
		// Convert all relative paths to absolute paths
		const relativeToAbsolute = (tree) => {
		tree.path = path_1.default.resolve(tree.path);
		if (tree.type === 'directory' && tree.children) {
		tree.children.forEach(relativeToAbsolute);
		}
		};
		relativeToAbsolute(tree);
		return tree;
		};
		exports.getTree = getTree;
		//# sourceMappingURL=utils.js.map

package.json

		{
		"name": "bulk-files-ocr-search",
		"version": "0.1.4",
		"version": "0.1.5",
		"description": "🔍 Find files that contain some text with OCR",
		@@ -19,3 +19,4 @@ "license": "MIT",
		"bin": {
		"ocr-search": "bin/cli.mjs"
		"ocr-search": "bin/cli.mjs",
		"ocr-search-clean-extracted": "bin/cleanExtracted.mjs"
		},
		@@ -22,0 +23,0 @@ "main": "dist/index.js",

README.md

		@@ -62,3 +62,5 @@ # Bulk Files OCR Text Finder

		```sh
		```
		$ ocr-search --help

		🔍 Find files that contain some text with OCR
		@@ -73,6 +75,11 @@
		Options
		--progressFile File to save progress to, will start from where it stopped last time by looking there (none="none") [default="progress.json"]
		--matchesLogFile Log all matches to this file (none="none") [default="matches.txt"]
		--no-console-logs Silence console logs
		--workers Amount of worker threads to use (default is total CPU cores count - 2)
		--ignoreExt List of comma-separated file extensions to ignore
		--pdfExtractFirst Range start of the pages to extract from PDF files (1-indexed)
		--pdfExtractLast Range end of the pages to extract from PDF files, last page if overflow (1-indexed)
		--progressFile File to save progress to, will start from where it
		stopped last time by looking there (no file, use "none") [default="progress.json"]
		--matchesLogFile Log all matches to this file (no file, use "none") [default="matches.txt"]
		--no-console-logs Silence all console logs
		--no-show-matches Do not print matched files text content to the console [default="false"]
		--workers Amount of worker threads to use (default is total CPU cores count - 2)

		@@ -91,8 +98,24 @@ OCR Options - See https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc

		Skip .pdf and .webp files
		$ ocr-search --words "wiki,hello" --ignoreExt "pdf,webp" scanned-dir

		Extract only pages 3 to 6 of every PDF file (1-indexed)
		$ ocr-search --words "wiki,hello" --pdfExtractFirst 3 --pdfExtractLast 6 scanned-dir

		Use a specific Tesseract OCR configuration
		$ ocr-search --words "wiki,hello" --lang fra --oem 1 --psm 3 scanned-dir

		Do not save progress and do not log matches to file
		$ ocr-search --words "wiki,hello" --progressFile none --matchesLogFile none scanned-dir
		https://github.com/rigwild/bulk-files-ocr-search
		```

		A second CLI is provided to easily remove all the images extracted from PDF pages.

		```
		$ ocr-search-clean-extracted --help

		🗑️ Find and remove all images from PDF pages extractions

		Usage
		$ ocr-search-clean-extracted <input_files>

		https://github.com/rigwild/bulk-files-ocr-search
		@@ -143,9 +166,10 @@ ```
		*/
		words: string[] \| ['MATCH_ALL']
		words?: string[]

		/**
		* Should the logs be printed to the console? (default = false)
		*/
		/** Should the logs be printed to the console? (default = false) */
		shouldConsoleLog?: boolean

		/** Should the matches file content be printed to the console? (default = true) */
		shouldConsoleLogMatches?: boolean

		/**
		@@ -158,7 +182,14 @@ * If provided, the progress will be saved to a file

		/**
		* If provided, every file path and their text content that were matched are logged to this file
		*/
		/** If provided, every file path and their text content that were matched are logged to this file */
		matchesLogFile?: string

		/** File extensions to ignore when looking for files */
		ignoreExt?: Set<string>

		/* Extract PDF files starting at this page, first page is 1 (1-indexed) (default = 1) */
		pdfExtractFirst?: number

		/* Extract PDF files until this page, last page if overflow (1-indexed) (default = last page of PDF file) */
		pdfExtractLast?: number

		/**
		@@ -172,3 +203,3 @@ * Amount of worker threads to use (default = your total CPU cores - 2)
		/**
		* Tesseract OCR config, will default to `{ lang: 'eng', oem: 1, psm: 1 }`
		* Tesseract OCR config, will default to `{ lang: 'eng', oem: 1, psm: 1 }`
		*
		@@ -180,9 +211,6 @@ * @see https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc

		const scannedDir = path.resolve(__dirname, 'data')
		const words = ['hello', 'match this', '<<<<<']
		const tesseractConfig: TesseractConfig = { lang: 'fra', oem: 1, psm: 1 }

		const scannedDir = path.resolve(__dirname, 'data')
		const progressFile = path.resolve(__dirname, 'progress.json')
		const matchesLogFile = path.resolve(__dirname, 'matches.txt')
		const tesseractConfig: TesseractConfig = { lang: 'eng', oem: 1, psm: 1 }

		console.time('scan')
		@@ -193,4 +221,2 @@
		shouldConsoleLog: true,
		progressFile,
		matchesLogFile,
		tesseractConfig
		@@ -231,3 +257,4 @@ })

		const res = await pdfToImages(filePdf)
		// Extract from page 1 to page 3 (1-indexed)
		const res = await pdfToImages(filePdf, 1, 3)
		console.log(res) // Paths to generated PNG files
		@@ -234,0 +261,0 @@ ```

bin/cli.mjs