bulk-files-ocr-search
Advanced tools
Comparing version
@@ -5,12 +5,6 @@ import dirTree from 'directory-tree'; | ||
/** Internal function */ | ||
export declare const visitDir: (dir: dirTree.DirectoryTree, { progress, pool, words, shouldConsoleLog, progressFile, matchesLogFile, tesseractConfig }: { | ||
progress: Progress; | ||
pool: WorkerPool; | ||
} & Pick<ScanOptions, "words" | "shouldConsoleLog" | "progressFile" | "matchesLogFile" | "tesseractConfig">) => Promise<void>; | ||
export declare const visitDir: (dir: dirTree.DirectoryTree, progress: Progress, pool: WorkerPool, options: Omit<ScanOptions, 'workerPoolSize'>) => Promise<void>; | ||
/** Internal function */ | ||
export declare const visitFile: (file: dirTree.DirectoryTree, { progress, pool, words, shouldConsoleLog, progressFile, matchesLogFile, tesseractConfig }: { | ||
progress: Progress; | ||
pool: WorkerPool; | ||
} & Pick<ScanOptions, "words" | "shouldConsoleLog" | "progressFile" | "matchesLogFile" | "tesseractConfig">) => Promise<void>; | ||
export declare const scanDir: (scannedDir: string, { words, shouldConsoleLog, matchesLogFile, workerPoolSize, tesseractConfig, progressFile }?: ScanOptions) => Promise<Progress>; | ||
export declare const visitFile: (file: dirTree.DirectoryTree, progress: Progress, pool: WorkerPool, options: Omit<ScanOptions, 'workerPoolSize'>) => Promise<void>; | ||
export declare const scanDir: (scannedDir: string, options?: ScanOptions) => Promise<Progress>; | ||
//# sourceMappingURL=index.d.ts.map |
@@ -14,31 +14,17 @@ "use strict"; | ||
let scannedFilesSinceLastSaveProgressCount = 0; | ||
let visitedCount = 1; | ||
let totalFilesCount = 0; | ||
/** Internal function */ | ||
const visitDir = async (dir, { progress, pool, words, shouldConsoleLog, progressFile, matchesLogFile, tesseractConfig }) => { | ||
if (shouldConsoleLog) | ||
console.log(`🔍 Scan directory ${dir.path}`); | ||
const visitDir = async (dir, progress, pool, options) => { | ||
if (options.shouldConsoleLog) | ||
console.log(`${' '.repeat(totalFilesCount.toString().length * 2 + 3)} 🔍 Scan directory ${dir.path}`); | ||
for (const child of dir.children) { | ||
if (child.type === 'file') | ||
await (0, exports.visitFile)(child, { | ||
progress, | ||
pool, | ||
words, | ||
shouldConsoleLog, | ||
progressFile, | ||
matchesLogFile: matchesLogFile, | ||
tesseractConfig | ||
}); | ||
else | ||
await (0, exports.visitDir)(child, { | ||
progress, | ||
pool, | ||
words, | ||
shouldConsoleLog, | ||
progressFile, | ||
matchesLogFile: matchesLogFile, | ||
tesseractConfig | ||
}); | ||
await (0, exports.visitFile)(child, progress, pool, options); | ||
else if (child.type === 'directory') | ||
await (0, exports.visitDir)(child, progress, pool, options); | ||
} | ||
await pool.settled(true); | ||
if (progressFile) | ||
await (0, utils_1.saveProgress)(progressFile, progress); | ||
if (options.progressFile) | ||
await (0, utils_1.saveProgress)(options.progressFile, progress); | ||
// We do not mark directories as visited in case the user adds new files | ||
@@ -49,15 +35,23 @@ // in them in the future! | ||
/** Internal function */ | ||
const visitFile = async (file, { progress, pool, words, shouldConsoleLog, progressFile, matchesLogFile, tesseractConfig }) => { | ||
const visitFile = async (file, progress, pool, options) => { | ||
var _a; | ||
if (file.name === '.gitkeep') | ||
return; | ||
if ((_a = options.ignoreExt) === null || _a === void 0 ? void 0 : _a.has(file.extension)) { | ||
if (options.shouldConsoleLog) | ||
(0, utils_1.logProgress)(visitedCount, totalFilesCount, `☢️ Ignored .ext ${file.path}`); | ||
return; | ||
} | ||
if (!(0, utils_1.isSupportedExtension)(file.extension)) { | ||
if (shouldConsoleLog) | ||
console.log(`👽 Unsupported file ${file.path}`); | ||
if (options.shouldConsoleLog) | ||
(0, utils_1.logProgress)(visitedCount, totalFilesCount, `👽 Unsupported file ${file.path}`); | ||
// Mark as visited | ||
progress.visited.add(file.path); | ||
visitedCount++; | ||
return; | ||
} | ||
if (progress.visited.has(file.path)) { | ||
if (shouldConsoleLog) | ||
console.log(`⏩ Skip visited ${file.path}`); | ||
if (options.shouldConsoleLog) | ||
(0, utils_1.logProgress)(visitedCount, totalFilesCount, `⏩ Skip visited ${file.path}`); | ||
visitedCount++; | ||
return; | ||
@@ -68,33 +62,45 @@ } | ||
let images = []; | ||
if (!(await (0, utils_1.isPdfAlreadyExtractedToImages)(file.path))) { | ||
images = await (0, utils_1.pdfToImages)(file.path); | ||
if (shouldConsoleLog) | ||
console.log(`✨ Extracted PDF ${file.path}`); | ||
let hasAlreadyExtractedPdf = false; | ||
try { | ||
if (!(await (0, utils_1.isPdfAlreadyExtractedToImages)(file.path))) { | ||
let first = undefined; | ||
let last = undefined; | ||
if (options.pdfExtractFirst) | ||
first = options.pdfExtractFirst; | ||
if (options.pdfExtractLast) | ||
last = options.pdfExtractLast; | ||
images = await (0, utils_1.pdfToImages)(file.path, first, last); | ||
totalFilesCount += images.length; | ||
if (options.shouldConsoleLog) | ||
(0, utils_1.logProgress)(visitedCount, totalFilesCount, `✨ Extracted PDF ${file.path}`); | ||
} | ||
else { | ||
images = await (0, utils_1.getPdfExtractedImages)(file.path); | ||
hasAlreadyExtractedPdf = true; | ||
if (options.shouldConsoleLog) | ||
(0, utils_1.logProgress)(visitedCount, totalFilesCount, `📄 PDF is ready ${file.path}`); | ||
} | ||
} | ||
else { | ||
images = await (0, utils_1.getPdfExtractedImages)(file.path); | ||
if (shouldConsoleLog) | ||
console.log(`📄 PDF is ready ${file.path}`); | ||
catch (error) { | ||
{ | ||
console.log('💥 ERROR! PDF FAIL! ', file.path); | ||
console.error(error); | ||
} | ||
} | ||
for (const image of images) { | ||
// Convert to directoryTree format | ||
const imageTreeFormat = { | ||
name: image.name, | ||
path: image.path, | ||
size: -1, | ||
type: 'file', | ||
extension: '.png' | ||
}; | ||
await (0, exports.visitFile)(imageTreeFormat, { | ||
progress, | ||
pool, | ||
words, | ||
shouldConsoleLog, | ||
progressFile, | ||
matchesLogFile: matchesLogFile, | ||
tesseractConfig | ||
}); | ||
if (!hasAlreadyExtractedPdf) { | ||
for (const image of images) { | ||
// Convert to directoryTree format | ||
const imageTreeFormat = { | ||
name: image.name, | ||
path: image.path, | ||
size: -1, | ||
type: 'file', | ||
extension: '.png' | ||
}; | ||
await (0, exports.visitFile)(imageTreeFormat, progress, pool, options); | ||
} | ||
} | ||
// Mark PDF as visited to not convert it again | ||
progress.visited.add(file.path); | ||
visitedCount++; | ||
return; | ||
@@ -104,23 +110,27 @@ } | ||
try { | ||
const scanRes = await scanFile(file, words, tesseractConfig); | ||
const scanRes = await scanFile(file, options.words, options.tesseractConfig); | ||
if (scanRes && scanRes.matches.length > 0) { | ||
let str = ''; | ||
str += `\n✅ MATCH! ${file.path}\n`; | ||
str += `Words: ${scanRes.matches.join()}\n`; | ||
str += `Text:\n${scanRes.text}\n`; | ||
if (shouldConsoleLog) | ||
console.log(str); | ||
str += `✅ MATCH! ${file.path}`; | ||
if (options.shouldConsoleLog && !options.shouldConsoleLogMatches) { | ||
(0, utils_1.logProgress)(visitedCount, totalFilesCount, str); | ||
} | ||
str += `\nWords: ${scanRes.matches.join()}\n`; | ||
str += `Text:\n${scanRes.text}`; | ||
if (options.shouldConsoleLog && options.shouldConsoleLogMatches) { | ||
(0, utils_1.logProgress)(visitedCount, totalFilesCount, str); | ||
} | ||
// Save in the matched Map | ||
progress.matched.set(file.path, scanRes); | ||
if (matchesLogFile) { | ||
await fs_extra_1.default.promises.writeFile(matchesLogFile, `${str}\n----------------\n`, { flag: 'a' }); | ||
if (options.matchesLogFile) { | ||
await fs_extra_1.default.promises.writeFile(options.matchesLogFile, `${str}\n----------------\n`, { flag: 'a' }); | ||
} | ||
} | ||
else { | ||
if (shouldConsoleLog) | ||
console.log(`❌ No words matched ${file.path}`); | ||
if (options.shouldConsoleLog) | ||
(0, utils_1.logProgress)(visitedCount, totalFilesCount, `❌ No words matched ${file.path}`); | ||
} | ||
} | ||
catch (error) { | ||
if (shouldConsoleLog) { | ||
{ | ||
console.log('💥 ERROR! Scan fail ', file.path); | ||
@@ -133,7 +143,8 @@ console.error(error); | ||
progress.visited.add(file.path); | ||
if (progressFile) { | ||
visitedCount++; | ||
if (options.progressFile) { | ||
scannedFilesSinceLastSaveProgressCount++; | ||
// Save progress every 5 scans | ||
if (progressFile && scannedFilesSinceLastSaveProgressCount > 5) { | ||
await (0, utils_1.saveProgress)(progressFile, progress); | ||
if (options.progressFile && scannedFilesSinceLastSaveProgressCount > 5) { | ||
await (0, utils_1.saveProgress)(options.progressFile, progress); | ||
scannedFilesSinceLastSaveProgressCount = 0; | ||
@@ -145,18 +156,18 @@ } | ||
exports.visitFile = visitFile; | ||
const scanDir = async (scannedDir, { words = ['MATCH_ALL'], shouldConsoleLog = false, matchesLogFile, workerPoolSize, tesseractConfig, progressFile } = {}) => { | ||
const scanDir = async (scannedDir, options = {}) => { | ||
if (!options.words) | ||
options.words = ['MATCH_ALL']; | ||
if (!options.shouldConsoleLog) | ||
options.shouldConsoleLog = false; | ||
// Do not use all CPU cores as default, it makes the OCR process way slower! | ||
if (!workerPoolSize) | ||
workerPoolSize = os_1.default.cpus().length > 3 ? os_1.default.cpus().length - 2 : 1; | ||
const pool = (0, threads_1.Pool)(() => (0, threads_1.spawn)(new threads_1.Worker('./worker')), { size: workerPoolSize }); | ||
const progress = await (0, utils_1.loadProgress)(progressFile); | ||
const dir = await (0, utils_1.getTree)(scannedDir); | ||
await (0, exports.visitDir)(dir, { | ||
words, | ||
progress, | ||
pool, | ||
shouldConsoleLog, | ||
progressFile, | ||
matchesLogFile: matchesLogFile, | ||
tesseractConfig | ||
}); | ||
if (!options.workerPoolSize) | ||
options.workerPoolSize = os_1.default.cpus().length > 3 ? os_1.default.cpus().length - 2 : 1; | ||
const pool = (0, threads_1.Pool)(() => (0, threads_1.spawn)(new threads_1.Worker('./worker')), { size: options.workerPoolSize }); | ||
const progress = await (0, utils_1.loadProgress)(options.progressFile); | ||
const tree = await (0, utils_1.getTree)(scannedDir); | ||
totalFilesCount = (0, utils_1.getTreeFilesCount)(tree); | ||
if (tree.type === 'directory') | ||
await (0, exports.visitDir)(tree, progress, pool, options); | ||
else if (tree.type === 'file') | ||
await (0, exports.visitFile)(tree, progress, pool, options); | ||
await pool.terminate(); | ||
@@ -163,0 +174,0 @@ return progress; |
@@ -11,7 +11,7 @@ import path from 'path'; | ||
*/ | ||
words?: string[] | ['MATCH_ALL']; | ||
/** | ||
* Should the logs be printed to the console? (default = false) | ||
*/ | ||
words?: string[]; | ||
/** Should the logs be printed to the console? (default = false) */ | ||
shouldConsoleLog?: boolean; | ||
/** Should the matches file content be printed to the console? (default = true) */ | ||
shouldConsoleLogMatches?: boolean; | ||
/** | ||
@@ -23,6 +23,8 @@ * If provided, the progress will be saved to a file | ||
progressFile?: string; | ||
/** | ||
* If provided, every file path and their text content that were matched are logged to this file | ||
*/ | ||
/** If provided, every file path and their text content that were matched are logged to this file */ | ||
matchesLogFile?: string; | ||
/** File extensions to ignore when looking for files */ | ||
ignoreExt?: Set<string>; | ||
pdfExtractFirst?: number; | ||
pdfExtractLast?: number; | ||
/** | ||
@@ -64,2 +66,3 @@ * Amount of worker threads to use (default = your total CPU cores - 2) | ||
export declare const cleanStr: (str: string) => string; | ||
export declare const logProgress: (visitedCount: number, totalFilesCount: number, str: string) => void; | ||
/** | ||
@@ -93,3 +96,3 @@ * @param filePath Path to the image to extract text from | ||
*/ | ||
export declare const pdfToImages: (filePath: string) => Promise<Array<{ | ||
export declare const pdfToImages: (filePath: string, firstPage?: number | undefined, lastPage?: number | undefined) => Promise<Array<{ | ||
name: string; | ||
@@ -114,3 +117,4 @@ path: string; | ||
export declare const saveProgress: (progressFile: string, progress: Progress) => Promise<void>; | ||
export declare const getTreeFilesCount: (tree: dirTree.DirectoryTree) => number; | ||
export declare const getTree: (scannedDir: string) => Promise<dirTree.DirectoryTree>; | ||
//# sourceMappingURL=utils.d.ts.map |
@@ -6,3 +6,3 @@ "use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.getTree = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.pdfToImages = exports.getPdfExtractedImages = exports.isPdfAlreadyExtractedToImages = exports.ocr = exports.cleanStr = void 0; | ||
exports.getTree = exports.getTreeFilesCount = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.pdfToImages = exports.getPdfExtractedImages = exports.isPdfAlreadyExtractedToImages = exports.ocr = exports.logProgress = exports.cleanStr = void 0; | ||
const fs_extra_1 = __importDefault(require("fs-extra")); | ||
@@ -19,2 +19,7 @@ const path_1 = __importDefault(require("path")); | ||
exports.cleanStr = cleanStr; | ||
/**
 * Print a message to the console prefixed with a `[visited/total]` counter.
 *
 * @param visitedCount Value shown on the left of the slash
 * @param totalFilesCount Value shown on the right of the slash; its digit count sets the counter width
 * @param str Message printed after the counter
 */
const logProgress = (visitedCount, totalFilesCount, str) => { | ||
// Left-pad the counter (spaces are the `padStart` default) to as many characters as the total has digits
const visitedCountPadded = visitedCount.toString().padStart(totalFilesCount.toString().length); | ||
console.log(`[${visitedCountPadded}/${totalFilesCount}] ${str}`); | ||
}; | ||
exports.logProgress = logProgress; | ||
/** | ||
@@ -73,5 +78,10 @@ * @param filePath Path to the image to extract text from | ||
*/ | ||
const pdfToImages = async (filePath) => { | ||
// pdftoppm -png file.pdf output-images-prefix | ||
await (0, execa_1.default)('pdftoppm', ['-png', filePath, filePath]); | ||
const pdfToImages = async (filePath, firstPage, lastPage) => { | ||
// pdftoppm -f 1 -l 5 -png file.pdf output-images-prefix | ||
const pageParams = []; | ||
if (firstPage) | ||
pageParams.push('-f', `${firstPage}`); | ||
if (lastPage) | ||
pageParams.push('-l', `${lastPage}`); | ||
await (0, execa_1.default)('pdftoppm', [...pageParams, '-png', filePath, filePath]); | ||
return (0, exports.getPdfExtractedImages)(filePath); | ||
@@ -122,8 +132,28 @@ }; | ||
exports.saveProgress = saveProgress; | ||
/**
 * Recursively count the nodes of a tree whose `type` is `'file'`.
 *
 * @param tree Node to count from; a `'file'` node counts as 1, a `'directory'` node
 * contributes the sum of its children's counts
 * @returns Number of `'file'` nodes found; 0 for a directory without `children` or any other node type
 */
const getTreeFilesCount = (tree) => { | ||
let count = 0; | ||
if (tree.type === 'file') { | ||
count++; | ||
} | ||
// Directories themselves are not counted, only the files found beneath them
else if (tree.type === 'directory' && tree.children) { | ||
tree.children.forEach(x => (count += (0, exports.getTreeFilesCount)(x))); | ||
} | ||
return count; | ||
}; | ||
exports.getTreeFilesCount = getTreeFilesCount; | ||
const getTree = async (scannedDir) => { | ||
if (!(await fs_extra_1.default.pathExists(scannedDir))) | ||
throw new Error('Directory not found'); | ||
return (0, directory_tree_1.default)(scannedDir, { attributes: ['size', 'type', 'extension'] }); | ||
throw new Error('File or directory not found'); | ||
const tree = (0, directory_tree_1.default)(scannedDir, { attributes: ['type', 'extension'] }); | ||
// Convert all relative paths to absolute paths | ||
const relativeToAbsolute = (tree) => { | ||
tree.path = path_1.default.resolve(tree.path); | ||
if (tree.type === 'directory' && tree.children) { | ||
tree.children.forEach(relativeToAbsolute); | ||
} | ||
}; | ||
relativeToAbsolute(tree); | ||
return tree; | ||
}; | ||
exports.getTree = getTree; | ||
//# sourceMappingURL=utils.js.map |
{ | ||
"name": "bulk-files-ocr-search", | ||
"version": "0.1.4", | ||
"version": "0.1.5", | ||
"description": "🔍 Find files that contain some text with OCR", | ||
@@ -19,3 +19,4 @@ "license": "MIT", | ||
"bin": { | ||
"ocr-search": "bin/cli.mjs" | ||
"ocr-search": "bin/cli.mjs", | ||
"ocr-search-clean-extracted": "bin/cleanExtracted.mjs" | ||
}, | ||
@@ -22,0 +23,0 @@ "main": "dist/index.js", |
@@ -62,3 +62,5 @@ # Bulk Files OCR Text Finder | ||
```sh | ||
``` | ||
$ ocr-search --help | ||
🔍 Find files that contain some text with OCR | ||
@@ -73,6 +75,11 @@ | ||
Options | ||
--progressFile File to save progress to, will start from where it stopped last time by looking there (none="none") [default="progress.json"] | ||
--matchesLogFile Log all matches to this file (none="none") [default="matches.txt"] | ||
--no-console-logs Silence console logs | ||
--workers Amount of worker threads to use (default is total CPU cores count - 2) | ||
--ignoreExt List of comma-separated file extensions to ignore | ||
--pdfExtractFirst Range start of the pages to extract from PDF files (1-indexed) | ||
--pdfExtractLast Range end of the pages to extract from PDF files, last page if overflow (1-indexed) | ||
--progressFile File to save progress to, will start from where it | ||
stopped last time by looking there (no file, use "none") [default="progress.json"] | ||
--matchesLogFile Log all matches to this file (no file, use "none") [default="matches.txt"] | ||
--no-console-logs Silence all console logs | ||
--no-show-matches Do not print matched files text content to the console [default="false"] | ||
--workers Amount of worker threads to use (default is total CPU cores count - 2) | ||
@@ -91,8 +98,24 @@ OCR Options - See https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc | ||
Skip .pdf and .webp files | ||
$ ocr-search --words "wiki,hello" --ignoreExt "pdf,webp" scanned-dir | ||
Extract only page 3 to 6 in all PDF files (1-indexed) | ||
$ ocr-search --words "wiki,hello" --pdfExtractFirst 3 --pdfExtractLast 6 scanned-dir | ||
Use a specific Tesseract OCR configuration | ||
$ ocr-search --words "wiki,hello" --lang fra --oem 1 --psm 3 scanned-dir | ||
Do not save progress and do not log matches to file | ||
$ ocr-search --words "wiki,hello" --progressFile none --matchesLogFile none scanned-dir | ||
https://github.com/rigwild/bulk-files-ocr-search | ||
``` | ||
Another CLI is provided to easily remove all extracted PDF pages images. | ||
``` | ||
$ ocr-search-clean-extracted --help | ||
🗑️ Find and remove all images from PDF pages extractions | ||
Usage | ||
$ ocr-search-clean-extracted <input_files> | ||
https://github.com/rigwild/bulk-files-ocr-search | ||
@@ -143,9 +166,10 @@ ``` | ||
*/ | ||
words: string[] | ['MATCH_ALL'] | ||
words?: string[] | ||
/** | ||
* Should the logs be printed to the console? (default = false) | ||
*/ | ||
/** Should the logs be printed to the console? (default = false) */ | ||
shouldConsoleLog?: boolean | ||
/** Should the matches file content be printed to the console? (default = true) */ | ||
shouldConsoleLogMatches?: boolean | ||
/** | ||
@@ -158,7 +182,14 @@ * If provided, the progress will be saved to a file | ||
/** | ||
* If provided, every file path and their text content that were matched are logged to this file | ||
*/ | ||
/** If provided, every file path and their text content that were matched are logged to this file */ | ||
matchesLogFile?: string | ||
/** File extensions to ignore when looking for files */ | ||
ignoreExt?: Set<string> | ||
/** Extract PDF files starting at this page, first page is 1 (1-indexed) (default = 1) */ | ||
pdfExtractFirst?: number | ||
/** Extract PDF files until this page, last page if overflow (1-indexed) (default = last page of PDF file) */ | ||
pdfExtractLast?: number | ||
/** | ||
@@ -172,3 +203,3 @@ * Amount of worker threads to use (default = your total CPU cores - 2) | ||
/** | ||
* Tesseract OCR config, will default to `{ lang: 'eng', oem: 1, psm: 1 }` | ||
* Tesseract OCR config, will default to `{ lang: 'eng', oem: 1, psm: 1 }` | ||
* | ||
@@ -180,9 +211,6 @@ * @see https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc | ||
const scannedDir = path.resolve(__dirname, 'data') | ||
const words = ['hello', 'match this', '<<<<<'] | ||
const tesseractConfig: TesseractConfig = { lang: 'fra', oem: 1, psm: 1 } | ||
const scannedDir = path.resolve(__dirname, 'data') | ||
const progressFile = path.resolve(__dirname, 'progress.json') | ||
const matchesLogFile = path.resolve(__dirname, 'matches.txt') | ||
const tesseractConfig: TesseractConfig = { lang: 'eng', oem: 1, psm: 1 } | ||
console.time('scan') | ||
@@ -193,4 +221,2 @@ | ||
shouldConsoleLog: true, | ||
progressFile, | ||
matchesLogFile, | ||
tesseractConfig | ||
@@ -231,3 +257,4 @@ }) | ||
const res = await pdfToImages(filePdf) | ||
// Extract from page 1 to page 3 (1-indexed) | ||
const res = await pdfToImages(filePdf, 1, 3) | ||
console.log(res) // Paths to generated PNG files | ||
@@ -234,0 +261,0 @@ ``` |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
108433
8.66%22
4.76%614
19.69%257
11.74%4
33.33%