bulk-files-ocr-search
Advanced tools
Comparing version
import dirTree from 'directory-tree'; | ||
import { Progress, ocr, ScanOptions, WorkerPool } from './utils'; | ||
export { Progress, ScanOptions, ocr }; | ||
import { Progress, ocr, ScanOptions, WorkerPool, pdfToImages } from './utils'; | ||
export { Progress, ScanOptions, ocr, pdfToImages }; | ||
/** Internal function */ | ||
@@ -13,3 +13,4 @@ export declare const visitDir: (dir: dirTree.DirectoryTree, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig }: { | ||
pool: WorkerPool; | ||
} & Pick<ScanOptions, "words" | "shouldConsoleLog" | "outputLogFile" | "tesseractConfig">) => void; | ||
export declare const scanDir: (scannedDir: string, { words, shouldConsoleLog, progressFile, outputLogFile, workerPoolSize, tesseractConfig }?: ScanOptions) => Promise<Progress>; | ||
} & Pick<ScanOptions, "words" | "shouldConsoleLog" | "outputLogFile" | "tesseractConfig">) => Promise<void>; | ||
export declare const scanDir: (scannedDir: string, { words, shouldConsoleLog, outputLogFile, workerPoolSize, tesseractConfig, progressFile }?: ScanOptions) => Promise<Progress>; | ||
//# sourceMappingURL=index.d.ts.map |
@@ -6,3 +6,3 @@ "use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.scanDir = exports.visitFile = exports.visitDir = exports.ocr = void 0; | ||
exports.scanDir = exports.visitFile = exports.visitDir = exports.pdfToImages = exports.ocr = void 0; | ||
const os_1 = __importDefault(require("os")); | ||
@@ -13,2 +13,3 @@ const fs_extra_1 = __importDefault(require("fs-extra")); | ||
Object.defineProperty(exports, "ocr", { enumerable: true, get: function () { return utils_1.ocr; } }); | ||
Object.defineProperty(exports, "pdfToImages", { enumerable: true, get: function () { return utils_1.pdfToImages; } }); | ||
/** Internal function */ | ||
@@ -20,5 +21,5 @@ const visitDir = async (dir, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig }) => { | ||
if (child.type === 'file') | ||
(0, exports.visitFile)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }); | ||
await (0, exports.visitFile)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }); | ||
else | ||
await (0, exports.visitDir)(child, { progress, pool, words, shouldConsoleLog, progressFile, outputLogFile, tesseractConfig }); | ||
await (0, exports.visitDir)(child, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig, progressFile }); | ||
} | ||
@@ -33,5 +34,12 @@ await pool.settled(true); | ||
/** Internal function */ | ||
const visitFile = (file, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }) => { | ||
const visitFile = async (file, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }) => { | ||
if (file.name === '.gitkeep') | ||
return; | ||
if (!(0, utils_1.isSupportedExtension)(file.extension)) { | ||
if (shouldConsoleLog) | ||
console.log(`š½ Unsupported file ${file.path}`); | ||
// Mark as visited | ||
progress.visited.add(file.path); | ||
return; | ||
} | ||
if (progress.visited.has(file.path)) { | ||
@@ -42,10 +50,24 @@ if (shouldConsoleLog) | ||
} | ||
// Convert PDF pages to images | ||
if (file.extension === '.pdf') { | ||
// return | ||
const images = await (0, utils_1.pdfToImages)(file.path); | ||
if (shouldConsoleLog) | ||
console.log(`⨠Extracted PDF ${file.path}`); | ||
for (const image of images) { | ||
// Convert to directoryTree format | ||
const imageTreeFormat = { | ||
name: image.name, | ||
path: image.path, | ||
size: -1, | ||
type: 'file', | ||
extension: '.webp' | ||
}; | ||
await (0, exports.visitFile)(imageTreeFormat, { progress, pool, words, shouldConsoleLog, outputLogFile, tesseractConfig }); | ||
} | ||
// Mark PDF as visited to not convert it again | ||
progress.visited.add(file.path); | ||
return; | ||
} | ||
pool.queue(async ({ scanFile }) => { | ||
if (!(0, utils_1.isSupportedExtension)(file.extension)) { | ||
if (shouldConsoleLog) | ||
console.log(`š½ Unsupported file ${file.path}`); | ||
// Mark as visited | ||
progress.visited.add(file.path); | ||
return; | ||
} | ||
try { | ||
@@ -83,3 +105,3 @@ const scanRes = await scanFile(file, words, tesseractConfig); | ||
exports.visitFile = visitFile; | ||
const scanDir = async (scannedDir, { words = ['MATCH_ALL'], shouldConsoleLog = false, progressFile, outputLogFile, workerPoolSize, tesseractConfig } = {}) => { | ||
const scanDir = async (scannedDir, { words = ['MATCH_ALL'], shouldConsoleLog = false, outputLogFile, workerPoolSize, tesseractConfig, progressFile } = {}) => { | ||
// Do not use all CPU cores as default, it makes the OCR process way slower! | ||
@@ -96,1 +118,2 @@ if (!workerPoolSize) | ||
exports.scanDir = scanDir; | ||
//# sourceMappingURL=index.js.map |
@@ -0,1 +1,2 @@ | ||
import path from 'path'; | ||
import { recognize as tesseractRecognize } from 'node-tesseract-ocr'; | ||
@@ -32,3 +33,3 @@ import dirTree from 'directory-tree'; | ||
/** | ||
* Tesseract OCR config, will default to english language | ||
* Tesseract OCR config, will default to english language `{ lang: 'eng' }` | ||
* | ||
@@ -63,3 +64,3 @@ * @see https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc | ||
/** | ||
* @param path Path to the image to extract text from | ||
* @param filePath Path to the image to extract text from | ||
* @param tesseractConfig Tesseract configuration | ||
@@ -69,4 +70,13 @@ * @param shouldCleanStr Should the string be normalized (lowercase, accents removed, whitespace removed) | ||
*/ | ||
export declare const ocr: (path: string, tesseractConfig?: TesseractConfig, shouldCleanStr?: boolean) => Promise<string>; | ||
export declare const ocr: (filePath: string, tesseractConfig?: TesseractConfig, shouldCleanStr?: boolean) => Promise<string>; | ||
/** | ||
* Extract all the pages of a PDF to images | ||
* @param filePath Path to the PDF to be converted | ||
* @returns List of generated output images | ||
*/ | ||
export declare const pdfToImages: (filePath: string) => Promise<Array<{ | ||
name: string; | ||
path: string; | ||
}>>; | ||
/** | ||
* Find all words that were matched in text | ||
@@ -88,1 +98,2 @@ * | ||
export declare const getTree: (scannedDir: string) => Promise<dirTree.DirectoryTree>; | ||
//# sourceMappingURL=utils.d.ts.map |
@@ -6,5 +6,7 @@ "use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.getTree = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.ocr = exports.cleanStr = void 0; | ||
exports.getTree = exports.saveProgress = exports.loadProgress = exports.scanFile = exports.isSupportedExtension = exports.findMatches = exports.pdfToImages = exports.ocr = exports.cleanStr = void 0; | ||
const fs_extra_1 = __importDefault(require("fs-extra")); | ||
const path_1 = __importDefault(require("path")); | ||
const node_tesseract_ocr_1 = require("node-tesseract-ocr"); | ||
const execa_1 = __importDefault(require("execa")); | ||
const directory_tree_1 = __importDefault(require("directory-tree")); | ||
@@ -18,3 +20,3 @@ const cleanStr = (str) => str | ||
/** | ||
* @param path Path to the image to extract text from | ||
* @param filePath Path to the image to extract text from | ||
* @param tesseractConfig Tesseract configuration | ||
@@ -24,3 +26,3 @@ * @param shouldCleanStr Should the string be normalized (lowercase, accents removed, whitespace removed) | ||
*/ | ||
const ocr = async (path, tesseractConfig = {}, shouldCleanStr = true) => { | ||
const ocr = async (filePath, tesseractConfig = {}, shouldCleanStr = true) => { | ||
// Apply default options | ||
@@ -31,5 +33,6 @@ if (!tesseractConfig.lang) | ||
tesseractConfig.oem = 1; | ||
// PSM 1 seems to output better result for rotated content | ||
if (!tesseractConfig.psm) | ||
tesseractConfig.psm = 3; | ||
const text = await (0, node_tesseract_ocr_1.recognize)(path, tesseractConfig); | ||
tesseractConfig.psm = 1; | ||
const text = await (0, node_tesseract_ocr_1.recognize)(filePath, tesseractConfig); | ||
return text | ||
@@ -43,2 +46,18 @@ .split('\n') | ||
/**
 * Extract all the pages of a PDF to PNG images with Poppler's `pdftoppm`.
 *
 * The PDF path itself is passed as the output prefix, so the images are
 * written next to the PDF as `<pdf file name>-<page number>.png`.
 *
 * @param filePath Path to the PDF to be converted
 * @returns List of generated output images (file name and absolute path)
 */
const pdfToImages = async (filePath) => {
    const fileName = path_1.default.basename(filePath);
    // The images are generated beside the PDF, not in the current working directory
    const outputDir = path_1.default.dirname(filePath);
    // pdftoppm -png file.pdf output-images-prefix
    await (0, execa_1.default)('pdftoppm', ['-png', filePath, filePath]);
    // Find the list of created files (we don't know how many pages are in the pdf!)
    const files = await fs_extra_1.default.readdir(outputDir);
    // Only keep the page images of this PDF: unrelated siblings that merely share
    // the file name as a prefix (e.g. `doc.pdf.bak`) must not be returned
    const pagePrefix = `${fileName}-`;
    return files
        .filter(x => x.startsWith(pagePrefix) && x.endsWith('.png'))
        .map(x => ({ name: x, path: path_1.default.resolve(outputDir, x) }));
};
exports.pdfToImages = pdfToImages; | ||
/** | ||
* Find all words that were matched in text | ||
@@ -58,4 +77,3 @@ * | ||
exports.findMatches = findMatches; | ||
// TODO: Support PDF | ||
const isSupportedExtension = (ext) => ['.jpg', '.png', '.webp'].some(x => ext.toLowerCase() === x); | ||
/**
 * Tell whether a file extension (with its leading dot) can be scanned.
 * The comparison is case-insensitive so `.JPG` or `.PDF` are accepted too;
 * anything that is not a string is reported as unsupported.
 */
const isSupportedExtension = (ext) => typeof ext === 'string' && ['.jpg', '.jpeg', '.png', '.webp', '.pdf'].includes(ext.toLowerCase());
exports.isSupportedExtension = isSupportedExtension; | ||
@@ -93,1 +111,2 @@ const scanFile = async (file, words, tesseractConfig) => { | ||
exports.getTree = getTree; | ||
//# sourceMappingURL=utils.js.map |
export {}; | ||
//# sourceMappingURL=worker.d.ts.map |
@@ -6,1 +6,2 @@ "use strict"; | ||
(0, worker_1.expose)({ scanFile: utils_1.scanFile }); | ||
//# sourceMappingURL=worker.js.map |
{ | ||
"name": "bulk-files-ocr-search", | ||
"version": "0.1.0", | ||
"version": "0.1.1", | ||
"description": "Find files that contain some text with OCR", | ||
@@ -18,4 +18,10 @@ "license": "MIT", | ||
"types": "dist/index.d.ts", | ||
"scripts": { | ||
"build": "tsc", | ||
"test": "tsc && ava", | ||
"test:ava": "ava" | ||
}, | ||
"dependencies": { | ||
"directory-tree": "^3.0.1", | ||
"execa": "^5.1.1", | ||
"fs-extra": "^10.0.0", | ||
@@ -51,8 +57,3 @@ "node-tesseract-ocr": "^2.2.1", | ||
"verbose": true | ||
}, | ||
"scripts": { | ||
"build": "tsc", | ||
"test": "tsc && ava", | ||
"test:ava": "ava" | ||
} | ||
} | ||
} |
@@ -12,3 +12,3 @@ # Bulk Files OCR Text Finder | ||
- Images: JPG, PNG, [WebP](https://en.wikipedia.org/wiki/WebP) | ||
- Images: JPEG, PNG, [WebP](https://en.wikipedia.org/wiki/WebP) | ||
- Documents: PDF | ||
@@ -21,3 +21,3 @@ | ||
[Tesseract OCR](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc) is used internally. | ||
[Tesseract OCR](https://github.com/tesseract-ocr/tesseract) is used internally ([Tesseract Documentation](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc)). For PDF to PNG conversion, [Poppler](https://poppler.freedesktop.org/) is used. | ||
@@ -28,3 +28,5 @@ This package uses [worker threads](https://nodejs.org/api/worker_threads.html) to make use of your CPU cores and be faster. | ||
- The OCR will only provide relevant results if your files are in a proper orientation (text is horizontal and not upside-down). | ||
- The OCR will provide bad results for rotated files/non-straight text. | ||
- 90/180 degree rotations seem to produce good results | ||
- You may want to pre-process your files somehow to make the text straight! | ||
- Files will be matched if at least 1 of the words is found in the text contained in it. | ||
@@ -34,10 +36,12 @@ | ||
No matter how you decide to use this package, you need to install [Tesseract OCR](https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc) anyway. | ||
No matter how you decide to use this package, you need to install Tesseract OCR. If you have PDF files, Poppler must also be installed so they can be converted to images: | ||
```sh | ||
# OCR Package (non-linux, see https://github.com/tesseract-ocr/tesseract#installing-tesseract) | ||
sudo apt install tesseract-ocr | ||
# PDF to image conversion command-line tools (for Windows, see https://stackoverflow.com/a/53960829 - on macOS use `brew install poppler`) | ||
sudo apt install poppler-utils | ||
``` | ||
See [Installing Tesseract](https://github.com/tesseract-ocr/tesseract#installing-tesseract). | ||
### OCR Language | ||
@@ -58,4 +62,3 @@ | ||
cd bulk-files-ocr-search | ||
# npm install -D | ||
pnpm install | ||
pnpm install # or npm install -D | ||
pnpm build | ||
@@ -120,3 +123,3 @@ ``` | ||
/** | ||
* Tesseract OCR config, will default to english language | ||
* Tesseract OCR config, will default to english language `{ lang: 'eng' }` | ||
* | ||
@@ -149,3 +152,3 @@ * @see https://github.com/tesseract-ocr/tesseract/blob/main/doc/tesseract.1.asc | ||
The standalone OCR function is also exported. | ||
The standalone OCR and PDF to images functions are also exported. | ||
@@ -159,9 +162,23 @@ ```ts | ||
// Tesseract configuration | ||
const tesseractConfig: TesseractConfig = {} | ||
const tesseractConfig: TesseractConfig = { | ||
lang: 'eng' | ||
} | ||
// Should the string be normalized (lowercase, accents removed, whitespace removed) | ||
const shouldCleanStr: boolean = true | ||
const shouldCleanStr: boolean | undefined = true | ||
// OCR | ||
const text = await ocr(file, tesseractConfig, shouldCleanStr) | ||
console.log(text) | ||
// --- | ||
const filePdf = path.resolve(__dirname, '..', 'test', '_testFiles', 'sample.pdf') | ||
// Extracted pages output format | ||
const format: string | undefined = 'jpg' | ||
// PDF to images | ||
const res = await pdfToImages(filePdf, format) | ||
console.log(res) // Files are generated on the file system, 1 file per page | ||
``` | ||
@@ -168,0 +185,0 @@ |
Sorry, the diff of this file is not supported yet
79245
29.01%20
100%346
24.01%181
10.37%5
25%+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added