tesseract.js
Advanced tools
Comparing version 3.0.3 to 4.0.0
# API | ||
- [createWorker()](#create-worker) | ||
- [Worker.load](#worker-load) | ||
- [Worker.writeText](#worker-writeText) | ||
@@ -56,3 +55,3 @@ - [Worker.readText](#worker-readText) | ||
const { createWorker } = Tesseract; | ||
const worker = createWorker({ | ||
const worker = await createWorker({ | ||
langPath: '...', | ||
@@ -67,3 +66,2 @@ logger: m => console.log(m), | ||
- load | ||
- FS functions // optional | ||
@@ -87,19 +85,2 @@ - loadLanguauge | ||
<a name="worker-load"></a> | ||
### Worker.load(jobId): Promise | ||
Worker.load() loads the tesseract.js-core scripts (downloading them from a remote source if they are not present), making the Web Worker/Child Process ready for the next action. | ||
**Arguments:** | ||
- `jobId` Please see details above | ||
**Examples:** | ||
```javascript | ||
(async () => { | ||
await worker.load(); | ||
})(); | ||
``` | ||
<a name="worker-writeText"></a> | ||
@@ -231,3 +212,3 @@ ### Worker.writeText(path, text, jobId): Promise | ||
**Supported Parameters:** | ||
**Useful Parameters:** | ||
@@ -241,8 +222,5 @@ | name | type | default value | description | | ||
| user\_defined\_dpi | string | '' | Defines a custom dpi; use it to fix **Warning: Invalid resolution 0 dpi. Using 70 instead.** | | ||
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | | ||
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | | ||
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result | | ||
| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result | | ||
| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result | | ||
This list is incomplete. As Tesseract.js passes parameters to the Tesseract engine, all parameters supported by the underlying version of Tesseract should also be supported by Tesseract.js. (Note that parameters marked as “init only” in Tesseract documentation cannot be set by `setParameters` or `recognize`.) | ||
**Examples:** | ||
@@ -270,4 +248,5 @@ | ||
- `image` see [Image Format](./image-format.md) for more details. | ||
- `options` a object of customized options | ||
- `options` an object of customized options | ||
- `rectangle` an object specifying the region of the image you want recognized; it should contain top, left, width and height (see example below). | ||
- `output` an object specifying which output formats to return (by default `text`, `blocks`, `hocr`, and `tsv` are returned) | ||
- `jobId` Please see details above | ||
@@ -282,4 +261,3 @@ | ||
(async () => { | ||
const worker = createWorker(); | ||
await worker.load(); | ||
const worker = await createWorker(); | ||
await worker.loadLanguage('eng'); | ||
@@ -297,4 +275,3 @@ await worker.initialize('eng'); | ||
(async () => { | ||
const worker = createWorker(); | ||
await worker.load(); | ||
const worker = await createWorker(); | ||
await worker.loadLanguage('eng'); | ||
@@ -324,4 +301,3 @@ await worker.initialize('eng'); | ||
(async () => { | ||
const worker = createWorker(); | ||
await worker.load(); | ||
const worker = await createWorker(); | ||
await worker.loadLanguage('eng'); | ||
@@ -373,3 +349,3 @@ await worker.initialize('eng'); | ||
const scheduler = createScheduler(); | ||
const worker = createWorker(); | ||
const worker = await createWorker(); | ||
scheduler.addWorker(worker); | ||
@@ -376,0 +352,0 @@ ``` |
@@ -10,6 +10,5 @@ # Tesseract.js Examples | ||
const worker = createWorker(); | ||
const worker = await createWorker(); | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng'); | ||
@@ -28,3 +27,3 @@ await worker.initialize('eng'); | ||
const worker = createWorker({ | ||
const worker = await createWorker({ | ||
logger: m => console.log(m), // Add logger here | ||
@@ -34,3 +33,2 @@ }); | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng'); | ||
@@ -49,6 +47,5 @@ await worker.initialize('eng'); | ||
const worker = createWorker(); | ||
const worker = await createWorker(); | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng+chi_tra'); | ||
@@ -61,3 +58,3 @@ await worker.initialize('eng+chi_tra'); | ||
``` | ||
### with whitelist char (^2.0.0-beta.1) | ||
### with whitelist char | ||
@@ -67,6 +64,5 @@ ```javascript | ||
const worker = createWorker(); | ||
const worker = await createWorker(); | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng'); | ||
@@ -83,3 +79,3 @@ await worker.initialize('eng'); | ||
### with different pageseg mode (^2.0.0-beta.1) | ||
### with different pageseg mode | ||
@@ -91,6 +87,5 @@ Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163 | ||
const worker = createWorker(); | ||
const worker = await createWorker(); | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng'); | ||
@@ -107,3 +102,3 @@ await worker.initialize('eng'); | ||
### with pdf output (^2.0.0-beta.1) | ||
### with pdf output | ||
@@ -122,7 +117,6 @@ Please check **examples** folder for details. | ||
const worker = createWorker(); | ||
const worker = await createWorker(); | ||
const rectangle = { left: 0, top: 0, width: 500, height: 250 }; | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng'); | ||
@@ -141,3 +135,3 @@ await worker.initialize('eng'); | ||
const worker = createWorker(); | ||
const worker = await createWorker(); | ||
const rectangles = [ | ||
@@ -159,3 +153,2 @@ { | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng'); | ||
@@ -179,4 +172,4 @@ await worker.initialize('eng'); | ||
const scheduler = createScheduler(); | ||
const worker1 = createWorker(); | ||
const worker2 = createWorker(); | ||
const worker1 = await createWorker(); | ||
const worker2 = await createWorker(); | ||
const rectangles = [ | ||
@@ -198,4 +191,2 @@ { | ||
(async () => { | ||
await worker1.load(); | ||
await worker2.load(); | ||
await worker1.loadLanguage('eng'); | ||
@@ -215,3 +206,3 @@ await worker2.loadLanguage('eng'); | ||
### with multiple workers to speed up (^2.0.0-beta.1) | ||
### with multiple workers to speed up | ||
@@ -222,8 +213,6 @@ ```javascript | ||
const scheduler = createScheduler(); | ||
const worker1 = createWorker(); | ||
const worker2 = createWorker(); | ||
const worker1 = await createWorker(); | ||
const worker2 = await createWorker(); | ||
(async () => { | ||
await worker1.load(); | ||
await worker2.load(); | ||
await worker1.loadLanguage('eng'); | ||
@@ -230,0 +219,0 @@ await worker2.loadLanguage('eng'); |
FAQ | ||
=== | ||
# Project | ||
## What is the scope of this project? | ||
Tesseract.js is the JavaScript/WebAssembly port of the Tesseract OCR engine. We do not edit the underlying Tesseract recognition engine in any way. Therefore, if you encounter bugs caused by the Tesseract engine, you may open an issue here to raise awareness among other users, but fixing them is outside the scope of this repository. | ||
If you encounter a Tesseract bug you would like to see fixed, you should confirm the behavior is the same in the [main (CLI) version](https://github.com/tesseract-ocr/tesseract) of Tesseract and then open a GitHub issue in that repository. | ||
# Trained Data | ||
## How does tesseract.js download and keep \*.traineddata? | ||
@@ -12,32 +19,3 @@ | ||
For tesseract.js v2, check [TrainingTesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00) | ||
See the documentation from the main [Tesseract project](https://tesseract-ocr.github.io/tessdoc/) for training instructions. | ||
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://tesseract-ocr.github.io/tessdoc/Training-Tesseract-3.03%E2%80%933.05) | ||
## How can I get HOCR, TSV, Box, UNLV, OSD? | ||
Starting from 2.0.0-beta.1, you can get all of this information in the final result. | ||
```javascript | ||
import { createWorker } from 'tesseract.js'; | ||
const worker = createWorker({ | ||
logger: m => console.log(m) | ||
}); | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng'); | ||
await worker.initialize('eng'); | ||
await worker.setParameters({ | ||
tessedit_create_box: '1', | ||
tessedit_create_unlv: '1', | ||
tessedit_create_osd: '1', | ||
}); | ||
const { data: { text, hocr, tsv, box, unlv } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png'); | ||
console.log(text); | ||
console.log(hocr); | ||
console.log(tsv); | ||
console.log(box); | ||
console.log(unlv); | ||
})(); | ||
``` |
# Image Format | ||
Supported Formats: **bmp, jpg, png, pbm**
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below. | ||
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS. | ||
Supported Image Formats: **bmp, jpg, png, pbm, webp**
On a browser, an image can be: | ||
- an `img` or `canvas` element | ||
- a `File` object (from a file `<input>`) | ||
- a `Blob` object | ||
- a path or URL to an accessible image | ||
- a base64 encoded image that fits the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser and Node, supported data types are: | ||
- string with base64 encoded image (fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp) | ||
- buffer | ||
In Node.js, an image can be | ||
- a path to a local image | ||
- a Buffer storing binary image | ||
- a base64 encoded image that fits the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser only, supported data types are: | ||
- `File` or `Blob` object | ||
- `img` or `canvas` element | ||
For Node only, supported data types are: | ||
- string containing a path to local image | ||
Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported. |
@@ -22,3 +22,3 @@ ## Local Installation | ||
```javascript | ||
const worker = createWorker({ | ||
const worker = await createWorker({ | ||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js', | ||
@@ -37,4 +37,4 @@ langPath: 'https://tessdata.projectnaptha.com/4.0.0', | ||
### corePath | ||
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available). | ||
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js'. | ||
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm'. But it fails to fetch at this moment. |
@@ -5,6 +5,4 @@ #!/usr/bin/env node | ||
const worker = createWorker(); | ||
(async () => { | ||
await worker.load(); | ||
const worker = await createWorker(); | ||
await worker.loadLanguage('eng'); | ||
@@ -11,0 +9,0 @@ await worker.initialize('eng'); |
@@ -12,12 +12,10 @@ #!/usr/bin/env node | ||
(async () => { | ||
const worker = createWorker(); | ||
await worker.load(); | ||
const worker = await createWorker(); | ||
await worker.loadLanguage('eng'); | ||
await worker.initialize('eng'); | ||
const { data: { text } } = await worker.recognize(image); | ||
const { data: { text, pdf } } = await worker.recognize(image, {pdfTitle: "Example PDF"}, {pdf: true}); | ||
console.log(text); | ||
const { data } = await worker.getPDF('Tesseract OCR Result'); | ||
fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(data)); | ||
fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(pdf)); | ||
console.log('Generate PDF: tesseract-ocr-result.pdf'); | ||
await worker.terminate(); | ||
})(); |
@@ -9,8 +9,7 @@ #!/usr/bin/env node | ||
console.log(`Recognizing ${image}`); | ||
const worker = createWorker({ | ||
logger: m => console.log(m), | ||
}); | ||
(async () => { | ||
await worker.load(); | ||
const worker = await createWorker({ | ||
logger: m => console.log(m), | ||
}); | ||
await worker.loadLanguage('eng'); | ||
@@ -17,0 +16,0 @@ await worker.initialize('eng'); |
{ | ||
"name": "tesseract.js", | ||
"version": "3.0.3", | ||
"version": "4.0.0", | ||
"description": "Pure Javascript Multilingual OCR", | ||
@@ -50,4 +50,4 @@ "main": "src/index.js", | ||
"express": "^4.17.1", | ||
"mocha": "^8.1.3", | ||
"mocha-headless-chrome": "^2.0.3", | ||
"mocha": "^10.0.0", | ||
"mocha-headless-chrome": "^4.0.0", | ||
"npm-run-all": "^4.1.5", | ||
@@ -74,3 +74,3 @@ "nyc": "^15.1.0", | ||
"resolve-url": "^0.2.1", | ||
"tesseract.js-core": "^3.0.2", | ||
"tesseract.js-core": "^4.0.0", | ||
"wasm-feature-detect": "^1.2.11", | ||
@@ -77,0 +77,0 @@ "zlibjs": "^0.3.1" |
@@ -49,3 +49,3 @@ <p align="center"> | ||
const worker = createWorker({ | ||
const worker = await createWorker({ | ||
logger: m => console.log(m) | ||
@@ -55,3 +55,2 @@ }); | ||
(async () => { | ||
await worker.load(); | ||
await worker.loadLanguage('eng'); | ||
@@ -67,2 +66,12 @@ await worker.initialize('eng'); | ||
## Major changes in v4 | ||
Version 4 includes many new features and bug fixes--see [this issue](https://github.com/naptha/tesseract.js/issues/662) for a full list. Several highlights are below. | ||
- Added rotation preprocessing options (including auto-rotate) for significantly better accuracy | ||
- Processed images (rotated, grayscale, binary) can now be retrieved | ||
- Improved support for parallel processing (schedulers) | ||
- Breaking changes: | ||
- `createWorker` is now async | ||
- `getPDF` function replaced by `pdf` recognize option | ||
## Major changes in v3 | ||
@@ -69,0 +78,0 @@ - Significantly faster performance |
@@ -18,2 +18,3 @@ /* | ||
SPARSE_TEXT_OSD: '12', | ||
RAW_LINE: '13', | ||
}; |
@@ -18,3 +18,3 @@ const resolvePaths = require('./utils/resolvePaths'); | ||
module.exports = (_options = {}) => { | ||
module.exports = async (_options = {}) => { | ||
const id = getId('Worker', workerCounter); | ||
@@ -31,3 +31,13 @@ const { | ||
const rejects = {}; | ||
let workerResReject; | ||
let workerResResolve; | ||
const workerRes = new Promise((resolve, reject) => { | ||
workerResResolve = resolve; | ||
workerResReject = reject; | ||
}); | ||
const workerError = (event) => { workerResReject(event.message); }; | ||
let worker = spawnWorker(options); | ||
worker.onerror = workerError; | ||
@@ -58,3 +68,7 @@ workerCounter += 1; | ||
const load = (jobId) => ( | ||
const load = () => ( | ||
console.warn('`load` is depreciated and should be removed from code (workers now come pre-loaded)') | ||
); | ||
const loadInternal = (jobId) => ( | ||
startJob(createJob({ | ||
@@ -105,7 +119,7 @@ id: jobId, action: 'load', payload: { options }, | ||
const initialize = (langs = 'eng', oem = defaultOEM, jobId) => ( | ||
const initialize = (langs = 'eng', oem = defaultOEM, config, jobId) => ( | ||
startJob(createJob({ | ||
id: jobId, | ||
action: 'initialize', | ||
payload: { langs, oem }, | ||
payload: { langs, oem, config }, | ||
})) | ||
@@ -122,17 +136,20 @@ ); | ||
const recognize = async (image, opts = {}, jobId) => ( | ||
const recognize = async (image, opts = {}, output = { | ||
blocks: true, text: true, hocr: true, tsv: true, | ||
}, jobId) => ( | ||
startJob(createJob({ | ||
id: jobId, | ||
action: 'recognize', | ||
payload: { image: await loadImage(image), options: opts }, | ||
payload: { image: await loadImage(image), options: opts, output }, | ||
})) | ||
); | ||
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => ( | ||
startJob(createJob({ | ||
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => { | ||
console.log('`getPDF` function is depreciated. `recognize` option `savePDF` should be used instead.'); | ||
return startJob(createJob({ | ||
id: jobId, | ||
action: 'getPDF', | ||
payload: { title, textonly }, | ||
})) | ||
); | ||
})); | ||
}; | ||
@@ -175,2 +192,3 @@ const detect = async (image, jobId) => ( | ||
rejects[action](data); | ||
if (action === 'load') workerResReject(data); | ||
if (errorHandler) { | ||
@@ -186,3 +204,3 @@ errorHandler(data); | ||
return { | ||
const resolveObj = { | ||
id, | ||
@@ -205,2 +223,6 @@ worker, | ||
}; | ||
loadInternal().then(() => workerResResolve(resolveObj)).catch(() => {}); | ||
return workerRes; | ||
}; |
declare namespace Tesseract { | ||
function createScheduler(): Scheduler | ||
function createWorker(options?: Partial<WorkerOptions>): Worker | ||
function createWorker(options?: Partial<WorkerOptions>): Promise<Worker> | ||
function setLogging(logging: boolean): void | ||
@@ -22,6 +22,7 @@ function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult> | ||
FS(method: string, args: any[], jobId?: string): Promise<ConfigResult> | ||
loadLanguage(langs?: string, jobId?: string): Promise<ConfigResult> | ||
initialize(langs?: string, oem?: OEM, jobId?: string): Promise<ConfigResult> | ||
loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult> | ||
initialize(langs?: string | Lang[], oem?: OEM, config?: string | Partial<InitOptions>, jobId?: string): Promise<ConfigResult> | ||
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult> | ||
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult> | ||
getImage(type: imageType): string | ||
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, output?: Partial<OutputFormats>, jobId?: string): Promise<RecognizeResult> | ||
detect(image: ImageLike, jobId?: string): Promise<DetectResult> | ||
@@ -32,2 +33,15 @@ terminate(jobId?: string): Promise<ConfigResult> | ||
// Object form of a language entry; `loadLanguage` and `initialize` accept `string | Lang[]`. | ||
// NOTE(review): presumably `code` is the language code (e.g. 'eng') and `data` the | ||
// traineddata contents -- confirm against the worker's loadLanguage handling. | ||
interface Lang { | ||
code: string; | ||
data: unknown; | ||
} | ||
// Key/value settings accepted (as a Partial) by the `config` argument of `initialize`. | ||
// The worker converts an object `config` into config-file text before initializing the API. | ||
// All values are typed as strings. | ||
interface InitOptions { | ||
load_system_dawg: string | ||
load_freq_dawg: string | ||
load_unambig_dawg: string | ||
load_punc_dawg: string | ||
load_number_dawg: string | ||
load_bigram_dawg: string | ||
} | ||
interface WorkerOptions { | ||
@@ -57,4 +71,22 @@ corePath: string | ||
} | ||
// Boolean flags selecting which output formats `recognize` returns; passed (as a Partial) | ||
// through the `output` argument of `Worker.recognize`. | ||
interface OutputFormats { | ||
text: boolean; | ||
blocks: boolean; | ||
hocr: boolean; | ||
tsv: boolean; | ||
box: boolean; | ||
unlv: boolean; | ||
osd: boolean; | ||
pdf: boolean; | ||
// The three image outputs are the ones the worker does not count as requiring recognition. | ||
imageColor: boolean; | ||
imageGrey: boolean; | ||
imageBinary: boolean; | ||
debug: boolean; | ||
} | ||
interface RecognizeOptions { | ||
rectangle: Rectangle | ||
pdfTitle: string | ||
pdfTextOnly: boolean | ||
rotateAuto: boolean | ||
rotateRadians: number | ||
} | ||
@@ -78,7 +110,7 @@ interface ConfigResult { | ||
interface DetectData { | ||
tesseract_script_id: number | ||
script: string | ||
script_confidence: number | ||
orientation_degrees: number | ||
orientation_confidence: number | ||
tesseract_script_id: number | null | ||
script: string | null | ||
script_confidence: number | null | ||
orientation_degrees: number | null | ||
orientation_confidence: number | null | ||
} | ||
@@ -91,3 +123,3 @@ interface Rectangle { | ||
} | ||
const enum OEM { | ||
enum OEM { | ||
TESSERACT_ONLY, | ||
@@ -98,3 +130,3 @@ LSTM_ONLY, | ||
} | ||
const enum PSM { | ||
enum PSM { | ||
OSD_ONLY = '0', | ||
@@ -115,2 +147,7 @@ AUTO_OSD = '1', | ||
} | ||
// Numeric image kinds; this is the parameter type of `Worker.getImage(type)`. | ||
const enum imageType { | ||
COLOR = 0, | ||
GREY = 1, | ||
BINARY = 2 | ||
} | ||
type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement | ||
@@ -213,3 +250,3 @@ | CanvasRenderingContext2D | File | Blob | ImageData | Buffer; | ||
interface Page { | ||
blocks: Block[]; | ||
blocks: Block[] | null; | ||
confidence: number; | ||
@@ -230,2 +267,7 @@ lines: Line[]; | ||
sd: string | null; | ||
imageColor: string | null; | ||
imageGrey: string | null; | ||
imageBinary: string | null; | ||
rotateRadians: number | null; | ||
pdf: number[] | null; | ||
} | ||
@@ -232,0 +274,0 @@ } |
const createWorker = require('./createWorker'); | ||
const recognize = async (image, langs, options) => { | ||
const worker = createWorker(options); | ||
await worker.load(); | ||
const worker = await createWorker(options); | ||
await worker.loadLanguage(langs); | ||
@@ -15,4 +14,3 @@ await worker.initialize(langs); | ||
const detect = async (image, options) => { | ||
const worker = createWorker(options); | ||
await worker.load(); | ||
const worker = await createWorker(options); | ||
await worker.loadLanguage('osd'); | ||
@@ -19,0 +17,0 @@ await worker.initialize('osd'); |
@@ -25,27 +25,29 @@ /** | ||
page.blocks.forEach((block) => { | ||
block.paragraphs.forEach((paragraph) => { | ||
paragraph.lines.forEach((line) => { | ||
line.words.forEach((word) => { | ||
word.symbols.forEach((sym) => { | ||
symbols.push({ | ||
...sym, page, block, paragraph, line, word, | ||
if (page.blocks) { | ||
page.blocks.forEach((block) => { | ||
block.paragraphs.forEach((paragraph) => { | ||
paragraph.lines.forEach((line) => { | ||
line.words.forEach((word) => { | ||
word.symbols.forEach((sym) => { | ||
symbols.push({ | ||
...sym, page, block, paragraph, line, word, | ||
}); | ||
}); | ||
words.push({ | ||
...word, page, block, paragraph, line, | ||
}); | ||
}); | ||
words.push({ | ||
...word, page, block, paragraph, line, | ||
lines.push({ | ||
...line, page, block, paragraph, | ||
}); | ||
}); | ||
lines.push({ | ||
...line, page, block, paragraph, | ||
paragraphs.push({ | ||
...paragraph, page, block, | ||
}); | ||
}); | ||
paragraphs.push({ | ||
...paragraph, page, block, | ||
blocks.push({ | ||
...block, page, | ||
}); | ||
}); | ||
blocks.push({ | ||
...block, page, | ||
}); | ||
}); | ||
} | ||
@@ -52,0 +54,0 @@ return { |
@@ -17,3 +17,5 @@ /** | ||
const defaultParams = require('./constants/defaultParams'); | ||
const defaultOutput = require('./constants/defaultOutput'); | ||
const { log, setLogging } = require('../utils/log'); | ||
const PSM = require('../constants/PSM'); | ||
@@ -58,3 +60,3 @@ /* | ||
const FS = ({ workerId, payload: { method, args } }, res) => { | ||
const FS = async ({ workerId, payload: { method, args } }, res) => { | ||
log(`[${workerId}]: FS.${method} with args ${args}`); | ||
@@ -84,2 +86,3 @@ res.resolve(TessModule.FS[method](...args)); | ||
let data = null; | ||
let newData = false; | ||
@@ -96,2 +99,3 @@ try { | ||
} catch (e) { | ||
newData = true; | ||
log(`[${workerId}]: Load ${lang}.traineddata from ${langPath}`); | ||
@@ -138,4 +142,9 @@ if (typeof _lang === 'string') { | ||
if (['write', 'refresh', undefined].includes(cacheMethod)) { | ||
await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data); | ||
if (newData && ['write', 'refresh', undefined].includes(cacheMethod)) { | ||
try { | ||
await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data); | ||
} catch (err) { | ||
log(`[${workerId}]: Failed to write ${lang}.traineddata to cache due to error:`); | ||
log(err.toString()); | ||
} | ||
} | ||
@@ -152,15 +161,7 @@ | ||
} catch (err) { | ||
if (isWebWorker && err instanceof DOMException) { | ||
/* | ||
* For some reason google chrome throw DOMException in loadLang, | ||
* while other browser is OK, for now we ignore this exception | ||
* and hopefully to find the root cause one day. | ||
*/ | ||
} else { | ||
res.reject(err.toString()); | ||
} | ||
res.reject(err.toString()); | ||
} | ||
}; | ||
const setParameters = ({ payload: { params: _params } }, res) => { | ||
const setParameters = async ({ payload: { params: _params } }, res) => { | ||
Object.keys(_params) | ||
@@ -178,5 +179,5 @@ .filter((k) => !k.startsWith('tessjs_')) | ||
const initialize = ({ | ||
const initialize = async ({ | ||
workerId, | ||
payload: { langs: _langs, oem }, | ||
payload: { langs: _langs, oem, config }, | ||
}, res) => { | ||
@@ -194,2 +195,16 @@ const langs = (typeof _langs === 'string') | ||
} | ||
let configFile; | ||
let configStr; | ||
// config argument may either be config file text, or object with key/value pairs | ||
// In the latter case we convert to config file text here | ||
if (typeof config === 'object') { | ||
configStr = JSON.stringify(config).replace(/,/g, '\n').replace(/:/g, ' ').replace(/["'{}]/g, ''); | ||
} else { | ||
configStr = config; | ||
} | ||
if (typeof configStr === 'string') { | ||
configFile = '/config'; | ||
TessModule.FS.writeFile(configFile, configStr); | ||
} | ||
api = new TessModule.TessBaseAPI(); | ||
@@ -201,3 +216,3 @@ const status = api.Init(null, langs, oem); | ||
params = defaultParams; | ||
setParameters({ payload: { params } }); | ||
await setParameters({ payload: { params } }); | ||
res.progress({ | ||
@@ -212,11 +227,138 @@ workerId, status: 'initialized api', progress: 1, | ||
const recognize = ({ payload: { image, options: { rectangle: rec } } }, res) => { | ||
// Builds a PDF from the current `api` instance and returns the generated file's contents. | ||
// `title` is passed to BeginDocument; `textonly` is passed to the TessPDFRenderer constructor. | ||
// The renderer is created with output base 'tesseract-ocr' in '/', and the result is | ||
// read back from '/tesseract-ocr.pdf' in the module's virtual file system. | ||
const getPDFInternal = (title, textonly) => { | ||
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly); | ||
pdfRenderer.BeginDocument(title); | ||
pdfRenderer.AddImage(api); | ||
pdfRenderer.EndDocument(); | ||
// NOTE(review): `_free` is given the renderer object itself rather than a raw pointer -- confirm this releases it as intended. | ||
TessModule._free(pdfRenderer); | ||
return TessModule.FS.readFile('/tesseract-ocr.pdf'); | ||
}; | ||
// Worker handler for the 'getPDF' action: generates the PDF with getPDFInternal using the | ||
// packet's `title` and `textonly` payload fields and resolves `res` with the result. | ||
const getPDF = async ({ payload: { title, textonly } }, res) => { | ||
res.resolve(getPDFInternal(title, textonly)); | ||
}; | ||
// Combines the default output settings with user-specified output options and | ||
// counts how many enabled outputs require recognition (OCR) to be run. | ||
// `output` is an object of format flags; each of its keys overrides the working value. | ||
// Returns { workingOutput, recOutputCount }; `recognize` skips recognition when the count is 0. | ||
const processOutput = (output) => { | ||
// Work on a copy so that defaultOutput itself is never modified | ||
const workingOutput = JSON.parse(JSON.stringify(defaultOutput)); | ||
// Output formats were set using `setParameters` in previous versions | ||
// These settings are copied over for compatibility | ||
if (params.tessjs_create_box === '1') workingOutput.box = true; | ||
if (params.tessjs_create_hocr === '1') workingOutput.hocr = true; | ||
if (params.tessjs_create_osd === '1') workingOutput.osd = true; | ||
if (params.tessjs_create_tsv === '1') workingOutput.tsv = true; | ||
if (params.tessjs_create_unlv === '1') workingOutput.unlv = true; | ||
// Image outputs are excluded from the count of outputs that require recognition | ||
const nonRecOutputs = ['imageColor', 'imageGrey', 'imageBinary']; | ||
let recOutputCount = 0; | ||
// Applied last, so user-specified values win over the defaults and the legacy params above | ||
for (const prop of Object.keys(output)) { | ||
workingOutput[prop] = output[prop]; | ||
} | ||
for (const prop of Object.keys(workingOutput)) { | ||
if (workingOutput[prop]) { | ||
if (!nonRecOutputs.includes(prop)) { | ||
recOutputCount += 1; | ||
} | ||
} | ||
} | ||
return { workingOutput, recOutputCount }; | ||
}; | ||
// List of options for Tesseract.js (rather than passed through to Tesseract), | ||
// not including those with prefix "tessjs_" | ||
const tessjsOptions = ['rectangle', 'pdfTitle', 'pdfTextOnly', 'rotateAuto', 'rotateRadians']; | ||
const recognize = async ({ | ||
payload: { | ||
image, options, output, | ||
}, | ||
}, res) => { | ||
try { | ||
const ptr = setImage(TessModule, api, image); | ||
const optionsTess = {}; | ||
if (typeof options === 'object' && Object.keys(options).length > 0) { | ||
// The options provided by users contain a mix of options for Tesseract.js | ||
// and parameters passed through to Tesseract. | ||
for (const param of Object.keys(options)) { | ||
if (!param.startsWith('tessjs_') && !tessjsOptions.includes(param)) { | ||
optionsTess[param] = options[param]; | ||
} | ||
} | ||
} | ||
if (output.debug) { | ||
optionsTess.debug_file = '/debugInternal.txt'; | ||
TessModule.FS.writeFile('/debugInternal.txt', ''); | ||
} | ||
// If any parameters are changed here they are changed back at the end | ||
if (Object.keys(optionsTess).length > 0) { | ||
api.SaveParameters(); | ||
for (const prop of Object.keys(optionsTess)) { | ||
api.SetVariable(prop, optionsTess[prop]); | ||
} | ||
} | ||
const { workingOutput, recOutputCount } = processOutput(output); | ||
// When the auto-rotate option is True, setImage is called with no angle, | ||
// then the angle is calculated by Tesseract and then setImage is re-called. | ||
// Otherwise, setImage is called once using the user-provided rotateRadiansFinal value. | ||
let rotateRadiansFinal; | ||
if (options.rotateAuto) { | ||
// The angle is only detected if auto page segmentation is used | ||
// Therefore, if this is not the mode specified by the user, it is enabled temporarily here | ||
const psmInit = api.GetPageSegMode(); | ||
let psmEdit = false; | ||
if (![PSM.AUTO, PSM.AUTO_ONLY, PSM.OSD].includes(psmInit)) { | ||
psmEdit = true; | ||
api.SetVariable('tessedit_pageseg_mode', String(PSM.AUTO)); | ||
} | ||
setImage(TessModule, api, image); | ||
api.FindLines(); | ||
const rotateRadiansCalc = api.GetAngle(); | ||
// Restore user-provided PSM setting | ||
if (psmEdit) { | ||
api.SetVariable('tessedit_pageseg_mode', String(psmInit)); | ||
} | ||
// Small angles (<0.005 radians/~0.3 degrees) are ignored to save on runtime | ||
if (Math.abs(rotateRadiansCalc) >= 0.005) { | ||
rotateRadiansFinal = rotateRadiansCalc; | ||
setImage(TessModule, api, image, rotateRadiansFinal); | ||
} else { | ||
// Image needs to be reset if run with different PSM setting earlier | ||
if (psmEdit) { | ||
setImage(TessModule, api, image); | ||
} | ||
rotateRadiansFinal = 0; | ||
} | ||
} else { | ||
rotateRadiansFinal = options.rotateRadians || 0; | ||
setImage(TessModule, api, image, rotateRadiansFinal); | ||
} | ||
const rec = options.rectangle; | ||
if (typeof rec === 'object') { | ||
api.SetRectangle(rec.left, rec.top, rec.width, rec.height); | ||
} | ||
api.Recognize(null); | ||
res.resolve(dump(TessModule, api, params)); | ||
TessModule._free(ptr); | ||
if (recOutputCount > 0) { | ||
api.Recognize(null); | ||
} else { | ||
log('Skipping recognition: all output options requiring recognition are disabled.'); | ||
} | ||
const { pdfTitle } = options; | ||
const { pdfTextOnly } = options; | ||
const result = dump(TessModule, api, workingOutput, { pdfTitle, pdfTextOnly }); | ||
result.rotateRadians = rotateRadiansFinal; | ||
if (output.debug) TessModule.FS.unlink('/debugInternal.txt'); | ||
if (Object.keys(optionsTess).length > 0) { | ||
api.RestoreParameters(); | ||
} | ||
res.resolve(result); | ||
} catch (err) { | ||
@@ -227,21 +369,15 @@ res.reject(err.toString()); | ||
const getPDF = ({ payload: { title, textonly } }, res) => { | ||
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly); | ||
pdfRenderer.BeginDocument(title); | ||
pdfRenderer.AddImage(api); | ||
pdfRenderer.EndDocument(); | ||
TessModule._free(pdfRenderer); | ||
res.resolve(TessModule.FS.readFile('/tesseract-ocr.pdf')); | ||
}; | ||
const detect = ({ payload: { image } }, res) => { | ||
const detect = async ({ payload: { image } }, res) => { | ||
try { | ||
const ptr = setImage(TessModule, api, image); | ||
setImage(TessModule, api, image); | ||
const results = new TessModule.OSResults(); | ||
if (!api.DetectOS(results)) { | ||
api.End(); | ||
TessModule._free(ptr); | ||
res.reject('Failed to detect OS'); | ||
res.resolve({ | ||
tesseract_script_id: null, | ||
script: null, | ||
script_confidence: null, | ||
orientation_degrees: null, | ||
orientation_confidence: null, | ||
}); | ||
} else { | ||
@@ -252,4 +388,2 @@ const best = results.best_result; | ||
TessModule._free(ptr); | ||
res.resolve({ | ||
@@ -268,3 +402,3 @@ tesseract_script_id: sid, | ||
const terminate = (_, res) => { | ||
const terminate = async (_, res) => { | ||
try { | ||
@@ -306,18 +440,14 @@ if (api !== null) { | ||
try { | ||
({ | ||
load, | ||
FS, | ||
loadLanguage, | ||
initialize, | ||
setParameters, | ||
recognize, | ||
getPDF, | ||
detect, | ||
terminate, | ||
})[packet.action](packet, res); | ||
} catch (err) { | ||
/** Prepare exception to travel through postMessage */ | ||
res.reject(err.toString()); | ||
} | ||
({ | ||
load, | ||
FS, | ||
loadLanguage, | ||
initialize, | ||
setParameters, | ||
recognize, | ||
getPDF, | ||
detect, | ||
terminate, | ||
})[packet.action](packet, res) | ||
.catch((err) => res.reject(err.toString())); | ||
}; | ||
@@ -324,0 +454,0 @@ |
@@ -10,2 +10,4 @@ /** | ||
*/ | ||
const arrayBufferToBase64 = require('./arrayBufferToBase64'); | ||
const imageType = require('../../constants/imageType'); | ||
@@ -41,9 +43,3 @@ /** | ||
*/ | ||
module.exports = (TessModule, api, { | ||
tessjs_create_hocr, | ||
tessjs_create_tsv, | ||
tessjs_create_box, | ||
tessjs_create_unlv, | ||
tessjs_create_osd, | ||
}) => { | ||
module.exports = (TessModule, api, output, options) => { | ||
const ri = api.GetIterator(); | ||
@@ -70,135 +66,160 @@ const { | ||
ri.Begin(); | ||
do { | ||
if (ri.IsAtBeginningOf(RIL_BLOCK)) { | ||
const poly = ri.BlockPolygon(); | ||
let polygon = null; | ||
// BlockPolygon() returns null when automatic page segmentation is off | ||
if (TessModule.getPointer(poly) > 0) { | ||
const n = poly.get_n(); | ||
const px = poly.get_x(); | ||
const py = poly.get_y(); | ||
polygon = []; | ||
for (let i = 0; i < n; i += 1) { | ||
polygon.push([px.getValue(i), py.getValue(i)]); | ||
const getImage = (type) => { | ||
api.WriteImage(type, '/image.png'); | ||
const pngBuffer = TessModule.FS.readFile('/image.png'); | ||
const pngStr = `data:image/png;base64,${arrayBufferToBase64(pngBuffer.buffer)}`; | ||
TessModule.FS.unlink('/image.png'); | ||
return pngStr; | ||
}; | ||
const getPDFInternal = (title, textonly) => { | ||
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly); | ||
pdfRenderer.BeginDocument(title); | ||
pdfRenderer.AddImage(api); | ||
pdfRenderer.EndDocument(); | ||
TessModule._free(pdfRenderer); | ||
return TessModule.FS.readFile('/tesseract-ocr.pdf'); | ||
}; | ||
if (output.blocks) { | ||
ri.Begin(); | ||
do { | ||
if (ri.IsAtBeginningOf(RIL_BLOCK)) { | ||
const poly = ri.BlockPolygon(); | ||
let polygon = null; | ||
// BlockPolygon() returns null when automatic page segmentation is off | ||
if (TessModule.getPointer(poly) > 0) { | ||
const n = poly.get_n(); | ||
const px = poly.get_x(); | ||
const py = poly.get_y(); | ||
polygon = []; | ||
for (let i = 0; i < n; i += 1) { | ||
polygon.push([px.getValue(i), py.getValue(i)]); | ||
} | ||
/* | ||
* TODO: find out why _ptaDestroy doesn't work | ||
*/ | ||
// TessModule._ptaDestroy(TessModule.getPointer(poly)); | ||
} | ||
/* | ||
* TODO: find out why _ptaDestroy doesn't work | ||
*/ | ||
// TessModule._ptaDestroy(TessModule.getPointer(poly)); | ||
block = { | ||
paragraphs: [], | ||
text: ri.GetUTF8Text(RIL_BLOCK), | ||
confidence: ri.Confidence(RIL_BLOCK), | ||
baseline: ri.getBaseline(RIL_BLOCK), | ||
bbox: ri.getBoundingBox(RIL_BLOCK), | ||
blocktype: enumToString(ri.BlockType(), 'PT'), | ||
polygon, | ||
}; | ||
blocks.push(block); | ||
} | ||
if (ri.IsAtBeginningOf(RIL_PARA)) { | ||
para = { | ||
lines: [], | ||
text: ri.GetUTF8Text(RIL_PARA), | ||
confidence: ri.Confidence(RIL_PARA), | ||
baseline: ri.getBaseline(RIL_PARA), | ||
bbox: ri.getBoundingBox(RIL_PARA), | ||
is_ltr: !!ri.ParagraphIsLtr(), | ||
}; | ||
block.paragraphs.push(para); | ||
} | ||
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { | ||
textline = { | ||
words: [], | ||
text: ri.GetUTF8Text(RIL_TEXTLINE), | ||
confidence: ri.Confidence(RIL_TEXTLINE), | ||
baseline: ri.getBaseline(RIL_TEXTLINE), | ||
bbox: ri.getBoundingBox(RIL_TEXTLINE), | ||
}; | ||
para.lines.push(textline); | ||
} | ||
if (ri.IsAtBeginningOf(RIL_WORD)) { | ||
const fontInfo = ri.getWordFontAttributes(); | ||
const wordDir = ri.WordDirection(); | ||
word = { | ||
symbols: [], | ||
choices: [], | ||
block = { | ||
paragraphs: [], | ||
text: ri.GetUTF8Text(RIL_BLOCK), | ||
confidence: ri.Confidence(RIL_BLOCK), | ||
baseline: ri.getBaseline(RIL_BLOCK), | ||
bbox: ri.getBoundingBox(RIL_BLOCK), | ||
blocktype: enumToString(ri.BlockType(), 'PT'), | ||
polygon, | ||
}; | ||
blocks.push(block); | ||
} | ||
if (ri.IsAtBeginningOf(RIL_PARA)) { | ||
para = { | ||
lines: [], | ||
text: ri.GetUTF8Text(RIL_PARA), | ||
confidence: ri.Confidence(RIL_PARA), | ||
baseline: ri.getBaseline(RIL_PARA), | ||
bbox: ri.getBoundingBox(RIL_PARA), | ||
is_ltr: !!ri.ParagraphIsLtr(), | ||
}; | ||
block.paragraphs.push(para); | ||
} | ||
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) { | ||
textline = { | ||
words: [], | ||
text: ri.GetUTF8Text(RIL_TEXTLINE), | ||
confidence: ri.Confidence(RIL_TEXTLINE), | ||
baseline: ri.getBaseline(RIL_TEXTLINE), | ||
bbox: ri.getBoundingBox(RIL_TEXTLINE), | ||
}; | ||
para.lines.push(textline); | ||
} | ||
if (ri.IsAtBeginningOf(RIL_WORD)) { | ||
const fontInfo = ri.getWordFontAttributes(); | ||
const wordDir = ri.WordDirection(); | ||
word = { | ||
symbols: [], | ||
choices: [], | ||
text: ri.GetUTF8Text(RIL_WORD), | ||
confidence: ri.Confidence(RIL_WORD), | ||
baseline: ri.getBaseline(RIL_WORD), | ||
bbox: ri.getBoundingBox(RIL_WORD), | ||
text: ri.GetUTF8Text(RIL_WORD), | ||
confidence: ri.Confidence(RIL_WORD), | ||
baseline: ri.getBaseline(RIL_WORD), | ||
bbox: ri.getBoundingBox(RIL_WORD), | ||
is_numeric: !!ri.WordIsNumeric(), | ||
in_dictionary: !!ri.WordIsFromDictionary(), | ||
direction: enumToString(wordDir, 'DIR'), | ||
language: ri.WordRecognitionLanguage(), | ||
is_numeric: !!ri.WordIsNumeric(), | ||
in_dictionary: !!ri.WordIsFromDictionary(), | ||
direction: enumToString(wordDir, 'DIR'), | ||
language: ri.WordRecognitionLanguage(), | ||
is_bold: fontInfo.is_bold, | ||
is_italic: fontInfo.is_italic, | ||
is_underlined: fontInfo.is_underlined, | ||
is_monospace: fontInfo.is_monospace, | ||
is_serif: fontInfo.is_serif, | ||
is_smallcaps: fontInfo.is_smallcaps, | ||
font_size: fontInfo.pointsize, | ||
font_id: fontInfo.font_id, | ||
font_name: fontInfo.font_name, | ||
}; | ||
const wc = new TessModule.WordChoiceIterator(ri); | ||
do { | ||
word.choices.push({ | ||
text: wc.GetUTF8Text(), | ||
confidence: wc.Confidence(), | ||
}); | ||
} while (wc.Next()); | ||
TessModule.destroy(wc); | ||
textline.words.push(word); | ||
} | ||
is_bold: fontInfo.is_bold, | ||
is_italic: fontInfo.is_italic, | ||
is_underlined: fontInfo.is_underlined, | ||
is_monospace: fontInfo.is_monospace, | ||
is_serif: fontInfo.is_serif, | ||
is_smallcaps: fontInfo.is_smallcaps, | ||
font_size: fontInfo.pointsize, | ||
font_id: fontInfo.font_id, | ||
font_name: fontInfo.font_name, | ||
}; | ||
const wc = new TessModule.WordChoiceIterator(ri); | ||
do { | ||
word.choices.push({ | ||
text: wc.GetUTF8Text(), | ||
confidence: wc.Confidence(), | ||
}); | ||
} while (wc.Next()); | ||
TessModule.destroy(wc); | ||
textline.words.push(word); | ||
} | ||
// let image = null; | ||
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL) | ||
// var image = pix2array(pix); | ||
// // for some reason it seems that things stop working if you destroy pics | ||
// TessModule._pixDestroy(TessModule.getPointer(pix)); | ||
if (ri.IsAtBeginningOf(RIL_SYMBOL)) { | ||
symbol = { | ||
choices: [], | ||
image: null, | ||
text: ri.GetUTF8Text(RIL_SYMBOL), | ||
confidence: ri.Confidence(RIL_SYMBOL), | ||
baseline: ri.getBaseline(RIL_SYMBOL), | ||
bbox: ri.getBoundingBox(RIL_SYMBOL), | ||
is_superscript: !!ri.SymbolIsSuperscript(), | ||
is_subscript: !!ri.SymbolIsSubscript(), | ||
is_dropcap: !!ri.SymbolIsDropcap(), | ||
}; | ||
word.symbols.push(symbol); | ||
const ci = new TessModule.ChoiceIterator(ri); | ||
do { | ||
symbol.choices.push({ | ||
text: ci.GetUTF8Text(), | ||
confidence: ci.Confidence(), | ||
}); | ||
} while (ci.Next()); | ||
// TessModule.destroy(i); | ||
} | ||
} while (ri.Next(RIL_SYMBOL)); | ||
TessModule.destroy(ri); | ||
} | ||
// let image = null; | ||
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL) | ||
// var image = pix2array(pix); | ||
// // for some reason it seems that things stop working if you destroy pics | ||
// TessModule._pixDestroy(TessModule.getPointer(pix)); | ||
if (ri.IsAtBeginningOf(RIL_SYMBOL)) { | ||
symbol = { | ||
choices: [], | ||
image: null, | ||
text: ri.GetUTF8Text(RIL_SYMBOL), | ||
confidence: ri.Confidence(RIL_SYMBOL), | ||
baseline: ri.getBaseline(RIL_SYMBOL), | ||
bbox: ri.getBoundingBox(RIL_SYMBOL), | ||
is_superscript: !!ri.SymbolIsSuperscript(), | ||
is_subscript: !!ri.SymbolIsSubscript(), | ||
is_dropcap: !!ri.SymbolIsDropcap(), | ||
}; | ||
word.symbols.push(symbol); | ||
const ci = new TessModule.ChoiceIterator(ri); | ||
do { | ||
symbol.choices.push({ | ||
text: ci.GetUTF8Text(), | ||
confidence: ci.Confidence(), | ||
}); | ||
} while (ci.Next()); | ||
// TessModule.destroy(i); | ||
} | ||
} while (ri.Next(RIL_SYMBOL)); | ||
TessModule.destroy(ri); | ||
return { | ||
text: api.GetUTF8Text(), | ||
hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, | ||
tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null, | ||
box: tessjs_create_box === '1' ? api.GetBoxText() : null, | ||
unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null, | ||
osd: tessjs_create_osd === '1' ? api.GetOsdText() : null, | ||
text: output.text ? api.GetUTF8Text() : null, | ||
hocr: output.hocr ? deindent(api.GetHOCRText()) : null, | ||
tsv: output.tsv ? api.GetTSVText() : null, | ||
box: output.box ? api.GetBoxText() : null, | ||
unlv: output.unlv ? api.GetUNLVText() : null, | ||
osd: output.osd ? api.GetOsdText() : null, | ||
pdf: output.pdf ? getPDFInternal(options.pdfTitle ?? 'Tesseract OCR Result', options.pdfTextOnly ?? false) : null, | ||
imageColor: output.imageColor ? getImage(imageType.COLOR) : null, | ||
imageGrey: output.imageGrey ? getImage(imageType.GREY) : null, | ||
imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null, | ||
confidence: api.MeanTextConf(), | ||
blocks, | ||
blocks: output.blocks ? blocks : null, | ||
psm: enumToString(api.GetPageSegMode(), 'PSM'), | ||
oem: enumToString(api.oem(), 'OEM'), | ||
version: api.Version(), | ||
debug: output.debug ? TessModule.FS.readFile('/debugInternal.txt', { encoding: 'utf8', flags: 'a+' }) : null, | ||
}; | ||
}; |
@@ -11,54 +11,22 @@ const bmp = require('bmp-js'); | ||
*/ | ||
module.exports = (TessModule, api, image) => { | ||
const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length })); | ||
const type = fileType(buf); | ||
let bytesPerPixel = 0; | ||
let data = null; | ||
let pix = null; | ||
let w = 0; | ||
let h = 0; | ||
module.exports = (TessModule, api, image, angle = 0) => { | ||
const type = fileType(image); | ||
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1; | ||
const exif = image.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1; | ||
/* | ||
* Leptonica supports uncompressed but not compressed bmp files | ||
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516 | ||
* We therefore use bmp-js to process all bmp files | ||
*/ | ||
// /* | ||
// * Leptonica supports some but not all bmp files | ||
// * @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516 | ||
// * We therefore use bmp-js to convert all bmp files into a format Leptonica is known to support | ||
// */ | ||
if (type && type.mime === 'image/bmp') { | ||
// Not sure what this line actually does, but removing breaks the function | ||
const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length })); | ||
const bmpBuf = bmp.decode(buf); | ||
data = TessModule._malloc(bmpBuf.data.length * Uint8Array.BYTES_PER_ELEMENT); | ||
TessModule.HEAPU8.set(bmpBuf.data, data); | ||
w = bmpBuf.width; | ||
h = bmpBuf.height; | ||
bytesPerPixel = 4; | ||
TessModule.FS.writeFile('/input', bmp.encode(bmpBuf).data); | ||
} else { | ||
const ptr = TessModule._malloc(buf.length * Uint8Array.BYTES_PER_ELEMENT); | ||
TessModule.HEAPU8.set(buf, ptr); | ||
pix = TessModule._pixReadMem(ptr, buf.length); | ||
if (TessModule.getValue(pix + (7 * 4), 'i32') === 0) { | ||
/* | ||
* Set a yres default value to prevent warning from tesseract | ||
* See kMinCredibleResolution in tesseract/src/ccstruct/publictypes.h | ||
*/ | ||
TessModule.setValue(pix + (7 * 4), 300, 'i32'); | ||
} | ||
[w, h] = Array(2).fill(0) | ||
.map((v, idx) => ( | ||
TessModule.getValue(pix + (idx * 4), 'i32') | ||
)); | ||
TessModule.FS.writeFile('/input', image); | ||
} | ||
/* | ||
* As some image format (ex. bmp) is not supported natiely by tesseract, | ||
* sometimes it will not return pix directly, but data and bytesPerPixel | ||
* for another SetImage usage. | ||
* | ||
*/ | ||
if (data === null) { | ||
api.SetImage(pix, undefined, undefined, undefined, undefined, exif); | ||
} else { | ||
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif); | ||
} | ||
return data === null ? pix : data; | ||
api.SetImageFile(exif, angle); | ||
}; |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
13318463
89
3301
207
+ Added tesseract.js-core@4.0.4 (transitive)
- Removed tesseract.js-core@3.0.2 (transitive)
Updated tesseract.js-core@^4.0.0