Socket
Socket
Sign inDemoInstall

tesseract.js

Package Overview
Dependencies
Maintainers
4
Versions
68
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

tesseract.js - npm Package Compare versions

Comparing version 3.0.3 to 4.0.0

examples/browser/basic-efficient.html

44

docs/api.md
# API
- [createWorker()](#create-worker)
- [Worker.load](#worker-load)
- [Worker.writeText](#worker-writeText)

@@ -56,3 +55,3 @@ - [Worker.readText](#worker-readText)

const { createWorker } = Tesseract;
const worker = createWorker({
const worker = await createWorker({
langPath: '...',

@@ -67,3 +66,2 @@ logger: m => console.log(m),

- load
- FS functions // optional

@@ -87,19 +85,2 @@ - loadLanguauge

<a name="worker-load"></a>
### Worker.load(jobId): Promise
Worker.load() loads the tesseract.js-core scripts (downloading them from a remote source if they are not present) and makes the Web Worker/Child Process ready for the next action.
**Arguments:**
- `jobId` Please see details above
**Examples:**
```javascript
(async () => {
await worker.load();
})();
```
<a name="worker-writeText"></a>

@@ -231,3 +212,3 @@ ### Worker.writeText(path, text, jobId): Promise

**Supported Parameters:**
**Useful Parameters:**

@@ -241,8 +222,5 @@ | name | type | default value | description |

| user\_defined\_dpi | string | '' | Define custom dpi, use to fix **Warning: Invalid resolution 0 dpi. Using 70 instead.** |
| tessjs\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result |
| tessjs\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result |
| tessjs\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result |
| tessjs\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result |
| tessjs\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result |
This list is incomplete. As Tesseract.js passes parameters to the Tesseract engine, all parameters supported by the underlying version of Tesseract should also be supported by Tesseract.js. (Note that parameters marked as “init only” in Tesseract documentation cannot be set by `setParameters` or `recognize`.)
**Examples:**

@@ -270,4 +248,5 @@

- `image` see [Image Format](./image-format.md) for more details.
- `options` a object of customized options
- `options` an object of customized options
- `rectangle` an object specifying the region of the image you want recognized; it should contain top, left, width and height (see example below).
- `output` an object specifying which output formats to return (by default `text`, `blocks`, `hocr`, and `tsv` are returned)
- `jobId` Please see details above

@@ -282,4 +261,3 @@

(async () => {
const worker = createWorker();
await worker.load();
const worker = await createWorker();
await worker.loadLanguage('eng');

@@ -297,4 +275,3 @@ await worker.initialize('eng');

(async () => {
const worker = createWorker();
await worker.load();
const worker = await createWorker();
await worker.loadLanguage('eng');

@@ -324,4 +301,3 @@ await worker.initialize('eng');

(async () => {
const worker = createWorker();
await worker.load();
const worker = await createWorker();
await worker.loadLanguage('eng');

@@ -373,3 +349,3 @@ await worker.initialize('eng');

const scheduler = createScheduler();
const worker = createWorker();
const worker = await createWorker();
scheduler.addWorker(worker);

@@ -376,0 +352,0 @@ ```

@@ -10,6 +10,5 @@ # Tesseract.js Examples

const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');

@@ -28,3 +27,3 @@ await worker.initialize('eng');

const worker = createWorker({
const worker = await createWorker({
logger: m => console.log(m), // Add logger here

@@ -34,3 +33,2 @@ });

(async () => {
await worker.load();
await worker.loadLanguage('eng');

@@ -49,6 +47,5 @@ await worker.initialize('eng');

const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng+chi_tra');

@@ -61,3 +58,3 @@ await worker.initialize('eng+chi_tra');

```
### with whitelist char (^2.0.0-beta.1)
### with whitelist char

@@ -67,6 +64,5 @@ ```javascript

const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');

@@ -83,3 +79,3 @@ await worker.initialize('eng');

### with different pageseg mode (^2.0.0-beta.1)
### with different pageseg mode

@@ -91,6 +87,5 @@ Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163

const worker = createWorker();
const worker = await createWorker();
(async () => {
await worker.load();
await worker.loadLanguage('eng');

@@ -107,3 +102,3 @@ await worker.initialize('eng');

### with pdf output (^2.0.0-beta.1)
### with pdf output

@@ -122,7 +117,6 @@ Please check **examples** folder for details.

const worker = createWorker();
const worker = await createWorker();
const rectangle = { left: 0, top: 0, width: 500, height: 250 };
(async () => {
await worker.load();
await worker.loadLanguage('eng');

@@ -141,3 +135,3 @@ await worker.initialize('eng');

const worker = createWorker();
const worker = await createWorker();
const rectangles = [

@@ -159,3 +153,2 @@ {

(async () => {
await worker.load();
await worker.loadLanguage('eng');

@@ -179,4 +172,4 @@ await worker.initialize('eng');

const scheduler = createScheduler();
const worker1 = createWorker();
const worker2 = createWorker();
const worker1 = await createWorker();
const worker2 = await createWorker();
const rectangles = [

@@ -198,4 +191,2 @@ {

(async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng');

@@ -215,3 +206,3 @@ await worker2.loadLanguage('eng');

### with multiple workers to speed up (^2.0.0-beta.1)
### with multiple workers to speed up

@@ -222,8 +213,6 @@ ```javascript

const scheduler = createScheduler();
const worker1 = createWorker();
const worker2 = createWorker();
const worker1 = await createWorker();
const worker2 = await createWorker();
(async () => {
await worker1.load();
await worker2.load();
await worker1.loadLanguage('eng');

@@ -230,0 +219,0 @@ await worker2.loadLanguage('eng');

FAQ
===
# Project
## What is the scope of this project?
Tesseract.js is the JavaScript/WebAssembly port of the Tesseract OCR engine. We do not edit the underlying Tesseract recognition engine in any way. Therefore, if you encounter bugs caused by the Tesseract engine you may open an issue here for the purposes of raising awareness to other users, but fixing is outside the scope of this repository.
If you encounter a Tesseract bug you would like to see fixed you should confirm the behavior is the same in the [main (CLI) version](https://github.com/tesseract-ocr/tesseract) of Tesseract and then open a Git Issue in that repository.
# Trained Data
## How does tesseract.js download and keep \*.traineddata?

@@ -12,32 +19,3 @@

For tesseract.js v2, check [TrainingTesseract 4.00](https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00)
See the documentation from the main [Tesseract project](https://tesseract-ocr.github.io/tessdoc/) for training instructions.
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://tesseract-ocr.github.io/tessdoc/Training-Tesseract-3.03%E2%80%933.05)
## How can I get HOCR, TSV, Box, UNLV, OSD?
Starting from 2.0.0-beta.1, you can get all of this information in the final result.
```javascript
import { createWorker } from 'tesseract.js';
const worker = createWorker({
logger: m => console.log(m)
});
(async () => {
await worker.load();
await worker.loadLanguage('eng');
await worker.initialize('eng');
await worker.setParameters({
tessedit_create_box: '1',
tessedit_create_unlv: '1',
tessedit_create_osd: '1',
});
const { data: { text, hocr, tsv, box, unlv } } = await worker.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png');
console.log(text);
console.log(hocr);
console.log(tsv);
console.log(box);
console.log(unlv);
})();
```
# Image Format
Support Format: **bmp, jpg, png, pbm**
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below.
The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter, which should be something that is like an image. What's considered "image-like" differs depending on whether it is being run from the browser or through NodeJS.
Support Image Formats: **bmp, jpg, png, pbm, webp**
On a browser, an image can be:
- an `img` or `canvas` element
- a `File` object (from a file `<input>`)
- a `Blob` object
- a path or URL to an accessible image
- a base64 encoded image that fits the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser and Node, supported data types are:
- string with base64 encoded image (fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp)
- buffer
In Node.js, an image can be
- a path to a local image
- a Buffer storing binary image
- a base64 encoded image that fits the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
For browser only, supported data types are:
- `File` or `Blob` object
- `img` or `canvas` element
For Node only, supported data types are:
- string containing a path to local image
Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported.

@@ -22,3 +22,3 @@ ## Local Installation

```javascript
const worker = createWorker({
const worker = await createWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0/dist/worker.min.js',

@@ -37,4 +37,4 @@ langPath: 'https://tessdata.projectnaptha.com/4.0.0',

### corePath
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available).
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm.js'.
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0/tesseract-core.wasm'. But it fails to fetch at this moment.

@@ -5,6 +5,4 @@ #!/usr/bin/env node

const worker = createWorker();
(async () => {
await worker.load();
const worker = await createWorker();
await worker.loadLanguage('eng');

@@ -11,0 +9,0 @@ await worker.initialize('eng');

@@ -12,12 +12,10 @@ #!/usr/bin/env node

(async () => {
const worker = createWorker();
await worker.load();
const worker = await createWorker();
await worker.loadLanguage('eng');
await worker.initialize('eng');
const { data: { text } } = await worker.recognize(image);
const { data: { text, pdf } } = await worker.recognize(image, {pdfTitle: "Example PDF"}, {pdf: true});
console.log(text);
const { data } = await worker.getPDF('Tesseract OCR Result');
fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(data));
fs.writeFileSync('tesseract-ocr-result.pdf', Buffer.from(pdf));
console.log('Generate PDF: tesseract-ocr-result.pdf');
await worker.terminate();
})();

@@ -9,8 +9,7 @@ #!/usr/bin/env node

console.log(`Recognizing ${image}`);
const worker = createWorker({
logger: m => console.log(m),
});
(async () => {
await worker.load();
const worker = await createWorker({
logger: m => console.log(m),
});
await worker.loadLanguage('eng');

@@ -17,0 +16,0 @@ await worker.initialize('eng');

{
"name": "tesseract.js",
"version": "3.0.3",
"version": "4.0.0",
"description": "Pure Javascript Multilingual OCR",

@@ -50,4 +50,4 @@ "main": "src/index.js",

"express": "^4.17.1",
"mocha": "^8.1.3",
"mocha-headless-chrome": "^2.0.3",
"mocha": "^10.0.0",
"mocha-headless-chrome": "^4.0.0",
"npm-run-all": "^4.1.5",

@@ -74,3 +74,3 @@ "nyc": "^15.1.0",

"resolve-url": "^0.2.1",
"tesseract.js-core": "^3.0.2",
"tesseract.js-core": "^4.0.0",
"wasm-feature-detect": "^1.2.11",

@@ -77,0 +77,0 @@ "zlibjs": "^0.3.1"

@@ -49,3 +49,3 @@ <p align="center">

const worker = createWorker({
const worker = await createWorker({
logger: m => console.log(m)

@@ -55,3 +55,2 @@ });

(async () => {
await worker.load();
await worker.loadLanguage('eng');

@@ -67,2 +66,12 @@ await worker.initialize('eng');

## Major changes in v4
Version 4 includes many new features and bug fixes--see [this issue](https://github.com/naptha/tesseract.js/issues/662) for a full list. Several highlights are below.
- Added rotation preprocessing options (including auto-rotate) for significantly better accuracy
- Processed images (rotated, grayscale, binary) can now be retrieved
- Improved support for parallel processing (schedulers)
- Breaking changes:
- `createWorker` is now async
- `getPDF` function replaced by `pdf` recognize option
## Major changes in v3

@@ -69,0 +78,0 @@ - Significantly faster performance

@@ -18,2 +18,3 @@ /*

SPARSE_TEXT_OSD: '12',
RAW_LINE: '13',
};

@@ -18,3 +18,3 @@ const resolvePaths = require('./utils/resolvePaths');

module.exports = (_options = {}) => {
module.exports = async (_options = {}) => {
const id = getId('Worker', workerCounter);

@@ -31,3 +31,13 @@ const {

const rejects = {};
let workerResReject;
let workerResResolve;
const workerRes = new Promise((resolve, reject) => {
workerResResolve = resolve;
workerResReject = reject;
});
const workerError = (event) => { workerResReject(event.message); };
let worker = spawnWorker(options);
worker.onerror = workerError;

@@ -58,3 +68,7 @@ workerCounter += 1;

const load = (jobId) => (
// Deprecated no-op kept so that existing `worker.load()` calls do not break:
// it only emits a warning and returns whatever `console.warn` returns (undefined).
const load = () => (
  console.warn('`load` is deprecated and should be removed from code (workers now come pre-loaded)')
);
const loadInternal = (jobId) => (
startJob(createJob({

@@ -105,7 +119,7 @@ id: jobId, action: 'load', payload: { options },

const initialize = (langs = 'eng', oem = defaultOEM, jobId) => (
const initialize = (langs = 'eng', oem = defaultOEM, config, jobId) => (
startJob(createJob({
id: jobId,
action: 'initialize',
payload: { langs, oem },
payload: { langs, oem, config },
}))

@@ -122,17 +136,20 @@ );

const recognize = async (image, opts = {}, jobId) => (
const recognize = async (image, opts = {}, output = {
blocks: true, text: true, hocr: true, tsv: true,
}, jobId) => (
startJob(createJob({
id: jobId,
action: 'recognize',
payload: { image: await loadImage(image), options: opts },
payload: { image: await loadImage(image), options: opts, output },
}))
);
/**
 * Deprecated: starts a 'getPDF' job on the worker.
 *
 * Logs a deprecation notice on every call; the replacement is the `pdf`
 * output option of `recognize`.
 *
 * @param {string} [title='Tesseract OCR Result'] - forwarded in the job payload.
 * @param {boolean} [textonly=false] - forwarded in the job payload.
 * @param {string} [jobId] - id given to the created job.
 * @returns the value returned by `startJob` for the created job.
 */
const getPDF = (title = 'Tesseract OCR Result', textonly = false, jobId) => {
  console.log('`getPDF` function is deprecated. `recognize` output option `pdf` should be used instead.');
  return startJob(createJob({
    id: jobId,
    action: 'getPDF',
    payload: { title, textonly },
  }));
};

@@ -175,2 +192,3 @@ const detect = async (image, jobId) => (

rejects[action](data);
if (action === 'load') workerResReject(data);
if (errorHandler) {

@@ -186,3 +204,3 @@ errorHandler(data);

return {
const resolveObj = {
id,

@@ -205,2 +223,6 @@ worker,

};
loadInternal().then(() => workerResResolve(resolveObj)).catch(() => {});
return workerRes;
};
declare namespace Tesseract {
function createScheduler(): Scheduler
function createWorker(options?: Partial<WorkerOptions>): Worker
function createWorker(options?: Partial<WorkerOptions>): Promise<Worker>
function setLogging(logging: boolean): void

@@ -22,6 +22,7 @@ function recognize(image: ImageLike, langs?: string, options?: Partial<WorkerOptions>): Promise<RecognizeResult>

FS(method: string, args: any[], jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string, jobId?: string): Promise<ConfigResult>
initialize(langs?: string, oem?: OEM, jobId?: string): Promise<ConfigResult>
loadLanguage(langs?: string | Lang[], jobId?: string): Promise<ConfigResult>
initialize(langs?: string | Lang[], oem?: OEM, config?: string | Partial<InitOptions>, jobId?: string): Promise<ConfigResult>
setParameters(params: Partial<WorkerParams>, jobId?: string): Promise<ConfigResult>
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, jobId?: string): Promise<RecognizeResult>
getImage(type: imageType): string
recognize(image: ImageLike, options?: Partial<RecognizeOptions>, output?: Partial<OutputFormats>, jobId?: string): Promise<RecognizeResult>
detect(image: ImageLike, jobId?: string): Promise<DetectResult>

@@ -32,2 +33,15 @@ terminate(jobId?: string): Promise<ConfigResult>

// Element type of the array form of `langs` accepted by `loadLanguage` and `initialize`.
interface Lang {
// NOTE(review): presumably the language code (e.g. 'eng') — confirm against loadLanguage.
code: string;
// NOTE(review): presumably the traineddata contents for `code`; the type is left opaque here — confirm.
data: unknown;
}
// Key/value form of the `config` argument of `Worker.initialize`
// (the argument may alternatively be given as a string).
// NOTE(review): the keys look like Tesseract dictionary-loading ("init only") parameters — confirm against the Tesseract docs.
interface InitOptions {
load_system_dawg: string
load_freq_dawg: string
load_unambig_dawg: string
load_punc_dawg: string
load_number_dawg: string
load_bigram_dawg: string
}
interface WorkerOptions {

@@ -57,4 +71,22 @@ corePath: string

}
// Flags for the `output` argument of `Worker.recognize`, selecting which output formats are returned.
// When the argument is omitted, `text`, `blocks`, `hocr` and `tsv` are requested.
interface OutputFormats {
text: boolean;
blocks: boolean;
hocr: boolean;
tsv: boolean;
box: boolean;
unlv: boolean;
osd: boolean;
pdf: boolean;
// The three image outputs are the ones the worker treats as not requiring recognition.
imageColor: boolean;
imageGrey: boolean;
imageBinary: boolean;
debug: boolean;
}
// Options for `Worker.recognize` that are consumed by Tesseract.js itself.
interface RecognizeOptions {
// Region of the image to recognize (left, top, width, height).
rectangle: Rectangle
// Title and text-only flag forwarded to the result dump alongside the requested outputs.
// NOTE(review): presumably only relevant when the `pdf` output is requested — confirm.
pdfTitle: string
pdfTextOnly: boolean
// When truthy, the rotation angle is calculated by Tesseract instead of taken from `rotateRadians`.
rotateAuto: boolean
// User-provided rotation angle, used when `rotateAuto` is not set; treated as 0 when absent.
rotateRadians: number
}

@@ -78,7 +110,7 @@ interface ConfigResult {

interface DetectData {
tesseract_script_id: number
script: string
script_confidence: number
orientation_degrees: number
orientation_confidence: number
tesseract_script_id: number | null
script: string | null
script_confidence: number | null
orientation_degrees: number | null
orientation_confidence: number | null
}

@@ -91,3 +123,3 @@ interface Rectangle {

}
const enum OEM {
enum OEM {
TESSERACT_ONLY,

@@ -98,3 +130,3 @@ LSTM_ONLY,

}
const enum PSM {
enum PSM {
OSD_ONLY = '0',

@@ -115,2 +147,7 @@ AUTO_OSD = '1',

}
// Image kinds accepted by `Worker.getImage(type)`, which returns a string.
const enum imageType {
COLOR = 0,
GREY = 1,
BINARY = 2
}
type ImageLike = string | HTMLImageElement | HTMLCanvasElement | HTMLVideoElement

@@ -213,3 +250,3 @@ | CanvasRenderingContext2D | File | Blob | ImageData | Buffer;

interface Page {
blocks: Block[];
blocks: Block[] | null;
confidence: number;

@@ -230,2 +267,7 @@ lines: Line[];

sd: string | null;
imageColor: string | null;
imageGrey: string | null;
imageBinary: string | null;
rotateRadians: number | null;
pdf: number[] | null;
}

@@ -232,0 +274,0 @@ }

const createWorker = require('./createWorker');
const recognize = async (image, langs, options) => {
const worker = createWorker(options);
await worker.load();
const worker = await createWorker(options);
await worker.loadLanguage(langs);

@@ -15,4 +14,3 @@ await worker.initialize(langs);

const detect = async (image, options) => {
const worker = createWorker(options);
await worker.load();
const worker = await createWorker(options);
await worker.loadLanguage('osd');

@@ -19,0 +17,0 @@ await worker.initialize('osd');

@@ -25,27 +25,29 @@ /**

page.blocks.forEach((block) => {
block.paragraphs.forEach((paragraph) => {
paragraph.lines.forEach((line) => {
line.words.forEach((word) => {
word.symbols.forEach((sym) => {
symbols.push({
...sym, page, block, paragraph, line, word,
if (page.blocks) {
page.blocks.forEach((block) => {
block.paragraphs.forEach((paragraph) => {
paragraph.lines.forEach((line) => {
line.words.forEach((word) => {
word.symbols.forEach((sym) => {
symbols.push({
...sym, page, block, paragraph, line, word,
});
});
words.push({
...word, page, block, paragraph, line,
});
});
words.push({
...word, page, block, paragraph, line,
lines.push({
...line, page, block, paragraph,
});
});
lines.push({
...line, page, block, paragraph,
paragraphs.push({
...paragraph, page, block,
});
});
paragraphs.push({
...paragraph, page, block,
blocks.push({
...block, page,
});
});
blocks.push({
...block, page,
});
});
}

@@ -52,0 +54,0 @@ return {

@@ -17,3 +17,5 @@ /**

const defaultParams = require('./constants/defaultParams');
const defaultOutput = require('./constants/defaultOutput');
const { log, setLogging } = require('../utils/log');
const PSM = require('../constants/PSM');

@@ -58,3 +60,3 @@ /*

const FS = ({ workerId, payload: { method, args } }, res) => {
const FS = async ({ workerId, payload: { method, args } }, res) => {
log(`[${workerId}]: FS.${method} with args ${args}`);

@@ -84,2 +86,3 @@ res.resolve(TessModule.FS[method](...args));

let data = null;
let newData = false;

@@ -96,2 +99,3 @@ try {

} catch (e) {
newData = true;
log(`[${workerId}]: Load ${lang}.traineddata from ${langPath}`);

@@ -138,4 +142,9 @@ if (typeof _lang === 'string') {

if (['write', 'refresh', undefined].includes(cacheMethod)) {
await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data);
if (newData && ['write', 'refresh', undefined].includes(cacheMethod)) {
try {
await adapter.writeCache(`${cachePath || '.'}/${lang}.traineddata`, data);
} catch (err) {
log(`[${workerId}]: Failed to write ${lang}.traineddata to cache due to error:`);
log(err.toString());
}
}

@@ -152,15 +161,7 @@

} catch (err) {
if (isWebWorker && err instanceof DOMException) {
/*
* For some reason google chrome throw DOMException in loadLang,
* while other browser is OK, for now we ignore this exception
* and hopefully to find the root cause one day.
*/
} else {
res.reject(err.toString());
}
res.reject(err.toString());
}
};
const setParameters = ({ payload: { params: _params } }, res) => {
const setParameters = async ({ payload: { params: _params } }, res) => {
Object.keys(_params)

@@ -178,5 +179,5 @@ .filter((k) => !k.startsWith('tessjs_'))

const initialize = ({
const initialize = async ({
workerId,
payload: { langs: _langs, oem },
payload: { langs: _langs, oem, config },
}, res) => {

@@ -194,2 +195,16 @@ const langs = (typeof _langs === 'string')

}
let configFile;
let configStr;
// config argument may either be config file text, or object with key/value pairs
// In the latter case we convert to config file text here
if (typeof config === 'object') {
configStr = JSON.stringify(config).replace(/,/g, '\n').replace(/:/g, ' ').replace(/["'{}]/g, '');
} else {
configStr = config;
}
if (typeof configStr === 'string') {
configFile = '/config';
TessModule.FS.writeFile(configFile, configStr);
}
api = new TessModule.TessBaseAPI();

@@ -201,3 +216,3 @@ const status = api.Init(null, langs, oem);

params = defaultParams;
setParameters({ payload: { params } });
await setParameters({ payload: { params } });
res.progress({

@@ -212,11 +227,138 @@ workerId, status: 'initialized api', progress: 1,

const recognize = ({ payload: { image, options: { rectangle: rec } } }, res) => {
/**
 * Renders the current `api` state through a `TessPDFRenderer` and returns the
 * contents of the file it produces.
 *
 * @param title - passed to `BeginDocument`.
 * @param textonly - third argument of the `TessPDFRenderer` constructor.
 * @returns whatever `TessModule.FS.readFile` yields for '/tesseract-ocr.pdf'.
 */
const getPDFInternal = (title, textonly) => {
  const renderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);

  // Document lifecycle: open, add the image held by `api`, close.
  renderer.BeginDocument(title);
  renderer.AddImage(api);
  renderer.EndDocument();

  // Release the renderer before reading the generated file back.
  TessModule._free(renderer);

  const pdfContents = TessModule.FS.readFile('/tesseract-ocr.pdf');
  return pdfContents;
};
// Job handler for the 'getPDF' action: builds the PDF with `getPDFInternal`
// from the packet's `title`/`textonly` payload and resolves the job with it.
const getPDF = async (packet, res) => {
  const { title, textonly } = packet.payload;
  const pdf = getPDFInternal(title, textonly);
  res.resolve(pdf);
};
/**
 * Merges the default output settings, the legacy `tessjs_create_*` parameters
 * and the caller-supplied `output` flags (applied last, so they win).
 *
 * @param output - object of output-format flags supplied by the caller.
 * @returns `{ workingOutput, recOutputCount }`, where `recOutputCount` is the
 *   number of enabled formats other than the three image outputs.
 */
const processOutput = (output) => {
  // Work on a copy so the shared defaults are never modified.
  const workingOutput = JSON.parse(JSON.stringify(defaultOutput));

  // Earlier versions selected output formats through `setParameters`;
  // those settings are still honoured here.
  const legacyFlags = [
    ['tessjs_create_box', 'box'],
    ['tessjs_create_hocr', 'hocr'],
    ['tessjs_create_osd', 'osd'],
    ['tessjs_create_tsv', 'tsv'],
    ['tessjs_create_unlv', 'unlv'],
  ];
  legacyFlags.forEach(([paramName, format]) => {
    if (params[paramName] === '1') workingOutput[format] = true;
  });

  Object.entries(output).forEach(([format, enabled]) => {
    workingOutput[format] = enabled;
  });

  // Image outputs are excluded from the count.
  const imageOutputs = ['imageColor', 'imageGrey', 'imageBinary'];
  const recOutputCount = Object.keys(workingOutput)
    .filter((format) => workingOutput[format] && !imageOutputs.includes(format))
    .length;

  return { workingOutput, recOutputCount };
};
// Names of `recognize` options that are consumed by Tesseract.js itself rather than
// passed through to Tesseract. Options with the "tessjs_" prefix are not listed here;
// `recognize` filters those out separately, by prefix.
const tessjsOptions = ['rectangle', 'pdfTitle', 'pdfTextOnly', 'rotateAuto', 'rotateRadians'];
const recognize = async ({
payload: {
image, options, output,
},
}, res) => {
try {
const ptr = setImage(TessModule, api, image);
const optionsTess = {};
if (typeof options === 'object' && Object.keys(options).length > 0) {
// The options provided by users contain a mix of options for Tesseract.js
// and parameters passed through to Tesseract.
for (const param of Object.keys(options)) {
if (!param.startsWith('tessjs_') && !tessjsOptions.includes(param)) {
optionsTess[param] = options[param];
}
}
}
if (output.debug) {
optionsTess.debug_file = '/debugInternal.txt';
TessModule.FS.writeFile('/debugInternal.txt', '');
}
// If any parameters are changed here they are changed back at the end
if (Object.keys(optionsTess).length > 0) {
api.SaveParameters();
for (const prop of Object.keys(optionsTess)) {
api.SetVariable(prop, optionsTess[prop]);
}
}
const { workingOutput, recOutputCount } = processOutput(output);
// When the auto-rotate option is True, setImage is called with no angle,
// then the angle is calculated by Tesseract and then setImage is re-called.
// Otherwise, setImage is called once using the user-provided rotateRadiansFinal value.
let rotateRadiansFinal;
if (options.rotateAuto) {
// The angle is only detected if auto page segmentation is used
// Therefore, if this is not the mode specified by the user, it is enabled temporarily here
const psmInit = api.GetPageSegMode();
let psmEdit = false;
if (![PSM.AUTO, PSM.AUTO_ONLY, PSM.OSD].includes(psmInit)) {
psmEdit = true;
api.SetVariable('tessedit_pageseg_mode', String(PSM.AUTO));
}
setImage(TessModule, api, image);
api.FindLines();
const rotateRadiansCalc = api.GetAngle();
// Restore user-provided PSM setting
if (psmEdit) {
api.SetVariable('tessedit_pageseg_mode', String(psmInit));
}
// Small angles (<0.005 radians/~0.3 degrees) are ignored to save on runtime
if (Math.abs(rotateRadiansCalc) >= 0.005) {
rotateRadiansFinal = rotateRadiansCalc;
setImage(TessModule, api, image, rotateRadiansFinal);
} else {
// Image needs to be reset if run with different PSM setting earlier
if (psmEdit) {
setImage(TessModule, api, image);
}
rotateRadiansFinal = 0;
}
} else {
rotateRadiansFinal = options.rotateRadians || 0;
setImage(TessModule, api, image, rotateRadiansFinal);
}
const rec = options.rectangle;
if (typeof rec === 'object') {
api.SetRectangle(rec.left, rec.top, rec.width, rec.height);
}
api.Recognize(null);
res.resolve(dump(TessModule, api, params));
TessModule._free(ptr);
if (recOutputCount > 0) {
api.Recognize(null);
} else {
log('Skipping recognition: all output options requiring recognition are disabled.');
}
const { pdfTitle } = options;
const { pdfTextOnly } = options;
const result = dump(TessModule, api, workingOutput, { pdfTitle, pdfTextOnly });
result.rotateRadians = rotateRadiansFinal;
if (output.debug) TessModule.FS.unlink('/debugInternal.txt');
if (Object.keys(optionsTess).length > 0) {
api.RestoreParameters();
}
res.resolve(result);
} catch (err) {

@@ -227,21 +369,15 @@ res.reject(err.toString());

const getPDF = ({ payload: { title, textonly } }, res) => {
const pdfRenderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);
pdfRenderer.BeginDocument(title);
pdfRenderer.AddImage(api);
pdfRenderer.EndDocument();
TessModule._free(pdfRenderer);
res.resolve(TessModule.FS.readFile('/tesseract-ocr.pdf'));
};
const detect = ({ payload: { image } }, res) => {
const detect = async ({ payload: { image } }, res) => {
try {
const ptr = setImage(TessModule, api, image);
setImage(TessModule, api, image);
const results = new TessModule.OSResults();
if (!api.DetectOS(results)) {
api.End();
TessModule._free(ptr);
res.reject('Failed to detect OS');
res.resolve({
tesseract_script_id: null,
script: null,
script_confidence: null,
orientation_degrees: null,
orientation_confidence: null,
});
} else {

@@ -252,4 +388,2 @@ const best = results.best_result;

TessModule._free(ptr);
res.resolve({

@@ -268,3 +402,3 @@ tesseract_script_id: sid,

const terminate = (_, res) => {
const terminate = async (_, res) => {
try {

@@ -306,18 +440,14 @@ if (api !== null) {

try {
({
load,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
getPDF,
detect,
terminate,
})[packet.action](packet, res);
} catch (err) {
/** Prepare exception to travel through postMessage */
res.reject(err.toString());
}
({
load,
FS,
loadLanguage,
initialize,
setParameters,
recognize,
getPDF,
detect,
terminate,
})[packet.action](packet, res)
.catch((err) => res.reject(err.toString()));
};

@@ -324,0 +454,0 @@

@@ -10,2 +10,4 @@ /**

*/
const arrayBufferToBase64 = require('./arrayBufferToBase64');
const imageType = require('../../constants/imageType');

@@ -41,9 +43,3 @@ /**

*/
module.exports = (TessModule, api, {
tessjs_create_hocr,
tessjs_create_tsv,
tessjs_create_box,
tessjs_create_unlv,
tessjs_create_osd,
}) => {
module.exports = (TessModule, api, output, options) => {
const ri = api.GetIterator();

@@ -70,135 +66,160 @@ const {

ri.Begin();
do {
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if (TessModule.getPointer(poly) > 0) {
const n = poly.get_n();
const px = poly.get_x();
const py = poly.get_y();
polygon = [];
for (let i = 0; i < n; i += 1) {
polygon.push([px.getValue(i), py.getValue(i)]);
/**
 * Has `api` write an image of the given `type` to a temporary PNG file,
 * returns it as a base64 `data:` URL and removes the temporary file.
 *
 * @param type - forwarded unchanged to `api.WriteImage`.
 * @returns {string} a `data:image/png;base64,...` string.
 */
const getImage = (type) => {
  api.WriteImage(type, '/image.png');

  const fs = TessModule.FS;
  const { buffer } = fs.readFile('/image.png');
  const encoded = arrayBufferToBase64(buffer);

  // The file is only scratch space; remove it once encoded.
  fs.unlink('/image.png');

  return `data:image/png;base64,${encoded}`;
};
/**
 * Renders the current `api` state through a `TessPDFRenderer` and returns the
 * contents of the file it produces.
 *
 * @param title - passed to `BeginDocument`.
 * @param textonly - third argument of the `TessPDFRenderer` constructor.
 * @returns whatever `TessModule.FS.readFile` yields for '/tesseract-ocr.pdf'.
 */
const getPDFInternal = (title, textonly) => {
  const renderer = new TessModule.TessPDFRenderer('tesseract-ocr', '/', textonly);

  // Document lifecycle: open, add the image held by `api`, close.
  renderer.BeginDocument(title);
  renderer.AddImage(api);
  renderer.EndDocument();

  // Release the renderer before reading the generated file back.
  TessModule._free(renderer);

  const pdfContents = TessModule.FS.readFile('/tesseract-ocr.pdf');
  return pdfContents;
};
if (output.blocks) {
ri.Begin();
do {
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
const poly = ri.BlockPolygon();
let polygon = null;
// BlockPolygon() returns null when automatic page segmentation is off
if (TessModule.getPointer(poly) > 0) {
const n = poly.get_n();
const px = poly.get_x();
const py = poly.get_y();
polygon = [];
for (let i = 0; i < n; i += 1) {
polygon.push([px.getValue(i), py.getValue(i)]);
}
/*
* TODO: find out why _ptaDestroy doesn't work
*/
// TessModule._ptaDestroy(TessModule.getPointer(poly));
}
/*
* TODO: find out why _ptaDestroy doesn't work
*/
// TessModule._ptaDestroy(TessModule.getPointer(poly));
block = {
paragraphs: [],
text: ri.GetUTF8Text(RIL_BLOCK),
confidence: ri.Confidence(RIL_BLOCK),
baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon,
};
blocks.push(block);
}
if (ri.IsAtBeginningOf(RIL_PARA)) {
para = {
lines: [],
text: ri.GetUTF8Text(RIL_PARA),
confidence: ri.Confidence(RIL_PARA),
baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr(),
};
block.paragraphs.push(para);
}
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
textline = {
words: [],
text: ri.GetUTF8Text(RIL_TEXTLINE),
confidence: ri.Confidence(RIL_TEXTLINE),
baseline: ri.getBaseline(RIL_TEXTLINE),
bbox: ri.getBoundingBox(RIL_TEXTLINE),
};
para.lines.push(textline);
}
if (ri.IsAtBeginningOf(RIL_WORD)) {
const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
block = {
paragraphs: [],
text: ri.GetUTF8Text(RIL_BLOCK),
confidence: ri.Confidence(RIL_BLOCK),
baseline: ri.getBaseline(RIL_BLOCK),
bbox: ri.getBoundingBox(RIL_BLOCK),
blocktype: enumToString(ri.BlockType(), 'PT'),
polygon,
};
blocks.push(block);
}
if (ri.IsAtBeginningOf(RIL_PARA)) {
para = {
lines: [],
text: ri.GetUTF8Text(RIL_PARA),
confidence: ri.Confidence(RIL_PARA),
baseline: ri.getBaseline(RIL_PARA),
bbox: ri.getBoundingBox(RIL_PARA),
is_ltr: !!ri.ParagraphIsLtr(),
};
block.paragraphs.push(para);
}
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
textline = {
words: [],
text: ri.GetUTF8Text(RIL_TEXTLINE),
confidence: ri.Confidence(RIL_TEXTLINE),
baseline: ri.getBaseline(RIL_TEXTLINE),
bbox: ri.getBoundingBox(RIL_TEXTLINE),
};
para.lines.push(textline);
}
if (ri.IsAtBeginningOf(RIL_WORD)) {
const fontInfo = ri.getWordFontAttributes();
const wordDir = ri.WordDirection();
word = {
symbols: [],
choices: [],
text: ri.GetUTF8Text(RIL_WORD),
confidence: ri.Confidence(RIL_WORD),
baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD),
text: ri.GetUTF8Text(RIL_WORD),
confidence: ri.Confidence(RIL_WORD),
baseline: ri.getBaseline(RIL_WORD),
bbox: ri.getBoundingBox(RIL_WORD),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),
is_numeric: !!ri.WordIsNumeric(),
in_dictionary: !!ri.WordIsFromDictionary(),
direction: enumToString(wordDir, 'DIR'),
language: ri.WordRecognitionLanguage(),
is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
};
const wc = new TessModule.WordChoiceIterator(ri);
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence(),
});
} while (wc.Next());
TessModule.destroy(wc);
textline.words.push(word);
}
is_bold: fontInfo.is_bold,
is_italic: fontInfo.is_italic,
is_underlined: fontInfo.is_underlined,
is_monospace: fontInfo.is_monospace,
is_serif: fontInfo.is_serif,
is_smallcaps: fontInfo.is_smallcaps,
font_size: fontInfo.pointsize,
font_id: fontInfo.font_id,
font_name: fontInfo.font_name,
};
const wc = new TessModule.WordChoiceIterator(ri);
do {
word.choices.push({
text: wc.GetUTF8Text(),
confidence: wc.Confidence(),
});
} while (wc.Next());
TessModule.destroy(wc);
textline.words.push(word);
}
// let image = null;
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
symbol = {
choices: [],
image: null,
text: ri.GetUTF8Text(RIL_SYMBOL),
confidence: ri.Confidence(RIL_SYMBOL),
baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
};
word.symbols.push(symbol);
const ci = new TessModule.ChoiceIterator(ri);
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence(),
});
} while (ci.Next());
// TessModule.destroy(i);
}
} while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri);
}
// let image = null;
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
// var image = pix2array(pix);
// // for some reason it seems that things stop working if you destroy pics
// TessModule._pixDestroy(TessModule.getPointer(pix));
if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
symbol = {
choices: [],
image: null,
text: ri.GetUTF8Text(RIL_SYMBOL),
confidence: ri.Confidence(RIL_SYMBOL),
baseline: ri.getBaseline(RIL_SYMBOL),
bbox: ri.getBoundingBox(RIL_SYMBOL),
is_superscript: !!ri.SymbolIsSuperscript(),
is_subscript: !!ri.SymbolIsSubscript(),
is_dropcap: !!ri.SymbolIsDropcap(),
};
word.symbols.push(symbol);
const ci = new TessModule.ChoiceIterator(ri);
do {
symbol.choices.push({
text: ci.GetUTF8Text(),
confidence: ci.Confidence(),
});
} while (ci.Next());
// TessModule.destroy(i);
}
} while (ri.Next(RIL_SYMBOL));
TessModule.destroy(ri);
return {
text: api.GetUTF8Text(),
hocr: tessjs_create_hocr === '1' ? deindent(api.GetHOCRText()) : null,
tsv: tessjs_create_tsv === '1' ? api.GetTSVText() : null,
box: tessjs_create_box === '1' ? api.GetBoxText() : null,
unlv: tessjs_create_unlv === '1' ? api.GetUNLVText() : null,
osd: tessjs_create_osd === '1' ? api.GetOsdText() : null,
text: output.text ? api.GetUTF8Text() : null,
hocr: output.hocr ? deindent(api.GetHOCRText()) : null,
tsv: output.tsv ? api.GetTSVText() : null,
box: output.box ? api.GetBoxText() : null,
unlv: output.unlv ? api.GetUNLVText() : null,
osd: output.osd ? api.GetOsdText() : null,
pdf: output.pdf ? getPDFInternal(options.pdfTitle ?? 'Tesseract OCR Result', options.pdfTextOnly ?? false) : null,
imageColor: output.imageColor ? getImage(imageType.COLOR) : null,
imageGrey: output.imageGrey ? getImage(imageType.GREY) : null,
imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null,
confidence: api.MeanTextConf(),
blocks,
blocks: output.blocks ? blocks : null,
psm: enumToString(api.GetPageSegMode(), 'PSM'),
oem: enumToString(api.oem(), 'OEM'),
version: api.Version(),
debug: output.debug ? TessModule.FS.readFile('/debugInternal.txt', { encoding: 'utf8', flags: 'a+' }) : null,
};
};

@@ -11,54 +11,22 @@ const bmp = require('bmp-js');

*/
// NOTE(review): this span comes from a rendered diff (hunk "@@ -11,54 +11,22 @@"), so lines of
// the previous and the new version of this exported function appear interleaved without +/-
// markers. Two signatures are visible below: (TessModule, api, image) and
// (TessModule, api, image, angle = 0). Read the span as two overlapping variants, not one body.
module.exports = (TessModule, api, image) => {
// Copies `image` into a Buffer by treating it as an array-like whose length is its key count.
const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length }));
const type = fileType(buf);
let bytesPerPixel = 0;
let data = null;
let pix = null;
let w = 0;
let h = 0;
module.exports = (TessModule, api, image, angle = 0) => {
const type = fileType(image);
// Searches the first 500 bytes (as a string) for the sequence 01 12 00 03 00 00 00 01 00 and
// takes the char code of the character that follows; falls back to 1 when nothing matches or
// the code is 0. NOTE(review): presumably this is the EXIF orientation tag — confirm.
const exif = buf.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
const exif = image.slice(0, 500).toString().match(/\x01\x12\x00\x03\x00\x00\x00\x01\x00(.)/)?.[1]?.charCodeAt(0) || 1;
/*
* Leptonica supports uncompressed but not compressed bmp files
* @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516
* We therefore use bmp-js to process all bmp files
*/
// /*
// * Leptonica supports some but not all bmp files
// * @see https://github.com/DanBloomberg/leptonica/issues/607#issuecomment-1068802516
// * We therefore use bmp-js to convert all bmp files into a format Leptonica is known to support
// */
if (type && type.mime === 'image/bmp') {
// Not sure what this line actually does, but removing breaks the function
const buf = Buffer.from(Array.from({ ...image, length: Object.keys(image).length }));
const bmpBuf = bmp.decode(buf);
// Variant that copies the decoded bitmap bytes into the module heap and records its geometry.
data = TessModule._malloc(bmpBuf.data.length * Uint8Array.BYTES_PER_ELEMENT);
TessModule.HEAPU8.set(bmpBuf.data, data);
w = bmpBuf.width;
h = bmpBuf.height;
bytesPerPixel = 4;
// Variant that re-encodes the decoded bitmap with bmp-js and writes it to '/input' in the module FS.
TessModule.FS.writeFile('/input', bmp.encode(bmpBuf).data);
} else {
// Variant that copies the raw file bytes into the module heap and reads them via _pixReadMem.
const ptr = TessModule._malloc(buf.length * Uint8Array.BYTES_PER_ELEMENT);
TessModule.HEAPU8.set(buf, ptr);
pix = TessModule._pixReadMem(ptr, buf.length);
if (TessModule.getValue(pix + (7 * 4), 'i32') === 0) {
/*
* Set a yres default value to prevent warning from tesseract
* See kMinCredibleResolution in tesseract/src/ccstruct/publictypes.h
*/
TessModule.setValue(pix + (7 * 4), 300, 'i32');
}
// Reads two consecutive i32 values at offsets 0 and 4 from the pix pointer into w and h.
[w, h] = Array(2).fill(0)
.map((v, idx) => (
TessModule.getValue(pix + (idx * 4), 'i32')
));
// Variant that writes the image unchanged to '/input' in the module FS.
TessModule.FS.writeFile('/input', image);
}
/*
* As some image formats (e.g. bmp) are not supported natively by tesseract,
* sometimes it will not return pix directly, but data and bytesPerPixel
* for another SetImage usage.
*
*/
if (data === null) {
api.SetImage(pix, undefined, undefined, undefined, undefined, exif);
} else {
api.SetImage(data, w, h, bytesPerPixel, w * bytesPerPixel, exif);
}
// Returns the pix pointer, or the raw data pointer when the bmp branch set `data`.
return data === null ? pix : data;
// Variant that calls SetImageFile with the detected exif value and the `angle` argument.
// NOTE(review): presumably this consumes the '/input' file written above — confirm against the core API.
api.SetImageFile(exif, angle);
};

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc