tesseract.js
Advanced tools
Comparing version 2.0.0-alpha.9 to 2.0.0-alpha.10
@@ -15,1 +15,27 @@ FAQ | ||
For tesseract.js v1, check [Training Tesseract 3.03–3.05](https://github.com/tesseract-ocr/tesseract/wiki/Training-Tesseract-3.03%E2%80%933.05) | ||
## How can I get HOCR, TSV, Box, UNLV, OSD? | ||
Starting from 2.0.0-alpha.10, all of this information is available in the final result. | ||
```javascript | ||
import Tesseract from 'tesseract.js'; | ||
const { TesseractWorker } = Tesseract; | ||
const worker = new TesseractWorker(); | ||
worker | ||
.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png', 'eng', { | ||
tessedit_create_box: '1', | ||
tessedit_create_unlv: '1', | ||
tessedit_create_osd: '1', | ||
}) | ||
.then((result) => { | ||
console.log(result.text); | ||
console.log(result.hocr); | ||
console.log(result.tsv); | ||
console.log(result.box); | ||
console.log(result.unlv); | ||
console.log(result.osd); | ||
}); | ||
``` |
## Local Installation | ||
Check here for an example: https://github.com/jeromewu/tesseract.js-offline | ||
Check here for examples: https://github.com/naptha/tesseract.js/blob/master/docs/examples.md | ||
@@ -13,3 +13,3 @@ In browser environment, `tesseract.js` simply provides the API layer. Internally, it opens a WebWorker to handle requests. That worker itself loads code from the Emscripten-built `tesseract.js-core` which itself is hosted on a CDN. Then it dynamically loads language files hosted on another CDN. | ||
const worker = Tesseract.TesseractWorker({ | ||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.8/dist/worker.min.js', | ||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.10/dist/worker.min.js', | ||
langPath: 'https://tessdata.projectnaptha.com/4.0.0', | ||
@@ -27,4 +27,4 @@ corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.wasm.js', | ||
### corePath | ||
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available). | ||
A string specifying the location of the [tesseract.js-core library](https://github.com/naptha/tesseract.js-core), with default value 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.wasm.js' (fallback to tesseract-core.asm.js when WebAssembly is not available). | ||
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.wasm'. But it fails to fetch at this moment. | ||
Another WASM option is 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.js' which is a script that loads 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.wasm'. However, it currently fails to fetch. |
@@ -27,3 +27,8 @@ Tesseract.js Parameters | ||
| tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters; useful when the content of the image is limited | | ||
| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js will generate a pdf output | | ||
| tessedit\_create\_pdf | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js generates a pdf output | | ||
| tessedit\_create\_hocr | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes hocr in the result | | ||
| tessedit\_create\_tsv | string | '1' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes tsv in the result | | ||
| tessedit\_create\_box | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes box in the result | | ||
| tessedit\_create\_unlv | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes unlv in the result | | ||
| tessedit\_create\_osd | string | '0' | only 2 values, '0' or '1', when the value is '1', tesseract.js includes osd in the result | | ||
| pdf\_name | string | 'tesseract.js-ocr-result' | the name of the generated pdf file | | ||
@@ -30,0 +35,0 @@ | pdf\_title | string | 'Tesseract.js OCR Result' | the title of the generated pdf file | |
{ | ||
"name": "tesseract.js", | ||
"version": "2.0.0-alpha.9", | ||
"version": "2.0.0-alpha.10", | ||
"description": "Pure Javascript Multilingual OCR", | ||
@@ -5,0 +5,0 @@ "main": "src/index.js", |
@@ -42,3 +42,3 @@ # [Tesseract.js](http://tesseract.projectnaptha.com/) | ||
```html | ||
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.8/dist/tesseract.min.js'></script> | ||
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.10/dist/tesseract.min.js'></script> | ||
``` | ||
@@ -45,0 +45,0 @@ |
@@ -45,3 +45,9 @@ /** | ||
*/ | ||
module.exports = (TessModule, api) => { | ||
module.exports = (TessModule, api, { | ||
tessedit_create_hocr, | ||
tessedit_create_tsv, | ||
tessedit_create_box, | ||
tessedit_create_unlv, | ||
tessedit_create_osd, | ||
}) => { | ||
const ri = api.GetIterator(); | ||
@@ -181,3 +187,7 @@ const blocks = []; | ||
text: api.GetUTF8Text(), | ||
html: deindent(api.GetHOCRText()), | ||
hocr: tessedit_create_hocr === '1' ? deindent(api.GetHOCRText()) : null, | ||
tsv: tessedit_create_tsv === '1' ? api.GetTSVText() : null, | ||
box: tessedit_create_box === '1' ? api.GetBoxText() : null, | ||
unlv: tessedit_create_unlv === '1' ? api.GetUNLVText() : null, | ||
osd: tessedit_create_osd === '1' ? api.GetOsdText() : null, | ||
confidence: api.MeanTextConf(), | ||
@@ -184,0 +194,0 @@ blocks, |
@@ -20,2 +20,7 @@ const { OEM, PSM } = require('./types'); | ||
tessedit_create_pdf: '0', | ||
tessedit_create_hocr: '1', | ||
tessedit_create_tsv: '1', | ||
tessedit_create_box: '0', | ||
tessedit_create_unlv: '0', | ||
tessedit_create_osd: '0', | ||
textonly_pdf: '0', | ||
@@ -22,0 +27,0 @@ pdf_name: 'tesseract.js-ocr-result', |
@@ -70,10 +70,7 @@ /** | ||
*/ | ||
const handleParams = (langs, customParams) => { | ||
const handleParams = (langs, iParams) => { | ||
const { | ||
tessedit_ocr_engine_mode, | ||
...params | ||
} = { | ||
...defaultParams, | ||
...customParams, | ||
}; | ||
} = iParams; | ||
api.Init(null, getLangsStr(langs), tessedit_ocr_engine_mode); | ||
@@ -195,3 +192,3 @@ Object.keys(params).forEach((key) => { | ||
const handleRecognize = ({ | ||
image, langs, options, params, | ||
image, langs, options, params: customParams, | ||
}, res) => ( | ||
@@ -216,2 +213,6 @@ handleInit(options, res) | ||
}; | ||
const params = { | ||
...defaultParams, | ||
...customParams, | ||
}; | ||
progressUpdate(0); | ||
@@ -224,3 +225,3 @@ handleParams(langs, params); | ||
const files = handleOutput(params); | ||
const result = dump(TessModule, api); | ||
const result = dump(TessModule, api, params); | ||
api.End(); | ||
@@ -227,0 +228,0 @@ TessModule._free(ptr); |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
1769608
2307