tesseract.js
Advanced tools
Comparing version 2.0.0-alpha.10 to 2.0.0-alpha.11
@@ -21,4 +21,8 @@ # Tesseract.js Examples | ||
.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png') | ||
.then((result) => { | ||
console.log(result); | ||
.progress((p) => { | ||
console.log('progress', p); | ||
}) | ||
.then(({ text }) => { | ||
console.log(text); | ||
worker.terminate(); | ||
}); | ||
@@ -40,4 +44,5 @@ ``` | ||
}) | ||
.then((result) => { | ||
console.log(result); | ||
.then(({ text }) => { | ||
console.log(text); | ||
worker.terminate(); | ||
}); | ||
@@ -62,4 +67,5 @@ ``` | ||
}) | ||
.then((result) => { | ||
console.log(result); | ||
.then(({ text }) => { | ||
console.log(text); | ||
worker.terminate(); | ||
}); | ||
@@ -90,4 +96,5 @@ ``` | ||
}) | ||
.then((result) => { | ||
console.log(result); | ||
.then(({ text }) => { | ||
console.log(text); | ||
worker.terminate(); | ||
}); | ||
@@ -117,4 +124,5 @@ ``` | ||
}) | ||
.then((result) => { | ||
console.log(result); | ||
.then(({ text }) => { | ||
console.log(text); | ||
worker.terminate(); | ||
}); | ||
@@ -144,4 +152,5 @@ ``` | ||
}) | ||
.then((result) => { | ||
console.log(result); | ||
.then(({ text }) => { | ||
console.log(text); | ||
worker.terminate(); | ||
}); | ||
@@ -171,5 +180,27 @@ ``` | ||
}) | ||
.then((result) => { | ||
console.log(result.files.pdf); // You can access pdf binary array here. | ||
.then(({ files: { pdf } }) => { | ||
console.log(Object.values(pdf)); // As pdf is an array-like object, you need to do a little conversion first. | ||
worker.terminate(); | ||
}); | ||
``` | ||
### with preload language data | ||
```javascript | ||
const Tesseract = require('tesseract.js'); | ||
const { TesseractWorker, utils: { loadLang } } = Tesseract; | ||
const worker = new TesseractWorker(); | ||
loadLang({ langs: 'eng', langPath: worker.options.langPath }) | ||
.then(() => { | ||
worker | ||
.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png') | ||
.progress(p => console.log(p)) | ||
.then(({ text }) => { | ||
console.log(text); | ||
worker.terminate(); | ||
}); | ||
}); | ||
``` |
@@ -11,4 +11,7 @@ # Image Format | ||
- a path or URL to an accessible image | ||
- a base64 encoded image that matches the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp | ||
In Node.js, an image can be | ||
- a path to a local image | ||
- a Buffer storing binary image | ||
- a base64 encoded image that matches the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp |
@@ -13,3 +13,3 @@ ## Local Installation | ||
const worker = Tesseract.TesseractWorker({ | ||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.10/dist/worker.min.js', | ||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.11/dist/worker.min.js', | ||
langPath: 'https://tessdata.projectnaptha.com/4.0.0', | ||
@@ -16,0 +16,0 @@ corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.wasm.js', |
@@ -5,69 +5,105 @@ # Tesseract Languages | ||
| `lang` | Language | | ||
|-----------|-----------------------| | ||
| 'afr' | Afrikaans | | ||
| 'ara' | Arabic | | ||
| 'aze' | Azerbaijani | | ||
| 'bel' | Belarusian | | ||
| 'ben' | Bengali | | ||
| 'bul' | Bulgarian | | ||
| 'cat' | Catalan | | ||
| 'ces' | Czech | | ||
| 'chi_sim' | Chinese | | ||
| 'chi_tra' | Traditional Chinese | | ||
| 'chr' | Cherokee | | ||
| 'dan' | Danish | | ||
| 'deu' | German | | ||
| 'ell' | Greek | | ||
| 'eng' | English | | ||
| 'enm' | English (Old) | | ||
| 'epo' | Esperanto | | ||
| 'epo_alt' | Esperanto alternative | | ||
| 'equ' | Math | | ||
| 'est' | Estonian | | ||
| 'eus' | Basque | | ||
| 'fas' |Persian (Farsi) | | ||
| 'fin' | Finnish | | ||
| 'fra' | French | | ||
| 'frk' | Frankish | | ||
| 'frm' | French (Old) | | ||
| 'glg' | Galician | | ||
| 'grc' | Ancient Greek | | ||
| 'heb' | Hebrew | | ||
| 'hin' | Hindi | | ||
| 'hrv' | Croatian | | ||
| 'hun' | Hungarian | | ||
| 'ind' | Indonesian | | ||
| 'isl' | Icelandic | | ||
| 'ita' | Italian | | ||
| 'ita_old' | Italian (Old) | | ||
| 'jpn' | Japanese | | ||
| 'kan' | Kannada | | ||
| 'kor' | Korean | | ||
| 'lav' | Latvian | | ||
| 'lit' | Lithuanian | | ||
| 'mal' | Malayalam | | ||
| 'mkd' | Macedonian | | ||
| 'mlt' | Maltese | | ||
| 'msa' | Malay | | ||
| 'nld' | Dutch | | ||
| 'nor' | Norwegian | | ||
| 'pol' | Polish | | ||
| 'por' | Portuguese | | ||
| 'ron' | Romanian | | ||
| 'rus' | Russian | | ||
| 'slk' | Slovakian | | ||
| 'slv' | Slovenian | | ||
| 'spa' | Spanish | | ||
| 'spa_old' | Old Spanish | | ||
| 'sqi' | Albanian | | ||
| 'srp' | Serbian (Latin) | | ||
| 'swa' | Swahili | | ||
| 'swe' | Swedish | | ||
| 'tam' | Tamil | | ||
| 'tel' | Telugu | | ||
| 'tgl' | Tagalog | | ||
| 'tha' | Thai | | ||
| 'tur' | Turkish | | ||
| 'ukr' | Ukrainian | | ||
| 'vie' | Vietnamese | | ||
Lang Code | Language | 4.0 traineddata | ||
:---------| :------- | :--------------- | ||
afr | Afrikaans | [afr.traineddata.gz](https://tessdata.projectnaptha.com/4.00/afr.traineddata.gz) | ||
amh | Amharic | [amh.traineddata.gz](https://tessdata.projectnaptha.com/4.00/amh.traineddata.gz) | ||
ara | Arabic | [ara.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ara.traineddata.gz) | ||
asm | Assamese | [asm.traineddata.gz](https://tessdata.projectnaptha.com/4.00/asm.traineddata.gz) | ||
aze | Azerbaijani | [aze.traineddata.gz](https://tessdata.projectnaptha.com/4.00/aze.traineddata.gz) | ||
aze_cyrl | Azerbaijani - Cyrillic | [aze_cyrl.traineddata.gz](https://tessdata.projectnaptha.com/4.00/aze_cyrl.traineddata.gz) | ||
bel | Belarusian | [bel.traineddata.gz](https://tessdata.projectnaptha.com/4.00/bel.traineddata.gz) | ||
ben | Bengali | [ben.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ben.traineddata.gz) | ||
bod | Tibetan | [bod.traineddata.gz](https://tessdata.projectnaptha.com/4.00/bod.traineddata.gz) | ||
bos | Bosnian | [bos.traineddata.gz](https://tessdata.projectnaptha.com/4.00/bos.traineddata.gz) | ||
bul | Bulgarian | [bul.traineddata.gz](https://tessdata.projectnaptha.com/4.00/bul.traineddata.gz) | ||
cat | Catalan; Valencian | [cat.traineddata.gz](https://tessdata.projectnaptha.com/4.00/cat.traineddata.gz) | ||
ceb | Cebuano | [ceb.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ceb.traineddata.gz) | ||
ces | Czech | [ces.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ces.traineddata.gz) | ||
chi_sim | Chinese - Simplified | [chi_sim.traineddata.gz](https://tessdata.projectnaptha.com/4.00/chi_sim.traineddata.gz) | ||
chi_tra | Chinese - Traditional | [chi_tra.traineddata.gz](https://tessdata.projectnaptha.com/4.00/chi_tra.traineddata.gz) | ||
chr | Cherokee | [chr.traineddata.gz](https://tessdata.projectnaptha.com/4.00/chr.traineddata.gz) | ||
cym | Welsh | [cym.traineddata.gz](https://tessdata.projectnaptha.com/4.00/cym.traineddata.gz) | ||
dan | Danish | [dan.traineddata.gz](https://tessdata.projectnaptha.com/4.00/dan.traineddata.gz) | ||
deu | German | [deu.traineddata.gz](https://tessdata.projectnaptha.com/4.00/deu.traineddata.gz) | ||
dzo | Dzongkha | [dzo.traineddata.gz](https://tessdata.projectnaptha.com/4.00/dzo.traineddata.gz) | ||
ell | Greek, Modern (1453-) | [ell.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ell.traineddata.gz) | ||
eng | English | [eng.traineddata.gz](https://tessdata.projectnaptha.com/4.00/eng.traineddata.gz) | ||
enm | English, Middle (1100-1500) | [enm.traineddata.gz](https://tessdata.projectnaptha.com/4.00/enm.traineddata.gz) | ||
epo | Esperanto | [epo.traineddata.gz](https://tessdata.projectnaptha.com/4.00/epo.traineddata.gz) | ||
est | Estonian | [est.traineddata.gz](https://tessdata.projectnaptha.com/4.00/est.traineddata.gz) | ||
eus | Basque | [eus.traineddata.gz](https://tessdata.projectnaptha.com/4.00/eus.traineddata.gz) | ||
fas | Persian | [fas.traineddata.gz](https://tessdata.projectnaptha.com/4.00/fas.traineddata.gz) | ||
fin | Finnish | [fin.traineddata.gz](https://tessdata.projectnaptha.com/4.00/fin.traineddata.gz) | ||
fra | French | [fra.traineddata.gz](https://tessdata.projectnaptha.com/4.00/fra.traineddata.gz) | ||
frk | Frankish | [frk.traineddata.gz](https://tessdata.projectnaptha.com/4.00/frk.traineddata.gz) | ||
frm | French, Middle (ca. 1400-1600) | [frm.traineddata.gz](https://tessdata.projectnaptha.com/4.00/frm.traineddata.gz) | ||
gle | Irish | [gle.traineddata.gz](https://tessdata.projectnaptha.com/4.00/gle.traineddata.gz) | ||
glg | Galician | [glg.traineddata.gz](https://tessdata.projectnaptha.com/4.00/glg.traineddata.gz) | ||
grc | Greek, Ancient (-1453) | [grc.traineddata.gz](https://tessdata.projectnaptha.com/4.00/grc.traineddata.gz) | ||
guj | Gujarati | [guj.traineddata.gz](https://tessdata.projectnaptha.com/4.00/guj.traineddata.gz) | ||
hat | Haitian; Haitian Creole | [hat.traineddata.gz](https://tessdata.projectnaptha.com/4.00/hat.traineddata.gz) | ||
heb | Hebrew | [heb.traineddata.gz](https://tessdata.projectnaptha.com/4.00/heb.traineddata.gz) | ||
hin | Hindi | [hin.traineddata.gz](https://tessdata.projectnaptha.com/4.00/hin.traineddata.gz) | ||
hrv | Croatian | [hrv.traineddata.gz](https://tessdata.projectnaptha.com/4.00/hrv.traineddata.gz) | ||
hun | Hungarian | [hun.traineddata.gz](https://tessdata.projectnaptha.com/4.00/hun.traineddata.gz) | ||
iku | Inuktitut | [iku.traineddata.gz](https://tessdata.projectnaptha.com/4.00/iku.traineddata.gz) | ||
ind | Indonesian | [ind.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ind.traineddata.gz) | ||
isl | Icelandic | [isl.traineddata.gz](https://tessdata.projectnaptha.com/4.00/isl.traineddata.gz) | ||
ita | Italian | [ita.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ita.traineddata.gz) | ||
ita_old | Italian - Old | [ita_old.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ita_old.traineddata.gz) | ||
jav | Javanese | [jav.traineddata.gz](https://tessdata.projectnaptha.com/4.00/jav.traineddata.gz) | ||
jpn | Japanese | [jpn.traineddata.gz](https://tessdata.projectnaptha.com/4.00/jpn.traineddata.gz) | ||
kan | Kannada | [kan.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kan.traineddata.gz) | ||
kat | Georgian | [kat.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kat.traineddata.gz) | ||
kat_old | Georgian - Old | [kat_old.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kat_old.traineddata.gz) | ||
kaz | Kazakh | [kaz.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kaz.traineddata.gz) | ||
khm | Central Khmer | [khm.traineddata.gz](https://tessdata.projectnaptha.com/4.00/khm.traineddata.gz) | ||
kir | Kirghiz; Kyrgyz | [kir.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kir.traineddata.gz) | ||
kor | Korean | [kor.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kor.traineddata.gz) | ||
kur | Kurdish | [kur.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kur.traineddata.gz) | ||
lao | Lao | [lao.traineddata.gz](https://tessdata.projectnaptha.com/4.00/lao.traineddata.gz) | ||
lat | Latin | [lat.traineddata.gz](https://tessdata.projectnaptha.com/4.00/lat.traineddata.gz) | ||
lav | Latvian | [lav.traineddata.gz](https://tessdata.projectnaptha.com/4.00/lav.traineddata.gz) | ||
lit | Lithuanian | [lit.traineddata.gz](https://tessdata.projectnaptha.com/4.00/lit.traineddata.gz) | ||
mal | Malayalam | [mal.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mal.traineddata.gz) | ||
mar | Marathi | [mar.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mar.traineddata.gz) | ||
mkd | Macedonian | [mkd.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mkd.traineddata.gz) | ||
mlt | Maltese | [mlt.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mlt.traineddata.gz) | ||
msa | Malay | [msa.traineddata.gz](https://tessdata.projectnaptha.com/4.00/msa.traineddata.gz) | ||
mya | Burmese | [mya.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mya.traineddata.gz) | ||
nep | Nepali | [nep.traineddata.gz](https://tessdata.projectnaptha.com/4.00/nep.traineddata.gz) | ||
nld | Dutch; Flemish | [nld.traineddata.gz](https://tessdata.projectnaptha.com/4.00/nld.traineddata.gz) | ||
nor | Norwegian | [nor.traineddata.gz](https://tessdata.projectnaptha.com/4.00/nor.traineddata.gz) | ||
ori | Oriya | [ori.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ori.traineddata.gz) | ||
pan | Panjabi; Punjabi | [pan.traineddata.gz](https://tessdata.projectnaptha.com/4.00/pan.traineddata.gz) | ||
pol | Polish | [pol.traineddata.gz](https://tessdata.projectnaptha.com/4.00/pol.traineddata.gz) | ||
por | Portuguese | [por.traineddata.gz](https://tessdata.projectnaptha.com/4.00/por.traineddata.gz) | ||
pus | Pushto; Pashto | [pus.traineddata.gz](https://tessdata.projectnaptha.com/4.00/pus.traineddata.gz) | ||
ron | Romanian; Moldavian; Moldovan | [ron.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ron.traineddata.gz) | ||
rus | Russian | [rus.traineddata.gz](https://tessdata.projectnaptha.com/4.00/rus.traineddata.gz) | ||
san | Sanskrit | [san.traineddata.gz](https://tessdata.projectnaptha.com/4.00/san.traineddata.gz) | ||
sin | Sinhala; Sinhalese | [sin.traineddata.gz](https://tessdata.projectnaptha.com/4.00/sin.traineddata.gz) | ||
slk | Slovak | [slk.traineddata.gz](https://tessdata.projectnaptha.com/4.00/slk.traineddata.gz) | ||
slv | Slovenian | [slv.traineddata.gz](https://tessdata.projectnaptha.com/4.00/slv.traineddata.gz) | ||
spa | Spanish; Castilian | [spa.traineddata.gz](https://tessdata.projectnaptha.com/4.00/spa.traineddata.gz) | ||
spa_old | Spanish; Castilian - Old | [spa_old.traineddata.gz](https://tessdata.projectnaptha.com/4.00/spa_old.traineddata.gz) | ||
sqi | Albanian | [sqi.traineddata.gz](https://tessdata.projectnaptha.com/4.00/sqi.traineddata.gz) | ||
srp | Serbian | [srp.traineddata.gz](https://tessdata.projectnaptha.com/4.00/srp.traineddata.gz) | ||
srp_latn | Serbian - Latin | [srp_latn.traineddata.gz](https://tessdata.projectnaptha.com/4.00/srp_latn.traineddata.gz) | ||
swa | Swahili | [swa.traineddata.gz](https://tessdata.projectnaptha.com/4.00/swa.traineddata.gz) | ||
swe | Swedish | [swe.traineddata.gz](https://tessdata.projectnaptha.com/4.00/swe.traineddata.gz) | ||
syr | Syriac | [syr.traineddata.gz](https://tessdata.projectnaptha.com/4.00/syr.traineddata.gz) | ||
tam | Tamil | [tam.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tam.traineddata.gz) | ||
tel | Telugu | [tel.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tel.traineddata.gz) | ||
tgk | Tajik | [tgk.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tgk.traineddata.gz) | ||
tgl | Tagalog | [tgl.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tgl.traineddata.gz) | ||
tha | Thai | [tha.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tha.traineddata.gz) | ||
tir | Tigrinya | [tir.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tir.traineddata.gz) | ||
tur | Turkish | [tur.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tur.traineddata.gz) | ||
uig | Uighur; Uyghur | [uig.traineddata.gz](https://tessdata.projectnaptha.com/4.00/uig.traineddata.gz) | ||
ukr | Ukrainian | [ukr.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ukr.traineddata.gz) | ||
urd | Urdu | [urd.traineddata.gz](https://tessdata.projectnaptha.com/4.00/urd.traineddata.gz) | ||
uzb | Uzbek | [uzb.traineddata.gz](https://tessdata.projectnaptha.com/4.00/uzb.traineddata.gz) | ||
uzb_cyrl | Uzbek - Cyrillic | [uzb_cyrl.traineddata.gz](https://tessdata.projectnaptha.com/4.00/uzb_cyrl.traineddata.gz) | ||
vie | Vietnamese | [vie.traineddata.gz](https://tessdata.projectnaptha.com/4.00/vie.traineddata.gz) | ||
yid | Yiddish | [yid.traineddata.gz](https://tessdata.projectnaptha.com/4.00/yid.traineddata.gz) |
@@ -16,3 +16,3 @@ Tesseract.js Parameters | ||
.recognize(image, 'eng', { | ||
tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED, | ||
tessedit_ocr_engine_mode: OEM.LSTM_ONLY, | ||
tessedit_pageseg_mode: PSM.SINGLE_BLOCK, | ||
@@ -25,3 +25,3 @@ }) | ||
| ---- | ---- | ------------- | ----------- | | ||
| tessedit\_ocr\_engine\_mode | enum | OEM.TESSERACT\_LSTM\_COMBINED | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | | ||
| tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode | | ||
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode | | ||
@@ -28,0 +28,0 @@ | tessedit\_char\_whitelist | string | '' | setting white list characters makes the result contain only these characters, useful when the content in the image is limited | |
{ | ||
"name": "tesseract.js", | ||
"version": "2.0.0-alpha.10", | ||
"version": "2.0.0-alpha.11", | ||
"description": "Pure Javascript Multilingual OCR", | ||
"main": "src/index.js", | ||
"unpkg": "dist/tesseract.min.js", | ||
"jsdelivr": "dist/tesseract.min.js", | ||
"scripts": { | ||
@@ -7,0 +9,0 @@ "start": "node scripts/server.js", |
122
README.md
@@ -1,14 +0,20 @@ | ||
# [Tesseract.js](http://tesseract.projectnaptha.com/) | ||
<p align="center"> | ||
<a href="https://tesseract.projectnaptha.com/"><img alt="Tesseract.js" src="https://tesseract.projectnaptha.com/img/logo_small.png"></a> | ||
</p> | ||
[![Build Status](https://travis-ci.org/naptha/tesseract.js.svg?branch=master)](https://travis-ci.org/naptha/tesseract.js) | ||
[![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js) | ||
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity) | ||
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) | ||
[![Code Style](https://badgen.net/badge/code%20style/airbnb/ff5a5f?icon=airbnb)](https://github.com/airbnb/javascript) | ||
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity) | ||
[![Build Status](https://travis-ci.org/naptha/tesseract.js.svg?branch=master)](https://travis-ci.org/naptha/tesseract.js) | ||
[![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js) | ||
[![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js) | ||
[![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js) | ||
**Tesseract.js v2 is now available and under development in master branch, check [support/1.x](https://github.com/naptha/tesseract.js/tree/support/1.x) branch for v1.** | ||
<h3 align="center"> | ||
Version 2 is now available and under development in the master branch<br> | ||
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1 | ||
</h3> | ||
<br> | ||
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/)) | ||
@@ -18,3 +24,5 @@ | ||
Tesseract.js works with script tags, [webpack](https://webpack.js.org/), and [Node.js](https://nodejs.org/en/). [After you install it](#installation), using it is as simple as | ||
Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine. | ||
It works in the browser using [webpack](https://webpack.js.org/) or plain script tags with a [CDN](#cdn) and on the server with [Node.js](https://nodejs.org/en/). | ||
After you [install it](#installation), using it is as simple as: | ||
@@ -26,55 +34,49 @@ ```javascript | ||
worker.recognize(myImage) | ||
.progress((p) => { console.log('progress', p); }) | ||
.then((result) => { console.log('result', result); }); | ||
.progress(progress => { | ||
console.log('progress', progress); | ||
}).then(result => { | ||
console.log('result', result); | ||
}); | ||
``` | ||
[Check out the docs](#docs) for a full treatment of the API. | ||
[Check out the docs](#docs) for a full explanation of the API. | ||
## Provenance | ||
Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine. | ||
## Major changes in v2 | ||
- Upgrade to tesseract v4 | ||
- Support multiple languages at the same time, eg: eng+chi_tra for English and Traditional Chinese | ||
- Supported image formats: png, jpg, bmp, pbm | ||
# Installation | ||
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm`, and on Node.js via `npm`. [Check out the docs](#docs) for a full treatment of the API. | ||
## CDN | ||
## Installation | ||
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`. | ||
You can simply include Tesseract.js with a CDN like this: | ||
### CDN | ||
```html | ||
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.10/dist/tesseract.min.js'></script> | ||
<!-- v2 --> | ||
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.11/dist/tesseract.min.js'></script> | ||
<!-- v1 --> | ||
<script src='https://unpkg.com/tesseract.js@1.0.19/src/index.js'></script> | ||
``` | ||
After including the script the `Tesseract` variable will be globally available. | ||
After including your scripts, the `Tesseract` variable will be defined globally! | ||
## npm | ||
### Node.js | ||
### 2.x | ||
**Tesseract.js currently requires Node.js v6.8.0 or higher** | ||
Major Changes | ||
- Upgrade to tesseract v4 | ||
- Support multiple languages, ex: eng+chi_tra | ||
- Support image formats: png, jpg, bmp, pbm | ||
```shell | ||
> yarn add tesseract.js@next | ||
``` | ||
or | ||
``` | ||
> npm install tesseract.js@next --save | ||
``` | ||
# For v2 | ||
npm install tesseract.js@next | ||
yarn add tesseract.js@next | ||
### 1.x | ||
```shell | ||
> yarn add tesseract.js | ||
# For v1 | ||
npm install tesseract.js | ||
yarn add tesseract.js | ||
``` | ||
or | ||
``` | ||
> npm install tesseract.js --save | ||
``` | ||
> Note: Tesseract.js currently requires Node.js v6.8.0 or higher. | ||
# Documentation | ||
## Documentation | ||
@@ -87,31 +89,27 @@ * [Examples](./docs/examples.md) | ||
# Contributing | ||
## Development | ||
To run a development copy of tesseract.js, first clone this repo. | ||
```shell | ||
> git clone https://github.com/naptha/tesseract.js.git | ||
``` | ||
## Contributing | ||
Then, `cd tesseract.js && npm install && npm start` | ||
### Development | ||
To run a development copy of Tesseract.js do the following: | ||
```shell | ||
> cd tesseract.js | ||
> npm install && npm start | ||
# First we clone the repository | ||
git clone https://github.com/naptha/tesseract.js.git | ||
cd tesseract.js | ||
... a bunch of npm stuff ... | ||
# Then we install the dependencies | ||
npm install | ||
Starting up http-server, serving ./ | ||
Available on: | ||
http://127.0.0.1:3000 | ||
http://[your ip]:3000 | ||
# And finally we start the development server | ||
npm start | ||
``` | ||
Then open `http://localhost:3000/examples/browser/demo.html` in your favorite browser. The devServer automatically rebuilds `tesseract.dev.js` and `worker.min.js` when you change files in the src folder. | ||
The development server will be available at http://localhost:3000/examples/browser/demo.html in your favorite browser. | ||
It will automatically rebuild `tesseract.dev.js` and `worker.min.js` when you change files in the src folder. | ||
## Building Static Files | ||
After you've cloned the repo and run `npm install` as described in the [Development Section](#development), you can build static library files in the dist folder with | ||
### Building Static Files | ||
To build the compiled static files just execute the following: | ||
```shell | ||
> npm run build | ||
npm run build | ||
``` | ||
This will output the files into the `dist` directory. |
global.expect = require('expect.js'); | ||
global.fetch = require('node-fetch'); | ||
global.fs = require('fs'); | ||
global.path = require('path'); | ||
global.Tesseract = require('../src'); |
@@ -13,2 +13,3 @@ /** | ||
const axios = require('axios'); | ||
const b64toU8Array = require('./b64toU8Array'); | ||
const { defaultOptions } = require('../common/options'); | ||
@@ -42,2 +43,3 @@ const { version } = require('../../package.json'); | ||
* string: URL string, can be relative path | ||
* string: base64 image | ||
* img HTMLElement: extract image source from src attribute | ||
@@ -51,2 +53,7 @@ * video HTMLElement: extract image source from poster attribute | ||
if (check.string(image)) { | ||
// Base64 Image | ||
if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) { | ||
return Promise.resolve(b64toU8Array(image.split(',')[1])); | ||
} | ||
// Image URL | ||
return axios.get(resolveURL(image), { | ||
@@ -104,3 +111,3 @@ responseType: 'arraybuffer', | ||
...defaultOptions, | ||
workerPath: process.env.TESS_ENV === 'development' | ||
workerPath: (typeof process !== 'undefined' && process.env.TESS_ENV === 'development') | ||
? resolveURL(`/dist/worker.dev.js?nocache=${Math.random().toString(36).slice(3)}`) | ||
@@ -107,0 +114,0 @@ : `https://unpkg.com/tesseract.js@v${version}/dist/worker.min.js`, |
@@ -13,2 +13,3 @@ /** | ||
const workerUtils = require('../common/workerUtils'); | ||
const b64toU8Array = require('./b64toU8Array'); | ||
@@ -46,3 +47,3 @@ /* | ||
}, | ||
b64toU8Array: s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))), | ||
b64toU8Array, | ||
writeFile: (path, data, type) => { | ||
@@ -49,0 +50,0 @@ postMessage({ |
@@ -16,3 +16,3 @@ const { OEM, PSM } = require('./types'); | ||
defaultParams: { | ||
tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED, | ||
tessedit_ocr_engine_mode: OEM.LSTM_ONLY, | ||
tessedit_pageseg_mode: PSM.SINGLE_BLOCK, | ||
@@ -19,0 +19,0 @@ tessedit_char_whiltelist: '', |
@@ -11,3 +11,3 @@ /** | ||
const check = require('check-types'); | ||
const resolveURL = process.browser ? require('resolve-url') : s => s; | ||
const resolveURL = (typeof window !== 'undefined' && typeof window.document !== 'undefined') ? require('resolve-url') : s => s; | ||
const adapter = require('../node'); | ||
@@ -14,0 +14,0 @@ const circularize = require('./circularize'); |
@@ -16,2 +16,3 @@ /** | ||
const path = require('path'); | ||
const b64toU8Array = require('./b64toU8Array'); | ||
const { defaultOptions } = require('../common/options'); | ||
@@ -29,2 +30,4 @@ | ||
* string: URL string or file path | ||
* string: base64 image | ||
* buffer: image buffer | ||
* @returns {array} binary image in array format | ||
@@ -39,2 +42,11 @@ */ | ||
} | ||
if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) { | ||
return Promise.resolve(b64toU8Array(image.split(',')[1])); | ||
} | ||
if (Buffer.isBuffer(image)) { | ||
return Promise.resolve(image); | ||
} | ||
return readFile(image); | ||
@@ -41,0 +53,0 @@ }; |
@@ -13,2 +13,3 @@ /** | ||
const workerUtils = require('../common/workerUtils'); | ||
const b64toU8Array = require('./b64toU8Array'); | ||
@@ -37,3 +38,3 @@ let TesseractCore = null; | ||
}, | ||
b64toU8Array: s => Buffer.from(s, 'base64'), | ||
b64toU8Array, | ||
writeFile: (path, data) => { | ||
@@ -40,0 +41,0 @@ const fs = require('fs'); |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
1983721
68
2329
113
4