Socket
Socket
Sign inDemoInstall

tesseract.js

Package Overview
Dependencies
Maintainers
3
Versions
68
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

tesseract.js - npm Package Compare versions

Comparing version 2.0.0-alpha.10 to 2.0.0-alpha.11

.github/FUNDING.yml

59

docs/examples.md

@@ -21,4 +21,8 @@ # Tesseract.js Examples

.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png')
.then((result) => {
console.log(result);
.progress((p) => {
console.log('progress', p);
})
.then(({ text }) => {
console.log(text);
worker.terminate();
});

@@ -40,4 +44,5 @@ ```

})
.then((result) => {
console.log(result);
.then(({ text }) => {
console.log(text);
worker.terminate();
});

@@ -62,4 +67,5 @@ ```

})
.then((result) => {
console.log(result);
.then(({ text }) => {
console.log(text);
worker.terminate();
});

@@ -90,4 +96,5 @@ ```

})
.then((result) => {
console.log(result);
.then(({ text }) => {
console.log(text);
worker.terminate();
});

@@ -117,4 +124,5 @@ ```

})
.then((result) => {
console.log(result);
.then(({ text }) => {
console.log(text);
worker.terminate();
});

@@ -144,4 +152,5 @@ ```

})
.then((result) => {
console.log(result);
.then(({ text }) => {
console.log(text);
worker.terminate();
});

@@ -171,5 +180,27 @@ ```

})
.then((result) => {
console.log(result.files.pdf); // You can access pdf binary array here.
.then(({ files: { pdf } }) => {
console.log(Object.values(pdf)); // As pdf is an array-like object, you need to do a little conversion first.
worker.terminate();
});
```
### with preload language data
```javascript
const Tesseract = require('tesseract.js');
const { TesseractWorker, utils: { loadLang } } = Tesseract;
const worker = new TesseractWorker();
loadLang({ langs: 'eng', langPath: worker.options.langPath })
.then(() => {
worker
.recognize('https://tesseract.projectnaptha.com/img/eng_bw.png')
.progress(p => console.log(p))
.then(({ text }) => {
console.log(text);
worker.terminate();
});
});
```

@@ -11,4 +11,7 @@ # Image Format

- a path or URL to an accessible image
- a base64 encoded image that matches the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp
In Node.js, an image can be
- a path to a local image
- a Buffer storing a binary image
- a base64 encoded image that matches the `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp

@@ -13,3 +13,3 @@ ## Local Installation

const worker = Tesseract.TesseractWorker({
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.10/dist/worker.min.js',
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.11/dist/worker.min.js',
langPath: 'https://tessdata.projectnaptha.com/4.0.0',

@@ -16,0 +16,0 @@ corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.wasm.js',

@@ -5,69 +5,105 @@ # Tesseract Languages

| `lang` | Language |
|-----------|-----------------------|
| 'afr' | Afrikaans |
| 'ara' | Arabic |
| 'aze' | Azerbaijani |
| 'bel' | Belarusian |
| 'ben' | Bengali |
| 'bul' | Bulgarian |
| 'cat' | Catalan |
| 'ces' | Czech |
| 'chi_sim' | Chinese |
| 'chi_tra' | Traditional Chinese |
| 'chr' | Cherokee |
| 'dan' | Danish |
| 'deu' | German |
| 'ell' | Greek |
| 'eng' | English |
| 'enm' | English (Old) |
| 'epo' | Esperanto |
| 'epo_alt' | Esperanto alternative |
| 'equ' | Math |
| 'est' | Estonian |
| 'eus' | Basque |
| 'fas' |Persian (Farsi) |
| 'fin' | Finnish |
| 'fra' | French |
| 'frk' | Frankish |
| 'frm' | French (Old) |
| 'glg' | Galician |
| 'grc' | Ancient Greek |
| 'heb' | Hebrew |
| 'hin' | Hindi |
| 'hrv' | Croatian |
| 'hun' | Hungarian |
| 'ind' | Indonesian |
| 'isl' | Icelandic |
| 'ita' | Italian |
| 'ita_old' | Italian (Old) |
| 'jpn' | Japanese |
| 'kan' | Kannada |
| 'kor' | Korean |
| 'lav' | Latvian |
| 'lit' | Lithuanian |
| 'mal' | Malayalam |
| 'mkd' | Macedonian |
| 'mlt' | Maltese |
| 'msa' | Malay |
| 'nld' | Dutch |
| 'nor' | Norwegian |
| 'pol' | Polish |
| 'por' | Portuguese |
| 'ron' | Romanian |
| 'rus' | Russian |
| 'slk' | Slovakian |
| 'slv' | Slovenian |
| 'spa' | Spanish |
| 'spa_old' | Old Spanish |
| 'sqi' | Albanian |
| 'srp' | Serbian (Latin) |
| 'swa' | Swahili |
| 'swe' | Swedish |
| 'tam' | Tamil |
| 'tel' | Telugu |
| 'tgl' | Tagalog |
| 'tha' | Thai |
| 'tur' | Turkish |
| 'ukr' | Ukrainian |
| 'vie' | Vietnamese |
Lang Code | Language | 4.0 traineddata
:---------| :------- | :---------------
afr | Afrikaans | [afr.traineddata.gz](https://tessdata.projectnaptha.com/4.00/afr.traineddata.gz)
amh | Amharic | [amh.traineddata.gz](https://tessdata.projectnaptha.com/4.00/amh.traineddata.gz)
ara | Arabic | [ara.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ara.traineddata.gz)
asm | Assamese | [asm.traineddata.gz](https://tessdata.projectnaptha.com/4.00/asm.traineddata.gz)
aze | Azerbaijani | [aze.traineddata.gz](https://tessdata.projectnaptha.com/4.00/aze.traineddata.gz)
aze_cyrl | Azerbaijani - Cyrillic | [aze_cyrl.traineddata.gz](https://tessdata.projectnaptha.com/4.00/aze_cyrl.traineddata.gz)
bel | Belarusian | [bel.traineddata.gz](https://tessdata.projectnaptha.com/4.00/bel.traineddata.gz)
ben | Bengali | [ben.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ben.traineddata.gz)
bod | Tibetan | [bod.traineddata.gz](https://tessdata.projectnaptha.com/4.00/bod.traineddata.gz)
bos | Bosnian | [bos.traineddata.gz](https://tessdata.projectnaptha.com/4.00/bos.traineddata.gz)
bul | Bulgarian | [bul.traineddata.gz](https://tessdata.projectnaptha.com/4.00/bul.traineddata.gz)
cat | Catalan; Valencian | [cat.traineddata.gz](https://tessdata.projectnaptha.com/4.00/cat.traineddata.gz)
ceb | Cebuano | [ceb.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ceb.traineddata.gz)
ces | Czech | [ces.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ces.traineddata.gz)
chi_sim | Chinese - Simplified | [chi_sim.traineddata.gz](https://tessdata.projectnaptha.com/4.00/chi_sim.traineddata.gz)
chi_tra | Chinese - Traditional | [chi_tra.traineddata.gz](https://tessdata.projectnaptha.com/4.00/chi_tra.traineddata.gz)
chr | Cherokee | [chr.traineddata.gz](https://tessdata.projectnaptha.com/4.00/chr.traineddata.gz)
cym | Welsh | [cym.traineddata.gz](https://tessdata.projectnaptha.com/4.00/cym.traineddata.gz)
dan | Danish | [dan.traineddata.gz](https://tessdata.projectnaptha.com/4.00/dan.traineddata.gz)
deu | German | [deu.traineddata.gz](https://tessdata.projectnaptha.com/4.00/deu.traineddata.gz)
dzo | Dzongkha | [dzo.traineddata.gz](https://tessdata.projectnaptha.com/4.00/dzo.traineddata.gz)
ell | Greek, Modern (1453-) | [ell.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ell.traineddata.gz)
eng | English | [eng.traineddata.gz](https://tessdata.projectnaptha.com/4.00/eng.traineddata.gz)
enm | English, Middle (1100-1500) | [enm.traineddata.gz](https://tessdata.projectnaptha.com/4.00/enm.traineddata.gz)
epo | Esperanto | [epo.traineddata.gz](https://tessdata.projectnaptha.com/4.00/epo.traineddata.gz)
est | Estonian | [est.traineddata.gz](https://tessdata.projectnaptha.com/4.00/est.traineddata.gz)
eus | Basque | [eus.traineddata.gz](https://tessdata.projectnaptha.com/4.00/eus.traineddata.gz)
fas | Persian | [fas.traineddata.gz](https://tessdata.projectnaptha.com/4.00/fas.traineddata.gz)
fin | Finnish | [fin.traineddata.gz](https://tessdata.projectnaptha.com/4.00/fin.traineddata.gz)
fra | French | [fra.traineddata.gz](https://tessdata.projectnaptha.com/4.00/fra.traineddata.gz)
frk | Frankish | [frk.traineddata.gz](https://tessdata.projectnaptha.com/4.00/frk.traineddata.gz)
frm | French, Middle (ca. 1400-1600) | [frm.traineddata.gz](https://tessdata.projectnaptha.com/4.00/frm.traineddata.gz)
gle | Irish | [gle.traineddata.gz](https://tessdata.projectnaptha.com/4.00/gle.traineddata.gz)
glg | Galician | [glg.traineddata.gz](https://tessdata.projectnaptha.com/4.00/glg.traineddata.gz)
grc | Greek, Ancient (-1453) | [grc.traineddata.gz](https://tessdata.projectnaptha.com/4.00/grc.traineddata.gz)
guj | Gujarati | [guj.traineddata.gz](https://tessdata.projectnaptha.com/4.00/guj.traineddata.gz)
hat | Haitian; Haitian Creole | [hat.traineddata.gz](https://tessdata.projectnaptha.com/4.00/hat.traineddata.gz)
heb | Hebrew | [heb.traineddata.gz](https://tessdata.projectnaptha.com/4.00/heb.traineddata.gz)
hin | Hindi | [hin.traineddata.gz](https://tessdata.projectnaptha.com/4.00/hin.traineddata.gz)
hrv | Croatian | [hrv.traineddata.gz](https://tessdata.projectnaptha.com/4.00/hrv.traineddata.gz)
hun | Hungarian | [hun.traineddata.gz](https://tessdata.projectnaptha.com/4.00/hun.traineddata.gz)
iku | Inuktitut | [iku.traineddata.gz](https://tessdata.projectnaptha.com/4.00/iku.traineddata.gz)
ind | Indonesian | [ind.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ind.traineddata.gz)
isl | Icelandic | [isl.traineddata.gz](https://tessdata.projectnaptha.com/4.00/isl.traineddata.gz)
ita | Italian | [ita.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ita.traineddata.gz)
ita_old | Italian - Old | [ita_old.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ita_old.traineddata.gz)
jav | Javanese | [jav.traineddata.gz](https://tessdata.projectnaptha.com/4.00/jav.traineddata.gz)
jpn | Japanese | [jpn.traineddata.gz](https://tessdata.projectnaptha.com/4.00/jpn.traineddata.gz)
kan | Kannada | [kan.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kan.traineddata.gz)
kat | Georgian | [kat.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kat.traineddata.gz)
kat_old | Georgian - Old | [kat_old.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kat_old.traineddata.gz)
kaz | Kazakh | [kaz.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kaz.traineddata.gz)
khm | Central Khmer | [khm.traineddata.gz](https://tessdata.projectnaptha.com/4.00/khm.traineddata.gz)
kir | Kirghiz; Kyrgyz | [kir.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kir.traineddata.gz)
kor | Korean | [kor.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kor.traineddata.gz)
kur | Kurdish | [kur.traineddata.gz](https://tessdata.projectnaptha.com/4.00/kur.traineddata.gz)
lao | Lao | [lao.traineddata.gz](https://tessdata.projectnaptha.com/4.00/lao.traineddata.gz)
lat | Latin | [lat.traineddata.gz](https://tessdata.projectnaptha.com/4.00/lat.traineddata.gz)
lav | Latvian | [lav.traineddata.gz](https://tessdata.projectnaptha.com/4.00/lav.traineddata.gz)
lit | Lithuanian | [lit.traineddata.gz](https://tessdata.projectnaptha.com/4.00/lit.traineddata.gz)
mal | Malayalam | [mal.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mal.traineddata.gz)
mar | Marathi | [mar.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mar.traineddata.gz)
mkd | Macedonian | [mkd.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mkd.traineddata.gz)
mlt | Maltese | [mlt.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mlt.traineddata.gz)
msa | Malay | [msa.traineddata.gz](https://tessdata.projectnaptha.com/4.00/msa.traineddata.gz)
mya | Burmese | [mya.traineddata.gz](https://tessdata.projectnaptha.com/4.00/mya.traineddata.gz)
nep | Nepali | [nep.traineddata.gz](https://tessdata.projectnaptha.com/4.00/nep.traineddata.gz)
nld | Dutch; Flemish | [nld.traineddata.gz](https://tessdata.projectnaptha.com/4.00/nld.traineddata.gz)
nor | Norwegian | [nor.traineddata.gz](https://tessdata.projectnaptha.com/4.00/nor.traineddata.gz)
ori | Oriya | [ori.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ori.traineddata.gz)
pan | Panjabi; Punjabi | [pan.traineddata.gz](https://tessdata.projectnaptha.com/4.00/pan.traineddata.gz)
pol | Polish | [pol.traineddata.gz](https://tessdata.projectnaptha.com/4.00/pol.traineddata.gz)
por | Portuguese | [por.traineddata.gz](https://tessdata.projectnaptha.com/4.00/por.traineddata.gz)
pus | Pushto; Pashto | [pus.traineddata.gz](https://tessdata.projectnaptha.com/4.00/pus.traineddata.gz)
ron | Romanian; Moldavian; Moldovan | [ron.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ron.traineddata.gz)
rus | Russian | [rus.traineddata.gz](https://tessdata.projectnaptha.com/4.00/rus.traineddata.gz)
san | Sanskrit | [san.traineddata.gz](https://tessdata.projectnaptha.com/4.00/san.traineddata.gz)
sin | Sinhala; Sinhalese | [sin.traineddata.gz](https://tessdata.projectnaptha.com/4.00/sin.traineddata.gz)
slk | Slovak | [slk.traineddata.gz](https://tessdata.projectnaptha.com/4.00/slk.traineddata.gz)
slv | Slovenian | [slv.traineddata.gz](https://tessdata.projectnaptha.com/4.00/slv.traineddata.gz)
spa | Spanish; Castilian | [spa.traineddata.gz](https://tessdata.projectnaptha.com/4.00/spa.traineddata.gz)
spa_old | Spanish; Castilian - Old | [spa_old.traineddata.gz](https://tessdata.projectnaptha.com/4.00/spa_old.traineddata.gz)
sqi | Albanian | [sqi.traineddata.gz](https://tessdata.projectnaptha.com/4.00/sqi.traineddata.gz)
srp | Serbian | [srp.traineddata.gz](https://tessdata.projectnaptha.com/4.00/srp.traineddata.gz)
srp_latn | Serbian - Latin | [srp_latn.traineddata.gz](https://tessdata.projectnaptha.com/4.00/srp_latn.traineddata.gz)
swa | Swahili | [swa.traineddata.gz](https://tessdata.projectnaptha.com/4.00/swa.traineddata.gz)
swe | Swedish | [swe.traineddata.gz](https://tessdata.projectnaptha.com/4.00/swe.traineddata.gz)
syr | Syriac | [syr.traineddata.gz](https://tessdata.projectnaptha.com/4.00/syr.traineddata.gz)
tam | Tamil | [tam.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tam.traineddata.gz)
tel | Telugu | [tel.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tel.traineddata.gz)
tgk | Tajik | [tgk.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tgk.traineddata.gz)
tgl | Tagalog | [tgl.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tgl.traineddata.gz)
tha | Thai | [tha.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tha.traineddata.gz)
tir | Tigrinya | [tir.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tir.traineddata.gz)
tur | Turkish | [tur.traineddata.gz](https://tessdata.projectnaptha.com/4.00/tur.traineddata.gz)
uig | Uighur; Uyghur | [uig.traineddata.gz](https://tessdata.projectnaptha.com/4.00/uig.traineddata.gz)
ukr | Ukrainian | [ukr.traineddata.gz](https://tessdata.projectnaptha.com/4.00/ukr.traineddata.gz)
urd | Urdu | [urd.traineddata.gz](https://tessdata.projectnaptha.com/4.00/urd.traineddata.gz)
uzb | Uzbek | [uzb.traineddata.gz](https://tessdata.projectnaptha.com/4.00/uzb.traineddata.gz)
uzb_cyrl | Uzbek - Cyrillic | [uzb_cyrl.traineddata.gz](https://tessdata.projectnaptha.com/4.00/uzb_cyrl.traineddata.gz)
vie | Vietnamese | [vie.traineddata.gz](https://tessdata.projectnaptha.com/4.00/vie.traineddata.gz)
yid | Yiddish | [yid.traineddata.gz](https://tessdata.projectnaptha.com/4.00/yid.traineddata.gz)

@@ -16,3 +16,3 @@ Tesseract.js Parameters

.recognize(image, 'eng', {
tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED,
tessedit_ocr_engine_mode: OEM.LSTM_ONLY,
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,

@@ -25,3 +25,3 @@ })

| ---- | ---- | ------------- | ----------- |
| tessedit\_ocr\_engine\_mode | enum | OEM.TESSERACT\_LSTM\_COMBINED | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_ocr\_engine\_mode | enum | OEM.LSTM\_ONLY | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L268) for definition of each mode |
| tessedit\_pageseg\_mode | enum | PSM.SINGLE\_BLOCK | Check [HERE](https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163) for definition of each mode |

@@ -28,0 +28,0 @@ | tessedit\_char\_whitelist | string | '' | setting whitelist characters makes the result contain only these characters; useful when the content in the image is limited |

{
"name": "tesseract.js",
"version": "2.0.0-alpha.10",
"version": "2.0.0-alpha.11",
"description": "Pure Javascript Multilingual OCR",
"main": "src/index.js",
"unpkg": "dist/tesseract.min.js",
"jsdelivr": "dist/tesseract.min.js",
"scripts": {

@@ -7,0 +9,0 @@ "start": "node scripts/server.js",

@@ -1,14 +0,20 @@

# [Tesseract.js](http://tesseract.projectnaptha.com/)
<p align="center">
<a href="https://tesseract.projectnaptha.com/"><img alt="Tesseract.js" src="https://tesseract.projectnaptha.com/img/logo_small.png"></a>
</p>
[![Build Status](https://travis-ci.org/naptha/tesseract.js.svg?branch=master)](https://travis-ci.org/naptha/tesseract.js)
[![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Code Style](https://badgen.net/badge/code%20style/airbnb/ff5a5f?icon=airbnb)](https://github.com/airbnb/javascript)
[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](https://github.com/naptha/tesseract.js/graphs/commit-activity)
[![Build Status](https://travis-ci.org/naptha/tesseract.js.svg?branch=master)](https://travis-ci.org/naptha/tesseract.js)
[![npm version](https://badge.fury.io/js/tesseract.js.svg)](https://badge.fury.io/js/tesseract.js)
[![Downloads Total](https://img.shields.io/npm/dt/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
[![Downloads Month](https://img.shields.io/npm/dm/tesseract.js.svg)](https://www.npmjs.com/package/tesseract.js)
**Tesseract.js v2 is now available and under development in master branch, check [support/1.x](https://github.com/naptha/tesseract.js/tree/support/1.x) branch for v1.**
<h3 align="center">
Version 2 is now available and under development in the master branch<br>
Check the <a href="https://github.com/naptha/tesseract.js/tree/support/1.x">support/1.x</a> branch for version 1
</h3>
<br>
Tesseract.js is a javascript library that gets words in [almost any language](./docs/tesseract_lang_list.md) out of images. ([Demo](http://tesseract.projectnaptha.com/))

@@ -18,3 +24,5 @@

Tesseract.js works with script tags, [webpack](https://webpack.js.org/), and [Node.js](https://nodejs.org/en/). [After you install it](#installation), using it is as simple as
Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine.
It works in the browser using [webpack](https://webpack.js.org/) or plain script tags with a [CDN](#cdn) and on the server with [Node.js](https://nodejs.org/en/).
After you [install it](#installation), using it is as simple as:

@@ -26,55 +34,49 @@ ```javascript

worker.recognize(myImage)
.progress((p) => { console.log('progress', p); })
.then((result) => { console.log('result', result); });
.progress(progress => {
console.log('progress', progress);
}).then(result => {
console.log('result', result);
});
```
[Check out the docs](#docs) for a full treatment of the API.
[Check out the docs](#docs) for a full explanation of the API.
## Provenance
Tesseract.js wraps an [emscripten](https://github.com/kripken/emscripten) [port](https://github.com/naptha/tesseract.js-core) of the [Tesseract](https://github.com/tesseract-ocr/tesseract) [OCR](https://en.wikipedia.org/wiki/Optical_character_recognition) Engine.
## Major changes in v2
- Upgrade to tesseract v4
- Support multiple languages at the same time, eg: eng+chi_tra for English and Traditional Chinese
- Supported image formats: png, jpg, bmp, pbm
# Installation
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm`, and on Node.js via `npm`. [Check out the docs](#docs) for a full treatment of the API.
## CDN
## Installation
Tesseract.js works with a `<script>` tag via local copy or CDN, with webpack via `npm` and on Node.js with `npm/yarn`.
You can simply include Tesseract.js with a CDN like this:
### CDN
```html
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.10/dist/tesseract.min.js'></script>
<!-- v2 -->
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.11/dist/tesseract.min.js'></script>
<!-- v1 -->
<script src='https://unpkg.com/tesseract.js@1.0.19/src/index.js'></script>
```
After including the script the `Tesseract` variable will be globally available.
After including your scripts, the `Tesseract` variable will be defined globally!
## npm
### Node.js
### 2.x
**Tesseract.js currently requires Node.js v6.8.0 or higher**
Major Changes
- Upgrade to tesseract v4
- Support multiple languages, ex: eng+chi_tra
- Support image formats: png, jpg, bmp, pbm
```shell
> yarn add tesseract.js@next
```
or
```
> npm install tesseract.js@next --save
```
# For v2
npm install tesseract.js@next
yarn add tesseract.js@next
### 1.x
```shell
> yarn add tesseract.js
# For v1
npm install tesseract.js
yarn add tesseract.js
```
or
```
> npm install tesseract.js --save
```
> Note: Tesseract.js currently requires Node.js v6.8.0 or higher.
# Documentation
## Documentation

@@ -87,31 +89,27 @@ * [Examples](./docs/examples.md)

# Contributing
## Development
To run a development copy of tesseract.js, first clone this repo.
```shell
> git clone https://github.com/naptha/tesseract.js.git
```
## Contributing
Then, `cd tesseract.js && npm install && npm start`
### Development
To run a development copy of Tesseract.js do the following:
```shell
> cd tesseract.js
> npm install && npm start
# First we clone the repository
git clone https://github.com/naptha/tesseract.js.git
cd tesseract.js
... a bunch of npm stuff ...
# Then we install the dependencies
npm install
Starting up http-server, serving ./
Available on:
http://127.0.0.1:3000
http://[your ip]:3000
# And finally we start the development server
npm start
```
Then open `http://localhost:3000/examples/browser/demo.html` in your favorite browser. The devServer automatically rebuilds `tesseract.dev.js` and `worker.min.js` when you change files in the src folder.
Once the development server is running, open http://localhost:3000/examples/browser/demo.html in your favorite browser.
It will automatically rebuild `tesseract.dev.js` and `worker.min.js` when you change files in the src folder.
## Building Static Files
After you've cloned the repo and run `npm install` as described in the [Development Section](#development), you can build static library files in the dist folder with
### Building Static Files
To build the compiled static files just execute the following:
```shell
> npm run build
npm run build
```
This will output the files into the `dist` directory.
global.expect = require('expect.js');
global.fetch = require('node-fetch');
global.fs = require('fs');
global.path = require('path');
global.Tesseract = require('../src');

@@ -13,2 +13,3 @@ /**

const axios = require('axios');
const b64toU8Array = require('./b64toU8Array');
const { defaultOptions } = require('../common/options');

@@ -42,2 +43,3 @@ const { version } = require('../../package.json');

* string: URL string, can be relative path
* string: base64 image
* img HTMLElement: extract image source from src attribute

@@ -51,2 +53,7 @@ * video HTMLElement: extract image source from poster attribute

if (check.string(image)) {
// Base64 Image
if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
return Promise.resolve(b64toU8Array(image.split(',')[1]));
}
// Image URL
return axios.get(resolveURL(image), {

@@ -104,3 +111,3 @@ responseType: 'arraybuffer',

...defaultOptions,
workerPath: process.env.TESS_ENV === 'development'
workerPath: (typeof process !== 'undefined' && process.env.TESS_ENV === 'development')
? resolveURL(`/dist/worker.dev.js?nocache=${Math.random().toString(36).slice(3)}`)

@@ -107,0 +114,0 @@ : `https://unpkg.com/tesseract.js@v${version}/dist/worker.min.js`,

@@ -13,2 +13,3 @@ /**

const workerUtils = require('../common/workerUtils');
const b64toU8Array = require('./b64toU8Array');

@@ -46,3 +47,3 @@ /*

},
b64toU8Array: s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))),
b64toU8Array,
writeFile: (path, data, type) => {

@@ -49,0 +50,0 @@ postMessage({

@@ -16,3 +16,3 @@ const { OEM, PSM } = require('./types');

defaultParams: {
tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED,
tessedit_ocr_engine_mode: OEM.LSTM_ONLY,
tessedit_pageseg_mode: PSM.SINGLE_BLOCK,

@@ -19,0 +19,0 @@ tessedit_char_whiltelist: '',

@@ -11,3 +11,3 @@ /**

const check = require('check-types');
const resolveURL = process.browser ? require('resolve-url') : s => s;
const resolveURL = (typeof window !== 'undefined' && typeof window.document !== 'undefined') ? require('resolve-url') : s => s;
const adapter = require('../node');

@@ -14,0 +14,0 @@ const circularize = require('./circularize');

@@ -16,2 +16,3 @@ /**

const path = require('path');
const b64toU8Array = require('./b64toU8Array');
const { defaultOptions } = require('../common/options');

@@ -29,2 +30,4 @@

* string: URL string or file path
* string: base64 image
* buffer: image buffer
* @returns {array} binary image in array format

@@ -39,2 +42,11 @@ */

}
if (/data:image\/([a-zA-Z]*);base64,([^"]*)/.test(image)) {
return Promise.resolve(b64toU8Array(image.split(',')[1]));
}
if (Buffer.isBuffer(image)) {
return Promise.resolve(image);
}
return readFile(image);

@@ -41,0 +53,0 @@ };

@@ -13,2 +13,3 @@ /**

const workerUtils = require('../common/workerUtils');
const b64toU8Array = require('./b64toU8Array');

@@ -37,3 +38,3 @@ let TesseractCore = null;

},
b64toU8Array: s => Buffer.from(s, 'base64'),
b64toU8Array,
writeFile: (path, data) => {

@@ -40,0 +41,0 @@ const fs = require('fs');

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc