tesseract.js
Advanced tools
Comparing version 2.0.0-alpha.4 to 2.0.0-alpha.6
@@ -59,3 +59,3 @@ # Tesseract.js Examples | ||
### with whitelist chars (^2.0.0-alpha.4) | ||
### with whitelist char (^2.0.0-alpha.5) | ||
@@ -75,3 +75,3 @@ Sadly, whitelist chars are not supported in tesseract v4, so in tesseract.js we need to switch to tesseract v3 mode to make it work. | ||
{ | ||
'init_oem': OEM.TESSERACT_ONLY, | ||
'tessedit_ocr_engine_mode': OEM.TESSERACT_ONLY, | ||
'tessedit_char_whitelist': '0123456789-.', | ||
@@ -87,1 +87,51 @@ } | ||
``` | ||
### with different pageseg mode (^2.0.0-alpha.5) | ||
Check here for more details of pageseg mode: https://github.com/tesseract-ocr/tesseract/blob/4.0.0/src/ccstruct/publictypes.h#L163 | ||
```javascript | ||
import Tesseract from 'tesseract.js'; | ||
const { TesseractWorker, PSM } = Tesseract; | ||
const worker = new TesseractWorker(); | ||
worker | ||
.recognize( | ||
'http://jeroen.github.io/images/testocr.png', | ||
'eng', | ||
{ | ||
'tessedit_pageseg_mode': PSM.SINGLE_BLOCK, | ||
} | ||
) | ||
.progress((p) => { | ||
console.log('progress', p); | ||
}) | ||
.then((result) => { | ||
console.log(result); | ||
}); | ||
``` | ||
### with pdf output (^2.0.0-alpha.5) | ||
```javascript | ||
import Tesseract from 'tesseract.js'; | ||
const { TesseractWorker } = Tesseract; | ||
const worker = new TesseractWorker(); | ||
worker | ||
.recognize( | ||
'http://jeroen.github.io/images/testocr.png', | ||
'eng', | ||
{ | ||
'tessedit_create_pdf': '1', | ||
} | ||
) | ||
.progress((p) => { | ||
console.log('progress', p); | ||
}) | ||
.then((result) => { | ||
console.log(result); | ||
}); | ||
``` |
@@ -13,5 +13,5 @@ ## Local Installation | ||
const worker = Tesseract.TesseractWorker({ | ||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.4/dist/worker.min.js', | ||
workerPath: 'https://unpkg.com/tesseract.js@v2.0.0-alpha.6/dist/worker.min.js', | ||
langPath: 'https://tessdata.projectnaptha.com/4.0.0', | ||
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.wasm.js', | ||
corePath: 'https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.wasm.js', | ||
}); | ||
@@ -18,0 +18,0 @@ ``` |
{ | ||
"name": "tesseract.js", | ||
"version": "2.0.0-alpha.4", | ||
"version": "2.0.0-alpha.6", | ||
"description": "Pure Javascript Multilingual OCR", | ||
@@ -50,3 +50,3 @@ "main": "src/index.js", | ||
"resolve-url": "^0.2.1", | ||
"tesseract.js-core": "^2.0.0-beta.8", | ||
"tesseract.js-core": "^2.0.0-beta.10", | ||
"tesseract.js-utils": "^1.0.0-beta.5" | ||
@@ -53,0 +53,0 @@ }, |
@@ -42,3 +42,3 @@ # [Tesseract.js](http://tesseract.projectnaptha.com/) | ||
```html | ||
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.4/dist/tesseract.min.js'></script> | ||
<script src='https://unpkg.com/tesseract.js@v2.0.0-alpha.6/dist/tesseract.min.js'></script> | ||
``` | ||
@@ -45,0 +45,0 @@ |
@@ -74,2 +74,21 @@ /** | ||
/**
 * downloadFile
 *
 * @name downloadFile
 * @function offer a blob to the user as a file download
 * @access private
 * @param {string} path - file name suggested for the download
 * @param {Blob} blob - content to save
 */
const downloadFile = (path, blob) => {
  // Delay before the temporary object URL is released, so the browser has
  // time to start the download first.
  const REVOKE_DELAY_MS = 40000;

  if (navigator.msSaveBlob) {
    // IE 10+
    navigator.msSaveBlob(blob, path);
    return;
  }

  const link = document.createElement('a');
  // Only browsers that support the HTML5 download attribute are handled;
  // anything else is silently skipped, as before.
  if (link.download === undefined) {
    return;
  }

  const url = URL.createObjectURL(blob);
  link.setAttribute('href', url);
  link.setAttribute('download', path);
  link.style.visibility = 'hidden';
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  // Release the object URL so the blob can be garbage collected.
  setTimeout(() => URL.revokeObjectURL(url), REVOKE_DELAY_MS);
};
/* | ||
@@ -87,3 +106,3 @@ * Default options for browser worker | ||
*/ | ||
corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.8/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`, | ||
corePath: `https://unpkg.com/tesseract.js-core@v2.0.0-beta.10/tesseract-core.${typeof WebAssembly === 'object' ? 'wasm' : 'asm'}.js`, | ||
}; | ||
@@ -113,3 +132,8 @@ | ||
worker.onmessage = ({ data }) => { | ||
instance.recv(data); | ||
if (data.jobId.startsWith('Job')) { | ||
instance.recv(data); | ||
} else if (data.jobId.startsWith('Download')) { | ||
const { path, blob } = data; | ||
downloadFile(path, blob); | ||
} | ||
}; | ||
@@ -116,0 +140,0 @@ |
@@ -45,2 +45,11 @@ /** | ||
}, | ||
b64toU8Array: s => new Uint8Array(atob(s).split('').map(c => c.charCodeAt(0))), | ||
writeFile: (path, data, type) => { | ||
const blob = new Blob([data], { type }); | ||
self.postMessage({ | ||
jobId: 'Download', | ||
path, | ||
blob, | ||
}); | ||
}, | ||
}); |
@@ -0,1 +1,3 @@ | ||
const { OEM, PSM } = require('./types'); | ||
module.exports = { | ||
@@ -10,2 +12,14 @@ defaultOptions: { | ||
}, | ||
/* | ||
* default params for recognize() | ||
*/ | ||
defaultParams: { | ||
tessedit_ocr_engine_mode: OEM.TESSERACT_LSTM_COMBINED, | ||
tessedit_pageseg_mode: PSM.SINGLE_BLOCK, | ||
tessedit_char_whiltelist: '', | ||
tessedit_create_pdf: '0', | ||
textonly_pdf: '0', | ||
pdf_name: 'tesseract.js-ocr-result', | ||
pdf_title: 'Tesseract.js OCR Result', | ||
}, | ||
}; |
@@ -5,3 +5,3 @@ module.exports = { | ||
* | ||
* By default tesseract.js uses DEFAULT mode, which uses LSTM when possible. | ||
* By default tesseract.js uses TESSERACT_LSTM_COMBINED mode, which uses LSTM when possible. | ||
* If you need to use some tesseract v3 features (like tessedit_char_whitelist), | ||
@@ -18,2 +18,21 @@ * you need to use TESSERACT_ONLY mode. | ||
}, | ||
/* | ||
* PSM = Page Segmentation Mode | ||
*/ | ||
PSM: { | ||
OSD_ONLY: '0', | ||
AUTO_OSD: '1', | ||
AUTO_ONLY: '2', | ||
AUTO: '3', | ||
SINGLE_COLUMN: '4', | ||
SINGLE_BLOCK_VERT_TEXT: '5', | ||
SINGLE_BLOCK: '6', | ||
SINGLE_LINE: '7', | ||
SINGLE_WORD: '8', | ||
SINGLE_CHAR: '9', | ||
SPARSE_TEXT: '10', | ||
SPARSE_TEXT_OSD: '11', | ||
RAW_LINE: '12', | ||
COUNT: '13', | ||
}, | ||
}; |
@@ -12,3 +12,5 @@ /** | ||
const check = require('check-types'); | ||
const pdfTTF = require('./pdf-ttf'); | ||
const dump = require('./dump'); | ||
const { defaultParams } = require('./options'); | ||
@@ -56,2 +58,54 @@ /* | ||
/**
 * handleParams
 *
 * @name handleParams
 * @function initialize the api for a language and apply recognition params
 * @access private
 * @param {string} lang - lang string for Init()
 * @param {object} customParams - an object of params, layered over defaultParams
 */
const handleParams = (lang, customParams) => {
  const merged = { ...defaultParams, ...customParams };
  // The engine mode goes to Init() and is not forwarded to SetVariable()
  const { tessedit_ocr_engine_mode: engineMode, ...variables } = merged;

  api.Init(null, lang, engineMode);
  for (const name of Object.keys(variables)) {
    api.SetVariable(name, variables[name]);
  }
};
/**
 * handleOutput
 *
 * @name handleOutput
 * @function write a PDF of the recognized image when tessedit_create_pdf is '1'
 * @access private
 * @param {object} customParams - an object of params, layered over defaultParams
 */
const handleOutput = (customParams) => {
  const settings = { ...defaultParams, ...customParams };

  // PDF output is opt-in; nothing to do unless it was explicitly enabled
  if (settings.tessedit_create_pdf !== '1') {
    return;
  }

  const { pdf_name: name, pdf_title: title } = settings;
  const textOnly = settings.textonly_pdf === '1';

  const pdfRenderer = new TessModule.TessPDFRenderer(name, '/', textOnly);
  pdfRenderer.BeginDocument(title);
  pdfRenderer.AddImage(api);
  pdfRenderer.EndDocument();

  // The PDF is read back from `/<name>.pdf` on the module's FS and handed to
  // the adapter for delivery
  const pdfData = TessModule.FS.readFile(`/${name}.pdf`);
  adapter.writeFile(`${name}.pdf`, pdfData, 'application/pdf');
  TessModule._free(pdfRenderer);
};
/** | ||
* handleInit | ||
@@ -80,2 +134,3 @@ * | ||
TessModule = tessModule; | ||
TessModule.FS.writeFile('/pdf.ttf', adapter.b64toU8Array(pdfTTF)); | ||
api = new TessModule.TessBaseAPI(); | ||
@@ -129,5 +184,2 @@ res.progress({ status: 'initialized tesseract', progress: 1 }); | ||
.then(() => { | ||
const OEM = check.undefined(params['init_oem']) | ||
? TessModule.OEM_DEFAULT | ||
: params['init_oem']; | ||
const progressUpdate = (progress) => { | ||
@@ -137,11 +189,8 @@ res.progress({ status: 'initializing api', progress }); | ||
progressUpdate(0); | ||
api.Init(null, lang, OEM); | ||
progressUpdate(0.3); | ||
Object.keys(params).filter(key => !key.startsWith('init_')).forEach((key) => { | ||
api.SetVariable(key, params[key]); | ||
}); | ||
progressUpdate(0.6); | ||
handleParams(lang, params); | ||
progressUpdate(0.5); | ||
const ptr = setImage(image); | ||
progressUpdate(1); | ||
api.Recognize(null); | ||
handleOutput(params); | ||
const result = dump(TessModule, api); | ||
@@ -148,0 +197,0 @@ api.End(); |
@@ -12,3 +12,3 @@ /** | ||
const TesseractWorker = require('./common/TesseractWorker'); | ||
const { OEM } = require('./common/types'); | ||
const types = require('./common/types'); | ||
@@ -21,3 +21,3 @@ module.exports = { | ||
/** Check ./common/types for more details */ | ||
OEM, | ||
...types, | ||
}; |
@@ -36,2 +36,9 @@ /** | ||
}, | ||
b64toU8Array: s => Buffer.from(s, 'base64'), | ||
writeFile: (path, data) => { | ||
const fs = require('fs'); | ||
fs.writeFile(path, data, () => { | ||
console.log('File Write Succeeded!'); | ||
}); | ||
}, | ||
}); |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
2066
1195339
44
3