bank-voucher-ocr
Advanced tools
Comparing version 0.13.0 to 1.0.0
@@ -5,3 +5,3 @@ /** | ||
* | ||
* @version 0.13.0 | ||
* @version 1.0.0 | ||
* @author waiting | ||
@@ -15,7 +15,7 @@ * @license MIT | ||
import { promisify } from 'util'; | ||
import { tmpdir } from 'os'; | ||
import { tmpdir, cpus } from 'os'; | ||
import { crop, info, resize } from 'easyimage'; | ||
import * as moment_ from 'moment'; | ||
import { from, of, range, defer } from 'rxjs'; | ||
import { concatMap, map, mergeMap, reduce, catchError, defaultIfEmpty, delay, filter, mapTo, retry, skipWhile, take, tap } from 'rxjs/operators'; | ||
import { concatMap, map, mergeMap, catchError, last, mapTo, reduce, defaultIfEmpty, filter, skipWhile, take, tap } from 'rxjs/operators'; | ||
import run from 'rxrunscript'; | ||
@@ -247,4 +247,20 @@ | ||
// crop all defined zones from a image | ||
function cropImgAllZones(srcPath, zoneTmpDir, ocrZoneOptsArr) { | ||
return from(ocrZoneOptsArr).pipe(mergeMap(ocrZoneOpts => { | ||
function cropImgAllZones(srcPath, zoneTmpDir, ocrFields, ocrZoneOptsArr) { | ||
const flds = []; | ||
const srcFldSet = new Set(); | ||
for (const srcFld of Object.values(ocrFields)) { | ||
if (!srcFld || srcFldSet.has(srcFld)) { | ||
continue | ||
} | ||
srcFldSet.add(srcFld); | ||
} | ||
for (const row of ocrZoneOptsArr) { | ||
const fld = row.zoneName; | ||
if (fld && srcFldSet.has(fld)) { | ||
flds.push(row); | ||
} | ||
} | ||
return from(flds).pipe(mergeMap(ocrZoneOpts => { | ||
return cropImgZone(srcPath, zoneTmpDir, ocrZoneOpts).pipe(map(img => { | ||
@@ -284,9 +300,9 @@ return [ocrZoneOpts.zoneName, img] | ||
} | ||
// ocr a iamge file | ||
function runOcr(path, lang) { | ||
// second path will be append with '.txt' | ||
// ocr an image file; txtPath is the output base path, given without extension | ||
function runOcr(imgPath, lang, txtPath) { | ||
// tesseract appends '.txt' to the second path (txtPath) when writing its output | ||
if (!lang) { | ||
lang = 'eng'; | ||
} | ||
const cmd = `tesseract "${path}" "${path}" -l ${lang}`; | ||
const cmd = `tesseract "${imgPath}" "${txtPath}" -l ${lang}`; | ||
@@ -296,3 +312,4 @@ // const opts = { | ||
// } | ||
return run(cmd) | ||
return run(cmd).pipe(last(), catchError(() => of(void 0)), // tesseract will exit with code(0) but output with stderr | ||
mapTo(void 0)) | ||
} | ||
@@ -377,3 +394,33 @@ | ||
} | ||
/**
 * Look up the OCR text output path previously recorded for a key and language.
 *
 * @param ocrRetTxtMap  outer Map whose values are per-language Maps (lang -> txtPath)
 * @param fieldName     key into the outer Map
 * @param lang          OCR language code used as key into the per-language Map
 * @returns the recorded path, or '' when no entry (or only a falsy value) exists
 */
function getOcrRetLangPath(ocrRetTxtMap, fieldName, lang) {
    const langMap = getOcrRetLangMap(ocrRetTxtMap, fieldName);
    // No per-language map for this key means nothing has been recorded yet.
    const recorded = langMap ? langMap.get(lang) : undefined;
    return recorded || '';
}
/**
 * Record the OCR text output path for a key and language.
 *
 * The call is a no-op when any of fieldName, lang or txtPath is falsy.
 * A per-language Map is created and registered under fieldName on first use;
 * later calls for the same key update that same inner Map in place.
 *
 * @param ocrRetTxtMap  outer Map whose values are per-language Maps (lang -> txtPath)
 * @param fieldName     key into the outer Map
 * @param lang          OCR language code
 * @param txtPath       path to record for (fieldName, lang)
 */
function updateOcrRetTxtMap(ocrRetTxtMap, fieldName, lang, txtPath) {
    if (!fieldName || !lang || !txtPath) {
        return
    }
    let ocrRetLangMap = getOcrRetLangMap(ocrRetTxtMap, fieldName);
    if (!ocrRetLangMap) {
        // First entry for this key: register the inner map once.
        ocrRetLangMap = new Map();
        ocrRetTxtMap.set(fieldName, ocrRetLangMap);
    }
    // lang and txtPath are already known to be truthy from the guard above,
    // so no second check is needed before storing.
    updateOcrRetLangMap(ocrRetLangMap, lang, txtPath);
}
function updateOcrRetLangMap(ocrRetLangMap, lang, txtPath) { | ||
ocrRetLangMap.set(lang, txtPath); | ||
} | ||
function getOcrRetLangMap(ocrRetTxtMap, fieldName) { | ||
return ocrRetTxtMap.get(fieldName) | ||
} | ||
const moment$1 = moment_; | ||
@@ -412,3 +459,3 @@ | ||
function recognize(imgPath, options) { | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, resizeImgDir, voucherConfigMap, globalScale, skipImgDir, } = options; | ||
const { bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, resizeImgDir, voucherConfigMap, globalScale, skipImgDir, } = options; | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir; | ||
@@ -418,2 +465,3 @@ const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir; | ||
const skipDir = skipImgDir ? join(skipImgDir, moment$1().format('YYYYMMDD')) : ''; | ||
const cnumber = typeof concurrent === 'number' && concurrent > 0 ? concurrent : cpus().length; | ||
// if the config is set for 300dpi but the source image is 600dpi, then set globalScale=600/300. default is 1 | ||
@@ -438,6 +486,6 @@ const voucherConfigMapNew = parseVoucherConfigMapScale(voucherConfigMap, globalScale); | ||
}; | ||
const ret$ = recognizePageBank(bankOpts).pipe(filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), concatMap(({ bankName, pagePath }) => { | ||
const ret$ = recognizePageBank(bankOpts).pipe(filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), mergeMap(({ bankName, pagePath }) => { | ||
!!debug && console.info('start split page'); | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
}), concatMap(({ bankName, imgFile }) => { | ||
}), mergeMap(({ bankName, imgFile }) => { | ||
const ocrFields = getOcrFields(bankName, voucherConfigMapNew); | ||
@@ -451,3 +499,2 @@ | ||
baseDir, | ||
concurrent: 2, | ||
debug: !!debug, | ||
@@ -467,3 +514,3 @@ defaultValue: '', | ||
})) | ||
}), mergeMap(retInfo => { | ||
}, cnumber > 0 ? cnumber : 1), mergeMap(retInfo => { | ||
const opts = { | ||
@@ -493,3 +540,3 @@ retInfo, | ||
concatMap(zoneInfo => { | ||
return runOcr(zoneInfo.path, lang).pipe(map(() => ({ path, zoneImgPath: zoneInfo.path })), mapTo(zoneInfo.path), catchError(() => of(zoneInfo.path))) | ||
return runOcr(zoneInfo.path, lang, zoneInfo.path).pipe(mapTo(zoneInfo.path)) | ||
}), concatMap(zoneImgPath => { | ||
@@ -545,4 +592,4 @@ // 批量提取参数值 | ||
function recognizeFields(options) { | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = join(baseDir, zoneTmpDirPrefix, `${basename(imgFile.path)}-${Math.random().toString()}`); | ||
const { bankName, baseDir, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = join(baseDir, zoneTmpDirPrefix, `${basename(imgFile.path)}`); | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap); | ||
@@ -553,7 +600,7 @@ | ||
} | ||
const stream$ = from(createDir(zoneTmpDir)).pipe(mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, bankConfig.ocrZones)), // 切分图片区域分别做ocr识别 | ||
concatMap(fileMap => { | ||
const stream$ = from(createDir(zoneTmpDir)).pipe( | ||
// 切分图片区域分别做ocr识别 | ||
mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), concatMap(fileMap => { | ||
const opts = { | ||
bankConfig, ocrFields, defaultValue, debug, | ||
concurrent: concurrent > 0 ? concurrent : 2, | ||
zoneImgMap: fileMap, | ||
@@ -563,2 +610,8 @@ }; | ||
return batchOcrAndRetrieve(opts) | ||
}), tap(() => { | ||
if (!debug) { | ||
setTimeout(dir => { | ||
rimraf(dir).catch(console.info); | ||
}, 5000, zoneTmpDir); | ||
} | ||
})); | ||
@@ -569,26 +622,33 @@ | ||
function batchOcrAndRetrieve(options) { | ||
const { zoneImgMap, bankConfig, ocrFields, defaultValue, concurrent, debug, } = options; | ||
const { zoneImgMap, bankConfig, ocrFields, defaultValue, debug, } = options; | ||
const { bankName } = bankConfig; | ||
const del$ = from(zoneImgMap.entries()).pipe(delay(15000), mergeMap(([, imgInfo]) => { | ||
return defer(async () => { | ||
const img = imgInfo.path; | ||
const txt = img + '.txt'; | ||
if (await isFileExists(img)) { | ||
await rimraf(img); | ||
} | ||
if (await isFileExists(txt)) { | ||
await rimraf(txt); | ||
} | ||
return null | ||
}).pipe(delay(20000), retry(2), catchError(err => { | ||
console.info('Delete zone file retry failed:', err); | ||
return of(null) | ||
})) | ||
}), catchError(() => { | ||
return of(null) | ||
})); | ||
// const del$ = ofrom(zoneImgMap.entries()).pipe( | ||
// delay(15000), | ||
// mergeMap(([, imgInfo]) => { | ||
// return defer(async () => { | ||
// const img = imgInfo.path | ||
// const txt = img + '.txt' | ||
// if (await isFileExists(img)) { | ||
// await rimraf(img) | ||
// } | ||
// if (await isFileExists(txt)) { | ||
// await rimraf(txt) | ||
// } | ||
// return null | ||
// }).pipe( | ||
// delay(20000), | ||
// retry(2), | ||
// catchError(err => { | ||
// console.info('Delete zone file retry failed:', err) | ||
// return of(null) | ||
// }), | ||
// ) | ||
// }), | ||
// catchError(() => { | ||
// return of(null) | ||
// }), | ||
// ) | ||
const process$ = from(zoneImgMap.entries()).pipe(concatMap((zoneImgRow) => { | ||
return ocrAndPickFromZoneImg(zoneImgRow, bankConfig, concurrent, debug) | ||
}), reduce((acc, curr) => acc.set(curr.fieldName, curr.value), new Map()), map(retMap => retMap.set('bank' /* bank */, bankName)), map(retMap => setDefaultValue(retMap, ocrFields, defaultValue)), tap(() => debug || del$.subscribe())); | ||
return ocrAndPickFromZoneImg(zoneImgRow, bankConfig, debug) | ||
}), reduce((acc, curr) => acc.set(curr.fieldName, curr.value), new Map()), map(retMap => retMap.set('bank' /* bank */, bankName)), map(retMap => setDefaultValue(retMap, ocrFields, defaultValue))); | ||
@@ -614,3 +674,3 @@ return process$ | ||
const { fieldName, value } = zoneRet; | ||
const ret = { fieldName, value }; | ||
const ret = Object.assign({}, zoneRet); | ||
@@ -712,3 +772,3 @@ switch (fieldName) { | ||
const { retInfo, resizeDir, debug, scale, jpegQuality } = options; | ||
let filename = retInfo.get('filename'); | ||
const filename = retInfo.get('filename'); | ||
const path = retInfo.get('path'); | ||
@@ -723,11 +783,16 @@ const sn = retInfo.get('sn' /* sn */); | ||
} | ||
// YYYYMMDD-A15295623630009-0.31486898522590034-pageSplitItemIndex.jpg | ||
const arr = filename.split('.').slice(0, -1).join('').split('-'); | ||
arr.splice(2, 1); | ||
let filename2 = arr.join('-'); | ||
if (sn) { | ||
const name = filename.split('.').slice(0, -1); | ||
filename = name + `-${sn.replace(/[^\d\w]/g, '_')}.jpg`; | ||
filename2 = filename2 + `-${sn.replace(/[^\d\w]/g, '_')}`; | ||
} | ||
filename2 = filename2 + '.jpg'; | ||
const curDate = moment$1().format('YYYY-MM-DD'); | ||
const targetPath = join(resizeDir, curDate, filename); | ||
const targetPath = join(resizeDir, curDate, filename2); | ||
retInfo.set('filename', filename); | ||
retInfo.set('filename', filename2); | ||
return resizeAndSaveImg(path, targetPath, scale, jpegQuality).pipe(map(imgInfo => { | ||
@@ -746,4 +811,5 @@ retInfo.set('path', imgInfo.path); | ||
} | ||
function ocrAndPickFromZoneImg(zoneImgRow, config, concurrent = 2, debug = false) { | ||
function ocrAndPickFromZoneImg(zoneImgRow, config, debug = false) { | ||
const { ocrDefaultLangs, ocrFieldLangs, regexpOpts, ocrFields } = config; | ||
const ocrRetTxtMap = new Map(); | ||
@@ -754,9 +820,10 @@ return from(Object.entries(ocrFields)).pipe(filter(data => { | ||
return !!zoneName && zoneName === zoneImgRow[0] | ||
}), mergeMap(data => { | ||
}), concatMap(data => { | ||
const fieldName = data[0]; | ||
const zoneName = data[1]; | ||
return ocrAndPickFieldFromZoneImg(fieldName, zoneImgRow, regexpOpts, ocrDefaultLangs, ocrFieldLangs, debug) | ||
}, concurrent)) | ||
return ocrAndPickFieldFromZoneImg(fieldName, zoneName, zoneImgRow, regexpOpts, ocrDefaultLangs, ocrFieldLangs, debug, ocrRetTxtMap) | ||
})) | ||
} | ||
function ocrAndPickFieldFromZoneImg(fieldName, zoneImgRow, regexpOpts, defaultLangs, fieldLangs, debug = false) { | ||
function ocrAndPickFieldFromZoneImg(fieldName, zoneName, zoneImgRow, regexpOpts, defaultLangs, fieldLangs, debug = false, ocrRetTxtMap) { | ||
const [, zoneImg] = zoneImgRow; | ||
@@ -774,6 +841,37 @@ const langs = genFieldLangs(fieldName, defaultLangs, fieldLangs); | ||
concatMap(lang => { | ||
// console.log(`fld "${fieldName}" use lang:`, lang, zoneImg.path) | ||
return runOcr(zoneImg.path, lang).pipe(mapTo(true), catchError(() => of(true))) | ||
}), concatMap(() => { | ||
return retrieveKeyValuesFromOcrResult(zoneImg.path + '.txt', regexp, prepareContent, debug).pipe(map(val => ({ fieldName, value: val }))) | ||
// console.info(`\n\n\nfld "${fieldName}" zoneName: "${zoneName}" use lang: ${lang}, path: "${zoneImg.path}"\n`) | ||
const path = getOcrRetLangPath(ocrRetTxtMap, zoneName, lang); | ||
if (path) { | ||
// console.info(`reused txtPath. fieldName: "${fieldName}", zoneName: "${zoneName}", lang: "${lang}", | ||
// txtPath: "${path}"\n\n`) | ||
return retrieveKeyValuesFromOcrResult(path + '.txt', regexp, prepareContent, debug).pipe(map(val => { | ||
return { | ||
fieldName, | ||
zoneName, | ||
value: val, | ||
usedLang: lang, | ||
txtPath: path, | ||
} | ||
})) | ||
} | ||
else { | ||
const imgPath = zoneImg.path; | ||
const txtPath = imgPath.split('.').slice(0, -1).join('.') + `-${Math.random()}`; | ||
return runOcr(imgPath, lang, txtPath).pipe(concatMap(() => { | ||
// console.info(`\n\n--------- usedLang: "${lang}", txtPath:"${txtPath}"`) | ||
return retrieveKeyValuesFromOcrResult(txtPath + '.txt', regexp, prepareContent, debug).pipe(map(val => { | ||
return { | ||
fieldName, | ||
zoneName, | ||
value: val, | ||
usedLang: lang, | ||
txtPath, | ||
} | ||
})) | ||
})) | ||
} | ||
}), tap(({ zoneName: zone, usedLang, txtPath }) => { | ||
updateOcrRetTxtMap(ocrRetTxtMap, zone, usedLang, txtPath); | ||
}), skipWhile((data, index) => { | ||
@@ -780,0 +878,0 @@ const valid = validateZoneImgRow(data.fieldName, data.value); |
@@ -5,3 +5,3 @@ /** | ||
* | ||
* @version 0.13.0 | ||
* @version 1.0.0 | ||
* @author waiting | ||
@@ -12,3 +12,3 @@ * @license MIT | ||
import{access,chmod,close,copyFile,mkdir,open,readdir,readFile,rmdir,stat,unlink,write,writeFile}from"fs";import{basename,join,normalize,resolve,sep}from"path";import{promisify}from"util";import{tmpdir}from"os";import{crop,info,resize}from"easyimage";import*as moment_ from"moment";import{from,of,range,defer}from"rxjs";import{concatMap,map,mergeMap,reduce,catchError,defaultIfEmpty,delay,filter,mapTo,retry,skipWhile,take,tap}from"rxjs/operators";import run from"rxrunscript";const closeAsync=promisify(close),chmodAsync=promisify(chmod),copyFileAsync=promisify(copyFile),mkdirAsync=promisify(mkdir),openAsync=promisify(open),readFileAsync=promisify(readFile),readDirAsync=promisify(readdir),rmdirAsync=promisify(rmdir),unlinkAsync=promisify(unlink),writeAsync=promisify(write),writeFileAsync=promisify(writeFile);function isPathAcessible(e){return e?new Promise(t=>access(e,e=>t(!e))):Promise.resolve(!1)}function isDirExists(e){return e?isDirFileExists(e,"DIR"):Promise.resolve(!1)}function isFileExists(e){return e?isDirFileExists(e,"FILE"):Promise.resolve(!1)}function isDirFileExists(e,t){return e?new Promise(i=>{stat(e,(e,r)=>{i(!(e||!r)&&("DIR"===t?r.isDirectory():r.isFile()))})}):Promise.resolve(!1)}async function createDir(e){if(!e)throw new Error("value of path param invalid");e=normalize(e),await isDirExists(e)||await e.split(sep).reduce(async(e,t)=>{const i=resolve(await e,t);return await isPathAcessible(i)||await mkdirAsync(i,493),i},Promise.resolve(sep))}async function rimraf(e){e&&(await _rimraf(e),await isDirExists(e)&&await rmdirAsync(e))}async function _rimraf(e){if(e&&await isPathAcessible(e)){if(await isFileExists(e))return void await unlinkAsync(e);const t=await readDirAsync(e);if(t.length)for(const i of t)await _rimraf(join(e,i));else await rmdirAsync(e)}}const 
initialBaseTmpDir=join(tmpdir(),"voucher-ocr"),initialResizeImgDir=join(initialBaseTmpDir,"resize"),initialSplitTmpDir=join(initialBaseTmpDir,"split"),zoneTmpDirPrefix="zone",initialBankZone={zoneName:"bank",width:2250,height:390,offsetX:70,offsetY:10},moment=moment_;function splitPagetoItems(e,t,i){return readImgInfo(e).pipe(map(r=>{const n=calcItemsPerPage(r.height,i.height);return n?range(0,n).pipe(mergeMap(n=>{const a={index:n,itemConfig:Object.assign({},i),srcPath:e,targetDir:t,pageHeight:r.height};return r.width<a.itemConfig.width&&(a.itemConfig.width=r.width),parseSplitPage(a).pipe(mergeMap(e=>{const t=new Map;return e.name&&t.set(e.name,e),of(t)}))})):of(new Map)}),concatMap(e=>e))}function resizeAndSaveImg(e,t,i,r){if(i<=0||i>1)throw new Error(`value of scale invalid: "${i}"`);return readImgInfo(e).pipe(mergeMap(n=>{const a={src:e,dst:t,width:n.width*i,height:n.height*i,quality:r};return from(resize(a))}),map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function parseSplitPage(e){const{index:t,srcPath:i,pageHeight:r}=e,{width:n,marginBottom:a}=e.itemConfig;let{height:o}=e.itemConfig;const s=0,c=t*o;if(c+o>r&&(o=r-c),o/r<.1||o<100){const e={name:"",path:"",width:0,height:0,size:0};return of(e)}const p=basename(i),m=p.split(".")[0],l=moment().format("YYYYMMDD"),f=join(e.targetDir,`${l}-${m}-${Math.random()}-${t}.jpg`),u={dst:f,src:i,quality:100,cropWidth:n,cropHeight:o+a,x:0,y:c};return from(crop(u)).pipe(mergeMap(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return of(t)}))}function readImgInfo(e){return from(info(e))}function calcItemsPerPage(e,t){const i=33;return e>=t?Math.ceil((e+33)/t):1}function getOcrZoneOptsByBankName(e,t){return t.get(e)}function cropImgAllZones(e,t,i){return from(i).pipe(mergeMap(i=>cropImgZone(e,t,i).pipe(map(e=>[i.zoneName,e]))),reduce((e,t)=>(e.set(t[0],t[1]),e),new Map))}function 
cropImgZone(e,t,i){const{zoneName:r,width:n,height:a,offsetX:o,offsetY:s}=i,c=join(t,`${r}-${Math.random()}.png`),p={dst:c,src:e,quality:100,cropWidth:n,cropHeight:a,x:o,y:s};return from(crop(p)).pipe(map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function runOcr(e,t){t||(t="eng");const i=`tesseract "${e}" "${e}" -l ${t}`;return run(i)}function retrieveKeyValuesFromOcrResult(e,t,i,r=!1){if(!t)throw new Error("matchRules empty");return readTxtFile(e).pipe(map(e=>({buf:e,regexp:t})),map(({buf:e,regexp:t})=>{const n=i&&"function"==typeof i?i(e):e.toString("utf8");return retrieveValueByRegexp(n,t,r)}))}function getRegexpOptsByName(e,t){for(const i of Object.keys(t))if(i===e)return t[i]}function retrieveValueByRegexp(e,t,i=!1){const r=regexMatch(e,t,i);return i&&console.info("retrieveValueByRegexp ----- text start: ---------------\x3e \n",e,"\n<--------------- text END ----------------\n\n",t,">>>>>>>>matched value: ",r,"\n"),r}function readTxtFile(e){if(!e)throw new Error("path empty");const t=e.split(".");if(t.length>1&&"txt"!==t[t.length-1].toLowerCase())throw new Error(`file extensiion must empty or be '.txt', but is: "${e}"`);return defer(async()=>readFileAsync(e))}function prepareContent(e){let t=e&&e.byteLength?e.toString():"";return t?t=t.replace(/(?<=\S) /g,""):""}function regexMatch(e,t,i=!1){if(e)for(const r of t){const t=e.match(r);if(Array.isArray(t)&&t.length)return r.global&&t.length>1?(i&&console.info("----------multi matched regex: --------------\x3e\n",t,"\n--- used regex ----: ",r,"\n<-------------ignore matched result---------------\n\n"),""):(i&&console.info("----------matched regex: --------------\x3e\n",t,"\n--- used regex ----: ",r,"\n<----------------------------\n\n"),t[0])}}const moment$1=moment_;class Bvo{constructor(e){this.options=e;const 
t=+this.options.globalScale;this.options.globalScale=Number.isNaN(t)||t<=0?1:t,this.options.debug=!!this.options.debug;const{baseTmpDir:i,splitTmpDir:r,resizeImgDir:n}=e,a=i||initialBaseTmpDir,o=r||initialSplitTmpDir,s=n||initialResizeImgDir;from(createDir(a)).pipe(catchError(e=>(console.info(e),of(null))),concatMap(()=>createDir(o)),catchError(e=>(console.info(e),of(null))),concatMap(()=>createDir(s)),catchError(e=>(console.info(e),of(null)))).subscribe(()=>{},console.error)}run(e){return recognize(e,this.options)}}function recognize(e,t){const{bankZone:i,baseTmpDir:r,debug:n,defaultOcrLang:a,jpegQuality:o,scale:s,splitTmpDir:c,resizeImgDir:p,voucherConfigMap:m,globalScale:l,skipImgDir:f}=t,u=r||initialBaseTmpDir,g=c||initialSplitTmpDir,h=p||initialResizeImgDir,d=f?join(f,moment$1().format("YYYYMMDD")):"",y=parseVoucherConfigMapScale(m,l),w=parseOcrZoneScale(i,l),b=s/l,k=getBankRegexpOpts(y),D={baseDir:u,path:e,bankZone:w,bankRegexpOptsMap:k,debug:!!n,lang:a,skipImgDir:d},x=recognizePageBank(D).pipe(filter(({bankName:e})=>!!e&&"n/a"!==e),concatMap(({bankName:e,pagePath:t})=>(n&&console.info("start split page"),splitPageToImgs(t,e,g,y))),concatMap(({bankName:e,imgFile:t})=>{const i=getOcrFields(e,y);if(!i)throw new Error(`ocrFields not defined with bankName: "${e}"`);const r={bankName:e,baseDir:u,concurrent:2,debug:!!n,defaultValue:"",imgFile:t,ocrFields:i,voucherConfigMap:y};return n&&console.info("recognize item"),recognizeFields(r).pipe(map(i=>(i.set("bank",e),i.set("filename",t.name.trim()),i.set("path",t.path.trim()),i)))}),mergeMap(e=>{const t={retInfo:e,resizeDir:h,scale:b,jpegQuality:o,debug:!!n};return saveImgAndPrune(t)})),M=from(isFileExists(e)).pipe(filter(e=>e));return M.pipe(mergeMap(()=>x))}function recognizePageBank(e){const{baseDir:t,path:i,bankZone:r,bankRegexpOptsMap:n,debug:a,lang:o,skipImgDir:s}=e,c=join(t,zoneTmpDirPrefix,`${basename(i)}-${Math.random().toString()}`);return a&&console.info("recognize 
pageBank:",c,i),from(createDir(c)).pipe(catchError(e=>(console.info(e),of(null))),mergeMap(()=>cropImgZone(join(i),c,r)),concatMap(e=>runOcr(e.path,o).pipe(map(()=>({path:i,zoneImgPath:e.path})),mapTo(e.path),catchError(()=>of(e.path)))),concatMap(e=>from(n.entries()).pipe(concatMap(([t,i])=>retrieveKeyValuesFromOcrResult(e+".txt",i,e=>e.toString().replace(/(?<=\S)[. ]{1,2}(?=\S)/g,""),a).pipe(map(e=>({bankName:t,value:e})))),skipWhile(({value:e})=>"undefined"==typeof e||"string"==typeof e&&!e.length),take(1),map(({bankName:e})=>({bankName:e,pagePath:i})),defaultIfEmpty({bankName:"n/a",pagePath:""}))),tap(e=>{const{bankName:t,pagePath:r}=e;"n/a"!==t&&r||(console.info(`recognize bank of page fail. no matached regexp. file: "${i}", pagePath: "${r}" `),cpSkipImg(i,s)),a||rimraf(c).catch(console.info)}))}async function cpSkipImg(e,t){t&&(await isPathAcessible(t)||await createDir(t),copyFileAsync(e,join(t,basename(e))).catch(console.error))}function splitPageToImgs(e,t,i,r){const n=r.get(t);if(!n)throw new Error("bank config empty during split page to images");return splitPagetoItems(e,i,n).pipe(mergeMap(e=>{const i=from(e.values()).pipe(map(e=>({bankName:t,imgFile:e})));return i}))}function recognizeFields(e){const{bankName:t,baseDir:i,concurrent:r,debug:n,defaultValue:a,imgFile:o,ocrFields:s,voucherConfigMap:c}=e,p=join(i,zoneTmpDirPrefix,`${basename(o.path)}-${Math.random().toString()}`),m=getOcrZoneOptsByBankName(t,c);if(!m)throw new Error(`get bankConfig empty with bankName: "${t}"`);const l=from(createDir(p)).pipe(mergeMap(()=>cropImgAllZones(o.path,p,m.ocrZones)),concatMap(e=>{const t={bankConfig:m,ocrFields:s,defaultValue:a,debug:n,concurrent:r>0?r:2,zoneImgMap:e};return batchOcrAndRetrieve(t)}));return l}function batchOcrAndRetrieve(e){const{zoneImgMap:t,bankConfig:i,ocrFields:r,defaultValue:n,concurrent:a,debug:o}=e,{bankName:s}=i,c=from(t.entries()).pipe(delay(15e3),mergeMap(([,e])=>defer(async()=>{const t=e.path,i=t+".txt";return await isFileExists(t)&&await 
rimraf(t),await isFileExists(i)&&await rimraf(i),null}).pipe(delay(2e4),retry(2),catchError(e=>(console.info("Delete zone file retry failed:",e),of(null))))),catchError(()=>of(null))),p=from(t.entries()).pipe(concatMap(e=>ocrAndPickFromZoneImg(e,i,a,o)),reduce((e,t)=>e.set(t.fieldName,t.value),new Map),map(e=>e.set("bank",s)),map(e=>setDefaultValue(e,r,n)),tap(()=>o||c.subscribe()));return p}function setDefaultValue(e,t,i=""){const r=new Map;for(const i of Object.keys(t)){const t=e.get(i);"string"==typeof t?r.set(i,t):r.set(i,"")}return r}function processZoneImgRow(e){const{fieldName:t,value:i}=e,r={fieldName:t,value:i};switch(t){case"amount":r.value=i.trim().replace(/,/g,"");break;case"date":r.value=i.trim().replace(/\D/g,"");break;case"sn":case"destAccountNumber":case"paymentAccountNumber":r.value=i.trim()}return r}function validateZoneImgRow(e,t){if("string"!=typeof t)return!1;switch(e){case"amount":if(validateRetInfoAmout(t))return!0;break;case"sn":if(t)return!0;break;case"date":if(validateRetInfoDate(t))return!0;break;case"bank":return!0;default:if("string"==typeof t)return!0}return!1}function validateRetInfoAmout(e){if(!e)return!1;if(!e.trim())return!1;const t=parseFloat(e);return!Number.isNaN(t)&&"number"==typeof t}function validateRetInfoDate(e){return!!e&&moment$1(e,"YYYYMMDD").isValid()}function getBankRegexpOpts(e){const t=new Map;for(const{bankName:i,regexpOpts:r}of e.values())r&&r.bank&&t.set(i,r.bank);if(!t.size)throw new Error("not BankRegexpOpts found, should not set");return t}function getOcrFields(e,t){const i=t.get(e);if(!i)throw new Error(`get ocrFields empty by bankName: "${e}"`);return i.ocrFields}function saveImgAndPrune(e){const{retInfo:t,resizeDir:i,debug:r,scale:n,jpegQuality:a}=e;let o=t.get("filename");const s=t.get("path"),c=t.get("sn");if(!o)throw new Error(`result info map invalid with empty path. info: ${t}`);if(!s)throw new Error(`result info map invalid with empty path. 
info: ${t}`);if(c){const e=o.split(".").slice(0,-1);o=e+`-${c.replace(/[^\d\w]/g,"_")}.jpg`}const p=moment$1().format("YYYY-MM-DD"),m=join(i,p,o);return t.set("filename",o),resizeAndSaveImg(s,m,n,a).pipe(map(e=>(t.set("path",e.path),t)),tap(()=>{r||unlinkAsync(s).catch(console.info)}))}function genFieldLangs(e,t,i){return i&&"undefined"!=typeof i[e]&&Array.isArray(i[e])?i[e]:t}function ocrAndPickFromZoneImg(e,t,i=2,r=!1){const{ocrDefaultLangs:n,ocrFieldLangs:a,regexpOpts:o,ocrFields:s}=t;return from(Object.entries(s)).pipe(filter(t=>{const i=t[1];return!!i&&i===e[0]}),mergeMap(t=>{const i=t[0];return ocrAndPickFieldFromZoneImg(i,e,o,n,a,r)},i))}function ocrAndPickFieldFromZoneImg(e,t,i,r,n,a=!1){const[,o]=t,s=genFieldLangs(e,r,n),c=s.length-1,p=getRegexpOptsByName(e,i);if(!p)throw new Error(`got regexp empty by zoneName: "${e}"`);return from(s).pipe(concatMap(e=>runOcr(o.path,e).pipe(mapTo(!0),catchError(()=>of(!0)))),concatMap(()=>retrieveKeyValuesFromOcrResult(o.path+".txt",p,prepareContent,a).pipe(map(t=>({fieldName:e,value:t})))),skipWhile((e,t)=>{const i=validateZoneImgRow(e.fieldName,e.value);return!i&&t!==c}),take(1),map(e=>("string"!=typeof e.value&&(e.value=""),e)),map(processZoneImgRow))}function parseVoucherConfigMapScale(e,t){const i=new Map;for(const[r,n]of e){const e=Object.assign({},n),a=[];for(const i of e.ocrZones)a.push(parseOcrZoneScale(i,t));e.ocrZones=a,e.width=e.width*t,e.height=e.height*t,e.marginBottom=e.marginBottom*t,i.set(r,e)}return i}function parseOcrZoneScale(e,t){const i=Object.assign({},e);return i.width=i.width*t,i.height=i.height*t,i.offsetX=i.offsetX*t,i.offsetY=i.offsetY*t,i}export{initialBankZone,initialBaseTmpDir,initialResizeImgDir,initialSplitTmpDir,zoneTmpDirPrefix,Bvo,recognize}; | ||
import{access,chmod,close,copyFile,mkdir,open,readdir,readFile,rmdir,stat,unlink,write,writeFile}from"fs";import{basename,join,normalize,resolve,sep}from"path";import{promisify}from"util";import{tmpdir,cpus}from"os";import{crop,info,resize}from"easyimage";import*as moment_ from"moment";import{from,of,range,defer}from"rxjs";import{concatMap,map,mergeMap,catchError,last,mapTo,reduce,defaultIfEmpty,filter,skipWhile,take,tap}from"rxjs/operators";import run from"rxrunscript";const closeAsync=promisify(close),chmodAsync=promisify(chmod),copyFileAsync=promisify(copyFile),mkdirAsync=promisify(mkdir),openAsync=promisify(open),readFileAsync=promisify(readFile),readDirAsync=promisify(readdir),rmdirAsync=promisify(rmdir),unlinkAsync=promisify(unlink),writeAsync=promisify(write),writeFileAsync=promisify(writeFile);function isPathAcessible(e){return e?new Promise(t=>access(e,e=>t(!e))):Promise.resolve(!1)}function isDirExists(e){return e?isDirFileExists(e,"DIR"):Promise.resolve(!1)}function isFileExists(e){return e?isDirFileExists(e,"FILE"):Promise.resolve(!1)}function isDirFileExists(e,t){return e?new Promise(n=>{stat(e,(e,i)=>{n(!(e||!i)&&("DIR"===t?i.isDirectory():i.isFile()))})}):Promise.resolve(!1)}async function createDir(e){if(!e)throw new Error("value of path param invalid");e=normalize(e),await isDirExists(e)||await e.split(sep).reduce(async(e,t)=>{const n=resolve(await e,t);return await isPathAcessible(n)||await mkdirAsync(n,493),n},Promise.resolve(sep))}async function rimraf(e){e&&(await _rimraf(e),await isDirExists(e)&&await rmdirAsync(e))}async function _rimraf(e){if(e&&await isPathAcessible(e)){if(await isFileExists(e))return void await unlinkAsync(e);const t=await readDirAsync(e);if(t.length)for(const n of t)await _rimraf(join(e,n));else await rmdirAsync(e)}}const 
initialBaseTmpDir=join(tmpdir(),"voucher-ocr"),initialResizeImgDir=join(initialBaseTmpDir,"resize"),initialSplitTmpDir=join(initialBaseTmpDir,"split"),zoneTmpDirPrefix="zone",initialBankZone={zoneName:"bank",width:2250,height:390,offsetX:70,offsetY:10},moment=moment_;function splitPagetoItems(e,t,n){return readImgInfo(e).pipe(map(i=>{const r=calcItemsPerPage(i.height,n.height);return r?range(0,r).pipe(mergeMap(r=>{const a={index:r,itemConfig:Object.assign({},n),srcPath:e,targetDir:t,pageHeight:i.height};return i.width<a.itemConfig.width&&(a.itemConfig.width=i.width),parseSplitPage(a).pipe(mergeMap(e=>{const t=new Map;return e.name&&t.set(e.name,e),of(t)}))})):of(new Map)}),concatMap(e=>e))}function resizeAndSaveImg(e,t,n,i){if(n<=0||n>1)throw new Error(`value of scale invalid: "${n}"`);return readImgInfo(e).pipe(mergeMap(r=>{const a={src:e,dst:t,width:r.width*n,height:r.height*n,quality:i};return from(resize(a))}),map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function parseSplitPage(e){const{index:t,srcPath:n,pageHeight:i}=e,{width:r,marginBottom:a}=e.itemConfig;let{height:o}=e.itemConfig;const s=0,c=t*o;if(c+o>i&&(o=i-c),o/i<.1||o<100){const e={name:"",path:"",width:0,height:0,size:0};return of(e)}const p=basename(n),m=p.split(".")[0],l=moment().format("YYYYMMDD"),f=join(e.targetDir,`${l}-${m}-${Math.random()}-${t}.jpg`),u={dst:f,src:n,quality:100,cropWidth:r,cropHeight:o+a,x:0,y:c};return from(crop(u)).pipe(mergeMap(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return of(t)}))}function readImgInfo(e){return from(info(e))}function calcItemsPerPage(e,t){const n=33;return e>=t?Math.ceil((e+33)/t):1}function getOcrZoneOptsByBankName(e,t){return t.get(e)}function cropImgAllZones(e,t,n,i){const r=[],a=new Set;for(const e of Object.values(n))e&&!a.has(e)&&a.add(e);for(const e of i){const t=e.zoneName;t&&a.has(t)&&r.push(e)}return 
from(r).pipe(mergeMap(n=>cropImgZone(e,t,n).pipe(map(e=>[n.zoneName,e]))),reduce((e,t)=>(e.set(t[0],t[1]),e),new Map))}function cropImgZone(e,t,n){const{zoneName:i,width:r,height:a,offsetX:o,offsetY:s}=n,c=join(t,`${i}-${Math.random()}.png`),p={dst:c,src:e,quality:100,cropWidth:r,cropHeight:a,x:o,y:s};return from(crop(p)).pipe(map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function runOcr(e,t,n){t||(t="eng");const i=`tesseract "${e}" "${n}" -l ${t}`;return run(i).pipe(last(),catchError(()=>of(void 0)),mapTo(void 0))}function retrieveKeyValuesFromOcrResult(e,t,n,i=!1){if(!t)throw new Error("matchRules empty");return readTxtFile(e).pipe(map(e=>({buf:e,regexp:t})),map(({buf:e,regexp:t})=>{const r=n&&"function"==typeof n?n(e):e.toString("utf8");return retrieveValueByRegexp(r,t,i)}))}function getRegexpOptsByName(e,t){for(const n of Object.keys(t))if(n===e)return t[n]}function retrieveValueByRegexp(e,t,n=!1){const i=regexMatch(e,t,n);return n&&console.info("retrieveValueByRegexp ----- text start: ---------------\x3e \n",e,"\n<--------------- text END ----------------\n\n",t,">>>>>>>>matched value: ",i,"\n"),i}function readTxtFile(e){if(!e)throw new Error("path empty");const t=e.split(".");if(t.length>1&&"txt"!==t[t.length-1].toLowerCase())throw new Error(`file extensiion must empty or be '.txt', but is: "${e}"`);return defer(async()=>readFileAsync(e))}function prepareContent(e){let t=e&&e.byteLength?e.toString():"";return t?t=t.replace(/(?<=\S) /g,""):""}function regexMatch(e,t,n=!1){if(e)for(const i of t){const t=e.match(i);if(Array.isArray(t)&&t.length)return i.global&&t.length>1?(n&&console.info("----------multi matched regex: --------------\x3e\n",t,"\n--- used regex ----: ",i,"\n<-------------ignore matched result---------------\n\n"),""):(n&&console.info("----------matched regex: --------------\x3e\n",t,"\n--- used regex ----: ",i,"\n<----------------------------\n\n"),t[0])}}function getOcrRetLangPath(e,t,n){const 
i=getOcrRetLangMap(e,t);if(!i)return"";const r=i.get(n);return r||""}function updateOcrRetTxtMap(e,t,n,i){if(!t||!n||!i)return;let r=getOcrRetLangMap(e,t);r||(r=new Map),n&&i&&updateOcrRetLangMap(r,n,i),e.set(t,r)}function updateOcrRetLangMap(e,t,n){e.set(t,n)}function getOcrRetLangMap(e,t){return e.get(t)}const moment$1=moment_;class Bvo{constructor(e){this.options=e;const t=+this.options.globalScale;this.options.globalScale=Number.isNaN(t)||t<=0?1:t,this.options.debug=!!this.options.debug;const{baseTmpDir:n,splitTmpDir:i,resizeImgDir:r}=e,a=n||initialBaseTmpDir,o=i||initialSplitTmpDir,s=r||initialResizeImgDir;from(createDir(a)).pipe(catchError(e=>(console.info(e),of(null))),concatMap(()=>createDir(o)),catchError(e=>(console.info(e),of(null))),concatMap(()=>createDir(s)),catchError(e=>(console.info(e),of(null)))).subscribe(()=>{},console.error)}run(e){return recognize(e,this.options)}}function recognize(e,t){const{bankZone:n,baseTmpDir:i,concurrent:r,debug:a,defaultOcrLang:o,jpegQuality:s,scale:c,splitTmpDir:p,resizeImgDir:m,voucherConfigMap:l,globalScale:f,skipImgDir:u}=t,g=i||initialBaseTmpDir,h=p||initialSplitTmpDir,d=m||initialResizeImgDir,y=u?join(u,moment$1().format("YYYYMMDD")):"",b="number"==typeof r&&r>0?r:cpus().length,w=parseVoucherConfigMapScale(l,f),k=parseOcrZoneScale(n,f),D=c/f,M=getBankRegexpOpts(w),x={baseDir:g,path:e,bankZone:k,bankRegexpOptsMap:M,debug:!!a,lang:o,skipImgDir:y},v=recognizePageBank(x).pipe(filter(({bankName:e})=>!!e&&"n/a"!==e),mergeMap(({bankName:e,pagePath:t})=>(a&&console.info("start split page"),splitPageToImgs(t,e,h,w))),mergeMap(({bankName:e,imgFile:t})=>{const n=getOcrFields(e,w);if(!n)throw new Error(`ocrFields not defined with bankName: "${e}"`);const i={bankName:e,baseDir:g,debug:!!a,defaultValue:"",imgFile:t,ocrFields:n,voucherConfigMap:w};return a&&console.info("recognize item"),recognizeFields(i).pipe(map(n=>(n.set("bank",e),n.set("filename",t.name.trim()),n.set("path",t.path.trim()),n)))},b>0?b:1),mergeMap(e=>{const 
t={retInfo:e,resizeDir:d,scale:D,jpegQuality:s,debug:!!a};return saveImgAndPrune(t)})),I=from(isFileExists(e)).pipe(filter(e=>e));return I.pipe(mergeMap(()=>v))}function recognizePageBank(e){const{baseDir:t,path:n,bankZone:i,bankRegexpOptsMap:r,debug:a,lang:o,skipImgDir:s}=e,c=join(t,zoneTmpDirPrefix,`${basename(n)}-${Math.random().toString()}`);return a&&console.info("recognize pageBank:",c,n),from(createDir(c)).pipe(catchError(e=>(console.info(e),of(null))),mergeMap(()=>cropImgZone(join(n),c,i)),concatMap(e=>runOcr(e.path,o,e.path).pipe(mapTo(e.path))),concatMap(e=>from(r.entries()).pipe(concatMap(([t,n])=>retrieveKeyValuesFromOcrResult(e+".txt",n,e=>e.toString().replace(/(?<=\S)[. ]{1,2}(?=\S)/g,""),a).pipe(map(e=>({bankName:t,value:e})))),skipWhile(({value:e})=>"undefined"==typeof e||"string"==typeof e&&!e.length),take(1),map(({bankName:e})=>({bankName:e,pagePath:n})),defaultIfEmpty({bankName:"n/a",pagePath:""}))),tap(e=>{const{bankName:t,pagePath:i}=e;"n/a"!==t&&i||(console.info(`recognize bank of page fail. no matached regexp. 
file: "${n}", pagePath: "${i}" `),cpSkipImg(n,s)),a||rimraf(c).catch(console.info)}))}async function cpSkipImg(e,t){t&&(await isPathAcessible(t)||await createDir(t),copyFileAsync(e,join(t,basename(e))).catch(console.error))}function splitPageToImgs(e,t,n,i){const r=i.get(t);if(!r)throw new Error("bank config empty during split page to images");return splitPagetoItems(e,n,r).pipe(mergeMap(e=>{const n=from(e.values()).pipe(map(e=>({bankName:t,imgFile:e})));return n}))}function recognizeFields(e){const{bankName:t,baseDir:n,debug:i,defaultValue:r,imgFile:a,ocrFields:o,voucherConfigMap:s}=e,c=join(n,zoneTmpDirPrefix,`${basename(a.path)}`),p=getOcrZoneOptsByBankName(t,s);if(!p)throw new Error(`get bankConfig empty with bankName: "${t}"`);const m=from(createDir(c)).pipe(mergeMap(()=>cropImgAllZones(a.path,c,o,p.ocrZones)),concatMap(e=>{const t={bankConfig:p,ocrFields:o,defaultValue:r,debug:i,zoneImgMap:e};return batchOcrAndRetrieve(t)}),tap(()=>{i||setTimeout(e=>{rimraf(e).catch(console.info)},5e3,c)}));return m}function batchOcrAndRetrieve(e){const{zoneImgMap:t,bankConfig:n,ocrFields:i,defaultValue:r,debug:a}=e,{bankName:o}=n,s=from(t.entries()).pipe(concatMap(e=>ocrAndPickFromZoneImg(e,n,a)),reduce((e,t)=>e.set(t.fieldName,t.value),new Map),map(e=>e.set("bank",o)),map(e=>setDefaultValue(e,i,r)));return s}function setDefaultValue(e,t,n=""){const i=new Map;for(const n of Object.keys(t)){const t=e.get(n);"string"==typeof t?i.set(n,t):i.set(n,"")}return i}function processZoneImgRow(e){const{fieldName:t,value:n}=e,i=Object.assign({},e);switch(t){case"amount":i.value=n.trim().replace(/,/g,"");break;case"date":i.value=n.trim().replace(/\D/g,"");break;case"sn":case"destAccountNumber":case"paymentAccountNumber":i.value=n.trim()}return i}function validateZoneImgRow(e,t){if("string"!=typeof 
t)return!1;switch(e){case"amount":if(validateRetInfoAmout(t))return!0;break;case"sn":if(t)return!0;break;case"date":if(validateRetInfoDate(t))return!0;break;case"bank":return!0;default:if("string"==typeof t)return!0}return!1}function validateRetInfoAmout(e){if(!e)return!1;if(!e.trim())return!1;const t=parseFloat(e);return!Number.isNaN(t)&&"number"==typeof t}function validateRetInfoDate(e){return!!e&&moment$1(e,"YYYYMMDD").isValid()}function getBankRegexpOpts(e){const t=new Map;for(const{bankName:n,regexpOpts:i}of e.values())i&&i.bank&&t.set(n,i.bank);if(!t.size)throw new Error("not BankRegexpOpts found, should not set");return t}function getOcrFields(e,t){const n=t.get(e);if(!n)throw new Error(`get ocrFields empty by bankName: "${e}"`);return n.ocrFields}function saveImgAndPrune(e){const{retInfo:t,resizeDir:n,debug:i,scale:r,jpegQuality:a}=e,o=t.get("filename"),s=t.get("path"),c=t.get("sn");if(!o)throw new Error(`result info map invalid with empty path. info: ${t}`);if(!s)throw new Error(`result info map invalid with empty path. 
info: ${t}`);const p=o.split(".").slice(0,-1).join("").split("-");p.splice(2,1);let m=p.join("-");c&&(m+=`-${c.replace(/[^\d\w]/g,"_")}`),m+=".jpg";const l=moment$1().format("YYYY-MM-DD"),f=join(n,l,m);return t.set("filename",m),resizeAndSaveImg(s,f,r,a).pipe(map(e=>(t.set("path",e.path),t)),tap(()=>{i||unlinkAsync(s).catch(console.info)}))}function genFieldLangs(e,t,n){return n&&"undefined"!=typeof n[e]&&Array.isArray(n[e])?n[e]:t}function ocrAndPickFromZoneImg(e,t,n=!1){const{ocrDefaultLangs:i,ocrFieldLangs:r,regexpOpts:a,ocrFields:o}=t,s=new Map;return from(Object.entries(o)).pipe(filter(t=>{const n=t[1];return!!n&&n===e[0]}),concatMap(t=>{const o=t[0],c=t[1];return ocrAndPickFieldFromZoneImg(o,c,e,a,i,r,n,s)}))}function ocrAndPickFieldFromZoneImg(e,t,n,i,r,a,o=!1,s){const[,c]=n,p=genFieldLangs(e,r,a),m=p.length-1,l=getRegexpOptsByName(e,i);if(!l)throw new Error(`got regexp empty by zoneName: "${e}"`);return from(p).pipe(concatMap(n=>{const i=getOcrRetLangPath(s,t,n);if(i)return retrieveKeyValuesFromOcrResult(i+".txt",l,prepareContent,o).pipe(map(r=>({fieldName:e,zoneName:t,value:r,usedLang:n,txtPath:i})));{const i=c.path,r=i.split(".").slice(0,-1).join(".")+`-${Math.random()}`;return runOcr(i,n,r).pipe(concatMap(()=>retrieveKeyValuesFromOcrResult(r+".txt",l,prepareContent,o).pipe(map(i=>({fieldName:e,zoneName:t,value:i,usedLang:n,txtPath:r})))))}}),tap(({zoneName:e,usedLang:t,txtPath:n})=>{updateOcrRetTxtMap(s,e,t,n)}),skipWhile((e,t)=>{const n=validateZoneImgRow(e.fieldName,e.value);return!n&&t!==m}),take(1),map(e=>("string"!=typeof e.value&&(e.value=""),e)),map(processZoneImgRow))}function parseVoucherConfigMapScale(e,t){const n=new Map;for(const[i,r]of e){const e=Object.assign({},r),a=[];for(const n of e.ocrZones)a.push(parseOcrZoneScale(n,t));e.ocrZones=a,e.width=e.width*t,e.height=e.height*t,e.marginBottom=e.marginBottom*t,n.set(i,e)}return n}function parseOcrZoneScale(e,t){const n=Object.assign({},e);return 
n.width=n.width*t,n.height=n.height*t,n.offsetX=n.offsetX*t,n.offsetY=n.offsetY*t,n}export{initialBankZone,initialBaseTmpDir,initialResizeImgDir,initialSplitTmpDir,zoneTmpDirPrefix,Bvo,recognize}; | ||
//# sourceMappingURL=bvocr.esm.min.js.map |
@@ -5,3 +5,3 @@ /** | ||
* | ||
* @version 0.13.0 | ||
* @version 1.0.0 | ||
* @author waiting | ||
@@ -252,4 +252,20 @@ * @license MIT | ||
// crop all defined zones from an image | ||
function cropImgAllZones(srcPath, zoneTmpDir, ocrZoneOptsArr) { | ||
return rxjs.from(ocrZoneOptsArr).pipe(operators.mergeMap(ocrZoneOpts => { | ||
function cropImgAllZones(srcPath, zoneTmpDir, ocrFields, ocrZoneOptsArr) { | ||
const flds = []; | ||
const srcFldSet = new Set(); | ||
for (const srcFld of Object.values(ocrFields)) { | ||
if (!srcFld || srcFldSet.has(srcFld)) { | ||
continue | ||
} | ||
srcFldSet.add(srcFld); | ||
} | ||
for (const row of ocrZoneOptsArr) { | ||
const fld = row.zoneName; | ||
if (fld && srcFldSet.has(fld)) { | ||
flds.push(row); | ||
} | ||
} | ||
return rxjs.from(flds).pipe(operators.mergeMap(ocrZoneOpts => { | ||
return cropImgZone(srcPath, zoneTmpDir, ocrZoneOpts).pipe(operators.map(img => { | ||
@@ -289,9 +305,9 @@ return [ocrZoneOpts.zoneName, img] | ||
} | ||
// ocr an image file | ||
function runOcr(path$$1, lang) { | ||
// second path will be append with '.txt' | ||
// ocr an image file; txtPath is given without extension | ||
function runOcr(imgPath, lang, txtPath) { | ||
// second path will be append with '.txt' by tesseract | ||
if (!lang) { | ||
lang = 'eng'; | ||
} | ||
const cmd = `tesseract "${path$$1}" "${path$$1}" -l ${lang}`; | ||
const cmd = `tesseract "${imgPath}" "${txtPath}" -l ${lang}`; | ||
@@ -301,3 +317,4 @@ // const opts = { | ||
// } | ||
return run(cmd) | ||
return run(cmd).pipe(operators.last(), operators.catchError(() => rxjs.of(void 0)), // tesseract will exit with code(0) but output with stderr | ||
operators.mapTo(void 0)) | ||
} | ||
@@ -382,3 +399,33 @@ | ||
} | ||
/**
 * Look up a previously recorded OCR text path in the two-level cache
 * (outer key -> language -> path).
 *
 * @param ocrRetTxtMap outer Map whose values are per-language Maps
 * @param fieldName    outer key; NOTE(review): the caller visible in this listing
 *                     passes a zone name here despite the parameter name - confirm
 * @param lang         key into the inner per-language Map
 * @returns the stored path, or '' when there is no inner Map for the key
 *          or no (truthy) entry for the language
 */
function getOcrRetLangPath(ocrRetTxtMap, fieldName, lang) { | ||
const ocrRetLangMap = getOcrRetLangMap(ocrRetTxtMap, fieldName); | ||
if (!ocrRetLangMap) { | ||
return '' | ||
} | ||
const txtPath = ocrRetLangMap.get(lang); | ||
// normalise a missing/falsy entry to the empty string
return txtPath ? txtPath : '' | ||
} | ||
/**
 * Record `txtPath` under (fieldName, lang) in the two-level cache, mutating
 * `ocrRetTxtMap` in place. Does nothing when any of fieldName, lang or
 * txtPath is falsy.
 *
 * @param ocrRetTxtMap outer Map whose values are per-language Maps
 * @param fieldName    outer key
 * @param lang         inner key
 * @param txtPath      value stored for the language
 */
function updateOcrRetTxtMap(ocrRetTxtMap, fieldName, lang, txtPath) { | ||
if (!fieldName || !lang || !txtPath) { | ||
return | ||
} | ||
let ocrRetLangMap = getOcrRetLangMap(ocrRetTxtMap, fieldName); | ||
// first entry for this key: start a fresh per-language Map
if (!ocrRetLangMap) { | ||
ocrRetLangMap = new Map(); | ||
} | ||
// NOTE(review): always true here - the guard at the top already returned
// when lang or txtPath was falsy
if (lang && txtPath) { | ||
updateOcrRetLangMap(ocrRetLangMap, lang, txtPath); | ||
} | ||
// (re)attach the inner Map; required when it was newly created above
ocrRetTxtMap.set(fieldName, ocrRetLangMap); | ||
} | ||
/**
 * Store `txtPath` under `lang` in the given per-language Map, replacing any
 * existing entry for that language.
 */
function updateOcrRetLangMap(ocrRetLangMap, lang, txtPath) { | ||
ocrRetLangMap.set(lang, txtPath); | ||
} | ||
/**
 * Return the per-language Map stored under `fieldName`, or `undefined`
 * when the outer Map has no entry for that key.
 */
function getOcrRetLangMap(ocrRetTxtMap, fieldName) { | ||
return ocrRetTxtMap.get(fieldName) | ||
} | ||
const moment$1 = moment_; | ||
@@ -417,3 +464,3 @@ | ||
function recognize(imgPath, options) { | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, resizeImgDir, voucherConfigMap, globalScale, skipImgDir, } = options; | ||
const { bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, resizeImgDir, voucherConfigMap, globalScale, skipImgDir, } = options; | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir; | ||
@@ -423,2 +470,3 @@ const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir; | ||
const skipDir = skipImgDir ? path.join(skipImgDir, moment$1().format('YYYYMMDD')) : ''; | ||
const cnumber = typeof concurrent === 'number' && concurrent > 0 ? concurrent : os.cpus().length; | ||
// if the config is set for 300dpi but the source image is 600dpi, then set globalScale=600/300. default is 1 | ||
@@ -443,6 +491,6 @@ const voucherConfigMapNew = parseVoucherConfigMapScale(voucherConfigMap, globalScale); | ||
}; | ||
const ret$ = recognizePageBank(bankOpts).pipe(operators.filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), operators.concatMap(({ bankName, pagePath }) => { | ||
const ret$ = recognizePageBank(bankOpts).pipe(operators.filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), operators.mergeMap(({ bankName, pagePath }) => { | ||
!!debug && console.info('start split page'); | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
}), operators.concatMap(({ bankName, imgFile }) => { | ||
}), operators.mergeMap(({ bankName, imgFile }) => { | ||
const ocrFields = getOcrFields(bankName, voucherConfigMapNew); | ||
@@ -456,3 +504,2 @@ | ||
baseDir, | ||
concurrent: 2, | ||
debug: !!debug, | ||
@@ -472,3 +519,3 @@ defaultValue: '', | ||
})) | ||
}), operators.mergeMap(retInfo => { | ||
}, cnumber > 0 ? cnumber : 1), operators.mergeMap(retInfo => { | ||
const opts = { | ||
@@ -498,3 +545,3 @@ retInfo, | ||
operators.concatMap(zoneInfo => { | ||
return runOcr(zoneInfo.path, lang).pipe(operators.map(() => ({ path: path$$1, zoneImgPath: zoneInfo.path })), operators.mapTo(zoneInfo.path), operators.catchError(() => rxjs.of(zoneInfo.path))) | ||
return runOcr(zoneInfo.path, lang, zoneInfo.path).pipe(operators.mapTo(zoneInfo.path)) | ||
}), operators.concatMap(zoneImgPath => { | ||
@@ -550,4 +597,4 @@ // 批量提取参数值 | ||
function recognizeFields(options) { | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = path.join(baseDir, zoneTmpDirPrefix, `${path.basename(imgFile.path)}-${Math.random().toString()}`); | ||
const { bankName, baseDir, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = path.join(baseDir, zoneTmpDirPrefix, `${path.basename(imgFile.path)}`); | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap); | ||
@@ -558,7 +605,7 @@ | ||
} | ||
const stream$ = rxjs.from(createDir(zoneTmpDir)).pipe(operators.mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, bankConfig.ocrZones)), // 切分图片区域分别做ocr识别 | ||
operators.concatMap(fileMap => { | ||
const stream$ = rxjs.from(createDir(zoneTmpDir)).pipe( | ||
// 切分图片区域分别做ocr识别 | ||
operators.mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), operators.concatMap(fileMap => { | ||
const opts = { | ||
bankConfig, ocrFields, defaultValue, debug, | ||
concurrent: concurrent > 0 ? concurrent : 2, | ||
zoneImgMap: fileMap, | ||
@@ -568,2 +615,8 @@ }; | ||
return batchOcrAndRetrieve(opts) | ||
}), operators.tap(() => { | ||
if (!debug) { | ||
setTimeout(dir => { | ||
rimraf(dir).catch(console.info); | ||
}, 5000, zoneTmpDir); | ||
} | ||
})); | ||
@@ -574,26 +627,33 @@ | ||
function batchOcrAndRetrieve(options) { | ||
const { zoneImgMap, bankConfig, ocrFields, defaultValue, concurrent, debug, } = options; | ||
const { zoneImgMap, bankConfig, ocrFields, defaultValue, debug, } = options; | ||
const { bankName } = bankConfig; | ||
const del$ = rxjs.from(zoneImgMap.entries()).pipe(operators.delay(15000), operators.mergeMap(([, imgInfo]) => { | ||
return rxjs.defer(async () => { | ||
const img = imgInfo.path; | ||
const txt = img + '.txt'; | ||
if (await isFileExists(img)) { | ||
await rimraf(img); | ||
} | ||
if (await isFileExists(txt)) { | ||
await rimraf(txt); | ||
} | ||
return null | ||
}).pipe(operators.delay(20000), operators.retry(2), operators.catchError(err => { | ||
console.info('Delete zone file retry failed:', err); | ||
return rxjs.of(null) | ||
})) | ||
}), operators.catchError(() => { | ||
return rxjs.of(null) | ||
})); | ||
// const del$ = ofrom(zoneImgMap.entries()).pipe( | ||
// delay(15000), | ||
// mergeMap(([, imgInfo]) => { | ||
// return defer(async () => { | ||
// const img = imgInfo.path | ||
// const txt = img + '.txt' | ||
// if (await isFileExists(img)) { | ||
// await rimraf(img) | ||
// } | ||
// if (await isFileExists(txt)) { | ||
// await rimraf(txt) | ||
// } | ||
// return null | ||
// }).pipe( | ||
// delay(20000), | ||
// retry(2), | ||
// catchError(err => { | ||
// console.info('Delete zone file retry failed:', err) | ||
// return of(null) | ||
// }), | ||
// ) | ||
// }), | ||
// catchError(() => { | ||
// return of(null) | ||
// }), | ||
// ) | ||
const process$ = rxjs.from(zoneImgMap.entries()).pipe(operators.concatMap((zoneImgRow) => { | ||
return ocrAndPickFromZoneImg(zoneImgRow, bankConfig, concurrent, debug) | ||
}), operators.reduce((acc, curr) => acc.set(curr.fieldName, curr.value), new Map()), operators.map(retMap => retMap.set('bank' /* bank */, bankName)), operators.map(retMap => setDefaultValue(retMap, ocrFields, defaultValue)), operators.tap(() => debug || del$.subscribe())); | ||
return ocrAndPickFromZoneImg(zoneImgRow, bankConfig, debug) | ||
}), operators.reduce((acc, curr) => acc.set(curr.fieldName, curr.value), new Map()), operators.map(retMap => retMap.set('bank' /* bank */, bankName)), operators.map(retMap => setDefaultValue(retMap, ocrFields, defaultValue))); | ||
@@ -619,3 +679,3 @@ return process$ | ||
const { fieldName, value } = zoneRet; | ||
const ret = { fieldName, value }; | ||
const ret = Object.assign({}, zoneRet); | ||
@@ -717,3 +777,3 @@ switch (fieldName) { | ||
const { retInfo, resizeDir, debug, scale, jpegQuality } = options; | ||
let filename = retInfo.get('filename'); | ||
const filename = retInfo.get('filename'); | ||
const path$$1 = retInfo.get('path'); | ||
@@ -728,11 +788,16 @@ const sn = retInfo.get('sn' /* sn */); | ||
} | ||
// YYYYMMDD-A15295623630009-0.31486898522590034-pageSplitItemIndex.jpg | ||
const arr = filename.split('.').slice(0, -1).join('').split('-'); | ||
arr.splice(2, 1); | ||
let filename2 = arr.join('-'); | ||
if (sn) { | ||
const name = filename.split('.').slice(0, -1); | ||
filename = name + `-${sn.replace(/[^\d\w]/g, '_')}.jpg`; | ||
filename2 = filename2 + `-${sn.replace(/[^\d\w]/g, '_')}`; | ||
} | ||
filename2 = filename2 + '.jpg'; | ||
const curDate = moment$1().format('YYYY-MM-DD'); | ||
const targetPath = path.join(resizeDir, curDate, filename); | ||
const targetPath = path.join(resizeDir, curDate, filename2); | ||
retInfo.set('filename', filename); | ||
retInfo.set('filename', filename2); | ||
return resizeAndSaveImg(path$$1, targetPath, scale, jpegQuality).pipe(operators.map(imgInfo => { | ||
@@ -751,4 +816,5 @@ retInfo.set('path', imgInfo.path); | ||
} | ||
function ocrAndPickFromZoneImg(zoneImgRow, config, concurrent = 2, debug = false) { | ||
function ocrAndPickFromZoneImg(zoneImgRow, config, debug = false) { | ||
const { ocrDefaultLangs, ocrFieldLangs, regexpOpts, ocrFields } = config; | ||
const ocrRetTxtMap = new Map(); | ||
@@ -759,9 +825,10 @@ return rxjs.from(Object.entries(ocrFields)).pipe(operators.filter(data => { | ||
return !!zoneName && zoneName === zoneImgRow[0] | ||
}), operators.mergeMap(data => { | ||
}), operators.concatMap(data => { | ||
const fieldName = data[0]; | ||
const zoneName = data[1]; | ||
return ocrAndPickFieldFromZoneImg(fieldName, zoneImgRow, regexpOpts, ocrDefaultLangs, ocrFieldLangs, debug) | ||
}, concurrent)) | ||
return ocrAndPickFieldFromZoneImg(fieldName, zoneName, zoneImgRow, regexpOpts, ocrDefaultLangs, ocrFieldLangs, debug, ocrRetTxtMap) | ||
})) | ||
} | ||
function ocrAndPickFieldFromZoneImg(fieldName, zoneImgRow, regexpOpts, defaultLangs, fieldLangs, debug = false) { | ||
function ocrAndPickFieldFromZoneImg(fieldName, zoneName, zoneImgRow, regexpOpts, defaultLangs, fieldLangs, debug = false, ocrRetTxtMap) { | ||
const [, zoneImg] = zoneImgRow; | ||
@@ -779,6 +846,37 @@ const langs = genFieldLangs(fieldName, defaultLangs, fieldLangs); | ||
operators.concatMap(lang => { | ||
// console.log(`fld "${fieldName}" use lang:`, lang, zoneImg.path) | ||
return runOcr(zoneImg.path, lang).pipe(operators.mapTo(true), operators.catchError(() => rxjs.of(true))) | ||
}), operators.concatMap(() => { | ||
return retrieveKeyValuesFromOcrResult(zoneImg.path + '.txt', regexp, prepareContent, debug).pipe(operators.map(val => ({ fieldName, value: val }))) | ||
// console.info(`\n\n\nfld "${fieldName}" zoneName: "${zoneName}" use lang: ${lang}, path: "${zoneImg.path}"\n`) | ||
const path$$1 = getOcrRetLangPath(ocrRetTxtMap, zoneName, lang); | ||
if (path$$1) { | ||
// console.info(`reused txtPath. fieldName: "${fieldName}", zoneName: "${zoneName}", lang: "${lang}", | ||
// txtPath: "${path}"\n\n`) | ||
return retrieveKeyValuesFromOcrResult(path$$1 + '.txt', regexp, prepareContent, debug).pipe(operators.map(val => { | ||
return { | ||
fieldName, | ||
zoneName, | ||
value: val, | ||
usedLang: lang, | ||
txtPath: path$$1, | ||
} | ||
})) | ||
} | ||
else { | ||
const imgPath = zoneImg.path; | ||
const txtPath = imgPath.split('.').slice(0, -1).join('.') + `-${Math.random()}`; | ||
return runOcr(imgPath, lang, txtPath).pipe(operators.concatMap(() => { | ||
// console.info(`\n\n--------- usedLang: "${lang}", txtPath:"${txtPath}"`) | ||
return retrieveKeyValuesFromOcrResult(txtPath + '.txt', regexp, prepareContent, debug).pipe(operators.map(val => { | ||
return { | ||
fieldName, | ||
zoneName, | ||
value: val, | ||
usedLang: lang, | ||
txtPath, | ||
} | ||
})) | ||
})) | ||
} | ||
}), operators.tap(({ zoneName: zone, usedLang, txtPath }) => { | ||
updateOcrRetTxtMap(ocrRetTxtMap, zone, usedLang, txtPath); | ||
}), operators.skipWhile((data, index) => { | ||
@@ -785,0 +883,0 @@ const valid = validateZoneImgRow(data.fieldName, data.value); |
import * as moment_ from 'moment' | ||
import { defer, from as ofrom, of } from 'rxjs' | ||
import { catchError, concatMap, defaultIfEmpty, delay, filter, map, mapTo, mergeMap, reduce, retry, skipWhile, take, tap, } from 'rxjs/operators' | ||
import { cpus } from 'os' | ||
import { from as ofrom, of } from 'rxjs' | ||
import { catchError, concatMap, defaultIfEmpty, filter, map, mapTo, mergeMap, reduce, skipWhile, take, tap, } from 'rxjs/operators' | ||
import { basename, copyFileAsync, createDir, isFileExists, isPathAcessible, join, rimraf, unlinkAsync, } from '../shared/index' | ||
@@ -8,3 +9,3 @@ import { initialBaseTmpDir, initialResizeImgDir, initialSplitTmpDir, zoneTmpDirPrefix } from './config' | ||
import { cropImgAllZones, cropImgZone, getOcrZoneOptsByBankName, runOcr } from './ocr-process' | ||
import { getRegexpOptsByName, prepareContent, retrieveKeyValuesFromOcrResult } from './txt-process' | ||
import { getOcrRetLangPath, getRegexpOptsByName, prepareContent, retrieveKeyValuesFromOcrResult, updateOcrRetTxtMap, } from './txt-process' | ||
const moment = moment_ | ||
@@ -43,3 +44,3 @@ | ||
export function recognize(imgPath, options) { | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, resizeImgDir, voucherConfigMap, globalScale, skipImgDir, } = options | ||
const { bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, resizeImgDir, voucherConfigMap, globalScale, skipImgDir, } = options | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir | ||
@@ -49,2 +50,3 @@ const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir | ||
const skipDir = skipImgDir ? join(skipImgDir, moment().format('YYYYMMDD')) : '' | ||
const cnumber = typeof concurrent === 'number' && concurrent > 0 ? concurrent : cpus().length | ||
// if the config is set for 300dpi but the source image is 600dpi, then set globalScale=600/300. default is 1 | ||
@@ -69,6 +71,6 @@ const voucherConfigMapNew = parseVoucherConfigMapScale(voucherConfigMap, globalScale) | ||
} | ||
const ret$ = recognizePageBank(bankOpts).pipe(filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), concatMap(({ bankName, pagePath }) => { | ||
const ret$ = recognizePageBank(bankOpts).pipe(filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), mergeMap(({ bankName, pagePath }) => { | ||
!!debug && console.info('start split page') | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
}), concatMap(({ bankName, imgFile }) => { | ||
}), mergeMap(({ bankName, imgFile }) => { | ||
const ocrFields = getOcrFields(bankName, voucherConfigMapNew) | ||
@@ -82,3 +84,2 @@ | ||
baseDir, | ||
concurrent: 2, | ||
debug: !!debug, | ||
@@ -98,3 +99,3 @@ defaultValue: '', | ||
})) | ||
}), mergeMap(retInfo => { | ||
}, cnumber > 0 ? cnumber : 1), mergeMap(retInfo => { | ||
const opts = { | ||
@@ -124,3 +125,3 @@ retInfo, | ||
concatMap(zoneInfo => { | ||
return runOcr(zoneInfo.path, lang).pipe(map(() => ({ path, zoneImgPath: zoneInfo.path })), mapTo(zoneInfo.path), catchError(() => of(zoneInfo.path))) | ||
return runOcr(zoneInfo.path, lang, zoneInfo.path).pipe(mapTo(zoneInfo.path)) | ||
}), concatMap(zoneImgPath => { | ||
@@ -176,4 +177,4 @@ // 批量提取参数值 | ||
function recognizeFields(options) { | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options | ||
const zoneTmpDir = join(baseDir, zoneTmpDirPrefix, `${basename(imgFile.path)}-${Math.random().toString()}`) | ||
const { bankName, baseDir, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options | ||
const zoneTmpDir = join(baseDir, zoneTmpDirPrefix, `${basename(imgFile.path)}`) | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap) | ||
@@ -184,7 +185,7 @@ | ||
} | ||
const stream$ = ofrom(createDir(zoneTmpDir)).pipe(mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, bankConfig.ocrZones)), // 切分图片区域分别做ocr识别 | ||
concatMap(fileMap => { | ||
const stream$ = ofrom(createDir(zoneTmpDir)).pipe( | ||
// 切分图片区域分别做ocr识别 | ||
mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), concatMap(fileMap => { | ||
const opts = { | ||
bankConfig, ocrFields, defaultValue, debug, | ||
concurrent: concurrent > 0 ? concurrent : 2, | ||
zoneImgMap: fileMap, | ||
@@ -194,2 +195,8 @@ } | ||
return batchOcrAndRetrieve(opts) | ||
}), tap(() => { | ||
if (!debug) { | ||
setTimeout(dir => { | ||
rimraf(dir).catch(console.info) | ||
}, 5000, zoneTmpDir) | ||
} | ||
})) | ||
@@ -200,26 +207,33 @@ | ||
function batchOcrAndRetrieve(options) { | ||
const { zoneImgMap, bankConfig, ocrFields, defaultValue, concurrent, debug, } = options | ||
const { zoneImgMap, bankConfig, ocrFields, defaultValue, debug, } = options | ||
const { bankName } = bankConfig | ||
const del$ = ofrom(zoneImgMap.entries()).pipe(delay(15000), mergeMap(([, imgInfo]) => { | ||
return defer(async () => { | ||
const img = imgInfo.path | ||
const txt = img + '.txt' | ||
if (await isFileExists(img)) { | ||
await rimraf(img) | ||
} | ||
if (await isFileExists(txt)) { | ||
await rimraf(txt) | ||
} | ||
return null | ||
}).pipe(delay(20000), retry(2), catchError(err => { | ||
console.info('Delete zone file retry failed:', err) | ||
return of(null) | ||
})) | ||
}), catchError(() => { | ||
return of(null) | ||
})) | ||
// const del$ = ofrom(zoneImgMap.entries()).pipe( | ||
// delay(15000), | ||
// mergeMap(([, imgInfo]) => { | ||
// return defer(async () => { | ||
// const img = imgInfo.path | ||
// const txt = img + '.txt' | ||
// if (await isFileExists(img)) { | ||
// await rimraf(img) | ||
// } | ||
// if (await isFileExists(txt)) { | ||
// await rimraf(txt) | ||
// } | ||
// return null | ||
// }).pipe( | ||
// delay(20000), | ||
// retry(2), | ||
// catchError(err => { | ||
// console.info('Delete zone file retry failed:', err) | ||
// return of(null) | ||
// }), | ||
// ) | ||
// }), | ||
// catchError(() => { | ||
// return of(null) | ||
// }), | ||
// ) | ||
const process$ = ofrom(zoneImgMap.entries()).pipe(concatMap((zoneImgRow) => { | ||
return ocrAndPickFromZoneImg(zoneImgRow, bankConfig, concurrent, debug) | ||
}), reduce((acc, curr) => acc.set(curr.fieldName, curr.value), new Map()), map(retMap => retMap.set('bank' /* bank */, bankName)), map(retMap => setDefaultValue(retMap, ocrFields, defaultValue)), tap(() => debug || del$.subscribe())) | ||
return ocrAndPickFromZoneImg(zoneImgRow, bankConfig, debug) | ||
}), reduce((acc, curr) => acc.set(curr.fieldName, curr.value), new Map()), map(retMap => retMap.set('bank' /* bank */, bankName)), map(retMap => setDefaultValue(retMap, ocrFields, defaultValue))) | ||
@@ -245,3 +259,3 @@ return process$ | ||
const { fieldName, value } = zoneRet | ||
const ret = { fieldName, value } | ||
const ret = Object.assign({}, zoneRet) | ||
@@ -343,3 +357,3 @@ switch (fieldName) { | ||
const { retInfo, resizeDir, debug, scale, jpegQuality } = options | ||
let filename = retInfo.get('filename') | ||
const filename = retInfo.get('filename') | ||
const path = retInfo.get('path') | ||
@@ -354,11 +368,16 @@ const sn = retInfo.get('sn' /* sn */) | ||
} | ||
// YYYYMMDD-A15295623630009-0.31486898522590034-pageSplitItemIndex.jpg | ||
const arr = filename.split('.').slice(0, -1).join('').split('-') | ||
arr.splice(2, 1) | ||
let filename2 = arr.join('-') | ||
if (sn) { | ||
const name = filename.split('.').slice(0, -1) | ||
filename = name + `-${sn.replace(/[^\d\w]/g, '_')}.jpg` | ||
filename2 = filename2 + `-${sn.replace(/[^\d\w]/g, '_')}` | ||
} | ||
filename2 = filename2 + '.jpg' | ||
const curDate = moment().format('YYYY-MM-DD') | ||
const targetPath = join(resizeDir, curDate, filename) | ||
const targetPath = join(resizeDir, curDate, filename2) | ||
retInfo.set('filename', filename) | ||
retInfo.set('filename', filename2) | ||
return resizeAndSaveImg(path, targetPath, scale, jpegQuality).pipe(map(imgInfo => { | ||
@@ -377,4 +396,5 @@ retInfo.set('path', imgInfo.path) | ||
} | ||
function ocrAndPickFromZoneImg(zoneImgRow, config, concurrent = 2, debug = false) { | ||
function ocrAndPickFromZoneImg(zoneImgRow, config, debug = false) { | ||
const { ocrDefaultLangs, ocrFieldLangs, regexpOpts, ocrFields } = config | ||
const ocrRetTxtMap = new Map() | ||
@@ -385,9 +405,10 @@ return ofrom(Object.entries(ocrFields)).pipe(filter(data => { | ||
return !!zoneName && zoneName === zoneImgRow[0] | ||
}), mergeMap(data => { | ||
}), concatMap(data => { | ||
const fieldName = data[0] | ||
const zoneName = data[1] | ||
return ocrAndPickFieldFromZoneImg(fieldName, zoneImgRow, regexpOpts, ocrDefaultLangs, ocrFieldLangs, debug) | ||
}, concurrent)) | ||
return ocrAndPickFieldFromZoneImg(fieldName, zoneName, zoneImgRow, regexpOpts, ocrDefaultLangs, ocrFieldLangs, debug, ocrRetTxtMap) | ||
})) | ||
} | ||
function ocrAndPickFieldFromZoneImg(fieldName, zoneImgRow, regexpOpts, defaultLangs, fieldLangs, debug = false) { | ||
function ocrAndPickFieldFromZoneImg(fieldName, zoneName, zoneImgRow, regexpOpts, defaultLangs, fieldLangs, debug = false, ocrRetTxtMap) { | ||
const [, zoneImg] = zoneImgRow | ||
@@ -405,6 +426,37 @@ const langs = genFieldLangs(fieldName, defaultLangs, fieldLangs) | ||
concatMap(lang => { | ||
// console.log(`fld "${fieldName}" use lang:`, lang, zoneImg.path) | ||
return runOcr(zoneImg.path, lang).pipe(mapTo(true), catchError(() => of(true))) | ||
}), concatMap(() => { | ||
return retrieveKeyValuesFromOcrResult(zoneImg.path + '.txt', regexp, prepareContent, debug).pipe(map(val => ({ fieldName, value: val }))) | ||
// console.info(`\n\n\nfld "${fieldName}" zoneName: "${zoneName}" use lang: ${lang}, path: "${zoneImg.path}"\n`) | ||
const path = getOcrRetLangPath(ocrRetTxtMap, zoneName, lang) | ||
if (path) { | ||
// console.info(`reused txtPath. fieldName: "${fieldName}", zoneName: "${zoneName}", lang: "${lang}", | ||
// txtPath: "${path}"\n\n`) | ||
return retrieveKeyValuesFromOcrResult(path + '.txt', regexp, prepareContent, debug).pipe(map(val => { | ||
return { | ||
fieldName, | ||
zoneName, | ||
value: val, | ||
usedLang: lang, | ||
txtPath: path, | ||
} | ||
})) | ||
} | ||
else { | ||
const imgPath = zoneImg.path | ||
const txtPath = imgPath.split('.').slice(0, -1).join('.') + `-${Math.random()}` | ||
return runOcr(imgPath, lang, txtPath).pipe(concatMap(() => { | ||
// console.info(`\n\n--------- usedLang: "${lang}", txtPath:"${txtPath}"`) | ||
return retrieveKeyValuesFromOcrResult(txtPath + '.txt', regexp, prepareContent, debug).pipe(map(val => { | ||
return { | ||
fieldName, | ||
zoneName, | ||
value: val, | ||
usedLang: lang, | ||
txtPath, | ||
} | ||
})) | ||
})) | ||
} | ||
}), tap(({ zoneName: zone, usedLang, txtPath }) => { | ||
updateOcrRetTxtMap(ocrRetTxtMap, zone, usedLang, txtPath) | ||
}), skipWhile((data, index) => { | ||
@@ -411,0 +463,0 @@ const valid = validateZoneImgRow(data.fieldName, data.value) |
@@ -9,2 +9,3 @@ /// <reference types="node" /> | ||
baseTmpDir?: string; | ||
concurrent?: number; | ||
debug?: boolean; | ||
@@ -56,3 +57,6 @@ defaultOcrLang: string; | ||
fieldName: FieldName; | ||
zoneName: FieldName; | ||
value: string; | ||
usedLang: string; | ||
txtPath: string; | ||
} | ||
@@ -138,3 +142,2 @@ export declare type RegexpArray = ReadonlyArray<RegExp>; | ||
baseDir: string; | ||
concurrent: number; | ||
debug: boolean; | ||
@@ -160,3 +163,2 @@ defaultValue: string; | ||
defaultValue: string; | ||
concurrent: number; | ||
debug: boolean; | ||
@@ -171,1 +173,3 @@ } | ||
} | ||
export declare type OcrRetTxtMap = Map<FieldName, OcrRetLangMap>; | ||
export declare type OcrRetLangMap = Map<string, string>; |
@@ -1,7 +0,6 @@ | ||
/// <reference types="node" /> | ||
import { Observable } from 'rxjs'; | ||
import { BankName, ImgFileInfo, OcrZone, VoucherConfig, VoucherConfigMap, ZoneImgMap } from './model'; | ||
import { BankName, ImgFileInfo, OcrFields, OcrZone, VoucherConfig, VoucherConfigMap, ZoneImgMap } from './model'; | ||
export declare function getOcrZoneOptsByBankName(bankName: BankName, configMap: VoucherConfigMap): VoucherConfig | void; | ||
export declare function cropImgAllZones(srcPath: string, zoneTmpDir: string, ocrZoneOptsArr: ReadonlyArray<OcrZone>): Observable<ZoneImgMap>; | ||
export declare function cropImgAllZones(srcPath: string, zoneTmpDir: string, ocrFields: OcrFields, ocrZoneOptsArr: ReadonlyArray<OcrZone>): Observable<ZoneImgMap>; | ||
export declare function cropImgZone(srcPath: string, targetDir: string, ocrZoneOpts: OcrZone): Observable<ImgFileInfo>; | ||
export declare function runOcr(path: string, lang: string): Observable<Buffer>; | ||
export declare function runOcr(imgPath: string, lang: string, txtPath: string): Observable<void>; |
import { crop } from 'easyimage' | ||
import { from as ofrom } from 'rxjs' | ||
import { map, mergeMap, reduce } from 'rxjs/operators' | ||
import { from as ofrom, of } from 'rxjs' | ||
import { catchError, last, map, mapTo, mergeMap, reduce } from 'rxjs/operators' | ||
import run from 'rxrunscript' | ||
@@ -10,4 +10,20 @@ import { join } from '../shared/index' | ||
// crop all defined zones from a image | ||
export function cropImgAllZones(srcPath, zoneTmpDir, ocrZoneOptsArr) { | ||
return ofrom(ocrZoneOptsArr).pipe(mergeMap(ocrZoneOpts => { | ||
export function cropImgAllZones(srcPath, zoneTmpDir, ocrFields, ocrZoneOptsArr) { | ||
const flds = [] | ||
const srcFldSet = new Set() | ||
for (const srcFld of Object.values(ocrFields)) { | ||
if (!srcFld || srcFldSet.has(srcFld)) { | ||
continue | ||
} | ||
srcFldSet.add(srcFld) | ||
} | ||
for (const row of ocrZoneOptsArr) { | ||
const fld = row.zoneName | ||
if (fld && srcFldSet.has(fld)) { | ||
flds.push(row) | ||
} | ||
} | ||
return ofrom(flds).pipe(mergeMap(ocrZoneOpts => { | ||
return cropImgZone(srcPath, zoneTmpDir, ocrZoneOpts).pipe(map(img => { | ||
@@ -47,9 +63,9 @@ return [ocrZoneOpts.zoneName, img] | ||
} | ||
// ocr an image file | |
export function runOcr(path, lang) { | ||
// second path will be append with '.txt' | ||
// ocr an image file, txtPath without extension | |
export function runOcr(imgPath, lang, txtPath) { | ||
// second path will be append with '.txt' by tesseract | ||
if (!lang) { | ||
lang = 'eng' | ||
} | ||
const cmd = `tesseract "${path}" "${path}" -l ${lang}` | ||
const cmd = `tesseract "${imgPath}" "${txtPath}" -l ${lang}` | ||
@@ -59,3 +75,4 @@ // const opts = { | ||
// } | ||
return run(cmd) | ||
return run(cmd).pipe(last(), catchError(() => of(void 0)), // tesseract will exit with code(0) but output with stderr | ||
mapTo(void 0)) | ||
} |
/// <reference types="node" /> | ||
import { Observable } from 'rxjs'; | ||
import { FieldName, PreProcessBufferFn, RegexpArray, ZoneRegexpOpts } from './model'; | ||
import { FieldName, OcrRetTxtMap, PreProcessBufferFn, RegexpArray, ZoneRegexpOpts } from './model'; | ||
export declare function retrieveKeyValuesFromOcrResult(path: string, // ocr result txt file path | ||
@@ -12,1 +12,3 @@ matchRules: RegexpArray, preProcssBufferFn: PreProcessBufferFn | null, debug?: boolean): Observable<string | void>; | ||
export declare function prepareContent(buf: Buffer): string; | ||
export declare function getOcrRetLangPath(ocrRetTxtMap: OcrRetTxtMap, fieldName: FieldName, lang: string): string; | ||
export declare function updateOcrRetTxtMap(ocrRetTxtMap: OcrRetTxtMap, fieldName: FieldName, lang: string, txtPath: string): void; |
import { defer } from 'rxjs' | ||
import { map, } from 'rxjs/operators' | ||
import { map } from 'rxjs/operators' | ||
import { readFileAsync } from '../shared/index' | ||
@@ -81,1 +81,31 @@ export function retrieveKeyValuesFromOcrResult(path, // ocr result txt file path | ||
} | ||
/**
 * Look up the OCR result path previously recorded for a key/language pair.
 *
 * @param ocrRetTxtMap - outer Map whose values are per-language Maps
 * @param fieldName - key into the outer Map (the caller visible in this file passes a zone name)
 * @param lang - key into the per-language Map
 * @returns the stored path, or '' when nothing (or a falsy value) is recorded
 */
export function getOcrRetLangPath(ocrRetTxtMap, fieldName, lang) {
  const langMap = getOcrRetLangMap(ocrRetTxtMap, fieldName)
  // No per-language Map for this key means nothing has been cached yet.
  const cached = langMap ? langMap.get(lang) : ''
  return cached || ''
}
/**
 * Record the OCR result path for a key/language pair in the nested Map.
 *
 * Does nothing unless fieldName, lang and txtPath are all truthy. Otherwise
 * the per-language Map for fieldName is fetched (or created when absent),
 * updated with lang -> txtPath, and stored back under fieldName.
 *
 * @param ocrRetTxtMap - outer Map whose values are per-language Maps; mutated in place
 * @param fieldName - key into the outer Map (the caller visible in this file passes a zone name)
 * @param lang - key into the per-language Map
 * @param txtPath - path value to store for that language
 */
export function updateOcrRetTxtMap(ocrRetTxtMap, fieldName, lang, txtPath) {
  const hasAllArgs = fieldName && lang && txtPath
  if (!hasAllArgs) {
    return
  }
  const langMap = getOcrRetLangMap(ocrRetTxtMap, fieldName) || new Map()
  updateOcrRetLangMap(langMap, lang, txtPath)
  // Store back unconditionally so a freshly created Map becomes reachable.
  ocrRetTxtMap.set(fieldName, langMap)
}
function updateOcrRetLangMap(ocrRetLangMap, lang, txtPath) { | ||
ocrRetLangMap.set(lang, txtPath) | ||
} | ||
function getOcrRetLangMap(ocrRetTxtMap, fieldName) { | ||
return ocrRetTxtMap.get(fieldName) | ||
} |
{ | ||
"name": "bank-voucher-ocr", | ||
"author": "waiting", | ||
"version": "0.13.0", | ||
"version": "1.0.0", | ||
"description": "Bank Voucher ocr by tesseract and retrieve fields", | ||
@@ -6,0 +6,0 @@ "keywords": [ |
Sorry, the diff of this file is not supported yet
No v1
Quality: Package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
176187
3025
0