bank-voucher-ocr
Advanced tools
Comparing version 2.0.0 to 2.1.0
@@ -5,3 +5,3 @@ /** | ||
* | ||
* @version 2.0.0 | ||
* @version 2.1.0 | ||
* @author waiting | ||
@@ -469,3 +469,3 @@ * @license MIT | ||
function recognize(imgPath, options) { | ||
const { bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, splitTmpDir, voucherConfigMap, globalScale, skipImgDir, } = options; | ||
const { bankName: inputBankName, bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, isSingleVoucher, splitTmpDir, voucherConfigMap, globalScale, skipImgDir, } = options; | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir; | ||
@@ -488,6 +488,17 @@ const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir; | ||
}; | ||
const ret$ = recognizePageBank(bankOpts).pipe(filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), mergeMap(({ bankName, pagePath }) => { | ||
!!debug && console.info('start split page'); | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
}), mergeMap(({ bankName, imgFile }) => { | ||
const bank$ = !inputBankName | ||
? recognizePageBank(bankOpts) | ||
: of({ | ||
bankName: inputBankName, | ||
pagePath: imgPath, | ||
}); | ||
const ret$ = bank$.pipe(filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), mergeMap(({ bankName, pagePath }) => { | ||
if (isSingleVoucher) { | ||
return readImgInfo(pagePath).pipe(map(imgInfo => of({ bankName, imgInfo }))) | ||
} | ||
else { | ||
!!debug && console.info('start split page'); | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
} | ||
}), mergeMap(({ bankName, imgInfo }) => { | ||
const ocrFields = getOcrFields(bankName, voucherConfigMapNew); | ||
@@ -503,3 +514,3 @@ | ||
defaultValue: '', | ||
imgFile, | ||
imgInfo, | ||
ocrFields, | ||
@@ -512,4 +523,4 @@ voucherConfigMap: voucherConfigMapNew, | ||
retInfo.set('bank' /* bank */, bankName); | ||
retInfo.set('filename' /* filename */, imgFile.name.trim()); | ||
retInfo.set('path' /* path */, imgFile.path.trim()); | ||
retInfo.set('filename' /* filename */, imgInfo.name.trim()); | ||
retInfo.set('path' /* path */, imgInfo.path.trim()); | ||
return retInfo | ||
@@ -574,4 +585,4 @@ })) | ||
return splitPagetoItems(pagePath, targetDir, config).pipe(mergeMap(fileMap => { | ||
const ret$ = from(fileMap.values()).pipe(map(imgFile => { | ||
return { bankName, imgFile } | ||
const ret$ = from(fileMap.values()).pipe(map(imgInfo => { | ||
return { bankName, imgInfo } | ||
})); | ||
@@ -584,4 +595,4 @@ | ||
function recognizeFields(options) { | ||
const { bankName, baseDir, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = join(baseDir, zoneTmpDirPrefix, `${basename(imgFile.path)}`); | ||
const { bankName, baseDir, debug, defaultValue, imgInfo, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = join(baseDir, zoneTmpDirPrefix, `${basename(imgInfo.path)}`); | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap); | ||
@@ -594,3 +605,3 @@ | ||
// 切分图片区域分别做ocr识别 | ||
mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), concatMap(fileMap => { | ||
mergeMap(() => cropImgAllZones(imgInfo.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), concatMap(fileMap => { | ||
const opts = { | ||
@@ -597,0 +608,0 @@ bankConfig, ocrFields, defaultValue, debug, |
@@ -5,3 +5,3 @@ /** | ||
* | ||
* @version 2.0.0 | ||
* @version 2.1.0 | ||
* @author waiting | ||
@@ -12,3 +12,3 @@ * @license MIT | ||
import{access,chmod,close,copyFile,mkdir,open,readdir,readFile,rmdir,stat,unlink,write,writeFile}from"fs";import{basename,join,normalize,resolve,sep}from"path";import{promisify}from"util";import{tmpdir,cpus}from"os";import{crop,info,resize}from"easyimage";import*as moment_ from"moment";import{defer,of,range,from}from"rxjs";import{concatMap,map,mergeMap,catchError,last,mapTo,reduce,defaultIfEmpty,filter,skipWhile,take,tap}from"rxjs/operators";import run from"rxrunscript";const closeAsync=promisify(close),chmodAsync=promisify(chmod),copyFileAsync=promisify(copyFile),mkdirAsync=promisify(mkdir),openAsync=promisify(open),readFileAsync=promisify(readFile),readDirAsync=promisify(readdir),rmdirAsync=promisify(rmdir),unlinkAsync=promisify(unlink),writeAsync=promisify(write),writeFileAsync=promisify(writeFile);function isPathAcessible(e){return e?new Promise(t=>access(e,e=>t(!e))):Promise.resolve(!1)}function isDirExists(e){return e?isDirFileExists(e,"DIR"):Promise.resolve(!1)}function isFileExists(e){return e?isDirFileExists(e,"FILE"):Promise.resolve(!1)}function isDirFileExists(e,t){return e?new Promise(n=>{stat(e,(e,i)=>{n(!(e||!i)&&("DIR"===t?i.isDirectory():i.isFile()))})}):Promise.resolve(!1)}async function createDir(e){if(!e)throw new Error("value of path param invalid");e=normalize(e),await isDirExists(e)||await e.split(sep).reduce(async(e,t)=>{const n=resolve(await e,t);return await isPathAcessible(n)||await mkdirAsync(n,493),n},Promise.resolve(sep))}async function rimraf(e){e&&(await _rimraf(e),await isDirExists(e)&&await rmdirAsync(e))}async function _rimraf(e){if(e&&await isPathAcessible(e)){if(await isFileExists(e))return void await unlinkAsync(e);const t=await readDirAsync(e);if(t.length)for(const n of t)await _rimraf(join(e,n));else await rmdirAsync(e)}}const 
initialBaseTmpDir=join(tmpdir(),"voucher-ocr"),initialResizeImgDir=join(initialBaseTmpDir,"resize"),initialSplitTmpDir=join(initialBaseTmpDir,"split"),zoneTmpDirPrefix="zone",initialBankZone={zoneName:"bank",width:2250,height:390,offsetX:70,offsetY:10},moment=moment_;function splitPagetoItems(e,t,n){return readImgInfo(e).pipe(map(i=>{const r=calcItemsPerPage(i.height,n.height);return r?range(0,r).pipe(mergeMap(r=>{const a={index:r,itemConfig:Object.assign({},n),srcPath:e,targetDir:t,pageHeight:i.height};return i.width<a.itemConfig.width&&(a.itemConfig.width=i.width),parseSplitPage(a).pipe(mergeMap(e=>{const t=new Map;return e.name&&t.set(e.name,e),of(t)}))})):of(new Map)}),concatMap(e=>e))}function resizeAndSaveImg(e,t,n,i){if(n<=0||n>1)throw new Error(`value of scale invalid: "${n}"`);return readImgInfo(e).pipe(mergeMap(r=>{const a={src:e,dst:t,width:r.width*n,height:r.height*n,quality:i};return defer(()=>resize(a))}),map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function parseSplitPage(e){const{index:t,srcPath:n,pageHeight:i}=e,{width:r,marginBottom:a}=e.itemConfig;let{height:o}=e.itemConfig;const s=0,c=t*o;if(c+o>i&&(o=i-c),o/i<.1||o<100){const e={name:"",path:"",width:0,height:0,size:0};return of(e)}const p=basename(n),m=p.split(".")[0],l=moment().format("YYYYMMDD"),f=join(e.targetDir,`${l}-${m}-${Math.random()}-${t}.jpg`),u={dst:f,src:n,quality:100,cropWidth:r,cropHeight:o+a,x:0,y:c};return defer(()=>crop(u)).pipe(mergeMap(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return of(t)}))}function readImgInfo(e){return defer(()=>info(e))}function calcItemsPerPage(e,t){const n=33;return e>=t?Math.ceil((e+33)/t):1}function getOcrZoneOptsByBankName(e,t){return t.get(e)}function cropImgAllZones(e,t,n,i){const r=[],a=new Set;for(const e of Object.values(n))e&&!a.has(e)&&a.add(e);for(const e of i){const t=e.zoneName;t&&a.has(t)&&r.push(e)}return 
from(r).pipe(mergeMap(n=>cropImgZone(e,t,n).pipe(map(e=>[n.zoneName,e]))),reduce((e,t)=>(e.set(t[0],t[1]),e),new Map))}function cropImgZone(e,t,n){const{zoneName:i,width:r,height:a,offsetX:o,offsetY:s}=n,c=join(t,`${i}-${Math.random()}.png`),p={dst:c,src:e,quality:100,cropWidth:r,cropHeight:a,x:o,y:s};return defer(()=>crop(p)).pipe(map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function runOcr(e,t,n){t||(t="eng");const i=`tesseract "${e}" "${n}" -l ${t}`;return run(i).pipe(last(),catchError(()=>of(void 0)),mapTo(void 0))}function retrieveKeyValuesFromOcrResult(e,t,n,i=!1){if(!t)throw new Error("matchRules empty");return readTxtFile(e).pipe(map(e=>({buf:e,regexp:t})),map(({buf:e,regexp:t})=>{const r=n&&"function"==typeof n?n(e):e.toString("utf8");return retrieveValueByRegexp(r,t,i)}))}function getRegexpOptsByName(e,t){for(const n of Object.keys(t))if(n===e)return t[n]}function retrieveValueByRegexp(e,t,n=!1){const i=regexMatch(e,t,n);return n&&console.info("retrieveValueByRegexp ----- text start: ---------------\x3e \n",e,"\n<--------------- text END ----------------\n\n",t,">>>>>>>>matched value: ",i,"\n"),i}function readTxtFile(e){if(!e)throw new Error("path empty");const t=e.split(".");if(t.length>1&&"txt"!==t[t.length-1].toLowerCase())throw new Error(`file extensiion must empty or be '.txt', but is: "${e}"`);return defer(async()=>readFileAsync(e))}function prepareContent(e){let t=e&&e.byteLength?e.toString():"";return t?t=t.replace(/(?<=\S) /g,""):""}function regexMatch(e,t,n=!1){if(e)for(const i of t){const t=e.match(i);if(Array.isArray(t)&&t.length)return i.global&&t.length>1?(n&&console.info("----------multi matched regex: --------------\x3e\n",t,"\n--- used regex ----: ",i,"\n<-------------ignore matched result---------------\n\n"),""):(n&&console.info("----------matched regex: --------------\x3e\n",t,"\n--- used regex ----: ",i,"\n<----------------------------\n\n"),t[0])}}function 
getOcrRetLangPath(e,t,n){const i=getOcrRetLangMap(e,t);if(!i)return"";const r=i.get(n);return r||""}function updateOcrRetTxtMap(e,t,n,i){if(!t||!n||!i)return;let r=getOcrRetLangMap(e,t);r||(r=new Map),n&&i&&updateOcrRetLangMap(r,n,i),e.set(t,r)}function updateOcrRetLangMap(e,t,n){e.set(t,n)}function getOcrRetLangMap(e,t){return e.get(t)}const moment$1=moment_;class Bvo{constructor(e){this.options=e;const t=+this.options.globalScale;this.options.globalScale=Number.isNaN(t)||t<=0?1:t,this.options.debug=!!this.options.debug;const{baseTmpDir:n,splitTmpDir:i,resizeImgDir:r}=e,a=n||initialBaseTmpDir,o=i||initialSplitTmpDir,s=r||initialResizeImgDir;defer(()=>createDir(a)).pipe(catchError(e=>(console.info(e),of(null))),concatMap(()=>createDir(o)),catchError(e=>(console.info(e),of(null))),concatMap(()=>createDir(s)),catchError(e=>(console.info(e),of(null)))).subscribe(()=>{},console.error)}run(e){const{debug:t,jpegQuality:n,scale:i,resizeImgDir:r,globalScale:a}=this.options,o=r||initialResizeImgDir,s=i/a,c={resizeDir:o,scale:s,jpegQuality:n,debug:!!t};return recognize(e,this.options).pipe(mergeMap(e=>{const t=Object.assign({retInfo:e},c);return saveImgAndPrune(t)}))}}function recognize(e,t){const{bankZone:n,baseTmpDir:i,concurrent:r,debug:a,defaultOcrLang:o,splitTmpDir:s,voucherConfigMap:c,globalScale:p,skipImgDir:m}=t,l=i||initialBaseTmpDir,f=s||initialSplitTmpDir,u=m?join(m,moment$1().format("YYYYMMDD")):"",g="number"==typeof r&&r>0?r:cpus().length,h=parseVoucherConfigMapScale(c,p),d=parseOcrZoneScale(n,p),b=getBankRegexpOpts(h),y={baseDir:l,path:e,bankZone:d,bankRegexpOptsMap:b,debug:!!a,lang:o,skipImgDir:u},w=recognizePageBank(y).pipe(filter(({bankName:e})=>!!e&&"n/a"!==e),mergeMap(({bankName:e,pagePath:t})=>(a&&console.info("start split page"),splitPageToImgs(t,e,f,h))),mergeMap(({bankName:e,imgFile:t})=>{const n=getOcrFields(e,h);if(!n)throw new Error(`ocrFields not defined with bankName: "${e}"`);const 
i={bankName:e,baseDir:l,debug:!!a,defaultValue:"",imgFile:t,ocrFields:n,voucherConfigMap:h};return a&&console.info("recognize item"),recognizeFields(i).pipe(map(n=>(n.set("bank",e),n.set("filename",t.name.trim()),n.set("path",t.path.trim()),n)))},g>0?g:1)),k=defer(()=>isFileExists(e)).pipe(filter(e=>e));return k.pipe(mergeMap(()=>w))}function recognizePageBank(e){const{baseDir:t,path:n,bankZone:i,bankRegexpOptsMap:r,debug:a,lang:o,skipImgDir:s}=e,c=join(t,zoneTmpDirPrefix,`${basename(n)}-${Math.random().toString()}`);return a&&console.info("recognize pageBank:",c,n),defer(()=>createDir(c)).pipe(catchError(e=>(console.info(e),of(null))),mergeMap(()=>cropImgZone(join(n),c,i)),concatMap(e=>runOcr(e.path,o,e.path).pipe(mapTo(e.path))),concatMap(e=>from(r.entries()).pipe(concatMap(([t,n])=>retrieveKeyValuesFromOcrResult(e+".txt",n,e=>e.toString().replace(/(?<=\S)[. ]{1,2}(?=\S)/g,"").replace(/\n{2,}/g,""),a).pipe(map(e=>({bankName:t,value:e})))),skipWhile(({value:e})=>"undefined"==typeof e||"string"==typeof e&&!e.length),take(1),map(({bankName:e})=>({bankName:e,pagePath:n})),defaultIfEmpty({bankName:"n/a",pagePath:""}))),tap(e=>{const{bankName:t,pagePath:i}=e;"n/a"!==t&&i||(console.info(`recognize bank of page fail. no matached regexp. 
file: "${n}", pagePath: "${i}" `),cpSkipImg(n,s)),a||rimraf(c).catch(console.info)}))}async function cpSkipImg(e,t){t&&(await isPathAcessible(t)||await createDir(t),copyFileAsync(e,join(t,basename(e))).catch(console.error))}function splitPageToImgs(e,t,n,i){const r=i.get(t);if(!r)throw new Error("bank config empty during split page to images");return splitPagetoItems(e,n,r).pipe(mergeMap(e=>{const n=from(e.values()).pipe(map(e=>({bankName:t,imgFile:e})));return n}))}function recognizeFields(e){const{bankName:t,baseDir:n,debug:i,defaultValue:r,imgFile:a,ocrFields:o,voucherConfigMap:s}=e,c=join(n,zoneTmpDirPrefix,`${basename(a.path)}`),p=getOcrZoneOptsByBankName(t,s);if(!p)throw new Error(`get bankConfig empty with bankName: "${t}"`);const m=defer(()=>createDir(c)).pipe(mergeMap(()=>cropImgAllZones(a.path,c,o,p.ocrZones)),concatMap(e=>{const t={bankConfig:p,ocrFields:o,defaultValue:r,debug:i,zoneImgMap:e};return batchOcrAndRetrieve(t)}),tap(()=>{i||setTimeout(e=>{rimraf(e).catch(console.info)},5e3,c)}));return m}function batchOcrAndRetrieve(e){const{zoneImgMap:t,bankConfig:n,ocrFields:i,defaultValue:r,debug:a}=e,{bankName:o}=n,s=from(t.entries()).pipe(concatMap(e=>ocrAndPickFromZoneImg(e,n,a)),reduce((e,t)=>e.set(t.fieldName,t.value),new Map),map(e=>e.set("bank",o)),map(e=>setDefaultValue(e,i,r)));return s}function setDefaultValue(e,t,n=""){const i=new Map;for(const n of Object.keys(t)){const t=e.get(n);"string"==typeof t?i.set(n,t):i.set(n,"")}return i}function processZoneImgRow(e){const{fieldName:t,value:n}=e,i=Object.assign({},e);switch(t){case"amount":i.value=n.trim().replace(/,/g,"");break;case"date":i.value=n.trim().replace(/\D/g,""),i.value&&"0"===i.value.slice(0,1)&&(i.value="2"+i.value);break;case"sn":case"destAccountNumber":case"paymentAccountNumber":i.value=n.trim()}return i}function validateZoneImgRow(e,t){if("string"!=typeof 
t)return!1;switch(e){case"amount":if(validateRetInfoAmout(t))return!0;break;case"sn":if(t)return!0;break;case"date":if(validateRetInfoDate(t))return!0;break;case"bank":return!0;default:if("string"==typeof t)return!0}return!1}function validateRetInfoAmout(e){if(!e)return!1;if(!e.trim())return!1;const t=parseFloat(e);return!Number.isNaN(t)&&"number"==typeof t}function validateRetInfoDate(e){return!!e&&moment$1(e,"YYYYMMDD").isValid()}function getBankRegexpOpts(e){const t=new Map;for(const{bankName:n,regexpOpts:i}of e.values())i&&i.bank&&t.set(n,i.bank);if(!t.size)throw new Error("not BankRegexpOpts found, should not set");return t}function getOcrFields(e,t){const n=t.get(e);if(!n)throw new Error(`get ocrFields empty by bankName: "${e}"`);return n.ocrFields}function saveImgAndPrune(e){const{retInfo:t,resizeDir:n,debug:i,scale:r,jpegQuality:a}=e,o=t.get("filename"),s=t.get("path"),c=t.get("sn");if(!o)throw new Error(`result info map invalid with empty path. info: ${t}`);if(!s)throw new Error(`result info map invalid with empty path. 
info: ${t}`);const p=o.split(".").slice(0,-1).join("").split("-");p.splice(2,1);let m=p.join("-");c&&(m+=`-${c.replace(/[^\d\w]/g,"_")}`),m+=".jpg";const l=moment$1().format("YYYY-MM-DD"),f=join(n,l,m);return t.set("filename",m),resizeAndSaveImg(s,f,r,a).pipe(map(e=>(t.set("path",e.path),t)),tap(()=>{i||unlinkAsync(s).catch(console.info)}))}function genFieldLangs(e,t,n){return n&&"undefined"!=typeof n[e]&&Array.isArray(n[e])?n[e]:t}function ocrAndPickFromZoneImg(e,t,n=!1){const{ocrDefaultLangs:i,ocrFieldLangs:r,regexpOpts:a,ocrFields:o}=t,s=new Map;return from(Object.entries(o)).pipe(filter(t=>{const n=t[1];return!!n&&n===e[0]}),concatMap(t=>{const o=t[0],c=t[1];return ocrAndPickFieldFromZoneImg(o,c,e,a,i,r,n,s)}))}function ocrAndPickFieldFromZoneImg(e,t,n,i,r,a,o=!1,s){const[,c]=n,p=genFieldLangs(e,r,a),m=p.length-1,l=getRegexpOptsByName(e,i);if(!l)throw new Error(`got regexp empty by zoneName: "${e}"`);return from(p).pipe(concatMap(n=>{const i=getOcrRetLangPath(s,t,n);if(i)return retrieveKeyValuesFromOcrResult(i+".txt",l,prepareContent,o).pipe(map(r=>({fieldName:e,zoneName:t,value:r,usedLang:n,txtPath:i})));{const i=c.path,r=i.split(".").slice(0,-1).join(".")+`-${Math.random()}`;return runOcr(i,n,r).pipe(concatMap(()=>retrieveKeyValuesFromOcrResult(r+".txt",l,prepareContent,o).pipe(map(i=>({fieldName:e,zoneName:t,value:i,usedLang:n,txtPath:r})))))}}),tap(({zoneName:e,usedLang:t,txtPath:n})=>{updateOcrRetTxtMap(s,e,t,n)}),skipWhile((e,t)=>{const n=validateZoneImgRow(e.fieldName,e.value);return!n&&t!==m}),take(1),map(e=>("string"!=typeof e.value&&(e.value=""),e)),map(processZoneImgRow))}function parseVoucherConfigMapScale(e,t){const n=new Map;for(const[i,r]of e){const e=Object.assign({},r),a=[];for(const n of e.ocrZones)a.push(parseOcrZoneScale(n,t));e.ocrZones=a,e.width=e.width*t,e.height=e.height*t,e.marginBottom=e.marginBottom*t,n.set(i,e)}return n}function parseOcrZoneScale(e,t){const n=Object.assign({},e);return 
n.width=n.width*t,n.height=n.height*t,n.offsetX=n.offsetX*t,n.offsetY=n.offsetY*t,n}export{initialBankZone,initialBaseTmpDir,initialResizeImgDir,initialSplitTmpDir,zoneTmpDirPrefix,Bvo,recognize,recognizePageBank,splitPageToImgs,recognizeFields,batchOcrAndRetrieve,ocrAndPickFromZoneImg}; | ||
import{access,chmod,close,copyFile,mkdir,open,readdir,readFile,rmdir,stat,unlink,write,writeFile}from"fs";import{basename,join,normalize,resolve,sep}from"path";import{promisify}from"util";import{tmpdir,cpus}from"os";import{crop,info,resize}from"easyimage";import*as moment_ from"moment";import{defer,of,range,from}from"rxjs";import{concatMap,map,mergeMap,catchError,last,mapTo,reduce,defaultIfEmpty,filter,skipWhile,take,tap}from"rxjs/operators";import run from"rxrunscript";const closeAsync=promisify(close),chmodAsync=promisify(chmod),copyFileAsync=promisify(copyFile),mkdirAsync=promisify(mkdir),openAsync=promisify(open),readFileAsync=promisify(readFile),readDirAsync=promisify(readdir),rmdirAsync=promisify(rmdir),unlinkAsync=promisify(unlink),writeAsync=promisify(write),writeFileAsync=promisify(writeFile);function isPathAcessible(e){return e?new Promise(t=>access(e,e=>t(!e))):Promise.resolve(!1)}function isDirExists(e){return e?isDirFileExists(e,"DIR"):Promise.resolve(!1)}function isFileExists(e){return e?isDirFileExists(e,"FILE"):Promise.resolve(!1)}function isDirFileExists(e,t){return e?new Promise(n=>{stat(e,(e,i)=>{n(!(e||!i)&&("DIR"===t?i.isDirectory():i.isFile()))})}):Promise.resolve(!1)}async function createDir(e){if(!e)throw new Error("value of path param invalid");e=normalize(e),await isDirExists(e)||await e.split(sep).reduce(async(e,t)=>{const n=resolve(await e,t);return await isPathAcessible(n)||await mkdirAsync(n,493),n},Promise.resolve(sep))}async function rimraf(e){e&&(await _rimraf(e),await isDirExists(e)&&await rmdirAsync(e))}async function _rimraf(e){if(e&&await isPathAcessible(e)){if(await isFileExists(e))return void await unlinkAsync(e);const t=await readDirAsync(e);if(t.length)for(const n of t)await _rimraf(join(e,n));else await rmdirAsync(e)}}const 
initialBaseTmpDir=join(tmpdir(),"voucher-ocr"),initialResizeImgDir=join(initialBaseTmpDir,"resize"),initialSplitTmpDir=join(initialBaseTmpDir,"split"),zoneTmpDirPrefix="zone",initialBankZone={zoneName:"bank",width:2250,height:390,offsetX:70,offsetY:10},moment=moment_;function splitPagetoItems(e,t,n){return readImgInfo(e).pipe(map(i=>{const r=calcItemsPerPage(i.height,n.height);return r?range(0,r).pipe(mergeMap(r=>{const a={index:r,itemConfig:Object.assign({},n),srcPath:e,targetDir:t,pageHeight:i.height};return i.width<a.itemConfig.width&&(a.itemConfig.width=i.width),parseSplitPage(a).pipe(mergeMap(e=>{const t=new Map;return e.name&&t.set(e.name,e),of(t)}))})):of(new Map)}),concatMap(e=>e))}function resizeAndSaveImg(e,t,n,i){if(n<=0||n>1)throw new Error(`value of scale invalid: "${n}"`);return readImgInfo(e).pipe(mergeMap(r=>{const a={src:e,dst:t,width:r.width*n,height:r.height*n,quality:i};return defer(()=>resize(a))}),map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function parseSplitPage(e){const{index:t,srcPath:n,pageHeight:i}=e,{width:r,marginBottom:a}=e.itemConfig;let{height:o}=e.itemConfig;const s=0,c=t*o;if(c+o>i&&(o=i-c),o/i<.1||o<100){const e={name:"",path:"",width:0,height:0,size:0};return of(e)}const p=basename(n),m=p.split(".")[0],l=moment().format("YYYYMMDD"),f=join(e.targetDir,`${l}-${m}-${Math.random()}-${t}.jpg`),g={dst:f,src:n,quality:100,cropWidth:r,cropHeight:o+a,x:0,y:c};return defer(()=>crop(g)).pipe(mergeMap(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return of(t)}))}function readImgInfo(e){return defer(()=>info(e))}function calcItemsPerPage(e,t){const n=33;return e>=t?Math.ceil((e+33)/t):1}function getOcrZoneOptsByBankName(e,t){return t.get(e)}function cropImgAllZones(e,t,n,i){const r=[],a=new Set;for(const e of Object.values(n))e&&!a.has(e)&&a.add(e);for(const e of i){const t=e.zoneName;t&&a.has(t)&&r.push(e)}return 
from(r).pipe(mergeMap(n=>cropImgZone(e,t,n).pipe(map(e=>[n.zoneName,e]))),reduce((e,t)=>(e.set(t[0],t[1]),e),new Map))}function cropImgZone(e,t,n){const{zoneName:i,width:r,height:a,offsetX:o,offsetY:s}=n,c=join(t,`${i}-${Math.random()}.png`),p={dst:c,src:e,quality:100,cropWidth:r,cropHeight:a,x:o,y:s};return defer(()=>crop(p)).pipe(map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function runOcr(e,t,n){t||(t="eng");const i=`tesseract "${e}" "${n}" -l ${t}`;return run(i).pipe(last(),catchError(()=>of(void 0)),mapTo(void 0))}function retrieveKeyValuesFromOcrResult(e,t,n,i=!1){if(!t)throw new Error("matchRules empty");return readTxtFile(e).pipe(map(e=>({buf:e,regexp:t})),map(({buf:e,regexp:t})=>{const r=n&&"function"==typeof n?n(e):e.toString("utf8");return retrieveValueByRegexp(r,t,i)}))}function getRegexpOptsByName(e,t){for(const n of Object.keys(t))if(n===e)return t[n]}function retrieveValueByRegexp(e,t,n=!1){const i=regexMatch(e,t,n);return n&&console.info("retrieveValueByRegexp ----- text start: ---------------\x3e \n",e,"\n<--------------- text END ----------------\n\n",t,">>>>>>>>matched value: ",i,"\n"),i}function readTxtFile(e){if(!e)throw new Error("path empty");const t=e.split(".");if(t.length>1&&"txt"!==t[t.length-1].toLowerCase())throw new Error(`file extensiion must empty or be '.txt', but is: "${e}"`);return defer(async()=>readFileAsync(e))}function prepareContent(e){let t=e&&e.byteLength?e.toString():"";return t?t=t.replace(/(?<=\S) /g,""):""}function regexMatch(e,t,n=!1){if(e)for(const i of t){const t=e.match(i);if(Array.isArray(t)&&t.length)return i.global&&t.length>1?(n&&console.info("----------multi matched regex: --------------\x3e\n",t,"\n--- used regex ----: ",i,"\n<-------------ignore matched result---------------\n\n"),""):(n&&console.info("----------matched regex: --------------\x3e\n",t,"\n--- used regex ----: ",i,"\n<----------------------------\n\n"),t[0])}}function 
getOcrRetLangPath(e,t,n){const i=getOcrRetLangMap(e,t);if(!i)return"";const r=i.get(n);return r||""}function updateOcrRetTxtMap(e,t,n,i){if(!t||!n||!i)return;let r=getOcrRetLangMap(e,t);r||(r=new Map),n&&i&&updateOcrRetLangMap(r,n,i),e.set(t,r)}function updateOcrRetLangMap(e,t,n){e.set(t,n)}function getOcrRetLangMap(e,t){return e.get(t)}const moment$1=moment_;class Bvo{constructor(e){this.options=e;const t=+this.options.globalScale;this.options.globalScale=Number.isNaN(t)||t<=0?1:t,this.options.debug=!!this.options.debug;const{baseTmpDir:n,splitTmpDir:i,resizeImgDir:r}=e,a=n||initialBaseTmpDir,o=i||initialSplitTmpDir,s=r||initialResizeImgDir;defer(()=>createDir(a)).pipe(catchError(e=>(console.info(e),of(null))),concatMap(()=>createDir(o)),catchError(e=>(console.info(e),of(null))),concatMap(()=>createDir(s)),catchError(e=>(console.info(e),of(null)))).subscribe(()=>{},console.error)}run(e){const{debug:t,jpegQuality:n,scale:i,resizeImgDir:r,globalScale:a}=this.options,o=r||initialResizeImgDir,s=i/a,c={resizeDir:o,scale:s,jpegQuality:n,debug:!!t};return recognize(e,this.options).pipe(mergeMap(e=>{const t=Object.assign({retInfo:e},c);return saveImgAndPrune(t)}))}}function recognize(e,t){const{bankName:n,bankZone:i,baseTmpDir:r,concurrent:a,debug:o,defaultOcrLang:s,isSingleVoucher:c,splitTmpDir:p,voucherConfigMap:m,globalScale:l,skipImgDir:f}=t,g=r||initialBaseTmpDir,u=p||initialSplitTmpDir,h=f?join(f,moment$1().format("YYYYMMDD")):"",d="number"==typeof a&&a>0?a:cpus().length,b=parseVoucherConfigMapScale(m,l),y=parseOcrZoneScale(i,l),w=getBankRegexpOpts(b),k={baseDir:g,path:e,bankZone:y,bankRegexpOptsMap:w,debug:!!o,lang:s,skipImgDir:h},D=n?of({bankName:n,pagePath:e}):recognizePageBank(k),M=D.pipe(filter(({bankName:e})=>!!e&&"n/a"!==e),mergeMap(({bankName:e,pagePath:t})=>c?readImgInfo(t).pipe(map(t=>of({bankName:e,imgInfo:t}))):(o&&console.info("start split page"),splitPageToImgs(t,e,u,b))),mergeMap(({bankName:e,imgInfo:t})=>{const n=getOcrFields(e,b);if(!n)throw new 
Error(`ocrFields not defined with bankName: "${e}"`);const i={bankName:e,baseDir:g,debug:!!o,defaultValue:"",imgInfo:t,ocrFields:n,voucherConfigMap:b};return o&&console.info("recognize item"),recognizeFields(i).pipe(map(n=>(n.set("bank",e),n.set("filename",t.name.trim()),n.set("path",t.path.trim()),n)))},d>0?d:1)),v=defer(()=>isFileExists(e)).pipe(filter(e=>e));return v.pipe(mergeMap(()=>M))}function recognizePageBank(e){const{baseDir:t,path:n,bankZone:i,bankRegexpOptsMap:r,debug:a,lang:o,skipImgDir:s}=e,c=join(t,zoneTmpDirPrefix,`${basename(n)}-${Math.random().toString()}`);return a&&console.info("recognize pageBank:",c,n),defer(()=>createDir(c)).pipe(catchError(e=>(console.info(e),of(null))),mergeMap(()=>cropImgZone(join(n),c,i)),concatMap(e=>runOcr(e.path,o,e.path).pipe(mapTo(e.path))),concatMap(e=>from(r.entries()).pipe(concatMap(([t,n])=>retrieveKeyValuesFromOcrResult(e+".txt",n,e=>e.toString().replace(/(?<=\S)[. ]{1,2}(?=\S)/g,"").replace(/\n{2,}/g,""),a).pipe(map(e=>({bankName:t,value:e})))),skipWhile(({value:e})=>"undefined"==typeof e||"string"==typeof e&&!e.length),take(1),map(({bankName:e})=>({bankName:e,pagePath:n})),defaultIfEmpty({bankName:"n/a",pagePath:""}))),tap(e=>{const{bankName:t,pagePath:i}=e;"n/a"!==t&&i||(console.info(`recognize bank of page fail. no matached regexp. 
file: "${n}", pagePath: "${i}" `),cpSkipImg(n,s)),a||rimraf(c).catch(console.info)}))}async function cpSkipImg(e,t){t&&(await isPathAcessible(t)||await createDir(t),copyFileAsync(e,join(t,basename(e))).catch(console.error))}function splitPageToImgs(e,t,n,i){const r=i.get(t);if(!r)throw new Error("bank config empty during split page to images");return splitPagetoItems(e,n,r).pipe(mergeMap(e=>{const n=from(e.values()).pipe(map(e=>({bankName:t,imgInfo:e})));return n}))}function recognizeFields(e){const{bankName:t,baseDir:n,debug:i,defaultValue:r,imgInfo:a,ocrFields:o,voucherConfigMap:s}=e,c=join(n,zoneTmpDirPrefix,`${basename(a.path)}`),p=getOcrZoneOptsByBankName(t,s);if(!p)throw new Error(`get bankConfig empty with bankName: "${t}"`);const m=defer(()=>createDir(c)).pipe(mergeMap(()=>cropImgAllZones(a.path,c,o,p.ocrZones)),concatMap(e=>{const t={bankConfig:p,ocrFields:o,defaultValue:r,debug:i,zoneImgMap:e};return batchOcrAndRetrieve(t)}),tap(()=>{i||setTimeout(e=>{rimraf(e).catch(console.info)},5e3,c)}));return m}function batchOcrAndRetrieve(e){const{zoneImgMap:t,bankConfig:n,ocrFields:i,defaultValue:r,debug:a}=e,{bankName:o}=n,s=from(t.entries()).pipe(concatMap(e=>ocrAndPickFromZoneImg(e,n,a)),reduce((e,t)=>e.set(t.fieldName,t.value),new Map),map(e=>e.set("bank",o)),map(e=>setDefaultValue(e,i,r)));return s}function setDefaultValue(e,t,n=""){const i=new Map;for(const n of Object.keys(t)){const t=e.get(n);"string"==typeof t?i.set(n,t):i.set(n,"")}return i}function processZoneImgRow(e){const{fieldName:t,value:n}=e,i=Object.assign({},e);switch(t){case"amount":i.value=n.trim().replace(/,/g,"");break;case"date":i.value=n.trim().replace(/\D/g,""),i.value&&"0"===i.value.slice(0,1)&&(i.value="2"+i.value);break;case"sn":case"destAccountNumber":case"paymentAccountNumber":i.value=n.trim()}return i}function validateZoneImgRow(e,t){if("string"!=typeof 
t)return!1;switch(e){case"amount":if(validateRetInfoAmout(t))return!0;break;case"sn":if(t)return!0;break;case"date":if(validateRetInfoDate(t))return!0;break;case"bank":return!0;default:if("string"==typeof t)return!0}return!1}function validateRetInfoAmout(e){if(!e)return!1;if(!e.trim())return!1;const t=parseFloat(e);return!Number.isNaN(t)&&"number"==typeof t}function validateRetInfoDate(e){return!!e&&moment$1(e,"YYYYMMDD").isValid()}function getBankRegexpOpts(e){const t=new Map;for(const{bankName:n,regexpOpts:i}of e.values())i&&i.bank&&t.set(n,i.bank);if(!t.size)throw new Error("not BankRegexpOpts found, should not set");return t}function getOcrFields(e,t){const n=t.get(e);if(!n)throw new Error(`get ocrFields empty by bankName: "${e}"`);return n.ocrFields}function saveImgAndPrune(e){const{retInfo:t,resizeDir:n,debug:i,scale:r,jpegQuality:a}=e,o=t.get("filename"),s=t.get("path"),c=t.get("sn");if(!o)throw new Error(`result info map invalid with empty path. info: ${t}`);if(!s)throw new Error(`result info map invalid with empty path. 
info: ${t}`);const p=o.split(".").slice(0,-1).join("").split("-");p.splice(2,1);let m=p.join("-");c&&(m+=`-${c.replace(/[^\d\w]/g,"_")}`),m+=".jpg";const l=moment$1().format("YYYY-MM-DD"),f=join(n,l,m);return t.set("filename",m),resizeAndSaveImg(s,f,r,a).pipe(map(e=>(t.set("path",e.path),t)),tap(()=>{i||unlinkAsync(s).catch(console.info)}))}function genFieldLangs(e,t,n){return n&&"undefined"!=typeof n[e]&&Array.isArray(n[e])?n[e]:t}function ocrAndPickFromZoneImg(e,t,n=!1){const{ocrDefaultLangs:i,ocrFieldLangs:r,regexpOpts:a,ocrFields:o}=t,s=new Map;return from(Object.entries(o)).pipe(filter(t=>{const n=t[1];return!!n&&n===e[0]}),concatMap(t=>{const o=t[0],c=t[1];return ocrAndPickFieldFromZoneImg(o,c,e,a,i,r,n,s)}))}function ocrAndPickFieldFromZoneImg(e,t,n,i,r,a,o=!1,s){const[,c]=n,p=genFieldLangs(e,r,a),m=p.length-1,l=getRegexpOptsByName(e,i);if(!l)throw new Error(`got regexp empty by zoneName: "${e}"`);return from(p).pipe(concatMap(n=>{const i=getOcrRetLangPath(s,t,n);if(i)return retrieveKeyValuesFromOcrResult(i+".txt",l,prepareContent,o).pipe(map(r=>({fieldName:e,zoneName:t,value:r,usedLang:n,txtPath:i})));{const i=c.path,r=i.split(".").slice(0,-1).join(".")+`-${Math.random()}`;return runOcr(i,n,r).pipe(concatMap(()=>retrieveKeyValuesFromOcrResult(r+".txt",l,prepareContent,o).pipe(map(i=>({fieldName:e,zoneName:t,value:i,usedLang:n,txtPath:r})))))}}),tap(({zoneName:e,usedLang:t,txtPath:n})=>{updateOcrRetTxtMap(s,e,t,n)}),skipWhile((e,t)=>{const n=validateZoneImgRow(e.fieldName,e.value);return!n&&t!==m}),take(1),map(e=>("string"!=typeof e.value&&(e.value=""),e)),map(processZoneImgRow))}function parseVoucherConfigMapScale(e,t){const n=new Map;for(const[i,r]of e){const e=Object.assign({},r),a=[];for(const n of e.ocrZones)a.push(parseOcrZoneScale(n,t));e.ocrZones=a,e.width=e.width*t,e.height=e.height*t,e.marginBottom=e.marginBottom*t,n.set(i,e)}return n}function parseOcrZoneScale(e,t){const n=Object.assign({},e);return 
n.width=n.width*t,n.height=n.height*t,n.offsetX=n.offsetX*t,n.offsetY=n.offsetY*t,n}export{initialBankZone,initialBaseTmpDir,initialResizeImgDir,initialSplitTmpDir,zoneTmpDirPrefix,Bvo,recognize,recognizePageBank,splitPageToImgs,recognizeFields,batchOcrAndRetrieve,ocrAndPickFromZoneImg}; | ||
//# sourceMappingURL=bvocr.esm.min.js.map |
@@ -5,3 +5,3 @@ /** | ||
* | ||
* @version 2.0.0 | ||
* @version 2.1.0 | ||
* @author waiting | ||
@@ -475,3 +475,3 @@ * @license MIT | ||
function recognize(imgPath, options) { | ||
const { bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, splitTmpDir, voucherConfigMap, globalScale, skipImgDir, } = options; | ||
const { bankName: inputBankName, bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, isSingleVoucher, splitTmpDir, voucherConfigMap, globalScale, skipImgDir, } = options; | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir; | ||
@@ -494,6 +494,17 @@ const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir; | ||
}; | ||
const ret$ = recognizePageBank(bankOpts).pipe(operators.filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), operators.mergeMap(({ bankName, pagePath }) => { | ||
!!debug && console.info('start split page'); | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
}), operators.mergeMap(({ bankName, imgFile }) => { | ||
const bank$ = !inputBankName | ||
? recognizePageBank(bankOpts) | ||
: rxjs.of({ | ||
bankName: inputBankName, | ||
pagePath: imgPath, | ||
}); | ||
const ret$ = bank$.pipe(operators.filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), operators.mergeMap(({ bankName, pagePath }) => { | ||
if (isSingleVoucher) { | ||
return readImgInfo(pagePath).pipe(operators.map(imgInfo => rxjs.of({ bankName, imgInfo }))) | ||
} | ||
else { | ||
!!debug && console.info('start split page'); | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
} | ||
}), operators.mergeMap(({ bankName, imgInfo }) => { | ||
const ocrFields = getOcrFields(bankName, voucherConfigMapNew); | ||
@@ -509,3 +520,3 @@ | ||
defaultValue: '', | ||
imgFile, | ||
imgInfo, | ||
ocrFields, | ||
@@ -518,4 +529,4 @@ voucherConfigMap: voucherConfigMapNew, | ||
retInfo.set('bank' /* bank */, bankName); | ||
retInfo.set('filename' /* filename */, imgFile.name.trim()); | ||
retInfo.set('path' /* path */, imgFile.path.trim()); | ||
retInfo.set('filename' /* filename */, imgInfo.name.trim()); | ||
retInfo.set('path' /* path */, imgInfo.path.trim()); | ||
return retInfo | ||
@@ -580,4 +591,4 @@ })) | ||
return splitPagetoItems(pagePath, targetDir, config).pipe(operators.mergeMap(fileMap => { | ||
const ret$ = rxjs.from(fileMap.values()).pipe(operators.map(imgFile => { | ||
return { bankName, imgFile } | ||
const ret$ = rxjs.from(fileMap.values()).pipe(operators.map(imgInfo => { | ||
return { bankName, imgInfo } | ||
})); | ||
@@ -590,4 +601,4 @@ | ||
function recognizeFields(options) { | ||
const { bankName, baseDir, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = path.join(baseDir, zoneTmpDirPrefix, `${path.basename(imgFile.path)}`); | ||
const { bankName, baseDir, debug, defaultValue, imgInfo, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = path.join(baseDir, zoneTmpDirPrefix, `${path.basename(imgInfo.path)}`); | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap); | ||
@@ -600,3 +611,3 @@ | ||
// 切分图片区域分别做ocr识别 | ||
operators.mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), operators.concatMap(fileMap => { | ||
operators.mergeMap(() => cropImgAllZones(imgInfo.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), operators.concatMap(fileMap => { | ||
const opts = { | ||
@@ -603,0 +614,0 @@ bankConfig, ocrFields, defaultValue, debug, |
@@ -0,1 +1,2 @@ | ||
import { IInfoResult } from 'easyimage'; | ||
import { Observable } from 'rxjs'; | ||
@@ -6,1 +7,2 @@ import { Filename, ImgFileInfo, VoucherConfig } from './model'; | ||
quality: number): Observable<ImgFileInfo>; | ||
export declare function readImgInfo(path: string): Observable<IInfoResult>; |
@@ -117,3 +117,3 @@ import { crop, info as getImgInfo, resize } from 'easyimage' | ||
} | ||
function readImgInfo(path) { | ||
export function readImgInfo(path) { | ||
return defer(() => getImgInfo(path)) | ||
@@ -120,0 +120,0 @@ } |
@@ -7,3 +7,3 @@ import * as moment_ from 'moment' | ||
import { initialBaseTmpDir, initialResizeImgDir, initialSplitTmpDir, zoneTmpDirPrefix } from './config' | ||
import { resizeAndSaveImg, splitPagetoItems } from './img-process' | ||
import { readImgInfo, resizeAndSaveImg, splitPagetoItems } from './img-process' | ||
import { cropImgAllZones, cropImgZone, getOcrZoneOptsByBankName, runOcr } from './ocr-process' | ||
@@ -59,3 +59,3 @@ import { getOcrRetLangPath, getRegexpOptsByName, prepareContent, retrieveKeyValuesFromOcrResult, updateOcrRetTxtMap, } from './txt-process' | ||
export function recognize(imgPath, options) { | ||
const { bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, splitTmpDir, voucherConfigMap, globalScale, skipImgDir, } = options | ||
const { bankName: inputBankName, bankZone, baseTmpDir, concurrent, debug, defaultOcrLang, isSingleVoucher, splitTmpDir, voucherConfigMap, globalScale, skipImgDir, } = options | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir | ||
@@ -78,6 +78,17 @@ const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir | ||
} | ||
const ret$ = recognizePageBank(bankOpts).pipe(filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), mergeMap(({ bankName, pagePath }) => { | ||
!!debug && console.info('start split page') | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
}), mergeMap(({ bankName, imgFile }) => { | ||
const bank$ = !inputBankName | ||
? recognizePageBank(bankOpts) | ||
: of({ | ||
bankName: inputBankName, | ||
pagePath: imgPath, | ||
}) | ||
const ret$ = bank$.pipe(filter(({ bankName }) => !!bankName && bankName !== 'n/a' /* NA */), mergeMap(({ bankName, pagePath }) => { | ||
if (isSingleVoucher) { | ||
return readImgInfo(pagePath).pipe(map(imgInfo => of({ bankName, imgInfo }))) | ||
} | ||
else { | ||
!!debug && console.info('start split page') | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMapNew) | ||
} | ||
}), mergeMap(({ bankName, imgInfo }) => { | ||
const ocrFields = getOcrFields(bankName, voucherConfigMapNew) | ||
@@ -93,3 +104,3 @@ | ||
defaultValue: '', | ||
imgFile, | ||
imgInfo, | ||
ocrFields, | ||
@@ -102,4 +113,4 @@ voucherConfigMap: voucherConfigMapNew, | ||
retInfo.set('bank' /* bank */, bankName) | ||
retInfo.set('filename' /* filename */, imgFile.name.trim()) | ||
retInfo.set('path' /* path */, imgFile.path.trim()) | ||
retInfo.set('filename' /* filename */, imgInfo.name.trim()) | ||
retInfo.set('path' /* path */, imgInfo.path.trim()) | ||
return retInfo | ||
@@ -164,4 +175,4 @@ })) | ||
return splitPagetoItems(pagePath, targetDir, config).pipe(mergeMap(fileMap => { | ||
const ret$ = ofrom(fileMap.values()).pipe(map(imgFile => { | ||
return { bankName, imgFile } | ||
const ret$ = ofrom(fileMap.values()).pipe(map(imgInfo => { | ||
return { bankName, imgInfo } | ||
})) | ||
@@ -174,4 +185,4 @@ | ||
export function recognizeFields(options) { | ||
const { bankName, baseDir, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options | ||
const zoneTmpDir = join(baseDir, zoneTmpDirPrefix, `${basename(imgFile.path)}`) | ||
const { bankName, baseDir, debug, defaultValue, imgInfo, ocrFields, voucherConfigMap, } = options | ||
const zoneTmpDir = join(baseDir, zoneTmpDirPrefix, `${basename(imgInfo.path)}`) | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap) | ||
@@ -184,3 +195,3 @@ | ||
// 切分图片区域分别做ocr识别 | ||
mergeMap(() => cropImgAllZones(imgFile.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), concatMap(fileMap => { | ||
mergeMap(() => cropImgAllZones(imgInfo.path, zoneTmpDir, ocrFields, bankConfig.ocrZones)), concatMap(fileMap => { | ||
const opts = { | ||
@@ -187,0 +198,0 @@ bankConfig, ocrFields, defaultValue, debug, |
@@ -7,12 +7,28 @@ /// <reference types="node" /> | ||
export interface OcrOpts { | ||
bankName?: BankName; | ||
/** Zone used to recognize the bank from the page */ | ||
bankZone: OcrZone; | ||
baseTmpDir?: string; | ||
/** os.cpus() if undefined */ | ||
concurrent?: number; | ||
debug?: boolean; | ||
/** Default tesseract ocr lang, eg 'eng' */ | ||
defaultOcrLang: string; | ||
/** No need to split the page if true. Default: false */ | ||
isSingleVoucher?: boolean; | ||
/** Result image quality (0, 100] */ | ||
jpegQuality: number; | ||
/** Store result images */ | ||
resizeImgDir?: string; | ||
/** Save resize result image (0, 1] */ | ||
scale: number; | ||
/** Folder to store images whose bank was not recognized */ | ||
skipImgDir?: string; | ||
/** | ||
* If the items of voucherConfigMap are defined for 300dpi | ||
* but the source image is 600dpi, then set this value to 600/300 == 2. | ||
* Default:1 | ||
*/ | ||
globalScale: number; | ||
/** Store temp split images to ocr */ | ||
splitTmpDir?: string; | ||
@@ -140,3 +156,3 @@ voucherConfigMap: VoucherConfigMap; | ||
bankName: BankName; | ||
imgFile: ImgFileInfo; | ||
imgInfo: ImgFileInfo; | ||
} | ||
@@ -148,3 +164,3 @@ export interface RecognizeFieldsOpts { | ||
defaultValue: string; | ||
imgFile: ImgFileInfo; | ||
imgInfo: ImgFileInfo; | ||
ocrFields: OcrFields; | ||
@@ -151,0 +167,0 @@ voucherConfigMap: VoucherConfigMap; |
{ | ||
"name": "bank-voucher-ocr", | ||
"author": "waiting", | ||
"version": "2.0.0", | ||
"version": "2.1.0", | ||
"description": "Bank Voucher ocr by tesseract and retrieve fields", | ||
@@ -6,0 +6,0 @@ "keywords": [ |
Sorry, the diff of this file is not supported yet
176094
2892