bank-voucher-ocr
Advanced tools
Comparing version 0.1.0 to 0.1.1
/** | ||
* bank-voucher-ocr | ||
* 银行凭证扫描处理 | ||
* Bank Voucher ocr by tesseract and retrieve fields | ||
* | ||
* @version 0.1.0 | ||
* @version 0.1.1 | ||
* @author waiting | ||
@@ -97,3 +97,3 @@ * @license MIT | ||
const initialBaseTmpDir = join(tmpdir(), 'vocher-ocr'); | ||
const initialBaseTmpDir = join(tmpdir(), 'voucher-ocr'); | ||
const initialResizeImgDir = join(initialBaseTmpDir, 'resize'); // store result images | ||
@@ -161,3 +161,3 @@ const initialSplitTmpDir = join(initialBaseTmpDir, 'split'); // store temp split images to ocr | ||
} | ||
// split one vocher item from a page and save it | ||
// split one voucher item from a page and save it | ||
function parseSplitPage(options) { | ||
@@ -335,7 +335,7 @@ const { index, srcPath, pageHeight } = options; | ||
function recognize(options) { | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, vocherConfigMap, } = options; | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, voucherConfigMap, } = options; | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir; | ||
const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir; | ||
const resizeDir = resizeImgDir ? resizeImgDir : initialResizeImgDir; | ||
const bankRegexpOptsMap = getBankRegexpOpts(vocherConfigMap); | ||
const bankRegexpOptsMap = getBankRegexpOpts(voucherConfigMap); | ||
const bankOpts = { | ||
@@ -351,5 +351,5 @@ baseDir, | ||
!!debug && console.info('start split page'); | ||
return splitPageToImgs(pagePath, bankName, splitDir, vocherConfigMap) | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMap) | ||
}), concatMap(({ bankName, imgFile }) => { | ||
const ocrFields = getOcrFields(bankName, vocherConfigMap); | ||
const ocrFields = getOcrFields(bankName, voucherConfigMap); | ||
@@ -367,3 +367,3 @@ if (!ocrFields) { | ||
ocrFields, | ||
vocherConfigMap, | ||
voucherConfigMap, | ||
}; | ||
@@ -426,4 +426,4 @@ | ||
// 切分页面为多张凭证 | ||
function splitPageToImgs(pagePath, bankName, targetDir, vocherConfigMap) { | ||
const config = vocherConfigMap.get(bankName); | ||
function splitPageToImgs(pagePath, bankName, targetDir, voucherConfigMap) { | ||
const config = voucherConfigMap.get(bankName); | ||
@@ -443,5 +443,5 @@ if (!config) { | ||
function recognizeFields(options) { | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, vocherConfigMap, } = options; | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = join(baseDir, '/zone/', Math.random().toString()); | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, vocherConfigMap); | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap); | ||
@@ -448,0 +448,0 @@ // console.info('recognize single image:', zoneTmpDir, imgFile.path) |
/** | ||
* bank-voucher-ocr | ||
* 银行凭证扫描处理 | ||
* Bank Voucher ocr by tesseract and retrieve fields | ||
* | ||
* @version 0.1.0 | ||
* @version 0.1.1 | ||
* @author waiting | ||
@@ -11,3 +11,3 @@ * @license MIT | ||
import{access,chmod,close,copyFile,mkdir,open,readdir,readFile,rmdir,stat,unlink,write,writeFile}from"fs";import{join,normalize,resolve,sep}from"path";import{promisify}from"util";import{tmpdir}from"os";import{crop,info,resize}from"easyimage";import*as moment_ from"moment";import{from,of,range,defer}from"rxjs";import{concatMap,map,mergeMap,reduce,catchError,defaultIfEmpty,filter,mapTo,skipWhile,take,tap}from"rxjs/operators";import run from"rxrunscript";const closeAsync=promisify(close),chmodAsync=promisify(chmod),copyFileAsync=promisify(copyFile),mkdirAsync=promisify(mkdir),openAsync=promisify(open),readFileAsync=promisify(readFile),readDirAsync=promisify(readdir),rmdirAsync=promisify(rmdir),unlinkAsync=promisify(unlink),writeAsync=promisify(write),writeFileAsync=promisify(writeFile);function isPathAcessible(e){return e?new Promise(t=>access(e,e=>t(!e))):Promise.resolve(!1)}function isDirExists(e){return e?isDirFileExists(e,"DIR"):Promise.resolve(!1)}function isFileExists(e){return e?isDirFileExists(e,"FILE"):Promise.resolve(!1)}function isDirFileExists(e,t){return e?new Promise(r=>{stat(e,(e,i)=>r(!e&&("DIR"===t?i.isDirectory():i.isFile())))}):Promise.resolve(!1)}async function createDir(e){if(!e)throw new Error("value of path param invalid");e=normalize(e),await isDirExists(e)||await e.split(sep).reduce(async(e,t)=>{const r=resolve(await e,t);return await isPathAcessible(r)||await mkdirAsync(r,493),r},Promise.resolve(sep))}async function rimraf(e){e&&(await _rimraf(e),await rmdirAsync(e))}async function _rimraf(e){if(e&&await isPathAcessible(e)){if(await isFileExists(e))return void await unlinkAsync(e);const t=await readDirAsync(e);for(const r of t)await _rimraf(join(e,r))}}const initialBaseTmpDir=join(tmpdir(),"vocher-ocr"),initialResizeImgDir=join(initialBaseTmpDir,"resize"),initialSplitTmpDir=join(initialBaseTmpDir,"split"),moment=moment_;function splitPagetoItems(e,t,r){return readImgInfo(e).pipe(map(i=>{const n=calcItemsPerPage(i.height,r.height);return n?range(0,n).pipe(mergeMap(n=>{const a={index:n,itemConfig:Object.assign({},r),srcPath:e,targetDir:t,pageHeight:i.height};return i.width<a.itemConfig.width&&(a.itemConfig.width=i.width),parseSplitPage(a).pipe(mergeMap(e=>{const t=new Map;return t.set(e.name,e),of(t)}))})):of(new Map)}),concatMap(e=>e))}function resizeAndSaveImg(e,t,r,i){if(r<=0||r>1)throw new Error(`value of scale invalid: "${r}"`);return readImgInfo(e).pipe(mergeMap(n=>{const a={src:e,dst:t,width:n.width*r,height:n.height*r,quality:i};return from(resize(a))}),map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function parseSplitPage(e){const{index:t,srcPath:r,pageHeight:i}=e,{width:n}=e.itemConfig;let{height:a}=e.itemConfig;const o=36,s=0,p=t*a;p+a>i&&(a=i-p);const c=moment().format("YYYYMMDD"),m=join(e.targetDir,`${c}-${Math.random()}-${t}.jpg`),f={dst:m,src:r,quality:100,cropWidth:n,cropHeight:a+36,x:0,y:p};return from(crop(f)).pipe(mergeMap(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return of(t)}))}function readImgInfo(e){return from(info(e))}function calcItemsPerPage(e,t){const r=33;return e>=t?Math.floor((e+33)/t):0}function getOcrZoneOptsByBankName(e,t){return t.get(e)}function cropImgAllZones(e,t,r){return from(r).pipe(mergeMap(r=>cropImgZone(e,t,r).pipe(map(e=>[r.zoneName,e]))),reduce((e,t)=>(e.set(t[0],t[1]),e),new Map))}function cropImgZone(e,t,r){const{zoneName:i,width:n,height:a,offsetX:o,offsetY:s}=r,p=join(t,`${i}.png`),c={dst:p,src:e,quality:100,cropWidth:n,cropHeight:a,x:o,y:s};return from(crop(c)).pipe(map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function runOcr(e,t){t||(t="eng");const r=`tesseract "${e}" "${e}" -l ${t}`;return run(r)}function retrieveKeyValuesFromOcrResult(e,t,r){if(!t)throw new Error("matchRules empty");return readTxtFile(e).pipe(map(e=>({buf:e,regexp:t})),map(({buf:e,regexp:t})=>{const i=r&&"function"==typeof r?r(e):e.toString("utf8");return retrieveValueByRegexp(i,t)}))}function getRegexpOptsByName(e,t){for(const r of Object.keys(t))if(r===e)return t[r]}function retrieveValueByRegexp(e,t){return regexMatch(e,t)}function readTxtFile(e){if(!e)throw new Error("path empty");const t=e.split(".");if(t.length>1&&"txt"!==t[t.length-1].toLowerCase())throw new Error(`file extensiion must empty or be '.txt', but is: "${e}"`);return defer(async()=>readFileAsync(e))}function prepareContent(e){let t=e&&e.byteLength?e.toString():"";return t?t=t.replace(/[\t ]/g,""):""}function regexMatch(e,t){if(e)for(const r of t){const t=e.match(r);if(Array.isArray(t)&&t.length)return t[0]}}const moment$1=moment_;function recognize(e){const{bankZone:t,baseTmpDir:r,debug:i,defaultOcrLang:n,jpegQuality:a,scale:o,splitTmpDir:s,imgPath:p,resizeImgDir:c,vocherConfigMap:m}=e,f=r||initialBaseTmpDir,l=s||initialSplitTmpDir,g=c||initialResizeImgDir,u=getBankRegexpOpts(m),h={baseDir:f,path:p,bankZone:t,bankRegexpOptsMap:u,debug:!!i,lang:n},d=recognizePageBank(h).pipe(filter(({bankName:e})=>!!e&&"n/a"!==e),concatMap(({bankName:e,pagePath:t})=>(i&&console.info("start split page"),splitPageToImgs(t,e,l,m))),concatMap(({bankName:e,imgFile:t})=>{const r=getOcrFields(e,m);if(!r)throw new Error(`ocrFields not defined with bankName: "${e}"`);const n={bankName:e,baseDir:f,concurrent:2,debug:!!i,defaultValue:"",imgFile:t,ocrFields:r,vocherConfigMap:m};return i&&console.info("recognize item"),recognizeFields(n).pipe(map(r=>(r.set("bank",e),r.set("filename",t.name.trim()),r.set("path",t.path.trim()),r)))}),mergeMap(e=>{const t={retInfo:e,resizeDir:g,scale:o,jpegQuality:a,debug:!!i};return saveImgAndPrune(t)})),y=from(createDir(f)).pipe(concatMap(()=>createDir(l)),concatMap(()=>createDir(g))),w=from(isFileExists(p)).pipe(filter(e=>e));return y.pipe(mergeMap(()=>w),mergeMap(()=>d))}function recognizePageBank(e){const{baseDir:t,path:r,bankZone:i,bankRegexpOptsMap:n,debug:a,lang:o}=e,s=join(t,"zone/",Math.random().toString());return a&&console.info("recognize pageBank:",s,r),from(createDir(s)).pipe(mergeMap(()=>cropImgZone(join(r),s,i)),concatMap(e=>runOcr(e.path,o).pipe(map(()=>({path:r,zoneImgPath:e.path})),mapTo(e.path),catchError(()=>of(e.path)))),concatMap(e=>from(n.entries()).pipe(concatMap(([t,r])=>retrieveKeyValuesFromOcrResult(e+".txt",r,e=>e.toString()).pipe(map(e=>({bankName:t,value:e})))),skipWhile(({value:e})=>"undefined"==typeof e||"string"==typeof e&&!e.length),take(1),map(({bankName:e})=>({bankName:e,pagePath:r})),defaultIfEmpty({bankName:"n/a",pagePath:""}))),tap(e=>{const{bankName:t,pagePath:i}=e;"n/a"!==t&&i||console.info(`recognize bank of page fail. no matached regexp. file: "${r}", pagePath: "${i}" `),a||rimraf(s).catch(console.info)}))}function splitPageToImgs(e,t,r,i){const n=i.get(t);if(!n)throw new Error("bank config empty during split page to images");return splitPagetoItems(e,r,n).pipe(mergeMap(e=>{const r=from(e.values()).pipe(map(e=>({bankName:t,imgFile:e})));return r}))}function recognizeFields(e){const{bankName:t,baseDir:r,concurrent:i,debug:n,defaultValue:a,imgFile:o,ocrFields:s,vocherConfigMap:p}=e,c=join(r,"/zone/",Math.random().toString()),m=getOcrZoneOptsByBankName(t,p);if(!m)throw new Error(`get bankConfig empty with bankName: "${t}"`);const f=from(createDir(c)).pipe(mergeMap(()=>cropImgAllZones(o.path,c,m.ocrZones)),concatMap(e=>batchOcrAndRetrieve(e,m,s,a,i)),tap(()=>n||rimraf(c).catch(console.info)));return f}function batchOcrAndRetrieve(e,t,r,i="",n=2){const{bankName:a}=t;return from(e.entries()).pipe(concatMap(e=>ocrAndPickFromZoneImg(e,t,n)),reduce((e,t)=>e.set(t.fieldName,t.value),new Map),map(e=>e.set("bank",a)),map(e=>setDefaultValue(e,r,i)))}function setDefaultValue(e,t,r=""){const i=new Map;for(const r of Object.keys(t)){const t=e.get(r);"string"==typeof t?i.set(r,t):i.set(r,"")}return i}function processZoneImgRow(e){const{fieldName:t,value:r}=e,i={fieldName:t,value:r};switch(t){case"amount":i.value=r.trim().replace(/,/g,"");break;case"date":i.value=r.trim().replace(/\D/g,"");break;case"sn":i.value=r.trim()}return i}function validateZoneImgRow(e,t){if("string"!=typeof t)return!1;switch(e){case"amount":if(validateRetInfoAmout(t))return!0;break;case"sn":if(t)return!0;break;case"date":if(validateRetInfoDate(t))return!0;break;case"bank":return!0;default:if("string"==typeof t)return!0}return!1}function validateRetInfoAmout(e){if(!e)return!1;if(!e.trim())return!1;const t=parseFloat(e);return!Number.isNaN(t)&&"number"==typeof t}function validateRetInfoDate(e){return!!e&&moment$1(e,"YYYYMMDD").isValid()}function getBankRegexpOpts(e){const t=new Map;for(const{bankName:r,regexpOpts:i}of e.values())i&&i.bank&&t.set(r,i.bank);if(!t.size)throw new Error("not BankRegexpOpts found, should not set");return t}function getOcrFields(e,t){const r=t.get(e);if(!r)throw new Error(`get ocrFields empty by bankName: "${e}"`);return r.ocrFields}function saveImgAndPrune(e){const{retInfo:t,resizeDir:r,debug:i,scale:n,jpegQuality:a}=e,o=t.get("filename"),s=t.get("path"),p=t.get("sn");if(!o)throw new Error(`result info map invalid with empty path. info: ${t}`);if(!s)throw new Error(`result info map invalid with empty path. info: ${t}`);const c=p?`${(new Date).getTime()}-${p.replace(/[^\d\w]/g,"_")}.jpg`:o,m=moment$1().format("YYYY-MM-DD"),f=join(r,m,c);return t.set("filename",c),resizeAndSaveImg(s,f,n,a).pipe(map(e=>(t.set("path",e.path),t)),tap(()=>{i||unlinkAsync(s).catch(console.info)}))}function genFieldLangs(e,t,r){return r&&"undefined"!=typeof r[e]&&Array.isArray(r[e])?r[e]:t}function ocrAndPickFromZoneImg(e,t,r=2){const{ocrDefaultLangs:i,ocrFieldLangs:n,regexpOpts:a,ocrFields:o}=t;return from(Object.entries(o)).pipe(filter(t=>{const r=t[1];return!!r&&r===e[0]}),mergeMap(t=>{const r=t[0];return ocrAndPickFieldFromZoneImg(r,e,a,i,n)},r))}function ocrAndPickFieldFromZoneImg(e,t,r,i,n){const[,a]=t,o=genFieldLangs(e,i,n),s=o.length-1,p=getRegexpOptsByName(e,r);if(!p)throw new Error(`got regexp empty by zoneName: "${e}"`);return from(o).pipe(concatMap(e=>runOcr(a.path,e).pipe(mapTo(!0),catchError(()=>of(!0)))),concatMap(()=>retrieveKeyValuesFromOcrResult(a.path+".txt",p,prepareContent).pipe(map(t=>({fieldName:e,value:t})))),skipWhile((e,t)=>{const r=validateZoneImgRow(e.fieldName,e.value);return!r&&t!==s}),take(1),map(e=>("string"!=typeof e.value&&(e.value=""),e)),map(processZoneImgRow))}export{recognize}; | ||
import{access,chmod,close,copyFile,mkdir,open,readdir,readFile,rmdir,stat,unlink,write,writeFile}from"fs";import{join,normalize,resolve,sep}from"path";import{promisify}from"util";import{tmpdir}from"os";import{crop,info,resize}from"easyimage";import*as moment_ from"moment";import{from,of,range,defer}from"rxjs";import{concatMap,map,mergeMap,reduce,catchError,defaultIfEmpty,filter,mapTo,skipWhile,take,tap}from"rxjs/operators";import run from"rxrunscript";const closeAsync=promisify(close),chmodAsync=promisify(chmod),copyFileAsync=promisify(copyFile),mkdirAsync=promisify(mkdir),openAsync=promisify(open),readFileAsync=promisify(readFile),readDirAsync=promisify(readdir),rmdirAsync=promisify(rmdir),unlinkAsync=promisify(unlink),writeAsync=promisify(write),writeFileAsync=promisify(writeFile);function isPathAcessible(e){return e?new Promise(t=>access(e,e=>t(!e))):Promise.resolve(!1)}function isDirExists(e){return e?isDirFileExists(e,"DIR"):Promise.resolve(!1)}function isFileExists(e){return e?isDirFileExists(e,"FILE"):Promise.resolve(!1)}function isDirFileExists(e,t){return e?new Promise(r=>{stat(e,(e,i)=>r(!e&&("DIR"===t?i.isDirectory():i.isFile())))}):Promise.resolve(!1)}async function createDir(e){if(!e)throw new Error("value of path param invalid");e=normalize(e),await isDirExists(e)||await e.split(sep).reduce(async(e,t)=>{const r=resolve(await e,t);return await isPathAcessible(r)||await mkdirAsync(r,493),r},Promise.resolve(sep))}async function rimraf(e){e&&(await _rimraf(e),await rmdirAsync(e))}async function _rimraf(e){if(e&&await isPathAcessible(e)){if(await isFileExists(e))return void await unlinkAsync(e);const t=await readDirAsync(e);for(const r of t)await _rimraf(join(e,r))}}const initialBaseTmpDir=join(tmpdir(),"voucher-ocr"),initialResizeImgDir=join(initialBaseTmpDir,"resize"),initialSplitTmpDir=join(initialBaseTmpDir,"split"),moment=moment_;function splitPagetoItems(e,t,r){return readImgInfo(e).pipe(map(i=>{const n=calcItemsPerPage(i.height,r.height);return n?range(0,n).pipe(mergeMap(n=>{const a={index:n,itemConfig:Object.assign({},r),srcPath:e,targetDir:t,pageHeight:i.height};return i.width<a.itemConfig.width&&(a.itemConfig.width=i.width),parseSplitPage(a).pipe(mergeMap(e=>{const t=new Map;return t.set(e.name,e),of(t)}))})):of(new Map)}),concatMap(e=>e))}function resizeAndSaveImg(e,t,r,i){if(r<=0||r>1)throw new Error(`value of scale invalid: "${r}"`);return readImgInfo(e).pipe(mergeMap(n=>{const a={src:e,dst:t,width:n.width*r,height:n.height*r,quality:i};return from(resize(a))}),map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function parseSplitPage(e){const{index:t,srcPath:r,pageHeight:i}=e,{width:n}=e.itemConfig;let{height:a}=e.itemConfig;const o=36,s=0,p=t*a;p+a>i&&(a=i-p);const c=moment().format("YYYYMMDD"),m=join(e.targetDir,`${c}-${Math.random()}-${t}.jpg`),f={dst:m,src:r,quality:100,cropWidth:n,cropHeight:a+36,x:0,y:p};return from(crop(f)).pipe(mergeMap(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return of(t)}))}function readImgInfo(e){return from(info(e))}function calcItemsPerPage(e,t){const r=33;return e>=t?Math.floor((e+33)/t):0}function getOcrZoneOptsByBankName(e,t){return t.get(e)}function cropImgAllZones(e,t,r){return from(r).pipe(mergeMap(r=>cropImgZone(e,t,r).pipe(map(e=>[r.zoneName,e]))),reduce((e,t)=>(e.set(t[0],t[1]),e),new Map))}function cropImgZone(e,t,r){const{zoneName:i,width:n,height:a,offsetX:o,offsetY:s}=r,p=join(t,`${i}.png`),c={dst:p,src:e,quality:100,cropWidth:n,cropHeight:a,x:o,y:s};return from(crop(c)).pipe(map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function runOcr(e,t){t||(t="eng");const r=`tesseract "${e}" "${e}" -l ${t}`;return run(r)}function retrieveKeyValuesFromOcrResult(e,t,r){if(!t)throw new Error("matchRules empty");return readTxtFile(e).pipe(map(e=>({buf:e,regexp:t})),map(({buf:e,regexp:t})=>{const i=r&&"function"==typeof r?r(e):e.toString("utf8");return retrieveValueByRegexp(i,t)}))}function getRegexpOptsByName(e,t){for(const r of Object.keys(t))if(r===e)return t[r]}function retrieveValueByRegexp(e,t){return regexMatch(e,t)}function readTxtFile(e){if(!e)throw new Error("path empty");const t=e.split(".");if(t.length>1&&"txt"!==t[t.length-1].toLowerCase())throw new Error(`file extensiion must empty or be '.txt', but is: "${e}"`);return defer(async()=>readFileAsync(e))}function prepareContent(e){let t=e&&e.byteLength?e.toString():"";return t?t=t.replace(/[\t ]/g,""):""}function regexMatch(e,t){if(e)for(const r of t){const t=e.match(r);if(Array.isArray(t)&&t.length)return t[0]}}const moment$1=moment_;function recognize(e){const{bankZone:t,baseTmpDir:r,debug:i,defaultOcrLang:n,jpegQuality:a,scale:o,splitTmpDir:s,imgPath:p,resizeImgDir:c,voucherConfigMap:m}=e,f=r||initialBaseTmpDir,u=s||initialSplitTmpDir,l=c||initialResizeImgDir,g=getBankRegexpOpts(m),h={baseDir:f,path:p,bankZone:t,bankRegexpOptsMap:g,debug:!!i,lang:n},d=recognizePageBank(h).pipe(filter(({bankName:e})=>!!e&&"n/a"!==e),concatMap(({bankName:e,pagePath:t})=>(i&&console.info("start split page"),splitPageToImgs(t,e,u,m))),concatMap(({bankName:e,imgFile:t})=>{const r=getOcrFields(e,m);if(!r)throw new Error(`ocrFields not defined with bankName: "${e}"`);const n={bankName:e,baseDir:f,concurrent:2,debug:!!i,defaultValue:"",imgFile:t,ocrFields:r,voucherConfigMap:m};return i&&console.info("recognize item"),recognizeFields(n).pipe(map(r=>(r.set("bank",e),r.set("filename",t.name.trim()),r.set("path",t.path.trim()),r)))}),mergeMap(e=>{const t={retInfo:e,resizeDir:l,scale:o,jpegQuality:a,debug:!!i};return saveImgAndPrune(t)})),y=from(createDir(f)).pipe(concatMap(()=>createDir(u)),concatMap(()=>createDir(l))),w=from(isFileExists(p)).pipe(filter(e=>e));return y.pipe(mergeMap(()=>w),mergeMap(()=>d))}function recognizePageBank(e){const{baseDir:t,path:r,bankZone:i,bankRegexpOptsMap:n,debug:a,lang:o}=e,s=join(t,"zone/",Math.random().toString());return a&&console.info("recognize pageBank:",s,r),from(createDir(s)).pipe(mergeMap(()=>cropImgZone(join(r),s,i)),concatMap(e=>runOcr(e.path,o).pipe(map(()=>({path:r,zoneImgPath:e.path})),mapTo(e.path),catchError(()=>of(e.path)))),concatMap(e=>from(n.entries()).pipe(concatMap(([t,r])=>retrieveKeyValuesFromOcrResult(e+".txt",r,e=>e.toString()).pipe(map(e=>({bankName:t,value:e})))),skipWhile(({value:e})=>"undefined"==typeof e||"string"==typeof e&&!e.length),take(1),map(({bankName:e})=>({bankName:e,pagePath:r})),defaultIfEmpty({bankName:"n/a",pagePath:""}))),tap(e=>{const{bankName:t,pagePath:i}=e;"n/a"!==t&&i||console.info(`recognize bank of page fail. no matached regexp. file: "${r}", pagePath: "${i}" `),a||rimraf(s).catch(console.info)}))}function splitPageToImgs(e,t,r,i){const n=i.get(t);if(!n)throw new Error("bank config empty during split page to images");return splitPagetoItems(e,r,n).pipe(mergeMap(e=>{const r=from(e.values()).pipe(map(e=>({bankName:t,imgFile:e})));return r}))}function recognizeFields(e){const{bankName:t,baseDir:r,concurrent:i,debug:n,defaultValue:a,imgFile:o,ocrFields:s,voucherConfigMap:p}=e,c=join(r,"/zone/",Math.random().toString()),m=getOcrZoneOptsByBankName(t,p);if(!m)throw new Error(`get bankConfig empty with bankName: "${t}"`);const f=from(createDir(c)).pipe(mergeMap(()=>cropImgAllZones(o.path,c,m.ocrZones)),concatMap(e=>batchOcrAndRetrieve(e,m,s,a,i)),tap(()=>n||rimraf(c).catch(console.info)));return f}function batchOcrAndRetrieve(e,t,r,i="",n=2){const{bankName:a}=t;return from(e.entries()).pipe(concatMap(e=>ocrAndPickFromZoneImg(e,t,n)),reduce((e,t)=>e.set(t.fieldName,t.value),new Map),map(e=>e.set("bank",a)),map(e=>setDefaultValue(e,r,i)))}function setDefaultValue(e,t,r=""){const i=new Map;for(const r of Object.keys(t)){const t=e.get(r);"string"==typeof t?i.set(r,t):i.set(r,"")}return i}function processZoneImgRow(e){const{fieldName:t,value:r}=e,i={fieldName:t,value:r};switch(t){case"amount":i.value=r.trim().replace(/,/g,"");break;case"date":i.value=r.trim().replace(/\D/g,"");break;case"sn":i.value=r.trim()}return i}function validateZoneImgRow(e,t){if("string"!=typeof t)return!1;switch(e){case"amount":if(validateRetInfoAmout(t))return!0;break;case"sn":if(t)return!0;break;case"date":if(validateRetInfoDate(t))return!0;break;case"bank":return!0;default:if("string"==typeof t)return!0}return!1}function validateRetInfoAmout(e){if(!e)return!1;if(!e.trim())return!1;const t=parseFloat(e);return!Number.isNaN(t)&&"number"==typeof t}function validateRetInfoDate(e){return!!e&&moment$1(e,"YYYYMMDD").isValid()}function getBankRegexpOpts(e){const t=new Map;for(const{bankName:r,regexpOpts:i}of e.values())i&&i.bank&&t.set(r,i.bank);if(!t.size)throw new Error("not BankRegexpOpts found, should not set");return t}function getOcrFields(e,t){const r=t.get(e);if(!r)throw new Error(`get ocrFields empty by bankName: "${e}"`);return r.ocrFields}function saveImgAndPrune(e){const{retInfo:t,resizeDir:r,debug:i,scale:n,jpegQuality:a}=e,o=t.get("filename"),s=t.get("path"),p=t.get("sn");if(!o)throw new Error(`result info map invalid with empty path. info: ${t}`);if(!s)throw new Error(`result info map invalid with empty path. info: ${t}`);const c=p?`${(new Date).getTime()}-${p.replace(/[^\d\w]/g,"_")}.jpg`:o,m=moment$1().format("YYYY-MM-DD"),f=join(r,m,c);return t.set("filename",c),resizeAndSaveImg(s,f,n,a).pipe(map(e=>(t.set("path",e.path),t)),tap(()=>{i||unlinkAsync(s).catch(console.info)}))}function genFieldLangs(e,t,r){return r&&"undefined"!=typeof r[e]&&Array.isArray(r[e])?r[e]:t}function ocrAndPickFromZoneImg(e,t,r=2){const{ocrDefaultLangs:i,ocrFieldLangs:n,regexpOpts:a,ocrFields:o}=t;return from(Object.entries(o)).pipe(filter(t=>{const r=t[1];return!!r&&r===e[0]}),mergeMap(t=>{const r=t[0];return ocrAndPickFieldFromZoneImg(r,e,a,i,n)},r))}function ocrAndPickFieldFromZoneImg(e,t,r,i,n){const[,a]=t,o=genFieldLangs(e,i,n),s=o.length-1,p=getRegexpOptsByName(e,r);if(!p)throw new Error(`got regexp empty by zoneName: "${e}"`);return from(o).pipe(concatMap(e=>runOcr(a.path,e).pipe(mapTo(!0),catchError(()=>of(!0)))),concatMap(()=>retrieveKeyValuesFromOcrResult(a.path+".txt",p,prepareContent).pipe(map(t=>({fieldName:e,value:t})))),skipWhile((e,t)=>{const r=validateZoneImgRow(e.fieldName,e.value);return!r&&t!==s}),take(1),map(e=>("string"!=typeof e.value&&(e.value=""),e)),map(processZoneImgRow))}export{recognize}; | ||
//# sourceMappingURL=bvocr.esm.min.js.map |
/** | ||
* bank-voucher-ocr | ||
* 银行凭证扫描处理 | ||
* Bank Voucher ocr by tesseract and retrieve fields | ||
* | ||
* @version 0.1.0 | ||
* @version 0.1.1 | ||
* @author waiting | ||
@@ -103,3 +103,3 @@ * @license MIT | ||
const initialBaseTmpDir = path.join(os.tmpdir(), 'vocher-ocr'); | ||
const initialBaseTmpDir = path.join(os.tmpdir(), 'voucher-ocr'); | ||
const initialResizeImgDir = path.join(initialBaseTmpDir, 'resize'); // store result images | ||
@@ -167,3 +167,3 @@ const initialSplitTmpDir = path.join(initialBaseTmpDir, 'split'); // store temp split images to ocr | ||
} | ||
// split one vocher item from a page and save it | ||
// split one voucher item from a page and save it | ||
function parseSplitPage(options) { | ||
@@ -341,7 +341,7 @@ const { index, srcPath, pageHeight } = options; | ||
function recognize(options) { | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, vocherConfigMap, } = options; | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, voucherConfigMap, } = options; | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir; | ||
const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir; | ||
const resizeDir = resizeImgDir ? resizeImgDir : initialResizeImgDir; | ||
const bankRegexpOptsMap = getBankRegexpOpts(vocherConfigMap); | ||
const bankRegexpOptsMap = getBankRegexpOpts(voucherConfigMap); | ||
const bankOpts = { | ||
@@ -357,5 +357,5 @@ baseDir, | ||
!!debug && console.info('start split page'); | ||
return splitPageToImgs(pagePath, bankName, splitDir, vocherConfigMap) | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMap) | ||
}), operators.concatMap(({ bankName, imgFile }) => { | ||
const ocrFields = getOcrFields(bankName, vocherConfigMap); | ||
const ocrFields = getOcrFields(bankName, voucherConfigMap); | ||
@@ -373,3 +373,3 @@ if (!ocrFields) { | ||
ocrFields, | ||
vocherConfigMap, | ||
voucherConfigMap, | ||
}; | ||
@@ -432,4 +432,4 @@ | ||
// 切分页面为多张凭证 | ||
function splitPageToImgs(pagePath, bankName, targetDir, vocherConfigMap) { | ||
const config = vocherConfigMap.get(bankName); | ||
function splitPageToImgs(pagePath, bankName, targetDir, voucherConfigMap) { | ||
const config = voucherConfigMap.get(bankName); | ||
@@ -449,5 +449,5 @@ if (!config) { | ||
function recognizeFields(options) { | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, vocherConfigMap, } = options; | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options; | ||
const zoneTmpDir = path.join(baseDir, '/zone/', Math.random().toString()); | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, vocherConfigMap); | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap); | ||
@@ -454,0 +454,0 @@ // console.info('recognize single image:', zoneTmpDir, imgFile.path) |
import { join, tmpdir } from '../shared/index' | ||
export const initialBaseTmpDir = join(tmpdir(), 'vocher-ocr') | ||
export const initialBaseTmpDir = join(tmpdir(), 'voucher-ocr') | ||
export const initialResizeImgDir = join(initialBaseTmpDir, 'resize') // store result images | ||
@@ -4,0 +4,0 @@ export const initialSplitTmpDir = join(initialBaseTmpDir, 'split') // store temp split images to ocr |
import { Observable } from 'rxjs'; | ||
import { Filename, ImgFileInfo, VocherConfig } from './model'; | ||
export declare function splitPagetoItems(srcPath: string, targetDir: string, itemConfig: VocherConfig): Observable<Map<Filename, ImgFileInfo>>; | ||
import { Filename, ImgFileInfo, VoucherConfig } from './model'; | ||
export declare function splitPagetoItems(srcPath: string, targetDir: string, itemConfig: VoucherConfig): Observable<Map<Filename, ImgFileInfo>>; | ||
export declare function resizeAndSaveImg(srcPath: string, targetPath: string, scale: number, // 0-1 | ||
quality: number): Observable<ImgFileInfo>; |
@@ -65,3 +65,3 @@ import { crop, info as getImgInfo, resize } from 'easyimage' | ||
} | ||
// split one vocher item from a page and save it | ||
// split one voucher item from a page and save it | ||
function parseSplitPage(options) { | ||
@@ -68,0 +68,0 @@ const { index, srcPath, pageHeight } = options |
@@ -5,16 +5,14 @@ import * as moment_ from 'moment' | ||
import { createDir, isFileExists, join, rimraf, unlinkAsync } from '../shared/index' | ||
import { initialBaseTmpDir, initialResizeImgDir, initialSplitTmpDir, } from './config' | ||
import { resizeAndSaveImg, splitPagetoItems, } from './img-process' | ||
import { initialBaseTmpDir, initialResizeImgDir, initialSplitTmpDir } from './config' | ||
import { resizeAndSaveImg, splitPagetoItems } from './img-process' | ||
import { cropImgAllZones, cropImgZone, getOcrZoneOptsByBankName, runOcr } from './ocr-process' | ||
import { | ||
// batchRetrieveValuesFromZones, | ||
getRegexpOptsByName, prepareContent, retrieveKeyValuesFromOcrResult, } from './txt-process' | ||
import { getRegexpOptsByName, prepareContent, retrieveKeyValuesFromOcrResult } from './txt-process' | ||
const moment = moment_ | ||
export function recognize(options) { | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, vocherConfigMap, } = options | ||
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, voucherConfigMap, } = options | ||
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir | ||
const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir | ||
const resizeDir = resizeImgDir ? resizeImgDir : initialResizeImgDir | ||
const bankRegexpOptsMap = getBankRegexpOpts(vocherConfigMap) | ||
const bankRegexpOptsMap = getBankRegexpOpts(voucherConfigMap) | ||
const bankOpts = { | ||
@@ -30,5 +28,5 @@ baseDir, | ||
!!debug && console.info('start split page') | ||
return splitPageToImgs(pagePath, bankName, splitDir, vocherConfigMap) | ||
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMap) | ||
}), concatMap(({ bankName, imgFile }) => { | ||
const ocrFields = getOcrFields(bankName, vocherConfigMap) | ||
const ocrFields = getOcrFields(bankName, voucherConfigMap) | ||
@@ -46,3 +44,3 @@ if (!ocrFields) { | ||
ocrFields, | ||
vocherConfigMap, | ||
voucherConfigMap, | ||
} | ||
@@ -105,4 +103,4 @@ | ||
// 切分页面为多张凭证 | ||
function splitPageToImgs(pagePath, bankName, targetDir, vocherConfigMap) { | ||
const config = vocherConfigMap.get(bankName) | ||
function splitPageToImgs(pagePath, bankName, targetDir, voucherConfigMap) { | ||
const config = voucherConfigMap.get(bankName) | ||
@@ -122,5 +120,5 @@ if (!config) { | ||
function recognizeFields(options) { | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, vocherConfigMap, } = options | ||
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options | ||
const zoneTmpDir = join(baseDir, '/zone/', Math.random().toString()) | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, vocherConfigMap) | ||
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap) | ||
@@ -127,0 +125,0 @@ // console.info('recognize single image:', zoneTmpDir, imgFile.path) |
@@ -16,3 +16,3 @@ /// <reference types="node" /> | ||
splitTmpDir?: string; | ||
vocherConfigMap: VocherConfigMap; | ||
voucherConfigMap: VoucherConfigMap; | ||
} | ||
@@ -47,3 +47,3 @@ export declare const enum Actions { | ||
export declare type OcrRetInfo = Map<OcrRetInfoKey, string>; | ||
export declare type VocherConfigMap = Map<BankName, VocherConfig>; | ||
export declare type VoucherConfigMap = Map<BankName, VoucherConfig>; | ||
export interface OcrZoneRet { | ||
@@ -54,3 +54,3 @@ fieldName: FieldName; | ||
export declare type RegexpArray = ReadonlyArray<RegExp>; | ||
export interface VocherConfig { | ||
export interface VoucherConfig { | ||
bankName: BankName; | ||
@@ -104,3 +104,3 @@ width: number; | ||
index: number; | ||
itemConfig: VocherConfig; | ||
itemConfig: VoucherConfig; | ||
srcPath: string; | ||
@@ -120,3 +120,3 @@ targetDir: string; | ||
} | ||
export declare type VocherImgMap = Map<Filename, ImgFileInfo>; | ||
export declare type VoucherImgMap = Map<Filename, ImgFileInfo>; | ||
export declare type PreProcessBufferFn = (buf: Buffer) => string; | ||
@@ -139,3 +139,3 @@ export interface PageBankRet { | ||
ocrFields: OcrFields; | ||
vocherConfigMap: VocherConfigMap; | ||
voucherConfigMap: VoucherConfigMap; | ||
} | ||
@@ -142,0 +142,0 @@ export interface RecognizePageBankOpts { |
/// <reference types="node" /> | ||
import { Observable } from 'rxjs'; | ||
import { BankName, ImgFileInfo, OcrZone, VocherConfig, VocherConfigMap, ZoneImgMap } from './model'; | ||
export declare function getOcrZoneOptsByBankName(bankName: BankName, configMap: VocherConfigMap): VocherConfig | void; | ||
import { BankName, ImgFileInfo, OcrZone, VoucherConfig, VoucherConfigMap, ZoneImgMap } from './model'; | ||
export declare function getOcrZoneOptsByBankName(bankName: BankName, configMap: VoucherConfigMap): VoucherConfig | void; | ||
export declare function cropImgAllZones(srcPath: string, zoneTmpDir: string, ocrZoneOptsArr: ReadonlyArray<OcrZone>): Observable<ZoneImgMap>; | ||
export declare function cropImgZone(srcPath: string, targetDir: string, ocrZoneOpts: OcrZone): Observable<ImgFileInfo>; | ||
export declare function runOcr(path: string, lang: string): Observable<Buffer>; |
{ | ||
"name": "bank-voucher-ocr", | ||
"author": "waiting", | ||
"version": "0.1.0", | ||
"description": "银行凭证扫描处理", | ||
"keywords": [], | ||
"version": "0.1.1", | ||
"description": "Bank Voucher ocr by tesseract and retrieve fields", | ||
"keywords": [ | ||
"voucher", | ||
"rxjs", | ||
"ocr", | ||
"tesseract", | ||
"银行凭证", | ||
"银行回单" | ||
], | ||
"engines": { | ||
@@ -8,0 +15,0 @@ "node": ">=8.10.0" |
Sorry, the diff of this file is not supported yet
131394
2257