New Case Study:See how Anthropic automated 95% of dependency reviews with Socket.Learn More
Socket
Sign inDemoInstall
Socket

bank-voucher-ocr

Package Overview
Dependencies
Maintainers
1
Versions
42
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

bank-voucher-ocr - npm Package Compare versions

Comparing version 0.1.0 to 0.1.1

26

dist/bvocr.esm.js
/**
* bank-voucher-ocr
* 银行凭证扫描处理
* Bank Voucher ocr by tesseract and retrieve fields
*
* @version 0.1.0
* @version 0.1.1
* @author waiting

@@ -97,3 +97,3 @@ * @license MIT

const initialBaseTmpDir = join(tmpdir(), 'vocher-ocr');
const initialBaseTmpDir = join(tmpdir(), 'voucher-ocr');
const initialResizeImgDir = join(initialBaseTmpDir, 'resize'); // store result images

@@ -161,3 +161,3 @@ const initialSplitTmpDir = join(initialBaseTmpDir, 'split'); // store temp split images to ocr

}
// split one vocher item from a page and save it
// split one voucher item from a page and save it
function parseSplitPage(options) {

@@ -335,7 +335,7 @@ const { index, srcPath, pageHeight } = options;

function recognize(options) {
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, vocherConfigMap, } = options;
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, voucherConfigMap, } = options;
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir;
const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir;
const resizeDir = resizeImgDir ? resizeImgDir : initialResizeImgDir;
const bankRegexpOptsMap = getBankRegexpOpts(vocherConfigMap);
const bankRegexpOptsMap = getBankRegexpOpts(voucherConfigMap);
const bankOpts = {

@@ -351,5 +351,5 @@ baseDir,

!!debug && console.info('start split page');
return splitPageToImgs(pagePath, bankName, splitDir, vocherConfigMap)
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMap)
}), concatMap(({ bankName, imgFile }) => {
const ocrFields = getOcrFields(bankName, vocherConfigMap);
const ocrFields = getOcrFields(bankName, voucherConfigMap);

@@ -367,3 +367,3 @@ if (!ocrFields) {

ocrFields,
vocherConfigMap,
voucherConfigMap,
};

@@ -426,4 +426,4 @@

// 切分页面为多张凭证
function splitPageToImgs(pagePath, bankName, targetDir, vocherConfigMap) {
const config = vocherConfigMap.get(bankName);
function splitPageToImgs(pagePath, bankName, targetDir, voucherConfigMap) {
const config = voucherConfigMap.get(bankName);

@@ -443,5 +443,5 @@ if (!config) {

function recognizeFields(options) {
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, vocherConfigMap, } = options;
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options;
const zoneTmpDir = join(baseDir, '/zone/', Math.random().toString());
const bankConfig = getOcrZoneOptsByBankName(bankName, vocherConfigMap);
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap);

@@ -448,0 +448,0 @@ // console.info('recognize single image:', zoneTmpDir, imgFile.path)

/**
* bank-voucher-ocr
* 银行凭证扫描处理
* Bank Voucher ocr by tesseract and retrieve fields
*
* @version 0.1.0
* @version 0.1.1
* @author waiting

@@ -11,3 +11,3 @@ * @license MIT

import{access,chmod,close,copyFile,mkdir,open,readdir,readFile,rmdir,stat,unlink,write,writeFile}from"fs";import{join,normalize,resolve,sep}from"path";import{promisify}from"util";import{tmpdir}from"os";import{crop,info,resize}from"easyimage";import*as moment_ from"moment";import{from,of,range,defer}from"rxjs";import{concatMap,map,mergeMap,reduce,catchError,defaultIfEmpty,filter,mapTo,skipWhile,take,tap}from"rxjs/operators";import run from"rxrunscript";const closeAsync=promisify(close),chmodAsync=promisify(chmod),copyFileAsync=promisify(copyFile),mkdirAsync=promisify(mkdir),openAsync=promisify(open),readFileAsync=promisify(readFile),readDirAsync=promisify(readdir),rmdirAsync=promisify(rmdir),unlinkAsync=promisify(unlink),writeAsync=promisify(write),writeFileAsync=promisify(writeFile);function isPathAcessible(e){return e?new Promise(t=>access(e,e=>t(!e))):Promise.resolve(!1)}function isDirExists(e){return e?isDirFileExists(e,"DIR"):Promise.resolve(!1)}function isFileExists(e){return e?isDirFileExists(e,"FILE"):Promise.resolve(!1)}function isDirFileExists(e,t){return e?new Promise(r=>{stat(e,(e,i)=>r(!e&&("DIR"===t?i.isDirectory():i.isFile())))}):Promise.resolve(!1)}async function createDir(e){if(!e)throw new Error("value of path param invalid");e=normalize(e),await isDirExists(e)||await e.split(sep).reduce(async(e,t)=>{const r=resolve(await e,t);return await isPathAcessible(r)||await mkdirAsync(r,493),r},Promise.resolve(sep))}async function rimraf(e){e&&(await _rimraf(e),await rmdirAsync(e))}async function _rimraf(e){if(e&&await isPathAcessible(e)){if(await isFileExists(e))return void await unlinkAsync(e);const t=await readDirAsync(e);for(const r of t)await _rimraf(join(e,r))}}const initialBaseTmpDir=join(tmpdir(),"vocher-ocr"),initialResizeImgDir=join(initialBaseTmpDir,"resize"),initialSplitTmpDir=join(initialBaseTmpDir,"split"),moment=moment_;function splitPagetoItems(e,t,r){return readImgInfo(e).pipe(map(i=>{const n=calcItemsPerPage(i.height,r.height);return n?range(0,n).pipe(mergeMap(n=>{const a={index:n,itemConfig:Object.assign({},r),srcPath:e,targetDir:t,pageHeight:i.height};return i.width<a.itemConfig.width&&(a.itemConfig.width=i.width),parseSplitPage(a).pipe(mergeMap(e=>{const t=new Map;return t.set(e.name,e),of(t)}))})):of(new Map)}),concatMap(e=>e))}function resizeAndSaveImg(e,t,r,i){if(r<=0||r>1)throw new Error(`value of scale invalid: "${r}"`);return readImgInfo(e).pipe(mergeMap(n=>{const a={src:e,dst:t,width:n.width*r,height:n.height*r,quality:i};return from(resize(a))}),map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function parseSplitPage(e){const{index:t,srcPath:r,pageHeight:i}=e,{width:n}=e.itemConfig;let{height:a}=e.itemConfig;const o=36,s=0,p=t*a;p+a>i&&(a=i-p);const c=moment().format("YYYYMMDD"),m=join(e.targetDir,`${c}-${Math.random()}-${t}.jpg`),f={dst:m,src:r,quality:100,cropWidth:n,cropHeight:a+36,x:0,y:p};return from(crop(f)).pipe(mergeMap(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return of(t)}))}function readImgInfo(e){return from(info(e))}function calcItemsPerPage(e,t){const r=33;return e>=t?Math.floor((e+33)/t):0}function getOcrZoneOptsByBankName(e,t){return t.get(e)}function cropImgAllZones(e,t,r){return from(r).pipe(mergeMap(r=>cropImgZone(e,t,r).pipe(map(e=>[r.zoneName,e]))),reduce((e,t)=>(e.set(t[0],t[1]),e),new Map))}function cropImgZone(e,t,r){const{zoneName:i,width:n,height:a,offsetX:o,offsetY:s}=r,p=join(t,`${i}.png`),c={dst:p,src:e,quality:100,cropWidth:n,cropHeight:a,x:o,y:s};return from(crop(c)).pipe(map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function runOcr(e,t){t||(t="eng");const r=`tesseract "${e}" "${e}" -l ${t}`;return run(r)}function retrieveKeyValuesFromOcrResult(e,t,r){if(!t)throw new Error("matchRules empty");return readTxtFile(e).pipe(map(e=>({buf:e,regexp:t})),map(({buf:e,regexp:t})=>{const i=r&&"function"==typeof r?r(e):e.toString("utf8");return retrieveValueByRegexp(i,t)}))}function getRegexpOptsByName(e,t){for(const r of Object.keys(t))if(r===e)return t[r]}function retrieveValueByRegexp(e,t){return regexMatch(e,t)}function readTxtFile(e){if(!e)throw new Error("path empty");const t=e.split(".");if(t.length>1&&"txt"!==t[t.length-1].toLowerCase())throw new Error(`file extensiion must empty or be '.txt', but is: "${e}"`);return defer(async()=>readFileAsync(e))}function prepareContent(e){let t=e&&e.byteLength?e.toString():"";return t?t=t.replace(/[\t ]/g,""):""}function regexMatch(e,t){if(e)for(const r of t){const t=e.match(r);if(Array.isArray(t)&&t.length)return t[0]}}const moment$1=moment_;function recognize(e){const{bankZone:t,baseTmpDir:r,debug:i,defaultOcrLang:n,jpegQuality:a,scale:o,splitTmpDir:s,imgPath:p,resizeImgDir:c,vocherConfigMap:m}=e,f=r||initialBaseTmpDir,l=s||initialSplitTmpDir,g=c||initialResizeImgDir,u=getBankRegexpOpts(m),h={baseDir:f,path:p,bankZone:t,bankRegexpOptsMap:u,debug:!!i,lang:n},d=recognizePageBank(h).pipe(filter(({bankName:e})=>!!e&&"n/a"!==e),concatMap(({bankName:e,pagePath:t})=>(i&&console.info("start split page"),splitPageToImgs(t,e,l,m))),concatMap(({bankName:e,imgFile:t})=>{const r=getOcrFields(e,m);if(!r)throw new Error(`ocrFields not defined with bankName: "${e}"`);const n={bankName:e,baseDir:f,concurrent:2,debug:!!i,defaultValue:"",imgFile:t,ocrFields:r,vocherConfigMap:m};return i&&console.info("recognize item"),recognizeFields(n).pipe(map(r=>(r.set("bank",e),r.set("filename",t.name.trim()),r.set("path",t.path.trim()),r)))}),mergeMap(e=>{const t={retInfo:e,resizeDir:g,scale:o,jpegQuality:a,debug:!!i};return saveImgAndPrune(t)})),y=from(createDir(f)).pipe(concatMap(()=>createDir(l)),concatMap(()=>createDir(g))),w=from(isFileExists(p)).pipe(filter(e=>e));return y.pipe(mergeMap(()=>w),mergeMap(()=>d))}function recognizePageBank(e){const{baseDir:t,path:r,bankZone:i,bankRegexpOptsMap:n,debug:a,lang:o}=e,s=join(t,"zone/",Math.random().toString());return a&&console.info("recognize pageBank:",s,r),from(createDir(s)).pipe(mergeMap(()=>cropImgZone(join(r),s,i)),concatMap(e=>runOcr(e.path,o).pipe(map(()=>({path:r,zoneImgPath:e.path})),mapTo(e.path),catchError(()=>of(e.path)))),concatMap(e=>from(n.entries()).pipe(concatMap(([t,r])=>retrieveKeyValuesFromOcrResult(e+".txt",r,e=>e.toString()).pipe(map(e=>({bankName:t,value:e})))),skipWhile(({value:e})=>"undefined"==typeof e||"string"==typeof e&&!e.length),take(1),map(({bankName:e})=>({bankName:e,pagePath:r})),defaultIfEmpty({bankName:"n/a",pagePath:""}))),tap(e=>{const{bankName:t,pagePath:i}=e;"n/a"!==t&&i||console.info(`recognize bank of page fail. no matached regexp. file: "${r}", pagePath: "${i}" `),a||rimraf(s).catch(console.info)}))}function splitPageToImgs(e,t,r,i){const n=i.get(t);if(!n)throw new Error("bank config empty during split page to images");return splitPagetoItems(e,r,n).pipe(mergeMap(e=>{const r=from(e.values()).pipe(map(e=>({bankName:t,imgFile:e})));return r}))}function recognizeFields(e){const{bankName:t,baseDir:r,concurrent:i,debug:n,defaultValue:a,imgFile:o,ocrFields:s,vocherConfigMap:p}=e,c=join(r,"/zone/",Math.random().toString()),m=getOcrZoneOptsByBankName(t,p);if(!m)throw new Error(`get bankConfig empty with bankName: "${t}"`);const f=from(createDir(c)).pipe(mergeMap(()=>cropImgAllZones(o.path,c,m.ocrZones)),concatMap(e=>batchOcrAndRetrieve(e,m,s,a,i)),tap(()=>n||rimraf(c).catch(console.info)));return f}function batchOcrAndRetrieve(e,t,r,i="",n=2){const{bankName:a}=t;return from(e.entries()).pipe(concatMap(e=>ocrAndPickFromZoneImg(e,t,n)),reduce((e,t)=>e.set(t.fieldName,t.value),new Map),map(e=>e.set("bank",a)),map(e=>setDefaultValue(e,r,i)))}function setDefaultValue(e,t,r=""){const i=new Map;for(const r of Object.keys(t)){const t=e.get(r);"string"==typeof t?i.set(r,t):i.set(r,"")}return i}function processZoneImgRow(e){const{fieldName:t,value:r}=e,i={fieldName:t,value:r};switch(t){case"amount":i.value=r.trim().replace(/,/g,"");break;case"date":i.value=r.trim().replace(/\D/g,"");break;case"sn":i.value=r.trim()}return i}function validateZoneImgRow(e,t){if("string"!=typeof t)return!1;switch(e){case"amount":if(validateRetInfoAmout(t))return!0;break;case"sn":if(t)return!0;break;case"date":if(validateRetInfoDate(t))return!0;break;case"bank":return!0;default:if("string"==typeof t)return!0}return!1}function validateRetInfoAmout(e){if(!e)return!1;if(!e.trim())return!1;const t=parseFloat(e);return!Number.isNaN(t)&&"number"==typeof t}function validateRetInfoDate(e){return!!e&&moment$1(e,"YYYYMMDD").isValid()}function getBankRegexpOpts(e){const t=new Map;for(const{bankName:r,regexpOpts:i}of e.values())i&&i.bank&&t.set(r,i.bank);if(!t.size)throw new Error("not BankRegexpOpts found, should not set");return t}function getOcrFields(e,t){const r=t.get(e);if(!r)throw new Error(`get ocrFields empty by bankName: "${e}"`);return r.ocrFields}function saveImgAndPrune(e){const{retInfo:t,resizeDir:r,debug:i,scale:n,jpegQuality:a}=e,o=t.get("filename"),s=t.get("path"),p=t.get("sn");if(!o)throw new Error(`result info map invalid with empty path. info: ${t}`);if(!s)throw new Error(`result info map invalid with empty path. info: ${t}`);const c=p?`${(new Date).getTime()}-${p.replace(/[^\d\w]/g,"_")}.jpg`:o,m=moment$1().format("YYYY-MM-DD"),f=join(r,m,c);return t.set("filename",c),resizeAndSaveImg(s,f,n,a).pipe(map(e=>(t.set("path",e.path),t)),tap(()=>{i||unlinkAsync(s).catch(console.info)}))}function genFieldLangs(e,t,r){return r&&"undefined"!=typeof r[e]&&Array.isArray(r[e])?r[e]:t}function ocrAndPickFromZoneImg(e,t,r=2){const{ocrDefaultLangs:i,ocrFieldLangs:n,regexpOpts:a,ocrFields:o}=t;return from(Object.entries(o)).pipe(filter(t=>{const r=t[1];return!!r&&r===e[0]}),mergeMap(t=>{const r=t[0];return ocrAndPickFieldFromZoneImg(r,e,a,i,n)},r))}function ocrAndPickFieldFromZoneImg(e,t,r,i,n){const[,a]=t,o=genFieldLangs(e,i,n),s=o.length-1,p=getRegexpOptsByName(e,r);if(!p)throw new Error(`got regexp empty by zoneName: "${e}"`);return from(o).pipe(concatMap(e=>runOcr(a.path,e).pipe(mapTo(!0),catchError(()=>of(!0)))),concatMap(()=>retrieveKeyValuesFromOcrResult(a.path+".txt",p,prepareContent).pipe(map(t=>({fieldName:e,value:t})))),skipWhile((e,t)=>{const r=validateZoneImgRow(e.fieldName,e.value);return!r&&t!==s}),take(1),map(e=>("string"!=typeof e.value&&(e.value=""),e)),map(processZoneImgRow))}export{recognize};
import{access,chmod,close,copyFile,mkdir,open,readdir,readFile,rmdir,stat,unlink,write,writeFile}from"fs";import{join,normalize,resolve,sep}from"path";import{promisify}from"util";import{tmpdir}from"os";import{crop,info,resize}from"easyimage";import*as moment_ from"moment";import{from,of,range,defer}from"rxjs";import{concatMap,map,mergeMap,reduce,catchError,defaultIfEmpty,filter,mapTo,skipWhile,take,tap}from"rxjs/operators";import run from"rxrunscript";const closeAsync=promisify(close),chmodAsync=promisify(chmod),copyFileAsync=promisify(copyFile),mkdirAsync=promisify(mkdir),openAsync=promisify(open),readFileAsync=promisify(readFile),readDirAsync=promisify(readdir),rmdirAsync=promisify(rmdir),unlinkAsync=promisify(unlink),writeAsync=promisify(write),writeFileAsync=promisify(writeFile);function isPathAcessible(e){return e?new Promise(t=>access(e,e=>t(!e))):Promise.resolve(!1)}function isDirExists(e){return e?isDirFileExists(e,"DIR"):Promise.resolve(!1)}function isFileExists(e){return e?isDirFileExists(e,"FILE"):Promise.resolve(!1)}function isDirFileExists(e,t){return e?new Promise(r=>{stat(e,(e,i)=>r(!e&&("DIR"===t?i.isDirectory():i.isFile())))}):Promise.resolve(!1)}async function createDir(e){if(!e)throw new Error("value of path param invalid");e=normalize(e),await isDirExists(e)||await e.split(sep).reduce(async(e,t)=>{const r=resolve(await e,t);return await isPathAcessible(r)||await mkdirAsync(r,493),r},Promise.resolve(sep))}async function rimraf(e){e&&(await _rimraf(e),await rmdirAsync(e))}async function _rimraf(e){if(e&&await isPathAcessible(e)){if(await isFileExists(e))return void await unlinkAsync(e);const t=await readDirAsync(e);for(const r of t)await _rimraf(join(e,r))}}const initialBaseTmpDir=join(tmpdir(),"voucher-ocr"),initialResizeImgDir=join(initialBaseTmpDir,"resize"),initialSplitTmpDir=join(initialBaseTmpDir,"split"),moment=moment_;function splitPagetoItems(e,t,r){return readImgInfo(e).pipe(map(i=>{const n=calcItemsPerPage(i.height,r.height);return n?range(0,n).pipe(mergeMap(n=>{const a={index:n,itemConfig:Object.assign({},r),srcPath:e,targetDir:t,pageHeight:i.height};return i.width<a.itemConfig.width&&(a.itemConfig.width=i.width),parseSplitPage(a).pipe(mergeMap(e=>{const t=new Map;return t.set(e.name,e),of(t)}))})):of(new Map)}),concatMap(e=>e))}function resizeAndSaveImg(e,t,r,i){if(r<=0||r>1)throw new Error(`value of scale invalid: "${r}"`);return readImgInfo(e).pipe(mergeMap(n=>{const a={src:e,dst:t,width:n.width*r,height:n.height*r,quality:i};return from(resize(a))}),map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function parseSplitPage(e){const{index:t,srcPath:r,pageHeight:i}=e,{width:n}=e.itemConfig;let{height:a}=e.itemConfig;const o=36,s=0,p=t*a;p+a>i&&(a=i-p);const c=moment().format("YYYYMMDD"),m=join(e.targetDir,`${c}-${Math.random()}-${t}.jpg`),f={dst:m,src:r,quality:100,cropWidth:n,cropHeight:a+36,x:0,y:p};return from(crop(f)).pipe(mergeMap(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return of(t)}))}function readImgInfo(e){return from(info(e))}function calcItemsPerPage(e,t){const r=33;return e>=t?Math.floor((e+33)/t):0}function getOcrZoneOptsByBankName(e,t){return t.get(e)}function cropImgAllZones(e,t,r){return from(r).pipe(mergeMap(r=>cropImgZone(e,t,r).pipe(map(e=>[r.zoneName,e]))),reduce((e,t)=>(e.set(t[0],t[1]),e),new Map))}function cropImgZone(e,t,r){const{zoneName:i,width:n,height:a,offsetX:o,offsetY:s}=r,p=join(t,`${i}.png`),c={dst:p,src:e,quality:100,cropWidth:n,cropHeight:a,x:o,y:s};return from(crop(c)).pipe(map(e=>{const t={name:e.name,path:e.path,width:e.width,height:e.height,size:e.size};return t}))}function runOcr(e,t){t||(t="eng");const r=`tesseract "${e}" "${e}" -l ${t}`;return run(r)}function retrieveKeyValuesFromOcrResult(e,t,r){if(!t)throw new Error("matchRules empty");return readTxtFile(e).pipe(map(e=>({buf:e,regexp:t})),map(({buf:e,regexp:t})=>{const i=r&&"function"==typeof r?r(e):e.toString("utf8");return retrieveValueByRegexp(i,t)}))}function getRegexpOptsByName(e,t){for(const r of Object.keys(t))if(r===e)return t[r]}function retrieveValueByRegexp(e,t){return regexMatch(e,t)}function readTxtFile(e){if(!e)throw new Error("path empty");const t=e.split(".");if(t.length>1&&"txt"!==t[t.length-1].toLowerCase())throw new Error(`file extensiion must empty or be '.txt', but is: "${e}"`);return defer(async()=>readFileAsync(e))}function prepareContent(e){let t=e&&e.byteLength?e.toString():"";return t?t=t.replace(/[\t ]/g,""):""}function regexMatch(e,t){if(e)for(const r of t){const t=e.match(r);if(Array.isArray(t)&&t.length)return t[0]}}const moment$1=moment_;function recognize(e){const{bankZone:t,baseTmpDir:r,debug:i,defaultOcrLang:n,jpegQuality:a,scale:o,splitTmpDir:s,imgPath:p,resizeImgDir:c,voucherConfigMap:m}=e,f=r||initialBaseTmpDir,u=s||initialSplitTmpDir,l=c||initialResizeImgDir,g=getBankRegexpOpts(m),h={baseDir:f,path:p,bankZone:t,bankRegexpOptsMap:g,debug:!!i,lang:n},d=recognizePageBank(h).pipe(filter(({bankName:e})=>!!e&&"n/a"!==e),concatMap(({bankName:e,pagePath:t})=>(i&&console.info("start split page"),splitPageToImgs(t,e,u,m))),concatMap(({bankName:e,imgFile:t})=>{const r=getOcrFields(e,m);if(!r)throw new Error(`ocrFields not defined with bankName: "${e}"`);const n={bankName:e,baseDir:f,concurrent:2,debug:!!i,defaultValue:"",imgFile:t,ocrFields:r,voucherConfigMap:m};return i&&console.info("recognize item"),recognizeFields(n).pipe(map(r=>(r.set("bank",e),r.set("filename",t.name.trim()),r.set("path",t.path.trim()),r)))}),mergeMap(e=>{const t={retInfo:e,resizeDir:l,scale:o,jpegQuality:a,debug:!!i};return saveImgAndPrune(t)})),y=from(createDir(f)).pipe(concatMap(()=>createDir(u)),concatMap(()=>createDir(l))),w=from(isFileExists(p)).pipe(filter(e=>e));return y.pipe(mergeMap(()=>w),mergeMap(()=>d))}function recognizePageBank(e){const{baseDir:t,path:r,bankZone:i,bankRegexpOptsMap:n,debug:a,lang:o}=e,s=join(t,"zone/",Math.random().toString());return a&&console.info("recognize pageBank:",s,r),from(createDir(s)).pipe(mergeMap(()=>cropImgZone(join(r),s,i)),concatMap(e=>runOcr(e.path,o).pipe(map(()=>({path:r,zoneImgPath:e.path})),mapTo(e.path),catchError(()=>of(e.path)))),concatMap(e=>from(n.entries()).pipe(concatMap(([t,r])=>retrieveKeyValuesFromOcrResult(e+".txt",r,e=>e.toString()).pipe(map(e=>({bankName:t,value:e})))),skipWhile(({value:e})=>"undefined"==typeof e||"string"==typeof e&&!e.length),take(1),map(({bankName:e})=>({bankName:e,pagePath:r})),defaultIfEmpty({bankName:"n/a",pagePath:""}))),tap(e=>{const{bankName:t,pagePath:i}=e;"n/a"!==t&&i||console.info(`recognize bank of page fail. no matached regexp. file: "${r}", pagePath: "${i}" `),a||rimraf(s).catch(console.info)}))}function splitPageToImgs(e,t,r,i){const n=i.get(t);if(!n)throw new Error("bank config empty during split page to images");return splitPagetoItems(e,r,n).pipe(mergeMap(e=>{const r=from(e.values()).pipe(map(e=>({bankName:t,imgFile:e})));return r}))}function recognizeFields(e){const{bankName:t,baseDir:r,concurrent:i,debug:n,defaultValue:a,imgFile:o,ocrFields:s,voucherConfigMap:p}=e,c=join(r,"/zone/",Math.random().toString()),m=getOcrZoneOptsByBankName(t,p);if(!m)throw new Error(`get bankConfig empty with bankName: "${t}"`);const f=from(createDir(c)).pipe(mergeMap(()=>cropImgAllZones(o.path,c,m.ocrZones)),concatMap(e=>batchOcrAndRetrieve(e,m,s,a,i)),tap(()=>n||rimraf(c).catch(console.info)));return f}function batchOcrAndRetrieve(e,t,r,i="",n=2){const{bankName:a}=t;return from(e.entries()).pipe(concatMap(e=>ocrAndPickFromZoneImg(e,t,n)),reduce((e,t)=>e.set(t.fieldName,t.value),new Map),map(e=>e.set("bank",a)),map(e=>setDefaultValue(e,r,i)))}function setDefaultValue(e,t,r=""){const i=new Map;for(const r of Object.keys(t)){const t=e.get(r);"string"==typeof t?i.set(r,t):i.set(r,"")}return i}function processZoneImgRow(e){const{fieldName:t,value:r}=e,i={fieldName:t,value:r};switch(t){case"amount":i.value=r.trim().replace(/,/g,"");break;case"date":i.value=r.trim().replace(/\D/g,"");break;case"sn":i.value=r.trim()}return i}function validateZoneImgRow(e,t){if("string"!=typeof t)return!1;switch(e){case"amount":if(validateRetInfoAmout(t))return!0;break;case"sn":if(t)return!0;break;case"date":if(validateRetInfoDate(t))return!0;break;case"bank":return!0;default:if("string"==typeof t)return!0}return!1}function validateRetInfoAmout(e){if(!e)return!1;if(!e.trim())return!1;const t=parseFloat(e);return!Number.isNaN(t)&&"number"==typeof t}function validateRetInfoDate(e){return!!e&&moment$1(e,"YYYYMMDD").isValid()}function getBankRegexpOpts(e){const t=new Map;for(const{bankName:r,regexpOpts:i}of e.values())i&&i.bank&&t.set(r,i.bank);if(!t.size)throw new Error("not BankRegexpOpts found, should not set");return t}function getOcrFields(e,t){const r=t.get(e);if(!r)throw new Error(`get ocrFields empty by bankName: "${e}"`);return r.ocrFields}function saveImgAndPrune(e){const{retInfo:t,resizeDir:r,debug:i,scale:n,jpegQuality:a}=e,o=t.get("filename"),s=t.get("path"),p=t.get("sn");if(!o)throw new Error(`result info map invalid with empty path. info: ${t}`);if(!s)throw new Error(`result info map invalid with empty path. info: ${t}`);const c=p?`${(new Date).getTime()}-${p.replace(/[^\d\w]/g,"_")}.jpg`:o,m=moment$1().format("YYYY-MM-DD"),f=join(r,m,c);return t.set("filename",c),resizeAndSaveImg(s,f,n,a).pipe(map(e=>(t.set("path",e.path),t)),tap(()=>{i||unlinkAsync(s).catch(console.info)}))}function genFieldLangs(e,t,r){return r&&"undefined"!=typeof r[e]&&Array.isArray(r[e])?r[e]:t}function ocrAndPickFromZoneImg(e,t,r=2){const{ocrDefaultLangs:i,ocrFieldLangs:n,regexpOpts:a,ocrFields:o}=t;return from(Object.entries(o)).pipe(filter(t=>{const r=t[1];return!!r&&r===e[0]}),mergeMap(t=>{const r=t[0];return ocrAndPickFieldFromZoneImg(r,e,a,i,n)},r))}function ocrAndPickFieldFromZoneImg(e,t,r,i,n){const[,a]=t,o=genFieldLangs(e,i,n),s=o.length-1,p=getRegexpOptsByName(e,r);if(!p)throw new Error(`got regexp empty by zoneName: "${e}"`);return from(o).pipe(concatMap(e=>runOcr(a.path,e).pipe(mapTo(!0),catchError(()=>of(!0)))),concatMap(()=>retrieveKeyValuesFromOcrResult(a.path+".txt",p,prepareContent).pipe(map(t=>({fieldName:e,value:t})))),skipWhile((e,t)=>{const r=validateZoneImgRow(e.fieldName,e.value);return!r&&t!==s}),take(1),map(e=>("string"!=typeof e.value&&(e.value=""),e)),map(processZoneImgRow))}export{recognize};
//# sourceMappingURL=bvocr.esm.min.js.map
/**
* bank-voucher-ocr
* 银行凭证扫描处理
* Bank Voucher ocr by tesseract and retrieve fields
*
* @version 0.1.0
* @version 0.1.1
* @author waiting

@@ -103,3 +103,3 @@ * @license MIT

const initialBaseTmpDir = path.join(os.tmpdir(), 'vocher-ocr');
const initialBaseTmpDir = path.join(os.tmpdir(), 'voucher-ocr');
const initialResizeImgDir = path.join(initialBaseTmpDir, 'resize'); // store result images

@@ -167,3 +167,3 @@ const initialSplitTmpDir = path.join(initialBaseTmpDir, 'split'); // store temp split images to ocr

}
// split one vocher item from a page and save it
// split one voucher item from a page and save it
function parseSplitPage(options) {

@@ -341,7 +341,7 @@ const { index, srcPath, pageHeight } = options;

function recognize(options) {
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, vocherConfigMap, } = options;
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, voucherConfigMap, } = options;
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir;
const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir;
const resizeDir = resizeImgDir ? resizeImgDir : initialResizeImgDir;
const bankRegexpOptsMap = getBankRegexpOpts(vocherConfigMap);
const bankRegexpOptsMap = getBankRegexpOpts(voucherConfigMap);
const bankOpts = {

@@ -357,5 +357,5 @@ baseDir,

!!debug && console.info('start split page');
return splitPageToImgs(pagePath, bankName, splitDir, vocherConfigMap)
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMap)
}), operators.concatMap(({ bankName, imgFile }) => {
const ocrFields = getOcrFields(bankName, vocherConfigMap);
const ocrFields = getOcrFields(bankName, voucherConfigMap);

@@ -373,3 +373,3 @@ if (!ocrFields) {

ocrFields,
vocherConfigMap,
voucherConfigMap,
};

@@ -432,4 +432,4 @@

// 切分页面为多张凭证
function splitPageToImgs(pagePath, bankName, targetDir, vocherConfigMap) {
const config = vocherConfigMap.get(bankName);
function splitPageToImgs(pagePath, bankName, targetDir, voucherConfigMap) {
const config = voucherConfigMap.get(bankName);

@@ -449,5 +449,5 @@ if (!config) {

function recognizeFields(options) {
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, vocherConfigMap, } = options;
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options;
const zoneTmpDir = path.join(baseDir, '/zone/', Math.random().toString());
const bankConfig = getOcrZoneOptsByBankName(bankName, vocherConfigMap);
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap);

@@ -454,0 +454,0 @@ // console.info('recognize single image:', zoneTmpDir, imgFile.path)

import { join, tmpdir } from '../shared/index'
export const initialBaseTmpDir = join(tmpdir(), 'vocher-ocr')
export const initialBaseTmpDir = join(tmpdir(), 'voucher-ocr')
export const initialResizeImgDir = join(initialBaseTmpDir, 'resize') // store result images

@@ -4,0 +4,0 @@ export const initialSplitTmpDir = join(initialBaseTmpDir, 'split') // store temp split images to ocr

import { Observable } from 'rxjs';
import { Filename, ImgFileInfo, VocherConfig } from './model';
export declare function splitPagetoItems(srcPath: string, targetDir: string, itemConfig: VocherConfig): Observable<Map<Filename, ImgFileInfo>>;
import { Filename, ImgFileInfo, VoucherConfig } from './model';
export declare function splitPagetoItems(srcPath: string, targetDir: string, itemConfig: VoucherConfig): Observable<Map<Filename, ImgFileInfo>>;
export declare function resizeAndSaveImg(srcPath: string, targetPath: string, scale: number, // 0-1
quality: number): Observable<ImgFileInfo>;

@@ -65,3 +65,3 @@ import { crop, info as getImgInfo, resize } from 'easyimage'

}
// split one vocher item from a page and save it
// split one voucher item from a page and save it
function parseSplitPage(options) {

@@ -68,0 +68,0 @@ const { index, srcPath, pageHeight } = options

@@ -5,16 +5,14 @@ import * as moment_ from 'moment'

import { createDir, isFileExists, join, rimraf, unlinkAsync } from '../shared/index'
import { initialBaseTmpDir, initialResizeImgDir, initialSplitTmpDir, } from './config'
import { resizeAndSaveImg, splitPagetoItems, } from './img-process'
import { initialBaseTmpDir, initialResizeImgDir, initialSplitTmpDir } from './config'
import { resizeAndSaveImg, splitPagetoItems } from './img-process'
import { cropImgAllZones, cropImgZone, getOcrZoneOptsByBankName, runOcr } from './ocr-process'
import {
// batchRetrieveValuesFromZones,
getRegexpOptsByName, prepareContent, retrieveKeyValuesFromOcrResult, } from './txt-process'
import { getRegexpOptsByName, prepareContent, retrieveKeyValuesFromOcrResult } from './txt-process'
const moment = moment_
export function recognize(options) {
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, vocherConfigMap, } = options
const { bankZone, baseTmpDir, debug, defaultOcrLang, jpegQuality, scale, splitTmpDir, imgPath, resizeImgDir, voucherConfigMap, } = options
const baseDir = baseTmpDir ? baseTmpDir : initialBaseTmpDir
const splitDir = splitTmpDir ? splitTmpDir : initialSplitTmpDir
const resizeDir = resizeImgDir ? resizeImgDir : initialResizeImgDir
const bankRegexpOptsMap = getBankRegexpOpts(vocherConfigMap)
const bankRegexpOptsMap = getBankRegexpOpts(voucherConfigMap)
const bankOpts = {

@@ -30,5 +28,5 @@ baseDir,

!!debug && console.info('start split page')
return splitPageToImgs(pagePath, bankName, splitDir, vocherConfigMap)
return splitPageToImgs(pagePath, bankName, splitDir, voucherConfigMap)
}), concatMap(({ bankName, imgFile }) => {
const ocrFields = getOcrFields(bankName, vocherConfigMap)
const ocrFields = getOcrFields(bankName, voucherConfigMap)

@@ -46,3 +44,3 @@ if (!ocrFields) {

ocrFields,
vocherConfigMap,
voucherConfigMap,
}

@@ -105,4 +103,4 @@

// 切分页面为多张凭证
function splitPageToImgs(pagePath, bankName, targetDir, vocherConfigMap) {
const config = vocherConfigMap.get(bankName)
function splitPageToImgs(pagePath, bankName, targetDir, voucherConfigMap) {
const config = voucherConfigMap.get(bankName)

@@ -122,5 +120,5 @@ if (!config) {

function recognizeFields(options) {
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, vocherConfigMap, } = options
const { bankName, baseDir, concurrent, debug, defaultValue, imgFile, ocrFields, voucherConfigMap, } = options
const zoneTmpDir = join(baseDir, '/zone/', Math.random().toString())
const bankConfig = getOcrZoneOptsByBankName(bankName, vocherConfigMap)
const bankConfig = getOcrZoneOptsByBankName(bankName, voucherConfigMap)

@@ -127,0 +125,0 @@ // console.info('recognize single image:', zoneTmpDir, imgFile.path)

@@ -16,3 +16,3 @@ /// <reference types="node" />

splitTmpDir?: string;
vocherConfigMap: VocherConfigMap;
voucherConfigMap: VoucherConfigMap;
}

@@ -47,3 +47,3 @@ export declare const enum Actions {

export declare type OcrRetInfo = Map<OcrRetInfoKey, string>;
export declare type VocherConfigMap = Map<BankName, VocherConfig>;
export declare type VoucherConfigMap = Map<BankName, VoucherConfig>;
export interface OcrZoneRet {

@@ -54,3 +54,3 @@ fieldName: FieldName;

export declare type RegexpArray = ReadonlyArray<RegExp>;
export interface VocherConfig {
export interface VoucherConfig {
bankName: BankName;

@@ -104,3 +104,3 @@ width: number;

index: number;
itemConfig: VocherConfig;
itemConfig: VoucherConfig;
srcPath: string;

@@ -120,3 +120,3 @@ targetDir: string;

}
export declare type VocherImgMap = Map<Filename, ImgFileInfo>;
export declare type VoucherImgMap = Map<Filename, ImgFileInfo>;
export declare type PreProcessBufferFn = (buf: Buffer) => string;

@@ -139,3 +139,3 @@ export interface PageBankRet {

ocrFields: OcrFields;
vocherConfigMap: VocherConfigMap;
voucherConfigMap: VoucherConfigMap;
}

@@ -142,0 +142,0 @@ export interface RecognizePageBankOpts {

/// <reference types="node" />
import { Observable } from 'rxjs';
import { BankName, ImgFileInfo, OcrZone, VocherConfig, VocherConfigMap, ZoneImgMap } from './model';
export declare function getOcrZoneOptsByBankName(bankName: BankName, configMap: VocherConfigMap): VocherConfig | void;
import { BankName, ImgFileInfo, OcrZone, VoucherConfig, VoucherConfigMap, ZoneImgMap } from './model';
export declare function getOcrZoneOptsByBankName(bankName: BankName, configMap: VoucherConfigMap): VoucherConfig | void;
export declare function cropImgAllZones(srcPath: string, zoneTmpDir: string, ocrZoneOptsArr: ReadonlyArray<OcrZone>): Observable<ZoneImgMap>;
export declare function cropImgZone(srcPath: string, targetDir: string, ocrZoneOpts: OcrZone): Observable<ImgFileInfo>;
export declare function runOcr(path: string, lang: string): Observable<Buffer>;
{
"name": "bank-voucher-ocr",
"author": "waiting",
"version": "0.1.0",
"description": "银行凭证扫描处理",
"keywords": [],
"version": "0.1.1",
"description": "Bank Voucher ocr by tesseract and retrieve fields",
"keywords": [
"voucher",
"rxjs",
"ocr",
"tesseract",
"银行凭证",
"银行回单"
],
"engines": {

@@ -8,0 +15,0 @@ "node": ">=8.10.0"

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc