ocr-document-classification
Advanced tools
Comparing version 1.0.9 to 1.0.10
@@ -45,3 +45,6 @@ "use strict"; | ||
targetWords.forEach((targetWord) => { | ||
const similarity = (0, import_string_similarity_js.stringSimilarity)(docWord.toLowerCase(), targetWord.toLowerCase()); | ||
const similarity = (0, import_string_similarity_js.stringSimilarity)( | ||
docWord.toLowerCase(), | ||
targetWord.toLowerCase() | ||
); | ||
if (similarity >= threshold) { | ||
@@ -55,3 +58,3 @@ foundTargetWords.push(targetWord); | ||
var defaultDocumentDictionary = { | ||
BEVISPAFORSTEGANGSTJENESTE: [ | ||
"BEVIS P\xC5 F\xD8RSTEGANGSTJENESTE": [ | ||
["f\xF8rstegangstjeneste", "bevis", "avtjent"], | ||
@@ -63,6 +66,9 @@ ["attest", "f\xF8rstegangstjeneste"], | ||
KOMPETANSEBEVIS: [["omfatter", "oppl\xE6ring", "utdanningsprogram"]], | ||
LEGEERKLERING: [["legeerkl\xE6ring", "f\xF8dselsnummer"]] | ||
LEGEERKL\u00C6RING: [["legeerkl\xE6ring", "f\xF8dselsnummer"]] | ||
}; | ||
async function classifyDocument(file, onProgress, customDocumentDictionary) { | ||
const documentDictionary = { ...defaultDocumentDictionary, ...customDocumentDictionary }; | ||
const documentDictionary = { | ||
...defaultDocumentDictionary, | ||
...customDocumentDictionary | ||
}; | ||
let progress = 0; | ||
@@ -86,10 +92,17 @@ const worker = await Tesseract.createWorker(["noreng"], 1, { | ||
if (onProgress) onProgress(progress); | ||
const { data: { text } } = await worker.recognize(imageDataUrl, { | ||
const { | ||
data: { text } | ||
} = await worker.recognize(imageDataUrl, { | ||
rotateAuto: true | ||
}); | ||
const targetWords = Array.from(new Set(Object.values(documentDictionary).flat(2))); | ||
const targetWords = Array.from( | ||
new Set(Object.values(documentDictionary).flat(2)) | ||
); | ||
const targetWordsFound = findTargetWords(text, targetWords); | ||
progress = 0.9; | ||
if (onProgress) onProgress(progress); | ||
const classification = determineClassification(targetWordsFound, documentDictionary); | ||
const classification = determineClassification( | ||
targetWordsFound, | ||
documentDictionary | ||
); | ||
progress = 1; | ||
@@ -106,3 +119,5 @@ if (onProgress) onProgress(progress); | ||
function determineClassification(targetWordsFound, documentDictionary) { | ||
for (const [classification, targetWordSets] of Object.entries(documentDictionary)) { | ||
for (const [classification, targetWordSets] of Object.entries( | ||
documentDictionary | ||
)) { | ||
for (const targetWords of targetWordSets) { | ||
@@ -114,3 +129,3 @@ if (targetWords.every((word) => targetWordsFound.includes(word))) { | ||
} | ||
return "UNKNOWN"; | ||
return "UKJENT"; | ||
} | ||
@@ -117,0 +132,0 @@ // Annotate the CommonJS export names for ESM import in node: |
{ | ||
"name": "ocr-document-classification", | ||
"version": "1.0.9", | ||
"version": "1.0.10", | ||
"description": "Document classification using tesseract.js and string-similarity-js.", | ||
@@ -5,0 +5,0 @@ "main": "./dist/index.js", |
Sorry, the diff of this file is not supported yet
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
12662
227