ocr-document-classification
Advanced tools
Comparing version 1.1.3 to 1.1.4
@@ -40,3 +40,2 @@ "use strict"; | ||
var Tesseract = __toESM(require("tesseract.js")); | ||
var pdfjsLib = __toESM(require("pdfjs-dist")); | ||
function findTargetWords(documentText, targetWords, threshold = 0.75) { | ||
@@ -68,14 +67,2 @@ const foundTargetWords = []; | ||
}; | ||
/**
 * Extracts the embedded text layer of a PDF using pdf.js.
 *
 * Each page's text items are joined with a single space, and every page is
 * terminated with a newline, so a two-page document yields "p1\np2\n".
 * A document with no pages yields an empty string.
 *
 * @param {*} fileContent - Binary PDF data, passed to `pdfjsLib.getDocument`
 *   as `data` (presumably an ArrayBuffer or typed array; confirm against caller).
 * @returns {Promise<string>} The concatenated text of all pages.
 */
async function extractTextFromPDF(fileContent) {
  // The worker build must match the API version that was actually installed
  // (a semver range such as ^4.4.168 can resolve to a newer release), so the
  // URL is derived from the loaded library instead of a hard-coded version,
  // and it points at the worker bundle rather than the main library entry.
  pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@${pdfjsLib.version}/build/pdf.worker.min.mjs`;

  const loadingTask = pdfjsLib.getDocument({ data: fileContent });
  try {
    const pdf = await loadingTask.promise;
    const pageTexts = [];
    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const content = await page.getTextContent();
      pageTexts.push(content.items.map((item) => item.str).join(" "));
    }
    return pageTexts.map((pageText) => `${pageText}\n`).join("");
  } finally {
    // Release the document and its worker resources even when parsing fails.
    await loadingTask.destroy();
  }
}
async function ocrImage(imageDataUrl, onProgress) { | ||
@@ -93,3 +80,5 @@ const worker = await Tesseract.createWorker(["nor", "eng"], 1, { | ||
}); | ||
const { data: { text } } = await worker.recognize(imageDataUrl, { | ||
const { | ||
data: { text } | ||
} = await worker.recognize(imageDataUrl, { | ||
rotateAuto: true | ||
@@ -114,11 +103,6 @@ }); | ||
console.log("Processing PDF file..."); | ||
text = await extractTextFromPDF(fileContent); | ||
console.log("Text extracted from PDF:", text); | ||
if (!text.trim()) { | ||
console.log("Extracted text is empty, using OCR..."); | ||
const blob = new Blob([fileContent], { type: "application/pdf" }); | ||
const imageDataUrl = URL.createObjectURL(blob); | ||
text = await ocrImage(imageDataUrl, onProgress); | ||
console.log("Text extracted using OCR:", text); | ||
} | ||
const blob = new Blob([fileContent], { type: "application/pdf" }); | ||
const imageDataUrl = URL.createObjectURL(blob); | ||
text = await ocrImage(imageDataUrl, onProgress); | ||
console.log("Text extracted using OCR from pdf:", text); | ||
progress = 0.9; | ||
@@ -125,0 +109,0 @@ } else { |
{ | ||
"name": "ocr-document-classification", | ||
"version": "1.1.3", | ||
"version": "1.1.4", | ||
"description": "Document classification using tesseract.js and string-similarity-js.", | ||
@@ -28,3 +28,2 @@ "main": "./dist/index.js", | ||
"dependencies": { | ||
"pdfjs-dist": "^4.4.168", | ||
"string-similarity-js": "^2.1.4", | ||
@@ -31,0 +30,0 @@ "tesseract.js": "^5.1.0", |
Sorry, the diff of this file is not supported yet
3
14612
279
- Removed pdfjs-dist@^4.4.168
- Removed @napi-rs/canvas@0.1.67 (transitive)
- Removed @napi-rs/canvas-android-arm64@0.1.67 (transitive)
- Removed @napi-rs/canvas-darwin-arm64@0.1.67 (transitive)
- Removed @napi-rs/canvas-darwin-x64@0.1.67 (transitive)
- Removed @napi-rs/canvas-linux-arm-gnueabihf@0.1.67 (transitive)
- Removed @napi-rs/canvas-linux-arm64-gnu@0.1.67 (transitive)
- Removed @napi-rs/canvas-linux-arm64-musl@0.1.67 (transitive)
- Removed @napi-rs/canvas-linux-riscv64-gnu@0.1.67 (transitive)
- Removed @napi-rs/canvas-linux-x64-gnu@0.1.67 (transitive)
- Removed @napi-rs/canvas-linux-x64-musl@0.1.67 (transitive)
- Removed @napi-rs/canvas-win32-x64-msvc@0.1.67 (transitive)
- Removed pdfjs-dist@4.10.38 (transitive)