@@ -40,3 +40,2 @@ "use strict";
		var Tesseract = __toESM(require("tesseract.js"));
		var pdfjsLib = __toESM(require("pdfjs-dist"));
		function findTargetWords(documentText, targetWords, threshold = 0.75) {
		@@ -68,14 +67,2 @@ const foundTargetWords = [];
		};
		/**
		 * Extracts the embedded text layer of a PDF, page by page, using pdf.js.
		 *
		 * The text items of each page are joined with single spaces and every page
		 * is terminated by a newline, so an N-page document yields N lines.
		 *
		 * @param {*} fileContent - Raw PDF data, passed to `pdfjsLib.getDocument`
		 *   as `data`. Presumably an ArrayBuffer / typed array — TODO confirm
		 *   against the caller.
		 * @returns {Promise<string>} Concatenated text of all pages (may be empty
		 *   or whitespace-only when the PDF has no text layer).
		 * @throws Propagates any rejection from pdf.js (document load, page load
		 *   or text extraction); the loading task is destroyed first.
		 */
		async function extractTextFromPDF(fileContent) {
			// NOTE(review): this URL is the package's `+esm` entry rather than a
			// dedicated pdf.worker file — confirm this is the intended worker script.
			pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@4.4.168/+esm`;
			const loadingTask = pdfjsLib.getDocument({ data: fileContent });
			try {
				const pdf = await loadingTask.promise;
				let text = "";
				// pdf.js page numbers are 1-based.
				for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
					const page = await pdf.getPage(pageNumber);
					const content = await page.getTextContent();
					const pageText = content.items.map((item) => item.str).join(" ");
					text += `${pageText}\n`;
				}
				return text;
			} finally {
				// Release the document and its worker resources even when
				// extraction fails part-way; previously they were never freed.
				await loadingTask.destroy();
			}
		}
		async function ocrImage(imageDataUrl, onProgress) {
		@@ -93,3 +80,5 @@ const worker = await Tesseract.createWorker(["nor", "eng"], 1, {
		});
		const { data: { text } } = await worker.recognize(imageDataUrl, {
		const {
		data: { text }
		} = await worker.recognize(imageDataUrl, {
		rotateAuto: true
		@@ -114,11 +103,6 @@ });
		console.log("Processing PDF file...");
		text = await extractTextFromPDF(fileContent);
		console.log("Text extracted from PDF:", text);
		if (!text.trim()) {
		console.log("Extracted text is empty, using OCR...");
		const blob = new Blob([fileContent], { type: "application/pdf" });
		const imageDataUrl = URL.createObjectURL(blob);
		text = await ocrImage(imageDataUrl, onProgress);
		console.log("Text extracted using OCR:", text);
		}
		const blob = new Blob([fileContent], { type: "application/pdf" });
		const imageDataUrl = URL.createObjectURL(blob);
		text = await ocrImage(imageDataUrl, onProgress);
		console.log("Text extracted using OCR from pdf:", text);
		progress = 0.9;
		@@ -125,0 +109,0 @@ } else {

package.json

		{
		"name": "ocr-document-classification",
		"version": "1.1.3",
		"version": "1.1.4",
		"description": "Document classification using tesseract.js and string-similarity-js.",
		@@ -28,3 +28,2 @@ "main": "./dist/index.js",
		"dependencies": {
		"pdfjs-dist": "^4.4.168",
		"string-similarity-js": "^2.1.4",
		@@ -31,0 +30,0 @@ "tesseract.js": "^5.1.0",

dist/index.mjs

Sorry, the diff of this file is not supported yet

ocr-document-classification - npm Package Compare versions

Improved metrics

Worsened metrics

Dependency changes