New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign in | Demo | Install
Socket

ocr-document-classification

Package Overview
Dependencies
Maintainers
0
Versions
46
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

ocr-document-classification - npm Package Compare versions

Comparing version 1.1.3 to 1.1.4

30

dist/index.js

@@ -40,3 +40,2 @@ "use strict";

var Tesseract = __toESM(require("tesseract.js"));
var pdfjsLib = __toESM(require("pdfjs-dist"));
function findTargetWords(documentText, targetWords, threshold = 0.75) {

@@ -68,14 +67,2 @@ const foundTargetWords = [];

};
async function extractTextFromPDF(fileContent) {
pdfjsLib.GlobalWorkerOptions.workerSrc = `https://cdn.jsdelivr.net/npm/pdfjs-dist@4.4.168/+esm`;
const pdf = await pdfjsLib.getDocument({ data: fileContent }).promise;
let text = "";
for (let i = 0; i < pdf.numPages; i++) {
const page = await pdf.getPage(i + 1);
const content = await page.getTextContent();
const pageText = content.items.map((item) => item.str).join(" ");
text += pageText + "\n";
}
return text;
}
async function ocrImage(imageDataUrl, onProgress) {

@@ -93,3 +80,5 @@ const worker = await Tesseract.createWorker(["nor", "eng"], 1, {

});
const { data: { text } } = await worker.recognize(imageDataUrl, {
const {
data: { text }
} = await worker.recognize(imageDataUrl, {
rotateAuto: true

@@ -114,11 +103,6 @@ });

console.log("Processing PDF file...");
text = await extractTextFromPDF(fileContent);
console.log("Text extracted from PDF:", text);
if (!text.trim()) {
console.log("Extracted text is empty, using OCR...");
const blob = new Blob([fileContent], { type: "application/pdf" });
const imageDataUrl = URL.createObjectURL(blob);
text = await ocrImage(imageDataUrl, onProgress);
console.log("Text extracted using OCR:", text);
}
const blob = new Blob([fileContent], { type: "application/pdf" });
const imageDataUrl = URL.createObjectURL(blob);
text = await ocrImage(imageDataUrl, onProgress);
console.log("Text extracted using OCR from pdf:", text);
progress = 0.9;

@@ -125,0 +109,0 @@ } else {

{
"name": "ocr-document-classification",
"version": "1.1.3",
"version": "1.1.4",
"description": "Document classification using tesseract.js and string-similarity-js.",

@@ -28,3 +28,2 @@ "main": "./dist/index.js",

"dependencies": {
"pdfjs-dist": "^4.4.168",
"string-similarity-js": "^2.1.4",

@@ -31,0 +30,0 @@ "tesseract.js": "^5.1.0",

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc