llama-ocr - npm Package Compare versions

Comparing version 0.0.2 to 0.0.3

package.json

		{
		"name": "llama-ocr",
		"version": "0.0.2",
		"description": "PDF to markdown (OCR) with Llama 3.2 Vision.",
		"version": "0.0.3",
		"description": "Image to markdown (OCR) with Llama 3.2 Vision.",
		"author": "Hassan El Mghari (@nutlope)",
		@@ -14,6 +14,8 @@ "repository": {
		"scripts": {
		"build": "tsc"
		"build": "tsc",
		"test": "tsx ./test/index.ts"
		},
		"devDependencies": {
		"@types/node": "^22.9.0",
		"tsx": "^4.19.2",
		"typescript": "^5.6.3"
		@@ -28,3 +30,6 @@ },
		"Llama"
		]
		],
		"dependencies": {
		"together-ai": "^0.9.0"
		}
		}

README.md

		@@ -5,5 +5,5 @@ <div align="center">
		</div>
		<p>An npm library to run OCR with Llama 3.2 Vision.</p>
		<p>An npm library to run OCR for free with Llama 3.2 Vision.</p>

		<a href="https://www.npmjs.com/package/together-ai"><img src="https://img.shields.io/npm/v/together-ai" alt="Current version"></a>
		<a href="https://www.npmjs.com/package/llama-ocr"><img src="https://img.shields.io/npm/v/llama-ocr" alt="Current version"></a>

		@@ -24,4 +24,5 @@ </div>
		const markdown = await ocr({
		filePath: "yourfile.pdf",
		apiKey: process.env.TOGETHER_API_KEY,
		filePath: "./trader-joes-receipt.jpg", // path to your image (soon PDF!)
		model: "Llama-3.2-90B-Vision", // optional, defaults to "free"
		apiKey: process.env.TOGETHER_API_KEY, // Together AI API key
		});
		@@ -32,2 +33,13 @@ ```

		This library uses Llama 3.2 through [Together AI](https://www.together.ai/blog/llama-3-2-vision).
		This library uses the free Llama 3.2 endpoint from [Together AI](https://dub.sh/together-ai) to parse images and return markdown. Paid endpoints for Llama 3.2 11B and Llama 3.2 90B are also available for faster performance and higher rate limits.

		## Roadmap

		- [x] Add support for local image OCR
		- [x] Add support for remote image OCR
		- [ ] Add support for PDF OCR (take screenshots of PDF & feed to vision model)
		- [ ] Add support for JSON output in addition to markdown

		## Credit

		This project was inspired by [Zerox](https://github.com/getomni-ai/zerox). Go check them out!

104

src/index.ts

		@@ -1,9 +0,105 @@
		export function ocr({
		import Together from "together-ai";
		import fs from "fs";
		// import { fromPath } from "pdf2pic";

		/**
		 * Runs OCR on a single image with a Llama 3.2 Vision model hosted by
		 * Together AI and resolves to the content of the model's first reply.
		 *
		 * Options:
		 * - `filePath`: the image to transcribe; passed through unchanged to
		 *   `getMarkDown`.
		 * - `apiKey`: Together AI API key. Defaults to
		 *   `process.env.TOGETHER_API_KEY`.
		 * - `model`: which vision model to use. `"free"` (the default) selects
		 *   `meta-llama/Llama-Vision-Free`; any other value is expanded to
		 *   `meta-llama/<model>-Instruct-Turbo`.
		 *
		 * @returns Whatever `getMarkDown` resolves to for this image.
		 */
		export async function ocr({
		  filePath,
		  apiKey = process.env.TOGETHER_API_KEY,
		  model = "free",
		}: {
		  filePath: string;
		  apiKey?: string;
		  model?: "Llama-3.2-90B-Vision" | "Llama-3.2-11B-Vision" | "free";
		}) {
		  // The free tier has its own model id; paid models share a naming scheme.
		  const visionLLM =
		    model === "free"
		      ? "meta-llama/Llama-Vision-Free"
		      : `meta-llama/${model}-Instruct-Turbo`;

		  const together = new Together({
		    apiKey,
		  });

		  const finalMarkdown = await getMarkDown({ together, visionLLM, filePath });

		  return finalMarkdown;

		  // TODO: PDF support (not wired up yet). Sketch of the intended approach:
		  //
		  // if (filePath.endsWith(".pdf")) {
		  // const options = {
		  // density: 100, // Image density
		  // saveFilename: "output", // Output filename
		  // savePath: "./images", // Output directory
		  // format: "png", // Image format: png, jpeg, etc.
		  // width: 2550, // Desired width of the image
		  // height: 3300, // Desired height of the image
		  // };

		  // // Create a converter instance
		  // const converter = fromPath(filePath, options);

		  // // Convert all pages to images
		  // await converter
		  // .bulk(-1)
		  // .then((resolve) => {
		  // console.log("Images converted:", resolve);
		  // })
		  // .catch((error) => {
		  // console.error("Error converting PDF:", error);
		  // });

		  // continue here by calling the getMarkDown function for each image
		  // }
		}

		/**
		 * Picks the MIME type to advertise in a data URL from the file extension.
		 * Unrecognised or missing extensions fall back to `image/jpeg`.
		 */
		function guessImageMimeType(filePath: string): string {
		  // With no "." in the path, lastIndexOf returns -1 and the whole path is
		  // compared, which matches no case and takes the default.
		  const extension = filePath.slice(filePath.lastIndexOf(".") + 1).toLowerCase();
		  switch (extension) {
		    case "png":
		      return "image/png";
		    case "gif":
		      return "image/gif";
		    case "webp":
		      return "image/webp";
		    default:
		      return "image/jpeg";
		  }
		}

		/**
		 * Sends one image plus the transcription instructions to the given vision
		 * model and returns the content of the first choice in the response.
		 *
		 * Remote files (per `isRemoteFile`) are passed by URL; anything else is read
		 * from disk and inlined as a base64 data URL.
		 *
		 * @param together - Client used to call `chat.completions.create`.
		 * @param visionLLM - Model identifier forwarded as `model`.
		 * @param filePath - URL or local path of the image.
		 * @returns `output.choices[0].message.content` from the completion.
		 */
		async function getMarkDown({
		  together,
		  visionLLM,
		  filePath,
		}: {
		  together: Together;
		  visionLLM: string;
		  filePath: string;
		}) {
		  // Sent as the text part of the user message, not as a separate system message.
		  const systemPrompt = `Convert the provided image into Markdown format. Ensure that all content from the page is included, such as headers, footers, subtexts, images (with alt text if possible), tables, and any other elements.

		Requirements:

		- Output Only Markdown: Return solely the Markdown content without any additional explanations or comments.
		- No Delimiters: Do not use code fences or delimiters like \`\`\`markdown.
		- Complete Content: Do not omit any part of the page, including headers, footers, and subtext.
		`;

		  // Label local files with their own image type instead of always claiming JPEG.
		  const finalImageUrl = isRemoteFile(filePath)
		    ? filePath
		    : `data:${guessImageMimeType(filePath)};base64,${encodeImage(filePath)}`;

		  const output = await together.chat.completions.create({
		    model: visionLLM,
		    messages: [
		      {
		        role: "user",
		        // @ts-expect-error -- presumably the SDK typings do not accept multi-part content here; TODO confirm and drop when they do
		        content: [
		          { type: "text", text: systemPrompt },
		          {
		            type: "image_url",
		            image_url: {
		              url: finalImageUrl,
		            },
		          },
		        ],
		      },
		    ],
		  });

		  return output.choices[0].message.content;
		}

		/**
		 * Synchronously reads the file at `imagePath` and returns its bytes as a
		 * base64 string.
		 *
		 * @param imagePath - Path of the file to read.
		 * @returns The file contents encoded as base64 (empty string for an empty file).
		 * @throws Whatever `fs.readFileSync` throws when the file cannot be read.
		 */
		function encodeImage(imagePath: string): string {
		  // Let readFileSync do the encoding; no intermediate copy of the buffer is needed.
		  return fs.readFileSync(imagePath, { encoding: "base64" });
		}

		/**
		 * Reports whether `filePath` is an `http://` or `https://` URL rather than
		 * something to be read from disk.
		 *
		 * The scheme is matched case-insensitively, so `HTTPS://…` also counts.
		 *
		 * @param filePath - Candidate URL or file-system path.
		 * @returns `true` when the string starts with an HTTP(S) scheme.
		 */
		function isRemoteFile(filePath: string): boolean {
		  return /^https?:\/\//i.test(filePath);
		}

tsconfig.json

		{
		"compilerOptions": {
		"target": "es2016",
		"target": "es2020",
		"module": "commonjs",
		@@ -5,0 +5,0 @@ "outDir": "./dist",

dist/index.d.ts

dist/index.js

llama-ocr - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes