@llama-node/llama-cpp - npm Package Compare versions

+33

example/embedding.ts

		import { LLama, LlamaContextParams, LlamaInvocation } from "../index";
		import path from "path";

		// Location of the model file, resolved relative to the current working directory.
		const modelPath = path.resolve(
		    process.cwd(),
		    "../../ggml-vicuna-7b-4bit-rev1.bin"
		);

		// Context settings for this example; `embedding: true` is the flag this
		// example turns on so that embeddings can be requested below.
		const contextParams: LlamaContextParams = {
		    nCtx: 512,
		    nParts: -1,
		    seed: 0,
		    f16Kv: false,
		    logitsAll: false,
		    vocabOnly: false,
		    useMlock: false,
		    embedding: true,
		};

		// Third argument is `enableLogger`; logging stays off here.
		const llama = LLama.load(modelPath, contextParams, false);

		const prompt = `Who is the president of the United States?`;

		const invocation: LlamaInvocation = {
		    nThreads: 4,
		    nTokPredict: 2048,
		    topK: 40,
		    topP: 0.1,
		    temp: 0.2,
		    repeatPenalty: 1,
		    prompt,
		};

		// Request the embedding for the prompt and print the numeric vector
		// carried in the result's `data` field.
		llama.getWordEmbedding(invocation, ({ data }) => {
		    console.log(data);
		});

+14

example/tokenize.ts

		import { LLama } from "../index";
		import path from "path";

		// Location of the model file, resolved relative to the current working directory.
		const modelPath = path.resolve(
		    process.cwd(),
		    "../../ggml-vicuna-7b-4bit-rev1.bin"
		);

		// No explicit context parameters (null); third argument is `enableLogger`.
		const llama = LLama.load(modelPath, null, false);

		const text = `Who is the president of the United States?`;

		// Value passed as the `nCtx` argument of `tokenize`.
		const contextSize = 2048;

		// Tokenize the text and print the number array carried in the result's `data` field.
		llama.tokenize(text, contextSize, ({ data }) => {
		    console.log(data);
		});

+55

src/types.rs

		use crate::context::LlamaInvocation;
		use napi::bindgen_prelude::*;
		use std::sync::mpsc::Sender;

		/// Commands sent over the channel's `command_sender` and dispatched by the
		/// receiving loop to the matching llama method. Every variant carries the
		/// `Sender` on which the result of that command is reported back.
		#[derive(Clone, Debug)]
		pub enum LLamaCommand {
		/// Run inference for the given invocation; results go to the `InferenceResult` sender.
		Inference(LlamaInvocation, Sender<InferenceResult>),
		/// Tokenize the given text with the given `n_ctx`; the result goes to the
		/// `TokenizeResult` sender.
		Tokenize(String, usize, Sender<TokenizeResult>),
		/// Compute embeddings for the given invocation; the result goes to the
		/// `EmbeddingResult` sender.
		Embedding(LlamaInvocation, Sender<EmbeddingResult>),
		}

		/// Outcome tag carried by a `TokenizeResult`.
		#[napi]
		pub enum TokenizeResultType {
		/// Tokenization failed; the accompanying `data` is sent as an empty vector.
		Error,
		/// Tokenization succeeded; the accompanying `data` holds the tokenizer output.
		Data,
		}

		/// Result object handed to the JavaScript callback of `tokenize`.
		#[napi(object)]
		pub struct TokenizeResult {
		/// Whether tokenization succeeded (`Data`) or failed (`Error`).
		pub r#type: TokenizeResultType,
		/// Output of the tokenizer; an empty vector when `type` is `Error`.
		pub data: Vec<i32>,
		}

		/// Payload carried in the `data` field of an `InferenceResult`.
		// NOTE(review): the code that fills these fields is not part of this file;
		// the field descriptions below are assumptions to confirm against it.
		#[napi(object)]
		#[derive(Clone, Debug)]
		pub struct InferenceToken {
		/// Text of the token (presumably one generated piece of output).
		pub token: String,
		/// Presumably set once generation has finished; confirm against the inference loop.
		pub completed: bool,
		}

		/// Outcome tag carried by an `InferenceResult`.
		// NOTE(review): variant meanings are inferred from their names; the code that
		// emits them is not visible here, so confirm against the inference loop.
		#[napi]
		pub enum InferenceResultType {
		/// Presumably: inference failed.
		Error,
		/// Presumably: the result carries a token in `data`.
		Data,
		/// Presumably: inference has finished and no further results follow.
		End,
		}

		/// Result object handed to the JavaScript callback of `inference`.
		#[napi(object)]
		pub struct InferenceResult {
		/// Kind of result being reported.
		pub r#type: InferenceResultType,
		/// Optional token payload.
		// NOTE(review): presumably populated for `Data` results only; confirm.
		pub data: Option<InferenceToken>,
		/// Optional free-form message.
		// NOTE(review): presumably an error description for `Error` results; confirm.
		pub message: Option<String>,
		}

		/// Outcome tag carried by an `EmbeddingResult`.
		#[napi]
		pub enum EmbeddingResultType {
		/// The embeddings could not be read; the accompanying `data` is sent as an empty vector.
		Error,
		/// The embeddings were read; the accompanying `data` holds the values.
		Data,
		}

		/// Result object handed to the JavaScript callback of `getWordEmbedding`.
		#[napi(object)]
		pub struct EmbeddingResult {
		/// Whether the embeddings were obtained (`Data`) or not (`Error`).
		pub r#type: EmbeddingResultType,
		/// Embedding values, widened from the context's `f32` output to `f64`;
		/// an empty vector when `type` is `Error`.
		pub data: Vec<f64>,
		}

+1

-0

Cargo.toml

		@@ -20,2 +20,3 @@ [package]
		llm-chain-llama-sys = { git = "https://github.com/hlhr202/llm-chain.git", branch = "feature/fix-cross-compile" }
		# llm-chain-llama-sys = { path = "../../../llm-chain/llm-chain-llama/sys" }
		napi = { version = "2.12.2", default-features = false, features = ["napi6", "async"] }
		@@ -22,0 +23,0 @@ napi-derive = "2.12.2"

+23

-0

index.d.ts

		@@ -26,2 +26,10 @@ /* tslint:disable */
		}
		export const enum TokenizeResultType {
		Error = 0,
		Data = 1
		}
		export interface TokenizeResult {
		type: TokenizeResultType
		data: Array<number>
		}
		export interface InferenceToken {
		@@ -41,6 +49,21 @@ token: string
		}
		export const enum EmbeddingResultType {
		Error = 0,
		Data = 1
		}
		export interface EmbeddingResult {
		type: EmbeddingResultType
		data: Array<number>
		}
		export class LLama {
		static load(path: string, params: LlamaContextParams \| undefined \| null, enableLogger: boolean): LLama
		getWordEmbedding(input: LlamaInvocation,
		callback: (result: EmbeddingResult) => void): void
		tokenize(params: string,
		nCtx: number,
		callback: (result:
		{ type: TokenizeResultType, data: number[] }
		) => void): void
		inference(input: LlamaInvocation,
		callback: (result: InferenceResult) => void): void
		}

+3

-1

index.js

		@@ -255,5 +255,7 @@ /* tslint:disable */

		const { InferenceResultType, LLama } = nativeBinding
		const { TokenizeResultType, InferenceResultType, EmbeddingResultType, LLama } = nativeBinding

		module.exports.TokenizeResultType = TokenizeResultType
		module.exports.InferenceResultType = InferenceResultType
		module.exports.EmbeddingResultType = EmbeddingResultType
		module.exports.LLama = LLama

+1

-1

package.json

		{
		"name": "@llama-node/llama-cpp",
		"version": "0.0.21",
		"version": "0.0.22",
		"main": "index.js",
		@@ -5,0 +5,0 @@ "types": "index.d.ts",

+17

-7

src/context.rs

 @@ -1,2 +0,2 @@
 use std::{ffi::CStr, ptr::null_mut};
 use std::{ffi::CStr, ptr::null_mut, slice};
 @@ -6,4 +6,4 @@ use anyhow::Result;
     llama_context, llama_context_default_params, llama_context_params, llama_eval, llama_free,
     llama_init_from_file, llama_print_system_info, llama_sample_top_p_top_k, llama_token,
     llama_token_to_str,
     llama_get_embeddings, llama_init_from_file, llama_n_embd, llama_print_system_info,
     llama_sample_top_p_top_k, llama_token, llama_token_to_str,
 };
 @@ -36,2 +36,3 @@
     pub embedding: bool,
     // pub use_mmap: bool,
 @@ -62,2 +63,3 @@
             progress_callback_user_data: null_mut(),
             // use_mmap: params.use_mmap,
 @@ -74,6 +76,3 @@ }
     // Creates a new LLamaContext from the specified file and configuration parameters.
     pub fn from_file_and_params(
         path: &str,
         params: &Option<LlamaContextParams>,
     ) -> Self {
     pub fn from_file_and_params(path: &str, params: &Option<LlamaContextParams>) -> Self {
         let params = LlamaContextParams::or_default(params);
 @@ -125,2 +124,13 @@ let ctx = unsafe { llama_init_from_file(path.as_ptr() as *const i8, params) };
     pub fn llama_get_embeddings(&self) -> Result<Vec<f32>, ()> {
         unsafe {
             let embd_size = llama_n_embd(self.ctx);
             let embd_ptr = llama_get_embeddings(self.ctx);
             if embd_ptr.is_null() {
                 return Err(());
             Ok(slice::from_raw_parts(embd_ptr, embd_size as usize).to_vec())
     // Evaluates the given tokens with the specified configuration.
 @@ -127,0 +137,0 @@ pub fn llama_eval(

+89

-12

src/lib.rs

		@@ -10,12 +10,19 @@ #![deny(clippy::all)]
		mod tokenizer;
		mod types;

		use std::sync::{mpsc::channel, Arc};
		use std::{
		sync::{mpsc::channel, Arc},
		thread, time,
		};

		use context::{LlamaContextParams, LlamaInvocation};
		use llama::{InferenceResult, LLamaChannel};
		use llama::LLamaChannel;
		use napi::{
		bindgen_prelude::*,
		threadsafe_function::{ErrorStrategy, ThreadsafeFunction, ThreadsafeFunctionCallMode},
		threadsafe_function::{
		ErrorStrategy, ThreadSafeCallContext, ThreadsafeFunction, ThreadsafeFunctionCallMode,
		},
		JsFunction,
		};
		use types::{InferenceResult, TokenizeResult, EmbeddingResult};

		@@ -54,3 +61,3 @@ #[napi]
		_ => {
		std::thread::yield_now();
		thread::yield_now();
		}
		@@ -63,2 +70,68 @@ }
		/// Requests the embedding for `input` and reports each `EmbeddingResult`
		/// to the JavaScript `callback`.
		///
		/// The request is queued on the llama channel and this method returns
		/// immediately; a background thread forwards results to JavaScript.
		///
		/// # Errors
		/// Returns an error if the threadsafe wrapper around `callback` cannot be created.
		#[napi(ts_args_type = "input: LlamaInvocation,
		callback: (result: EmbeddingResult) => void")]
		pub fn get_word_embedding(&self, input: LlamaInvocation, callback: JsFunction) -> Result<()> {
		    let js_callback: ThreadsafeFunction<EmbeddingResult, ErrorStrategy::Fatal> =
		        callback.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))?;
		    let (result_tx, result_rx) = channel();

		    self.llama_channel.embedding(input, result_tx);

		    thread::spawn(move || {
		        // Forward results until every sender has been dropped.
		        while let Ok(embedding) = result_rx.recv() {
		            js_callback.call(embedding, ThreadsafeFunctionCallMode::NonBlocking);
		        }
		        thread::sleep(time::Duration::from_millis(300)); // wait for end signal
		        js_callback.abort().unwrap();
		    });

		    Ok(())
		}

		/// Tokenizes `params` and reports the single `TokenizeResult` to the
		/// JavaScript `callback`.
		///
		/// The request is queued on the llama channel and this method returns
		/// immediately; a background thread waits for the result and forwards it.
		///
		/// `n_ctx` is converted with `as usize` before being queued.
		// NOTE(review): a negative `n_ctx` wraps to a very large `usize` here —
		// confirm whether callers can pass one and whether it should be rejected.
		///
		/// # Errors
		/// Returns an error if the threadsafe wrapper around `callback` cannot be created.
		#[napi(ts_args_type = "params: string,
		nCtx: number,
		callback: (result:
		{ type: TokenizeResultType, data: number[] }
		) => void")]
		pub fn tokenize(&self, params: String, n_ctx: i32, callback: JsFunction) -> Result<()> {
		    let (tokenize_sender, tokenize_receiver) = channel::<TokenizeResult>();

		    let tsfn: ThreadsafeFunction<TokenizeResult, ErrorStrategy::Fatal> = callback
		        .create_threadsafe_function(0, |ctx: ThreadSafeCallContext<TokenizeResult>| {
		            Ok(vec![ctx.value])
		        })?;

		    let llama_channel = self.llama_channel.clone();

		    llama_channel.tokenize(params, n_ctx as usize, tokenize_sender);

		    thread::spawn(move || {
		        // `recv` blocks until the result arrives. It only returns `Err` once the
		        // sender has been dropped, and then keeps returning `Err` on every call,
		        // so retrying would spin forever; fall through to the abort instead.
		        if let Ok(result) = tokenize_receiver.recv() {
		            tsfn.call(result, ThreadsafeFunctionCallMode::Blocking);
		        }
		        thread::sleep(time::Duration::from_millis(300)); // wait for end signal
		        tsfn.abort().unwrap();
		    });

		    Ok(())
		}

		#[napi(ts_args_type = "input: LlamaInvocation,
		callback: (result: InferenceResult) => void")]
		@@ -73,12 +146,16 @@ pub fn inference(&self, input: LlamaInvocation, callback: JsFunction) -> Result<()> {

		std::thread::spawn(move \|\| loop {
		let result = inference_receiver.recv();
		match result {
		Ok(result) => {
		tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
		thread::spawn(move \|\| {
		loop {
		let result = inference_receiver.recv();
		match result {
		Ok(result) => {
		tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
		}
		Err(_) => {
		break;
		}
		}
		Err(_) => {
		break;
		}
		}
		thread::sleep(time::Duration::from_millis(300)); // wait for end signal
		tsfn.abort().unwrap();
		});
		@@ -85,0 +162,0 @@

+75

-26

src/llama.rs

		@@ -1,2 +0,1 @@
		use napi::bindgen_prelude::*;
		use std::{
		@@ -13,2 +12,6 @@ sync::{
		tokenizer::{embedding_to_output, llama_token_eos, tokenize},
		types::{
		EmbeddingResult, EmbeddingResultType, InferenceResult, InferenceResultType, InferenceToken,
		LLamaCommand, TokenizeResult, TokenizeResultType,
		},
		};
		@@ -27,33 +30,61 @@

		#[derive(Clone, Debug)]
		pub enum LLamaCommand {
		Inference(LlamaInvocation, Sender<InferenceResult>),
		}
		impl LLamaInternal {
		/// Tokenizes `input` against this instance's context and sends the outcome
		/// on `sender`: the tokenizer output tagged `Data` on success, or an empty
		/// vector tagged `Error` on failure.
		///
		/// # Panics
		/// Panics if the result cannot be sent on `sender`.
		pub fn tokenize(&self, input: &str, n_ctx: usize, sender: &Sender<TokenizeResult>) {
		    let outcome = match tokenize(&self.context, input, n_ctx, false) {
		        Ok(data) => TokenizeResult {
		            data,
		            r#type: TokenizeResultType::Data,
		        },
		        Err(_) => TokenizeResult {
		            data: vec![],
		            r#type: TokenizeResultType::Error,
		        },
		    };
		    sender.send(outcome).unwrap();
		}

		#[napi(object)]
		#[derive(Clone, Debug)]
		pub struct InferenceToken {
		pub token: String,
		pub completed: bool,
		}
		pub fn embedding(&self, input: &LlamaInvocation, sender: &Sender<EmbeddingResult>) {
		let context_params_c = LlamaContextParams::or_default(&self.context_params);
		let input_ctx = &self.context;
		let embd_inp = tokenize(
		input_ctx,
		input.prompt.as_str(),
		context_params_c.n_ctx as usize,
		true,
		)
		.unwrap();

		#[napi]
		pub enum InferenceResultType {
		Error,
		Data,
		End,
		}
		// let end_text = "\n";
		// let end_token =
		// tokenize(input_ctx, end_text, context_params_c.n_ctx as usize, false).unwrap();

		#[napi(object)]
		pub struct InferenceResult {
		pub r#type: InferenceResultType,
		pub data: Option<InferenceToken>,
		pub message: Option<String>,
		}
		input_ctx
		.llama_eval(embd_inp.as_slice(), embd_inp.len() as i32, 0, input)
		.unwrap();

		impl LLamaInternal {
		let embeddings = input_ctx.llama_get_embeddings();

		if let Ok(embeddings) = embeddings {
		sender
		.send(EmbeddingResult {
		r#type: EmbeddingResultType::Data,
		data: embeddings.iter().map(\|&x\| x as f64).collect(),
		})
		.unwrap();
		} else {
		sender
		.send(EmbeddingResult {
		r#type: EmbeddingResultType::Error,
		data: vec![],
		})
		.unwrap();
		}
		}

		pub fn inference(&self, input: &LlamaInvocation, sender: &Sender<InferenceResult>) {
		let context_params_c = LlamaContextParams::or_default(&self.context_params);
		log::info!("inference: {:?}", input);
		log::info!("context_params: {:?}", context_params_c);
		let input_ctx = &self.context;
		@@ -193,2 +224,14 @@ // Tokenize the stop sequence and input prompt.

		/// Queues a `LLamaCommand::Tokenize` request on the command channel; the
		/// result is reported on `sender`.
		///
		/// # Panics
		/// Panics if the command cannot be sent on `command_sender`.
		pub fn tokenize(&self, input: String, n_ctx: usize, sender: Sender<TokenizeResult>) {
		    let command = LLamaCommand::Tokenize(input, n_ctx, sender);
		    self.command_sender.send(command).unwrap();
		}

		/// Queues a `LLamaCommand::Embedding` request on the command channel; the
		/// result is reported on `sender`.
		///
		/// # Panics
		/// Panics if the command cannot be sent on `command_sender`.
		pub fn embedding(&self, params: LlamaInvocation, sender: Sender<EmbeddingResult>) {
		    let command = LLamaCommand::Embedding(params, sender);
		    self.command_sender.send(command).unwrap();
		}

		pub fn inference(&self, params: LlamaInvocation, sender: Sender<InferenceResult>) {
		@@ -230,2 +273,8 @@ self.command_sender
		}
		Ok(LLamaCommand::Embedding(params, sender)) => {
		llama.embedding(&params, &sender);
		}
		Ok(LLamaCommand::Tokenize(text, n_ctx, sender)) => {
		llama.tokenize(&text, n_ctx, &sender);
		}
		Err(TryRecvError::Disconnected) => {
		@@ -232,0 +281,0 @@ break 'llama_loop;

@llama-node/llama-cpp.darwin-arm64.node

Sorry, the diff of this file is not supported yet

@llama-node/llama-cpp.darwin-x64.node

Sorry, the diff of this file is not supported yet

@llama-node/llama-cpp.linux-x64-gnu.node

Sorry, the diff of this file is not supported yet

@llama-node/llama-cpp.win32-x64-msvc.node

Sorry, the diff of this file is not supported yet

@llama-node/llama-cpp - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics