@llama-node/llama-cpp - npm Package Compare versions

+36

example/abortable.ts

		import { LLama, LlamaInvocation } from "../index";
		import path from "path";

		// Example: stream the answer to a fixed question to stdout, then cancel generation after three seconds.
		const run = async () => {
			// The model file is looked up relative to the current working directory.
			const modelPath = path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin");
			const model = await LLama.load(modelPath, null, true);

			const question = `Who is the president of the United States?`;

			// The continuation lines of this template literal are part of the prompt text,
			// so their leading whitespace must stay exactly as written.
			const prompt = `A chat between a user and an assistant.
		USER: ${question}
		ASSISTANT:`;

			const invocation: LlamaInvocation = {
				nThreads: 4,
				nTokPredict: 2048,
				topK: 40,
				topP: 0.1,
				temp: 0.2,
				repeatPenalty: 1,
				prompt,
			};

			// inference() hands back a function; calling it aborts the running generation.
			const stop = model.inference(invocation, (result) => {
				// A result that carries no token is written as an empty string.
				process.stdout.write(result.data?.token ?? "");
			});

			// Cancel the generation once 3000 ms have elapsed.
			setTimeout(() => stop(), 3000);
		};

		run();

+32

example/inference.ts

		import { LLama, LlamaInvocation } from "../index";
		import path from "path";

		// Example: load the model and stream the answer to a fixed question to stdout.
		const run = async () => {
			// The model file is looked up relative to the current working directory.
			const modelFile = path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin");
			const engine = await LLama.load(modelFile, null, true);

			const userMessage = `Who is the president of the United States?`;

			// The continuation lines of this template literal are part of the prompt text,
			// so their leading whitespace must stay exactly as written.
			const prompt = `A chat between a user and an assistant.
		USER: ${userMessage}
		ASSISTANT:`;

			const request: LlamaInvocation = {
				nThreads: 4,
				nTokPredict: 2048,
				topK: 40,
				topP: 0.1,
				temp: 0.2,
				repeatPenalty: 1,
				prompt,
			};

			// The callback is invoked with each inference result; only its token text is printed.
			engine.inference(request, (chunk) => {
				// A result that carries no token is written as an empty string.
				process.stdout.write(chunk.data?.token ?? "");
			});
		};

		run();

+1

-1

index.d.ts

		@@ -51,3 +51,3 @@ /* tslint:disable */
		tokenize(params: string, nCtx: number): Promise<Array<number>>
		inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): void
		inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): () => void
		}

+3

-2

package.json

		{
		"name": "@llama-node/llama-cpp",
		"version": "0.1.0",
		"version": "0.1.1",
		"main": "index.js",
		@@ -30,5 +30,6 @@ "types": "index.d.ts",
		"test": "vitest",
		"start": "tsx example/load.ts",
		"start": "tsx example/inference.ts",
		"embedding": "tsx example/embedding.ts",
		"inference": "tsx example/inference.ts",
		"abortable": "tsx example/abortable.ts",
		"tokenize": "tsx example/tokenize.ts",
		@@ -35,0 +36,0 @@ "semantic-compare": "tsx example/semantic-compare/compare.ts",

+8

-8

src/context.rs

 @@ -45,3 +45,2 @@ use std::{ffi::CStr, ptr::null_mut, slice};
 // Represents the LLamaContext which wraps FFI calls to the llama.cpp library.
 #[derive(Clone)]
 pub struct LLamaContext {
 @@ -71,3 +70,3 @@ ctx: *mut llama_context,
         &self,
         last_n_tokens_data: &mut [llama_token],
         last_n_tokens: &mut [llama_token],
         input: &LlamaInvocation,
 @@ -122,3 +121,3 @@ context_params: &llama_context_params,
         let last_n_repeat = std::cmp::min(
             std::cmp::min(last_n_tokens_data.len() as i32, repeat_last_n),
             std::cmp::min(last_n_tokens.len() as i32, repeat_last_n),
             n_ctx,
 @@ -128,7 +127,7 @@ );
         fn get_last_n_ptr(
             last_n_tokens_data: &mut [llama_token],
             last_n_tokens: &mut [llama_token],
             last_n_repeat: i32,
         ) -> *mut llama_token {
             let last_n_tokens_ptr = last_n_tokens_data.as_ptr();
             let last_n_tokens_size = last_n_tokens_data.len();
             let last_n_tokens_ptr = last_n_tokens.as_ptr();
             let last_n_tokens_size = last_n_tokens.len();
             let end_ptr = unsafe { last_n_tokens_ptr.add(last_n_tokens_size) };
 @@ -142,3 +141,3 @@ unsafe { end_ptr.sub(last_n_repeat as usize) }.cast_mut()
                 candidates_p,
                 get_last_n_ptr(last_n_tokens_data, last_n_repeat),
                 get_last_n_ptr(last_n_tokens, last_n_repeat),
                 last_n_repeat as usize,
 @@ -151,3 +150,3 @@ repeat_penalty,
                 candidates_p,
                 get_last_n_ptr(last_n_tokens_data, last_n_repeat),
                 get_last_n_ptr(last_n_tokens, last_n_repeat),
                 last_n_repeat as usize,
 @@ -228,2 +227,3 @@ alpha_frequency,
 unsafe impl Send for LLamaContext {}
 // TODO: this is not Sync-able
 unsafe impl Sync for LLamaContext {}
 @@ -230,0 +230,0 @@

+18

-10

src/lib.rs

		@@ -61,8 +61,9 @@ #![deny(clippy::all)]

		#[napi]
		#[napi(ts_return_type = "() => void")]
		pub fn inference(
		&self,
		env: Env,
		params: LlamaInvocation,
		#[napi(ts_arg_type = "(result: InferenceResult) => void")] callback: JsFunction,
		) -> Result<()> {
		) -> Result<JsFunction> {
		let tsfn: ThreadsafeFunction<InferenceResult, ErrorStrategy::Fatal> = callback
		@@ -75,13 +76,20 @@ .create_threadsafe_function(0, \|ctx: ThreadSafeCallContext<InferenceResult>\| {

		tokio::spawn(async move {
		let llama = llama.lock().await;
		llama
		.inference(&params, \|result\| {
		let running = Arc::new(Mutex::new(true));

		{
		let running = running.clone();
		tokio::task::spawn_blocking(move \|\| {
		let llama = llama.blocking_lock();
		llama.inference(&params, running, \|result\| {
		tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
		})
		.await;
		});
		});
		});
		}

		Ok(())
		env.create_function_from_closure("abort_inference", move \|_\| {
		let mut running = running.blocking_lock();
		*running = false;
		Ok(())
		})
		}
		}

+41

-21

src/llama.rs

		@@ -6,8 +6,9 @@ use std::sync::Arc;
		use crate::{
		context::{LLamaContext},
		context::LLamaContext,
		tokenizer::{llama_token_eos, tokenize},
		types::{InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation},
		types::{
		InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation,
		},
		};

		#[derive(Clone)]
		pub struct LLamaInternal {
		@@ -36,3 +37,4 @@ context: LLamaContext,
		pub async fn tokenize(&self, input: &str, n_ctx: usize) -> Result<Vec<i32>, napi::Error> {
		if let Ok(data) = tokenize(&self.context, input, n_ctx, false) {
		let context = &self.context;
		if let Ok(data) = tokenize(context, input, n_ctx, false) {
		Ok(data)
		@@ -45,6 +47,6 @@ } else {
		pub async fn embedding(&self, input: &LlamaInvocation) -> Result<Vec<f64>, napi::Error> {
		let context = &self.context;
		let context_params_c = LlamaContextParams::or_default(&self.context_params);
		let input_ctx = &self.context;
		let embd_inp = tokenize(
		input_ctx,
		context,
		input.prompt.as_str(),
		@@ -60,7 +62,7 @@ context_params_c.n_ctx as usize,

		input_ctx
		context
		.llama_eval(embd_inp.as_slice(), embd_inp.len() as i32, 0, input)
		.unwrap();

		let embeddings = input_ctx.llama_get_embeddings();
		let embeddings = context.llama_get_embeddings();

		@@ -74,9 +76,14 @@ if let Ok(embeddings) = embeddings {

		pub async fn inference(&self, input: &LlamaInvocation, callback: impl Fn(InferenceResult)) {
		pub fn inference(
		&self,
		input: &LlamaInvocation,
		running: Arc<Mutex<bool>>,
		callback: impl Fn(InferenceResult),
		) {
		let context = &self.context;
		let context_params_c = LlamaContextParams::or_default(&self.context_params);
		let input_ctx = &self.context;
		// Tokenize the stop sequence and input prompt.
		let tokenized_stop_prompt = input.stop_sequence.as_ref().map(\|stop_sequence\| {
		tokenize(
		input_ctx,
		context,
		stop_sequence,
		@@ -92,3 +99,3 @@ context_params_c.n_ctx as usize,
		let tokenized_input = tokenize(
		input_ctx,
		context,
		input.prompt.as_str(),
		@@ -104,4 +111,4 @@ context_params_c.n_ctx as usize,

		// Evaluate the prompt in full.
		input_ctx
		// Feed prompt to the model.
		context
		.llama_eval(
		@@ -121,7 +128,16 @@ tokenized_input.as_slice(),
		let mut completed = false;

		while n_remaining > 0 {
		let tok = input_ctx.llama_sample(embd.as_mut_slice(), input, &context_params_c);
		// Check if we are aborted by caller.
		let running = *running.blocking_lock();
		if !running {
		break;
		}

		n_used += 1;
		n_remaining -= 1;

		let tok = context.llama_sample(embd.as_mut_slice(), input, &context_params_c);
		embd[n_used] = tok;

		if tok == token_eos {
		@@ -131,2 +147,4 @@ completed = true;
		}

		// If we are predicting a fixed number of tokens, check if we have reached that number.
		if input.n_tok_predict != 0
		@@ -143,2 +161,3 @@ && n_used > (input.n_tok_predict as usize) + tokenized_input.len() - 1

		// Check if we have reached the stop sequence.
		if let Some(tokenized_stop_prompt) = &tokenized_stop_prompt {
		@@ -156,8 +175,4 @@ if tok == tokenized_stop_prompt[stop_sequence_i] {

		input_ctx
		.llama_eval(&embd[n_used..], 1, n_used as i32, input)
		.unwrap();

		let output = input_ctx.llama_token_to_str(&embd[n_used]);

		// We can output the token.
		let output = context.llama_token_to_str(&embd[n_used]);
		if let Some(output) = output {
		@@ -175,2 +190,7 @@ if stop_sequence_i == 0 {
		}

		// Continue feeding the token to the model.
		context
		.llama_eval(&embd[n_used..], 1, n_used as i32, input)
		.unwrap();
		}
		@@ -177,0 +197,0 @@

+2

-31

src/tokenizer.rs

		// use crate::output::Output;
		use anyhow::Result;
		use std::ffi::{CStr, CString};
		use std::ffi::CString;
		use std::os::raw::c_char;

		use llama_sys::{
		llama_token, llama_token_eos as inner_eos, llama_token_to_str, llama_tokenize,
		};
		use llama_sys::{llama_token, llama_token_eos as inner_eos, llama_tokenize};

		@@ -17,21 +15,2 @@ use crate::context::LLamaContext;

		/// Converts a llama_token to a Rust String.
		///
		/// # Arguments
		///
		/// * `context` - The llama context wrapper; it is dereferenced to obtain the value
		///   passed to `llama_token_to_str`.
		/// * `token` - The llama_token to convert to a string.
		///
		/// # Returns
		///
		/// The text of the given token. An empty string is returned when the C API yields
		/// a null pointer, and bytes that are not valid UTF-8 are replaced with U+FFFD
		/// instead of causing a panic.
		fn _to_output(context: &LLamaContext, token: i32) -> String {
			// SAFETY: plain FFI call; assumes `context` wraps a live llama_context for the
			// duration of the call — TODO confirm against LLamaContext's invariants.
			let c_ptr = unsafe { llama_token_to_str(**context, token) };

			// `CStr::from_ptr` must never be handed a null pointer, so bail out first.
			if c_ptr.is_null() {
				return String::new();
			}

			// SAFETY: `c_ptr` is non-null (checked above); assumes the C API returns a
			// NUL-terminated string that remains valid while it is copied — TODO confirm.
			let token_text = unsafe { CStr::from_ptr(c_ptr) };

			// Lossy conversion: a token's bytes are not guaranteed to be valid UTF-8
			// on their own, and `to_str().unwrap()` would panic on such input.
			token_text.to_string_lossy().into_owned()
		}

		pub fn llama_token_eos() -> i32 {
		@@ -94,9 +73,1 @@ unsafe { inner_eos() }
		}

		// Converts an embedding represented as a slice into the Output string.
		// pub(crate) fn _embedding_to_output(context: &LLamaContext, embd: &[i32]) -> Output {
		// embd.iter()
		// .map(\|token\| _to_output(context, *token))
		// .fold("".to_string(), \|cur, nxt\| cur + &nxt)
		// .into()
		// }

-32

example/load.ts

		import { LLama, LlamaInvocation } from "../index";
		import path from "path";

		// Example: load the model, ask one fixed question and print the generated tokens to stdout.
		const run = async () => {
			// The model file is looked up relative to the current working directory.
			const weightsPath = path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin");
			const instance = await LLama.load(weightsPath, null, true);

			const ask = `Who is the president of the United States?`;

			// The continuation lines of this template literal are part of the prompt text,
			// so their leading whitespace must stay exactly as written.
			const prompt = `A chat between a user and an assistant.
		USER: ${ask}
		ASSISTANT:`;

			const options: LlamaInvocation = {
				nThreads: 4,
				nTokPredict: 2048,
				topK: 40,
				topP: 0.1,
				temp: 0.2,
				repeatPenalty: 1,
				prompt,
			};

			// The callback is invoked with each inference result; only its token text is printed.
			instance.inference(options, (event) => {
				// A result that carries no token is written as an empty string.
				process.stdout.write(event.data?.token ?? "");
			});
		};

		run();

@llama-node/llama-cpp.darwin-arm64.node