@llama-node/llama-cpp
Advanced tools
| import { LLama, LlamaInvocation } from "../index"; | ||
| import path from "path"; | ||
| const run = async () => { | ||
| const llama = await LLama.load( | ||
| path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"), | ||
| null, | ||
| true | ||
| ); | ||
| const template = `Who is the president of the United States?`; | ||
| const prompt = `A chat between a user and an assistant. | ||
| USER: ${template} | ||
| ASSISTANT:`; | ||
| const params: LlamaInvocation = { | ||
| nThreads: 4, | ||
| nTokPredict: 2048, | ||
| topK: 40, | ||
| topP: 0.1, | ||
| temp: 0.2, | ||
| repeatPenalty: 1, | ||
| prompt, | ||
| }; | ||
| const abort = llama.inference(params, (data) => { | ||
| process.stdout.write(data.data?.token ?? ""); | ||
| }); | ||
| setTimeout(() => { | ||
| abort(); | ||
| }, 3000); | ||
| }; | ||
| run(); |
| import { LLama, LlamaInvocation } from "../index"; | ||
| import path from "path"; | ||
| const run = async () => { | ||
| const llama = await LLama.load( | ||
| path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"), | ||
| null, | ||
| true | ||
| ); | ||
| const template = `Who is the president of the United States?`; | ||
| const prompt = `A chat between a user and an assistant. | ||
| USER: ${template} | ||
| ASSISTANT:`; | ||
| const params: LlamaInvocation = { | ||
| nThreads: 4, | ||
| nTokPredict: 2048, | ||
| topK: 40, | ||
| topP: 0.1, | ||
| temp: 0.2, | ||
| repeatPenalty: 1, | ||
| prompt, | ||
| }; | ||
| llama.inference(params, (data) => { | ||
| process.stdout.write(data.data?.token ?? ""); | ||
| }); | ||
| }; | ||
| run(); |
+1
-1
@@ -51,3 +51,3 @@ /* tslint:disable */ | ||
| tokenize(params: string, nCtx: number): Promise<Array<number>> | ||
| inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): void | ||
| inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): () => void | ||
| } |
+3
-2
| { | ||
| "name": "@llama-node/llama-cpp", | ||
| "version": "0.1.0", | ||
| "version": "0.1.1", | ||
| "main": "index.js", | ||
@@ -30,5 +30,6 @@ "types": "index.d.ts", | ||
| "test": "vitest", | ||
| "start": "tsx example/load.ts", | ||
| "start": "tsx example/inference.ts", | ||
| "embedding": "tsx example/embedding.ts", | ||
| "inference": "tsx example/inference.ts", | ||
| "abortable": "tsx example/abortable.ts", | ||
| "tokenize": "tsx example/tokenize.ts", | ||
@@ -35,0 +36,0 @@ "semantic-compare": "tsx example/semantic-compare/compare.ts", |
+8
-8
@@ -45,3 +45,2 @@ use std::{ffi::CStr, ptr::null_mut, slice}; | ||
| // Represents the LLamaContext which wraps FFI calls to the llama.cpp library. | ||
| #[derive(Clone)] | ||
| pub struct LLamaContext { | ||
@@ -71,3 +70,3 @@ ctx: *mut llama_context, | ||
| &self, | ||
| last_n_tokens_data: &mut [llama_token], | ||
| last_n_tokens: &mut [llama_token], | ||
| input: &LlamaInvocation, | ||
@@ -122,3 +121,3 @@ context_params: &llama_context_params, | ||
| let last_n_repeat = std::cmp::min( | ||
| std::cmp::min(last_n_tokens_data.len() as i32, repeat_last_n), | ||
| std::cmp::min(last_n_tokens.len() as i32, repeat_last_n), | ||
| n_ctx, | ||
@@ -128,7 +127,7 @@ ); | ||
| fn get_last_n_ptr( | ||
| last_n_tokens_data: &mut [llama_token], | ||
| last_n_tokens: &mut [llama_token], | ||
| last_n_repeat: i32, | ||
| ) -> *mut llama_token { | ||
| let last_n_tokens_ptr = last_n_tokens_data.as_ptr(); | ||
| let last_n_tokens_size = last_n_tokens_data.len(); | ||
| let last_n_tokens_ptr = last_n_tokens.as_ptr(); | ||
| let last_n_tokens_size = last_n_tokens.len(); | ||
| let end_ptr = unsafe { last_n_tokens_ptr.add(last_n_tokens_size) }; | ||
@@ -142,3 +141,3 @@ unsafe { end_ptr.sub(last_n_repeat as usize) }.cast_mut() | ||
| candidates_p, | ||
| get_last_n_ptr(last_n_tokens_data, last_n_repeat), | ||
| get_last_n_ptr(last_n_tokens, last_n_repeat), | ||
| last_n_repeat as usize, | ||
@@ -151,3 +150,3 @@ repeat_penalty, | ||
| candidates_p, | ||
| get_last_n_ptr(last_n_tokens_data, last_n_repeat), | ||
| get_last_n_ptr(last_n_tokens, last_n_repeat), | ||
| last_n_repeat as usize, | ||
@@ -228,2 +227,3 @@ alpha_frequency, | ||
| unsafe impl Send for LLamaContext {} | ||
| // TODO: this is not Sync-able | ||
| unsafe impl Sync for LLamaContext {} | ||
@@ -230,0 +230,0 @@ |
+18
-10
@@ -61,8 +61,9 @@ #![deny(clippy::all)] | ||
| #[napi] | ||
| #[napi(ts_return_type = "() => void")] | ||
| pub fn inference( | ||
| &self, | ||
| env: Env, | ||
| params: LlamaInvocation, | ||
| #[napi(ts_arg_type = "(result: InferenceResult) => void")] callback: JsFunction, | ||
| ) -> Result<()> { | ||
| ) -> Result<JsFunction> { | ||
| let tsfn: ThreadsafeFunction<InferenceResult, ErrorStrategy::Fatal> = callback | ||
@@ -75,13 +76,20 @@ .create_threadsafe_function(0, |ctx: ThreadSafeCallContext<InferenceResult>| { | ||
| tokio::spawn(async move { | ||
| let llama = llama.lock().await; | ||
| llama | ||
| .inference(¶ms, |result| { | ||
| let running = Arc::new(Mutex::new(true)); | ||
| { | ||
| let running = running.clone(); | ||
| tokio::task::spawn_blocking(move || { | ||
| let llama = llama.blocking_lock(); | ||
| llama.inference(¶ms, running, |result| { | ||
| tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking); | ||
| }) | ||
| .await; | ||
| }); | ||
| }); | ||
| }); | ||
| } | ||
| Ok(()) | ||
| env.create_function_from_closure("abort_inference", move |_| { | ||
| let mut running = running.blocking_lock(); | ||
| *running = false; | ||
| Ok(()) | ||
| }) | ||
| } | ||
| } |
+41
-21
@@ -6,8 +6,9 @@ use std::sync::Arc; | ||
| use crate::{ | ||
| context::{LLamaContext}, | ||
| context::LLamaContext, | ||
| tokenizer::{llama_token_eos, tokenize}, | ||
| types::{InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation}, | ||
| types::{ | ||
| InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation, | ||
| }, | ||
| }; | ||
| #[derive(Clone)] | ||
| pub struct LLamaInternal { | ||
@@ -36,3 +37,4 @@ context: LLamaContext, | ||
| pub async fn tokenize(&self, input: &str, n_ctx: usize) -> Result<Vec<i32>, napi::Error> { | ||
| if let Ok(data) = tokenize(&self.context, input, n_ctx, false) { | ||
| let context = &self.context; | ||
| if let Ok(data) = tokenize(context, input, n_ctx, false) { | ||
| Ok(data) | ||
@@ -45,6 +47,6 @@ } else { | ||
| pub async fn embedding(&self, input: &LlamaInvocation) -> Result<Vec<f64>, napi::Error> { | ||
| let context = &self.context; | ||
| let context_params_c = LlamaContextParams::or_default(&self.context_params); | ||
| let input_ctx = &self.context; | ||
| let embd_inp = tokenize( | ||
| input_ctx, | ||
| context, | ||
| input.prompt.as_str(), | ||
@@ -60,7 +62,7 @@ context_params_c.n_ctx as usize, | ||
| input_ctx | ||
| context | ||
| .llama_eval(embd_inp.as_slice(), embd_inp.len() as i32, 0, input) | ||
| .unwrap(); | ||
| let embeddings = input_ctx.llama_get_embeddings(); | ||
| let embeddings = context.llama_get_embeddings(); | ||
@@ -74,9 +76,14 @@ if let Ok(embeddings) = embeddings { | ||
| pub async fn inference(&self, input: &LlamaInvocation, callback: impl Fn(InferenceResult)) { | ||
| pub fn inference( | ||
| &self, | ||
| input: &LlamaInvocation, | ||
| running: Arc<Mutex<bool>>, | ||
| callback: impl Fn(InferenceResult), | ||
| ) { | ||
| let context = &self.context; | ||
| let context_params_c = LlamaContextParams::or_default(&self.context_params); | ||
| let input_ctx = &self.context; | ||
| // Tokenize the stop sequence and input prompt. | ||
| let tokenized_stop_prompt = input.stop_sequence.as_ref().map(|stop_sequence| { | ||
| tokenize( | ||
| input_ctx, | ||
| context, | ||
| stop_sequence, | ||
@@ -92,3 +99,3 @@ context_params_c.n_ctx as usize, | ||
| let tokenized_input = tokenize( | ||
| input_ctx, | ||
| context, | ||
| input.prompt.as_str(), | ||
@@ -104,4 +111,4 @@ context_params_c.n_ctx as usize, | ||
| // Evaluate the prompt in full. | ||
| input_ctx | ||
| // Feed prompt to the model. | ||
| context | ||
| .llama_eval( | ||
@@ -121,7 +128,16 @@ tokenized_input.as_slice(), | ||
| let mut completed = false; | ||
| while n_remaining > 0 { | ||
| let tok = input_ctx.llama_sample(embd.as_mut_slice(), input, &context_params_c); | ||
| // Check if we are aborted by caller. | ||
| let running = *running.blocking_lock(); | ||
| if !running { | ||
| break; | ||
| } | ||
| n_used += 1; | ||
| n_remaining -= 1; | ||
| let tok = context.llama_sample(embd.as_mut_slice(), input, &context_params_c); | ||
| embd[n_used] = tok; | ||
| if tok == token_eos { | ||
@@ -131,2 +147,4 @@ completed = true; | ||
| } | ||
| // If we are predicting a fixed number of tokens, check if we have reached that number. | ||
| if input.n_tok_predict != 0 | ||
@@ -143,2 +161,3 @@ && n_used > (input.n_tok_predict as usize) + tokenized_input.len() - 1 | ||
| // Check if we have reached the stop sequence. | ||
| if let Some(tokenized_stop_prompt) = &tokenized_stop_prompt { | ||
@@ -156,8 +175,4 @@ if tok == tokenized_stop_prompt[stop_sequence_i] { | ||
| input_ctx | ||
| .llama_eval(&embd[n_used..], 1, n_used as i32, input) | ||
| .unwrap(); | ||
| let output = input_ctx.llama_token_to_str(&embd[n_used]); | ||
| // We can output the token. | ||
| let output = context.llama_token_to_str(&embd[n_used]); | ||
| if let Some(output) = output { | ||
@@ -175,2 +190,7 @@ if stop_sequence_i == 0 { | ||
| } | ||
| // Continue feeding the token to the model. | ||
| context | ||
| .llama_eval(&embd[n_used..], 1, n_used as i32, input) | ||
| .unwrap(); | ||
| } | ||
@@ -177,0 +197,0 @@ |
+2
-31
| // use crate::output::Output; | ||
| use anyhow::Result; | ||
| use std::ffi::{CStr, CString}; | ||
| use std::ffi::CString; | ||
| use std::os::raw::c_char; | ||
| use llama_sys::{ | ||
| llama_token, llama_token_eos as inner_eos, llama_token_to_str, llama_tokenize, | ||
| }; | ||
| use llama_sys::{llama_token, llama_token_eos as inner_eos, llama_tokenize}; | ||
@@ -17,21 +15,2 @@ use crate::context::LLamaContext; | ||
| /// Converts a llama_token to a Rust String. | ||
| /// | ||
| /// # Arguments | ||
| /// | ||
| /// * `ctx` - A pointer to the llama_context. | ||
| /// * `token` - The llama_token to convert to a string. | ||
| /// | ||
| /// # Returns | ||
| /// | ||
| /// A Rust String representation of the given llama_token. | ||
| fn _to_output(context: &LLamaContext, token: i32) -> String { | ||
| let c_ptr = unsafe { llama_token_to_str(**context, token) }; | ||
| let native_string = unsafe { CStr::from_ptr(c_ptr) } | ||
| .to_str() | ||
| .unwrap() | ||
| .to_owned(); | ||
| native_string | ||
| } | ||
| pub fn llama_token_eos() -> i32 { | ||
@@ -94,9 +73,1 @@ unsafe { inner_eos() } | ||
| } | ||
| // Converts an embedding represented as a slice into the Output string. | ||
| // pub(crate) fn _embedding_to_output(context: &LLamaContext, embd: &[i32]) -> Output { | ||
| // embd.iter() | ||
| // .map(|token| _to_output(context, *token)) | ||
| // .fold("".to_string(), |cur, nxt| cur + &nxt) | ||
| // .into() | ||
| // } |
| import { LLama, LlamaInvocation } from "../index"; | ||
| import path from "path"; | ||
| const run = async () => { | ||
| const llama = await LLama.load( | ||
| path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"), | ||
| null, | ||
| true | ||
| ); | ||
| const template = `Who is the president of the United States?`; | ||
| const prompt = `A chat between a user and an assistant. | ||
| USER: ${template} | ||
| ASSISTANT:`; | ||
| const params: LlamaInvocation = { | ||
| nThreads: 4, | ||
| nTokPredict: 2048, | ||
| topK: 40, | ||
| topP: 0.1, | ||
| temp: 0.2, | ||
| repeatPenalty: 1, | ||
| prompt, | ||
| }; | ||
| llama.inference(params, (data) => { | ||
| process.stdout.write(data.data?.token ?? ""); | ||
| }); | ||
| }; | ||
| run(); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Shell access
Supply chain riskThis module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package
Filesystem access
Supply chain riskAccesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
AI-detected potential code anomaly
Supply chain riskAI has identified unusual behaviors that may pose a security risk.
Found 1 instance in 1 package
Shell access
Supply chain riskThis module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package
Filesystem access
Supply chain riskAccesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
27
3.85%503
6.12%14891783
-0.14%3
50%