@llama-node/llama-cpp
+31 -27
```diff
 import { LLama, LlamaContextParams, LlamaInvocation } from "../index";
 import path from "path";
-const llama = LLama.load(
-  path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
-  {
-    nCtx: 512,
-    nParts: -1,
-    seed: 0,
-    f16Kv: false,
-    logitsAll: false,
-    vocabOnly: false,
-    useMlock: false,
-    embedding: true,
-    useMmap: true,
-  },
-  false
-);
+const run = async () => {
+  const llama = await LLama.load(
+    path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
+    {
+      nCtx: 512,
+      nParts: -1,
+      seed: 0,
+      f16Kv: false,
+      logitsAll: false,
+      vocabOnly: false,
+      useMlock: false,
+      embedding: true,
+      useMmap: true,
+    },
+    false
+  );
-const prompt = `Who is the president of the United States?`;
+  const prompt = `Who is the president of the United States?`;
-const params: LlamaInvocation = {
-  nThreads: 4,
-  nTokPredict: 2048,
-  topK: 40,
-  topP: 0.1,
-  temp: 0.2,
-  repeatPenalty: 1,
-  prompt,
-};
+  const params: LlamaInvocation = {
+    nThreads: 4,
+    nTokPredict: 2048,
+    topK: 40,
+    topP: 0.1,
+    temp: 0.2,
+    repeatPenalty: 1,
+    prompt,
+  };
-llama.getWordEmbedding(params, (data) => {
-  console.log(data.data);
-});
+  llama.getWordEmbedding(params).then((data) => {
+    console.log(data);
+  });
+};
+run();
```
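The embedding example above now awaits `LLama.load` and receives the embedding vector directly from the resolved Promise, where the 0.0.37 callback delivered an `EmbeddingResult` wrapper and the example read `data.data`. A minimal sketch of the same call using async/await with basic error handling; the module specifier, helper name, and error message are illustrative (the repo example imports from a relative `../index`):

```ts
import { LLama, LlamaInvocation } from "@llama-node/llama-cpp";
import path from "path";

const embed = async (modelPath: string, prompt: string): Promise<number[]> => {
  const llama = await LLama.load(
    modelPath,
    {
      nCtx: 512,
      nParts: -1,
      seed: 0,
      f16Kv: false,
      logitsAll: false,
      vocabOnly: false,
      useMlock: false,
      embedding: true, // enabled in the repo example so embeddings are produced
      useMmap: true,
    },
    false
  );
  const params: LlamaInvocation = {
    nThreads: 4,
    nTokPredict: 2048,
    topK: 40,
    topP: 0.1,
    temp: 0.2,
    repeatPenalty: 1,
    prompt,
  };
  // In 0.1.0 this resolves with a plain number[] instead of invoking a callback.
  return llama.getWordEmbedding(params);
};

embed(path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"), "Hello world")
  .then((vector) => console.log("embedding length:", vector.length))
  .catch((err) => console.error("embedding failed:", err));
```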
+22 -18
```diff
 import { LLama, LlamaInvocation } from "../index";
 import path from "path";
-const llama = LLama.load(
-  path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
-  null,
-  true
-);
+const run = async () => {
+  const llama = await LLama.load(
+    path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
+    null,
+    true
+  );
-const template = `Who is the president of the United States?`;
+  const template = `Who is the president of the United States?`;
-const prompt = `A chat between a user and an assistant.
+  const prompt = `A chat between a user and an assistant.
 USER: ${template}
 ASSISTANT:`;
-const params: LlamaInvocation = {
-  nThreads: 4,
-  nTokPredict: 2048,
-  topK: 40,
-  topP: 0.1,
-  temp: 0.2,
-  repeatPenalty: 1,
-  prompt,
-};
+  const params: LlamaInvocation = {
+    nThreads: 4,
+    nTokPredict: 2048,
+    topK: 40,
+    topP: 0.1,
+    temp: 0.2,
+    repeatPenalty: 1,
+    prompt,
+  };
-llama.inference(params, (data) => {
-  process.stdout.write(data.data?.token ?? "");
-});
+  llama.inference(params, (data) => {
+    process.stdout.write(data.data?.token ?? "");
+  });
+};
+run();
```
+13 -9
```diff
 import { LLama } from "../index";
 import path from "path";
-const llama = LLama.load(
-  path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
-  null,
-  false
-);
+const run = async () => {
+  const llama = await LLama.load(
+    path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
+    null,
+    false
+  );
-const template = `Who is the president of the United States?`;
+  const template = `Who is the president of the United States?`;
-llama.tokenize(template, 2048, (data) => {
-  console.log(data.data);
-});
+  llama.tokenize(template, 2048).then((data) => {
+    console.log(data);
+  });
+};
+run();
```
+18 -34
```diff
@@ -6,2 +6,16 @@ /* tslint:disable */
+export interface InferenceToken {
+  token: string
+  completed: boolean
+}
+export const enum InferenceResultType {
+  Error = 'Error',
+  Data = 'Data',
+  End = 'End'
+}
+export interface InferenceResult {
+  type: InferenceResultType
+  data?: InferenceToken
+  message?: string
+}
 export interface LlamaInvocation {
@@ -34,37 +48,7 @@ nThreads: number
 }
-export const enum TokenizeResultType {
-  Error = 'Error',
-  Data = 'Data'
-}
-export interface TokenizeResult {
-  type: TokenizeResultType
-  data: Array<number>
-}
-export interface InferenceToken {
-  token: string
-  completed: boolean
-}
-export const enum InferenceResultType {
-  Error = 'Error',
-  Data = 'Data',
-  End = 'End'
-}
-export interface InferenceResult {
-  type: InferenceResultType
-  data?: InferenceToken
-  message?: string
-}
-export const enum EmbeddingResultType {
-  Error = 'Error',
-  Data = 'Data'
-}
-export interface EmbeddingResult {
-  type: EmbeddingResultType
-  data: Array<number>
-}
 export class LLama {
-  static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): LLama
-  getWordEmbedding(input: LlamaInvocation, callback: (result: EmbeddingResult) => void): void
-  tokenize(params: string, nCtx: number, callback: (result: TokenizeResult) => void): void
-  inference(input: LlamaInvocation, callback: (result: InferenceResult) => void): void
+  static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): Promise<LLama>
+  getWordEmbedding(params: LlamaInvocation): Promise<Array<number>>
+  tokenize(params: string, nCtx: number): Promise<Array<number>>
+  inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): void
 }
```
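Putting the new index.d.ts surface together: `load`, `getWordEmbedding`, and `tokenize` now return Promises, while `inference` keeps a callback for streaming tokens. A minimal sketch of driving the 0.1.0 API; the model path and sampling values are placeholders taken from the examples above, and importing from the package name (rather than the repo's relative `../index`) is an assumption:

```ts
import { LLama, LlamaInvocation, InferenceResultType } from "@llama-node/llama-cpp";
import path from "path";

const main = async () => {
  // load() now resolves to an LLama instance instead of returning one synchronously.
  const llama = await LLama.load(
    path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
    null, // use default LlamaContextParams
    false // enableLogger
  );

  const params: LlamaInvocation = {
    nThreads: 4,
    nTokPredict: 2048,
    topK: 40,
    topP: 0.1,
    temp: 0.2,
    repeatPenalty: 1,
    prompt: "Who is the president of the United States?",
  };

  // tokenize() and getWordEmbedding() now resolve with plain number arrays.
  const tokens = await llama.tokenize(params.prompt, 2048);
  console.log("token count:", tokens.length);

  // inference() still streams results through a callback.
  llama.inference(params, (result) => {
    if (result.type === InferenceResultType.Data) {
      process.stdout.write(result.data?.token ?? "");
    }
  });
};

main();
```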
+1 -3
```diff
@@ -255,7 +255,5 @@ /* tslint:disable */
-const { TokenizeResultType, InferenceResultType, EmbeddingResultType, LLama } = nativeBinding
+const { InferenceResultType, LLama } = nativeBinding
-module.exports.TokenizeResultType = TokenizeResultType
 module.exports.InferenceResultType = InferenceResultType
-module.exports.EmbeddingResultType = EmbeddingResultType
 module.exports.LLama = LLama
```
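Since `TokenizeResultType` and `EmbeddingResultType` are no longer exported, downstream imports need a matching trim; a sketch of the before/after import, with the package-name module specifier assumed:

```ts
// 0.0.37 (no longer available in 0.1.0):
// import { TokenizeResultType, EmbeddingResultType, InferenceResultType, LLama } from "@llama-node/llama-cpp";

// 0.1.0 — tokenize() and getWordEmbedding() resolve with plain arrays,
// so only InferenceResultType is still exported alongside LLama:
import { InferenceResultType, LLama } from "@llama-node/llama-cpp";
```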
+1 -1
```diff
 {
   "name": "@llama-node/llama-cpp",
-  "version": "0.0.37",
+  "version": "0.1.0",
   "main": "index.js",
@@ -5,0 +5,0 @@ "types": "index.d.ts",
```
+3 -34
```diff
@@ -14,36 +14,4 @@ use std::{ffi::CStr, ptr::null_mut, slice};
-#[napi(object)]
-#[derive(Debug, Clone)]
-pub struct LlamaInvocation {
-    pub n_threads: i32,
-    pub n_tok_predict: i32,
-    pub top_k: i32, // 40
-    pub top_p: Option<f64>, // default 0.95f, 1.0 = disabled
-    pub tfs_z: Option<f64>, // default 1.00f, 1.0 = disabled
-    pub temp: Option<f64>, // default 0.80f, 1.0 = disabled
-    pub typical_p: Option<f64>, // default 1.00f, 1.0 = disabled
-    pub repeat_penalty: Option<f64>, // default 1.10f, 1.0 = disabled
-    pub repeat_last_n: Option<i32>, // default 64, last n tokens to penalize (0 = disable penalty, -1 = context size)
-    pub frequency_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
-    pub presence_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
-    pub stop_sequence: Option<String>,
-    pub penalize_nl: Option<bool>,
-    pub prompt: String,
-}
+use crate::types::{LlamaContextParams, LlamaInvocation};
-// Represents the configuration parameters for a LLamaContext.
-#[napi(object)]
-#[derive(Debug, Clone)]
-pub struct LlamaContextParams {
-    pub n_ctx: i32,
-    pub n_parts: i32,
-    pub seed: i32,
-    pub f16_kv: bool,
-    pub logits_all: bool,
-    pub vocab_only: bool,
-    pub use_mlock: bool,
-    pub embedding: bool,
-    pub use_mmap: bool,
-}
 impl LlamaContextParams {
@@ -78,2 +46,3 @@ // Returns the default parameters or the user-specified parameters.
 // Represents the LLamaContext which wraps FFI calls to the llama.cpp library.
+#[derive(Clone)]
 pub struct LLamaContext {
@@ -85,3 +54,3 @@ ctx: *mut llama_context,
 // Creates a new LLamaContext from the specified file and configuration parameters.
-pub fn from_file_and_params(path: &str, params: &Option<LlamaContextParams>) -> Self {
+pub async fn from_file_and_params(path: &str, params: &Option<LlamaContextParams>) -> Self {
 let params = LlamaContextParams::or_default(params);
@@ -88,0 +57,0 @@ let ctx = unsafe { llama_init_from_file(path.as_ptr() as *const i8, params) };
```
+28 -110
```diff
@@ -11,9 +11,5 @@ #![deny(clippy::all)]
-use std::{
-    sync::{mpsc::channel, Arc},
-    thread, time,
-};
+use std::sync::Arc;
-use context::{LlamaContextParams, LlamaInvocation};
-use llama::LLamaChannel;
+use llama::LLamaInternal;
 use napi::{
@@ -26,7 +22,8 @@ bindgen_prelude::*,
 };
-use types::{EmbeddingResult, InferenceResult, TokenizeResult};
+use tokio::sync::Mutex;
+use types::{InferenceResult, LlamaContextParams, LlamaInvocation};
 #[napi]
 pub struct LLama {
-    llama_channel: Arc<LLamaChannel>,
+    llama: Arc<Mutex<LLamaInternal>>,
 }
@@ -37,3 +34,3 @@
     #[napi]
-    pub fn load(
+    pub async fn load(
         path: String,
@@ -50,89 +47,17 @@ params: Option<LlamaContextParams>,
-        let (load_result_sender, load_result_receiver) = channel::<bool>();
-        let llama_channel = LLamaChannel::new(path, params, load_result_sender, enable_logger);
-        'waiting_load: loop {
-            let recv = load_result_receiver.recv();
-            match recv {
-                Ok(r) => {
-                    if !r {
-                        return Err(Error::new(Status::InvalidArg, "Load error".to_string()));
-                    }
-                    break 'waiting_load;
-                }
-                _ => {
-                    thread::yield_now();
-                }
-            }
-        }
-        Ok(Self { llama_channel })
+        Ok(Self {
+            llama: LLamaInternal::load(path, params, enable_logger).await,
+        })
     }
     #[napi]
-    pub fn get_word_embedding(
-        &self,
-        input: LlamaInvocation,
-        #[napi(ts_arg_type = "(result: EmbeddingResult) => void")] callback: JsFunction,
-    ) -> Result<()> {
-        let tsfn: ThreadsafeFunction<EmbeddingResult, ErrorStrategy::Fatal> =
-            callback.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))?;
-        let (embeddings_sender, embeddings_receiver) = channel();
-        let llama_channel = self.llama_channel.clone();
-        llama_channel.embedding(input, embeddings_sender);
-        thread::spawn(move || {
-            loop {
-                let result = embeddings_receiver.recv();
-                match result {
-                    Ok(result) => {
-                        tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
-                    }
-                    Err(_) => {
-                        break;
-                    }
-                }
-            }
-            thread::sleep(time::Duration::from_millis(300)); // wait for end signal
-            tsfn.abort().unwrap();
-        });
-        Ok(())
+    pub async fn get_word_embedding(&self, params: LlamaInvocation) -> Result<Vec<f64>> {
+        let llama = self.llama.lock().await;
+        llama.embedding(&params).await
     }
     #[napi]
-    pub fn tokenize(
-        &self,
-        params: String,
-        n_ctx: i32,
-        #[napi(ts_arg_type = "(result: TokenizeResult) => void")] callback: JsFunction,
-    ) -> Result<()> {
-        let (tokenize_sender, tokenize_receiver) = channel::<TokenizeResult>();
-        let tsfn: ThreadsafeFunction<TokenizeResult, ErrorStrategy::Fatal> = callback
-            .create_threadsafe_function(0, |ctx: ThreadSafeCallContext<TokenizeResult>| {
-                Ok(vec![ctx.value])
-            })?;
-        let llama_channel = self.llama_channel.clone();
-        llama_channel.tokenize(params, n_ctx as usize, tokenize_sender);
-        thread::spawn(move || {
-            'waiting_tokenize: loop {
-                let recv = tokenize_receiver.recv();
-                match recv {
-                    Ok(callback) => {
-                        tsfn.call(callback, ThreadsafeFunctionCallMode::Blocking);
-                        break 'waiting_tokenize;
-                    }
-                    _ => {
-                        thread::yield_now();
-                    }
-                }
-            }
-            thread::sleep(time::Duration::from_millis(300)); // wait for end signal
-            tsfn.abort().unwrap();
-        });
-        Ok(())
+    pub async fn tokenize(&self, params: String, n_ctx: i32) -> Result<Vec<i32>> {
+        let llama = self.llama.lock().await;
+        llama.tokenize(&params, n_ctx as usize).await
    }
@@ -143,26 +68,19 @@
         &self,
-        input: LlamaInvocation,
+        params: LlamaInvocation,
         #[napi(ts_arg_type = "(result: InferenceResult) => void")] callback: JsFunction,
     ) -> Result<()> {
-        let tsfn: ThreadsafeFunction<InferenceResult, ErrorStrategy::Fatal> =
-            callback.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))?;
-        let (inference_sender, inference_receiver) = channel();
-        let llama_channel = self.llama_channel.clone();
+        let tsfn: ThreadsafeFunction<InferenceResult, ErrorStrategy::Fatal> = callback
+            .create_threadsafe_function(0, |ctx: ThreadSafeCallContext<InferenceResult>| {
+                Ok(vec![ctx.value])
+            })?;
-        llama_channel.inference(input, inference_sender);
+        let llama = self.llama.clone();
-        thread::spawn(move || {
-            loop {
-                let result = inference_receiver.recv();
-                match result {
-                    Ok(result) => {
-                        tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
-                    }
-                    Err(_) => {
-                        break;
-                    }
-                }
-            }
-            thread::sleep(time::Duration::from_millis(300)); // wait for end signal
-            tsfn.abort().unwrap();
+        tokio::spawn(async move {
+            let llama = llama.lock().await;
+            llama
+                .inference(&params, |result| {
+                    tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
+                })
+                .await;
         });
@@ -169,0 +87,0 @@
```
+53 -166
```diff
@@ -1,24 +0,12 @@
-use std::{
-    sync::{
-        mpsc::{channel, Receiver, Sender, TryRecvError},
-        Arc, Mutex,
-    },
-    thread,
-};
+use std::sync::Arc;
+use tokio::sync::Mutex;
 use crate::{
-    context::{LLamaContext, LlamaContextParams, LlamaInvocation},
+    context::{LLamaContext},
     tokenizer::{llama_token_eos, tokenize},
-    types::{
-        EmbeddingResult, EmbeddingResultType, InferenceResult, InferenceResultType, InferenceToken,
-        LLamaCommand, TokenizeResult, TokenizeResultType,
-    },
+    types::{InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation},
 };
-#[derive(Clone)]
-pub struct LLamaChannel {
-    command_sender: Sender<LLamaCommand>,
-    command_receiver: Arc<Mutex<Receiver<LLamaCommand>>>,
-}
 pub struct LLamaInternal {
@@ -30,21 +18,27 @@ context: LLamaContext,
 impl LLamaInternal {
-    pub fn tokenize(&self, input: &str, n_ctx: usize, sender: &Sender<TokenizeResult>) {
+    pub async fn load(
+        path: String,
+        params: Option<LlamaContextParams>,
+        enable_logger: bool,
+    ) -> Arc<Mutex<Self>> {
+        let llama = LLamaInternal {
+            context: LLamaContext::from_file_and_params(&path, &params).await,
+            context_params: params,
+        };
+        if enable_logger {
+            llama.context.llama_print_system_info();
+        }
+        Arc::new(Mutex::new(llama))
+    }
+    pub async fn tokenize(&self, input: &str, n_ctx: usize) -> Result<Vec<i32>, napi::Error> {
         if let Ok(data) = tokenize(&self.context, input, n_ctx, false) {
-            sender
-                .send(TokenizeResult {
-                    data,
-                    r#type: TokenizeResultType::Data,
-                })
-                .unwrap();
+            Ok(data)
         } else {
-            sender
-                .send(TokenizeResult {
-                    data: vec![],
-                    r#type: TokenizeResultType::Error,
-                })
-                .unwrap();
+            Err(napi::Error::from_reason("Failed to tokenize"))
         }
     }
-    pub fn embedding(&self, input: &LlamaInvocation, sender: &Sender<EmbeddingResult>) {
+    pub async fn embedding(&self, input: &LlamaInvocation) -> Result<Vec<f64>, napi::Error> {
         let context_params_c = LlamaContextParams::or_default(&self.context_params);
@@ -71,19 +65,9 @@ let input_ctx = &self.context;
         if let Ok(embeddings) = embeddings {
-            sender
-                .send(EmbeddingResult {
-                    r#type: EmbeddingResultType::Data,
-                    data: embeddings.iter().map(|&x| x as f64).collect(),
-                })
-                .unwrap();
+            Ok(embeddings.iter().map(|&x| x as f64).collect())
         } else {
-            sender
-                .send(EmbeddingResult {
-                    r#type: EmbeddingResultType::Error,
-                    data: vec![],
-                })
-                .unwrap();
+            Err(napi::Error::from_reason("Failed to get embeddings"))
         }
     }
-    pub fn inference(&self, input: &LlamaInvocation, sender: &Sender<InferenceResult>) {
+    pub async fn inference(&self, input: &LlamaInvocation, callback: impl Fn(InferenceResult)) {
         let context_params_c = LlamaContextParams::or_default(&self.context_params);
@@ -144,9 +128,7 @@ let input_ctx = &self.context;
             {
-                sender
-                    .send(InferenceResult {
-                        r#type: InferenceResultType::Error,
-                        data: None,
-                        message: Some("Too many tokens predicted".to_string()),
-                    })
-                    .unwrap();
+                callback(InferenceResult {
+                    r#type: InferenceResultType::Error,
+                    data: None,
+                    message: Some("Too many tokens predicted".to_string()),
+                });
                 break;
@@ -175,12 +157,10 @@ }
             if stop_sequence_i == 0 {
-                sender
-                    .send(InferenceResult {
-                        r#type: InferenceResultType::Data,
-                        data: Some(InferenceToken {
-                            token: output,
-                            completed: false,
-                        }),
-                        message: None,
-                    })
-                    .unwrap();
+                callback(InferenceResult {
+                    r#type: InferenceResultType::Data,
+                    data: Some(InferenceToken {
+                        token: output,
+                        completed: false,
+                    }),
+                    message: None,
+                });
             }
@@ -191,111 +171,18 @@ }
         if completed {
-            sender
-                .send(InferenceResult {
-                    r#type: InferenceResultType::Data,
-                    data: Some(InferenceToken {
-                        token: "\n\n<end>\n".to_string(),
-                        completed: true,
-                    }),
-                    message: None,
-                })
-                .unwrap();
+            callback(InferenceResult {
+                r#type: InferenceResultType::Data,
+                data: Some(InferenceToken {
+                    token: "\n\n<end>\n".to_string(),
+                    completed: true,
+                }),
+                message: None,
+            });
         }
-        sender
-            .send(InferenceResult {
-                r#type: InferenceResultType::End,
-                data: None,
-                message: None,
-            })
-            .unwrap();
+        callback(InferenceResult {
+            r#type: InferenceResultType::End,
+            data: None,
+            message: None,
+        });
         // embedding_to_output(
         //     input_ctx,
         //     &embd[tokenized_input.len()..n_used + 1 - stop_sequence_i],
         // );
     }
 }
-impl LLamaChannel {
-    pub fn new(
-        path: String,
-        params: Option<LlamaContextParams>,
-        load_result_sender: Sender<bool>,
-        enable_logger: bool,
-    ) -> Arc<Self> {
-        let (command_sender, command_receiver) = channel::<LLamaCommand>();
-        let channel = LLamaChannel {
-            command_receiver: Arc::new(Mutex::new(command_receiver)),
-            command_sender,
-        };
-        channel.spawn(path, params, load_result_sender, enable_logger);
-        Arc::new(channel)
-    }
-    pub fn tokenize(&self, input: String, n_ctx: usize, sender: Sender<TokenizeResult>) {
-        self.command_sender
-            .send(LLamaCommand::Tokenize(input, n_ctx, sender))
-            .unwrap();
-    }
-    pub fn embedding(&self, params: LlamaInvocation, sender: Sender<EmbeddingResult>) {
-        self.command_sender
-            .send(LLamaCommand::Embedding(params, sender))
-            .unwrap();
-    }
-    pub fn inference(&self, params: LlamaInvocation, sender: Sender<InferenceResult>) {
-        self.command_sender
-            .send(LLamaCommand::Inference(params, sender))
-            .unwrap();
-    }
-    // llama instance main loop
-    pub fn spawn(
-        &self,
-        path: String,
-        params: Option<LlamaContextParams>,
-        load_result_sender: Sender<bool>,
-        enable_logger: bool,
-    ) {
-        let rv = self.command_receiver.clone();
-        thread::spawn(move || {
-            let llama = LLamaInternal {
-                context: LLamaContext::from_file_and_params(&path, &params),
-                context_params: params,
-            };
-            if enable_logger {
-                llama.context.llama_print_system_info();
-            }
-            load_result_sender.send(true).unwrap();
-            let rv = rv.lock().unwrap();
-            'llama_loop: loop {
-                let command = rv.try_recv();
-                match command {
-                    Ok(LLamaCommand::Inference(params, sender)) => {
-                        llama.inference(&params, &sender);
-                    }
-                    Ok(LLamaCommand::Embedding(params, sender)) => {
-                        llama.embedding(&params, &sender);
-                    }
-                    Ok(LLamaCommand::Tokenize(text, n_ctx, sender)) => {
-                        llama.tokenize(&text, n_ctx, &sender);
-                    }
-                    Err(TryRecvError::Disconnected) => {
-                        break 'llama_loop;
-                    }
-                    _ => {
-                        thread::yield_now();
-                    }
-                }
-            }
-        });
-    }
-}
```
+29 -28
```diff
@@ -1,25 +0,4 @@
-use crate::context::LlamaInvocation;
 use napi::bindgen_prelude::*;
-use std::sync::mpsc::Sender;
-#[derive(Clone, Debug)]
-pub enum LLamaCommand {
-    Inference(LlamaInvocation, Sender<InferenceResult>),
-    Tokenize(String, usize, Sender<TokenizeResult>),
-    Embedding(LlamaInvocation, Sender<EmbeddingResult>),
-}
-#[napi(string_enum)]
-pub enum TokenizeResultType {
-    Error,
-    Data,
-}
-#[napi(object)]
-pub struct TokenizeResult {
-    pub r#type: TokenizeResultType,
-    pub data: Vec<i32>,
-}
 #[napi(object)]
 #[derive(Clone, Debug)]
@@ -45,12 +24,34 @@ pub struct InferenceToken {
-#[napi(string_enum)]
-pub enum EmbeddingResultType {
-    Error,
-    Data,
+#[napi(object)]
+#[derive(Debug, Clone)]
+pub struct LlamaInvocation {
+    pub n_threads: i32,
+    pub n_tok_predict: i32,
+    pub top_k: i32, // 40
+    pub top_p: Option<f64>, // default 0.95f, 1.0 = disabled
+    pub tfs_z: Option<f64>, // default 1.00f, 1.0 = disabled
+    pub temp: Option<f64>, // default 0.80f, 1.0 = disabled
+    pub typical_p: Option<f64>, // default 1.00f, 1.0 = disabled
+    pub repeat_penalty: Option<f64>, // default 1.10f, 1.0 = disabled
+    pub repeat_last_n: Option<i32>, // default 64, last n tokens to penalize (0 = disable penalty, -1 = context size)
+    pub frequency_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
+    pub presence_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
+    pub stop_sequence: Option<String>,
+    pub penalize_nl: Option<bool>,
+    pub prompt: String,
 }
+// Represents the configuration parameters for a LLamaContext.
 #[napi(object)]
-pub struct EmbeddingResult {
-    pub r#type: EmbeddingResultType,
-    pub data: Vec<f64>,
+#[derive(Debug, Clone)]
+pub struct LlamaContextParams {
+    pub n_ctx: i32,
+    pub n_parts: i32,
+    pub seed: i32,
+    pub f16_kv: bool,
+    pub logits_all: bool,
+    pub vocab_only: bool,
+    pub use_mlock: bool,
+    pub embedding: bool,
+    pub use_mmap: bool,
 }
```
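The `LlamaInvocation` and `LlamaContextParams` structs above are `#[napi(object)]`, so they surface to JavaScript as plain objects with camelCase keys; the inline comments give the sampling defaults used when an optional field is omitted. A sketch of a fully spelled-out invocation and context configuration on the JS side, assuming the package-name import; values other than the documented defaults are illustrative:

```ts
import { LlamaInvocation, LlamaContextParams } from "@llama-node/llama-cpp";

const contextParams: LlamaContextParams = {
  nCtx: 512,       // context window size
  nParts: -1,      // -1 lets llama.cpp decide
  seed: 0,
  f16Kv: false,
  logitsAll: false,
  vocabOnly: false,
  useMlock: false,
  embedding: true, // needed if getWordEmbedding will be called
  useMmap: true,
};

const invocation: LlamaInvocation = {
  nThreads: 4,
  nTokPredict: 2048,
  topK: 40,
  topP: 0.95,           // default 0.95, 1.0 = disabled
  tfsZ: 1.0,            // default 1.0 = disabled
  temp: 0.8,            // default 0.8
  typicalP: 1.0,        // default 1.0 = disabled
  repeatPenalty: 1.1,   // default 1.1, 1.0 = disabled
  repeatLastN: 64,      // last n tokens to penalize; 0 disables, -1 = context size
  frequencyPenalty: 0.0,
  presencePenalty: 0.0,
  stopSequence: "USER:", // illustrative stop sequence
  penalizeNl: true,
  prompt: "Who is the president of the United States?",
};
```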
Shell access
Supply chain risk: This module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package

Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package