New Research: Supply Chain Attack on Axios Pulls Malicious Dependency from npm. Details
Socket
Book a Demo · Sign in
Socket

@llama-node/llama-cpp

Package Overview
Dependencies
Maintainers
1
Versions
25
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@llama-node/llama-cpp - npm Package Compare versions

Comparing version
0.1.5
to
0.1.6
+4
-3
Cargo.toml

@@ -17,5 +17,6 @@ [package]

llama-sys = { path = "./llama-sys" }
# llm-chain-llama-sys = { path = "../../../llm-chain/llm-chain-llama/sys" }
napi = { version = "2.12.2", default-features = false, features = ["napi6", "async"] }
napi-derive = "2.12.2"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
napi = { version = "2.12.4", default-features = false, features = ["napi6", "async", "serde-json"] }
napi-derive = "2.12.3"
tokio = { version = "1.26.0", features = ["full"] }

@@ -22,0 +23,0 @@ futures = "0.3"

@@ -1,2 +0,2 @@

import { LLama, LlamaInvocation } from "../index";
import { LLama, Generate } from "../index";
import path from "path";

@@ -6,4 +6,5 @@

const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
null,
{
modelPath: path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
},
true

@@ -18,3 +19,3 @@ );

const params: LlamaInvocation = {
const params: Generate = {
nThreads: 4,

@@ -21,0 +22,0 @@ nTokPredict: 2048,

@@ -1,2 +0,2 @@

import { LLama, LlamaContextParams, LlamaInvocation } from "../index";
import { LLama, Generate } from "../index";
import path from "path";

@@ -6,6 +6,5 @@

const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
{
modelPath: path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
nCtx: 512,
nParts: -1,
nGpuLayers: 0,

@@ -25,3 +24,3 @@ seed: 0,

const params: LlamaInvocation = {
const params: Generate = {
nThreads: 4,

@@ -28,0 +27,0 @@ nTokPredict: 2048,

import { InferenceResultType } from "../index";
import { LLama, LlamaInvocation } from "../index";
import { LLama, Generate } from "../index";
import path from "path";

@@ -7,7 +7,6 @@

const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
{
modelPath: path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
nGpuLayers: 32,
nCtx: 1024,
nParts: 1,
seed: 0,

@@ -30,3 +29,3 @@ f16Kv: false,

const params: LlamaInvocation = {
const params: Generate = {
nThreads: 4,

@@ -33,0 +32,0 @@ nTokPredict: 2048,

@@ -6,4 +6,5 @@ import { LLama } from "../index";

const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
null,
{
modelPath: path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
},
false

@@ -10,0 +11,0 @@ );

@@ -20,21 +20,103 @@ /* tslint:disable */

}
export interface LlamaInvocation {
export interface LogitBias {
token: number
bias: number
}
export interface Generate {
nThreads: number
nTokPredict: number
topK: number
/**
* logit bias for specific tokens
* Default: None
*/
logitBias?: Array<LogitBias>
/**
* top k tokens to sample from
* Range: <= 0 to use vocab size
* Default: 40
*/
topK?: number
/**
* top p tokens to sample from
* Default: 0.95
* 1.0 = disabled
*/
topP?: number
/**
* tail free sampling
* Default: 1.0
* 1.0 = disabled
*/
tfsZ?: number
/**
* temperature
* Default: 0.80
* 1.0 = disabled
*/
temp?: number
/**
* locally typical sampling
* Default: 1.0
* 1.0 = disabled
*/
typicalP?: number
/**
* repeat penalty
* Default: 1.10
* 1.0 = disabled
*/
repeatPenalty?: number
/**
* last n tokens to penalize
* Default: 64
* 0 = disable penalty, -1 = context size
*/
repeatLastN?: number
/**
* frequency penalty
* Default: 0.00
* 1.0 = disabled
*/
frequencyPenalty?: number
/**
* presence penalty
* Default: 0.00
* 1.0 = disabled
*/
presencePenalty?: number
/**
* Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
* Mirostat: A Neural Text Decoding Algorithm that Directly Controls Perplexity
* Default: 0
* 0 = disabled
* 1 = mirostat 1.0
* 2 = mirostat 2.0
*/
mirostat?: number
/**
* The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
* Default: 5.0
*/
mirostatTau?: number
/**
* The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
* Default: 0.1
*/
mirostatEta?: number
/**
* stop sequence
* Default: None
*/
stopSequence?: string
/**
* consider newlines as a repeatable token
* Default: true
*/
penalizeNl?: boolean
/** prompt */
prompt: string
}
export interface LlamaContextParams {
export interface ModelLoad {
modelPath: string
nCtx: number
nParts: number
nGpuLayers: number

@@ -56,6 +138,6 @@ seed: number

export class LLama {
static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): Promise<LLama>
getWordEmbedding(params: LlamaInvocation): Promise<Array<number>>
static load(params: Partial<LoadModel>, enableLogger: boolean): Promise<LLama>
getWordEmbedding(params: Generate): Promise<Array<number>>
tokenize(params: string): Promise<Array<number>>
inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): () => void
inference(params: Generate, callback: (result: InferenceResult) => void): () => void
}
{
"name": "@llama-node/llama-cpp",
"version": "0.1.5",
"version": "0.1.6",
"main": "index.js",

@@ -5,0 +5,0 @@ "types": "index.d.ts",

@@ -164,3 +164,3 @@ import { exec, execSync } from "child_process";

const run = async () => {
console.log("Checking environment...\n...");
console.log("Checking environment...\n");
checkEnv();

@@ -167,0 +167,0 @@

@@ -5,42 +5,13 @@ use std::{ffi::CStr, ptr::null_mut, slice};

use llama_sys::{
llama_apply_lora_from_file, llama_context, llama_context_default_params, llama_context_params,
llama_eval, llama_free, llama_get_embeddings, llama_get_logits, llama_init_from_file,
llama_n_embd, llama_n_vocab, llama_print_system_info,
llama_sample_frequency_and_presence_penalties, llama_sample_repetition_penalty,
llama_sample_tail_free, llama_sample_temperature, llama_sample_token,
llama_sample_token_greedy, llama_sample_top_k, llama_sample_top_p, llama_sample_typical,
llama_apply_lora_from_file, llama_context, llama_context_params, llama_eval, llama_free,
llama_get_embeddings, llama_get_logits, llama_init_from_file, llama_n_embd, llama_n_vocab,
llama_print_system_info, llama_sample_frequency_and_presence_penalties,
llama_sample_repetition_penalty, llama_sample_tail_free, llama_sample_temperature,
llama_sample_token, llama_sample_token_greedy, llama_sample_token_mirostat,
llama_sample_token_mirostat_v2, llama_sample_top_k, llama_sample_top_p, llama_sample_typical,
llama_token, llama_token_data, llama_token_data_array, llama_token_nl, llama_token_to_str,
};
use crate::types::{LlamaContextParams, LlamaInvocation};
use crate::types::{Generate, ModelLoad};
impl LlamaContextParams {
// Returns the default parameters or the user-specified parameters.
// If the caller supplied params they are converted into the raw FFI struct;
// otherwise llama.cpp's own defaults are requested via FFI.
pub fn or_default(params: &Option<LlamaContextParams>) -> llama_context_params {
match params {
// `clone()` is required because `into()` consumes the value and we only
// hold a shared reference to the Option's contents.
Some(params) => params.clone().into(),
// SAFETY: plain FFI call into llama.cpp that fills a params struct;
// assumed side-effect free — NOTE(review): confirm against llama-sys.
None => unsafe { llama_context_default_params() },
}
}
}
impl From<LlamaContextParams> for llama_context_params {
// Field-by-field mapping from the user-facing (napi-exposed) parameter
// struct onto the raw llama.cpp FFI struct.
fn from(params: LlamaContextParams) -> Self {
llama_context_params {
n_ctx: params.n_ctx,
n_parts: params.n_parts,
n_gpu_layers: params.n_gpu_layers,
seed: params.seed,
f16_kv: params.f16_kv,
logits_all: params.logits_all,
vocab_only: params.vocab_only,
use_mmap: params.use_mmap,
use_mlock: params.use_mlock,
embedding: params.embedding,
// Load-progress reporting is not surfaced to the JS side, so the
// callback and its user-data pointer are left unset.
progress_callback: None,
progress_callback_user_data: null_mut(),
}
}
}
// Represents the LLamaContext which wraps FFI calls to the llama.cpp library.

@@ -53,9 +24,8 @@ pub struct LLamaContext {

// Creates a new LLamaContext from the specified file and configuration parameters.
pub async fn from_file_and_params(
path: &str,
params: &Option<LlamaContextParams>,
) -> Result<Self, napi::Error> {
let lora_params = params.as_ref().and_then(|p| p.lora.clone());
let params = LlamaContextParams::or_default(params);
let ctx = unsafe { llama_init_from_file(path.as_ptr() as *const i8, params) };
pub async fn from_file_and_params(params: &ModelLoad) -> Result<Self, napi::Error> {
let lora_params = &params.lora;
let context_params = ModelLoad::to_llama_context_params(params);
let ctx = unsafe {
llama_init_from_file(params.model_path.as_ptr() as *const i8, context_params)
};

@@ -65,3 +35,3 @@ if ctx.is_null() {

"Failed to initialize LLama context from file: {}",
path
params.model_path,
)));

@@ -109,3 +79,3 @@ }

last_n_tokens: &mut [llama_token],
input: &LlamaInvocation,
input: &Generate,
context_params: &llama_context_params,

@@ -115,6 +85,7 @@ ) -> i32 {

let top_p = input.top_p.unwrap_or(0.95) as f32;
let top_k = if input.top_k <= 0 {
let top_k = input.top_k.unwrap_or(40);
let top_k = if top_k <= 0 {
unsafe { llama_n_vocab(self.ctx) }
} else {
input.top_k
top_k
};

@@ -135,2 +106,9 @@ let tfs_z = input.tfs_z.unwrap_or(1.0) as f32;

let empty_logit_bias = Vec::new();
let logit_bias = input.logit_bias.as_ref().unwrap_or(&empty_logit_bias);
let mirostat = input.mirostat.unwrap_or(0);
let mirostat_tau = input.mirostat_tau.unwrap_or(5.0) as f32;
let mirostat_eta = input.mirostat_eta.unwrap_or(0.1) as f32;
let n_vocab = unsafe { llama_n_vocab(self.ctx) };

@@ -140,3 +118,5 @@ let logits_ptr = unsafe { llama_get_logits(self.ctx) };

// TODO: apply logit bias
for i in logit_bias.iter() {
logits[i.token as usize] += i.bias as f32;
}

@@ -205,4 +185,29 @@ let mut candidates: Vec<llama_token_data> = Vec::with_capacity(n_vocab as usize);

id = unsafe { llama_sample_token_greedy(self.ctx, candidates_p) };
} else if mirostat == 1 {
let mut mirostat_mu = 2.0_f32 * mirostat_tau;
let mirostat_m = 100;
unsafe { llama_sample_temperature(self.ctx, candidates_p, temp) };
id = unsafe {
llama_sample_token_mirostat(
self.ctx,
candidates_p,
mirostat_tau,
mirostat_eta,
mirostat_m,
&mut mirostat_mu,
)
}
} else if mirostat == 2 {
let mut mirostat_mu = 2.0_f32 * mirostat_tau;
unsafe { llama_sample_temperature(self.ctx, candidates_p, temp) };
id = unsafe {
llama_sample_token_mirostat_v2(
self.ctx,
candidates_p,
mirostat_tau,
mirostat_eta,
&mut mirostat_mu,
)
}
} else {
// TODO: here we just do temp for first approach, I don't understand mirostat very well, will impl later
id = unsafe {

@@ -251,3 +256,3 @@ llama_sample_top_k(self.ctx, candidates_p, top_k, 1);

n_past: i32,
input: &LlamaInvocation,
input: &Generate,
) -> Result<(), napi::Error> {

@@ -254,0 +259,0 @@ let res =

@@ -24,3 +24,3 @@ #![deny(clippy::all)]

use tokio::sync::Mutex;
use types::{InferenceResult, InferenceResultType, LlamaContextParams, LlamaInvocation};
use types::{InferenceResult, InferenceResultType, Generate, ModelLoad};

@@ -36,12 +36,12 @@ #[napi]

pub async fn load(
path: String,
params: Option<LlamaContextParams>,
#[napi(ts_arg_type = "Partial<LoadModel>")] params: serde_json::Value,
enable_logger: bool,
) -> Result<LLama> {
let params = serde_json::from_value::<ModelLoad>(params).unwrap();
let logger = LLamaLogger::get_singleton();
logger.set_enabled(enable_logger);
Ok(Self {
llama: LLamaInternal::load(path, params, enable_logger).await?,
llama: LLamaInternal::load(params, enable_logger).await?,
})

@@ -51,3 +51,3 @@ }

#[napi]
pub async fn get_word_embedding(&self, params: LlamaInvocation) -> Result<Vec<f64>> {
pub async fn get_word_embedding(&self, params: Generate) -> Result<Vec<f64>> {
let llama = self.llama.lock().await;

@@ -67,3 +67,3 @@ llama.embedding(&params).await

env: Env,
params: LlamaInvocation,
params: Generate,
#[napi(ts_arg_type = "(result: InferenceResult) => void")] callback: JsFunction,

@@ -70,0 +70,0 @@ ) -> Result<JsFunction> {

@@ -9,5 +9,3 @@ use std::sync::Arc;

tokenizer::{llama_token_eos, tokenize},
types::{
InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation,
},
types::{InferenceResult, InferenceResultType, InferenceToken, Generate, ModelLoad},
};

@@ -17,3 +15,3 @@

context: LLamaContext,
context_params: Option<LlamaContextParams>,
context_params: ModelLoad,
}

@@ -23,8 +21,7 @@

pub async fn load(
path: String,
params: Option<LlamaContextParams>,
params: ModelLoad,
enable_logger: bool,
) -> Result<Arc<Mutex<Self>>, napi::Error> {
) -> Result<Arc<Mutex<LLamaInternal>>, napi::Error> {
let llama = LLamaInternal {
context: LLamaContext::from_file_and_params(&path, &params).await?,
context: LLamaContext::from_file_and_params(&params).await?,
context_params: params,

@@ -46,3 +43,3 @@ };

pub async fn embedding(&self, input: &LlamaInvocation) -> Result<Vec<f64>, napi::Error> {
pub async fn embedding(&self, input: &Generate) -> Result<Vec<f64>, napi::Error> {
let context = &self.context;

@@ -70,3 +67,3 @@ let embd_inp = tokenize(context, input.prompt.as_str(), true);

&self,
input: &LlamaInvocation,
input: &Generate,
running: Arc<Mutex<bool>>,

@@ -76,3 +73,3 @@ callback: impl Fn(InferenceResult),

let context = &self.context;
let context_params_c = LlamaContextParams::or_default(&self.context_params);
let context_params_c = ModelLoad::to_llama_context_params(&self.context_params);
// Tokenize the stop sequence and input prompt.

@@ -79,0 +76,0 @@ let tokenized_stop_prompt = input

@@ -0,2 +1,4 @@

use llama_sys::llama_context_params;
use napi::bindgen_prelude::*;
use serde::{Deserialize, Serialize};

@@ -26,16 +28,87 @@ #[napi(object)]

#[derive(Debug, Clone)]
pub struct LlamaInvocation {
pub struct LogitBias {
pub token: i32,
pub bias: f64,
}
#[napi(object)]
#[derive(Debug, Clone)]
pub struct Generate {
pub n_threads: i32,
pub n_tok_predict: i32,
pub top_k: i32, // 40
pub top_p: Option<f64>, // default 0.95f, 1.0 = disabled
pub tfs_z: Option<f64>, // default 1.00f, 1.0 = disabled
pub temp: Option<f64>, // default 0.80f, 1.0 = disabled
pub typical_p: Option<f64>, // default 1.00f, 1.0 = disabled
pub repeat_penalty: Option<f64>, // default 1.10f, 1.0 = disabled
pub repeat_last_n: Option<i32>, // default 64, last n tokens to penalize (0 = disable penalty, -1 = context size)
pub frequency_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
pub presence_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
/// logit bias for specific tokens
/// Default: None
pub logit_bias: Option<Vec<LogitBias>>,
/// top k tokens to sample from
/// Range: <= 0 to use vocab size
/// Default: 40
pub top_k: Option<i32>,
/// top p tokens to sample from
/// Default: 0.95
/// 1.0 = disabled
pub top_p: Option<f64>,
/// tail free sampling
/// Default: 1.0
/// 1.0 = disabled
pub tfs_z: Option<f64>,
/// temperature
/// Default: 0.80
/// 1.0 = disabled
pub temp: Option<f64>,
/// locally typical sampling
/// Default: 1.0
/// 1.0 = disabled
pub typical_p: Option<f64>,
/// repeat penalty
/// Default: 1.10
/// 1.0 = disabled
pub repeat_penalty: Option<f64>,
/// last n tokens to penalize
/// Default: 64
/// 0 = disable penalty, -1 = context size
pub repeat_last_n: Option<i32>,
/// frequency penalty
/// Default: 0.00
/// 1.0 = disabled
pub frequency_penalty: Option<f64>,
/// presence penalty
/// Default: 0.00
/// 1.0 = disabled
pub presence_penalty: Option<f64>,
/// Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// Mirostat: A Neural Text Decoding Algorithm that Directly Controls Perplexity
/// Default: 0
/// 0 = disabled
/// 1 = mirostat 1.0
/// 2 = mirostat 2.0
pub mirostat: Option<i32>,
/// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// Default: 5.0
pub mirostat_tau: Option<f64>,
/// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// Default: 0.1
pub mirostat_eta: Option<f64>,
/// stop sequence
/// Default: None
pub stop_sequence: Option<String>,
/// consider newlines as a repeatable token
/// Default: true
pub penalize_nl: Option<bool>,
/// prompt
pub prompt: String,

@@ -46,6 +119,7 @@ }

#[napi(object)]
#[derive(Debug, Clone)]
pub struct LlamaContextParams {
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default, rename_all = "camelCase")]
pub struct ModelLoad {
pub model_path: String,
pub n_ctx: i32,
pub n_parts: i32,
pub n_gpu_layers: i32,

@@ -62,4 +136,48 @@ pub seed: i32,

impl Default for ModelLoad {
// Defaults used by `#[serde(default)]` when a field is omitted from the
// JSON object passed from JavaScript. NOTE(review): the numeric/bool values
// presumably mirror llama.cpp's own defaults — confirm against llama-sys.
fn default() -> Self {
Self {
// Empty path is a placeholder; loading will fail unless the caller
// provides a real modelPath.
model_path: "".to_string(),
n_ctx: 2048,
n_gpu_layers: 0,
seed: 0,
f16_kv: true,
logits_all: false,
vocab_only: false,
use_mlock: false,
embedding: false,
use_mmap: true,
// No LoRA adapter applied unless explicitly requested.
lora: None,
}
}
}
impl ModelLoad {
// Converts a borrowed `ModelLoad` into the raw llama.cpp FFI params struct.
// Thin wrapper over the `From<ModelLoad>` impl; `clone()` is needed because
// the conversion consumes its input.
pub fn to_llama_context_params(params: &ModelLoad) -> llama_context_params {
params.clone().into()
}
}
impl From<ModelLoad> for llama_context_params {
// Field-by-field mapping from the deserialized JS options object onto the
// raw llama.cpp FFI struct. NOTE(review): `n_parts` is absent here —
// presumably removed from the llama.cpp API in this version; confirm
// against the pinned llama-sys bindings.
fn from(params: ModelLoad) -> Self {
llama_context_params {
n_ctx: params.n_ctx,
n_gpu_layers: params.n_gpu_layers,
seed: params.seed,
f16_kv: params.f16_kv,
logits_all: params.logits_all,
vocab_only: params.vocab_only,
use_mmap: params.use_mmap,
use_mlock: params.use_mlock,
embedding: params.embedding,
// Load-progress reporting is not surfaced to the JS side, so the
// callback and its user-data pointer are left unset.
progress_callback: None,
progress_callback_user_data: std::ptr::null_mut(),
}
}
}
#[napi(object)]
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(default, rename_all = "camelCase")]
pub struct LlamaLoraAdaptor {

@@ -66,0 +184,0 @@ pub lora_adapter: String,

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet