@llama-node/llama-cpp - npm Package Compare versions

+31

-27

example/embedding.ts

		import { LLama, LlamaContextParams, LlamaInvocation } from "../index";
		import path from "path";

		const llama = LLama.load(
		path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
		{
		nCtx: 512,
		nParts: -1,
		seed: 0,
		f16Kv: false,
		logitsAll: false,
		vocabOnly: false,
		useMlock: false,
		embedding: true,
		useMmap: true,
		},
		false
		);
		const run = async () => {
		const llama = await LLama.load(
		path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
		{
		nCtx: 512,
		nParts: -1,
		seed: 0,
		f16Kv: false,
		logitsAll: false,
		vocabOnly: false,
		useMlock: false,
		embedding: true,
		useMmap: true,
		},
		false
		);

		const prompt = `Who is the president of the United States?`;
		const prompt = `Who is the president of the United States?`;

		const params: LlamaInvocation = {
		nThreads: 4,
		nTokPredict: 2048,
		topK: 40,
		topP: 0.1,
		temp: 0.2,
		repeatPenalty: 1,
		prompt,
		const params: LlamaInvocation = {
		nThreads: 4,
		nTokPredict: 2048,
		topK: 40,
		topP: 0.1,
		temp: 0.2,
		repeatPenalty: 1,
		prompt,
		};

		llama.getWordEmbedding(params).then((data) => {
		console.log(data);
		});
		};

		llama.getWordEmbedding(params, (data) => {
		console.log(data.data);
		});
		run();

+22

-18

example/load.ts

		import { LLama, LlamaInvocation } from "../index";
		import path from "path";

		const llama = LLama.load(
		path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
		null,
		true
		);
		const run = async () => {
		const llama = await LLama.load(
		path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
		null,
		true
		);

		const template = `Who is the president of the United States?`;
		const template = `Who is the president of the United States?`;

		const prompt = `A chat between a user and an assistant.
		const prompt = `A chat between a user and an assistant.
		USER: ${template}
		ASSISTANT:`;

		const params: LlamaInvocation = {
		nThreads: 4,
		nTokPredict: 2048,
		topK: 40,
		topP: 0.1,
		temp: 0.2,
		repeatPenalty: 1,
		prompt,
		const params: LlamaInvocation = {
		nThreads: 4,
		nTokPredict: 2048,
		topK: 40,
		topP: 0.1,
		temp: 0.2,
		repeatPenalty: 1,
		prompt,
		};

		llama.inference(params, (data) => {
		process.stdout.write(data.data?.token ?? "");
		});
		};

		llama.inference(params, (data) => {
		process.stdout.write(data.data?.token ?? "");
		});
		run();

+13

-9

example/tokenize.ts

		import { LLama } from "../index";
		import path from "path";

		const llama = LLama.load(
		path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
		null,
		false
		);
		const run = async () => {
		const llama = await LLama.load(
		path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
		null,
		false
		);

		const template = `Who is the president of the United States?`;
		const template = `Who is the president of the United States?`;

		llama.tokenize(template, 2048, (data) => {
		console.log(data.data);
		});
		llama.tokenize(template, 2048).then((data) => {
		console.log(data);
		});
		};

		run();

+18

-34

index.d.ts

		@@ -6,2 +6,16 @@ /* tslint:disable */

		export interface InferenceToken {
		token: string
		completed: boolean
		}
		export const enum InferenceResultType {
		Error = 'Error',
		Data = 'Data',
		End = 'End'
		}
		export interface InferenceResult {
		type: InferenceResultType
		data?: InferenceToken
		message?: string
		}
		export interface LlamaInvocation {
		@@ -34,37 +48,7 @@ nThreads: number
		}
		export const enum TokenizeResultType {
		Error = 'Error',
		Data = 'Data'
		}
		export interface TokenizeResult {
		type: TokenizeResultType
		data: Array<number>
		}
		export interface InferenceToken {
		token: string
		completed: boolean
		}
		export const enum InferenceResultType {
		Error = 'Error',
		Data = 'Data',
		End = 'End'
		}
		export interface InferenceResult {
		type: InferenceResultType
		data?: InferenceToken
		message?: string
		}
		export const enum EmbeddingResultType {
		Error = 'Error',
		Data = 'Data'
		}
		export interface EmbeddingResult {
		type: EmbeddingResultType
		data: Array<number>
		}
		export class LLama {
		static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): LLama
		getWordEmbedding(input: LlamaInvocation, callback: (result: EmbeddingResult) => void): void
		tokenize(params: string, nCtx: number, callback: (result: TokenizeResult) => void): void
		inference(input: LlamaInvocation, callback: (result: InferenceResult) => void): void
		static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): Promise<LLama>
		getWordEmbedding(params: LlamaInvocation): Promise<Array<number>>
		tokenize(params: string, nCtx: number): Promise<Array<number>>
		inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): void
		}

+1

-3

index.js

		@@ -255,7 +255,5 @@ /* tslint:disable */

		const { TokenizeResultType, InferenceResultType, EmbeddingResultType, LLama } = nativeBinding
		const { InferenceResultType, LLama } = nativeBinding

		module.exports.TokenizeResultType = TokenizeResultType
		module.exports.InferenceResultType = InferenceResultType
		module.exports.EmbeddingResultType = EmbeddingResultType
		module.exports.LLama = LLama

+1

-1

package.json

		{
		"name": "@llama-node/llama-cpp",
		"version": "0.0.37",
		"version": "0.1.0",
		"main": "index.js",
		@@ -5,0 +5,0 @@ "types": "index.d.ts",

+3

-34

src/context.rs

 @@ -14,36 +14,4 @@ use std::{ffi::CStr, ptr::null_mut, slice};
 #[napi(object)]
 #[derive(Debug, Clone)]
 pub struct LlamaInvocation {
     pub n_threads: i32,
     pub n_tok_predict: i32,
     pub top_k: i32,                     // 40
     pub top_p: Option<f64>,             // default 0.95f, 1.0 = disabled
     pub tfs_z: Option<f64>,             // default 1.00f, 1.0 = disabled
     pub temp: Option<f64>,              // default 0.80f, 1.0 = disabled
     pub typical_p: Option<f64>,         // default 1.00f, 1.0 = disabled
     pub repeat_penalty: Option<f64>,    // default 1.10f, 1.0 = disabled
     pub repeat_last_n: Option<i32>, // default 64, last n tokens to penalize (0 = disable penalty, -1 = context size)
     pub frequency_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
     pub presence_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
     pub stop_sequence: Option<String>,
     pub penalize_nl: Option<bool>,
     pub prompt: String,
 use crate::types::{LlamaContextParams, LlamaInvocation};
 // Represents the configuration parameters for a LLamaContext.
 #[napi(object)]
 #[derive(Debug, Clone)]
 pub struct LlamaContextParams {
     pub n_ctx: i32,
     pub n_parts: i32,
     pub seed: i32,
     pub f16_kv: bool,
     pub logits_all: bool,
     pub vocab_only: bool,
     pub use_mlock: bool,
     pub embedding: bool,
     pub use_mmap: bool,
 impl LlamaContextParams {
 @@ -78,2 +46,3 @@ // Returns the default parameters or the user-specified parameters.
 // Represents the LLamaContext which wraps FFI calls to the llama.cpp library.
 #[derive(Clone)]
 pub struct LLamaContext {
 @@ -85,3 +54,3 @@ ctx: *mut llama_context,
     // Creates a new LLamaContext from the specified file and configuration parameters.
     pub fn from_file_and_params(path: &str, params: &Option<LlamaContextParams>) -> Self {
     pub async fn from_file_and_params(path: &str, params: &Option<LlamaContextParams>) -> Self {
         let params = LlamaContextParams::or_default(params);
 @@ -88,0 +57,0 @@ let ctx = unsafe { llama_init_from_file(path.as_ptr() as *const i8, params) };

+28

-110

src/lib.rs

		@@ -11,9 +11,5 @@ #![deny(clippy::all)]

		use std::{
		sync::{mpsc::channel, Arc},
		thread, time,
		};
		use std::sync::Arc;

		use context::{LlamaContextParams, LlamaInvocation};
		use llama::LLamaChannel;
		use llama::LLamaInternal;
		use napi::{
		@@ -26,7 +22,8 @@ bindgen_prelude::*,
		};
		use types::{EmbeddingResult, InferenceResult, TokenizeResult};
		use tokio::sync::Mutex;
		use types::{InferenceResult, LlamaContextParams, LlamaInvocation};

		#[napi]
		pub struct LLama {
		llama_channel: Arc<LLamaChannel>,
		llama: Arc<Mutex<LLamaInternal>>,
		}
		@@ -37,3 +34,3 @@
		#[napi]
		pub fn load(
		pub async fn load(
		path: String,
		@@ -50,89 +47,17 @@ params: Option<LlamaContextParams>,

		let (load_result_sender, load_result_receiver) = channel::<bool>();
		let llama_channel = LLamaChannel::new(path, params, load_result_sender, enable_logger);
		'waiting_load: loop {
		let recv = load_result_receiver.recv();
		match recv {
		Ok(r) => {
		if !r {
		return Err(Error::new(Status::InvalidArg, "Load error".to_string()));
		}
		break 'waiting_load;
		}
		_ => {
		thread::yield_now();
		}
		}
		}
		Ok(Self { llama_channel })
		Ok(Self {
		llama: LLamaInternal::load(path, params, enable_logger).await,
		})
		}

		#[napi]
		pub fn get_word_embedding(
		&self,
		input: LlamaInvocation,
		#[napi(ts_arg_type = "(result: EmbeddingResult) => void")] callback: JsFunction,
		) -> Result<()> {
		let tsfn: ThreadsafeFunction<EmbeddingResult, ErrorStrategy::Fatal> =
		callback.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))?;
		let (embeddings_sender, embeddings_receiver) = channel();
		let llama_channel = self.llama_channel.clone();

		llama_channel.embedding(input, embeddings_sender);

		thread::spawn(move || {
		loop {
		let result = embeddings_receiver.recv();
		match result {
		Ok(result) => {
		tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
		}
		Err(_) => {
		break;
		}
		}
		}
		thread::sleep(time::Duration::from_millis(300)); // wait for end signal
		tsfn.abort().unwrap();
		});

		Ok(())
		pub async fn get_word_embedding(&self, params: LlamaInvocation) -> Result<Vec<f64>> {
		let llama = self.llama.lock().await;
		llama.embedding(&params).await
		}

		#[napi]
		pub fn tokenize(
		&self,
		params: String,
		n_ctx: i32,
		#[napi(ts_arg_type = "(result: TokenizeResult) => void")] callback: JsFunction,
		) -> Result<()> {
		let (tokenize_sender, tokenize_receiver) = channel::<TokenizeResult>();

		let tsfn: ThreadsafeFunction<TokenizeResult, ErrorStrategy::Fatal> = callback
		.create_threadsafe_function(0, |ctx: ThreadSafeCallContext<TokenizeResult>| {
		Ok(vec![ctx.value])
		})?;

		let llama_channel = self.llama_channel.clone();

		llama_channel.tokenize(params, n_ctx as usize, tokenize_sender);

		thread::spawn(move || {
		'waiting_tokenize: loop {
		let recv = tokenize_receiver.recv();
		match recv {
		Ok(callback) => {
		tsfn.call(callback, ThreadsafeFunctionCallMode::Blocking);
		break 'waiting_tokenize;
		}
		_ => {
		thread::yield_now();
		}
		}
		}
		thread::sleep(time::Duration::from_millis(300)); // wait for end signal
		tsfn.abort().unwrap();
		});

		Ok(())
		pub async fn tokenize(&self, params: String, n_ctx: i32) -> Result<Vec<i32>> {
		let llama = self.llama.lock().await;
		llama.tokenize(&params, n_ctx as usize).await
		}
		@@ -143,26 +68,19 @@
		&self,
		input: LlamaInvocation,
		params: LlamaInvocation,
		#[napi(ts_arg_type = "(result: InferenceResult) => void")] callback: JsFunction,
		) -> Result<()> {
		let tsfn: ThreadsafeFunction<InferenceResult, ErrorStrategy::Fatal> =
		callback.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))?;
		let (inference_sender, inference_receiver) = channel();
		let llama_channel = self.llama_channel.clone();
		let tsfn: ThreadsafeFunction<InferenceResult, ErrorStrategy::Fatal> = callback
		.create_threadsafe_function(0, |ctx: ThreadSafeCallContext<InferenceResult>| {
		Ok(vec![ctx.value])
		})?;

		llama_channel.inference(input, inference_sender);
		let llama = self.llama.clone();

		thread::spawn(move || {
		loop {
		let result = inference_receiver.recv();
		match result {
		Ok(result) => {
		tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
		}
		Err(_) => {
		break;
		}
		}
		}
		thread::sleep(time::Duration::from_millis(300)); // wait for end signal
		tsfn.abort().unwrap();
		tokio::spawn(async move {
		let llama = llama.lock().await;
		llama
		.inference(&params, |result| {
		tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
		})
		.await;
		});
		@@ -169,0 +87,0 @@

+53

-166

src/llama.rs

		@@ -1,24 +0,12 @@
		use std::{
		sync::{
		mpsc::{channel, Receiver, Sender, TryRecvError},
		Arc, Mutex,
		},
		thread,
		};
		use std::sync::Arc;

		use tokio::sync::Mutex;

		use crate::{
		context::{LLamaContext, LlamaContextParams, LlamaInvocation},
		context::{LLamaContext},
		tokenizer::{llama_token_eos, tokenize},
		types::{
		EmbeddingResult, EmbeddingResultType, InferenceResult, InferenceResultType, InferenceToken,
		LLamaCommand, TokenizeResult, TokenizeResultType,
		},
		types::{InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation},
		};

		#[derive(Clone)]
		pub struct LLamaChannel {
		command_sender: Sender<LLamaCommand>,
		command_receiver: Arc<Mutex<Receiver<LLamaCommand>>>,
		}

		pub struct LLamaInternal {
		@@ -30,21 +18,27 @@ context: LLamaContext,
		impl LLamaInternal {
		pub fn tokenize(&self, input: &str, n_ctx: usize, sender: &Sender<TokenizeResult>) {
		pub async fn load(
		path: String,
		params: Option<LlamaContextParams>,
		enable_logger: bool,
		) -> Arc<Mutex<Self>> {
		let llama = LLamaInternal {
		context: LLamaContext::from_file_and_params(&path, &params).await,
		context_params: params,
		};

		if enable_logger {
		llama.context.llama_print_system_info();
		}

		Arc::new(Mutex::new(llama))
		}
		pub async fn tokenize(&self, input: &str, n_ctx: usize) -> Result<Vec<i32>, napi::Error> {
		if let Ok(data) = tokenize(&self.context, input, n_ctx, false) {
		sender
		.send(TokenizeResult {
		data,
		r#type: TokenizeResultType::Data,
		})
		.unwrap();
		Ok(data)
		} else {
		sender
		.send(TokenizeResult {
		data: vec![],
		r#type: TokenizeResultType::Error,
		})
		.unwrap();
		Err(napi::Error::from_reason("Failed to tokenize"))
		}
		}

		pub fn embedding(&self, input: &LlamaInvocation, sender: &Sender<EmbeddingResult>) {
		pub async fn embedding(&self, input: &LlamaInvocation) -> Result<Vec<f64>, napi::Error> {
		let context_params_c = LlamaContextParams::or_default(&self.context_params);
		@@ -71,19 +65,9 @@ let input_ctx = &self.context;
		if let Ok(embeddings) = embeddings {
		sender
		.send(EmbeddingResult {
		r#type: EmbeddingResultType::Data,
		data: embeddings.iter().map(|&x| x as f64).collect(),
		})
		.unwrap();
		Ok(embeddings.iter().map(|&x| x as f64).collect())
		} else {
		sender
		.send(EmbeddingResult {
		r#type: EmbeddingResultType::Error,
		data: vec![],
		})
		.unwrap();
		Err(napi::Error::from_reason("Failed to get embeddings"))
		}
		}

		pub fn inference(&self, input: &LlamaInvocation, sender: &Sender<InferenceResult>) {
		pub async fn inference(&self, input: &LlamaInvocation, callback: impl Fn(InferenceResult)) {
		let context_params_c = LlamaContextParams::or_default(&self.context_params);
		@@ -144,9 +128,7 @@ let input_ctx = &self.context;
		{
		sender
		.send(InferenceResult {
		r#type: InferenceResultType::Error,
		data: None,
		message: Some("Too many tokens predicted".to_string()),
		})
		.unwrap();
		callback(InferenceResult {
		r#type: InferenceResultType::Error,
		data: None,
		message: Some("Too many tokens predicted".to_string()),
		});
		break;
		@@ -175,12 +157,10 @@ }
		if stop_sequence_i == 0 {
		sender
		.send(InferenceResult {
		r#type: InferenceResultType::Data,
		data: Some(InferenceToken {
		token: output,
		completed: false,
		}),
		message: None,
		})
		.unwrap();
		callback(InferenceResult {
		r#type: InferenceResultType::Data,
		data: Some(InferenceToken {
		token: output,
		completed: false,
		}),
		message: None,
		});
		}
		@@ -191,111 +171,18 @@ }
		if completed {
		sender
		.send(InferenceResult {
		r#type: InferenceResultType::Data,
		data: Some(InferenceToken {
		token: "\n\n<end>\n".to_string(),
		completed: true,
		}),
		message: None,
		})
		.unwrap();
		callback(InferenceResult {
		r#type: InferenceResultType::Data,
		data: Some(InferenceToken {
		token: "\n\n<end>\n".to_string(),
		completed: true,
		}),
		message: None,
		});
		}

		sender
		.send(InferenceResult {
		r#type: InferenceResultType::End,
		data: None,
		message: None,
		})
		.unwrap();
		// embedding_to_output(
		// input_ctx,
		// &embd[tokenized_input.len()..n_used + 1 - stop_sequence_i],
		// );
		}
		}

		impl LLamaChannel {
		pub fn new(
		path: String,
		params: Option<LlamaContextParams>,
		load_result_sender: Sender<bool>,
		enable_logger: bool,
		) -> Arc<Self> {
		let (command_sender, command_receiver) = channel::<LLamaCommand>();

		let channel = LLamaChannel {
		command_receiver: Arc::new(Mutex::new(command_receiver)),
		command_sender,
		};

		channel.spawn(path, params, load_result_sender, enable_logger);

		Arc::new(channel)
		}

		pub fn tokenize(&self, input: String, n_ctx: usize, sender: Sender<TokenizeResult>) {
		self.command_sender
		.send(LLamaCommand::Tokenize(input, n_ctx, sender))
		.unwrap();
		}

		pub fn embedding(&self, params: LlamaInvocation, sender: Sender<EmbeddingResult>) {
		self.command_sender
		.send(LLamaCommand::Embedding(params, sender))
		.unwrap();
		}

		pub fn inference(&self, params: LlamaInvocation, sender: Sender<InferenceResult>) {
		self.command_sender
		.send(LLamaCommand::Inference(params, sender))
		.unwrap();
		}

		// llama instance main loop
		pub fn spawn(
		&self,
		path: String,
		params: Option<LlamaContextParams>,
		load_result_sender: Sender<bool>,
		enable_logger: bool,
		) {
		let rv = self.command_receiver.clone();

		thread::spawn(move || {
		let llama = LLamaInternal {
		context: LLamaContext::from_file_and_params(&path, &params),
		context_params: params,
		};

		if enable_logger {
		llama.context.llama_print_system_info();
		}

		load_result_sender.send(true).unwrap();

		let rv = rv.lock().unwrap();

		'llama_loop: loop {
		let command = rv.try_recv();
		match command {
		Ok(LLamaCommand::Inference(params, sender)) => {
		llama.inference(&params, &sender);
		}
		Ok(LLamaCommand::Embedding(params, sender)) => {
		llama.embedding(&params, &sender);
		}
		Ok(LLamaCommand::Tokenize(text, n_ctx, sender)) => {
		llama.tokenize(&text, n_ctx, &sender);
		}
		Err(TryRecvError::Disconnected) => {
		break 'llama_loop;
		}
		_ => {
		thread::yield_now();
		}
		}
		}
		callback(InferenceResult {
		r#type: InferenceResultType::End,
		data: None,
		message: None,
		});
		}
		}

+29

-28

src/types.rs

		@@ -1,25 +0,4 @@
		use crate::context::LlamaInvocation;
		use napi::bindgen_prelude::*;
		use std::sync::mpsc::Sender;

		#[derive(Clone, Debug)]
		pub enum LLamaCommand {
		Inference(LlamaInvocation, Sender<InferenceResult>),
		Tokenize(String, usize, Sender<TokenizeResult>),
		Embedding(LlamaInvocation, Sender<EmbeddingResult>),
		}

		#[napi(string_enum)]
		pub enum TokenizeResultType {
		Error,
		Data,
		}

		#[napi(object)]
		pub struct TokenizeResult {
		pub r#type: TokenizeResultType,
		pub data: Vec<i32>,
		}

		#[napi(object)]
		#[derive(Clone, Debug)]
		@@ -45,12 +24,34 @@ pub struct InferenceToken {

		#[napi(string_enum)]
		pub enum EmbeddingResultType {
		Error,
		Data,
		#[napi(object)]
		#[derive(Debug, Clone)]
		pub struct LlamaInvocation {
		pub n_threads: i32,
		pub n_tok_predict: i32,
		pub top_k: i32, // 40
		pub top_p: Option<f64>, // default 0.95f, 1.0 = disabled
		pub tfs_z: Option<f64>, // default 1.00f, 1.0 = disabled
		pub temp: Option<f64>, // default 0.80f, 1.0 = disabled
		pub typical_p: Option<f64>, // default 1.00f, 1.0 = disabled
		pub repeat_penalty: Option<f64>, // default 1.10f, 1.0 = disabled
		pub repeat_last_n: Option<i32>, // default 64, last n tokens to penalize (0 = disable penalty, -1 = context size)
		pub frequency_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
		pub presence_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
		pub stop_sequence: Option<String>,
		pub penalize_nl: Option<bool>,
		pub prompt: String,
		}

		// Represents the configuration parameters for a LLamaContext.
		#[napi(object)]
		pub struct EmbeddingResult {
		pub r#type: EmbeddingResultType,
		pub data: Vec<f64>,
		#[derive(Debug, Clone)]
		pub struct LlamaContextParams {
		pub n_ctx: i32,
		pub n_parts: i32,
		pub seed: i32,
		pub f16_kv: bool,
		pub logits_all: bool,
		pub vocab_only: bool,
		pub use_mlock: bool,
		pub embedding: bool,
		pub use_mmap: bool,
		}

@llama-node/llama-cpp.darwin-arm64.node