New Research: Supply Chain Attack on Axios Pulls Malicious Dependency from npm.Details →
Socket
Book a DemoSign in
Socket

@llama-node/llama-cpp

Package Overview
Dependencies
Maintainers
1
Versions
25
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@llama-node/llama-cpp - npm Package Compare versions

Comparing version
0.0.37
to
0.1.0
+31
-27
example/embedding.ts
import { LLama, LlamaContextParams, LlamaInvocation } from "../index";
import path from "path";
const llama = LLama.load(
path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
{
nCtx: 512,
nParts: -1,
seed: 0,
f16Kv: false,
logitsAll: false,
vocabOnly: false,
useMlock: false,
embedding: true,
useMmap: true,
},
false
);
const run = async () => {
const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
{
nCtx: 512,
nParts: -1,
seed: 0,
f16Kv: false,
logitsAll: false,
vocabOnly: false,
useMlock: false,
embedding: true,
useMmap: true,
},
false
);
const prompt = `Who is the president of the United States?`;
const prompt = `Who is the president of the United States?`;
const params: LlamaInvocation = {
nThreads: 4,
nTokPredict: 2048,
topK: 40,
topP: 0.1,
temp: 0.2,
repeatPenalty: 1,
prompt,
const params: LlamaInvocation = {
nThreads: 4,
nTokPredict: 2048,
topK: 40,
topP: 0.1,
temp: 0.2,
repeatPenalty: 1,
prompt,
};
llama.getWordEmbedding(params).then((data) => {
console.log(data);
});
};
llama.getWordEmbedding(params, (data) => {
console.log(data.data);
});
run();
import { LLama, LlamaInvocation } from "../index";
import path from "path";
const llama = LLama.load(
path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
null,
true
);
const run = async () => {
const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
null,
true
);
const template = `Who is the president of the United States?`;
const template = `Who is the president of the United States?`;
const prompt = `A chat between a user and an assistant.
const prompt = `A chat between a user and an assistant.
USER: ${template}
ASSISTANT:`;
const params: LlamaInvocation = {
nThreads: 4,
nTokPredict: 2048,
topK: 40,
topP: 0.1,
temp: 0.2,
repeatPenalty: 1,
prompt,
const params: LlamaInvocation = {
nThreads: 4,
nTokPredict: 2048,
topK: 40,
topP: 0.1,
temp: 0.2,
repeatPenalty: 1,
prompt,
};
llama.inference(params, (data) => {
process.stdout.write(data.data?.token ?? "");
});
};
llama.inference(params, (data) => {
process.stdout.write(data.data?.token ?? "");
});
run();
import { LLama } from "../index";
import path from "path";
const llama = LLama.load(
path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
null,
false
);
const run = async () => {
const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vicuna-7b-1.1-q4_1.bin"),
null,
false
);
const template = `Who is the president of the United States?`;
const template = `Who is the president of the United States?`;
llama.tokenize(template, 2048, (data) => {
console.log(data.data);
});
llama.tokenize(template, 2048).then((data) => {
console.log(data);
});
};
run();

@@ -6,2 +6,16 @@ /* tslint:disable */

export interface InferenceToken {
token: string
completed: boolean
}
export const enum InferenceResultType {
Error = 'Error',
Data = 'Data',
End = 'End'
}
export interface InferenceResult {
type: InferenceResultType
data?: InferenceToken
message?: string
}
export interface LlamaInvocation {

@@ -34,37 +48,7 @@ nThreads: number

}
export const enum TokenizeResultType {
Error = 'Error',
Data = 'Data'
}
export interface TokenizeResult {
type: TokenizeResultType
data: Array<number>
}
export interface InferenceToken {
token: string
completed: boolean
}
export const enum InferenceResultType {
Error = 'Error',
Data = 'Data',
End = 'End'
}
export interface InferenceResult {
type: InferenceResultType
data?: InferenceToken
message?: string
}
export const enum EmbeddingResultType {
Error = 'Error',
Data = 'Data'
}
export interface EmbeddingResult {
type: EmbeddingResultType
data: Array<number>
}
export class LLama {
static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): LLama
getWordEmbedding(input: LlamaInvocation, callback: (result: EmbeddingResult) => void): void
tokenize(params: string, nCtx: number, callback: (result: TokenizeResult) => void): void
inference(input: LlamaInvocation, callback: (result: InferenceResult) => void): void
static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): Promise<LLama>
getWordEmbedding(params: LlamaInvocation): Promise<Array<number>>
tokenize(params: string, nCtx: number): Promise<Array<number>>
inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): void
}

@@ -255,7 +255,5 @@ /* tslint:disable */

const { TokenizeResultType, InferenceResultType, EmbeddingResultType, LLama } = nativeBinding
const { InferenceResultType, LLama } = nativeBinding
module.exports.TokenizeResultType = TokenizeResultType
module.exports.InferenceResultType = InferenceResultType
module.exports.EmbeddingResultType = EmbeddingResultType
module.exports.LLama = LLama
{
"name": "@llama-node/llama-cpp",
"version": "0.0.37",
"version": "0.1.0",
"main": "index.js",

@@ -5,0 +5,0 @@ "types": "index.d.ts",

@@ -14,36 +14,4 @@ use std::{ffi::CStr, ptr::null_mut, slice};

#[napi(object)]
#[derive(Debug, Clone)]
pub struct LlamaInvocation {
pub n_threads: i32,
pub n_tok_predict: i32,
pub top_k: i32, // 40
pub top_p: Option<f64>, // default 0.95f, 1.0 = disabled
pub tfs_z: Option<f64>, // default 1.00f, 1.0 = disabled
pub temp: Option<f64>, // default 0.80f, 1.0 = disabled
pub typical_p: Option<f64>, // default 1.00f, 1.0 = disabled
pub repeat_penalty: Option<f64>, // default 1.10f, 1.0 = disabled
pub repeat_last_n: Option<i32>, // default 64, last n tokens to penalize (0 = disable penalty, -1 = context size)
pub frequency_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
pub presence_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
pub stop_sequence: Option<String>,
pub penalize_nl: Option<bool>,
pub prompt: String,
}
use crate::types::{LlamaContextParams, LlamaInvocation};
// Represents the configuration parameters for a LLamaContext.
#[napi(object)]
#[derive(Debug, Clone)]
pub struct LlamaContextParams {
pub n_ctx: i32,
pub n_parts: i32,
pub seed: i32,
pub f16_kv: bool,
pub logits_all: bool,
pub vocab_only: bool,
pub use_mlock: bool,
pub embedding: bool,
pub use_mmap: bool,
}
impl LlamaContextParams {

@@ -78,2 +46,3 @@ // Returns the default parameters or the user-specified parameters.

// Represents the LLamaContext which wraps FFI calls to the llama.cpp library.
#[derive(Clone)]
pub struct LLamaContext {

@@ -85,3 +54,3 @@ ctx: *mut llama_context,

// Creates a new LLamaContext from the specified file and configuration parameters.
pub fn from_file_and_params(path: &str, params: &Option<LlamaContextParams>) -> Self {
pub async fn from_file_and_params(path: &str, params: &Option<LlamaContextParams>) -> Self {
let params = LlamaContextParams::or_default(params);

@@ -88,0 +57,0 @@ let ctx = unsafe { llama_init_from_file(path.as_ptr() as *const i8, params) };

+28
-110

@@ -11,9 +11,5 @@ #![deny(clippy::all)]

use std::{
sync::{mpsc::channel, Arc},
thread, time,
};
use std::sync::Arc;
use context::{LlamaContextParams, LlamaInvocation};
use llama::LLamaChannel;
use llama::LLamaInternal;
use napi::{

@@ -26,7 +22,8 @@ bindgen_prelude::*,

};
use types::{EmbeddingResult, InferenceResult, TokenizeResult};
use tokio::sync::Mutex;
use types::{InferenceResult, LlamaContextParams, LlamaInvocation};
#[napi]
pub struct LLama {
llama_channel: Arc<LLamaChannel>,
llama: Arc<Mutex<LLamaInternal>>,
}

@@ -37,3 +34,3 @@

#[napi]
pub fn load(
pub async fn load(
path: String,

@@ -50,89 +47,17 @@ params: Option<LlamaContextParams>,

let (load_result_sender, load_result_receiver) = channel::<bool>();
let llama_channel = LLamaChannel::new(path, params, load_result_sender, enable_logger);
'waiting_load: loop {
let recv = load_result_receiver.recv();
match recv {
Ok(r) => {
if !r {
return Err(Error::new(Status::InvalidArg, "Load error".to_string()));
}
break 'waiting_load;
}
_ => {
thread::yield_now();
}
}
}
Ok(Self { llama_channel })
Ok(Self {
llama: LLamaInternal::load(path, params, enable_logger).await,
})
}
#[napi]
pub fn get_word_embedding(
&self,
input: LlamaInvocation,
#[napi(ts_arg_type = "(result: EmbeddingResult) => void")] callback: JsFunction,
) -> Result<()> {
let tsfn: ThreadsafeFunction<EmbeddingResult, ErrorStrategy::Fatal> =
callback.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))?;
let (embeddings_sender, embeddings_receiver) = channel();
let llama_channel = self.llama_channel.clone();
llama_channel.embedding(input, embeddings_sender);
thread::spawn(move || {
loop {
let result = embeddings_receiver.recv();
match result {
Ok(result) => {
tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
}
Err(_) => {
break;
}
}
}
thread::sleep(time::Duration::from_millis(300)); // wait for end signal
tsfn.abort().unwrap();
});
Ok(())
pub async fn get_word_embedding(&self, params: LlamaInvocation) -> Result<Vec<f64>> {
let llama = self.llama.lock().await;
llama.embedding(&params).await
}
#[napi]
pub fn tokenize(
&self,
params: String,
n_ctx: i32,
#[napi(ts_arg_type = "(result: TokenizeResult) => void")] callback: JsFunction,
) -> Result<()> {
let (tokenize_sender, tokenize_receiver) = channel::<TokenizeResult>();
let tsfn: ThreadsafeFunction<TokenizeResult, ErrorStrategy::Fatal> = callback
.create_threadsafe_function(0, |ctx: ThreadSafeCallContext<TokenizeResult>| {
Ok(vec![ctx.value])
})?;
let llama_channel = self.llama_channel.clone();
llama_channel.tokenize(params, n_ctx as usize, tokenize_sender);
thread::spawn(move || {
'waiting_tokenize: loop {
let recv = tokenize_receiver.recv();
match recv {
Ok(callback) => {
tsfn.call(callback, ThreadsafeFunctionCallMode::Blocking);
break 'waiting_tokenize;
}
_ => {
thread::yield_now();
}
}
}
thread::sleep(time::Duration::from_millis(300)); // wait for end signal
tsfn.abort().unwrap();
});
Ok(())
pub async fn tokenize(&self, params: String, n_ctx: i32) -> Result<Vec<i32>> {
let llama = self.llama.lock().await;
llama.tokenize(&params, n_ctx as usize).await
}

@@ -143,26 +68,19 @@

&self,
input: LlamaInvocation,
params: LlamaInvocation,
#[napi(ts_arg_type = "(result: InferenceResult) => void")] callback: JsFunction,
) -> Result<()> {
let tsfn: ThreadsafeFunction<InferenceResult, ErrorStrategy::Fatal> =
callback.create_threadsafe_function(0, |ctx| Ok(vec![ctx.value]))?;
let (inference_sender, inference_receiver) = channel();
let llama_channel = self.llama_channel.clone();
let tsfn: ThreadsafeFunction<InferenceResult, ErrorStrategy::Fatal> = callback
.create_threadsafe_function(0, |ctx: ThreadSafeCallContext<InferenceResult>| {
Ok(vec![ctx.value])
})?;
llama_channel.inference(input, inference_sender);
let llama = self.llama.clone();
thread::spawn(move || {
loop {
let result = inference_receiver.recv();
match result {
Ok(result) => {
tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
}
Err(_) => {
break;
}
}
}
thread::sleep(time::Duration::from_millis(300)); // wait for end signal
tsfn.abort().unwrap();
tokio::spawn(async move {
let llama = llama.lock().await;
llama
.inference(&params, |result| {
tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking);
})
.await;
});

@@ -169,0 +87,0 @@

@@ -1,24 +0,12 @@

use std::{
sync::{
mpsc::{channel, Receiver, Sender, TryRecvError},
Arc, Mutex,
},
thread,
};
use std::sync::Arc;
use tokio::sync::Mutex;
use crate::{
context::{LLamaContext, LlamaContextParams, LlamaInvocation},
context::{LLamaContext},
tokenizer::{llama_token_eos, tokenize},
types::{
EmbeddingResult, EmbeddingResultType, InferenceResult, InferenceResultType, InferenceToken,
LLamaCommand, TokenizeResult, TokenizeResultType,
},
types::{InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation},
};
#[derive(Clone)]
pub struct LLamaChannel {
command_sender: Sender<LLamaCommand>,
command_receiver: Arc<Mutex<Receiver<LLamaCommand>>>,
}
pub struct LLamaInternal {

@@ -30,21 +18,27 @@ context: LLamaContext,

impl LLamaInternal {
pub fn tokenize(&self, input: &str, n_ctx: usize, sender: &Sender<TokenizeResult>) {
pub async fn load(
path: String,
params: Option<LlamaContextParams>,
enable_logger: bool,
) -> Arc<Mutex<Self>> {
let llama = LLamaInternal {
context: LLamaContext::from_file_and_params(&path, &params).await,
context_params: params,
};
if enable_logger {
llama.context.llama_print_system_info();
}
Arc::new(Mutex::new(llama))
}
pub async fn tokenize(&self, input: &str, n_ctx: usize) -> Result<Vec<i32>, napi::Error> {
if let Ok(data) = tokenize(&self.context, input, n_ctx, false) {
sender
.send(TokenizeResult {
data,
r#type: TokenizeResultType::Data,
})
.unwrap();
Ok(data)
} else {
sender
.send(TokenizeResult {
data: vec![],
r#type: TokenizeResultType::Error,
})
.unwrap();
Err(napi::Error::from_reason("Failed to tokenize"))
}
}
pub fn embedding(&self, input: &LlamaInvocation, sender: &Sender<EmbeddingResult>) {
pub async fn embedding(&self, input: &LlamaInvocation) -> Result<Vec<f64>, napi::Error> {
let context_params_c = LlamaContextParams::or_default(&self.context_params);

@@ -71,19 +65,9 @@ let input_ctx = &self.context;

if let Ok(embeddings) = embeddings {
sender
.send(EmbeddingResult {
r#type: EmbeddingResultType::Data,
data: embeddings.iter().map(|&x| x as f64).collect(),
})
.unwrap();
Ok(embeddings.iter().map(|&x| x as f64).collect())
} else {
sender
.send(EmbeddingResult {
r#type: EmbeddingResultType::Error,
data: vec![],
})
.unwrap();
Err(napi::Error::from_reason("Failed to get embeddings"))
}
}
pub fn inference(&self, input: &LlamaInvocation, sender: &Sender<InferenceResult>) {
pub async fn inference(&self, input: &LlamaInvocation, callback: impl Fn(InferenceResult)) {
let context_params_c = LlamaContextParams::or_default(&self.context_params);

@@ -144,9 +128,7 @@ let input_ctx = &self.context;

{
sender
.send(InferenceResult {
r#type: InferenceResultType::Error,
data: None,
message: Some("Too many tokens predicted".to_string()),
})
.unwrap();
callback(InferenceResult {
r#type: InferenceResultType::Error,
data: None,
message: Some("Too many tokens predicted".to_string()),
});
break;

@@ -175,12 +157,10 @@ }

if stop_sequence_i == 0 {
sender
.send(InferenceResult {
r#type: InferenceResultType::Data,
data: Some(InferenceToken {
token: output,
completed: false,
}),
message: None,
})
.unwrap();
callback(InferenceResult {
r#type: InferenceResultType::Data,
data: Some(InferenceToken {
token: output,
completed: false,
}),
message: None,
});
}

@@ -191,111 +171,18 @@ }

if completed {
sender
.send(InferenceResult {
r#type: InferenceResultType::Data,
data: Some(InferenceToken {
token: "\n\n<end>\n".to_string(),
completed: true,
}),
message: None,
})
.unwrap();
callback(InferenceResult {
r#type: InferenceResultType::Data,
data: Some(InferenceToken {
token: "\n\n<end>\n".to_string(),
completed: true,
}),
message: None,
});
}
sender
.send(InferenceResult {
r#type: InferenceResultType::End,
data: None,
message: None,
})
.unwrap();
// embedding_to_output(
// input_ctx,
// &embd[tokenized_input.len()..n_used + 1 - stop_sequence_i],
// );
}
}
impl LLamaChannel {
pub fn new(
path: String,
params: Option<LlamaContextParams>,
load_result_sender: Sender<bool>,
enable_logger: bool,
) -> Arc<Self> {
let (command_sender, command_receiver) = channel::<LLamaCommand>();
let channel = LLamaChannel {
command_receiver: Arc::new(Mutex::new(command_receiver)),
command_sender,
};
channel.spawn(path, params, load_result_sender, enable_logger);
Arc::new(channel)
}
pub fn tokenize(&self, input: String, n_ctx: usize, sender: Sender<TokenizeResult>) {
self.command_sender
.send(LLamaCommand::Tokenize(input, n_ctx, sender))
.unwrap();
}
pub fn embedding(&self, params: LlamaInvocation, sender: Sender<EmbeddingResult>) {
self.command_sender
.send(LLamaCommand::Embedding(params, sender))
.unwrap();
}
pub fn inference(&self, params: LlamaInvocation, sender: Sender<InferenceResult>) {
self.command_sender
.send(LLamaCommand::Inference(params, sender))
.unwrap();
}
// llama instance main loop
pub fn spawn(
&self,
path: String,
params: Option<LlamaContextParams>,
load_result_sender: Sender<bool>,
enable_logger: bool,
) {
let rv = self.command_receiver.clone();
thread::spawn(move || {
let llama = LLamaInternal {
context: LLamaContext::from_file_and_params(&path, &params),
context_params: params,
};
if enable_logger {
llama.context.llama_print_system_info();
}
load_result_sender.send(true).unwrap();
let rv = rv.lock().unwrap();
'llama_loop: loop {
let command = rv.try_recv();
match command {
Ok(LLamaCommand::Inference(params, sender)) => {
llama.inference(&params, &sender);
}
Ok(LLamaCommand::Embedding(params, sender)) => {
llama.embedding(&params, &sender);
}
Ok(LLamaCommand::Tokenize(text, n_ctx, sender)) => {
llama.tokenize(&text, n_ctx, &sender);
}
Err(TryRecvError::Disconnected) => {
break 'llama_loop;
}
_ => {
thread::yield_now();
}
}
}
callback(InferenceResult {
r#type: InferenceResultType::End,
data: None,
message: None,
});
}
}

@@ -1,25 +0,4 @@

use crate::context::LlamaInvocation;
use napi::bindgen_prelude::*;
use std::sync::mpsc::Sender;
#[derive(Clone, Debug)]
pub enum LLamaCommand {
Inference(LlamaInvocation, Sender<InferenceResult>),
Tokenize(String, usize, Sender<TokenizeResult>),
Embedding(LlamaInvocation, Sender<EmbeddingResult>),
}
#[napi(string_enum)]
pub enum TokenizeResultType {
Error,
Data,
}
#[napi(object)]
pub struct TokenizeResult {
pub r#type: TokenizeResultType,
pub data: Vec<i32>,
}
#[napi(object)]
#[derive(Clone, Debug)]

@@ -45,12 +24,34 @@ pub struct InferenceToken {

#[napi(string_enum)]
pub enum EmbeddingResultType {
Error,
Data,
#[napi(object)]
#[derive(Debug, Clone)]
pub struct LlamaInvocation {
pub n_threads: i32,
pub n_tok_predict: i32,
pub top_k: i32, // 40
pub top_p: Option<f64>, // default 0.95f, 1.0 = disabled
pub tfs_z: Option<f64>, // default 1.00f, 1.0 = disabled
pub temp: Option<f64>, // default 0.80f, 1.0 = disabled
pub typical_p: Option<f64>, // default 1.00f, 1.0 = disabled
pub repeat_penalty: Option<f64>, // default 1.10f, 1.0 = disabled
pub repeat_last_n: Option<i32>, // default 64, last n tokens to penalize (0 = disable penalty, -1 = context size)
pub frequency_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
pub presence_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
pub stop_sequence: Option<String>,
pub penalize_nl: Option<bool>,
pub prompt: String,
}
// Represents the configuration parameters for a LLamaContext.
#[napi(object)]
pub struct EmbeddingResult {
pub r#type: EmbeddingResultType,
pub data: Vec<f64>,
#[derive(Debug, Clone)]
pub struct LlamaContextParams {
pub n_ctx: i32,
pub n_parts: i32,
pub seed: i32,
pub f16_kv: bool,
pub logits_all: bool,
pub vocab_only: bool,
pub use_mlock: bool,
pub embedding: bool,
pub use_mmap: bool,
}

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet