New Research: Supply Chain Attack on Axios Pulls Malicious Dependency from npm. Details
Socket
Book a Demo · Sign in
Socket

@llama-node/llama-cpp

Package Overview
Dependencies
Maintainers
1
Versions
25
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@llama-node/llama-cpp - npm Package Compare versions

Comparing version
0.1.5
to
0.1.6
+4
-3
Cargo.toml

@@ -17,5 +17,6 @@ [package]

llama-sys = { path = "./llama-sys" }
# llm-chain-llama-sys = { path = "../../../llm-chain/llm-chain-llama/sys" }
napi = { version = "2.12.2", default-features = false, features = ["napi6", "async"] }
napi-derive = "2.12.2"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
napi = { version = "2.12.4", default-features = false, features = ["napi6", "async", "serde-json"] }
napi-derive = "2.12.3"
tokio = { version = "1.26.0", features = ["full"] }

@@ -22,0 +23,0 @@ futures = "0.3"

@@ -1,2 +0,2 @@

import { LLama, LlamaInvocation } from "../index";
import { LLama, Generate } from "../index";
import path from "path";

@@ -6,4 +6,5 @@

const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
null,
{
modelPath: path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
},
true

@@ -18,3 +19,3 @@ );

const params: LlamaInvocation = {
const params: Generate = {
nThreads: 4,

@@ -21,0 +22,0 @@ nTokPredict: 2048,

@@ -1,2 +0,2 @@

import { LLama, LlamaContextParams, LlamaInvocation } from "../index";
import { LLama, Generate } from "../index";
import path from "path";

@@ -6,6 +6,5 @@

const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
{
modelPath: path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
nCtx: 512,
nParts: -1,
nGpuLayers: 0,

@@ -25,3 +24,3 @@ seed: 0,

const params: LlamaInvocation = {
const params: Generate = {
nThreads: 4,

@@ -28,0 +27,0 @@ nTokPredict: 2048,

import { InferenceResultType } from "../index";
import { LLama, LlamaInvocation } from "../index";
import { LLama, Generate } from "../index";
import path from "path";

@@ -7,7 +7,6 @@

const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
{
modelPath: path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
nGpuLayers: 32,
nCtx: 1024,
nParts: 1,
seed: 0,

@@ -30,3 +29,3 @@ f16Kv: false,

const params: LlamaInvocation = {
const params: Generate = {
nThreads: 4,

@@ -33,0 +32,0 @@ nTokPredict: 2048,

@@ -6,4 +6,5 @@ import { LLama } from "../index";

const llama = await LLama.load(
path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
null,
{
modelPath: path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"),
},
false

@@ -10,0 +11,0 @@ );

@@ -20,21 +20,103 @@ /* tslint:disable */

}
export interface LlamaInvocation {
export interface LogitBias {
token: number
bias: number
}
export interface Generate {
nThreads: number
nTokPredict: number
topK: number
/**
* logit bias for specific tokens
* Default: None
*/
logitBias?: Array<LogitBias>
/**
* top k tokens to sample from
* Range: <= 0 to use vocab size
* Default: 40
*/
topK?: number
/**
* top p tokens to sample from
* Default: 0.95
* 1.0 = disabled
*/
topP?: number
/**
* tail free sampling
* Default: 1.0
* 1.0 = disabled
*/
tfsZ?: number
/**
* temperature
* Default: 0.80
* 1.0 = disabled
*/
temp?: number
/**
* locally typical sampling
* Default: 1.0
* 1.0 = disabled
*/
typicalP?: number
/**
* repeat penalty
* Default: 1.10
* 1.0 = disabled
*/
repeatPenalty?: number
/**
* last n tokens to penalize
* Default: 64
* 0 = disable penalty, -1 = context size
*/
repeatLastN?: number
/**
* frequency penalty
* Default: 0.00
* 1.0 = disabled
*/
frequencyPenalty?: number
/**
* presence penalty
* Default: 0.00
* 1.0 = disabled
*/
presencePenalty?: number
/**
* Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
* Mirostat: A Neural Text Decoding Algorithm that Directly Controls Perplexity
* Default: 0
* 0 = disabled
* 1 = mirostat 1.0
* 2 = mirostat 2.0
*/
mirostat?: number
/**
* The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
* Default: 5.0
*/
mirostatTau?: number
/**
* The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
* Default: 0.1
*/
mirostatEta?: number
/**
* stop sequence
* Default: None
*/
stopSequence?: string
/**
* consider newlines as a repeatable token
* Default: true
*/
penalizeNl?: boolean
/** prompt */
prompt: string
}
export interface LlamaContextParams {
export interface ModelLoad {
modelPath: string
nCtx: number
nParts: number
nGpuLayers: number

@@ -56,6 +138,6 @@ seed: number

export class LLama {
static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): Promise<LLama>
getWordEmbedding(params: LlamaInvocation): Promise<Array<number>>
static load(params: Partial<LoadModel>, enableLogger: boolean): Promise<LLama>
getWordEmbedding(params: Generate): Promise<Array<number>>
tokenize(params: string): Promise<Array<number>>
inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): () => void
inference(params: Generate, callback: (result: InferenceResult) => void): () => void
}
{
"name": "@llama-node/llama-cpp",
"version": "0.1.5",
"version": "0.1.6",
"main": "index.js",

@@ -5,0 +5,0 @@ "types": "index.d.ts",

@@ -164,3 +164,3 @@ import { exec, execSync } from "child_process";

const run = async () => {
console.log("Checking environment...\n...");
console.log("Checking environment...\n");
checkEnv();

@@ -167,0 +167,0 @@

@@ -5,42 +5,13 @@ use std::{ffi::CStr, ptr::null_mut, slice};

use llama_sys::{
llama_apply_lora_from_file, llama_context, llama_context_default_params, llama_context_params,
llama_eval, llama_free, llama_get_embeddings, llama_get_logits, llama_init_from_file,
llama_n_embd, llama_n_vocab, llama_print_system_info,
llama_sample_frequency_and_presence_penalties, llama_sample_repetition_penalty,
llama_sample_tail_free, llama_sample_temperature, llama_sample_token,
llama_sample_token_greedy, llama_sample_top_k, llama_sample_top_p, llama_sample_typical,
llama_apply_lora_from_file, llama_context, llama_context_params, llama_eval, llama_free,
llama_get_embeddings, llama_get_logits, llama_init_from_file, llama_n_embd, llama_n_vocab,
llama_print_system_info, llama_sample_frequency_and_presence_penalties,
llama_sample_repetition_penalty, llama_sample_tail_free, llama_sample_temperature,
llama_sample_token, llama_sample_token_greedy, llama_sample_token_mirostat,
llama_sample_token_mirostat_v2, llama_sample_top_k, llama_sample_top_p, llama_sample_typical,
llama_token, llama_token_data, llama_token_data_array, llama_token_nl, llama_token_to_str,
};
use crate::types::{LlamaContextParams, LlamaInvocation};
use crate::types::{Generate, ModelLoad};
impl LlamaContextParams {
// Returns the default parameters or the user-specified parameters.
// If the caller supplied params they are converted into the raw FFI struct;
// otherwise llama.cpp's own defaults are requested via FFI.
pub fn or_default(params: &Option<LlamaContextParams>) -> llama_context_params {
match params {
// `clone()` is required because `into()` consumes the value and we only
// hold a shared reference to the Option's contents.
Some(params) => params.clone().into(),
// SAFETY: plain FFI call into llama.cpp that fills a params struct;
// assumed side-effect free — NOTE(review): confirm against llama-sys.
None => unsafe { llama_context_default_params() },
}
}
}
impl From<LlamaContextParams> for llama_context_params {
// Field-by-field mapping from the user-facing (napi-exposed) parameter
// struct onto the raw llama.cpp FFI struct.
fn from(params: LlamaContextParams) -> Self {
llama_context_params {
n_ctx: params.n_ctx,
n_parts: params.n_parts,
n_gpu_layers: params.n_gpu_layers,
seed: params.seed,
f16_kv: params.f16_kv,
logits_all: params.logits_all,
vocab_only: params.vocab_only,
use_mmap: params.use_mmap,
use_mlock: params.use_mlock,
embedding: params.embedding,
// Load-progress reporting is not surfaced to the JS side, so the
// callback and its user-data pointer are left unset.
progress_callback: None,
progress_callback_user_data: null_mut(),
}
}
}
// Represents the LLamaContext which wraps FFI calls to the llama.cpp library.

@@ -53,9 +24,8 @@ pub struct LLamaContext {

// Creates a new LLamaContext from the specified file and configuration parameters.
pub async fn from_file_and_params(
path: &str,
params: &Option<LlamaContextParams>,
) -> Result<Self, napi::Error> {
let lora_params = params.as_ref().and_then(|p| p.lora.clone());
let params = LlamaContextParams::or_default(params);
let ctx = unsafe { llama_init_from_file(path.as_ptr() as *const i8, params) };
pub async fn from_file_and_params(params: &ModelLoad) -> Result<Self, napi::Error> {
let lora_params = &params.lora;
let context_params = ModelLoad::to_llama_context_params(params);
let ctx = unsafe {
llama_init_from_file(params.model_path.as_ptr() as *const i8, context_params)
};

@@ -65,3 +35,3 @@ if ctx.is_null() {

"Failed to initialize LLama context from file: {}",
path
params.model_path,
)));

@@ -109,3 +79,3 @@ }

last_n_tokens: &mut [llama_token],
input: &LlamaInvocation,
input: &Generate,
context_params: &llama_context_params,

@@ -115,6 +85,7 @@ ) -> i32 {

let top_p = input.top_p.unwrap_or(0.95) as f32;
let top_k = if input.top_k <= 0 {
let top_k = input.top_k.unwrap_or(40);
let top_k = if top_k <= 0 {
unsafe { llama_n_vocab(self.ctx) }
} else {
input.top_k
top_k
};

@@ -135,2 +106,9 @@ let tfs_z = input.tfs_z.unwrap_or(1.0) as f32;

let empty_logit_bias = Vec::new();
let logit_bias = input.logit_bias.as_ref().unwrap_or(&empty_logit_bias);
let mirostat = input.mirostat.unwrap_or(0);
let mirostat_tau = input.mirostat_tau.unwrap_or(5.0) as f32;
let mirostat_eta = input.mirostat_eta.unwrap_or(0.1) as f32;
let n_vocab = unsafe { llama_n_vocab(self.ctx) };

@@ -140,3 +118,5 @@ let logits_ptr = unsafe { llama_get_logits(self.ctx) };

// TODO: apply logit bias
for i in logit_bias.iter() {
logits[i.token as usize] += i.bias as f32;
}

@@ -205,4 +185,29 @@ let mut candidates: Vec<llama_token_data> = Vec::with_capacity(n_vocab as usize);

id = unsafe { llama_sample_token_greedy(self.ctx, candidates_p) };
} else if mirostat == 1 {
let mut mirostat_mu = 2.0_f32 * mirostat_tau;
let mirostat_m = 100;
unsafe { llama_sample_temperature(self.ctx, candidates_p, temp) };
id = unsafe {
llama_sample_token_mirostat(
self.ctx,
candidates_p,
mirostat_tau,
mirostat_eta,
mirostat_m,
&mut mirostat_mu,
)
}
} else if mirostat == 2 {
let mut mirostat_mu = 2.0_f32 * mirostat_tau;
unsafe { llama_sample_temperature(self.ctx, candidates_p, temp) };
id = unsafe {
llama_sample_token_mirostat_v2(
self.ctx,
candidates_p,
mirostat_tau,
mirostat_eta,
&mut mirostat_mu,
)
}
} else {
// TODO: here we just do temp for first approach, I don't understand mirostat very well, will impl later
id = unsafe {

@@ -251,3 +256,3 @@ llama_sample_top_k(self.ctx, candidates_p, top_k, 1);

n_past: i32,
input: &LlamaInvocation,
input: &Generate,
) -> Result<(), napi::Error> {

@@ -254,0 +259,0 @@ let res =

@@ -24,3 +24,3 @@ #![deny(clippy::all)]

use tokio::sync::Mutex;
use types::{InferenceResult, InferenceResultType, LlamaContextParams, LlamaInvocation};
use types::{InferenceResult, InferenceResultType, Generate, ModelLoad};

@@ -36,12 +36,12 @@ #[napi]

pub async fn load(
path: String,
params: Option<LlamaContextParams>,
#[napi(ts_arg_type = "Partial<LoadModel>")] params: serde_json::Value,
enable_logger: bool,
) -> Result<LLama> {
let params = serde_json::from_value::<ModelLoad>(params).unwrap();
let logger = LLamaLogger::get_singleton();
logger.set_enabled(enable_logger);
Ok(Self {
llama: LLamaInternal::load(path, params, enable_logger).await?,
llama: LLamaInternal::load(params, enable_logger).await?,
})

@@ -51,3 +51,3 @@ }

#[napi]
pub async fn get_word_embedding(&self, params: LlamaInvocation) -> Result<Vec<f64>> {
pub async fn get_word_embedding(&self, params: Generate) -> Result<Vec<f64>> {
let llama = self.llama.lock().await;

@@ -67,3 +67,3 @@ llama.embedding(&params).await

env: Env,
params: LlamaInvocation,
params: Generate,
#[napi(ts_arg_type = "(result: InferenceResult) => void")] callback: JsFunction,

@@ -70,0 +70,0 @@ ) -> Result<JsFunction> {

@@ -9,5 +9,3 @@ use std::sync::Arc;

tokenizer::{llama_token_eos, tokenize},
types::{
InferenceResult, InferenceResultType, InferenceToken, LlamaContextParams, LlamaInvocation,
},
types::{InferenceResult, InferenceResultType, InferenceToken, Generate, ModelLoad},
};

@@ -17,3 +15,3 @@

context: LLamaContext,
context_params: Option<LlamaContextParams>,
context_params: ModelLoad,
}

@@ -23,8 +21,7 @@

pub async fn load(
path: String,
params: Option<LlamaContextParams>,
params: ModelLoad,
enable_logger: bool,
) -> Result<Arc<Mutex<Self>>, napi::Error> {
) -> Result<Arc<Mutex<LLamaInternal>>, napi::Error> {
let llama = LLamaInternal {
context: LLamaContext::from_file_and_params(&path, &params).await?,
context: LLamaContext::from_file_and_params(&params).await?,
context_params: params,

@@ -46,3 +43,3 @@ };

pub async fn embedding(&self, input: &LlamaInvocation) -> Result<Vec<f64>, napi::Error> {
pub async fn embedding(&self, input: &Generate) -> Result<Vec<f64>, napi::Error> {
let context = &self.context;

@@ -70,3 +67,3 @@ let embd_inp = tokenize(context, input.prompt.as_str(), true);

&self,
input: &LlamaInvocation,
input: &Generate,
running: Arc<Mutex<bool>>,

@@ -76,3 +73,3 @@ callback: impl Fn(InferenceResult),

let context = &self.context;
let context_params_c = LlamaContextParams::or_default(&self.context_params);
let context_params_c = ModelLoad::to_llama_context_params(&self.context_params);
// Tokenize the stop sequence and input prompt.

@@ -79,0 +76,0 @@ let tokenized_stop_prompt = input

@@ -0,2 +1,4 @@

use llama_sys::llama_context_params;
use napi::bindgen_prelude::*;
use serde::{Deserialize, Serialize};

@@ -26,16 +28,87 @@ #[napi(object)]

#[derive(Debug, Clone)]
pub struct LlamaInvocation {
pub struct LogitBias {
pub token: i32,
pub bias: f64,
}
#[napi(object)]
#[derive(Debug, Clone)]
pub struct Generate {
pub n_threads: i32,
pub n_tok_predict: i32,
pub top_k: i32, // 40
pub top_p: Option<f64>, // default 0.95f, 1.0 = disabled
pub tfs_z: Option<f64>, // default 1.00f, 1.0 = disabled
pub temp: Option<f64>, // default 0.80f, 1.0 = disabled
pub typical_p: Option<f64>, // default 1.00f, 1.0 = disabled
pub repeat_penalty: Option<f64>, // default 1.10f, 1.0 = disabled
pub repeat_last_n: Option<i32>, // default 64, last n tokens to penalize (0 = disable penalty, -1 = context size)
pub frequency_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
pub presence_penalty: Option<f64>, // default 0.00f, 1.0 = disabled
/// logit bias for specific tokens
/// Default: None
pub logit_bias: Option<Vec<LogitBias>>,
/// top k tokens to sample from
/// Range: <= 0 to use vocab size
/// Default: 40
pub top_k: Option<i32>,
/// top p tokens to sample from
/// Default: 0.95
/// 1.0 = disabled
pub top_p: Option<f64>,
/// tail free sampling
/// Default: 1.0
/// 1.0 = disabled
pub tfs_z: Option<f64>,
/// temperature
/// Default: 0.80
/// 1.0 = disabled
pub temp: Option<f64>,
/// locally typical sampling
/// Default: 1.0
/// 1.0 = disabled
pub typical_p: Option<f64>,
/// repeat penalty
/// Default: 1.10
/// 1.0 = disabled
pub repeat_penalty: Option<f64>,
/// last n tokens to penalize
/// Default: 64
/// 0 = disable penalty, -1 = context size
pub repeat_last_n: Option<i32>,
/// frequency penalty
/// Default: 0.00
/// 1.0 = disabled
pub frequency_penalty: Option<f64>,
/// presence penalty
/// Default: 0.00
/// 1.0 = disabled
pub presence_penalty: Option<f64>,
/// Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// Mirostat: A Neural Text Decoding Algorithm that Directly Controls Perplexity
/// Default: 0
/// 0 = disabled
/// 1 = mirostat 1.0
/// 2 = mirostat 2.0
pub mirostat: Option<i32>,
/// The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// Default: 5.0
pub mirostat_tau: Option<f64>,
/// The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// Default: 0.1
pub mirostat_eta: Option<f64>,
/// stop sequence
/// Default: None
pub stop_sequence: Option<String>,
/// consider newlines as a repeatable token
/// Default: true
pub penalize_nl: Option<bool>,
/// prompt
pub prompt: String,

@@ -46,6 +119,7 @@ }

#[napi(object)]
#[derive(Debug, Clone)]
pub struct LlamaContextParams {
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default, rename_all = "camelCase")]
pub struct ModelLoad {
pub model_path: String,
pub n_ctx: i32,
pub n_parts: i32,
pub n_gpu_layers: i32,

@@ -62,4 +136,48 @@ pub seed: i32,

impl Default for ModelLoad {
// Defaults used by `#[serde(default)]` when a field is omitted from the
// JSON object passed from JavaScript. NOTE(review): the numeric/bool values
// presumably mirror llama.cpp's own defaults — confirm against llama-sys.
fn default() -> Self {
Self {
// Empty path is a placeholder; loading will fail unless the caller
// provides a real modelPath.
model_path: "".to_string(),
n_ctx: 2048,
n_gpu_layers: 0,
seed: 0,
f16_kv: true,
logits_all: false,
vocab_only: false,
use_mlock: false,
embedding: false,
use_mmap: true,
// No LoRA adapter applied unless explicitly requested.
lora: None,
}
}
}
impl ModelLoad {
// Converts a borrowed `ModelLoad` into the raw llama.cpp FFI params struct.
// Thin wrapper over the `From<ModelLoad>` impl; `clone()` is needed because
// the conversion consumes its input.
pub fn to_llama_context_params(params: &ModelLoad) -> llama_context_params {
params.clone().into()
}
}
impl From<ModelLoad> for llama_context_params {
// Field-by-field mapping from the deserialized JS options object onto the
// raw llama.cpp FFI struct. NOTE(review): `n_parts` is absent here —
// presumably removed from the llama.cpp API in this version; confirm
// against the pinned llama-sys bindings.
fn from(params: ModelLoad) -> Self {
llama_context_params {
n_ctx: params.n_ctx,
n_gpu_layers: params.n_gpu_layers,
seed: params.seed,
f16_kv: params.f16_kv,
logits_all: params.logits_all,
vocab_only: params.vocab_only,
use_mmap: params.use_mmap,
use_mlock: params.use_mlock,
embedding: params.embedding,
// Load-progress reporting is not surfaced to the JS side, so the
// callback and its user-data pointer are left unset.
progress_callback: None,
progress_callback_user_data: std::ptr::null_mut(),
}
}
}
#[napi(object)]
#[derive(Debug, Clone)]
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(default, rename_all = "camelCase")]
pub struct LlamaLoraAdaptor {

@@ -66,0 +184,0 @@ pub lora_adapter: String,

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet