@llama-node/llama-cpp
Advanced tools
import { exec, execSync } from "child_process";
import { existsSync } from "fs";
| const checkClang: () => boolean = () => { | ||
| try { | ||
| process.stdout.write("Checking clang..."); | ||
| execSync("clang --version"); | ||
| console.log("✅"); | ||
| return true; | ||
| } catch (error) { | ||
| return false; | ||
| } | ||
| }; | ||
| const checkGcc: () => boolean = () => { | ||
| try { | ||
| process.stdout.write("Checking gcc..."); | ||
| execSync("gcc --version"); | ||
| console.log("✅"); | ||
| return true; | ||
| } catch (error) { | ||
| return false; | ||
| } | ||
| }; | ||
| const checkEnv = () => { | ||
| // check if rustc is installed and available | ||
| try { | ||
| process.stdout.write("Checking rustc..."); | ||
| execSync("rustc --version"); | ||
| console.log("✅"); | ||
| } catch (error) { | ||
| console.log("❌"); | ||
| console.error("rustc is not installed or not available in PATH"); | ||
| console.log("Please install rustc from https://rustup.rs/"); | ||
| process.exit(1); | ||
| } | ||
| // check if cargo is installed and available | ||
| try { | ||
| process.stdout.write("Checking cargo..."); | ||
| execSync("cargo --version"); | ||
| console.log("✅"); | ||
| } catch (error) { | ||
| console.log("❌"); | ||
| console.error("cargo is not installed or not available in PATH"); | ||
| console.log("Please install cargo from https://rustup.rs/"); | ||
| process.exit(1); | ||
| } | ||
| // check if cmake is installed and available | ||
| try { | ||
| process.stdout.write("Checking cmake..."); | ||
| execSync("cmake --version"); | ||
| console.log("✅"); | ||
| } catch (error) { | ||
| console.log("❌"); | ||
| console.error("cmake is not installed or not available in PATH"); | ||
| console.log( | ||
| "Please install cmake from https://cmake.org/install/ or your package manager. Make sure to add it to PATH." | ||
| ); | ||
| process.exit(1); | ||
| } | ||
| // check if llvm is installed and available | ||
| try { | ||
| process.stdout.write("Checking llvm..."); | ||
| execSync("llvm-config --version"); | ||
| console.log("✅"); | ||
| } catch (error) { | ||
| console.log("❌"); | ||
| console.error("llvm is not installed or not available in PATH"); | ||
| console.log( | ||
| "Please install llvm from https://releases.llvm.org/download.html or your package manager. Make sure to add it to PATH." | ||
| ); | ||
| process.exit(1); | ||
| } | ||
| // check if clang or gcc is installed and available | ||
| if (!checkClang() && !checkGcc()) { | ||
| console.log("❌"); | ||
| console.error("clang or gcc is not installed or not available in PATH"); | ||
| // install clang | ||
| console.log( | ||
| "Please install clang from https://releases.llvm.org/download.html or your package manager. Make sure to add it to PATH." | ||
| ); | ||
| // or install gcc | ||
| console.log( | ||
| "Alternatively, you can install gcc from https://gcc.gnu.org/install/ or your package manager. Make sure to add it to PATH." | ||
| ); | ||
| process.exit(1); | ||
| } | ||
| // check if nvcc is installed and available | ||
| try { | ||
| process.stdout.write("Checking nvcc..."); | ||
| execSync("nvcc --version"); | ||
| console.log("✅"); | ||
| } catch (error) { | ||
| console.log("❌"); | ||
| console.error("nvcc is not installed or not available in PATH"); | ||
| console.log( | ||
| "Please install nvcc from https://developer.nvidia.com/cuda-downloads or your package manager. Make sure to add it to PATH." | ||
| ); | ||
| process.exit(1); | ||
| } | ||
| }; | ||
| const compile = () => { | ||
| const buildProcess = exec( | ||
| `napi build --platform --release --features=cublas` | ||
| ); | ||
| buildProcess.stdout?.pipe(process.stdout); | ||
| buildProcess.stderr?.pipe(process.stderr); | ||
| return new Promise<boolean>((resolve, reject) => { | ||
| buildProcess.on("close", (code) => { | ||
| if (code !== 0) { | ||
| reject(code); | ||
| } else { | ||
| resolve(true); | ||
| } | ||
| }); | ||
| }); | ||
| }; | ||
| const postCompile = async () => { | ||
| const homeDir = process.env.HOME || process.env.USERPROFILE; | ||
| const extension = process.platform === "win32" ? ".dll" : ".so"; | ||
| const libPath = `${homeDir}/.llama-node/libllama${extension}`; | ||
| // check if libllama.so exists | ||
| try { | ||
| process.stdout.write("Checking libllama..."); | ||
| execSync(`ls ${libPath}`); | ||
| console.log("✅"); | ||
| } catch (error) { | ||
| console.error("libllama is not found"); | ||
| console.log( | ||
| "Please make sure that libllama is compiled and installed under ~/.llama-node/" | ||
| ); | ||
| process.exit(1); | ||
| } | ||
| // check if libllama.so is under the LD_LIBRARY_PATH | ||
| try { | ||
| process.stdout.write("Checking LD_LIBRARY_PATH..."); | ||
| execSync(`echo $LD_LIBRARY_PATH | grep ${homeDir}/.llama-node`) | ||
| console.log("✅"); | ||
| } catch (error) { | ||
| console.log("\n\n"); | ||
| console.log("libllama is not under LD_LIBRARY_PATH"); | ||
| console.log("add this to your .bashrc or .zshrc:"); | ||
| console.log( | ||
| `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/.llama-node` | ||
| ); | ||
| } | ||
| }; | ||
| const run = async () => { | ||
| console.log("Checking environment...\n..."); | ||
| checkEnv(); | ||
| console.log("\n\n"); | ||
| console.log("Compiling...\n"); | ||
| await compile(); | ||
| console.log("\n\n"); | ||
| console.log("Post-compiling...\n"); | ||
| await postCompile(); | ||
| console.log("Compile successful!"); | ||
| }; | ||
| run(); |
+7
-3
@@ -13,5 +13,3 @@ [package] | ||
| # Default enable napi4 feature, see https://nodejs.org/api/n-api.html#node-api-version-matrix | ||
| env_logger = "0.10.0" | ||
| log = "0.4" | ||
| once_cell = "1.17.1" | ||
| num_cpus = "1.15.0" | ||
@@ -27,4 +25,10 @@ rand = "0.8.5" | ||
| log = "0.4.17" | ||
| common-rs = { path = "../common-rs" } | ||
| [build-dependencies] | ||
| napi-build = "2.0.1" | ||
| [features] | ||
| default = [] | ||
| cublas = ["llama-sys/cublas"] |
@@ -10,2 +10,3 @@ import { LLama, LlamaContextParams, LlamaInvocation } from "../index"; | ||
| nParts: -1, | ||
| nGpuLayers: 0, | ||
| seed: 0, | ||
@@ -12,0 +13,0 @@ f16Kv: false, |
+22
-1
@@ -0,1 +1,2 @@ | ||
| import { InferenceResultType } from "../index"; | ||
| import { LLama, LlamaInvocation } from "../index"; | ||
@@ -7,3 +8,14 @@ import path from "path"; | ||
| path.resolve(process.cwd(), "../../ggml-vic7b-q5_1.bin"), | ||
| null, | ||
| { | ||
| nGpuLayers: 32, | ||
| nCtx: 1024, | ||
| nParts: 1, | ||
| seed: 0, | ||
| f16Kv: false, | ||
| logitsAll: false, | ||
| vocabOnly: false, | ||
| useMlock: false, | ||
| embedding: false, | ||
| useMmap: true, | ||
| }, | ||
| true | ||
@@ -28,4 +40,13 @@ ); | ||
| const start = Date.now(); | ||
| let count = 0; | ||
| llama.inference(params, (data) => { | ||
| count += 1; | ||
| process.stdout.write(data.data?.token ?? ""); | ||
| if (data.type === InferenceResultType.End) { | ||
| const end = Date.now(); | ||
| console.log(`\n\nToken Count: ${count}`); | ||
| console.log(`\n\nTime: ${end - start}ms`); | ||
| } | ||
| }); | ||
@@ -32,0 +53,0 @@ }; |
@@ -13,3 +13,3 @@ import { LLama } from "../index"; | ||
| llama.tokenize(template, 2048).then((data) => { | ||
| llama.tokenize(template).then((data) => { | ||
| console.log(data); | ||
@@ -16,0 +16,0 @@ }); |
+8
-1
@@ -39,2 +39,3 @@ /* tslint:disable */ | ||
| nParts: number | ||
| nGpuLayers: number | ||
| seed: number | ||
@@ -47,8 +48,14 @@ f16Kv: boolean | ||
| useMmap: boolean | ||
| lora?: LlamaLoraAdaptor | ||
| } | ||
| export interface LlamaLoraAdaptor { | ||
| loraAdapter: string | ||
| loraBase?: string | ||
| nThreads: number | ||
| } | ||
| export class LLama { | ||
| static load(path: string, params: LlamaContextParams | undefined | null, enableLogger: boolean): Promise<LLama> | ||
| getWordEmbedding(params: LlamaInvocation): Promise<Array<number>> | ||
| tokenize(params: string, nCtx: number): Promise<Array<number>> | ||
| tokenize(params: string): Promise<Array<number>> | ||
| inference(params: LlamaInvocation, callback: (result: InferenceResult) => void): () => void | ||
| } |
+90
-45
@@ -7,2 +7,3 @@ #![allow(clippy::uninlined_format_args)] | ||
| use dirs::home_dir; | ||
| use platforms::{Arch, Platform, OS}; | ||
@@ -12,6 +13,45 @@ use std::env; | ||
| struct BuildLinkInfo { | ||
| link_type: String, | ||
| #[cfg(target_os = "windows")] | ||
| link_extension_windows: String, | ||
| link_extension_nix: String, | ||
| link_out_dir: String, | ||
| cmake_link_flag: Vec<String>, | ||
| } | ||
| #[cfg(not(feature = "dynamic"))] | ||
| fn get_link_info() -> BuildLinkInfo { | ||
| BuildLinkInfo { | ||
| link_type: "static".to_owned(), | ||
| #[cfg(target_os = "windows")] | ||
| link_extension_windows: "lib".to_owned(), | ||
| link_extension_nix: "a".to_owned(), | ||
| link_out_dir: env::var("OUT_DIR").unwrap(), | ||
| cmake_link_flag: vec!["-DLLAMA_STATIC=ON".to_owned()], | ||
| } | ||
| } | ||
| #[cfg(feature = "dynamic")] | ||
| fn get_link_info() -> BuildLinkInfo { | ||
| BuildLinkInfo { | ||
| link_type: "dylib".to_owned(), | ||
| #[cfg(target_os = "windows")] | ||
| link_extension_windows: "dll".to_owned(), | ||
| link_extension_nix: "so".to_owned(), | ||
| link_out_dir: env::var("OUT_DIR").unwrap(), | ||
| cmake_link_flag: vec![ | ||
| "-DLLAMA_STATIC=OFF".to_owned(), | ||
| "-DBUILD_SHARED_LIBS=ON".to_owned(), | ||
| ], | ||
| } | ||
| } | ||
| fn main() { | ||
| let initial_dir = env::current_dir().unwrap(); | ||
| let home_dir = home_dir().unwrap(); | ||
| let llama_node_dir = home_dir.join(".llama-node"); | ||
| println!("cargo:warning=working_dir: {}", initial_dir.display()); | ||
| if !llama_node_dir.exists() { | ||
| std::fs::create_dir(&llama_node_dir).expect("Unable to create .llama-node directory"); | ||
| } | ||
@@ -28,12 +68,6 @@ let target = env::var("TARGET").unwrap(); | ||
| #[allow(unused_mut, unused_assignments)] | ||
| let mut link_type = "static"; | ||
| let build_link_info = get_link_info(); | ||
| #[cfg(feature = "dynamic")] | ||
| { | ||
| link_type = "dylib"; | ||
| } | ||
| println!("cargo:rustc-link-search={}", env::var("OUT_DIR").unwrap()); | ||
| println!("cargo:rustc-link-lib={}=llama", link_type); | ||
| println!("cargo:rustc-link-search={}", build_link_info.link_out_dir); | ||
| println!("cargo:rustc-link-lib={}=llama", build_link_info.link_type); | ||
| println!("cargo:rerun-if-changed=wrapper.h"); | ||
@@ -51,3 +85,3 @@ | ||
| Ok(b) => { | ||
| let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); | ||
| let out_path = PathBuf::from(build_link_info.link_out_dir.clone()); | ||
| b.write_to_file(out_path.join("bindings.rs")) | ||
@@ -88,17 +122,13 @@ .expect("Couldn't write bindings!"); | ||
| #[cfg(feature = "cublas")] | ||
| { | ||
| command.arg("-DLLAMA_CUBLAS=ON"); | ||
| for flag in build_link_info.cmake_link_flag { | ||
| command.arg(&flag); | ||
| } | ||
| #[allow(unused_mut, unused_assignments)] | ||
| let mut link_type = "-DLLAMA_STATIC=ON"; | ||
| #[cfg(feature = "dynamic")] | ||
| #[cfg(feature = "cublas")] | ||
| { | ||
| command.arg("-DBUILD_SHARED_LIBS=ON"); | ||
| link_type = "-DLLAMA_STATIC=OFF"; | ||
| command | ||
| .arg("-DLLAMA_CUBLAS=ON") | ||
| .arg("-DCMAKE_POSITION_INDEPENDENT_CODE=ON"); | ||
| } | ||
| command.arg(link_type); | ||
| if platform.target_os == OS::MacOS { | ||
@@ -137,21 +167,2 @@ if platform.target_arch == Arch::AArch64 { | ||
| #[allow(unused_mut, unused_assignments)] | ||
| let mut link_ext = ("lib", "a"); | ||
| #[allow(unused_mut, unused_assignments)] | ||
| let mut out_dir = env::var("OUT_DIR").unwrap(); | ||
| #[cfg(feature = "dynamic")] | ||
| { | ||
| link_ext = ("dll", "so"); | ||
| let bin_dir = initial_dir.parent().unwrap(); | ||
| let bin_dir = bin_dir.join("./@llama-node"); | ||
| println!("cargo:warning=bin_dir: {:?}", bin_dir.display()); | ||
| if !bin_dir.exists() { | ||
| std::fs::create_dir(bin_dir.clone()).unwrap(); | ||
| } | ||
| out_dir = bin_dir.to_str().unwrap().to_string(); | ||
| } | ||
| println!("cargo:warning=out_dir: {:?}", out_dir); | ||
| // move libllama.a to where Cargo expects it (OUT_DIR) | ||
@@ -161,6 +172,23 @@ #[cfg(target_os = "windows")] | ||
| std::fs::copy( | ||
| format!("Release/llama.{}", link_ext.0), | ||
| format!("{}/llama.{}", out_dir, link_ext.0), | ||
| format!("Release/llama.{}", build_link_info.link_extension_windows), | ||
| format!( | ||
| "{}/llama.{}", | ||
| build_link_info.link_out_dir, build_link_info.link_extension_windows | ||
| ), | ||
| ) | ||
| .expect("Failed to copy lib"); | ||
| #[cfg(feature = "dynamic")] | ||
| { | ||
| // move libllama.dll to llama_node_dir | ||
| std::fs::copy( | ||
| format!("Release/llama.{}", build_link_info.link_extension_windows), | ||
| format!( | ||
| "{}/llama.{}", | ||
| llama_node_dir.display(), | ||
| build_link_info.link_extension_windows | ||
| ), | ||
| ) | ||
| .expect("Failed to copy lib"); | ||
| } | ||
| } | ||
@@ -171,6 +199,23 @@ | ||
| std::fs::copy( | ||
| format!("libllama.{}", link_ext.1), | ||
| format!("{}/libllama.{}", out_dir, link_ext.1), | ||
| format!("libllama.{}", build_link_info.link_extension_nix), | ||
| format!( | ||
| "{}/libllama.{}", | ||
| build_link_info.link_out_dir, build_link_info.link_extension_nix | ||
| ), | ||
| ) | ||
| .expect("Failed to copy lib"); | ||
| #[cfg(feature = "dynamic")] | ||
| { | ||
| // move libllama.so to llama_node_dir | ||
| std::fs::copy( | ||
| format!("libllama.{}", build_link_info.link_extension_nix), | ||
| format!( | ||
| "{}/libllama.{}", | ||
| llama_node_dir.display(), | ||
| build_link_info.link_extension_nix | ||
| ), | ||
| ) | ||
| .expect("Failed to copy lib"); | ||
| } | ||
| } | ||
@@ -177,0 +222,0 @@ // clean the llama build directory to prevent Cargo from complaining during crate publish |
@@ -12,2 +12,3 @@ [package] | ||
| platforms = "3.0.2" | ||
| dirs = "5.0.1" | ||
@@ -14,0 +15,0 @@ [features] |
+2
-1
| { | ||
| "name": "@llama-node/llama-cpp", | ||
| "version": "0.1.4", | ||
| "version": "0.1.5", | ||
| "main": "index.js", | ||
@@ -28,2 +28,3 @@ "types": "index.d.ts", | ||
| "build": "napi build --platform --release", | ||
| "build:cuda": "tsx scripts/cuda-compile.mts", | ||
| "build:debug": "napi build --platform", | ||
@@ -30,0 +31,0 @@ "test": "vitest", |
+22
-10
@@ -14,3 +14,3 @@ use std::{ffi::CStr, ptr::null_mut, slice}; | ||
| use crate::types::{LlamaContextParams, LlamaInvocation, LlamaLoraAdaptor}; | ||
| use crate::types::{LlamaContextParams, LlamaInvocation}; | ||
@@ -32,2 +32,3 @@ impl LlamaContextParams { | ||
| n_parts: params.n_parts, | ||
| n_gpu_layers: params.n_gpu_layers, | ||
| seed: params.seed, | ||
@@ -56,6 +57,14 @@ f16_kv: params.f16_kv, | ||
| params: &Option<LlamaContextParams>, | ||
| lora_params: &Option<LlamaLoraAdaptor>, | ||
| ) -> Self { | ||
| ) -> Result<Self, napi::Error> { | ||
| let lora_params = params.as_ref().and_then(|p| p.lora.clone()); | ||
| let params = LlamaContextParams::or_default(params); | ||
| let ctx = unsafe { llama_init_from_file(path.as_ptr() as *const i8, params) }; | ||
| if ctx.is_null() { | ||
| return Err(napi::Error::from_reason(format!( | ||
| "Failed to initialize LLama context from file: {}", | ||
| path | ||
| ))); | ||
| } | ||
| if let Some(lora_params) = lora_params { | ||
@@ -78,15 +87,18 @@ let lora_base_path = lora_params | ||
| if err != 0 { | ||
| panic!("Failed to apply LORA adapter"); | ||
| return Err(napi::Error::from_reason(format!( | ||
| "Failed to apply lora adapter: {}", | ||
| err | ||
| ))); | ||
| } | ||
| } | ||
| Self { ctx } | ||
| Ok(Self { ctx }) | ||
| } | ||
| pub fn llama_print_system_info(&self) { | ||
| pub fn llama_print_system_info(&self) -> Result<()> { | ||
| let sys_info_c_str = unsafe { llama_print_system_info() }; | ||
| let sys_info = unsafe { CStr::from_ptr(sys_info_c_str) } | ||
| .to_str() | ||
| .unwrap() | ||
| .to_str()? | ||
| .to_owned(); | ||
| log::info!("{}", sys_info); | ||
| Ok(()) | ||
| } | ||
@@ -236,3 +248,3 @@ | ||
| input: &LlamaInvocation, | ||
| ) -> Result<(), ()> { | ||
| ) -> Result<(), napi::Error> { | ||
| let res = | ||
@@ -243,3 +255,3 @@ unsafe { llama_eval(self.ctx, tokens.as_ptr(), n_tokens, n_past, input.n_threads) }; | ||
| } else { | ||
| Err(()) | ||
| Err(napi::Error::from_reason("LLama eval failed")) | ||
| } | ||
@@ -246,0 +258,0 @@ } |
+20
-11
@@ -13,2 +13,4 @@ #![deny(clippy::all)] | ||
| use common_rs::logger::LLamaLogger; | ||
| use llama::LLamaInternal; | ||
@@ -23,3 +25,3 @@ use napi::{ | ||
| use tokio::sync::Mutex; | ||
| use types::{InferenceResult, LlamaContextParams, LlamaInvocation}; | ||
| use types::{InferenceResult, InferenceResultType, LlamaContextParams, LlamaInvocation}; | ||
@@ -39,11 +41,8 @@ #[napi] | ||
| ) -> Result<LLama> { | ||
| if enable_logger { | ||
| env_logger::builder() | ||
| .filter_level(log::LevelFilter::Info) | ||
| .parse_default_env() | ||
| .init(); | ||
| } | ||
| let logger = LLamaLogger::get_singleton(); | ||
| logger.set_enabled(enable_logger); | ||
| Ok(Self { | ||
| llama: LLamaInternal::load(path, params, enable_logger).await, | ||
| llama: LLamaInternal::load(path, params, enable_logger).await?, | ||
| }) | ||
@@ -59,5 +58,5 @@ } | ||
| #[napi] | ||
| pub async fn tokenize(&self, params: String, n_ctx: i32) -> Result<Vec<i32>> { | ||
| pub async fn tokenize(&self, params: String) -> Result<Vec<i32>> { | ||
| let llama = self.llama.lock().await; | ||
| llama.tokenize(¶ms, n_ctx as usize).await | ||
| llama.tokenize(¶ms).await | ||
| } | ||
@@ -85,5 +84,15 @@ | ||
| let llama = llama.blocking_lock(); | ||
| llama.inference(¶ms, running, |result| { | ||
| let res = llama.inference(¶ms, running, |result| { | ||
| tsfn.call(result, ThreadsafeFunctionCallMode::NonBlocking); | ||
| }); | ||
| if let Err(e) = res { | ||
| tsfn.call( | ||
| InferenceResult { | ||
| r#type: InferenceResultType::Error, | ||
| data: None, | ||
| message: Some(format!("Failed to run inference: {:?}", e)), | ||
| }, | ||
| ThreadsafeFunctionCallMode::NonBlocking, | ||
| ); | ||
| } | ||
| }); | ||
@@ -90,0 +99,0 @@ } |
+27
-53
| use std::sync::Arc; | ||
| use anyhow::Result; | ||
| use tokio::sync::Mutex; | ||
@@ -23,5 +24,5 @@ | ||
| enable_logger: bool, | ||
| ) -> Arc<Mutex<Self>> { | ||
| ) -> Result<Arc<Mutex<Self>>, napi::Error> { | ||
| let llama = LLamaInternal { | ||
| context: LLamaContext::from_file_and_params(&path, ¶ms, &None).await, | ||
| context: LLamaContext::from_file_and_params(&path, ¶ms).await?, | ||
| context_params: params, | ||
@@ -31,14 +32,12 @@ }; | ||
| if enable_logger { | ||
| llama.context.llama_print_system_info(); | ||
| llama.context.llama_print_system_info().map_err(|e| { | ||
| napi::Error::from_reason(format!("Failed to print system info: {:?}", e)) | ||
| })?; | ||
| } | ||
| Arc::new(Mutex::new(llama)) | ||
| Ok(Arc::new(Mutex::new(llama))) | ||
| } | ||
| pub async fn tokenize(&self, input: &str, n_ctx: usize) -> Result<Vec<i32>, napi::Error> { | ||
| pub async fn tokenize(&self, input: &str) -> Result<Vec<i32>, napi::Error> { | ||
| let context = &self.context; | ||
| if let Ok(data) = tokenize(context, input, n_ctx, false) { | ||
| Ok(data) | ||
| } else { | ||
| Err(napi::Error::from_reason("Failed to tokenize")) | ||
| } | ||
| Ok(tokenize(context, input, false)) | ||
| } | ||
@@ -48,10 +47,3 @@ | ||
| let context = &self.context; | ||
| let context_params_c = LlamaContextParams::or_default(&self.context_params); | ||
| let embd_inp = tokenize( | ||
| context, | ||
| input.prompt.as_str(), | ||
| context_params_c.n_ctx as usize, | ||
| true, | ||
| ) | ||
| .unwrap(); | ||
| let embd_inp = tokenize(context, input.prompt.as_str(), true); | ||
@@ -64,3 +56,3 @@ // let end_text = "\n"; | ||
| .llama_eval(embd_inp.as_slice(), embd_inp.len() as i32, 0, input) | ||
| .unwrap(); | ||
| .map_err(|e| napi::Error::from_reason(format!("Failed to evaluate input: {:?}", e)))?; | ||
@@ -81,25 +73,14 @@ let embeddings = context.llama_get_embeddings(); | ||
| callback: impl Fn(InferenceResult), | ||
| ) { | ||
| ) -> Result<(), napi::Error> { | ||
| let context = &self.context; | ||
| let context_params_c = LlamaContextParams::or_default(&self.context_params); | ||
| // Tokenize the stop sequence and input prompt. | ||
| let tokenized_stop_prompt = input.stop_sequence.as_ref().map(|stop_sequence| { | ||
| tokenize( | ||
| context, | ||
| stop_sequence, | ||
| context_params_c.n_ctx as usize, | ||
| false, | ||
| ) | ||
| .unwrap() | ||
| }); | ||
| let tokenized_stop_prompt = input | ||
| .stop_sequence | ||
| .as_ref() | ||
| .map(|stop_sequence| tokenize(context, stop_sequence, false)); | ||
| log::info!("tokenized_stop_prompt: {:?}", tokenized_stop_prompt); | ||
| let tokenized_input = tokenize( | ||
| context, | ||
| input.prompt.as_str(), | ||
| context_params_c.n_ctx as usize, | ||
| true, | ||
| ) | ||
| .unwrap(); | ||
| let tokenized_input = tokenize(context, input.prompt.as_str(), true); | ||
@@ -111,10 +92,8 @@ // Embd contains the prompt and the completion. The longer the prompt, the shorter the completion. | ||
| // Feed prompt to the model. | ||
| context | ||
| .llama_eval( | ||
| tokenized_input.as_slice(), | ||
| tokenized_input.len() as i32, | ||
| 0, | ||
| input, | ||
| ) | ||
| .unwrap(); | ||
| context.llama_eval( | ||
| tokenized_input.as_slice(), | ||
| tokenized_input.len() as i32, | ||
| 0, | ||
| input, | ||
| )?; | ||
| let token_eos = llama_token_eos(); | ||
@@ -150,8 +129,3 @@ | ||
| { | ||
| callback(InferenceResult { | ||
| r#type: InferenceResultType::Error, | ||
| data: None, | ||
| message: Some("Too many tokens predicted".to_string()), | ||
| }); | ||
| break; | ||
| return Err(napi::Error::from_reason("Too many tokens predicted")); | ||
| } | ||
@@ -188,5 +162,3 @@ | ||
| // Continue feeding the token to the model. | ||
| context | ||
| .llama_eval(&embd[n_used..], 1, n_used as i32, input) | ||
| .unwrap(); | ||
| context.llama_eval(&embd[n_used..], 1, n_used as i32, input)?; | ||
| } | ||
@@ -210,3 +182,5 @@ | ||
| }); | ||
| Ok(()) | ||
| } | ||
| } |
+3
-12
| // use crate::output::Output; | ||
| use anyhow::Result; | ||
| // use anyhow::Result; | ||
| use std::ffi::CString; | ||
@@ -31,13 +31,4 @@ use std::os::raw::c_char; | ||
| /// A Result containing a Vec of llama_tokens on success, or an error if the tokenized input is too long. | ||
| pub(crate) fn tokenize( | ||
| context: &LLamaContext, | ||
| text: &str, | ||
| context_window_size: usize, | ||
| add_bos: bool, | ||
| ) -> Result<Vec<llama_token>> { | ||
| let tokenized_input = llama_tokenize_helper(context, text, add_bos); | ||
| if tokenized_input.len() > context_window_size { | ||
| anyhow::bail!("Input too long") | ||
| } | ||
| Ok(tokenized_input) | ||
| pub(crate) fn tokenize(context: &LLamaContext, text: &str, add_bos: bool) -> Vec<llama_token> { | ||
| llama_tokenize_helper(context, text, add_bos) | ||
| } | ||
@@ -44,0 +35,0 @@ |
+2
-0
@@ -49,2 +49,3 @@ use napi::bindgen_prelude::*; | ||
| pub n_parts: i32, | ||
| pub n_gpu_layers: i32, | ||
| pub seed: i32, | ||
@@ -57,2 +58,3 @@ pub f16_kv: bool, | ||
| pub use_mmap: bool, | ||
| pub lora: Option<LlamaLoraAdaptor>, | ||
| } | ||
@@ -59,0 +61,0 @@ |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Shell access
Supply chain risk: This module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package
Environment variable access
Supply chain risk: The package accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 2 instances in 1 package
Filesystem access
Supply chain risk: The package accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
URL strings
Supply chain risk: The package contains fragments of external URLs or IP addresses, which it may be accessing at runtime.
Found 1 instance in 1 package
Shell access
Supply chain risk: This module accesses the system shell. Accessing the system shell increases the risk of executing arbitrary code.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: The package accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
28
3.7%531
5.57%9645701
-35.59%5
150%8
14.29%