node-llama-cpp - npm Package Compare versions

Comparing version 2.0.0 to 2.1.0

dist/chatWrappers/ChatMLPromptWrapper.d.ts


dist/ChatPromptWrapper.d.ts
export declare abstract class ChatPromptWrapper {
abstract readonly wrapperName: string;
wrapPrompt(prompt: string, { systemPrompt, promptIndex }: {
wrapPrompt(prompt: string, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }: {
systemPrompt: string;
promptIndex: number;
lastStopString: string | null;
lastStopStringSuffix: string | null;
}): string;
getStopStrings(): string[];
}

dist/chatWrappers/EmptyChatPromptWrapper.d.ts
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
export declare class EmptyChatPromptWrapper extends ChatPromptWrapper {
readonly wrapperName: string;
}

dist/chatWrappers/EmptyChatPromptWrapper.js
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
export class EmptyChatPromptWrapper extends ChatPromptWrapper {
wrapperName = "Empty";
}
//# sourceMappingURL=EmptyChatPromptWrapper.js.map

dist/chatWrappers/GeneralChatPromptWrapper.d.ts
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
export declare class GeneralChatPromptWrapper extends ChatPromptWrapper {
wrapPrompt(prompt: string, { systemPrompt, promptIndex }: {
readonly wrapperName: string;
private readonly _instructionName;
private readonly _responseName;
constructor({ instructionName, responseName }?: {
instructionName?: string;
responseName?: string;
});
wrapPrompt(prompt: string, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }: {
systemPrompt: string;
promptIndex: number;
lastStopString: string | null;
lastStopStringSuffix: string | null;
}): string;
getStopStrings(): string[];
private _getPromptPrefix;
}

dist/chatWrappers/GeneralChatPromptWrapper.js
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
import { getTextCompletion } from "../utils/getTextCompletion.js";
export class GeneralChatPromptWrapper extends ChatPromptWrapper {
wrapPrompt(prompt, { systemPrompt, promptIndex }) {
const conversationPrompt = "\n\n### Human:\n\n" + prompt + "\n\n### Assistant:\n\n";
return promptIndex === 0 ? systemPrompt + conversationPrompt : conversationPrompt;
wrapperName = "General";
_instructionName;
_responseName;
constructor({ instructionName = "Human", responseName = "Assistant" } = {}) {
super();
this._instructionName = instructionName;
this._responseName = responseName;
}
wrapPrompt(prompt, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }) {
if (promptIndex === 0)
return systemPrompt + `\n\n### ${this._instructionName}:\n\n` + prompt + `\n\n### ${this._responseName}:\n\n`;
return this._getPromptPrefix(lastStopString, lastStopStringSuffix) + prompt + `\n\n### ${this._responseName}:\n\n`;
}
getStopStrings() {
return ["### Human:", "Human:", "### Assistant:", "Assistant:", "<end>"];
return [
`\n\n### ${this._instructionName}`,
`### ${this._instructionName}`,
`\n\n### ${this._responseName}`,
`### ${this._responseName}`,
"<end>"
];
}
_getPromptPrefix(lastStopString, lastStopStringSuffix) {
return getTextCompletion(lastStopString === "<end>"
? lastStopStringSuffix
: (lastStopString + (lastStopStringSuffix ?? "")), [
`\n\n### ${this._instructionName}:\n\n`,
`### ${this._instructionName}:\n\n`
]) ?? `\n\n### ${this._instructionName}:\n\n`;
}
}
//# sourceMappingURL=GeneralChatPromptWrapper.js.map
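
For illustration, a minimal sketch of the new `GeneralChatPromptWrapper` constructor options introduced above (`instructionName` and `responseName`); the values shown are just the defaults spelled out explicitly:

```typescript
import {GeneralChatPromptWrapper} from "node-llama-cpp";

// The "### Human:" / "### Assistant:" section headers used in the prompt are now configurable;
// omitting the options object keeps these defaults
const promptWrapper = new GeneralChatPromptWrapper({
    instructionName: "Human",
    responseName: "Assistant"
});
```

The resulting wrapper can be passed to `LlamaChatSession` through its `promptWrapper` option.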


dist/chatWrappers/LlamaChatPromptWrapper.d.ts
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
export declare class LlamaChatPromptWrapper extends ChatPromptWrapper {
wrapPrompt(prompt: string, { systemPrompt, promptIndex }: {
readonly wrapperName: string;
wrapPrompt(prompt: string, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }: {
systemPrompt: string;
promptIndex: number;
lastStopString: string | null;
lastStopStringSuffix: string | null;
}): string;
getStopStrings(): string[];
}

dist/chatWrappers/LlamaChatPromptWrapper.js
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
import { getTextCompletion } from "../utils/getTextCompletion.js";
// source: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
export class LlamaChatPromptWrapper extends ChatPromptWrapper {
wrapPrompt(prompt, { systemPrompt, promptIndex }) {
wrapperName = "LlamaChat";
wrapPrompt(prompt, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }) {
const previousCompletionEnd = (lastStopString ?? "") + (lastStopStringSuffix ?? "");
if (promptIndex === 0 && systemPrompt != "") {
return "<s>[INST] <<SYS>>\n" + systemPrompt + "\n<</SYS>>\n\n" + prompt + " [/INST]\n\n";
return (getTextCompletion(previousCompletionEnd, "<s>[INST] <<SYS>>\n") ?? "<s>[INST] <<SYS>>\n") + systemPrompt +
"\n<</SYS>>\n\n" + prompt + " [/INST]\n\n";
}
else {
return "<s>[INST] " + prompt + " [/INST]\n\n";
return (getTextCompletion(previousCompletionEnd, "</s><s>[INST] ") ?? "<s>[INST] ") + prompt + " [/INST]\n\n";
}
}
getStopStrings() {
return ["</s><s>[INST]"];
return ["</s>"];
}
}
//# sourceMappingURL=LlamaChatPromptWrapper.js.map

@@ -5,5 +5,7 @@ import { CommandModule } from "yargs";

nodeTarget?: string;
metal: boolean;
cuda: boolean;
};
export declare const BuildCommand: CommandModule<object, BuildCommand>;
export declare function BuildLlamaCppCommand({ arch, nodeTarget }: BuildCommand): Promise<void>;
export declare function BuildLlamaCppCommand({ arch, nodeTarget, metal, cuda }: BuildCommand): Promise<void>;
export {};

@@ -0,1 +1,2 @@

import process from "process";
import chalk from "chalk";

@@ -5,2 +6,3 @@ import { compileLlamaCpp } from "../../utils/compileLLamaCpp.js";

import { clearTempFolder } from "../../utils/clearTempFolder.js";
import { defaultLlamaCppCudaSupport, defaultLlamaCppMetalSupport } from "../../config.js";
export const BuildCommand = {

@@ -12,2 +14,3 @@ command: "build",

.option("arch", {
alias: "a",
type: "string",

@@ -17,4 +20,15 @@ description: "The architecture to compile llama.cpp for"

.option("nodeTarget", {
alias: "t",
type: "string",
description: "The Node.js version to compile llama.cpp for. Example: v18.0.0"
})
.option("metal", {
type: "boolean",
default: defaultLlamaCppMetalSupport,
description: "Compile llama.cpp with Metal support. Can also be set via the NODE_LLAMA_CPP_METAL environment variable"
})
.option("cuda", {
type: "boolean",
default: defaultLlamaCppCudaSupport,
description: "Compile llama.cpp with CUDA support. Can also be set via the NODE_LLAMA_CPP_CUDA environment variable"
});

@@ -24,3 +38,9 @@ },

};
export async function BuildLlamaCppCommand({ arch, nodeTarget }) {
export async function BuildLlamaCppCommand({ arch, nodeTarget, metal, cuda }) {
if (metal && process.platform === "darwin") {
console.log(`${chalk.yellow("Metal:")} enabled`);
}
if (cuda) {
console.log(`${chalk.yellow("CUDA:")} enabled`);
}
await withOra({

@@ -34,3 +54,5 @@ loading: chalk.blue("Compiling llama.cpp"),

nodeTarget: nodeTarget ? nodeTarget : undefined,
setUsedBingFlag: true
setUsedBingFlag: true,
metal,
cuda
});

@@ -37,0 +59,0 @@ });

import { CommandModule } from "yargs";
import type { LlamaGrammar } from "../../llamaEvaluator/LlamaGrammar.js";
type ChatCommand = {

@@ -6,6 +7,11 @@ model: string;

systemPrompt: string;
wrapper: string;
wrapper: "auto" | "general" | "llamaChat" | "chatML";
contextSize: number;
grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[0];
temperature: number;
topK: number;
topP: number;
maxTokens: number;
};
export declare const ChatCommand: CommandModule<object, ChatCommand>;
export {};

@@ -8,2 +8,4 @@ import * as readline from "readline/promises";

import { GeneralChatPromptWrapper } from "../../chatWrappers/GeneralChatPromptWrapper.js";
import { ChatMLPromptWrapper } from "../../chatWrappers/ChatMLPromptWrapper.js";
import { getChatWrapperByBos } from "../../chatWrappers/createChatWrapperByBos.js";
export const ChatCommand = {

@@ -15,2 +17,3 @@ command: "chat",

.option("model", {
alias: "m",
type: "string",

@@ -22,2 +25,3 @@ demandOption: true,

.option("systemInfo", {
alias: "i",
type: "boolean",

@@ -29,2 +33,3 @@ default: false,

.option("systemPrompt", {
alias: "s",
type: "string",

@@ -38,9 +43,11 @@ default: defaultChatSystemPrompt,

.option("wrapper", {
alias: "w",
type: "string",
default: "general",
choices: ["general", "llama"],
description: "Chat wrapper to use",
choices: ["auto", "general", "llamaChat", "chatML"],
description: "Chat wrapper to use. Use `auto` to automatically select a wrapper based on the model's BOS token",
group: "Optional:"
})
.option("contextSize", {
alias: "c",
type: "number",

@@ -50,7 +57,43 @@ default: 1024 * 4,

group: "Optional:"
})
.option("grammar", {
alias: "g",
type: "string",
default: "text",
choices: ["text", "json", "list", "arithmetic", "japanese", "chess"],
description: "Restrict the model response to a specific grammar, like JSON for example",
group: "Optional:"
})
.option("temperature", {
alias: "t",
type: "number",
default: 0,
description: "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The suggested temperature is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. Set to `0` to disable.",
group: "Optional:"
})
.option("topK", {
alias: "k",
type: "number",
default: 40,
description: "Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation. An integer number between `1` and the size of the vocabulary. Set to `0` to disable (which uses the full vocabulary). Only relevant when `temperature` is set to a value greater than 0.",
group: "Optional:"
})
.option("topP", {
alias: "p",
type: "number",
default: 0.95,
description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`.",
group: "Optional:"
})
.option("maxTokens", {
alias: "mt",
type: "number",
default: 0,
description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size",
group: "Optional:"
});
},
async handler({ model, systemInfo, systemPrompt, wrapper, contextSize }) {
async handler({ model, systemInfo, systemPrompt, wrapper, contextSize, grammar, temperature, topK, topP, maxTokens }) {
try {
await RunChat({ model, systemInfo, systemPrompt, wrapper, contextSize });
await RunChat({ model, systemInfo, systemPrompt, wrapper, contextSize, grammar, temperature, topK, topP, maxTokens });
}

@@ -63,11 +106,23 @@ catch (err) {

};
async function RunChat({ model: modelArg, systemInfo, systemPrompt, wrapper, contextSize }) {
async function RunChat({ model: modelArg, systemInfo, systemPrompt, wrapper, contextSize, grammar: grammarArg, temperature, topK, topP, maxTokens }) {
const { LlamaChatSession } = await import("../../llamaEvaluator/LlamaChatSession.js");
const { LlamaModel } = await import("../../llamaEvaluator/LlamaModel.js");
const { LlamaContext } = await import("../../llamaEvaluator/LlamaContext.js");
const { LlamaGrammar } = await import("../../llamaEvaluator/LlamaGrammar.js");
const model = new LlamaModel({
modelPath: modelArg,
contextSize
contextSize,
temperature,
topK,
topP
});
const context = new LlamaContext({ model });
const context = new LlamaContext({
model,
grammar: grammarArg !== "text"
? await LlamaGrammar.getFor(grammarArg)
: undefined
});
const bos = context.getBosString(); // bos = beginning of sequence
const eos = context.getEosString(); // eos = end of sequence
const promptWrapper = getChatWrapper(wrapper, bos);
const session = new LlamaChatSession({

@@ -77,4 +132,7 @@ context,

systemPrompt,
promptWrapper: createChatWrapper(wrapper)
promptWrapper
});
console.info(`${chalk.yellow("BOS:")} ${bos}`);
console.info(`${chalk.yellow("EOS:")} ${eos}`);
console.info(`${chalk.yellow("Chat wrapper:")} ${promptWrapper.wrapperName}`);
await withOra({

@@ -101,4 +159,11 @@ loading: chalk.blue("Loading model"),

process.stdout.write(startColor);
await session.prompt(input, (chunk) => {
process.stdout.write(session.context.decode(Uint32Array.from(chunk)));
await session.prompt(input, {
maxTokens: maxTokens === -1
? context.getContextSize()
: maxTokens <= 0
? undefined
: maxTokens,
onToken(chunk) {
process.stdout.write(session.context.decode(Uint32Array.from(chunk)));
}
});

@@ -109,11 +174,21 @@ process.stdout.write(endColor);

}
function createChatWrapper(wrapper) {
function getChatWrapper(wrapper, bos) {
switch (wrapper) {
case "general":
return new GeneralChatPromptWrapper();
case "llama":
case "llamaChat":
return new LlamaChatPromptWrapper();
case "chatML":
return new ChatMLPromptWrapper();
default:
}
if (wrapper === "auto") {
const chatWrapper = getChatWrapperByBos(bos);
if (chatWrapper != null)
return new chatWrapper();
return new GeneralChatPromptWrapper();
}
void (wrapper);
throw new Error("Unknown wrapper: " + wrapper);
}
//# sourceMappingURL=ChatCommand.js.map
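
The `auto` selection used by the CLI above can also be done directly with the newly exported `getChatWrapperByBos`. Below is a minimal sketch, assuming a hypothetical local model path:

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {
    LlamaModel, LlamaContext, LlamaChatSession,
    GeneralChatPromptWrapper, getChatWrapperByBos
} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Hypothetical model path, used only for illustration
const model = new LlamaModel({
    modelPath: path.join(__dirname, "models", "llama-2-7b-chat.Q4_K_M.gguf")
});
const context = new LlamaContext({model});

// Pick a wrapper class based on the model's BOS string and fall back to the
// general wrapper, mirroring what `--wrapper auto` does in the CLI code above
const WrapperClass = getChatWrapperByBos(context.getBosString());
const promptWrapper = WrapperClass != null
    ? new WrapperClass()
    : new GeneralChatPromptWrapper();

const session = new LlamaChatSession({context, promptWrapper});
```

Alternatively, passing `promptWrapper: "auto"` to `LlamaChatSession` performs the same selection internally, as the `LlamaChatSession` diff further below shows.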

@@ -9,3 +9,3 @@ import * as fs from "fs-extra";

command: "clear [type]",
describe: "Clear files created by llama-cli",
describe: "Clear files created by node-llama-cpp",
builder(yargs) {

@@ -12,0 +12,0 @@ return yargs

@@ -7,2 +7,4 @@ import { CommandModule } from "yargs";

nodeTarget?: string;
metal: boolean;
cuda: boolean;
skipBuild?: boolean;

@@ -12,3 +14,3 @@ updateBinariesReleaseMetadata?: boolean;

export declare const DownloadCommand: CommandModule<object, DownloadCommandArgs>;
export declare function DownloadLlamaCppCommand({ repo, release, arch, nodeTarget, skipBuild, updateBinariesReleaseMetadata }: DownloadCommandArgs): Promise<void>;
export declare function DownloadLlamaCppCommand({ repo, release, arch, nodeTarget, metal, cuda, skipBuild, updateBinariesReleaseMetadata }: DownloadCommandArgs): Promise<void>;
export {};

@@ -1,10 +0,9 @@

import * as path from "path";
import process from "process";
import path from "path";
import { Octokit } from "octokit";
import * as fs from "fs-extra";
import fs from "fs-extra";
import chalk from "chalk";
import { DownloaderHelper } from "node-downloader-helper";
import cliProgress from "cli-progress";
import bytes from "bytes";
import StreamZip from "node-stream-zip";
import { defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, llamaCppDirectory, tempDownloadDirectory } from "../../config.js";
import simpleGit from "simple-git";
import { defaultLlamaCppCudaSupport, defaultLlamaCppGitHubRepo, defaultLlamaCppMetalSupport, defaultLlamaCppRelease, llamaCppDirectory } from "../../config.js";
import { compileLlamaCpp } from "../../utils/compileLLamaCpp.js";

@@ -30,2 +29,3 @@ import withOra from "../../utils/withOra.js";

.option("arch", {
alias: "a",
type: "string",

@@ -35,6 +35,19 @@ description: "The architecture to compile llama.cpp for"

.option("nodeTarget", {
alias: "t",
type: "string",
description: "The Node.js version to compile llama.cpp for. Example: v18.0.0"
})
.option("metal", {
type: "boolean",
default: defaultLlamaCppMetalSupport,
hidden: process.platform !== "darwin",
description: "Compile llama.cpp with Metal support. Can also be set via the NODE_LLAMA_CPP_METAL environment variable"
})
.option("cuda", {
type: "boolean",
default: defaultLlamaCppCudaSupport,
description: "Compile llama.cpp with CUDA support. Can also be set via the NODE_LLAMA_CPP_CUDA environment variable"
})
.option("skipBuild", {
alias: "sb",
type: "boolean",

@@ -53,3 +66,3 @@ default: false,

};
export async function DownloadLlamaCppCommand({ repo, release, arch, nodeTarget, skipBuild, updateBinariesReleaseMetadata }) {
export async function DownloadLlamaCppCommand({ repo, release, arch, nodeTarget, metal, cuda, skipBuild, updateBinariesReleaseMetadata }) {
const octokit = new Octokit();

@@ -59,5 +72,12 @@ const [githubOwner, githubRepo] = repo.split("/");

console.log(`${chalk.yellow("Release:")} ${release}`);
if (!skipBuild) {
if (metal && process.platform === "darwin") {
console.log(`${chalk.yellow("Metal:")} enabled`);
}
if (cuda) {
console.log(`${chalk.yellow("CUDA:")} enabled`);
}
}
console.log();
let githubRelease = null;
let zipUrl;
await withOra({

@@ -89,18 +109,7 @@ loading: chalk.blue("Fetching llama.cpp info"),

}
if (githubRelease.data?.zipball_url == null) {
throw new Error(`Failed to find a zip archive for release "${release}" of "${repo}"`);
if (githubRelease.data.tag_name == null) {
throw new Error(`Failed to find tag of release "${release}" of "${repo}"`);
}
const zipUrlResponse = await octokit.rest.repos.downloadZipballArchive({
owner: githubOwner,
repo: githubRepo,
ref: githubRelease.data.target_commitish
});
if (zipUrlResponse.url == null)
throw new Error(`Failed to get zip archive url for release "${release}" of "${repo}"`);
zipUrl = zipUrlResponse.url;
});
await clearTempFolder();
console.log(chalk.blue("Downloading zip file"));
await fs.ensureDir(tempDownloadDirectory);
await downloadFile(zipUrl, "llama.cpp.zip", tempDownloadDirectory);
await withOra({

@@ -113,16 +122,17 @@ loading: chalk.blue("Removing existing llama.cpp directory"),

});
console.log(chalk.blue("Cloning llama.cpp"));
await cloneTag(githubOwner, githubRepo, githubRelease.data.tag_name, llamaCppDirectory);
await withOra({
loading: chalk.blue("Extracting llama.cpp.zip file"),
success: chalk.blue("Extracted llama.cpp.zip file"),
fail: chalk.blue("Failed to extract llama.cpp.zip file")
loading: chalk.blue("Generating required files"),
success: chalk.blue("Generated required files"),
fail: chalk.blue("Failed to generate required files")
}, async () => {
await unzipLlamaReleaseZipFile(path.join(tempDownloadDirectory, "llama.cpp.zip"), llamaCppDirectory);
const buildInfoTemplateFilePath = path.join(llamaCppDirectory, "scripts", "build-info.h.in");
const buildInfoResultFilePath = path.join(llamaCppDirectory, "build-info.h");
const buildInfoTemplateFile = await fs.readFile(buildInfoTemplateFilePath, "utf8");
const finalFile = buildInfoTemplateFile
.replaceAll("@BUILD_NUMBER@", "1")
.replaceAll("@BUILD_COMMIT@", githubRelease.data.tag_name);
await fs.writeFile(buildInfoResultFilePath, finalFile, "utf8");
});
await withOra({
loading: chalk.blue("Removing temporary files"),
success: chalk.blue("Removed temporary files"),
fail: chalk.blue("Failed to remove temporary files")
}, async () => {
await clearTempFolder();
});
if (!skipBuild) {

@@ -133,3 +143,5 @@ console.log(chalk.blue("Compiling llama.cpp"));

nodeTarget: nodeTarget ? nodeTarget : undefined,
setUsedBingFlag: true
setUsedBingFlag: true,
metal,
cuda
});

@@ -147,10 +159,3 @@ }

}
async function downloadFile(url, fileName, directory) {
const download = new DownloaderHelper(url, directory, {
fileName: fileName,
retry: {
maxRetries: 10,
delay: 1000 * 6
}
});
async function cloneTag(githubOwner, githubRepo, tag, directory) {
const progressBar = new cliProgress.Bar({

@@ -160,39 +165,25 @@ clearOnComplete: false,

autopadding: true,
format: `${chalk.bold("{filename}")} ${chalk.yellow("{percentage}%")} ${chalk.cyan("{bar}")} {speed}${chalk.grey("{eta_formatted}")}`
format: `${chalk.bold("Clone {repo}")} ${chalk.yellow("{percentage}%")} ${chalk.cyan("{bar}")} ${chalk.grey("{eta_formatted}")}`
}, cliProgress.Presets.shades_classic);
progressBar.start(100, 0, {
speed: "",
filename: fileName
repo: `${githubOwner}/${githubRepo}`
});
download.on("progress", (stats) => {
progressBar.update(Math.floor((stats.downloaded / stats.total) * 10000) / 100, {
speed: Number.isFinite(stats.speed) ? chalk.blue((bytes(stats.speed) + "/s").padEnd(10)) + chalk.grey(" | ") : ""
try {
await simpleGit({
progress({ progress, total, processed }) {
const totalProgress = (processed / 100) + (progress / total);
progressBar.update(Math.floor(totalProgress * 10000) / 100);
}
}).clone(`https://github.com/${githubOwner}/${githubRepo}.git`, directory, {
"--depth": 1,
"--branch": tag,
"--quiet": null
});
});
download.on("end", () => {
}
finally {
progressBar.update(100);
progressBar.stop();
});
// errors are handled by the .start() method
// this listener is here to not get an unhandled error exception
download.on("error", () => { });
await download.start();
}
async function unzipLlamaReleaseZipFile(zipFilePath, directory) {
const zip = new StreamZip.async({ file: zipFilePath });
const entires = await zip.entries();
const rootFolderEntries = new Map();
for (const entry of Object.values(entires)) {
const entryPath = entry.name.split("/");
const rootFolderName = entryPath[0];
const rootFolderEntryCount = rootFolderEntries.get(rootFolderName) ?? 0;
rootFolderEntries.set(rootFolderName, rootFolderEntryCount + 1);
}
const mostUsedRootFolderName = [...rootFolderEntries.keys()]
.sort((a, b) => rootFolderEntries.get(b) - rootFolderEntries.get(a))
.shift();
if (mostUsedRootFolderName == null)
throw new Error("Failed to find the root folder of the llama.cpp release zip file");
await zip.extract(mostUsedRootFolderName, directory);
}
//# sourceMappingURL=DownloadCommand.js.map

@@ -1,2 +0,2 @@

import { defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, defaultSkipDownload } from "../../config.js";
import { defaultLlamaCppCudaSupport, defaultLlamaCppGitHubRepo, defaultLlamaCppMetalSupport, defaultLlamaCppRelease, defaultSkipDownload } from "../../config.js";
import { getPrebuildBinPath } from "../../utils/getBin.js";

@@ -15,3 +15,5 @@ import { DownloadLlamaCppCommand } from "./DownloadCommand.js";

repo: defaultLlamaCppGitHubRepo,
release: defaultLlamaCppRelease
release: defaultLlamaCppRelease,
metal: defaultLlamaCppMetalSupport,
cuda: defaultLlamaCppCudaSupport
});

@@ -18,0 +20,0 @@ }

export declare const llamaDirectory: string;
export declare const llamaBinsDirectory: string;
export declare const llamaBinsGrammarsDirectory: string;
export declare const llamaCppDirectory: string;
export declare const llamaCppGrammarsDirectory: string;
export declare const tempDownloadDirectory: string;

@@ -9,3 +11,5 @@ export declare const usedBinFlagJsonPath: string;

export declare const defaultLlamaCppRelease: string;
export declare const defaultLlamaCppMetalSupport: boolean;
export declare const defaultLlamaCppCudaSupport: boolean;
export declare const defaultSkipDownload: boolean;
export declare const defaultChatSystemPrompt: string;

@@ -11,3 +11,5 @@ import { fileURLToPath } from "url";

export const llamaBinsDirectory = path.join(__dirname, "..", "llamaBins");
export const llamaBinsGrammarsDirectory = path.join(__dirname, "..", "llama", "grammars");
export const llamaCppDirectory = path.join(llamaDirectory, "llama.cpp");
export const llamaCppGrammarsDirectory = path.join(llamaDirectory, "llama.cpp", "grammars");
export const tempDownloadDirectory = path.join(os.tmpdir(), "node-llama-cpp", uuid.v4());

@@ -22,2 +24,8 @@ export const usedBinFlagJsonPath = path.join(llamaDirectory, "usedBin.json");

.asString();
export const defaultLlamaCppMetalSupport = env.get("NODE_LLAMA_CPP_METAL")
.default("false")
.asBool();
export const defaultLlamaCppCudaSupport = env.get("NODE_LLAMA_CPP_CUDA")
.default("false")
.asBool();
export const defaultSkipDownload = env.get("NODE_LLAMA_CPP_SKIP_DOWNLOAD")

@@ -24,0 +32,0 @@ .default("false")

@@ -0,1 +1,2 @@

import { LlamaGrammar } from "llamaEvaluator/LlamaGrammar.js";
import { LlamaChatSession } from "./llamaEvaluator/LlamaChatSession.js";

@@ -9,2 +10,5 @@ import { LlamaModel } from "./llamaEvaluator/LlamaModel.js";

import { LlamaContext } from "./llamaEvaluator/LlamaContext.js";
export { LlamaModel, LlamaContext, LlamaChatSession, AbortError, ChatPromptWrapper, EmptyChatPromptWrapper, LlamaChatPromptWrapper, GeneralChatPromptWrapper };
import { ChatMLPromptWrapper } from "./chatWrappers/ChatMLPromptWrapper.js";
import { getChatWrapperByBos } from "./chatWrappers/createChatWrapperByBos.js";
import { type Token } from "./types.js";
export { LlamaModel, LlamaGrammar, LlamaContext, LlamaChatSession, AbortError, ChatPromptWrapper, EmptyChatPromptWrapper, LlamaChatPromptWrapper, GeneralChatPromptWrapper, ChatMLPromptWrapper, getChatWrapperByBos, type Token };

@@ -0,1 +1,2 @@

import { LlamaGrammar } from "llamaEvaluator/LlamaGrammar.js";
import { LlamaChatSession } from "./llamaEvaluator/LlamaChatSession.js";

@@ -9,3 +10,5 @@ import { LlamaModel } from "./llamaEvaluator/LlamaModel.js";

import { LlamaContext } from "./llamaEvaluator/LlamaContext.js";
export { LlamaModel, LlamaContext, LlamaChatSession, AbortError, ChatPromptWrapper, EmptyChatPromptWrapper, LlamaChatPromptWrapper, GeneralChatPromptWrapper };
import { ChatMLPromptWrapper } from "./chatWrappers/ChatMLPromptWrapper.js";
import { getChatWrapperByBos } from "./chatWrappers/createChatWrapperByBos.js";
export { LlamaModel, LlamaGrammar, LlamaContext, LlamaChatSession, AbortError, ChatPromptWrapper, EmptyChatPromptWrapper, LlamaChatPromptWrapper, GeneralChatPromptWrapper, ChatMLPromptWrapper, getChatWrapperByBos };
//# sourceMappingURL=index.js.map

@@ -1,4 +0,4 @@

import { type LLAMAModel, type LLAMAContext } from "../utils/getBin.js";
import { type LLAMAModel, type LLAMAContext, type LLAMAGrammar } from "../utils/getBin.js";
export declare const llamaCppNode: import("../utils/getBin.js").LlamaCppNodeModule;
declare const LLAMAModel: LLAMAModel, LLAMAContext: LLAMAContext;
export { LLAMAModel, LLAMAContext };
declare const LLAMAModel: LLAMAModel, LLAMAContext: LLAMAContext, LLAMAGrammar: LLAMAGrammar;
export { LLAMAModel, LLAMAContext, LLAMAGrammar };
import { loadBin } from "../utils/getBin.js";
export const llamaCppNode = await loadBin();
const { LLAMAModel, LLAMAContext } = llamaCppNode;
export { LLAMAModel, LLAMAContext };
const { LLAMAModel, LLAMAContext, LLAMAGrammar } = llamaCppNode;
export { LLAMAModel, LLAMAContext, LLAMAGrammar };
//# sourceMappingURL=LlamaBins.js.map
/// <reference types="node" />
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
import { Token } from "../types.js";
import { LlamaContext } from "./LlamaContext.js";

@@ -10,2 +11,4 @@ export declare class LlamaChatSession {

private _initialized;
private _lastStopString;
private _lastStopStringSuffix;
private readonly _ctx;

@@ -15,3 +18,3 @@ constructor({ context, printLLamaSystemInfo, promptWrapper, systemPrompt }: {

printLLamaSystemInfo?: boolean;
promptWrapper?: ChatPromptWrapper;
promptWrapper?: ChatPromptWrapper | "auto";
systemPrompt?: string;

@@ -22,4 +25,6 @@ });

init(): Promise<void>;
prompt(prompt: string, onToken?: (tokens: number[]) => void, { signal }?: {
prompt(prompt: string, { onToken, signal, maxTokens }?: {
onToken?(tokens: Token[]): void;
signal?: AbortSignal;
maxTokens?: number;
}): Promise<string>;

@@ -26,0 +31,0 @@ private _evalTokens;

@@ -5,2 +5,3 @@ import { defaultChatSystemPrompt } from "../config.js";

import { GeneralChatPromptWrapper } from "../chatWrappers/GeneralChatPromptWrapper.js";
import { getChatWrapperByBos } from "../chatWrappers/createChatWrapperByBos.js";
import { LlamaModel } from "./LlamaModel.js";

@@ -14,2 +15,4 @@ const UNKNOWN_UNICODE_CHAR = "\ufffd";

_initialized = false;
_lastStopString = null;
_lastStopStringSuffix = null;
_ctx;

@@ -19,4 +22,12 @@ constructor({ context, printLLamaSystemInfo = false, promptWrapper = new GeneralChatPromptWrapper(), systemPrompt = defaultChatSystemPrompt }) {

this._printLLamaSystemInfo = printLLamaSystemInfo;
this._promptWrapper = promptWrapper;
this._systemPrompt = systemPrompt;
if (promptWrapper === "auto") {
const chatWrapper = getChatWrapperByBos(context.getBosString());
if (chatWrapper != null)
this._promptWrapper = new chatWrapper();
else
this._promptWrapper = new GeneralChatPromptWrapper();
}
else
this._promptWrapper = promptWrapper;
}

@@ -38,13 +49,26 @@ get initialized() {

}
async prompt(prompt, onToken, { signal } = {}) {
async prompt(prompt, { onToken, signal, maxTokens } = {}) {
if (!this.initialized)
await this.init();
return await withLock(this, "prompt", async () => {
const promptText = this._promptWrapper.wrapPrompt(prompt, { systemPrompt: this._systemPrompt, promptIndex: this._promptIndex });
const promptText = this._promptWrapper.wrapPrompt(prompt, {
systemPrompt: this._systemPrompt,
promptIndex: this._promptIndex,
lastStopString: this._lastStopString,
lastStopStringSuffix: this._promptIndex == 0
? (this._ctx.prependBos
? this._ctx.getBosString()
: null)
: this._lastStopStringSuffix
});
this._promptIndex++;
return await this._evalTokens(this._ctx.encode(promptText), onToken, { signal });
this._lastStopString = null;
this._lastStopStringSuffix = null;
const { text, stopString, stopStringSuffix } = await this._evalTokens(this._ctx.encode(promptText), { onToken, signal, maxTokens });
this._lastStopString = stopString;
this._lastStopStringSuffix = stopStringSuffix;
return text;
});
}
async _evalTokens(tokens, onToken, { signal } = {}) {
const decodeTokens = (tokens) => this._ctx.decode(Uint32Array.from(tokens));
async _evalTokens(tokens, { onToken, signal, maxTokens } = {}) {
const stopStrings = this._promptWrapper.getStopStrings();

@@ -57,6 +81,10 @@ const stopStringIndexes = Array(stopStrings.length).fill(0);

throw new AbortError();
const tokenStr = decodeTokens([chunk]);
const { shouldReturn, skipTokenEvent } = this._checkStopString(tokenStr, stopStringIndexes);
const tokenStr = this._ctx.decode(Uint32Array.from([chunk]));
const { shouldReturn, skipTokenEvent, stopString, stopStringSuffix } = this._checkStopString(tokenStr, stopStringIndexes);
if (shouldReturn)
return decodeTokens(res);
return {
text: this._ctx.decode(Uint32Array.from(res)),
stopString,
stopStringSuffix
};
// if the token is unknown, it means it's not complete character

@@ -74,4 +102,10 @@ if (tokenStr === UNKNOWN_UNICODE_CHAR || skipTokenEvent) {

onToken?.([chunk]);
if (maxTokens != null && maxTokens > 0 && res.length >= maxTokens)
break;
}
return decodeTokens(res);
return {
text: this._ctx.decode(Uint32Array.from(res)),
stopString: null,
stopStringSuffix: null
};
}

@@ -96,3 +130,9 @@ _checkStopString(tokenStr, stopStringIndexes) {

if (stopStringIndexes[stopStringIndex] === stopString.length) {
return { shouldReturn: true };
return {
shouldReturn: true,
stopString,
stopStringSuffix: tokenStr.length === stopString.length
? null
: tokenStr.slice(stopString.length)
};
}

@@ -99,0 +139,0 @@ skipTokenEvent ||= localShouldSkipTokenEvent;

@@ -0,7 +1,10 @@

import { Token } from "../types.js";
import { LlamaModel } from "./LlamaModel.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
export declare class LlamaContext {
private readonly _ctx;
private _prependBos;
constructor({ model, prependBos }: {
constructor({ model, grammar, prependBos }: {
model: LlamaModel;
grammar?: LlamaGrammar;
prependBos?: boolean;

@@ -11,3 +14,29 @@ });

decode(tokens: Uint32Array): string;
evaluate(tokens: Uint32Array): AsyncGenerator<number, void, unknown>;
get prependBos(): boolean;
/**
* @returns {Token | null} The BOS (Beginning Of Sequence) token.
*/
getBosToken(): Token | null;
/**
* @returns {Token | null} The EOS (End Of Sequence) token.
*/
getEosToken(): Token | null;
/**
* @returns {Token | null} The NL (New Line) token.
*/
getNlToken(): Token | null;
/**
* @returns {string | null} The BOS (Beginning Of Sequence) token as a string.
*/
getBosString(): string | null;
/**
* @returns {string | null} The EOS (End Of Sequence) token as a string.
*/
getEosString(): string | null;
/**
* @returns {string | null} The NL (New Line) token as a string.
*/
getNlString(): string | null;
getContextSize(): number;
evaluate(tokens: Uint32Array): AsyncGenerator<Token, void>;
}

@@ -0,1 +1,2 @@

import { removeNullFields } from "../utils/removeNullFields.js";
import { LLAMAContext } from "./LlamaBins.js";

@@ -5,12 +6,78 @@ export class LlamaContext {

_prependBos;
constructor({ model, prependBos = true }) {
this._ctx = new LLAMAContext(model._model);
constructor({ model, grammar, prependBos = true }) {
this._ctx = new LLAMAContext(model._model, removeNullFields({
grammar: grammar?._grammar
}));
this._prependBos = prependBos;
}
encode(text) {
if (text === "")
return new Uint32Array();
return this._ctx.encode(text);
}
decode(tokens) {
if (tokens.length === 0)
return "";
return this._ctx.decode(tokens);
}
get prependBos() {
return this._prependBos;
}
/**
* @returns {Token | null} The BOS (Beginning Of Sequence) token.
*/
getBosToken() {
const bosToken = this._ctx.tokenBos();
if (bosToken === -1)
return null;
return bosToken;
}
/**
* @returns {Token | null} The EOS (End Of Sequence) token.
*/
getEosToken() {
const eosToken = this._ctx.tokenEos();
if (eosToken === -1)
return null;
return eosToken;
}
/**
* @returns {Token | null} The NL (New Line) token.
*/
getNlToken() {
const nlToken = this._ctx.tokenNl();
if (nlToken === -1)
return null;
return nlToken;
}
/**
* @returns {string | null} The BOS (Beginning Of Sequence) token as a string.
*/
getBosString() {
const bosToken = this.getBosToken();
if (bosToken == null)
return null;
return this._ctx.getTokenString(bosToken);
}
/**
* @returns {string | null} The EOS (End Of Sequence) token as a string.
*/
getEosString() {
const eosToken = this.getEosToken();
if (eosToken == null)
return null;
return this._ctx.getTokenString(eosToken);
}
/**
* @returns {string | null} The NL (New Line) token as a string.
*/
getNlString() {
const nlToken = this.getNlToken();
if (nlToken == null)
return null;
return this._ctx.getTokenString(nlToken);
}
getContextSize() {
return this._ctx.getContextSize();
}
async *evaluate(tokens) {

@@ -17,0 +84,0 @@ let evalTokens = tokens;
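
A short sketch of the new token helpers declared above (`getBosString`, `getEosString`, `getContextSize`), assuming a hypothetical model path:

```typescript
import {LlamaModel, LlamaContext} from "node-llama-cpp";

// Hypothetical model path, for illustration only
const model = new LlamaModel({modelPath: "./models/model.gguf"});
const context = new LlamaContext({model});

// The BOS/EOS string getters return null when the model does not define that token
console.log("BOS:", context.getBosString());
console.log("EOS:", context.getEosString());
console.log("Context size:", context.getContextSize());
```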

@@ -12,2 +12,22 @@ export declare class LlamaModel {

* @param {boolean} [options.lowVram] - if true, reduce VRAM usage at the cost of performance
* @param {number} [options.temperature] - Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* @param {number} [options.topK] - Limits the model to consider only the K most likely next tokens for sampling at each step of
* sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
* @param {number} [options.topP] - Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* @param {boolean} [options.f16Kv] - use fp16 for KV cache

@@ -20,3 +40,3 @@ * @param {boolean} [options.logitsAll] - the llama_eval() call computes all logits, not just the last one

*/
constructor({ modelPath, seed, contextSize, batchSize, gpuLayers, lowVram, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }: {
constructor({ modelPath, seed, contextSize, batchSize, gpuLayers, lowVram, temperature, topK, topP, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }: {
/** path to the model on the filesystem */

@@ -34,2 +54,30 @@ modelPath: string;

lowVram?: boolean;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
*/
temperature?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
* */
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* */
topP?: number;
/** use fp16 for KV cache */

@@ -36,0 +84,0 @@ f16Kv?: boolean;

@@ -0,1 +1,2 @@

import { removeNullFields } from "../utils/removeNullFields.js";
import { llamaCppNode, LLAMAModel } from "./LlamaBins.js";

@@ -15,2 +16,22 @@ export class LlamaModel {

* @param {boolean} [options.lowVram] - if true, reduce VRAM usage at the cost of performance
* @param {number} [options.temperature] - Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* @param {number} [options.topK] - Limits the model to consider only the K most likely next tokens for sampling at each step of
* sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
* @param {number} [options.topP] - Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* @param {boolean} [options.f16Kv] - use fp16 for KV cache

@@ -23,3 +44,3 @@ * @param {boolean} [options.logitsAll] - the llama_eval() call computes all logits, not just the last one

*/
constructor({ modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers, lowVram, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }) {
constructor({ modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers, lowVram, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }) {
this._model = new LLAMAModel(modelPath, removeNullFields({

@@ -31,2 +52,5 @@ seed: seed != null ? Math.max(-1, seed) : undefined,

lowVram,
temperature,
topK,
topP,
f16Kv,

@@ -44,10 +68,2 @@ logitsAll,

}
function removeNullFields(obj) {
const newObj = Object.assign({}, obj);
for (const key in obj) {
if (newObj[key] == null)
delete newObj[key];
}
return newObj;
}
//# sourceMappingURL=LlamaModel.js.map
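
A minimal sketch of the new sampling options on the `LlamaModel` constructor (`temperature`, `topK`, `topP`), again with a hypothetical model path; `topK` and `topP` are left at their documented defaults and `temperature` is set to the suggested `0.8`:

```typescript
import {LlamaModel, LlamaContext, LlamaChatSession} from "node-llama-cpp";

// Hypothetical model path, for illustration only
const model = new LlamaModel({
    modelPath: "./models/model.gguf",
    temperature: 0.8, // 0 (the default) always picks the most likely token
    topK: 40,         // 0 disables top-K filtering
    topP: 0.95        // 1 disables top-P filtering
});
const context = new LlamaContext({model});
const session = new LlamaChatSession({context});

const answer = await session.prompt("Hi there, how are you?");
console.log(answer);
```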

@@ -1,6 +0,8 @@

export declare function compileLlamaCpp({ arch, nodeTarget, setUsedBingFlag }: {
export declare function compileLlamaCpp({ arch, nodeTarget, setUsedBingFlag, metal, cuda }: {
arch?: string;
nodeTarget?: string;
setUsedBingFlag?: boolean;
metal?: boolean;
cuda?: boolean;
}): Promise<void>;
export declare function getCompiledLlamaCppBinaryPath(): Promise<string | null>;

@@ -10,3 +10,3 @@ import path from "path";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
export async function compileLlamaCpp({ arch = process.arch, nodeTarget = process.version, setUsedBingFlag = true }) {
export async function compileLlamaCpp({ arch = process.arch, nodeTarget = process.version, setUsedBingFlag = true, metal = false, cuda = false }) {
try {

@@ -16,5 +16,35 @@ if (!(await fs.exists(llamaCppDirectory))) {

}
const gypDefines = ["GGML_USE_K_QUANTS", "NAPI_CPP_EXCEPTIONS"];
if ((metal && process.platform === "darwin") || process.env.LLAMA_METAL === "1")
gypDefines.push("LLAMA_METAL=1");
if (cuda || process.env.LLAMA_CUBLAS === "1")
gypDefines.push("LLAMA_CUBLAS=1");
if (process.env.LLAMA_MPI === "1")
gypDefines.push("LLAMA_MPI=1");
if (process.env.LLAMA_OPENBLAS === "1")
gypDefines.push("LLAMA_OPENBLAS=1");
if (process.env.LLAMA_BLAS_VENDOR != null)
gypDefines.push("LLAMA_BLAS_VENDOR=" + process.env.LLAMA_BLAS_VENDOR);
if (process.env.LLAMA_CUDA_FORCE_DMMV != null)
gypDefines.push("LLAMA_CUDA_FORCE_DMMV=" + process.env.LLAMA_CUDA_FORCE_DMMV);
if (process.env.LLAMA_CUDA_DMMV_X != null)
gypDefines.push("LLAMA_CUDA_DMMV_X=" + process.env.LLAMA_CUDA_DMMV_X);
if (process.env.LLAMA_CUDA_MMV_Y != null)
gypDefines.push("LLAMA_CUDA_MMV_Y=" + process.env.LLAMA_CUDA_MMV_Y);
if (process.env.LLAMA_CUDA_F16 != null)
gypDefines.push("LLAMA_CUDA_F16=" + process.env.LLAMA_CUDA_F16);
if (process.env.LLAMA_CUDA_KQUANTS_ITER != null)
gypDefines.push("LLAMA_CUDA_KQUANTS_ITER=" + process.env.LLAMA_CUDA_KQUANTS_ITER);
if (process.env.LLAMA_HIPBLAS === "1")
gypDefines.push("LLAMA_HIPBLAS=1");
if (process.env.LLAMA_CLBLAST === "1")
gypDefines.push("LLAMA_CLBLAST=1");
const nodeGypEnv = {
...process.env,
"CMAKE_CURRENT_SOURCE_DIR": llamaCppDirectory,
"GYP_DEFINES": gypDefines.join(" ")
};
await clearLlamaBuild();
await spawnCommand("npm", ["run", "-s", "node-gyp-llama", "--", "configure", "--arch=" + arch, "--target=" + nodeTarget], __dirname);
await spawnCommand("npm", ["run", "-s", "node-gyp-llama", "--", "configure", "--arch=" + arch, "--target=" + nodeTarget, "--", "-f", "compile_commands_json"], __dirname);
await spawnCommand("npm", ["run", "-s", "node-gyp-llama", "--", "configure", "--arch=" + arch, "--target=" + nodeTarget], __dirname, nodeGypEnv);
await spawnCommand("npm", ["run", "-s", "node-gyp-llama", "--", "configure", "--arch=" + arch, "--target=" + nodeTarget, "--", "-f", "compile_commands_json"], __dirname, nodeGypEnv);
if (await fs.exists(path.join(llamaDirectory, "Release", "compile_commands.json"))) {

@@ -28,3 +58,3 @@ await fs.move(path.join(llamaDirectory, "Release", "compile_commands.json"), path.join(llamaDirectory, "compile_commands.json"));

await fs.remove(path.join(llamaDirectory, "Debug"));
await spawnCommand("npm", ["run", "-s", "node-gyp-llama-build", "--", "--arch=" + arch, "--target=" + nodeTarget], __dirname);
await spawnCommand("npm", ["run", "-s", "node-gyp-llama-build", "--", "--arch=" + arch, "--target=" + nodeTarget], __dirname, nodeGypEnv);
if (setUsedBingFlag) {

@@ -31,0 +61,0 @@ await setUsedBinFlag("localBuildFromSource");

@@ -6,2 +6,3 @@ export declare function getPrebuildBinPath(): Promise<string | null>;

LLAMAContext: LLAMAContext;
LLAMAGrammar: LLAMAGrammar;
systemInfo(): string;

@@ -22,6 +23,11 @@ };

embedding?: boolean;
temperature?: number;
topK?: number;
topP?: number;
}): LLAMAModel;
};
export type LLAMAContext = {
new (model: LLAMAModel): LLAMAContext;
new (model: LLAMAModel, params?: {
grammar?: LLAMAGrammar;
}): LLAMAContext;
encode(text: string): Uint32Array;

@@ -32,3 +38,10 @@ eval(tokens: Uint32Array): Promise<number>;

tokenEos(): number;
getMaxContextSize(): number;
tokenNl(): number;
getContextSize(): number;
getTokenString(token: number): string;
};
export type LLAMAGrammar = {
new (grammarPath: string, params?: {
printGrammar?: boolean;
}): LLAMAGrammar;
};

@@ -6,3 +6,3 @@ import { createRequire } from "module";

import fs from "fs-extra";
import { defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, defaultSkipDownload, llamaBinsDirectory } from "../config.js";
import { defaultLlamaCppCudaSupport, defaultLlamaCppGitHubRepo, defaultLlamaCppMetalSupport, defaultLlamaCppRelease, defaultSkipDownload, llamaBinsDirectory } from "../config.js";
import { DownloadLlamaCppCommand } from "../cli/commands/DownloadCommand.js";

@@ -60,3 +60,5 @@ import { getUsedBinFlag } from "./usedBinFlag.js";

repo: defaultLlamaCppGitHubRepo,
release: defaultLlamaCppRelease
release: defaultLlamaCppRelease,
metal: defaultLlamaCppMetalSupport,
cuda: defaultLlamaCppCudaSupport
});

@@ -63,0 +65,0 @@ const modulePath = await getCompiledLlamaCppBinaryPath();

@@ -1,1 +0,2 @@

export declare function spawnCommand(command: string, args: string[], cwd: string): Promise<void>;
/// <reference types="node" />
export declare function spawnCommand(command: string, args: string[], cwd: string, env?: NodeJS.ProcessEnv): Promise<void>;
import spawn from "cross-spawn";
export function spawnCommand(command, args, cwd) {
export function spawnCommand(command, args, cwd, env = process.env) {
function getCommandString() {

@@ -19,3 +19,3 @@ let res = command;

cwd,
env: process.env,
env,
detached: false,

@@ -22,0 +22,0 @@ windowsHide: true

{
"release": "b1069"
"release": "b1107"
}
{
"name": "node-llama-cpp",
"version": "2.0.0",
"version": "2.1.0",
"description": "node.js bindings for llama.cpp",

@@ -43,6 +43,6 @@ "main": "dist/index.js",

"scripts": {
"postinstall": "node ./dist/cli/cli.js postinstall",
"prepare": "[ $CI = true ] || [ -d '.husky/_' ] || husky install",
"prebuild": "rm -rf ./dist ./tsconfig.tsbuildinfo",
"build": "tsc --build tsconfig.json --force",
"addPostinstallScript": "npm pkg set scripts.postinstall=\"node ./dist/cli/cli.js postinstall\"",
"generate-docs": "typedoc",

@@ -58,3 +58,4 @@ "prewatch": "rm -rf ./dist ./tsconfig.tsbuildinfo",

"format": "npm run lint:eslint -- --fix",
"clean": "rm -rf ./node_modules ./dist ./tsconfig.tsbuildinfo"
"clean": "rm -rf ./node_modules ./dist ./tsconfig.tsbuildinfo",
"postinstall": "node ./dist/cli/cli.js postinstall"
},

@@ -74,2 +75,7 @@ "repository": {

"gguf",
"grammar",
"json-grammar",
"temperature",
"topK",
"topP",
"raspberry-pi",

@@ -124,2 +130,3 @@ "self-hosted",

"ora": "^7.0.1",
"simple-git": "^3.19.1",
"uuid": "^9.0.0",

@@ -126,0 +133,0 @@ "yargs": "^17.7.2"

@@ -0,9 +1,14 @@

<div align="center">
# Node Llama.cpp
Node.js bindings for llama.cpp.
Pre-built bindings are provided with a fallback to building from source with `node-gyp`.
<sub>Pre-built bindings are provided with a fallback to building from source with `node-gyp`.<sub>
[![Build](https://github.com/withcatai/node-llama-cpp/actions/workflows/build.yml/badge.svg)](https://github.com/withcatai/node-llama-cpp/actions/workflows/build.yml)
[![License](https://badgen.net/badge/color/MIT/green?label=license)](https://www.npmjs.com/package/node-llama-cpp)
[![License](https://badgen.net/badge/color/TypeScript/blue?label=types)](https://www.npmjs.com/package/node-llama-cpp)
[![Version](https://badgen.net/npm/v/node-llama-cpp)](https://www.npmjs.com/package/node-llama-cpp)
</div>

@@ -116,4 +121,4 @@ ## Installation

const res: number[] = [];
for await (const chunk of context.evaluate(tokens)) {
res.push(chunk);
for await (const modelToken of context.evaluate(tokens)) {
res.push(modelToken);

@@ -134,2 +139,42 @@ // it's important to not concatinate the results as strings,

#### With grammar
Use this to direct the model to generate a specific format of text, like `JSON` for example.
> **Note:** there's an issue with some grammars where the model won't stop generating output,
> so it's advised to use it together with `maxTokens` set to the context size of the model
```typescript
import {fileURLToPath} from "url";
import path from "path";
import {LlamaModel, LlamaGrammar, LlamaContext, LlamaChatSession} from "node-llama-cpp";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const model = new LlamaModel({
modelPath: path.join(__dirname, "models", "codellama-13b.Q3_K_M.gguf")
})
const grammar = await LlamaGrammar.getFor("json");
const context = new LlamaContext({
model,
grammar
});
const session = new LlamaChatSession({context});
const q1 = 'Create a JSON that contains a message saying "hi there"';
console.log("User: " + q1);
const a1 = await session.prompt(q1, {maxTokens: context.getContextSize()});
console.log("AI: " + a1);
console.log(JSON.parse(a1));
const q2 = 'Add another field to the JSON with the key being "author" and the value being "LLama"';
console.log("User: " + q2);
const a2 = await session.prompt(q2, {maxTokens: context.getContextSize()});
console.log("AI: " + a2);
console.log(JSON.parse(a2));
```
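
Continuing the example above, `session.prompt()` now also accepts an options object with `onToken` and `maxTokens` (see the `LlamaChatSession` diff), which makes it possible to stream the response while it is being generated. A minimal sketch with an illustrative follow-up question:

```typescript
const q3 = 'Describe the JSON you created';
console.log("User: " + q3);

const a3 = await session.prompt(q3, {
    maxTokens: context.getContextSize(),
    onToken(tokens) {
        process.stdout.write(context.decode(Uint32Array.from(tokens)));
    }
});
console.log("\nAI: " + a3);
```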
### CLI

@@ -140,6 +185,6 @@ ```

Commands:
node-llama-cpp download Download a release of llama.cpp and compile it
node-llama-cpp build Compile the currently downloaded llama.cpp
node-llama-cpp clear [type] Clear files created by llama-cli
node-llama-cpp chat Chat with a LLama model
node-llama-cpp download Download a release of llama.cpp and compile it
node-llama-cpp build Compile the currently downloaded llama.cpp
node-llama-cpp clear [type] Clear files created by node-llama-cpp
node-llama-cpp chat Chat with a LLama model

@@ -158,11 +203,13 @@ Options:

Options:
-h, --help Show help [boolean]
--repo The GitHub repository to download a release of llama.cpp from. Can also be set v
ia the NODE_LLAMA_CPP_REPO environment variable
-h, --help Show help [boolean]
--repo The GitHub repository to download a release of llama.cpp from. Can also be
set via the NODE_LLAMA_CPP_REPO environment variable
[string] [default: "ggerganov/llama.cpp"]
--release The tag of the llama.cpp release to download. Can also be set via the NODE_LLAMA
_CPP_REPO_RELEASE environment variable [string] [default: "latest"]
--arch The architecture to compile llama.cpp for [string]
--nodeTarget The Node.js version to compile llama.cpp for. Example: v18.0.0 [string]
-v, --version Show version number [boolean]
--release The tag of the llama.cpp release to download. Set to "latest" to download t
he latest release. Can also be set via the NODE_LLAMA_CPP_REPO_RELEASE envi
ronment variable [string] [default: "latest"]
-a, --arch The architecture to compile llama.cpp for [string]
-t, --nodeTarget The Node.js version to compile llama.cpp for. Example: v18.0.0 [string]
--skipBuild, --sb Skip building llama.cpp after downloading it [boolean] [default: false]
-v, --version Show version number [boolean]
```

@@ -178,4 +225,4 @@

-h, --help Show help [boolean]
--arch The architecture to compile llama.cpp for [string]
--nodeTarget The Node.js version to compile llama.cpp for. Example: v18.0.0 [string]
-a, --arch The architecture to compile llama.cpp for [string]
-t, --nodeTarget The Node.js version to compile llama.cpp for. Example: v18.0.0 [string]
-v, --version Show version number [boolean]

@@ -188,3 +235,3 @@ ```

Clear files created by llama-cli
Clear files created by node-llama-cpp

@@ -204,11 +251,11 @@ Options:

Required:
--model LLama model file to use for the chat [string] [required]
-m, --model LLama model file to use for the chat [string] [required]
Optional:
--systemInfo Print llama.cpp system info [boolean] [default: false]
--systemPrompt System prompt to use against the model. [default value: You are a helpful, res
pectful and honest assistant. Always answer as helpfully as possible. If a que
stion does not make any sense, or is not factually coherent, explain why inste
ad of answering something not correct. If you don't know the answer to a quest
ion, please don't share false information.]
-i, --systemInfo Print llama.cpp system info [boolean] [default: false]
-s, --systemPrompt System prompt to use against the model. [default value: You are a helpful,
respectful and honest assistant. Always answer as helpfully as possible. If
a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answe
r to a question, please don't share false information.]
[string] [default: "You are a helpful, respectful and honest assistant. Always answer as helpfully

@@ -219,2 +266,27 @@ as possible.

share false information."]
-w, --wrapper Chat wrapper to use. Use `auto` to automatically select a wrapper based on
the model's BOS token
[string] [choices: "auto", "general", "llamaChat", "chatML"] [default: "general"]
-c, --contextSize Context size to use for the model [number] [default: 4096]
-g, --grammar Restrict the model response to a specific grammar, like JSON for example
[string] [choices: "text", "json", "list", "arithmetic", "japanese", "chess"] [default: "text"]
-t, --temperature Temperature is a hyperparameter that controls the randomness of the generat
ed text. It affects the probability distribution of the model's output toke
ns. A higher temperature (e.g., 1.5) makes the output more random and creat
ive, while a lower temperature (e.g., 0.5) makes the output more focused, d
eterministic, and conservative. The suggested temperature is 0.8, which pro
vides a balance between randomness and determinism. At the extreme, a tempe
rature of 0 will always pick the most likely next token, leading to identic
al outputs in each run. Set to `0` to disable. [number] [default: 0]
-k, --topK Limits the model to consider only the K most likely next tokens for samplin
g at each step of sequence generation. An integer number between `1` and th
e size of the vocabulary. Set to `0` to disable (which uses the full vocabu
lary). Only relevant when `temperature` is set to a value greater than 0.
[number] [default: 40]
-p, --topP Dynamically selects the smallest set of tokens whose cumulative probability
exceeds the threshold P, and samples the next token only from this set. A
float number between `0` and `1`. Set to `1` to disable. Only relevant when
`temperature` is set to a value greater than `0`. [number] [default: 0.95]
--maxTokens, --mt Maximum number of tokens to generate in responses. Set to `0` to disable. S
et to `-1` to set to the context size [number] [default: 0]

@@ -221,0 +293,0 @@ Options:
