node-llama-cpp - npm Package Compare versions

Comparing version 2.0.0 to 2.1.0

dist/chatWrappers/ChatMLPromptWrapper.d.ts


dist/ChatPromptWrapper.d.ts
export declare abstract class ChatPromptWrapper {
abstract readonly wrapperName: string;
wrapPrompt(prompt: string, { systemPrompt, promptIndex }: {
wrapPrompt(prompt: string, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }: {
systemPrompt: string;
promptIndex: number;
lastStopString: string | null;
lastStopStringSuffix: string | null;
}): string;
getStopStrings(): string[];
}

dist/chatWrappers/EmptyChatPromptWrapper.d.ts
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
export declare class EmptyChatPromptWrapper extends ChatPromptWrapper {
readonly wrapperName: string;
}

dist/chatWrappers/EmptyChatPromptWrapper.js
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
export class EmptyChatPromptWrapper extends ChatPromptWrapper {
wrapperName = "Empty";
}
//# sourceMappingURL=EmptyChatPromptWrapper.js.map

dist/chatWrappers/GeneralChatPromptWrapper.d.ts
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
export declare class GeneralChatPromptWrapper extends ChatPromptWrapper {
wrapPrompt(prompt: string, { systemPrompt, promptIndex }: {
readonly wrapperName: string;
private readonly _instructionName;
private readonly _responseName;
constructor({ instructionName, responseName }?: {
instructionName?: string;
responseName?: string;
});
wrapPrompt(prompt: string, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }: {
systemPrompt: string;
promptIndex: number;
lastStopString: string | null;
lastStopStringSuffix: string | null;
}): string;
getStopStrings(): string[];
private _getPromptPrefix;
}

dist/chatWrappers/GeneralChatPromptWrapper.js
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
import { getTextCompletion } from "../utils/getTextCompletion.js";
export class GeneralChatPromptWrapper extends ChatPromptWrapper {
wrapPrompt(prompt, { systemPrompt, promptIndex }) {
const conversationPrompt = "\n\n### Human:\n\n" + prompt + "\n\n### Assistant:\n\n";
return promptIndex === 0 ? systemPrompt + conversationPrompt : conversationPrompt;
wrapperName = "General";
_instructionName;
_responseName;
constructor({ instructionName = "Human", responseName = "Assistant" } = {}) {
super();
this._instructionName = instructionName;
this._responseName = responseName;
}
wrapPrompt(prompt, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }) {
if (promptIndex === 0)
return systemPrompt + `\n\n### ${this._instructionName}:\n\n` + prompt + `\n\n### ${this._responseName}:\n\n`;
return this._getPromptPrefix(lastStopString, lastStopStringSuffix) + prompt + `\n\n### ${this._responseName}:\n\n`;
}
getStopStrings() {
return ["### Human:", "Human:", "### Assistant:", "Assistant:", "<end>"];
return [
`\n\n### ${this._instructionName}`,
`### ${this._instructionName}`,
`\n\n### ${this._responseName}`,
`### ${this._responseName}`,
"<end>"
];
}
_getPromptPrefix(lastStopString, lastStopStringSuffix) {
return getTextCompletion(lastStopString === "<end>"
? lastStopStringSuffix
: (lastStopString + (lastStopStringSuffix ?? "")), [
`\n\n### ${this._instructionName}:\n\n`,
`### ${this._instructionName}:\n\n`
]) ?? `\n\n### ${this._instructionName}:\n\n`;
}
}
//# sourceMappingURL=GeneralChatPromptWrapper.js.map
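
For illustration, a minimal sketch of the new `GeneralChatPromptWrapper` constructor options introduced above (`instructionName` and `responseName`); the values shown are just the defaults spelled out explicitly:

```typescript
import {GeneralChatPromptWrapper} from "node-llama-cpp";

// The "### Human:" / "### Assistant:" section headers used in the prompt are now configurable;
// omitting the options object keeps these defaults
const promptWrapper = new GeneralChatPromptWrapper({
    instructionName: "Human",
    responseName: "Assistant"
});
```

The resulting wrapper can be passed to `LlamaChatSession` through its `promptWrapper` option.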


dist/chatWrappers/LlamaChatPromptWrapper.d.ts
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
export declare class LlamaChatPromptWrapper extends ChatPromptWrapper {
wrapPrompt(prompt: string, { systemPrompt, promptIndex }: {
readonly wrapperName: string;
wrapPrompt(prompt: string, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }: {
systemPrompt: string;
promptIndex: number;
lastStopString: string | null;
lastStopStringSuffix: string | null;
}): string;
getStopStrings(): string[];
}

dist/chatWrappers/LlamaChatPromptWrapper.js
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
import { getTextCompletion } from "../utils/getTextCompletion.js";
// source: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
export class LlamaChatPromptWrapper extends ChatPromptWrapper {
wrapPrompt(prompt, { systemPrompt, promptIndex }) {
wrapperName = "LlamaChat";
wrapPrompt(prompt, { systemPrompt, promptIndex, lastStopString, lastStopStringSuffix }) {
const previousCompletionEnd = (lastStopString ?? "") + (lastStopStringSuffix ?? "");
if (promptIndex === 0 && systemPrompt != "") {
return "<s>[INST] <<SYS>>\n" + systemPrompt + "\n<</SYS>>\n\n" + prompt + " [/INST]\n\n";
return (getTextCompletion(previousCompletionEnd, "<s>[INST] <<SYS>>\n") ?? "<s>[INST] <<SYS>>\n") + systemPrompt +
"\n<</SYS>>\n\n" + prompt + " [/INST]\n\n";
}
else {
return "<s>[INST] " + prompt + " [/INST]\n\n";
return (getTextCompletion(previousCompletionEnd, "</s><s>[INST] ") ?? "<s>[INST] ") + prompt + " [/INST]\n\n";
}
}
getStopStrings() {
return ["</s><s>[INST]"];
return ["</s>"];
}
}
//# sourceMappingURL=LlamaChatPromptWrapper.js.map

@@ -5,5 +5,7 @@ import { CommandModule } from "yargs";

nodeTarget?: string;
metal: boolean;
cuda: boolean;
};
export declare const BuildCommand: CommandModule<object, BuildCommand>;
export declare function BuildLlamaCppCommand({ arch, nodeTarget }: BuildCommand): Promise<void>;
export declare function BuildLlamaCppCommand({ arch, nodeTarget, metal, cuda }: BuildCommand): Promise<void>;
export {};

@@ -0,1 +1,2 @@

import process from "process";
import chalk from "chalk";

@@ -5,2 +6,3 @@ import { compileLlamaCpp } from "../../utils/compileLLamaCpp.js";

import { clearTempFolder } from "../../utils/clearTempFolder.js";
import { defaultLlamaCppCudaSupport, defaultLlamaCppMetalSupport } from "../../config.js";
export const BuildCommand = {

@@ -12,2 +14,3 @@ command: "build",

.option("arch", {
alias: "a",
type: "string",

@@ -17,4 +20,15 @@ description: "The architecture to compile llama.cpp for"

.option("nodeTarget", {
alias: "t",
type: "string",
description: "The Node.js version to compile llama.cpp for. Example: v18.0.0"
})
.option("metal", {
type: "boolean",
default: defaultLlamaCppMetalSupport,
description: "Compile llama.cpp with Metal support. Can also be set via the NODE_LLAMA_CPP_METAL environment variable"
})
.option("cuda", {
type: "boolean",
default: defaultLlamaCppCudaSupport,
description: "Compile llama.cpp with CUDA support. Can also be set via the NODE_LLAMA_CPP_CUDA environment variable"
});

@@ -24,3 +38,9 @@ },

};
export async function BuildLlamaCppCommand({ arch, nodeTarget }) {
export async function BuildLlamaCppCommand({ arch, nodeTarget, metal, cuda }) {
if (metal && process.platform === "darwin") {
console.log(`${chalk.yellow("Metal:")} enabled`);
}
if (cuda) {
console.log(`${chalk.yellow("CUDA:")} enabled`);
}
await withOra({

@@ -34,3 +54,5 @@ loading: chalk.blue("Compiling llama.cpp"),

nodeTarget: nodeTarget ? nodeTarget : undefined,
setUsedBingFlag: true
setUsedBingFlag: true,
metal,
cuda
});

@@ -37,0 +59,0 @@ });

import { CommandModule } from "yargs";
import type { LlamaGrammar } from "../../llamaEvaluator/LlamaGrammar.js";
type ChatCommand = {

@@ -6,6 +7,11 @@ model: string;

systemPrompt: string;
wrapper: string;
wrapper: "auto" | "general" | "llamaChat" | "chatML";
contextSize: number;
grammar: "text" | Parameters<typeof LlamaGrammar.getFor>[0];
temperature: number;
topK: number;
topP: number;
maxTokens: number;
};
export declare const ChatCommand: CommandModule<object, ChatCommand>;
export {};

@@ -8,2 +8,4 @@ import * as readline from "readline/promises";

import { GeneralChatPromptWrapper } from "../../chatWrappers/GeneralChatPromptWrapper.js";
import { ChatMLPromptWrapper } from "../../chatWrappers/ChatMLPromptWrapper.js";
import { getChatWrapperByBos } from "../../chatWrappers/createChatWrapperByBos.js";
export const ChatCommand = {

@@ -15,2 +17,3 @@ command: "chat",

.option("model", {
alias: "m",
type: "string",

@@ -22,2 +25,3 @@ demandOption: true,

.option("systemInfo", {
alias: "i",
type: "boolean",

@@ -29,2 +33,3 @@ default: false,

.option("systemPrompt", {
alias: "s",
type: "string",

@@ -38,9 +43,11 @@ default: defaultChatSystemPrompt,

.option("wrapper", {
alias: "w",
type: "string",
default: "general",
choices: ["general", "llama"],
description: "Chat wrapper to use",
choices: ["auto", "general", "llamaChat", "chatML"],
description: "Chat wrapper to use. Use `auto` to automatically select a wrapper based on the model's BOS token",
group: "Optional:"
})
.option("contextSize", {
alias: "c",
type: "number",

@@ -50,7 +57,43 @@ default: 1024 * 4,

group: "Optional:"
})
.option("grammar", {
alias: "g",
type: "string",
default: "text",
choices: ["text", "json", "list", "arithmetic", "japanese", "chess"],
description: "Restrict the model response to a specific grammar, like JSON for example",
group: "Optional:"
})
.option("temperature", {
alias: "t",
type: "number",
default: 0,
description: "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The suggested temperature is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run. Set to `0` to disable.",
group: "Optional:"
})
.option("topK", {
alias: "k",
type: "number",
default: 40,
description: "Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation. An integer number between `1` and the size of the vocabulary. Set to `0` to disable (which uses the full vocabulary). Only relevant when `temperature` is set to a value greater than 0.",
group: "Optional:"
})
.option("topP", {
alias: "p",
type: "number",
default: 0.95,
description: "Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P, and samples the next token only from this set. A float number between `0` and `1`. Set to `1` to disable. Only relevant when `temperature` is set to a value greater than `0`.",
group: "Optional:"
})
.option("maxTokens", {
alias: "mt",
type: "number",
default: 0,
description: "Maximum number of tokens to generate in responses. Set to `0` to disable. Set to `-1` to set to the context size",
group: "Optional:"
});
},
async handler({ model, systemInfo, systemPrompt, wrapper, contextSize }) {
async handler({ model, systemInfo, systemPrompt, wrapper, contextSize, grammar, temperature, topK, topP, maxTokens }) {
try {
await RunChat({ model, systemInfo, systemPrompt, wrapper, contextSize });
await RunChat({ model, systemInfo, systemPrompt, wrapper, contextSize, grammar, temperature, topK, topP, maxTokens });
}

@@ -63,11 +106,23 @@ catch (err) {

};
async function RunChat({ model: modelArg, systemInfo, systemPrompt, wrapper, contextSize }) {
async function RunChat({ model: modelArg, systemInfo, systemPrompt, wrapper, contextSize, grammar: grammarArg, temperature, topK, topP, maxTokens }) {
const { LlamaChatSession } = await import("../../llamaEvaluator/LlamaChatSession.js");
const { LlamaModel } = await import("../../llamaEvaluator/LlamaModel.js");
const { LlamaContext } = await import("../../llamaEvaluator/LlamaContext.js");
const { LlamaGrammar } = await import("../../llamaEvaluator/LlamaGrammar.js");
const model = new LlamaModel({
modelPath: modelArg,
contextSize
contextSize,
temperature,
topK,
topP
});
const context = new LlamaContext({ model });
const context = new LlamaContext({
model,
grammar: grammarArg !== "text"
? await LlamaGrammar.getFor(grammarArg)
: undefined
});
const bos = context.getBosString(); // bos = beginning of sequence
const eos = context.getEosString(); // eos = end of sequence
const promptWrapper = getChatWrapper(wrapper, bos);
const session = new LlamaChatSession({

@@ -77,4 +132,7 @@ context,

systemPrompt,
promptWrapper: createChatWrapper(wrapper)
promptWrapper
});
console.info(`${chalk.yellow("BOS:")} ${bos}`);
console.info(`${chalk.yellow("EOS:")} ${eos}`);
console.info(`${chalk.yellow("Chat wrapper:")} ${promptWrapper.wrapperName}`);
await withOra({

@@ -101,4 +159,11 @@ loading: chalk.blue("Loading model"),

process.stdout.write(startColor);
await session.prompt(input, (chunk) => {
process.stdout.write(session.context.decode(Uint32Array.from(chunk)));
await session.prompt(input, {
maxTokens: maxTokens === -1
? context.getContextSize()
: maxTokens <= 0
? undefined
: maxTokens,
onToken(chunk) {
process.stdout.write(session.context.decode(Uint32Array.from(chunk)));
}
});

@@ -109,11 +174,21 @@ process.stdout.write(endColor);

}
function createChatWrapper(wrapper) {
function getChatWrapper(wrapper, bos) {
switch (wrapper) {
case "general":
return new GeneralChatPromptWrapper();
case "llama":
case "llamaChat":
return new LlamaChatPromptWrapper();
case "chatML":
return new ChatMLPromptWrapper();
default:
}
if (wrapper === "auto") {
const chatWrapper = getChatWrapperByBos(bos);
if (chatWrapper != null)
return new chatWrapper();
return new GeneralChatPromptWrapper();
}
void (wrapper);
throw new Error("Unknown wrapper: " + wrapper);
}
//# sourceMappingURL=ChatCommand.js.map
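
The `auto` selection used by the CLI above can also be done directly with the newly exported `getChatWrapperByBos`. Below is a minimal sketch, assuming a hypothetical local model path:

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {
    LlamaModel, LlamaContext, LlamaChatSession,
    GeneralChatPromptWrapper, getChatWrapperByBos
} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Hypothetical model path, used only for illustration
const model = new LlamaModel({
    modelPath: path.join(__dirname, "models", "llama-2-7b-chat.Q4_K_M.gguf")
});
const context = new LlamaContext({model});

// Pick a wrapper class based on the model's BOS string and fall back to the
// general wrapper, mirroring what `--wrapper auto` does in the CLI code above
const WrapperClass = getChatWrapperByBos(context.getBosString());
const promptWrapper = WrapperClass != null
    ? new WrapperClass()
    : new GeneralChatPromptWrapper();

const session = new LlamaChatSession({context, promptWrapper});
```

Alternatively, passing `promptWrapper: "auto"` to `LlamaChatSession` performs the same selection internally, as the `LlamaChatSession` diff further below shows.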

@@ -9,3 +9,3 @@ import * as fs from "fs-extra";

command: "clear [type]",
describe: "Clear files created by llama-cli",
describe: "Clear files created by node-llama-cpp",
builder(yargs) {

@@ -12,0 +12,0 @@ return yargs

@@ -7,2 +7,4 @@ import { CommandModule } from "yargs";

nodeTarget?: string;
metal: boolean;
cuda: boolean;
skipBuild?: boolean;

@@ -12,3 +14,3 @@ updateBinariesReleaseMetadata?: boolean;

export declare const DownloadCommand: CommandModule<object, DownloadCommandArgs>;
export declare function DownloadLlamaCppCommand({ repo, release, arch, nodeTarget, skipBuild, updateBinariesReleaseMetadata }: DownloadCommandArgs): Promise<void>;
export declare function DownloadLlamaCppCommand({ repo, release, arch, nodeTarget, metal, cuda, skipBuild, updateBinariesReleaseMetadata }: DownloadCommandArgs): Promise<void>;
export {};

@@ -1,10 +0,9 @@

import * as path from "path";
import process from "process";
import path from "path";
import { Octokit } from "octokit";
import * as fs from "fs-extra";
import fs from "fs-extra";
import chalk from "chalk";
import { DownloaderHelper } from "node-downloader-helper";
import cliProgress from "cli-progress";
import bytes from "bytes";
import StreamZip from "node-stream-zip";
import { defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, llamaCppDirectory, tempDownloadDirectory } from "../../config.js";
import simpleGit from "simple-git";
import { defaultLlamaCppCudaSupport, defaultLlamaCppGitHubRepo, defaultLlamaCppMetalSupport, defaultLlamaCppRelease, llamaCppDirectory } from "../../config.js";
import { compileLlamaCpp } from "../../utils/compileLLamaCpp.js";

@@ -30,2 +29,3 @@ import withOra from "../../utils/withOra.js";

.option("arch", {
alias: "a",
type: "string",

@@ -35,6 +35,19 @@ description: "The architecture to compile llama.cpp for"

.option("nodeTarget", {
alias: "t",
type: "string",
description: "The Node.js version to compile llama.cpp for. Example: v18.0.0"
})
.option("metal", {
type: "boolean",
default: defaultLlamaCppMetalSupport,
hidden: process.platform !== "darwin",
description: "Compile llama.cpp with Metal support. Can also be set via the NODE_LLAMA_CPP_METAL environment variable"
})
.option("cuda", {
type: "boolean",
default: defaultLlamaCppCudaSupport,
description: "Compile llama.cpp with CUDA support. Can also be set via the NODE_LLAMA_CPP_CUDA environment variable"
})
.option("skipBuild", {
alias: "sb",
type: "boolean",

@@ -53,3 +66,3 @@ default: false,

};
export async function DownloadLlamaCppCommand({ repo, release, arch, nodeTarget, skipBuild, updateBinariesReleaseMetadata }) {
export async function DownloadLlamaCppCommand({ repo, release, arch, nodeTarget, metal, cuda, skipBuild, updateBinariesReleaseMetadata }) {
const octokit = new Octokit();

@@ -59,5 +72,12 @@ const [githubOwner, githubRepo] = repo.split("/");

console.log(`${chalk.yellow("Release:")} ${release}`);
if (!skipBuild) {
if (metal && process.platform === "darwin") {
console.log(`${chalk.yellow("Metal:")} enabled`);
}
if (cuda) {
console.log(`${chalk.yellow("CUDA:")} enabled`);
}
}
console.log();
let githubRelease = null;
let zipUrl;
await withOra({

@@ -89,18 +109,7 @@ loading: chalk.blue("Fetching llama.cpp info"),

}
if (githubRelease.data?.zipball_url == null) {
throw new Error(`Failed to find a zip archive for release "${release}" of "${repo}"`);
if (githubRelease.data.tag_name == null) {
throw new Error(`Failed to find tag of release "${release}" of "${repo}"`);
}
const zipUrlResponse = await octokit.rest.repos.downloadZipballArchive({
owner: githubOwner,
repo: githubRepo,
ref: githubRelease.data.target_commitish
});
if (zipUrlResponse.url == null)
throw new Error(`Failed to get zip archive url for release "${release}" of "${repo}"`);
zipUrl = zipUrlResponse.url;
});
await clearTempFolder();
console.log(chalk.blue("Downloading zip file"));
await fs.ensureDir(tempDownloadDirectory);
await downloadFile(zipUrl, "llama.cpp.zip", tempDownloadDirectory);
await withOra({

@@ -113,16 +122,17 @@ loading: chalk.blue("Removing existing llama.cpp directory"),

});
console.log(chalk.blue("Cloning llama.cpp"));
await cloneTag(githubOwner, githubRepo, githubRelease.data.tag_name, llamaCppDirectory);
await withOra({
loading: chalk.blue("Extracting llama.cpp.zip file"),
success: chalk.blue("Extracted llama.cpp.zip file"),
fail: chalk.blue("Failed to extract llama.cpp.zip file")
loading: chalk.blue("Generating required files"),
success: chalk.blue("Generated required files"),
fail: chalk.blue("Failed to generate required files")
}, async () => {
await unzipLlamaReleaseZipFile(path.join(tempDownloadDirectory, "llama.cpp.zip"), llamaCppDirectory);
const buildInfoTemplateFilePath = path.join(llamaCppDirectory, "scripts", "build-info.h.in");
const buildInfoResultFilePath = path.join(llamaCppDirectory, "build-info.h");
const buildInfoTemplateFile = await fs.readFile(buildInfoTemplateFilePath, "utf8");
const finalFile = buildInfoTemplateFile
.replaceAll("@BUILD_NUMBER@", "1")
.replaceAll("@BUILD_COMMIT@", githubRelease.data.tag_name);
await fs.writeFile(buildInfoResultFilePath, finalFile, "utf8");
});
await withOra({
loading: chalk.blue("Removing temporary files"),
success: chalk.blue("Removed temporary files"),
fail: chalk.blue("Failed to remove temporary files")
}, async () => {
await clearTempFolder();
});
if (!skipBuild) {

@@ -133,3 +143,5 @@ console.log(chalk.blue("Compiling llama.cpp"));

nodeTarget: nodeTarget ? nodeTarget : undefined,
setUsedBingFlag: true
setUsedBingFlag: true,
metal,
cuda
});

@@ -147,10 +159,3 @@ }

}
async function downloadFile(url, fileName, directory) {
const download = new DownloaderHelper(url, directory, {
fileName: fileName,
retry: {
maxRetries: 10,
delay: 1000 * 6
}
});
async function cloneTag(githubOwner, githubRepo, tag, directory) {
const progressBar = new cliProgress.Bar({

@@ -160,39 +165,25 @@ clearOnComplete: false,

autopadding: true,
format: `${chalk.bold("{filename}")} ${chalk.yellow("{percentage}%")} ${chalk.cyan("{bar}")} {speed}${chalk.grey("{eta_formatted}")}`
format: `${chalk.bold("Clone {repo}")} ${chalk.yellow("{percentage}%")} ${chalk.cyan("{bar}")} ${chalk.grey("{eta_formatted}")}`
}, cliProgress.Presets.shades_classic);
progressBar.start(100, 0, {
speed: "",
filename: fileName
repo: `${githubOwner}/${githubRepo}`
});
download.on("progress", (stats) => {
progressBar.update(Math.floor((stats.downloaded / stats.total) * 10000) / 100, {
speed: Number.isFinite(stats.speed) ? chalk.blue((bytes(stats.speed) + "/s").padEnd(10)) + chalk.grey(" | ") : ""
try {
await simpleGit({
progress({ progress, total, processed }) {
const totalProgress = (processed / 100) + (progress / total);
progressBar.update(Math.floor(totalProgress * 10000) / 100);
}
}).clone(`https://github.com/${githubOwner}/${githubRepo}.git`, directory, {
"--depth": 1,
"--branch": tag,
"--quiet": null
});
});
download.on("end", () => {
}
finally {
progressBar.update(100);
progressBar.stop();
});
// errors are handled by the .start() method
// this listener is here to not get an unhandled error exception
download.on("error", () => { });
await download.start();
}
async function unzipLlamaReleaseZipFile(zipFilePath, directory) {
const zip = new StreamZip.async({ file: zipFilePath });
const entires = await zip.entries();
const rootFolderEntries = new Map();
for (const entry of Object.values(entires)) {
const entryPath = entry.name.split("/");
const rootFolderName = entryPath[0];
const rootFolderEntryCount = rootFolderEntries.get(rootFolderName) ?? 0;
rootFolderEntries.set(rootFolderName, rootFolderEntryCount + 1);
}
const mostUsedRootFolderName = [...rootFolderEntries.keys()]
.sort((a, b) => rootFolderEntries.get(b) - rootFolderEntries.get(a))
.shift();
if (mostUsedRootFolderName == null)
throw new Error("Failed to find the root folder of the llama.cpp release zip file");
await zip.extract(mostUsedRootFolderName, directory);
}
//# sourceMappingURL=DownloadCommand.js.map

@@ -1,2 +0,2 @@

import { defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, defaultSkipDownload } from "../../config.js";
import { defaultLlamaCppCudaSupport, defaultLlamaCppGitHubRepo, defaultLlamaCppMetalSupport, defaultLlamaCppRelease, defaultSkipDownload } from "../../config.js";
import { getPrebuildBinPath } from "../../utils/getBin.js";

@@ -15,3 +15,5 @@ import { DownloadLlamaCppCommand } from "./DownloadCommand.js";

repo: defaultLlamaCppGitHubRepo,
release: defaultLlamaCppRelease
release: defaultLlamaCppRelease,
metal: defaultLlamaCppMetalSupport,
cuda: defaultLlamaCppCudaSupport
});

@@ -18,0 +20,0 @@ }

export declare const llamaDirectory: string;
export declare const llamaBinsDirectory: string;
export declare const llamaBinsGrammarsDirectory: string;
export declare const llamaCppDirectory: string;
export declare const llamaCppGrammarsDirectory: string;
export declare const tempDownloadDirectory: string;

@@ -9,3 +11,5 @@ export declare const usedBinFlagJsonPath: string;

export declare const defaultLlamaCppRelease: string;
export declare const defaultLlamaCppMetalSupport: boolean;
export declare const defaultLlamaCppCudaSupport: boolean;
export declare const defaultSkipDownload: boolean;
export declare const defaultChatSystemPrompt: string;

@@ -11,3 +11,5 @@ import { fileURLToPath } from "url";

export const llamaBinsDirectory = path.join(__dirname, "..", "llamaBins");
export const llamaBinsGrammarsDirectory = path.join(__dirname, "..", "llama", "grammars");
export const llamaCppDirectory = path.join(llamaDirectory, "llama.cpp");
export const llamaCppGrammarsDirectory = path.join(llamaDirectory, "llama.cpp", "grammars");
export const tempDownloadDirectory = path.join(os.tmpdir(), "node-llama-cpp", uuid.v4());

@@ -22,2 +24,8 @@ export const usedBinFlagJsonPath = path.join(llamaDirectory, "usedBin.json");

.asString();
export const defaultLlamaCppMetalSupport = env.get("NODE_LLAMA_CPP_METAL")
.default("false")
.asBool();
export const defaultLlamaCppCudaSupport = env.get("NODE_LLAMA_CPP_CUDA")
.default("false")
.asBool();
export const defaultSkipDownload = env.get("NODE_LLAMA_CPP_SKIP_DOWNLOAD")

@@ -24,0 +32,0 @@ .default("false")

@@ -0,1 +1,2 @@

import { LlamaGrammar } from "llamaEvaluator/LlamaGrammar.js";
import { LlamaChatSession } from "./llamaEvaluator/LlamaChatSession.js";

@@ -9,2 +10,5 @@ import { LlamaModel } from "./llamaEvaluator/LlamaModel.js";

import { LlamaContext } from "./llamaEvaluator/LlamaContext.js";
export { LlamaModel, LlamaContext, LlamaChatSession, AbortError, ChatPromptWrapper, EmptyChatPromptWrapper, LlamaChatPromptWrapper, GeneralChatPromptWrapper };
import { ChatMLPromptWrapper } from "./chatWrappers/ChatMLPromptWrapper.js";
import { getChatWrapperByBos } from "./chatWrappers/createChatWrapperByBos.js";
import { type Token } from "./types.js";
export { LlamaModel, LlamaGrammar, LlamaContext, LlamaChatSession, AbortError, ChatPromptWrapper, EmptyChatPromptWrapper, LlamaChatPromptWrapper, GeneralChatPromptWrapper, ChatMLPromptWrapper, getChatWrapperByBos, type Token };

@@ -0,1 +1,2 @@

import { LlamaGrammar } from "llamaEvaluator/LlamaGrammar.js";
import { LlamaChatSession } from "./llamaEvaluator/LlamaChatSession.js";

@@ -9,3 +10,5 @@ import { LlamaModel } from "./llamaEvaluator/LlamaModel.js";

import { LlamaContext } from "./llamaEvaluator/LlamaContext.js";
export { LlamaModel, LlamaContext, LlamaChatSession, AbortError, ChatPromptWrapper, EmptyChatPromptWrapper, LlamaChatPromptWrapper, GeneralChatPromptWrapper };
import { ChatMLPromptWrapper } from "./chatWrappers/ChatMLPromptWrapper.js";
import { getChatWrapperByBos } from "./chatWrappers/createChatWrapperByBos.js";
export { LlamaModel, LlamaGrammar, LlamaContext, LlamaChatSession, AbortError, ChatPromptWrapper, EmptyChatPromptWrapper, LlamaChatPromptWrapper, GeneralChatPromptWrapper, ChatMLPromptWrapper, getChatWrapperByBos };
//# sourceMappingURL=index.js.map

@@ -1,4 +0,4 @@

import { type LLAMAModel, type LLAMAContext } from "../utils/getBin.js";
import { type LLAMAModel, type LLAMAContext, type LLAMAGrammar } from "../utils/getBin.js";
export declare const llamaCppNode: import("../utils/getBin.js").LlamaCppNodeModule;
declare const LLAMAModel: LLAMAModel, LLAMAContext: LLAMAContext;
export { LLAMAModel, LLAMAContext };
declare const LLAMAModel: LLAMAModel, LLAMAContext: LLAMAContext, LLAMAGrammar: LLAMAGrammar;
export { LLAMAModel, LLAMAContext, LLAMAGrammar };
import { loadBin } from "../utils/getBin.js";
export const llamaCppNode = await loadBin();
const { LLAMAModel, LLAMAContext } = llamaCppNode;
export { LLAMAModel, LLAMAContext };
const { LLAMAModel, LLAMAContext, LLAMAGrammar } = llamaCppNode;
export { LLAMAModel, LLAMAContext, LLAMAGrammar };
//# sourceMappingURL=LlamaBins.js.map
/// <reference types="node" />
import { ChatPromptWrapper } from "../ChatPromptWrapper.js";
import { Token } from "../types.js";
import { LlamaContext } from "./LlamaContext.js";

@@ -10,2 +11,4 @@ export declare class LlamaChatSession {

private _initialized;
private _lastStopString;
private _lastStopStringSuffix;
private readonly _ctx;

@@ -15,3 +18,3 @@ constructor({ context, printLLamaSystemInfo, promptWrapper, systemPrompt }: {

printLLamaSystemInfo?: boolean;
promptWrapper?: ChatPromptWrapper;
promptWrapper?: ChatPromptWrapper | "auto";
systemPrompt?: string;

@@ -22,4 +25,6 @@ });

init(): Promise<void>;
prompt(prompt: string, onToken?: (tokens: number[]) => void, { signal }?: {
prompt(prompt: string, { onToken, signal, maxTokens }?: {
onToken?(tokens: Token[]): void;
signal?: AbortSignal;
maxTokens?: number;
}): Promise<string>;

@@ -26,0 +31,0 @@ private _evalTokens;

@@ -5,2 +5,3 @@ import { defaultChatSystemPrompt } from "../config.js";

import { GeneralChatPromptWrapper } from "../chatWrappers/GeneralChatPromptWrapper.js";
import { getChatWrapperByBos } from "../chatWrappers/createChatWrapperByBos.js";
import { LlamaModel } from "./LlamaModel.js";

@@ -14,2 +15,4 @@ const UNKNOWN_UNICODE_CHAR = "\ufffd";

_initialized = false;
_lastStopString = null;
_lastStopStringSuffix = null;
_ctx;

@@ -19,4 +22,12 @@ constructor({ context, printLLamaSystemInfo = false, promptWrapper = new GeneralChatPromptWrapper(), systemPrompt = defaultChatSystemPrompt }) {

this._printLLamaSystemInfo = printLLamaSystemInfo;
this._promptWrapper = promptWrapper;
this._systemPrompt = systemPrompt;
if (promptWrapper === "auto") {
const chatWrapper = getChatWrapperByBos(context.getBosString());
if (chatWrapper != null)
this._promptWrapper = new chatWrapper();
else
this._promptWrapper = new GeneralChatPromptWrapper();
}
else
this._promptWrapper = promptWrapper;
}

@@ -38,13 +49,26 @@ get initialized() {

}
async prompt(prompt, onToken, { signal } = {}) {
async prompt(prompt, { onToken, signal, maxTokens } = {}) {
if (!this.initialized)
await this.init();
return await withLock(this, "prompt", async () => {
const promptText = this._promptWrapper.wrapPrompt(prompt, { systemPrompt: this._systemPrompt, promptIndex: this._promptIndex });
const promptText = this._promptWrapper.wrapPrompt(prompt, {
systemPrompt: this._systemPrompt,
promptIndex: this._promptIndex,
lastStopString: this._lastStopString,
lastStopStringSuffix: this._promptIndex == 0
? (this._ctx.prependBos
? this._ctx.getBosString()
: null)
: this._lastStopStringSuffix
});
this._promptIndex++;
return await this._evalTokens(this._ctx.encode(promptText), onToken, { signal });
this._lastStopString = null;
this._lastStopStringSuffix = null;
const { text, stopString, stopStringSuffix } = await this._evalTokens(this._ctx.encode(promptText), { onToken, signal, maxTokens });
this._lastStopString = stopString;
this._lastStopStringSuffix = stopStringSuffix;
return text;
});
}
async _evalTokens(tokens, onToken, { signal } = {}) {
const decodeTokens = (tokens) => this._ctx.decode(Uint32Array.from(tokens));
async _evalTokens(tokens, { onToken, signal, maxTokens } = {}) {
const stopStrings = this._promptWrapper.getStopStrings();

@@ -57,6 +81,10 @@ const stopStringIndexes = Array(stopStrings.length).fill(0);

throw new AbortError();
const tokenStr = decodeTokens([chunk]);
const { shouldReturn, skipTokenEvent } = this._checkStopString(tokenStr, stopStringIndexes);
const tokenStr = this._ctx.decode(Uint32Array.from([chunk]));
const { shouldReturn, skipTokenEvent, stopString, stopStringSuffix } = this._checkStopString(tokenStr, stopStringIndexes);
if (shouldReturn)
return decodeTokens(res);
return {
text: this._ctx.decode(Uint32Array.from(res)),
stopString,
stopStringSuffix
};
// if the token is unknown, it means it's not complete character

@@ -74,4 +102,10 @@ if (tokenStr === UNKNOWN_UNICODE_CHAR || skipTokenEvent) {

onToken?.([chunk]);
if (maxTokens != null && maxTokens > 0 && res.length >= maxTokens)
break;
}
return decodeTokens(res);
return {
text: this._ctx.decode(Uint32Array.from(res)),
stopString: null,
stopStringSuffix: null
};
}

@@ -96,3 +130,9 @@ _checkStopString(tokenStr, stopStringIndexes) {

if (stopStringIndexes[stopStringIndex] === stopString.length) {
return { shouldReturn: true };
return {
shouldReturn: true,
stopString,
stopStringSuffix: tokenStr.length === stopString.length
? null
: tokenStr.slice(stopString.length)
};
}

@@ -99,0 +139,0 @@ skipTokenEvent ||= localShouldSkipTokenEvent;

@@ -0,7 +1,10 @@

import { Token } from "../types.js";
import { LlamaModel } from "./LlamaModel.js";
import { LlamaGrammar } from "./LlamaGrammar.js";
export declare class LlamaContext {
private readonly _ctx;
private _prependBos;
constructor({ model, prependBos }: {
constructor({ model, grammar, prependBos }: {
model: LlamaModel;
grammar?: LlamaGrammar;
prependBos?: boolean;

@@ -11,3 +14,29 @@ });

decode(tokens: Uint32Array): string;
evaluate(tokens: Uint32Array): AsyncGenerator<number, void, unknown>;
get prependBos(): boolean;
/**
* @returns {Token | null} The BOS (Beginning Of Sequence) token.
*/
getBosToken(): Token | null;
/**
* @returns {Token | null} The EOS (End Of Sequence) token.
*/
getEosToken(): Token | null;
/**
* @returns {Token | null} The NL (New Line) token.
*/
getNlToken(): Token | null;
/**
* @returns {string | null} The BOS (Beginning Of Sequence) token as a string.
*/
getBosString(): string | null;
/**
* @returns {string | null} The EOS (End Of Sequence) token as a string.
*/
getEosString(): string | null;
/**
* @returns {string | null} The NL (New Line) token as a string.
*/
getNlString(): string | null;
getContextSize(): number;
evaluate(tokens: Uint32Array): AsyncGenerator<Token, void>;
}

@@ -0,1 +1,2 @@

import { removeNullFields } from "../utils/removeNullFields.js";
import { LLAMAContext } from "./LlamaBins.js";

@@ -5,12 +6,78 @@ export class LlamaContext {

_prependBos;
constructor({ model, prependBos = true }) {
this._ctx = new LLAMAContext(model._model);
constructor({ model, grammar, prependBos = true }) {
this._ctx = new LLAMAContext(model._model, removeNullFields({
grammar: grammar?._grammar
}));
this._prependBos = prependBos;
}
encode(text) {
if (text === "")
return new Uint32Array();
return this._ctx.encode(text);
}
decode(tokens) {
if (tokens.length === 0)
return "";
return this._ctx.decode(tokens);
}
get prependBos() {
return this._prependBos;
}
/**
* @returns {Token | null} The BOS (Beginning Of Sequence) token.
*/
getBosToken() {
const bosToken = this._ctx.tokenBos();
if (bosToken === -1)
return null;
return bosToken;
}
/**
* @returns {Token | null} The EOS (End Of Sequence) token.
*/
getEosToken() {
const eosToken = this._ctx.tokenEos();
if (eosToken === -1)
return null;
return eosToken;
}
/**
* @returns {Token | null} The NL (New Line) token.
*/
getNlToken() {
const nlToken = this._ctx.tokenNl();
if (nlToken === -1)
return null;
return nlToken;
}
/**
* @returns {string | null} The BOS (Beginning Of Sequence) token as a string.
*/
getBosString() {
const bosToken = this.getBosToken();
if (bosToken == null)
return null;
return this._ctx.getTokenString(bosToken);
}
/**
* @returns {string | null} The EOS (End Of Sequence) token as a string.
*/
getEosString() {
const eosToken = this.getEosToken();
if (eosToken == null)
return null;
return this._ctx.getTokenString(eosToken);
}
/**
* @returns {string | null} The NL (New Line) token as a string.
*/
getNlString() {
const nlToken = this.getNlToken();
if (nlToken == null)
return null;
return this._ctx.getTokenString(nlToken);
}
getContextSize() {
return this._ctx.getContextSize();
}
async *evaluate(tokens) {

@@ -17,0 +84,0 @@ let evalTokens = tokens;
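
A short sketch of the new token helpers declared above (`getBosString`, `getEosString`, `getContextSize`), assuming a hypothetical model path:

```typescript
import {LlamaModel, LlamaContext} from "node-llama-cpp";

// Hypothetical model path, for illustration only
const model = new LlamaModel({modelPath: "./models/model.gguf"});
const context = new LlamaContext({model});

// The BOS/EOS string getters return null when the model does not define that token
console.log("BOS:", context.getBosString());
console.log("EOS:", context.getEosString());
console.log("Context size:", context.getContextSize());
```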

@@ -12,2 +12,22 @@ export declare class LlamaModel {

* @param {boolean} [options.lowVram] - if true, reduce VRAM usage at the cost of performance
* @param {number} [options.temperature] - Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* @param {number} [options.topK] - Limits the model to consider only the K most likely next tokens for sampling at each step of
* sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
* @param {number} [options.topP] - Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* @param {boolean} [options.f16Kv] - use fp16 for KV cache

@@ -20,3 +40,3 @@ * @param {boolean} [options.logitsAll] - the llama_eval() call computes all logits, not just the last one

*/
constructor({ modelPath, seed, contextSize, batchSize, gpuLayers, lowVram, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }: {
constructor({ modelPath, seed, contextSize, batchSize, gpuLayers, lowVram, temperature, topK, topP, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }: {
/** path to the model on the filesystem */

@@ -34,2 +54,30 @@ modelPath: string;

lowVram?: boolean;
/**
* Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
*/
temperature?: number;
/**
* Limits the model to consider only the K most likely next tokens for sampling at each step of sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
* */
topK?: number;
/**
* Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* */
topP?: number;
/** use fp16 for KV cache */

@@ -36,0 +84,0 @@ f16Kv?: boolean;

@@ -0,1 +1,2 @@

import { removeNullFields } from "../utils/removeNullFields.js";
import { llamaCppNode, LLAMAModel } from "./LlamaBins.js";

@@ -15,2 +16,22 @@ export class LlamaModel {

* @param {boolean} [options.lowVram] - if true, reduce VRAM usage at the cost of performance
* @param {number} [options.temperature] - Temperature is a hyperparameter that controls the randomness of the generated text.
* It affects the probability distribution of the model's output tokens.
* A higher temperature (e.g., 1.5) makes the output more random and creative,
* while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative.
* The suggested temperature is 0.8, which provides a balance between randomness and determinism.
* At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
*
* Set to `0` to disable.
* @param {number} [options.topK] - Limits the model to consider only the K most likely next tokens for sampling at each step of
* sequence generation.
* An integer number between `1` and the size of the vocabulary.
* Set to `0` to disable (which uses the full vocabulary).
*
* Only relevant when `temperature` is set to a value greater than 0.
* @param {number} [options.topP] - Dynamically selects the smallest set of tokens whose cumulative probability exceeds the threshold P,
* and samples the next token only from this set.
* A float number between `0` and `1`.
* Set to `1` to disable.
*
* Only relevant when `temperature` is set to a value greater than `0`.
* @param {boolean} [options.f16Kv] - use fp16 for KV cache

@@ -23,3 +44,3 @@ * @param {boolean} [options.logitsAll] - the llama_eval() call computes all logits, not just the last one

*/
constructor({ modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers, lowVram, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }) {
constructor({ modelPath, seed = null, contextSize = 1024 * 4, batchSize, gpuLayers, lowVram, temperature = 0, topK = 40, topP = 0.95, f16Kv, logitsAll, vocabOnly, useMmap, useMlock, embedding }) {
this._model = new LLAMAModel(modelPath, removeNullFields({

@@ -31,2 +52,5 @@ seed: seed != null ? Math.max(-1, seed) : undefined,

lowVram,
temperature,
topK,
topP,
f16Kv,

@@ -44,10 +68,2 @@ logitsAll,

}
function removeNullFields(obj) {
const newObj = Object.assign({}, obj);
for (const key in obj) {
if (newObj[key] == null)
delete newObj[key];
}
return newObj;
}
//# sourceMappingURL=LlamaModel.js.map
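
A minimal sketch of the new sampling options on the `LlamaModel` constructor (`temperature`, `topK`, `topP`), again with a hypothetical model path; `topK` and `topP` are left at their documented defaults and `temperature` is set to the suggested `0.8`:

```typescript
import {LlamaModel, LlamaContext, LlamaChatSession} from "node-llama-cpp";

// Hypothetical model path, for illustration only
const model = new LlamaModel({
    modelPath: "./models/model.gguf",
    temperature: 0.8, // 0 (the default) always picks the most likely token
    topK: 40,         // 0 disables top-K filtering
    topP: 0.95        // 1 disables top-P filtering
});
const context = new LlamaContext({model});
const session = new LlamaChatSession({context});

const answer = await session.prompt("Hi there, how are you?");
console.log(answer);
```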

@@ -1,6 +0,8 @@

export declare function compileLlamaCpp({ arch, nodeTarget, setUsedBingFlag }: {
export declare function compileLlamaCpp({ arch, nodeTarget, setUsedBingFlag, metal, cuda }: {
arch?: string;
nodeTarget?: string;
setUsedBingFlag?: boolean;
metal?: boolean;
cuda?: boolean;
}): Promise<void>;
export declare function getCompiledLlamaCppBinaryPath(): Promise<string | null>;

@@ -10,3 +10,3 @@ import path from "path";

const __dirname = path.dirname(fileURLToPath(import.meta.url));
export async function compileLlamaCpp({ arch = process.arch, nodeTarget = process.version, setUsedBingFlag = true }) {
export async function compileLlamaCpp({ arch = process.arch, nodeTarget = process.version, setUsedBingFlag = true, metal = false, cuda = false }) {
try {

@@ -16,5 +16,35 @@ if (!(await fs.exists(llamaCppDirectory))) {

}
const gypDefines = ["GGML_USE_K_QUANTS", "NAPI_CPP_EXCEPTIONS"];
if ((metal && process.platform === "darwin") || process.env.LLAMA_METAL === "1")
gypDefines.push("LLAMA_METAL=1");
if (cuda || process.env.LLAMA_CUBLAS === "1")
gypDefines.push("LLAMA_CUBLAS=1");
if (process.env.LLAMA_MPI === "1")
gypDefines.push("LLAMA_MPI=1");
if (process.env.LLAMA_OPENBLAS === "1")
gypDefines.push("LLAMA_OPENBLAS=1");
if (process.env.LLAMA_BLAS_VENDOR != null)
gypDefines.push("LLAMA_BLAS_VENDOR=" + process.env.LLAMA_BLAS_VENDOR);
if (process.env.LLAMA_CUDA_FORCE_DMMV != null)
gypDefines.push("LLAMA_CUDA_FORCE_DMMV=" + process.env.LLAMA_CUDA_FORCE_DMMV);
if (process.env.LLAMA_CUDA_DMMV_X != null)
gypDefines.push("LLAMA_CUDA_DMMV_X=" + process.env.LLAMA_CUDA_DMMV_X);
if (process.env.LLAMA_CUDA_MMV_Y != null)
gypDefines.push("LLAMA_CUDA_MMV_Y=" + process.env.LLAMA_CUDA_MMV_Y);
if (process.env.LLAMA_CUDA_F16 != null)
gypDefines.push("LLAMA_CUDA_F16=" + process.env.LLAMA_CUDA_F16);
if (process.env.LLAMA_CUDA_KQUANTS_ITER != null)
gypDefines.push("LLAMA_CUDA_KQUANTS_ITER=" + process.env.LLAMA_CUDA_KQUANTS_ITER);
if (process.env.LLAMA_HIPBLAS === "1")
gypDefines.push("LLAMA_HIPBLAS=1");
if (process.env.LLAMA_CLBLAST === "1")
gypDefines.push("LLAMA_CLBLAST=1");
const nodeGypEnv = {
...process.env,
"CMAKE_CURRENT_SOURCE_DIR": llamaCppDirectory,
"GYP_DEFINES": gypDefines.join(" ")
};
await clearLlamaBuild();
await spawnCommand("npm", ["run", "-s", "node-gyp-llama", "--", "configure", "--arch=" + arch, "--target=" + nodeTarget], __dirname);
await spawnCommand("npm", ["run", "-s", "node-gyp-llama", "--", "configure", "--arch=" + arch, "--target=" + nodeTarget, "--", "-f", "compile_commands_json"], __dirname);
await spawnCommand("npm", ["run", "-s", "node-gyp-llama", "--", "configure", "--arch=" + arch, "--target=" + nodeTarget], __dirname, nodeGypEnv);
await spawnCommand("npm", ["run", "-s", "node-gyp-llama", "--", "configure", "--arch=" + arch, "--target=" + nodeTarget, "--", "-f", "compile_commands_json"], __dirname, nodeGypEnv);
if (await fs.exists(path.join(llamaDirectory, "Release", "compile_commands.json"))) {

@@ -28,3 +58,3 @@ await fs.move(path.join(llamaDirectory, "Release", "compile_commands.json"), path.join(llamaDirectory, "compile_commands.json"));

await fs.remove(path.join(llamaDirectory, "Debug"));
await spawnCommand("npm", ["run", "-s", "node-gyp-llama-build", "--", "--arch=" + arch, "--target=" + nodeTarget], __dirname);
await spawnCommand("npm", ["run", "-s", "node-gyp-llama-build", "--", "--arch=" + arch, "--target=" + nodeTarget], __dirname, nodeGypEnv);
if (setUsedBingFlag) {

@@ -31,0 +61,0 @@ await setUsedBinFlag("localBuildFromSource");

@@ -6,2 +6,3 @@ export declare function getPrebuildBinPath(): Promise<string | null>;

LLAMAContext: LLAMAContext;
LLAMAGrammar: LLAMAGrammar;
systemInfo(): string;

@@ -22,6 +23,11 @@ };

embedding?: boolean;
temperature?: number;
topK?: number;
topP?: number;
}): LLAMAModel;
};
export type LLAMAContext = {
new (model: LLAMAModel): LLAMAContext;
new (model: LLAMAModel, params?: {
grammar?: LLAMAGrammar;
}): LLAMAContext;
encode(text: string): Uint32Array;

@@ -32,3 +38,10 @@ eval(tokens: Uint32Array): Promise<number>;

tokenEos(): number;
getMaxContextSize(): number;
tokenNl(): number;
getContextSize(): number;
getTokenString(token: number): string;
};
export type LLAMAGrammar = {
new (grammarPath: string, params?: {
printGrammar?: boolean;
}): LLAMAGrammar;
};

@@ -6,3 +6,3 @@ import { createRequire } from "module";

import fs from "fs-extra";
import { defaultLlamaCppGitHubRepo, defaultLlamaCppRelease, defaultSkipDownload, llamaBinsDirectory } from "../config.js";
import { defaultLlamaCppCudaSupport, defaultLlamaCppGitHubRepo, defaultLlamaCppMetalSupport, defaultLlamaCppRelease, defaultSkipDownload, llamaBinsDirectory } from "../config.js";
import { DownloadLlamaCppCommand } from "../cli/commands/DownloadCommand.js";

@@ -60,3 +60,5 @@ import { getUsedBinFlag } from "./usedBinFlag.js";

repo: defaultLlamaCppGitHubRepo,
release: defaultLlamaCppRelease
release: defaultLlamaCppRelease,
metal: defaultLlamaCppMetalSupport,
cuda: defaultLlamaCppCudaSupport
});

@@ -63,0 +65,0 @@ const modulePath = await getCompiledLlamaCppBinaryPath();

@@ -1,1 +0,2 @@

export declare function spawnCommand(command: string, args: string[], cwd: string): Promise<void>;
/// <reference types="node" />
export declare function spawnCommand(command: string, args: string[], cwd: string, env?: NodeJS.ProcessEnv): Promise<void>;
import spawn from "cross-spawn";
export function spawnCommand(command, args, cwd) {
export function spawnCommand(command, args, cwd, env = process.env) {
function getCommandString() {

@@ -19,3 +19,3 @@ let res = command;

cwd,
env: process.env,
env,
detached: false,

@@ -22,0 +22,0 @@ windowsHide: true

{
"release": "b1069"
"release": "b1107"
}
{
"name": "node-llama-cpp",
"version": "2.0.0",
"version": "2.1.0",
"description": "node.js bindings for llama.cpp",

@@ -43,6 +43,6 @@ "main": "dist/index.js",

"scripts": {
"postinstall": "node ./dist/cli/cli.js postinstall",
"prepare": "[ $CI = true ] || [ -d '.husky/_' ] || husky install",
"prebuild": "rm -rf ./dist ./tsconfig.tsbuildinfo",
"build": "tsc --build tsconfig.json --force",
"addPostinstallScript": "npm pkg set scripts.postinstall=\"node ./dist/cli/cli.js postinstall\"",
"generate-docs": "typedoc",

@@ -58,3 +58,4 @@ "prewatch": "rm -rf ./dist ./tsconfig.tsbuildinfo",

"format": "npm run lint:eslint -- --fix",
"clean": "rm -rf ./node_modules ./dist ./tsconfig.tsbuildinfo"
"clean": "rm -rf ./node_modules ./dist ./tsconfig.tsbuildinfo",
"postinstall": "node ./dist/cli/cli.js postinstall"
},

@@ -74,2 +75,7 @@ "repository": {

"gguf",
"grammar",
"json-grammar",
"temperature",
"topK",
"topP",
"raspberry-pi",

@@ -124,2 +130,3 @@ "self-hosted",

"ora": "^7.0.1",
"simple-git": "^3.19.1",
"uuid": "^9.0.0",

@@ -126,0 +133,0 @@ "yargs": "^17.7.2"

@@ -0,9 +1,14 @@

<div align="center">
# Node Llama.cpp
Node.js bindings for llama.cpp.
Pre-built bindings are provided with a fallback to building from source with `node-gyp`.
<sub>Pre-built bindings are provided with a fallback to building from source with `node-gyp`.<sub>
[![Build](https://github.com/withcatai/node-llama-cpp/actions/workflows/build.yml/badge.svg)](https://github.com/withcatai/node-llama-cpp/actions/workflows/build.yml)
[![License](https://badgen.net/badge/color/MIT/green?label=license)](https://www.npmjs.com/package/node-llama-cpp)
[![License](https://badgen.net/badge/color/TypeScript/blue?label=types)](https://www.npmjs.com/package/node-llama-cpp)
[![Version](https://badgen.net/npm/v/node-llama-cpp)](https://www.npmjs.com/package/node-llama-cpp)
</div>

@@ -116,4 +121,4 @@ ## Installation

const res: number[] = [];
for await (const chunk of context.evaluate(tokens)) {
res.push(chunk);
for await (const modelToken of context.evaluate(tokens)) {
res.push(modelToken);

@@ -134,2 +139,42 @@ // it's important to not concatinate the results as strings,

#### With grammar
Use this to direct the model to generate a specific format of text, like `JSON` for example.
> **Note:** there's an issue with some grammars where the model won't stop generating output,
> so it's advised to use it together with `maxTokens` set to the context size of the model
```typescript
import {fileURLToPath} from "url";
import path from "path";
import {LlamaModel, LlamaGrammar, LlamaContext, LlamaChatSession} from "node-llama-cpp";
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const model = new LlamaModel({
modelPath: path.join(__dirname, "models", "codellama-13b.Q3_K_M.gguf")
})
const grammar = await LlamaGrammar.getFor("json");
const context = new LlamaContext({
model,
grammar
});
const session = new LlamaChatSession({context});
const q1 = 'Create a JSON that contains a message saying "hi there"';
console.log("User: " + q1);
const a1 = await session.prompt(q1, {maxTokens: context.getContextSize()});
console.log("AI: " + a1);
console.log(JSON.parse(a1));
const q2 = 'Add another field to the JSON with the key being "author" and the value being "LLama"';
console.log("User: " + q2);
const a2 = await session.prompt(q2, {maxTokens: context.getContextSize()});
console.log("AI: " + a2);
console.log(JSON.parse(a2));
```
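
Continuing the example above, `session.prompt()` now also accepts an options object with `onToken` and `maxTokens` (see the `LlamaChatSession` diff), which makes it possible to stream the response while it is being generated. A minimal sketch with an illustrative follow-up question:

```typescript
const q3 = 'Describe the JSON you created';
console.log("User: " + q3);

const a3 = await session.prompt(q3, {
    maxTokens: context.getContextSize(),
    onToken(tokens) {
        process.stdout.write(context.decode(Uint32Array.from(tokens)));
    }
});
console.log("\nAI: " + a3);
```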
### CLI

@@ -140,6 +185,6 @@ ```

Commands:
node-llama-cpp download Download a release of llama.cpp and compile it
node-llama-cpp build Compile the currently downloaded llama.cpp
node-llama-cpp clear [type] Clear files created by llama-cli
node-llama-cpp chat Chat with a LLama model
node-llama-cpp download Download a release of llama.cpp and compile it
node-llama-cpp build Compile the currently downloaded llama.cpp
node-llama-cpp clear [type] Clear files created by node-llama-cpp
node-llama-cpp chat Chat with a LLama model

@@ -158,11 +203,13 @@ Options:

Options:
-h, --help Show help [boolean]
--repo The GitHub repository to download a release of llama.cpp from. Can also be set v
ia the NODE_LLAMA_CPP_REPO environment variable
-h, --help Show help [boolean]
--repo The GitHub repository to download a release of llama.cpp from. Can also be
set via the NODE_LLAMA_CPP_REPO environment variable
[string] [default: "ggerganov/llama.cpp"]
--release The tag of the llama.cpp release to download. Can also be set via the NODE_LLAMA
_CPP_REPO_RELEASE environment variable [string] [default: "latest"]
--arch The architecture to compile llama.cpp for [string]
--nodeTarget The Node.js version to compile llama.cpp for. Example: v18.0.0 [string]
-v, --version Show version number [boolean]
--release The tag of the llama.cpp release to download. Set to "latest" to download t
he latest release. Can also be set via the NODE_LLAMA_CPP_REPO_RELEASE envi
ronment variable [string] [default: "latest"]
-a, --arch The architecture to compile llama.cpp for [string]
-t, --nodeTarget The Node.js version to compile llama.cpp for. Example: v18.0.0 [string]
--skipBuild, --sb Skip building llama.cpp after downloading it [boolean] [default: false]
-v, --version Show version number [boolean]
```

@@ -178,4 +225,4 @@

-h, --help Show help [boolean]
--arch The architecture to compile llama.cpp for [string]
--nodeTarget The Node.js version to compile llama.cpp for. Example: v18.0.0 [string]
-a, --arch The architecture to compile llama.cpp for [string]
-t, --nodeTarget The Node.js version to compile llama.cpp for. Example: v18.0.0 [string]
-v, --version Show version number [boolean]

@@ -188,3 +235,3 @@ ```

Clear files created by llama-cli
Clear files created by node-llama-cpp

@@ -204,11 +251,11 @@ Options:

Required:
--model LLama model file to use for the chat [string] [required]
-m, --model LLama model file to use for the chat [string] [required]
Optional:
--systemInfo Print llama.cpp system info [boolean] [default: false]
--systemPrompt System prompt to use against the model. [default value: You are a helpful, res
pectful and honest assistant. Always answer as helpfully as possible. If a que
stion does not make any sense, or is not factually coherent, explain why inste
ad of answering something not correct. If you don't know the answer to a quest
ion, please don't share false information.]
-i, --systemInfo Print llama.cpp system info [boolean] [default: false]
-s, --systemPrompt System prompt to use against the model. [default value: You are a helpful,
respectful and honest assistant. Always answer as helpfully as possible. If
a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answe
r to a question, please don't share false information.]
[string] [default: "You are a helpful, respectful and honest assistant. Always answer as helpfully

@@ -219,2 +266,27 @@ as possible.

share false information."]
-w, --wrapper Chat wrapper to use. Use `auto` to automatically select a wrapper based on
the model's BOS token
[string] [choices: "auto", "general", "llamaChat", "chatML"] [default: "general"]
-c, --contextSize Context size to use for the model [number] [default: 4096]
-g, --grammar Restrict the model response to a specific grammar, like JSON for example
[string] [choices: "text", "json", "list", "arithmetic", "japanese", "chess"] [default: "text"]
-t, --temperature Temperature is a hyperparameter that controls the randomness of the generat
ed text. It affects the probability distribution of the model's output toke
ns. A higher temperature (e.g., 1.5) makes the output more random and creat
ive, while a lower temperature (e.g., 0.5) makes the output more focused, d
eterministic, and conservative. The suggested temperature is 0.8, which pro
vides a balance between randomness and determinism. At the extreme, a tempe
rature of 0 will always pick the most likely next token, leading to identic
al outputs in each run. Set to `0` to disable. [number] [default: 0]
-k, --topK Limits the model to consider only the K most likely next tokens for samplin
g at each step of sequence generation. An integer number between `1` and th
e size of the vocabulary. Set to `0` to disable (which uses the full vocabu
lary). Only relevant when `temperature` is set to a value greater than 0.
[number] [default: 40]
-p, --topP Dynamically selects the smallest set of tokens whose cumulative probability
exceeds the threshold P, and samples the next token only from this set. A
float number between `0` and `1`. Set to `1` to disable. Only relevant when
`temperature` is set to a value greater than `0`. [number] [default: 0.95]
--maxTokens, --mt Maximum number of tokens to generate in responses. Set to `0` to disable. S
et to `-1` to set to the context size [number] [default: 0]

@@ -221,0 +293,0 @@ Options:
