gpt-tokenizer
Advanced tools
Comparing version 2.2.3 to 2.3.0
@@ -1,25 +0,31 @@ | ||
import { EncoderMap } from './EncoderMap.js'; | ||
export declare class BytePairEncodingCore { | ||
encoder: EncoderMap; | ||
decoder: Map<number, Uint8Array>; | ||
export type RawBytePairRanks = readonly (string | readonly number[])[]; | ||
export interface BytePairEncodingConfig { | ||
mergeableBytePairRanks: RawBytePairRanks; | ||
specialTokenMapping?: Map<string, number>; | ||
tokenSplitRegex: RegExp; | ||
specialTokensEncoder: Map<string, number>; | ||
specialTokensDecoder: Map<number, Uint8Array>; | ||
specialTokenPatternRegex: RegExp; | ||
textEncoder: TextEncoder; | ||
constructor({ bytePairEncoder, specialTokenEncoder, tokenSplitRegex, }: { | ||
bytePairEncoder: EncoderMap; | ||
specialTokenEncoder?: Map<string, number>; | ||
tokenSplitRegex: RegExp; | ||
}); | ||
encodeNative(text: string, allowedSpecial: Set<string>): Generator<number[], number, undefined>; | ||
findNextSpecialStartIndex(text: string, allowedSpecial: Set<string>, startIndex: number, specialRegex: RegExp): number | undefined; | ||
decodeNative(tokens: Iterable<number>): Generator<Uint8Array>; | ||
decodeNativeAsync(tokens: AsyncIterable<number>): AsyncGenerator<Uint8Array>; | ||
tryDecodeToken(token: number): Uint8Array | undefined; | ||
bytePairEncode(inputBytes: Uint8Array, bytePairRanks: EncoderMap): number[]; | ||
bytePairMerge(piece: Uint8Array, bytePairRanks: EncoderMap, transform: (pair: { | ||
start: number; | ||
end: number; | ||
}) => number): number[]; | ||
} | ||
export declare class BytePairEncodingCore { | ||
readonly bytePairEncoderSize: number; | ||
private bytePairEncoder; | ||
private bytePairEncoderSortedLookup; | ||
private bytePairRanksDecoder; | ||
private tokenSplitRegex; | ||
private specialTokensEncoder; | ||
private specialTokensDecoder; | ||
private specialTokenPatternRegex; | ||
private stringDecoder; | ||
private textEncoder; | ||
constructor({ mergeableBytePairRanks: bytePairEncoder, specialTokenMapping: specialTokenEncoder, tokenSplitRegex, }: BytePairEncodingConfig); | ||
getBpeRankFromString(key: string): number | undefined; | ||
getBpeRankFromStringOrThrow(key: string): number; | ||
getBpeRankFromBytes(key: Uint8Array): number | undefined; | ||
getBpeRankFromBytesOrThrow(key: Uint8Array): number; | ||
binarySearch(key: Uint8Array): number; | ||
encodeNative(text: string, allowedSpecial?: Set<string>): Generator<number[], number, undefined>; | ||
findNextSpecialStartIndex(text: string, allowedSpecial: Set<string> | undefined, startIndex: number, specialRegex: RegExp): number | undefined; | ||
decodeNative(tokens: Iterable<number>): Generator<Uint8Array | string, void, void>; | ||
decodeNativeAsync(tokens: AsyncIterable<number>): AsyncGenerator<Uint8Array | string>; | ||
tryDecodeToken(tokenRank: number): Uint8Array | string | undefined; | ||
bytePairEncode(input: string): number[]; | ||
bytePairMerge(piece: Uint8Array, getByteForRange: (start: number, end: number) => number): number[]; | ||
} |
"use strict"; | ||
/* eslint-disable no-continue */ | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.BytePairEncodingCore = void 0; | ||
const EncoderMap_js_1 = require("./EncoderMap.js"); | ||
const escapeRegExp_js_1 = require("./escapeRegExp.js"); | ||
const utfUtil_js_1 = require("./utfUtil.js"); | ||
const util_js_1 = require("./util.js"); | ||
class BytePairEncodingCore { | ||
encoder; | ||
decoder; | ||
bytePairEncoderSize; | ||
bytePairEncoder; | ||
bytePairEncoderSortedLookup; | ||
bytePairRanksDecoder = new Map(); | ||
tokenSplitRegex; | ||
@@ -13,17 +16,27 @@ specialTokensEncoder; | ||
specialTokenPatternRegex; | ||
stringDecoder; | ||
textEncoder = new TextEncoder(); | ||
constructor({ bytePairEncoder, specialTokenEncoder, tokenSplitRegex, }) { | ||
this.encoder = bytePairEncoder ?? new EncoderMap_js_1.EncoderMap(); | ||
this.decoder = bytePairEncoder | ||
? new Map([...bytePairEncoder].map(([key, value]) => [value, key])) | ||
: new Map(); | ||
constructor({ mergeableBytePairRanks: bytePairEncoder, specialTokenMapping: specialTokenEncoder, tokenSplitRegex, }) { | ||
this.bytePairEncoder = bytePairEncoder; | ||
this.stringDecoder = new Map(); | ||
// size without array holes (which may be present in the encoder) | ||
this.bytePairEncoderSize = Object.keys(bytePairEncoder).length; | ||
const binaryLookup = []; | ||
// forEach skips array holes: | ||
bytePairEncoder.forEach((value, rank) => { | ||
if (typeof value === 'string') { | ||
this.stringDecoder.set(value, rank); | ||
return; | ||
} | ||
const byteArray = new Uint8Array(value); | ||
binaryLookup.push([byteArray, rank]); | ||
this.bytePairRanksDecoder.set(rank, byteArray); | ||
}); | ||
this.bytePairEncoderSortedLookup = binaryLookup.sort((a, b) => (0, utfUtil_js_1.compareUint8Arrays)(a[0], b[0])); | ||
this.specialTokensEncoder = specialTokenEncoder ?? new Map(); | ||
this.specialTokensDecoder = specialTokenEncoder | ||
? new Map([...specialTokenEncoder].map(([key, value]) => [ | ||
value, | ||
this.textEncoder.encode(key), | ||
])) | ||
? new Map([...specialTokenEncoder].map(([key, value]) => [value, key])) | ||
: new Map(); | ||
this.tokenSplitRegex = tokenSplitRegex; | ||
const parts = [...this.specialTokensEncoder.keys()].map(escapeRegExp_js_1.escapeRegExp); | ||
const parts = [...this.specialTokensEncoder.keys()].map(util_js_1.escapeRegExp); | ||
const joinedParts = parts.join('|'); | ||
@@ -37,2 +50,60 @@ try { | ||
} | ||
getBpeRankFromString(key) { | ||
return this.stringDecoder.get(key); | ||
} | ||
getBpeRankFromStringOrThrow(key) { | ||
const value = this.getBpeRankFromString(key); | ||
if (value === undefined) { | ||
throw new Error(`The byte-pair encoding does not contain a value for: ${key}`); | ||
} | ||
return value; | ||
} | ||
getBpeRankFromBytes(key) { | ||
const keyAsString = (0, utfUtil_js_1.tryConvertToString)(key); | ||
if (keyAsString !== undefined) { | ||
return this.getBpeRankFromString(keyAsString); | ||
} | ||
// Perform binary search on the binary keys | ||
const index = this.binarySearch(key); | ||
if (index !== -1) { | ||
return this.bytePairEncoderSortedLookup[index][1]; | ||
} | ||
return undefined; | ||
} | ||
getBpeRankFromBytesOrThrow(key) { | ||
const value = this.getBpeRankFromBytes(key); | ||
if (value === undefined) { | ||
throw new Error(`The byte-pair encoding does not contain a value for: ${key.toString()}`); | ||
} | ||
return value; | ||
} | ||
// Binary search on the binary keys | ||
binarySearch(key) { | ||
let low = 0; | ||
let high = this.bytePairEncoderSortedLookup.length - 1; | ||
while (low <= high) { | ||
// eslint-disable-next-line no-bitwise | ||
const mid = (low + high) >>> 1; | ||
const midKey = this.bytePairEncoderSortedLookup[mid][0]; | ||
let cmp = 0; | ||
for (let i = 0; i < Math.min(midKey.length, key.length); i++) { | ||
cmp = midKey[i] - key[i]; | ||
if (cmp !== 0) | ||
break; | ||
} | ||
if (cmp === 0) { | ||
cmp = midKey.length - key.length; | ||
} | ||
if (cmp === 0) { | ||
return mid; | ||
} | ||
if (cmp < 0) { | ||
low = mid + 1; | ||
} | ||
else { | ||
high = mid - 1; | ||
} | ||
} | ||
return -1; | ||
} | ||
*encodeNative(text, allowedSpecial) { | ||
@@ -48,11 +119,9 @@ let startIndex = 0; | ||
for (const [match] of textSegment.matchAll(this.tokenSplitRegex)) { | ||
const encodedPiece = this.textEncoder.encode(match); | ||
const token = this.encoder.get(encodedPiece); | ||
const token = this.getBpeRankFromString(match); | ||
if (token !== undefined) { | ||
lastTokenLength = 1; | ||
yield [token]; | ||
// eslint-disable-next-line no-continue | ||
continue; | ||
} | ||
const tokens = this.bytePairEncode(encodedPiece, this.encoder); | ||
const tokens = this.bytePairEncode(match); | ||
lastTokenLength = tokens.length; | ||
@@ -86,3 +155,3 @@ yield tokens; | ||
const [specialToken] = nextSpecialMatch; | ||
if (allowedSpecial.has(specialToken)) { | ||
if (allowedSpecial?.has(specialToken)) { | ||
return nextSpecialMatch.index + searchIndex; | ||
@@ -103,41 +172,69 @@ } | ||
for await (const token of tokens) { | ||
const tokenBytes = this.tryDecodeToken(token); | ||
if (tokenBytes) { | ||
yield tokenBytes; | ||
const tokenBytesOrString = this.tryDecodeToken(token); | ||
if (tokenBytesOrString) { | ||
yield tokenBytesOrString; | ||
} | ||
} | ||
} | ||
tryDecodeToken(token) { | ||
return this.decoder.get(token) ?? this.specialTokensDecoder.get(token); | ||
tryDecodeToken(tokenRank) { | ||
const value = this.bytePairEncoder[tokenRank]; | ||
if (typeof value === 'string') { | ||
return value; | ||
} | ||
if (typeof value === 'object') { | ||
const fromBinary = this.bytePairRanksDecoder.get(tokenRank); | ||
if (fromBinary) { | ||
return fromBinary; | ||
} | ||
} | ||
return this.specialTokensDecoder.get(tokenRank); | ||
} | ||
bytePairEncode(inputBytes, bytePairRanks) { | ||
if (inputBytes.length === 1) { | ||
return [bytePairRanks.getOrThrow(inputBytes)]; | ||
bytePairEncode(input) { | ||
if (input.length === 1 && (0, utfUtil_js_1.isAscii)(input.codePointAt(0))) { | ||
return [this.getBpeRankFromStringOrThrow(input)]; | ||
} | ||
return this.bytePairMerge(inputBytes, bytePairRanks, (pair) => { | ||
const key = inputBytes.slice(pair.start, pair.end); | ||
return bytePairRanks.getOrThrow(key); | ||
const inputBytes = this.textEncoder.encode(input); | ||
return this.bytePairMerge(inputBytes, (start, end) => { | ||
const key = inputBytes.subarray(start, end); | ||
return this.getBpeRankFromBytesOrThrow(key); | ||
}); | ||
} | ||
bytePairMerge(piece, bytePairRanks, transform) { | ||
bytePairMerge( | ||
// Input array of bytes to process | ||
piece, | ||
// Function to apply to each final segment after merging | ||
getByteForRange) { | ||
// Create an array of partition objects. Each partition tracks the start index in 'piece' | ||
// and a rank value for adjacent pairs (initially set to positive infinity). | ||
const partitions = Array.from({ length: piece.length + 1 }, (_, i) => ({ | ||
start: i, | ||
rank: Number.POSITIVE_INFINITY, | ||
rank: Number.POSITIVE_INFINITY, // Rank starts at infinity (unmerged) | ||
})); | ||
// Helper function to get the rank of a byte pair starting at 'startIndex'. | ||
// 'skip' determines how far we look ahead (usually 0, for consecutive pairs). | ||
const getRank = (startIndex, skip) => { | ||
if (startIndex + skip + 2 >= partitions.length) { | ||
// Avoid out-of-bounds errors, return undefined when no valid pair exists | ||
return undefined; | ||
} | ||
const key = piece.slice(partitions[startIndex].start, partitions[startIndex + skip + 2].start); | ||
return bytePairRanks.get(key); | ||
// Get the byte pair by extracting a subarray starting at 'startIndex' and ending at | ||
// the start of the partition after 'skip + 2'. | ||
const key = piece.subarray(partitions[startIndex].start, partitions[startIndex + skip + 2].start); | ||
// Retrieve the rank of this byte pair from the BPE rank function | ||
return this.getBpeRankFromBytes(key); | ||
}; | ||
// Initialize the ranks for all adjacent pairs in the array | ||
for (let i = 0; i < partitions.length - 2; i++) { | ||
// Get the rank for the pair starting at index 'i' | ||
const rank = getRank(i, 0); | ||
if (rank !== undefined) { | ||
// Assign the rank to the partition at index 'i' | ||
partitions[i].rank = rank; | ||
} | ||
} | ||
// Iteratively merge byte pairs until no more useful merges can be done | ||
while (partitions.length > 1) { | ||
let minRank = Number.POSITIVE_INFINITY; | ||
let minRankIdx = 0; | ||
// Find the partition with the minimum rank, i.e., the most important pair to merge next | ||
let i = 0; | ||
@@ -151,7 +248,10 @@ for (const partition of partitions) { | ||
} | ||
// If no valid pair is left to merge, exit the loop | ||
if (minRank === Number.POSITIVE_INFINITY) { | ||
break; | ||
} | ||
// Update the rank of the partition after the merged one | ||
partitions[minRankIdx].rank = | ||
getRank(minRankIdx, 1) ?? Number.POSITIVE_INFINITY; | ||
// Update the rank of the partition before the merged one (if exists) | ||
if (minRankIdx > 0) { | ||
@@ -161,10 +261,13 @@ partitions[minRankIdx - 1].rank = | ||
} | ||
// Merge by removing the partition after the one we just merged | ||
partitions.splice(minRankIdx + 1, 1); | ||
} | ||
// Create the final output by applying the transform function to each partitioned range | ||
const output = []; | ||
for (let i = 0; i < partitions.length - 1; i++) { | ||
output.push(transform({ | ||
start: partitions[i].start, | ||
end: partitions[i + 1].start, | ||
})); | ||
output.push(getByteForRange( | ||
// start index | ||
partitions[i].start, | ||
// end index | ||
partitions[i + 1].start)); | ||
} | ||
@@ -171,0 +274,0 @@ return output; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,6 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('cl100k_base', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('cl100k_base', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +28,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,6 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const o200k_base_js_1 = __importDefault(require("../encodings/o200k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('o200k_base', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(o200k_base_js_1.default)); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('o200k_base', () => o200k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +28,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,6 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const p50k_base_js_1 = __importDefault(require("../encodings/p50k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('p50k_base', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(p50k_base_js_1.default)); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('p50k_base', () => p50k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, } = api; | ||
@@ -29,0 +28,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,6 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const p50k_base_js_1 = __importDefault(require("../encodings/p50k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('p50k_edit', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(p50k_base_js_1.default)); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('p50k_edit', () => p50k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, } = api; | ||
@@ -29,0 +28,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,6 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const r50k_base_js_1 = __importDefault(require("../encodings/r50k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('r50k_base', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(r50k_base_js_1.default)); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApi('r50k_base', () => r50k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, } = api; | ||
@@ -29,0 +28,0 @@ exports.decode = decode; |
export default encoder; | ||
/** @type {[string, number][]} */ | ||
declare const encoder: [string, number][]; | ||
/** @type {(string | number[])[]} */ | ||
declare const encoder: (string | number[])[]; |
export default encoder; | ||
/** @type {[string, number][]} */ | ||
declare const encoder: [string, number][]; | ||
/** @type {(string | number[])[]} */ | ||
declare const encoder: (string | number[])[]; |
export default encoder; | ||
/** @type {[string, number][]} */ | ||
declare const encoder: [string, number][]; | ||
/** @type {(string | number[])[]} */ | ||
declare const encoder: (string | number[])[]; |
export default encoder; | ||
/** @type {[string, number][]} */ | ||
declare const encoder: [string, number][]; | ||
/** @type {(string | number[])[]} */ | ||
declare const encoder: (string | number[])[]; |
@@ -22,6 +22,8 @@ import { type EncodingName, type ModelName } from './mapping.js'; | ||
static FimSuffix: string; | ||
decoder: TextDecoder; | ||
modelName?: ModelName; | ||
private decoder; | ||
private bytePairEncodingCoreProcessor; | ||
private specialTokenMapping; | ||
private specialTokensSet; | ||
private allSpecialTokenRegex; | ||
private constructor(); | ||
@@ -32,3 +34,3 @@ static getEncodingApi(encodingName: EncodingName, getMergeableRanks: GetMergeableRanksFn): GptEncoding; | ||
static getEncodingApiForModelAsync(modelName: ModelName, getMergeableRanks: GetMergeableRanksAsyncFn): Promise<GptEncoding>; | ||
encodeGenerator(lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: EncodeOptions): Generator<number[], number, undefined>; | ||
encodeGenerator(lineToEncode: string, { allowedSpecial, disallowedSpecial }?: EncodeOptions): Generator<number[], number, undefined>; | ||
encode(lineToEncode: string, encodeOptions?: EncodeOptions): number[]; | ||
@@ -42,3 +44,3 @@ /** | ||
*/ | ||
encodeChatGenerator(chat: Iterable<ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined): Generator<number[], void, undefined>; | ||
encodeChatGenerator(chat: Iterable<ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined): Generator<number[], void, undefined>; | ||
/** | ||
@@ -51,3 +53,3 @@ * Encodes a chat into a single array of tokens. | ||
*/ | ||
encodeChat(chat: readonly ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined): number[]; | ||
encodeChat(chat: readonly ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined): number[]; | ||
/** | ||
@@ -54,0 +56,0 @@ * @returns {false | number} false if token limit is exceeded, otherwise the number of tokens |
@@ -18,11 +18,21 @@ "use strict"; | ||
static FimSuffix = specialTokens_js_1.FimSuffix; | ||
modelName; | ||
decoder = new TextDecoder('utf8'); | ||
modelName; | ||
bytePairEncodingCoreProcessor; | ||
specialTokenMapping; | ||
constructor({ tokenSplitRegex, mergeableBytePairRanks, specialTokenMapping, expectedVocabularySize, modelName, }) { | ||
const maxTokenValue = Math.max((0, util_js_1.getMaxValueFromMap)(mergeableBytePairRanks), (0, util_js_1.getMaxValueFromMap)(specialTokenMapping)); | ||
specialTokensSet; | ||
allSpecialTokenRegex; | ||
constructor({ mergeableBytePairRanks, specialTokenMapping, expectedVocabularySize, modelName, ...rest }) { | ||
this.specialTokenMapping = specialTokenMapping; | ||
this.specialTokensSet = new Set(this.specialTokenMapping.keys()); | ||
this.allSpecialTokenRegex = (0, util_js_1.getSpecialTokenRegex)(this.specialTokensSet); | ||
this.bytePairEncodingCoreProcessor = new BytePairEncodingCore_js_1.BytePairEncodingCore({ | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
...rest, | ||
}); | ||
const maxTokenValue = Math.max(mergeableBytePairRanks.length - 1, (0, util_js_1.getMaxValueFromMap)(specialTokenMapping)); | ||
if (expectedVocabularySize !== undefined) { | ||
if (mergeableBytePairRanks.size + specialTokenMapping.size !== | ||
if (this.bytePairEncodingCoreProcessor.bytePairEncoderSize + | ||
specialTokenMapping.size !== | ||
expectedVocabularySize) { | ||
@@ -32,10 +42,5 @@ throw new Error('The number of mergeable tokens and special tokens must be equal to explicit_n_vocab.'); | ||
if (maxTokenValue !== expectedVocabularySize - 1) { | ||
throw new Error('The maximum token value must be equal to explicit_n_vocab - 1.'); | ||
throw new Error(`The model encodings are invalid. The maximum token value must be equal to expectedVocabularySize - 1. Currently ${maxTokenValue}, expected ${expectedVocabularySize - 1}`); | ||
} | ||
} | ||
this.bytePairEncodingCoreProcessor = new BytePairEncodingCore_js_1.BytePairEncodingCore({ | ||
bytePairEncoder: mergeableBytePairRanks, | ||
specialTokenEncoder: specialTokenMapping, | ||
tokenSplitRegex, | ||
}); | ||
this.encode = this.encode.bind(this); | ||
@@ -69,14 +74,20 @@ this.decode = this.decode.bind(this); | ||
} | ||
encodeGenerator(lineToEncode, { allowedSpecial = new Set(), disallowedSpecial = new Set([exports.ALL_SPECIAL_TOKENS]), } = {}) { | ||
const specialTokensSet = new Set(this.specialTokenMapping.keys()); | ||
if (disallowedSpecial.has(exports.ALL_SPECIAL_TOKENS)) { | ||
disallowedSpecial = new Set(specialTokensSet); | ||
allowedSpecial.forEach((val) => disallowedSpecial.delete(val)); | ||
disallowedSpecial.forEach((val) => allowedSpecial.delete(val)); | ||
encodeGenerator(lineToEncode, { allowedSpecial, disallowedSpecial } = {}) { | ||
let regexPattern; | ||
if (allowedSpecial?.has(exports.ALL_SPECIAL_TOKENS)) { | ||
allowedSpecial = new Set(this.specialTokensSet); | ||
} | ||
if (allowedSpecial.has(exports.ALL_SPECIAL_TOKENS)) { | ||
allowedSpecial = specialTokensSet; | ||
if (!disallowedSpecial || disallowedSpecial.has(exports.ALL_SPECIAL_TOKENS)) { | ||
// by default, all special tokens are disallowed | ||
disallowedSpecial = new Set(this.specialTokensSet); | ||
if (allowedSpecial?.size) { | ||
allowedSpecial.forEach((val) => disallowedSpecial.delete(val)); | ||
disallowedSpecial.forEach((val) => allowedSpecial.delete(val)); | ||
regexPattern = (0, util_js_1.getSpecialTokenRegex)(disallowedSpecial); | ||
} | ||
else { | ||
regexPattern = this.allSpecialTokenRegex; | ||
} | ||
} | ||
if (disallowedSpecial.size > 0) { | ||
const regexPattern = (0, util_js_1.getSpecialTokenRegex)(disallowedSpecial); | ||
if (regexPattern) { | ||
const match = lineToEncode.match(regexPattern); | ||
@@ -168,3 +179,6 @@ if (match !== null) { | ||
for (const decodedPart of decodedByteGenerator) { | ||
buffer += this.decoder.decode(decodedPart, { stream: true }); | ||
buffer += | ||
typeof decodedPart === 'string' | ||
? decodedPart | ||
: this.decoder.decode(decodedPart, { stream: true }); | ||
if (buffer.length === 0 || (0, utfUtil_js_1.endsWithIncompleteUtfPairSurrogate)(buffer)) { | ||
@@ -190,3 +204,6 @@ // Keep the high surrogate in the buffer and continue with the next token | ||
for await (const decodedPart of decodedByteGenerator) { | ||
buffer += this.decoder.decode(decodedPart, { stream: true }); | ||
buffer += | ||
typeof decodedPart === 'string' | ||
? decodedPart | ||
: this.decoder.decode(decodedPart, { stream: true }); | ||
if (buffer.length === 0 || (0, utfUtil_js_1.endsWithIncompleteUtfPairSurrogate)(buffer)) { | ||
@@ -193,0 +210,0 @@ // Keep the high surrogate in the buffer and continue with the next token |
@@ -187,3 +187,3 @@ "use strict"; | ||
? 127 | ||
: modelName === 'gpt-4o' | ||
: modelName.startsWith('gpt-4o') | ||
? 120 | ||
@@ -190,0 +190,0 @@ : 121; |
@@ -7,12 +7,31 @@ export declare const cl100k_base = "cl100k_base"; | ||
export declare const encodingNames: readonly ["cl100k_base", "p50k_base", "r50k_base", "p50k_edit", "o200k_base"]; | ||
export declare const modelToEncodingMap: { | ||
declare const chatEnabledModelsMap: { | ||
readonly 'gpt-4': "cl100k_base"; | ||
readonly 'gpt-4-0314': "cl100k_base"; | ||
readonly 'gpt-4-0613': "cl100k_base"; | ||
readonly 'gpt-4-32k': "cl100k_base"; | ||
readonly 'gpt-4-0314': "cl100k_base"; | ||
readonly 'gpt-4-32k-0314': "cl100k_base"; | ||
readonly 'gpt-4-32k-0613': "cl100k_base"; | ||
readonly 'gpt-4-turbo': "cl100k_base"; | ||
readonly 'gpt-4-turbo-2024-04-09': "cl100k_base"; | ||
readonly 'gpt-4-turbo-preview': "cl100k_base"; | ||
readonly 'gpt-4-1106-preview': "cl100k_base"; | ||
readonly 'gpt-4-0125-preview': "cl100k_base"; | ||
readonly 'gpt-4-vision-preview': "cl100k_base"; | ||
readonly 'gpt-4o': "o200k_base"; | ||
readonly 'gpt-4o-2024-05-13': "o200k_base"; | ||
readonly 'gpt-4o-2024-08-06': "o200k_base"; | ||
readonly 'gpt-4o-mini-2024-07-18': "o200k_base"; | ||
readonly 'gpt-4o-mini': "o200k_base"; | ||
readonly 'gpt-3.5-turbo': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0301': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0613': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-1106': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0125': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-16k': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-16k-0613': "cl100k_base"; | ||
readonly 'gpt-4o': "o200k_base"; | ||
readonly 'gpt-3.5-turbo-instruct': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-instruct-0914': "cl100k_base"; | ||
}; | ||
export declare const modelToEncodingMap: { | ||
readonly 'text-davinci-003': "p50k_base"; | ||
@@ -37,2 +56,4 @@ readonly 'text-davinci-002': "p50k_base"; | ||
readonly 'text-embedding-ada-002': "cl100k_base"; | ||
readonly 'text-embedding-3-small': "cl100k_base"; | ||
readonly 'text-embedding-3-large': "cl100k_base"; | ||
readonly 'text-similarity-davinci-001': "r50k_base"; | ||
@@ -48,2 +69,28 @@ readonly 'text-similarity-curie-001': "r50k_base"; | ||
readonly 'code-search-ada-code-001': "r50k_base"; | ||
readonly 'gpt-4': "cl100k_base"; | ||
readonly 'gpt-4-0314': "cl100k_base"; | ||
readonly 'gpt-4-0613': "cl100k_base"; | ||
readonly 'gpt-4-32k': "cl100k_base"; | ||
readonly 'gpt-4-32k-0314': "cl100k_base"; | ||
readonly 'gpt-4-32k-0613': "cl100k_base"; | ||
readonly 'gpt-4-turbo': "cl100k_base"; | ||
readonly 'gpt-4-turbo-2024-04-09': "cl100k_base"; | ||
readonly 'gpt-4-turbo-preview': "cl100k_base"; | ||
readonly 'gpt-4-1106-preview': "cl100k_base"; | ||
readonly 'gpt-4-0125-preview': "cl100k_base"; | ||
readonly 'gpt-4-vision-preview': "cl100k_base"; | ||
readonly 'gpt-4o': "o200k_base"; | ||
readonly 'gpt-4o-2024-05-13': "o200k_base"; | ||
readonly 'gpt-4o-2024-08-06': "o200k_base"; | ||
readonly 'gpt-4o-mini-2024-07-18': "o200k_base"; | ||
readonly 'gpt-4o-mini': "o200k_base"; | ||
readonly 'gpt-3.5-turbo': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0301': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0613': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-1106': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0125': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-16k': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-16k-0613': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-instruct': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-instruct-0914': "cl100k_base"; | ||
}; | ||
@@ -54,44 +101,7 @@ export interface ChatParameters { | ||
} | ||
declare const internalChatModelParams: { | ||
'gpt-3.5-turbo': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-3.5-turbo-0301': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-3.5-turbo-0613': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-3.5-turbo-16k-0613': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4-0314': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4-32k': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4-32k-0314': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4o': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
}; | ||
export declare const chatModelParams: Partial<Record<ModelName, ChatParameters>>; | ||
export type ModelName = keyof typeof modelToEncodingMap; | ||
export type ChatModelName = keyof typeof internalChatModelParams; | ||
export type ChatModelName = keyof typeof chatEnabledModelsMap; | ||
export type EncodingName = (typeof modelToEncodingMap)[ModelName]; | ||
export declare const chatModelParams: Record<ChatModelName, ChatParameters>; | ||
export declare const chatEnabledModels: ChatModelName[]; | ||
export {}; |
"use strict"; | ||
/* eslint-disable camelcase */ | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.chatModelParams = exports.modelToEncodingMap = exports.encodingNames = exports.o200k_base = exports.r50k_base = exports.p50k_edit = exports.p50k_base = exports.cl100k_base = void 0; | ||
exports.chatEnabledModels = exports.chatModelParams = exports.modelToEncodingMap = exports.encodingNames = exports.o200k_base = exports.r50k_base = exports.p50k_edit = exports.p50k_base = exports.cl100k_base = void 0; | ||
const specialTokens_js_1 = require("./specialTokens.js"); | ||
@@ -18,13 +18,33 @@ exports.cl100k_base = 'cl100k_base'; | ||
]; | ||
exports.modelToEncodingMap = { | ||
// chat | ||
const chatEnabledModelsMap = { | ||
'gpt-4': exports.cl100k_base, | ||
'gpt-4-0314': exports.cl100k_base, | ||
'gpt-4-0613': exports.cl100k_base, | ||
'gpt-4-32k': exports.cl100k_base, | ||
'gpt-4-0314': exports.cl100k_base, | ||
'gpt-4-32k-0314': exports.cl100k_base, | ||
'gpt-4-32k-0613': exports.cl100k_base, | ||
'gpt-4-turbo': exports.cl100k_base, | ||
'gpt-4-turbo-2024-04-09': exports.cl100k_base, | ||
'gpt-4-turbo-preview': exports.cl100k_base, | ||
'gpt-4-1106-preview': exports.cl100k_base, | ||
'gpt-4-0125-preview': exports.cl100k_base, | ||
'gpt-4-vision-preview': exports.cl100k_base, | ||
'gpt-4o': exports.o200k_base, | ||
'gpt-4o-2024-05-13': exports.o200k_base, | ||
'gpt-4o-2024-08-06': exports.o200k_base, | ||
'gpt-4o-mini-2024-07-18': exports.o200k_base, | ||
'gpt-4o-mini': exports.o200k_base, | ||
'gpt-3.5-turbo': exports.cl100k_base, | ||
'gpt-3.5-turbo-0301': exports.cl100k_base, | ||
'gpt-3.5-turbo-0613': exports.cl100k_base, | ||
'gpt-3.5-turbo-1106': exports.cl100k_base, | ||
'gpt-3.5-turbo-0125': exports.cl100k_base, | ||
'gpt-3.5-turbo-16k': exports.cl100k_base, | ||
'gpt-3.5-turbo-16k-0613': exports.cl100k_base, | ||
'gpt-4o': exports.o200k_base, | ||
'gpt-3.5-turbo-instruct': exports.cl100k_base, | ||
'gpt-3.5-turbo-instruct-0914': exports.cl100k_base, | ||
}; | ||
exports.modelToEncodingMap = { | ||
// chat | ||
...chatEnabledModelsMap, | ||
// text | ||
@@ -53,2 +73,4 @@ 'text-davinci-003': exports.p50k_base, | ||
'text-embedding-ada-002': exports.cl100k_base, | ||
'text-embedding-3-small': exports.cl100k_base, | ||
'text-embedding-3-large': exports.cl100k_base, | ||
// old embeddings | ||
@@ -66,41 +88,16 @@ 'text-similarity-davinci-001': exports.r50k_base, | ||
}; | ||
const internalChatModelParams = { | ||
'gpt-3.5-turbo': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-0301': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-0613': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-16k-0613': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-4': { | ||
messageSeparator: '', | ||
roleSeparator: specialTokens_js_1.ImSep, | ||
}, | ||
'gpt-4-0314': { | ||
messageSeparator: '', | ||
roleSeparator: specialTokens_js_1.ImSep, | ||
}, | ||
'gpt-4-32k': { | ||
messageSeparator: '', | ||
roleSeparator: specialTokens_js_1.ImSep, | ||
}, | ||
'gpt-4-32k-0314': { | ||
messageSeparator: '', | ||
roleSeparator: specialTokens_js_1.ImSep, | ||
}, | ||
'gpt-4o': { | ||
messageSeparator: '', | ||
roleSeparator: specialTokens_js_1.ImSep, | ||
}, | ||
const gpt3params = { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}; | ||
exports.chatModelParams = internalChatModelParams; | ||
const gpt4params = { | ||
messageSeparator: '', | ||
roleSeparator: specialTokens_js_1.ImSep, | ||
}; | ||
exports.chatModelParams = Object.fromEntries(Object.keys(chatEnabledModelsMap).flatMap((modelName) => modelName.startsWith('gpt-4') | ||
? [[modelName, gpt4params]] | ||
: modelName.startsWith('gpt-3.5-turbo') | ||
? [[modelName, gpt3params]] | ||
: [])); | ||
exports.chatEnabledModels = Object.keys(chatEnabledModelsMap); | ||
//# sourceMappingURL=mapping.js.map |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0301', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0301', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0613', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0613', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-16k-0613', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-16k-0613', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-3.5-turbo', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-3.5-turbo', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4-0314', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4-0314', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4-32k-0314', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4-32k-0314', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4-32k', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4-32k', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("../encodings/cl100k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4', () => cl100k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
@@ -22,7 +22,7 @@ "use strict"; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("../convertTokenBytePairEncodingFromTuples.js"); | ||
const o200k_base_js_1 = __importDefault(require("../encodings/o200k_base.js")); | ||
const GptEncoding_js_1 = require("../GptEncoding.js"); | ||
__exportStar(require("../specialTokens.js"), exports); | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4o', () => (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(o200k_base_js_1.default)); | ||
// prettier-ignore | ||
const api = GptEncoding_js_1.GptEncoding.getEncodingApiForModel('gpt-4o', () => o200k_base_js_1.default); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -29,0 +29,0 @@ exports.decode = decode; |
@@ -1,4 +0,4 @@ | ||
import { EncoderMap } from './EncoderMap.js'; | ||
import type { BytePairEncodingConfig, RawBytePairRanks } from './BytePairEncodingCore.js'; | ||
import type { EncodingName, ModelName } from './mapping.js'; | ||
export interface EncodingParams { | ||
export interface EncodingParams extends BytePairEncodingConfig { | ||
/** | ||
@@ -16,9 +16,11 @@ * The expected total number of tokens in the vocabulary, including both regular and special tokens. | ||
tokenSplitRegex: RegExp; | ||
mergeableBytePairRanks: EncoderMap; | ||
specialTokenMapping: Map<string, number>; | ||
modelName?: ModelName; | ||
/** increases memory consumption, but speeds up subsequent decoding */ | ||
enableCache?: boolean; | ||
} | ||
export type GetMergeableRanksFn = (encodingName: EncodingName) => EncoderMap; | ||
export type GetMergeableRanksAsyncFn = (encodingName: EncodingName) => Promise<EncoderMap>; | ||
export declare const tokenSplitRegex: RegExp; | ||
export type GetMergeableRanksFn = (encodingName: EncodingName) => RawBytePairRanks; | ||
export type GetMergeableRanksAsyncFn = (encodingName: EncodingName) => Promise<RawBytePairRanks>; | ||
export declare function getEncodingParams(encodingName: EncodingName, getMergeableRanks: GetMergeableRanksFn): EncodingParams; | ||
export declare function getModelParamsAsync(encodingName: EncodingName, getMergeableRanks: GetMergeableRanksAsyncFn): Promise<EncodingParams>; |
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.tokenSplitRegex = void 0; | ||
exports.getEncodingParams = getEncodingParams; | ||
exports.getModelParamsAsync = getModelParamsAsync; | ||
const specialTokens_js_1 = require("./specialTokens.js"); | ||
const tokenSplitRegex = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu; | ||
function R50KBase(mergeableBytePairRanks) { | ||
return { | ||
expectedVocabularySize: 50_257, | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping: new Map([[specialTokens_js_1.EndOfText, 50_256]]), | ||
}; | ||
} | ||
function P50KBase(mergeableBytePairRanks) { | ||
return { | ||
expectedVocabularySize: 50_281, | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping: new Map([[specialTokens_js_1.EndOfText, 50_256]]), | ||
}; | ||
} | ||
function P50KEdit(mergeableBytePairRanks) { | ||
const specialTokenMapping = new Map([ | ||
[specialTokens_js_1.EndOfText, 50_256], | ||
[specialTokens_js_1.FimPrefix, 50_281], | ||
[specialTokens_js_1.FimMiddle, 50_282], | ||
[specialTokens_js_1.FimSuffix, 50_283], | ||
]); | ||
return { | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
}; | ||
} | ||
function Cl100KBase(mergeableBytePairRanks) { | ||
const specialTokenMapping = new Map([ | ||
[specialTokens_js_1.EndOfText, 100_257], | ||
[specialTokens_js_1.FimPrefix, 100_258], | ||
[specialTokens_js_1.FimMiddle, 100_259], | ||
[specialTokens_js_1.FimSuffix, 100_260], | ||
[specialTokens_js_1.ImStart, 100_264], | ||
[specialTokens_js_1.ImEnd, 100_265], | ||
[specialTokens_js_1.ImSep, 100_266], | ||
[specialTokens_js_1.EndOfPrompt, 100_276], | ||
]); | ||
return { | ||
tokenSplitRegex: /(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
}; | ||
} | ||
function O200KBase(mergeableBytePairRanks) { | ||
const specialTokenMapping = new Map([ | ||
[specialTokens_js_1.EndOfText, 199_999], | ||
[specialTokens_js_1.FimPrefix, 200_000], | ||
[specialTokens_js_1.FimMiddle, 200_001], | ||
[specialTokens_js_1.FimSuffix, 200_002], | ||
[specialTokens_js_1.ImStart, 200_003], | ||
[specialTokens_js_1.ImEnd, 200_004], | ||
[specialTokens_js_1.ImSep, 200_005], | ||
[specialTokens_js_1.EndOfPrompt, 200_006], | ||
]); | ||
return { | ||
tokenSplitRegex: /(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
}; | ||
} | ||
const Cl100KBase_js_1 = require("./encodingParams/Cl100KBase.js"); | ||
const O200KBase_js_1 = require("./encodingParams/O200KBase.js"); | ||
const P50KBase_js_1 = require("./encodingParams/P50KBase.js"); | ||
const P50KEdit_js_1 = require("./encodingParams/P50KEdit.js"); | ||
const R50KBase_js_1 = require("./encodingParams/R50KBase.js"); | ||
exports.tokenSplitRegex = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu; | ||
function getEncodingParams(encodingName, getMergeableRanks) { | ||
@@ -74,11 +16,11 @@ const mergeableBytePairRanks = getMergeableRanks(encodingName); | ||
case 'r50k_base': | ||
return R50KBase(mergeableBytePairRanks); | ||
return (0, R50KBase_js_1.R50KBase)(mergeableBytePairRanks); | ||
case 'p50k_base': | ||
return P50KBase(mergeableBytePairRanks); | ||
return (0, P50KBase_js_1.P50KBase)(mergeableBytePairRanks); | ||
case 'p50k_edit': | ||
return P50KEdit(mergeableBytePairRanks); | ||
return (0, P50KEdit_js_1.P50KEdit)(mergeableBytePairRanks); | ||
case 'cl100k_base': | ||
return Cl100KBase(mergeableBytePairRanks); | ||
return (0, Cl100KBase_js_1.Cl100KBase)(mergeableBytePairRanks); | ||
case 'o200k_base': | ||
return O200KBase(mergeableBytePairRanks); | ||
return (0, O200KBase_js_1.O200KBase)(mergeableBytePairRanks); | ||
default: | ||
@@ -85,0 +27,0 @@ throw new Error(`Unknown encoding name: ${encodingName}`); |
@@ -1,3 +0,3 @@ | ||
import type { EncoderMap } from './EncoderMap.js'; | ||
import type { RawBytePairRanks } from './BytePairEncodingCore.js'; | ||
import type { EncodingName } from './mapping.js'; | ||
export declare const resolveEncoding: (encoding: EncodingName) => EncoderMap; | ||
export declare const resolveEncoding: (encoding: EncodingName) => RawBytePairRanks; |
@@ -7,4 +7,2 @@ "use strict"; | ||
exports.resolveEncoding = void 0; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("./convertTokenBytePairEncodingFromTuples.js"); | ||
const cl100k_base_js_1 = __importDefault(require("./encodings/cl100k_base.js")); | ||
@@ -17,10 +15,10 @@ const o200k_base_js_1 = __importDefault(require("./encodings/o200k_base.js")); | ||
case 'r50k_base': | ||
return (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(r50k_base_js_1.default); | ||
return r50k_base_js_1.default; | ||
case 'p50k_base': | ||
case 'p50k_edit': | ||
return (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(p50k_base_js_1.default); | ||
return p50k_base_js_1.default; | ||
case 'cl100k_base': | ||
return (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(cl100k_base_js_1.default); | ||
return cl100k_base_js_1.default; | ||
case 'o200k_base': | ||
return (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(o200k_base_js_1.default); | ||
return o200k_base_js_1.default; | ||
default: { | ||
@@ -27,0 +25,0 @@ throw new Error(`Unknown encoding name: ${encoding}`); |
@@ -1,3 +0,3 @@ | ||
import type { EncoderMap } from './EncoderMap.js'; | ||
import type { RawBytePairRanks } from './BytePairEncodingCore.js'; | ||
import type { EncodingName } from './mapping.js'; | ||
export declare const resolveEncodingAsync: (encoding: EncodingName) => Promise<EncoderMap>; | ||
export declare const resolveEncodingAsync: (encoding: EncodingName) => Promise<RawBytePairRanks>; |
@@ -27,15 +27,13 @@ "use strict"; | ||
exports.resolveEncodingAsync = void 0; | ||
/* eslint-disable import/extensions */ | ||
const convertTokenBytePairEncodingFromTuples_js_1 = require("./convertTokenBytePairEncodingFromTuples.js"); | ||
const resolveEncodingAsync = async (encoding) => { | ||
switch (encoding) { | ||
case 'r50k_base': | ||
return (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(await Promise.resolve().then(() => __importStar(require('./encodings/r50k_base.js'))).then(({ default: encodingTuples }) => encodingTuples)); | ||
return Promise.resolve().then(() => __importStar(require('./encodings/r50k_base.js'))).then(({ default: rawBytePairRanks }) => rawBytePairRanks); | ||
case 'p50k_base': | ||
case 'p50k_edit': | ||
return (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(await Promise.resolve().then(() => __importStar(require('./encodings/p50k_base.js'))).then(({ default: encodingTuples }) => encodingTuples)); | ||
return Promise.resolve().then(() => __importStar(require('./encodings/p50k_base.js'))).then(({ default: rawBytePairRanks }) => rawBytePairRanks); | ||
case 'cl100k_base': | ||
return (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(await Promise.resolve().then(() => __importStar(require('./encodings/cl100k_base.js'))).then(({ default: encodingTuples }) => encodingTuples)); | ||
return Promise.resolve().then(() => __importStar(require('./encodings/cl100k_base.js'))).then(({ default: rawBytePairRanks }) => rawBytePairRanks); | ||
case 'o200k_base': | ||
return (0, convertTokenBytePairEncodingFromTuples_js_1.convertTokenBytePairEncodingFromTuples)(await Promise.resolve().then(() => __importStar(require('./encodings/o200k_base.js'))).then(({ default: encodingTuples }) => encodingTuples)); | ||
return Promise.resolve().then(() => __importStar(require('./encodings/o200k_base.js'))).then(({ default: rawBytePairRanks }) => rawBytePairRanks); | ||
default: { | ||
@@ -42,0 +40,0 @@ throw new Error(`Unknown encoding name: ${encoding}`); |
@@ -0,1 +1,4 @@ | ||
export declare const isAscii: (codePoint: number) => boolean; | ||
export declare function endsWithIncompleteUtfPairSurrogate(string: string): boolean; | ||
export declare function tryConvertToString(arr: Uint8Array): string | undefined; | ||
export declare function compareUint8Arrays(a: Uint8Array, b: Uint8Array): number; |
"use strict"; | ||
/* eslint-disable no-bitwise */ | ||
/* eslint-disable no-magic-numbers */ | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.isAscii = void 0; | ||
exports.endsWithIncompleteUtfPairSurrogate = endsWithIncompleteUtfPairSurrogate; | ||
exports.tryConvertToString = tryConvertToString; | ||
exports.compareUint8Arrays = compareUint8Arrays; | ||
const isAscii = (codePoint) => codePoint <= 0x7f; | ||
exports.isAscii = isAscii; | ||
const HIGH_SURROGATE_START = 55_296; | ||
@@ -14,2 +21,82 @@ const HIGH_SURROGATE_END = 56_319; | ||
} | ||
// Strict well-formedness check for a UTF-8 byte sequence.
// Rejects truncated sequences, invalid lead/continuation bytes, overlong
// encodings, UTF-16 surrogate code points (U+D800..U+DFFF), and code points
// above U+10FFFF. Returns true for the empty sequence.
function isValidUTF8(bytes) {
    let index = 0;
    while (index < bytes.length) {
        const lead = bytes[index];
        let sequenceLength;
        let codePoint;
        // Classify the lead byte to find the expected sequence length.
        if (lead <= 0x7f) {
            // Single-byte (ASCII) character.
            sequenceLength = 1;
            codePoint = lead;
        }
        else if ((lead & 0xe0) === 0xc0) {
            // Two-byte sequence; lead bytes 0xC0/0xC1 can only produce
            // overlong encodings and are never valid.
            if (lead <= 0xc1)
                return false;
            sequenceLength = 2;
            codePoint = lead & 0x1f;
        }
        else if ((lead & 0xf0) === 0xe0) {
            // Three-byte sequence.
            sequenceLength = 3;
            codePoint = lead & 0x0f;
        }
        else if ((lead & 0xf8) === 0xf0) {
            // Four-byte sequence; lead bytes above 0xF4 would encode
            // code points beyond U+10FFFF.
            if (lead > 0xf4)
                return false;
            sequenceLength = 4;
            codePoint = lead & 0x07;
        }
        else {
            // 0x80..0xBF (bare continuation) or 0xF8..0xFF: never a valid lead.
            return false;
        }
        // The sequence must not run past the end of the input.
        if (index + sequenceLength > bytes.length)
            return false;
        // Fold in the continuation bytes; each must match the 10xxxxxx pattern.
        for (let offset = 1; offset < sequenceLength; offset++) {
            const continuation = bytes[index + offset];
            if (continuation === undefined || (continuation & 0xc0) !== 0x80)
                return false;
            codePoint = (codePoint << 6) | (continuation & 0x3f);
        }
        // Reject overlong encodings: each length has a minimum code point.
        if (sequenceLength === 2 && codePoint < 0x80)
            return false;
        if (sequenceLength === 3 && codePoint < 2_048)
            return false;
        if (sequenceLength === 4 && codePoint < 65_536)
            return false;
        // Surrogate halves (U+D800..U+DFFF) are not valid scalar values.
        if (codePoint >= 55_296 && codePoint <= 57_343)
            return false;
        // Unicode tops out at U+10FFFF.
        if (codePoint > 1_114_111)
            return false;
        index += sequenceLength;
    }
    return true;
}
// Non-fatal decoder: it never throws, but we gate it behind our own
// validation so malformed input is reported as undecodable instead of
// being silently replaced with U+FFFD.
const textDecoder = new TextDecoder('utf8', { fatal: false });
// Decode a byte array as UTF-8, or return undefined when the bytes are
// not a well-formed UTF-8 sequence.
function tryConvertToString(arr) {
    return isValidUTF8(arr) ? textDecoder.decode(arr) : undefined;
}
// Helper function to compare two Uint8Arrays lexicographically | ||
function compareUint8Arrays(a, b) { | ||
const len = Math.min(a.length, b.length); | ||
for (let i = 0; i < len; i++) { | ||
if (a[i] !== b[i]) { | ||
return a[i] - b[i]; | ||
} | ||
} | ||
return a.length - b.length; | ||
} | ||
//# sourceMappingURL=utfUtil.js.map |
export declare function getMaxValueFromMap(map: Map<unknown, number>): number; | ||
export declare function escapeRegExp(string: string): string; | ||
export declare function getSpecialTokenRegex(tokens: Set<string>): RegExp; |
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.getMaxValueFromMap = getMaxValueFromMap; | ||
exports.escapeRegExp = escapeRegExp; | ||
exports.getSpecialTokenRegex = getSpecialTokenRegex; | ||
const escapeRegExp_js_1 = require("./escapeRegExp.js"); | ||
function getMaxValueFromMap(map) { | ||
@@ -13,4 +13,7 @@ let max = 0; | ||
} | ||
// Escape every regular-expression metacharacter in the input so the result
// can be embedded in a RegExp and match the input literally.
function escapeRegExp(string) {
    // '$&' in the replacement inserts the matched character itself,
    // so each metacharacter becomes backslash + itself.
    return string.replace(/[$()*+.?[\\\]^{|}]/g, '\\$&');
}
function getSpecialTokenRegex(tokens) { | ||
const escapedTokens = [...tokens].map(escapeRegExp_js_1.escapeRegExp); | ||
const escapedTokens = [...tokens].map(escapeRegExp); | ||
const inner = escapedTokens.join('|'); | ||
@@ -17,0 +20,0 @@ return new RegExp(`(${inner})`); |
@@ -1,25 +0,31 @@ | ||
import { EncoderMap } from './EncoderMap.js'; | ||
export declare class BytePairEncodingCore { | ||
encoder: EncoderMap; | ||
decoder: Map<number, Uint8Array>; | ||
export type RawBytePairRanks = readonly (string | readonly number[])[]; | ||
export interface BytePairEncodingConfig { | ||
mergeableBytePairRanks: RawBytePairRanks; | ||
specialTokenMapping?: Map<string, number>; | ||
tokenSplitRegex: RegExp; | ||
specialTokensEncoder: Map<string, number>; | ||
specialTokensDecoder: Map<number, Uint8Array>; | ||
specialTokenPatternRegex: RegExp; | ||
textEncoder: TextEncoder; | ||
constructor({ bytePairEncoder, specialTokenEncoder, tokenSplitRegex, }: { | ||
bytePairEncoder: EncoderMap; | ||
specialTokenEncoder?: Map<string, number>; | ||
tokenSplitRegex: RegExp; | ||
}); | ||
encodeNative(text: string, allowedSpecial: Set<string>): Generator<number[], number, undefined>; | ||
findNextSpecialStartIndex(text: string, allowedSpecial: Set<string>, startIndex: number, specialRegex: RegExp): number | undefined; | ||
decodeNative(tokens: Iterable<number>): Generator<Uint8Array>; | ||
decodeNativeAsync(tokens: AsyncIterable<number>): AsyncGenerator<Uint8Array>; | ||
tryDecodeToken(token: number): Uint8Array | undefined; | ||
bytePairEncode(inputBytes: Uint8Array, bytePairRanks: EncoderMap): number[]; | ||
bytePairMerge(piece: Uint8Array, bytePairRanks: EncoderMap, transform: (pair: { | ||
start: number; | ||
end: number; | ||
}) => number): number[]; | ||
} | ||
export declare class BytePairEncodingCore { | ||
readonly bytePairEncoderSize: number; | ||
private bytePairEncoder; | ||
private bytePairEncoderSortedLookup; | ||
private bytePairRanksDecoder; | ||
private tokenSplitRegex; | ||
private specialTokensEncoder; | ||
private specialTokensDecoder; | ||
private specialTokenPatternRegex; | ||
private stringDecoder; | ||
private textEncoder; | ||
constructor({ mergeableBytePairRanks: bytePairEncoder, specialTokenMapping: specialTokenEncoder, tokenSplitRegex, }: BytePairEncodingConfig); | ||
getBpeRankFromString(key: string): number | undefined; | ||
getBpeRankFromStringOrThrow(key: string): number; | ||
getBpeRankFromBytes(key: Uint8Array): number | undefined; | ||
getBpeRankFromBytesOrThrow(key: Uint8Array): number; | ||
binarySearch(key: Uint8Array): number; | ||
encodeNative(text: string, allowedSpecial?: Set<string>): Generator<number[], number, undefined>; | ||
findNextSpecialStartIndex(text: string, allowedSpecial: Set<string> | undefined, startIndex: number, specialRegex: RegExp): number | undefined; | ||
decodeNative(tokens: Iterable<number>): Generator<Uint8Array | string, void, void>; | ||
decodeNativeAsync(tokens: AsyncIterable<number>): AsyncGenerator<Uint8Array | string>; | ||
tryDecodeToken(tokenRank: number): Uint8Array | string | undefined; | ||
bytePairEncode(input: string): number[]; | ||
bytePairMerge(piece: Uint8Array, getByteForRange: (start: number, end: number) => number): number[]; | ||
} |
@@ -1,6 +0,9 @@ | ||
import { EncoderMap } from './EncoderMap.js'; | ||
import { escapeRegExp } from './escapeRegExp.js'; | ||
/* eslint-disable no-continue */ | ||
import { compareUint8Arrays, isAscii, tryConvertToString } from './utfUtil.js'; | ||
import { escapeRegExp } from './util.js'; | ||
export class BytePairEncodingCore { | ||
encoder; | ||
decoder; | ||
bytePairEncoderSize; | ||
bytePairEncoder; | ||
bytePairEncoderSortedLookup; | ||
bytePairRanksDecoder = new Map(); | ||
tokenSplitRegex; | ||
@@ -10,14 +13,24 @@ specialTokensEncoder; | ||
specialTokenPatternRegex; | ||
stringDecoder; | ||
textEncoder = new TextEncoder(); | ||
constructor({ bytePairEncoder, specialTokenEncoder, tokenSplitRegex, }) { | ||
this.encoder = bytePairEncoder ?? new EncoderMap(); | ||
this.decoder = bytePairEncoder | ||
? new Map([...bytePairEncoder].map(([key, value]) => [value, key])) | ||
: new Map(); | ||
constructor({ mergeableBytePairRanks: bytePairEncoder, specialTokenMapping: specialTokenEncoder, tokenSplitRegex, }) { | ||
this.bytePairEncoder = bytePairEncoder; | ||
this.stringDecoder = new Map(); | ||
// size without array holes (which may be present in the encoder) | ||
this.bytePairEncoderSize = Object.keys(bytePairEncoder).length; | ||
const binaryLookup = []; | ||
// forEach skips array holes: | ||
bytePairEncoder.forEach((value, rank) => { | ||
if (typeof value === 'string') { | ||
this.stringDecoder.set(value, rank); | ||
return; | ||
} | ||
const byteArray = new Uint8Array(value); | ||
binaryLookup.push([byteArray, rank]); | ||
this.bytePairRanksDecoder.set(rank, byteArray); | ||
}); | ||
this.bytePairEncoderSortedLookup = binaryLookup.sort((a, b) => compareUint8Arrays(a[0], b[0])); | ||
this.specialTokensEncoder = specialTokenEncoder ?? new Map(); | ||
this.specialTokensDecoder = specialTokenEncoder | ||
? new Map([...specialTokenEncoder].map(([key, value]) => [ | ||
value, | ||
this.textEncoder.encode(key), | ||
])) | ||
? new Map([...specialTokenEncoder].map(([key, value]) => [value, key])) | ||
: new Map(); | ||
@@ -34,2 +47,60 @@ this.tokenSplitRegex = tokenSplitRegex; | ||
} | ||
getBpeRankFromString(key) { | ||
return this.stringDecoder.get(key); | ||
} | ||
getBpeRankFromStringOrThrow(key) { | ||
const value = this.getBpeRankFromString(key); | ||
if (value === undefined) { | ||
throw new Error(`The byte-pair encoding does not contain a value for: ${key}`); | ||
} | ||
return value; | ||
} | ||
getBpeRankFromBytes(key) { | ||
const keyAsString = tryConvertToString(key); | ||
if (keyAsString !== undefined) { | ||
return this.getBpeRankFromString(keyAsString); | ||
} | ||
// Perform binary search on the binary keys | ||
const index = this.binarySearch(key); | ||
if (index !== -1) { | ||
return this.bytePairEncoderSortedLookup[index][1]; | ||
} | ||
return undefined; | ||
} | ||
getBpeRankFromBytesOrThrow(key) { | ||
const value = this.getBpeRankFromBytes(key); | ||
if (value === undefined) { | ||
throw new Error(`The byte-pair encoding does not contain a value for: ${key.toString()}`); | ||
} | ||
return value; | ||
} | ||
// Binary search on the binary keys | ||
binarySearch(key) { | ||
let low = 0; | ||
let high = this.bytePairEncoderSortedLookup.length - 1; | ||
while (low <= high) { | ||
// eslint-disable-next-line no-bitwise | ||
const mid = (low + high) >>> 1; | ||
const midKey = this.bytePairEncoderSortedLookup[mid][0]; | ||
let cmp = 0; | ||
for (let i = 0; i < Math.min(midKey.length, key.length); i++) { | ||
cmp = midKey[i] - key[i]; | ||
if (cmp !== 0) | ||
break; | ||
} | ||
if (cmp === 0) { | ||
cmp = midKey.length - key.length; | ||
} | ||
if (cmp === 0) { | ||
return mid; | ||
} | ||
if (cmp < 0) { | ||
low = mid + 1; | ||
} | ||
else { | ||
high = mid - 1; | ||
} | ||
} | ||
return -1; | ||
} | ||
*encodeNative(text, allowedSpecial) { | ||
@@ -45,11 +116,9 @@ let startIndex = 0; | ||
for (const [match] of textSegment.matchAll(this.tokenSplitRegex)) { | ||
const encodedPiece = this.textEncoder.encode(match); | ||
const token = this.encoder.get(encodedPiece); | ||
const token = this.getBpeRankFromString(match); | ||
if (token !== undefined) { | ||
lastTokenLength = 1; | ||
yield [token]; | ||
// eslint-disable-next-line no-continue | ||
continue; | ||
} | ||
const tokens = this.bytePairEncode(encodedPiece, this.encoder); | ||
const tokens = this.bytePairEncode(match); | ||
lastTokenLength = tokens.length; | ||
@@ -83,3 +152,3 @@ yield tokens; | ||
const [specialToken] = nextSpecialMatch; | ||
if (allowedSpecial.has(specialToken)) { | ||
if (allowedSpecial?.has(specialToken)) { | ||
return nextSpecialMatch.index + searchIndex; | ||
@@ -100,41 +169,69 @@ } | ||
for await (const token of tokens) { | ||
const tokenBytes = this.tryDecodeToken(token); | ||
if (tokenBytes) { | ||
yield tokenBytes; | ||
const tokenBytesOrString = this.tryDecodeToken(token); | ||
if (tokenBytesOrString) { | ||
yield tokenBytesOrString; | ||
} | ||
} | ||
} | ||
tryDecodeToken(token) { | ||
return this.decoder.get(token) ?? this.specialTokensDecoder.get(token); | ||
tryDecodeToken(tokenRank) { | ||
const value = this.bytePairEncoder[tokenRank]; | ||
if (typeof value === 'string') { | ||
return value; | ||
} | ||
if (typeof value === 'object') { | ||
const fromBinary = this.bytePairRanksDecoder.get(tokenRank); | ||
if (fromBinary) { | ||
return fromBinary; | ||
} | ||
} | ||
return this.specialTokensDecoder.get(tokenRank); | ||
} | ||
bytePairEncode(inputBytes, bytePairRanks) { | ||
if (inputBytes.length === 1) { | ||
return [bytePairRanks.getOrThrow(inputBytes)]; | ||
bytePairEncode(input) { | ||
if (input.length === 1 && isAscii(input.codePointAt(0))) { | ||
return [this.getBpeRankFromStringOrThrow(input)]; | ||
} | ||
return this.bytePairMerge(inputBytes, bytePairRanks, (pair) => { | ||
const key = inputBytes.slice(pair.start, pair.end); | ||
return bytePairRanks.getOrThrow(key); | ||
const inputBytes = this.textEncoder.encode(input); | ||
return this.bytePairMerge(inputBytes, (start, end) => { | ||
const key = inputBytes.subarray(start, end); | ||
return this.getBpeRankFromBytesOrThrow(key); | ||
}); | ||
} | ||
bytePairMerge(piece, bytePairRanks, transform) { | ||
bytePairMerge( | ||
// Input array of bytes to process | ||
piece, | ||
// Function to apply to each final segment after merging | ||
getByteForRange) { | ||
// Create an array of partition objects. Each partition tracks the start index in 'piece' | ||
// and a rank value for adjacent pairs (initially set to positive infinity). | ||
const partitions = Array.from({ length: piece.length + 1 }, (_, i) => ({ | ||
start: i, | ||
rank: Number.POSITIVE_INFINITY, | ||
rank: Number.POSITIVE_INFINITY, // Rank starts at infinity (unmerged) | ||
})); | ||
// Helper function to get the rank of a byte pair starting at 'startIndex'. | ||
// 'skip' determines how far we look ahead (usually 0, for consecutive pairs). | ||
const getRank = (startIndex, skip) => { | ||
if (startIndex + skip + 2 >= partitions.length) { | ||
// Avoid out-of-bounds errors, return undefined when no valid pair exists | ||
return undefined; | ||
} | ||
const key = piece.slice(partitions[startIndex].start, partitions[startIndex + skip + 2].start); | ||
return bytePairRanks.get(key); | ||
// Get the byte pair by extracting a subarray starting at 'startIndex' and ending at | ||
// the start of the partition after 'skip + 2'. | ||
const key = piece.subarray(partitions[startIndex].start, partitions[startIndex + skip + 2].start); | ||
// Retrieve the rank of this byte pair from the BPE rank function | ||
return this.getBpeRankFromBytes(key); | ||
}; | ||
// Initialize the ranks for all adjacent pairs in the array | ||
for (let i = 0; i < partitions.length - 2; i++) { | ||
// Get the rank for the pair starting at index 'i' | ||
const rank = getRank(i, 0); | ||
if (rank !== undefined) { | ||
// Assign the rank to the partition at index 'i' | ||
partitions[i].rank = rank; | ||
} | ||
} | ||
// Iteratively merge byte pairs until no more useful merges can be done | ||
while (partitions.length > 1) { | ||
let minRank = Number.POSITIVE_INFINITY; | ||
let minRankIdx = 0; | ||
// Find the partition with the minimum rank, i.e., the most important pair to merge next | ||
let i = 0; | ||
@@ -148,7 +245,10 @@ for (const partition of partitions) { | ||
} | ||
// If no valid pair is left to merge, exit the loop | ||
if (minRank === Number.POSITIVE_INFINITY) { | ||
break; | ||
} | ||
// Update the rank of the partition after the merged one | ||
partitions[minRankIdx].rank = | ||
getRank(minRankIdx, 1) ?? Number.POSITIVE_INFINITY; | ||
// Update the rank of the partition before the merged one (if exists) | ||
if (minRankIdx > 0) { | ||
@@ -158,10 +258,13 @@ partitions[minRankIdx - 1].rank = | ||
} | ||
// Merge by removing the partition after the one we just merged | ||
partitions.splice(minRankIdx + 1, 1); | ||
} | ||
// Create the final output by applying the transform function to each partitioned range | ||
const output = []; | ||
for (let i = 0; i < partitions.length - 1; i++) { | ||
output.push(transform({ | ||
start: partitions[i].start, | ||
end: partitions[i + 1].start, | ||
})); | ||
output.push(getByteForRange( | ||
// start index | ||
partitions[i].start, | ||
// end index | ||
partitions[i + 1].start)); | ||
} | ||
@@ -168,0 +271,0 @@ return output; |
@@ -15,2 +15,3 @@ import * as fs from 'fs/promises'; | ||
.replace(`getEncodingApi('cl100k_base'`, `getEncodingApiForModel('${modelName}'`) | ||
.replace('\nconst api =', '// prettier-ignore\nconst api =') | ||
.replaceAll(`cl100k_base.js`, `${encoding}.js`) | ||
@@ -17,0 +18,0 @@ : `// eslint-disable-next-line no-restricted-exports, import/no-default-export\nexport { default } from '../encoding/${encoding}.js'\nexport * from '../encoding/${encoding}.js'\n`; |
@@ -0,4 +1,6 @@ | ||
/* eslint-disable no-console */ | ||
import * as fs from 'fs/promises'; | ||
import * as path from 'path'; | ||
import { fileURLToPath } from 'url'; | ||
const DEBUG = process.env.DEBUG === 'true'; | ||
const processFilesInDirectory = async (directoryPath, fn) => { | ||
@@ -17,3 +19,2 @@ try { | ||
catch (error) { | ||
// eslint-disable-next-line no-console | ||
console.error('An error occurred:', error); | ||
@@ -24,2 +25,18 @@ } | ||
const __dirname = path.dirname(fileURLToPath(import.meta.url)); | ||
// Fatal decoder: throws on malformed UTF-8 instead of emitting U+FFFD.
const textDecoder = new TextDecoder('utf8', { fatal: true });
const textEncoder = new TextEncoder();
// Decode bytes as UTF-8, returning undefined when decoding throws or when
// the round-trip (decode then re-encode) yields a different byte length —
// e.g. when TextDecoder strips a leading BOM.
function safeDecodeUtf8(bytes) {
    let decoded;
    try {
        decoded = textDecoder.decode(bytes);
    }
    catch {
        return undefined;
    }
    const roundTripped = textEncoder.encode(decoded);
    if (roundTripped.byteLength !== bytes.byteLength) {
        console.log('Mismatch:', new Uint8Array(bytes), roundTripped);
        return undefined;
    }
    return decoded;
}
await processFilesInDirectory(path.join(__dirname, '../../data'), async (filePath) => { | ||
@@ -29,2 +46,3 @@ if (!filePath.endsWith('.tiktoken')) | ||
const modelName = path.basename(filePath, '.tiktoken'); | ||
console.log(`Processing ${modelName}`); | ||
const bpeFile = await fs.readFile(filePath, 'utf8'); | ||
@@ -34,9 +52,22 @@ const lines = bpeFile.split('\n'); | ||
const [token, rank] = x.split(' '); | ||
return [token, Number.parseInt(rank, 10)]; | ||
if (!token || token.length === 0 || !rank || rank.length === 0) { | ||
throw new Error(`Invalid token encoding: ${x}`); | ||
} | ||
const tokenArray = Buffer.from(token, 'base64'); | ||
return [tokenArray, Number.parseInt(rank, 10)]; | ||
}); | ||
const jsCodeBpeArray = encoder.reduce((acc, [token, rank]) => { | ||
const decoded = safeDecodeUtf8(token) ?? token; | ||
return { | ||
string: `${acc.string}${','.repeat(rank - acc.lastRank)}${DEBUG ? `\n/** ${rank} = */` : ''}${typeof decoded === 'string' | ||
? JSON.stringify(decoded) | ||
: `[${token.join(',')}]`}`, | ||
lastRank: rank, | ||
}; | ||
}, { string: '', lastRank: 0 }).string; | ||
const firstTokenRank = encoder[0]?.[1] ?? 0; | ||
await fs.mkdir(path.join(__dirname, '../encodings'), { recursive: true }); | ||
await fs.writeFile(path.join(__dirname, `../encodings/${modelName}.js`), `/* eslint-disable */\n// @ts-nocheck\n// prettier-ignore\n/** @type {[string, number][]} */\nconst encoder = ${JSON.stringify(encoder)};\nexport default encoder;`); | ||
// eslint-disable-next-line no-console | ||
await fs.writeFile(path.join(__dirname, `../encodings/${modelName}.js`), `/* eslint-disable */\n// @ts-nocheck\n// prettier-ignore\n/** @type {(string | number[])[]} */\nconst encoder = [${','.repeat(firstTokenRank)}${jsCodeBpeArray}];\nexport default encoder;`); | ||
console.log(`Wrote ${modelName}.js`); | ||
}); | ||
//# sourceMappingURL=generateJsEncodings.js.map |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApi('cl100k_base', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
const api = GptEncoding.getEncodingApi('cl100k_base', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +7,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/o200k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApi('o200k_base', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
const api = GptEncoding.getEncodingApi('o200k_base', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +7,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/p50k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApi('p50k_base', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
const api = GptEncoding.getEncodingApi('p50k_base', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, } = api; | ||
@@ -8,0 +7,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/p50k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApi('p50k_edit', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
const api = GptEncoding.getEncodingApi('p50k_edit', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, } = api; | ||
@@ -8,0 +7,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/r50k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApi('r50k_base', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
const api = GptEncoding.getEncodingApi('r50k_base', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, } = api; | ||
@@ -8,0 +7,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, }; |
export default encoder; | ||
/** @type {[string, number][]} */ | ||
declare const encoder: [string, number][]; | ||
/** @type {(string | number[])[]} */ | ||
declare const encoder: (string | number[])[]; |
export default encoder; | ||
/** @type {[string, number][]} */ | ||
declare const encoder: [string, number][]; | ||
/** @type {(string | number[])[]} */ | ||
declare const encoder: (string | number[])[]; |
export default encoder; | ||
/** @type {[string, number][]} */ | ||
declare const encoder: [string, number][]; | ||
/** @type {(string | number[])[]} */ | ||
declare const encoder: (string | number[])[]; |
export default encoder; | ||
/** @type {[string, number][]} */ | ||
declare const encoder: [string, number][]; | ||
/** @type {(string | number[])[]} */ | ||
declare const encoder: (string | number[])[]; |
@@ -22,6 +22,8 @@ import { type EncodingName, type ModelName } from './mapping.js'; | ||
static FimSuffix: string; | ||
decoder: TextDecoder; | ||
modelName?: ModelName; | ||
private decoder; | ||
private bytePairEncodingCoreProcessor; | ||
private specialTokenMapping; | ||
private specialTokensSet; | ||
private allSpecialTokenRegex; | ||
private constructor(); | ||
@@ -32,3 +34,3 @@ static getEncodingApi(encodingName: EncodingName, getMergeableRanks: GetMergeableRanksFn): GptEncoding; | ||
static getEncodingApiForModelAsync(modelName: ModelName, getMergeableRanks: GetMergeableRanksAsyncFn): Promise<GptEncoding>; | ||
encodeGenerator(lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: EncodeOptions): Generator<number[], number, undefined>; | ||
encodeGenerator(lineToEncode: string, { allowedSpecial, disallowedSpecial }?: EncodeOptions): Generator<number[], number, undefined>; | ||
encode(lineToEncode: string, encodeOptions?: EncodeOptions): number[]; | ||
@@ -42,3 +44,3 @@ /** | ||
*/ | ||
encodeChatGenerator(chat: Iterable<ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined): Generator<number[], void, undefined>; | ||
encodeChatGenerator(chat: Iterable<ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined): Generator<number[], void, undefined>; | ||
/** | ||
@@ -51,3 +53,3 @@ * Encodes a chat into a single array of tokens. | ||
*/ | ||
encodeChat(chat: readonly ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined): number[]; | ||
encodeChat(chat: readonly ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined): number[]; | ||
/** | ||
@@ -54,0 +56,0 @@ * @returns {false | number} false if token limit is exceeded, otherwise the number of tokens |
@@ -15,11 +15,21 @@ /* eslint-disable no-param-reassign */ | ||
static FimSuffix = FimSuffix; | ||
modelName; | ||
decoder = new TextDecoder('utf8'); | ||
modelName; | ||
bytePairEncodingCoreProcessor; | ||
specialTokenMapping; | ||
constructor({ tokenSplitRegex, mergeableBytePairRanks, specialTokenMapping, expectedVocabularySize, modelName, }) { | ||
const maxTokenValue = Math.max(getMaxValueFromMap(mergeableBytePairRanks), getMaxValueFromMap(specialTokenMapping)); | ||
specialTokensSet; | ||
allSpecialTokenRegex; | ||
constructor({ mergeableBytePairRanks, specialTokenMapping, expectedVocabularySize, modelName, ...rest }) { | ||
this.specialTokenMapping = specialTokenMapping; | ||
this.specialTokensSet = new Set(this.specialTokenMapping.keys()); | ||
this.allSpecialTokenRegex = getSpecialTokenRegex(this.specialTokensSet); | ||
this.bytePairEncodingCoreProcessor = new BytePairEncodingCore({ | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
...rest, | ||
}); | ||
const maxTokenValue = Math.max(mergeableBytePairRanks.length - 1, getMaxValueFromMap(specialTokenMapping)); | ||
if (expectedVocabularySize !== undefined) { | ||
if (mergeableBytePairRanks.size + specialTokenMapping.size !== | ||
if (this.bytePairEncodingCoreProcessor.bytePairEncoderSize + | ||
specialTokenMapping.size !== | ||
expectedVocabularySize) { | ||
@@ -29,10 +39,5 @@ throw new Error('The number of mergeable tokens and special tokens must be equal to explicit_n_vocab.'); | ||
if (maxTokenValue !== expectedVocabularySize - 1) { | ||
throw new Error('The maximum token value must be equal to explicit_n_vocab - 1.'); | ||
throw new Error(`The model encodings are invalid. The maximum token value must be equal to expectedVocabularySize - 1. Currently ${maxTokenValue}, expected ${expectedVocabularySize - 1}`); | ||
} | ||
} | ||
this.bytePairEncodingCoreProcessor = new BytePairEncodingCore({ | ||
bytePairEncoder: mergeableBytePairRanks, | ||
specialTokenEncoder: specialTokenMapping, | ||
tokenSplitRegex, | ||
}); | ||
this.encode = this.encode.bind(this); | ||
@@ -66,14 +71,20 @@ this.decode = this.decode.bind(this); | ||
} | ||
encodeGenerator(lineToEncode, { allowedSpecial = new Set(), disallowedSpecial = new Set([ALL_SPECIAL_TOKENS]), } = {}) { | ||
const specialTokensSet = new Set(this.specialTokenMapping.keys()); | ||
if (disallowedSpecial.has(ALL_SPECIAL_TOKENS)) { | ||
disallowedSpecial = new Set(specialTokensSet); | ||
allowedSpecial.forEach((val) => disallowedSpecial.delete(val)); | ||
disallowedSpecial.forEach((val) => allowedSpecial.delete(val)); | ||
encodeGenerator(lineToEncode, { allowedSpecial, disallowedSpecial } = {}) { | ||
let regexPattern; | ||
if (allowedSpecial?.has(ALL_SPECIAL_TOKENS)) { | ||
allowedSpecial = new Set(this.specialTokensSet); | ||
} | ||
if (allowedSpecial.has(ALL_SPECIAL_TOKENS)) { | ||
allowedSpecial = specialTokensSet; | ||
if (!disallowedSpecial || disallowedSpecial.has(ALL_SPECIAL_TOKENS)) { | ||
// by default, all special tokens are disallowed | ||
disallowedSpecial = new Set(this.specialTokensSet); | ||
if (allowedSpecial?.size) { | ||
allowedSpecial.forEach((val) => disallowedSpecial.delete(val)); | ||
disallowedSpecial.forEach((val) => allowedSpecial.delete(val)); | ||
regexPattern = getSpecialTokenRegex(disallowedSpecial); | ||
} | ||
else { | ||
regexPattern = this.allSpecialTokenRegex; | ||
} | ||
} | ||
if (disallowedSpecial.size > 0) { | ||
const regexPattern = getSpecialTokenRegex(disallowedSpecial); | ||
if (regexPattern) { | ||
const match = lineToEncode.match(regexPattern); | ||
@@ -165,3 +176,6 @@ if (match !== null) { | ||
for (const decodedPart of decodedByteGenerator) { | ||
buffer += this.decoder.decode(decodedPart, { stream: true }); | ||
buffer += | ||
typeof decodedPart === 'string' | ||
? decodedPart | ||
: this.decoder.decode(decodedPart, { stream: true }); | ||
if (buffer.length === 0 || endsWithIncompleteUtfPairSurrogate(buffer)) { | ||
@@ -187,3 +201,6 @@ // Keep the high surrogate in the buffer and continue with the next token | ||
for await (const decodedPart of decodedByteGenerator) { | ||
buffer += this.decoder.decode(decodedPart, { stream: true }); | ||
buffer += | ||
typeof decodedPart === 'string' | ||
? decodedPart | ||
: this.decoder.decode(decodedPart, { stream: true }); | ||
if (buffer.length === 0 || endsWithIncompleteUtfPairSurrogate(buffer)) { | ||
@@ -190,0 +207,0 @@ // Keep the high surrogate in the buffer and continue with the next token |
@@ -182,3 +182,3 @@ import fs from 'fs'; | ||
? 127 | ||
: modelName === 'gpt-4o' | ||
: modelName.startsWith('gpt-4o') | ||
? 120 | ||
@@ -185,0 +185,0 @@ : 121; |
@@ -7,12 +7,31 @@ export declare const cl100k_base = "cl100k_base"; | ||
export declare const encodingNames: readonly ["cl100k_base", "p50k_base", "r50k_base", "p50k_edit", "o200k_base"]; | ||
export declare const modelToEncodingMap: { | ||
declare const chatEnabledModelsMap: { | ||
readonly 'gpt-4': "cl100k_base"; | ||
readonly 'gpt-4-0314': "cl100k_base"; | ||
readonly 'gpt-4-0613': "cl100k_base"; | ||
readonly 'gpt-4-32k': "cl100k_base"; | ||
readonly 'gpt-4-0314': "cl100k_base"; | ||
readonly 'gpt-4-32k-0314': "cl100k_base"; | ||
readonly 'gpt-4-32k-0613': "cl100k_base"; | ||
readonly 'gpt-4-turbo': "cl100k_base"; | ||
readonly 'gpt-4-turbo-2024-04-09': "cl100k_base"; | ||
readonly 'gpt-4-turbo-preview': "cl100k_base"; | ||
readonly 'gpt-4-1106-preview': "cl100k_base"; | ||
readonly 'gpt-4-0125-preview': "cl100k_base"; | ||
readonly 'gpt-4-vision-preview': "cl100k_base"; | ||
readonly 'gpt-4o': "o200k_base"; | ||
readonly 'gpt-4o-2024-05-13': "o200k_base"; | ||
readonly 'gpt-4o-2024-08-06': "o200k_base"; | ||
readonly 'gpt-4o-mini-2024-07-18': "o200k_base"; | ||
readonly 'gpt-4o-mini': "o200k_base"; | ||
readonly 'gpt-3.5-turbo': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0301': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0613': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-1106': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0125': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-16k': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-16k-0613': "cl100k_base"; | ||
readonly 'gpt-4o': "o200k_base"; | ||
readonly 'gpt-3.5-turbo-instruct': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-instruct-0914': "cl100k_base"; | ||
}; | ||
export declare const modelToEncodingMap: { | ||
readonly 'text-davinci-003': "p50k_base"; | ||
@@ -37,2 +56,4 @@ readonly 'text-davinci-002': "p50k_base"; | ||
readonly 'text-embedding-ada-002': "cl100k_base"; | ||
readonly 'text-embedding-3-small': "cl100k_base"; | ||
readonly 'text-embedding-3-large': "cl100k_base"; | ||
readonly 'text-similarity-davinci-001': "r50k_base"; | ||
@@ -48,2 +69,28 @@ readonly 'text-similarity-curie-001': "r50k_base"; | ||
readonly 'code-search-ada-code-001': "r50k_base"; | ||
readonly 'gpt-4': "cl100k_base"; | ||
readonly 'gpt-4-0314': "cl100k_base"; | ||
readonly 'gpt-4-0613': "cl100k_base"; | ||
readonly 'gpt-4-32k': "cl100k_base"; | ||
readonly 'gpt-4-32k-0314': "cl100k_base"; | ||
readonly 'gpt-4-32k-0613': "cl100k_base"; | ||
readonly 'gpt-4-turbo': "cl100k_base"; | ||
readonly 'gpt-4-turbo-2024-04-09': "cl100k_base"; | ||
readonly 'gpt-4-turbo-preview': "cl100k_base"; | ||
readonly 'gpt-4-1106-preview': "cl100k_base"; | ||
readonly 'gpt-4-0125-preview': "cl100k_base"; | ||
readonly 'gpt-4-vision-preview': "cl100k_base"; | ||
readonly 'gpt-4o': "o200k_base"; | ||
readonly 'gpt-4o-2024-05-13': "o200k_base"; | ||
readonly 'gpt-4o-2024-08-06': "o200k_base"; | ||
readonly 'gpt-4o-mini-2024-07-18': "o200k_base"; | ||
readonly 'gpt-4o-mini': "o200k_base"; | ||
readonly 'gpt-3.5-turbo': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0301': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0613': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-1106': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-0125': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-16k': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-16k-0613': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-instruct': "cl100k_base"; | ||
readonly 'gpt-3.5-turbo-instruct-0914': "cl100k_base"; | ||
}; | ||
@@ -54,44 +101,7 @@ export interface ChatParameters { | ||
} | ||
declare const internalChatModelParams: { | ||
'gpt-3.5-turbo': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-3.5-turbo-0301': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-3.5-turbo-0613': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-3.5-turbo-16k-0613': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4-0314': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4-32k': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4-32k-0314': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
'gpt-4o': { | ||
messageSeparator: string; | ||
roleSeparator: string; | ||
}; | ||
}; | ||
export declare const chatModelParams: Partial<Record<ModelName, ChatParameters>>; | ||
export type ModelName = keyof typeof modelToEncodingMap; | ||
export type ChatModelName = keyof typeof internalChatModelParams; | ||
export type ChatModelName = keyof typeof chatEnabledModelsMap; | ||
export type EncodingName = (typeof modelToEncodingMap)[ModelName]; | ||
export declare const chatModelParams: Record<ChatModelName, ChatParameters>; | ||
export declare const chatEnabledModels: ChatModelName[]; | ||
export {}; |
@@ -15,13 +15,33 @@ /* eslint-disable camelcase */ | ||
]; | ||
export const modelToEncodingMap = { | ||
// chat | ||
const chatEnabledModelsMap = { | ||
'gpt-4': cl100k_base, | ||
'gpt-4-0314': cl100k_base, | ||
'gpt-4-0613': cl100k_base, | ||
'gpt-4-32k': cl100k_base, | ||
'gpt-4-0314': cl100k_base, | ||
'gpt-4-32k-0314': cl100k_base, | ||
'gpt-4-32k-0613': cl100k_base, | ||
'gpt-4-turbo': cl100k_base, | ||
'gpt-4-turbo-2024-04-09': cl100k_base, | ||
'gpt-4-turbo-preview': cl100k_base, | ||
'gpt-4-1106-preview': cl100k_base, | ||
'gpt-4-0125-preview': cl100k_base, | ||
'gpt-4-vision-preview': cl100k_base, | ||
'gpt-4o': o200k_base, | ||
'gpt-4o-2024-05-13': o200k_base, | ||
'gpt-4o-2024-08-06': o200k_base, | ||
'gpt-4o-mini-2024-07-18': o200k_base, | ||
'gpt-4o-mini': o200k_base, | ||
'gpt-3.5-turbo': cl100k_base, | ||
'gpt-3.5-turbo-0301': cl100k_base, | ||
'gpt-3.5-turbo-0613': cl100k_base, | ||
'gpt-3.5-turbo-1106': cl100k_base, | ||
'gpt-3.5-turbo-0125': cl100k_base, | ||
'gpt-3.5-turbo-16k': cl100k_base, | ||
'gpt-3.5-turbo-16k-0613': cl100k_base, | ||
'gpt-4o': o200k_base, | ||
'gpt-3.5-turbo-instruct': cl100k_base, | ||
'gpt-3.5-turbo-instruct-0914': cl100k_base, | ||
}; | ||
export const modelToEncodingMap = { | ||
// chat | ||
...chatEnabledModelsMap, | ||
// text | ||
@@ -50,2 +70,4 @@ 'text-davinci-003': p50k_base, | ||
'text-embedding-ada-002': cl100k_base, | ||
'text-embedding-3-small': cl100k_base, | ||
'text-embedding-3-large': cl100k_base, | ||
// old embeddings | ||
@@ -63,41 +85,16 @@ 'text-similarity-davinci-001': r50k_base, | ||
}; | ||
const internalChatModelParams = { | ||
'gpt-3.5-turbo': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-0301': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-0613': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-16k-0613': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-4': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
'gpt-4-0314': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
'gpt-4-32k': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
'gpt-4-32k-0314': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
'gpt-4o': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
const gpt3params = { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}; | ||
export const chatModelParams = internalChatModelParams; | ||
const gpt4params = { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}; | ||
export const chatModelParams = Object.fromEntries(Object.keys(chatEnabledModelsMap).flatMap((modelName) => modelName.startsWith('gpt-4') | ||
? [[modelName, gpt4params]] | ||
: modelName.startsWith('gpt-3.5-turbo') | ||
? [[modelName, gpt3params]] | ||
: [])); | ||
export const chatEnabledModels = Object.keys(chatEnabledModelsMap); | ||
//# sourceMappingURL=mapping.js.map |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0301', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0301', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0613', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0613', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-16k-0613', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-16k-0613', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-0314', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-0314', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-32k-0314', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-32k-0314', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-32k', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-32k', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/cl100k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
declare const api: GptEncoding; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial, }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "gpt-4" | "gpt-4-32k" | "gpt-4-0314" | "gpt-4-32k-0314" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-16k-0613" | "gpt-4o" | "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | 
"code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | undefined) => Generator<number[], void, undefined>; | ||
declare const decode: (inputTokensToDecode: Iterable<number>) => string, decodeAsyncGenerator: (inputTokensToDecode: AsyncIterable<number>) => AsyncGenerator<string, void>, decodeGenerator: (inputTokensToDecode: Iterable<number>) => Generator<string, void>, encode: (lineToEncode: string, encodeOptions?: import("../GptEncoding.js").EncodeOptions) => number[], encodeGenerator: (lineToEncode: string, { allowedSpecial, disallowedSpecial }?: import("../GptEncoding.js").EncodeOptions) => Generator<number[], number, undefined>, isWithinTokenLimit: (input: string | Iterable<import("../GptEncoding.js").ChatMessage>, tokenLimit: number) => false | number, encodeChat: (chat: readonly import("../GptEncoding.js").ChatMessage[], model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | 
undefined) => number[], encodeChatGenerator: (chat: Iterable<import("../GptEncoding.js").ChatMessage>, model?: "text-davinci-003" | "text-davinci-002" | "text-davinci-001" | "text-curie-001" | "text-babbage-001" | "text-ada-001" | "davinci" | "curie" | "babbage" | "ada" | "code-davinci-002" | "code-davinci-001" | "code-cushman-002" | "code-cushman-001" | "davinci-codex" | "cushman-codex" | "text-davinci-edit-001" | "code-davinci-edit-001" | "text-embedding-ada-002" | "text-embedding-3-small" | "text-embedding-3-large" | "text-similarity-davinci-001" | "text-similarity-curie-001" | "text-similarity-babbage-001" | "text-similarity-ada-001" | "text-search-davinci-doc-001" | "text-search-curie-doc-001" | "text-search-babbage-doc-001" | "text-search-ada-doc-001" | "code-search-babbage-code-001" | "code-search-ada-code-001" | "gpt-4" | "gpt-4-0314" | "gpt-4-0613" | "gpt-4-32k" | "gpt-4-32k-0314" | "gpt-4-32k-0613" | "gpt-4-turbo" | "gpt-4-turbo-2024-04-09" | "gpt-4-turbo-preview" | "gpt-4-1106-preview" | "gpt-4-0125-preview" | "gpt-4-vision-preview" | "gpt-4o" | "gpt-4o-2024-05-13" | "gpt-4o-2024-08-06" | "gpt-4o-mini-2024-07-18" | "gpt-4o-mini" | "gpt-3.5-turbo" | "gpt-3.5-turbo-0301" | "gpt-3.5-turbo-0613" | "gpt-3.5-turbo-1106" | "gpt-3.5-turbo-0125" | "gpt-3.5-turbo-16k" | "gpt-3.5-turbo-16k-0613" | "gpt-3.5-turbo-instruct" | "gpt-3.5-turbo-instruct-0914" | undefined) => Generator<number[], void, undefined>; | ||
export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; | ||
export default api; |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js'; | ||
import encoder from '../encodings/o200k_base.js'; | ||
import { GptEncoding } from '../GptEncoding.js'; | ||
export * from '../specialTokens.js'; | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4o', () => convertTokenBytePairEncodingFromTuples(encoder)); | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4o', () => encoder); | ||
const { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeGenerator, isWithinTokenLimit, encodeChat, encodeChatGenerator, } = api; | ||
@@ -8,0 +8,0 @@ export { decode, decodeAsyncGenerator, decodeGenerator, encode, encodeChat, encodeChatGenerator, encodeGenerator, isWithinTokenLimit, }; |
@@ -1,4 +0,4 @@ | ||
import { EncoderMap } from './EncoderMap.js'; | ||
import type { BytePairEncodingConfig, RawBytePairRanks } from './BytePairEncodingCore.js'; | ||
import type { EncodingName, ModelName } from './mapping.js'; | ||
export interface EncodingParams { | ||
export interface EncodingParams extends BytePairEncodingConfig { | ||
/** | ||
@@ -16,9 +16,11 @@ * The expected total number of tokens in the vocabulary, including both regular and special tokens. | ||
tokenSplitRegex: RegExp; | ||
mergeableBytePairRanks: EncoderMap; | ||
specialTokenMapping: Map<string, number>; | ||
modelName?: ModelName; | ||
/** increases memory consumption, but speeds up subsequent decoding */ | ||
enableCache?: boolean; | ||
} | ||
export type GetMergeableRanksFn = (encodingName: EncodingName) => EncoderMap; | ||
export type GetMergeableRanksAsyncFn = (encodingName: EncodingName) => Promise<EncoderMap>; | ||
export declare const tokenSplitRegex: RegExp; | ||
export type GetMergeableRanksFn = (encodingName: EncodingName) => RawBytePairRanks; | ||
export type GetMergeableRanksAsyncFn = (encodingName: EncodingName) => Promise<RawBytePairRanks>; | ||
export declare function getEncodingParams(encodingName: EncodingName, getMergeableRanks: GetMergeableRanksFn): EncodingParams; | ||
export declare function getModelParamsAsync(encodingName: EncodingName, getMergeableRanks: GetMergeableRanksAsyncFn): Promise<EncodingParams>; |
@@ -1,68 +0,7 @@ | ||
/* eslint-disable no-magic-numbers */ | ||
import { EncoderMap } from './EncoderMap.js'; | ||
import { EndOfPrompt, EndOfText, FimMiddle, FimPrefix, FimSuffix, ImEnd, ImSep, ImStart, } from './specialTokens.js'; | ||
const tokenSplitRegex = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu; | ||
function R50KBase(mergeableBytePairRanks) { | ||
return { | ||
expectedVocabularySize: 50_257, | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping: new Map([[EndOfText, 50_256]]), | ||
}; | ||
} | ||
function P50KBase(mergeableBytePairRanks) { | ||
return { | ||
expectedVocabularySize: 50_281, | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping: new Map([[EndOfText, 50_256]]), | ||
}; | ||
} | ||
function P50KEdit(mergeableBytePairRanks) { | ||
const specialTokenMapping = new Map([ | ||
[EndOfText, 50_256], | ||
[FimPrefix, 50_281], | ||
[FimMiddle, 50_282], | ||
[FimSuffix, 50_283], | ||
]); | ||
return { | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
}; | ||
} | ||
function Cl100KBase(mergeableBytePairRanks) { | ||
const specialTokenMapping = new Map([ | ||
[EndOfText, 100_257], | ||
[FimPrefix, 100_258], | ||
[FimMiddle, 100_259], | ||
[FimSuffix, 100_260], | ||
[ImStart, 100_264], | ||
[ImEnd, 100_265], | ||
[ImSep, 100_266], | ||
[EndOfPrompt, 100_276], | ||
]); | ||
return { | ||
tokenSplitRegex: /(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
}; | ||
} | ||
function O200KBase(mergeableBytePairRanks) { | ||
const specialTokenMapping = new Map([ | ||
[EndOfText, 199_999], | ||
[FimPrefix, 200_000], | ||
[FimMiddle, 200_001], | ||
[FimSuffix, 200_002], | ||
[ImStart, 200_003], | ||
[ImEnd, 200_004], | ||
[ImSep, 200_005], | ||
[EndOfPrompt, 200_006], | ||
]); | ||
return { | ||
tokenSplitRegex: /(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
}; | ||
} | ||
import { Cl100KBase } from './encodingParams/Cl100KBase.js'; | ||
import { O200KBase } from './encodingParams/O200KBase.js'; | ||
import { P50KBase } from './encodingParams/P50KBase.js'; | ||
import { P50KEdit } from './encodingParams/P50KEdit.js'; | ||
import { R50KBase } from './encodingParams/R50KBase.js'; | ||
export const tokenSplitRegex = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu; | ||
export function getEncodingParams(encodingName, getMergeableRanks) { | ||
@@ -69,0 +8,0 @@ const mergeableBytePairRanks = getMergeableRanks(encodingName); |
@@ -1,3 +0,3 @@ | ||
import type { EncoderMap } from './EncoderMap.js'; | ||
import type { RawBytePairRanks } from './BytePairEncodingCore.js'; | ||
import type { EncodingName } from './mapping.js'; | ||
export declare const resolveEncoding: (encoding: EncodingName) => EncoderMap; | ||
export declare const resolveEncoding: (encoding: EncodingName) => RawBytePairRanks; |
@@ -1,3 +0,1 @@ | ||
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from './convertTokenBytePairEncodingFromTuples.js'; | ||
import cl100k from './encodings/cl100k_base.js'; | ||
@@ -10,10 +8,10 @@ import o200k from './encodings/o200k_base.js'; | ||
case 'r50k_base': | ||
return convertTokenBytePairEncodingFromTuples(r50k); | ||
return r50k; | ||
case 'p50k_base': | ||
case 'p50k_edit': | ||
return convertTokenBytePairEncodingFromTuples(p50k); | ||
return p50k; | ||
case 'cl100k_base': | ||
return convertTokenBytePairEncodingFromTuples(cl100k); | ||
return cl100k; | ||
case 'o200k_base': | ||
return convertTokenBytePairEncodingFromTuples(o200k); | ||
return o200k; | ||
default: { | ||
@@ -20,0 +18,0 @@ throw new Error(`Unknown encoding name: ${encoding}`); |
@@ -1,3 +0,3 @@ | ||
import type { EncoderMap } from './EncoderMap.js'; | ||
import type { RawBytePairRanks } from './BytePairEncodingCore.js'; | ||
import type { EncodingName } from './mapping.js'; | ||
export declare const resolveEncodingAsync: (encoding: EncodingName) => Promise<EncoderMap>; | ||
export declare const resolveEncodingAsync: (encoding: EncodingName) => Promise<RawBytePairRanks>; |
@@ -1,14 +0,12 @@ | ||
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from './convertTokenBytePairEncodingFromTuples.js'; | ||
export const resolveEncodingAsync = async (encoding) => { | ||
switch (encoding) { | ||
case 'r50k_base': | ||
return convertTokenBytePairEncodingFromTuples(await import('./encodings/r50k_base.js').then(({ default: encodingTuples }) => encodingTuples)); | ||
return import('./encodings/r50k_base.js').then(({ default: rawBytePairRanks }) => rawBytePairRanks); | ||
case 'p50k_base': | ||
case 'p50k_edit': | ||
return convertTokenBytePairEncodingFromTuples(await import('./encodings/p50k_base.js').then(({ default: encodingTuples }) => encodingTuples)); | ||
return import('./encodings/p50k_base.js').then(({ default: rawBytePairRanks }) => rawBytePairRanks); | ||
case 'cl100k_base': | ||
return convertTokenBytePairEncodingFromTuples(await import('./encodings/cl100k_base.js').then(({ default: encodingTuples }) => encodingTuples)); | ||
return import('./encodings/cl100k_base.js').then(({ default: rawBytePairRanks }) => rawBytePairRanks); | ||
case 'o200k_base': | ||
return convertTokenBytePairEncodingFromTuples(await import('./encodings/o200k_base.js').then(({ default: encodingTuples }) => encodingTuples)); | ||
return import('./encodings/o200k_base.js').then(({ default: rawBytePairRanks }) => rawBytePairRanks); | ||
default: { | ||
@@ -15,0 +13,0 @@ throw new Error(`Unknown encoding name: ${encoding}`); |
@@ -0,1 +1,4 @@ | ||
export declare const isAscii: (codePoint: number) => boolean; | ||
export declare function endsWithIncompleteUtfPairSurrogate(string: string): boolean; | ||
export declare function tryConvertToString(arr: Uint8Array): string | undefined; | ||
export declare function compareUint8Arrays(a: Uint8Array, b: Uint8Array): number; |
@@ -0,1 +1,4 @@ | ||
/* eslint-disable no-bitwise */ | ||
/* eslint-disable no-magic-numbers */ | ||
export const isAscii = (codePoint) => codePoint <= 0x7f; | ||
const HIGH_SURROGATE_START = 55_296; | ||
@@ -11,2 +14,82 @@ const HIGH_SURROGATE_END = 56_319; | ||
} | ||
// Validates that `bytes` is a well-formed UTF-8 byte sequence, rejecting
// overlong encodings, UTF-16 surrogate halves, truncated sequences, and
// code points beyond U+10FFFF.
function isValidUTF8(bytes) {
    const total = bytes.length;
    let index = 0;
    while (index < total) {
        const lead = bytes[index];
        let continuationCount;
        let codePoint;
        // Classify the lead byte: it fixes the sequence length and the
        // initial payload bits of the code point.
        if (lead <= 0x7f) {
            // Single byte (ASCII)
            continuationCount = 0;
            codePoint = lead;
        }
        else if ((lead & 0xe0) === 0xc0) {
            // Two-byte sequence; leads 0xC0/0xC1 can only encode overlong forms
            if (lead <= 0xc1)
                return false;
            continuationCount = 1;
            codePoint = lead & 0x1f;
        }
        else if ((lead & 0xf0) === 0xe0) {
            // Three-byte sequence
            continuationCount = 2;
            codePoint = lead & 0x0f;
        }
        else if ((lead & 0xf8) === 0xf0) {
            // Four-byte sequence; leads above 0xF4 always exceed U+10FFFF
            if (lead > 0xf4)
                return false;
            continuationCount = 3;
            codePoint = lead & 0x07;
        }
        else {
            // Stray continuation byte (0x80-0xBF) or invalid lead (0xF8+)
            return false;
        }
        // The whole sequence must fit within the buffer
        if (index + continuationCount >= total)
            return false;
        // Every continuation byte must match 0b10xxxxxx; fold its 6 payload
        // bits into the accumulated code point.
        for (let offset = 1; offset <= continuationCount; offset++) {
            const continuation = bytes[index + offset];
            if (continuation === undefined || (continuation & 0xc0) !== 0x80)
                return false;
            codePoint = (codePoint << 6) | (continuation & 0x3f);
        }
        // Overlong check: the decoded value must need this many bytes
        if (continuationCount === 1 && codePoint < 0x80)
            return false;
        if (continuationCount === 2 && codePoint < 0x800)
            return false;
        if (continuationCount === 3 && codePoint < 0x10000)
            return false;
        // Surrogate halves (U+D800-U+DFFF) are not valid scalar values
        if (codePoint >= 0xd800 && codePoint <= 0xdfff)
            return false;
        // Anything beyond the Unicode range is invalid
        if (codePoint > 0x10ffff)
            return false;
        index += continuationCount + 1;
    }
    return true;
}
const textDecoder = new TextDecoder('utf8', { fatal: false });
// Decodes `arr` as UTF-8, or returns undefined when the bytes are not
// well-formed UTF-8. Validity is checked explicitly with isValidUTF8 rather
// than relying on the (non-fatal) decoder, which would silently substitute
// replacement characters instead of failing.
export function tryConvertToString(arr) {
    return isValidUTF8(arr) ? textDecoder.decode(arr) : undefined;
}
// Helper function to compare two Uint8Arrays lexicographically | ||
// Lexicographic comparison of two byte arrays: negative when a sorts before
// b, positive when after, zero when identical. When one array is a prefix
// of the other, the shorter array sorts first.
export function compareUint8Arrays(a, b) {
    const sharedLength = Math.min(a.length, b.length);
    let index = 0;
    while (index < sharedLength) {
        const delta = a[index] - b[index];
        if (delta !== 0) {
            return delta;
        }
        index += 1;
    }
    return a.length - b.length;
}
//# sourceMappingURL=utfUtil.js.map |
export declare function getMaxValueFromMap(map: Map<unknown, number>): number; | ||
export declare function escapeRegExp(string: string): string; | ||
export declare function getSpecialTokenRegex(tokens: Set<string>): RegExp; |
@@ -1,2 +0,1 @@ | ||
import { escapeRegExp } from './escapeRegExp.js'; | ||
export function getMaxValueFromMap(map) { | ||
@@ -9,2 +8,5 @@ let max = 0; | ||
} | ||
// Escapes regex metacharacters so `string` can be embedded verbatim inside
// a RegExp source (used by getSpecialTokenRegex to match literal tokens).
export function escapeRegExp(string) {
    return string.replace(/[$()*+.?[\\\]^{|}]/g, '\\$&'); // $& means the whole matched string
}
export function getSpecialTokenRegex(tokens) { | ||
@@ -11,0 +13,0 @@ const escapedTokens = [...tokens].map(escapeRegExp); |
{ | ||
"name": "gpt-tokenizer", | ||
"version": "2.2.3", | ||
"version": "2.3.0", | ||
"description": "A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models", | ||
@@ -80,3 +80,3 @@ "keywords": [ | ||
"build:cjs": "yarn rrun tsc --outDir cjs --module commonjs --target es2022 --project tsconfig-cjs.json", | ||
"build:esm": "yarn rrun tsc --outDir esm --module esnext --target es2022 && echo '{\"name\": \"gpt-tokenizer\", \"type\": \"module\"}' > ./esm/package.json", | ||
"build:esm": "mkdir -p esm && echo '{\"name\": \"gpt-tokenizer\", \"type\": \"module\"}' > ./esm/package.json && yarn rrun tsc --outDir esm --module esnext --target es2022", | ||
"build:umd": "yarn build:umd:cl100k_base && yarn build:umd:p50k_base && yarn build:umd:p50k_edit && yarn build:umd:r50k_base && yarn build:umd:o200k_base", | ||
@@ -130,6 +130,3 @@ "build:umd:cl100k_base": "beemo webpack --entry='./src/main.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_cl100k_base' --env 'filename=cl100k_base.js'", | ||
"access": "public" | ||
}, | ||
"dependencies": { | ||
"rfc4648": "^1.5.3" | ||
} | ||
} |
@@ -15,2 +15,3 @@ # gpt-tokenizer | ||
- Support for all current OpenAI models (available encodings: `r50k_base`, `p50k_base`, `p50k_edit`, `cl100k_base` and `o200k_base`) | ||
- Can be loaded and work synchronously! (i.e. in non async/await contexts) | ||
- Generator function versions of both the decoder and encoder functions | ||
@@ -49,7 +50,7 @@ - Provides the ability to decode an asynchronous stream of data (using `decodeAsyncGenerator` and `decodeGenerator` with any iterable input) | ||
- https://unpkg.com/gpt-tokenizer/dist/cl100k_base.js | ||
- https://unpkg.com/gpt-tokenizer/dist/o200k_base.js (for `gpt-4o`) | ||
- https://unpkg.com/gpt-tokenizer/dist/cl100k_base.js (for `gpt-4-*` and `gpt-3.5-turbo`) | ||
- https://unpkg.com/gpt-tokenizer/dist/p50k_base.js | ||
- https://unpkg.com/gpt-tokenizer/dist/p50k_edit.js | ||
- https://unpkg.com/gpt-tokenizer/dist/r50k_base.js | ||
- https://unpkg.com/gpt-tokenizer/dist/o200k_base.js | ||
@@ -136,3 +137,4 @@ The global name is a concatenation: `GPTTokenizer_${encoding}`. | ||
isWithinTokenLimit, | ||
} from 'gpt-tokenizer/model/text-davinci-003' | ||
// etc... | ||
} from 'gpt-tokenizer/model/gpt-3.5-turbo' | ||
``` | ||
@@ -147,62 +149,44 @@ | ||
isWithinTokenLimit, | ||
} from 'gpt-tokenizer/cjs/model/text-davinci-003' | ||
// etc... | ||
} from 'gpt-tokenizer/cjs/model/gpt-3.5-turbo' | ||
``` | ||
### Supported models and their encodings | ||
#### Lazy loading | ||
chat: | ||
If you don't mind loading the tokenizer asynchronously, you can use a dynamic import inside your function, like so: | ||
- `gpt-4-32k` (`cl100k_base`) | ||
- `gpt-4-0314` (`cl100k_base`) | ||
- `gpt-4-32k-0314` (`cl100k_base`) | ||
- `gpt-3.5-turbo` (`cl100k_base`) | ||
- `gpt-3.5-turbo-0301` (`cl100k_base`) | ||
- `gpt-4o` (`o200k_base`) | ||
```ts | ||
const { | ||
encode, | ||
decode, | ||
isWithinTokenLimit, | ||
// etc... | ||
} = await import('gpt-tokenizer/model/gpt-3.5-turbo') | ||
``` | ||
note: if you're using `gpt-3.5-*` or `gpt-4-*` and don't see the model you're looking for, use the `cl100k_base` encoding directly. | ||
#### Loading an encoding | ||
text-only: | ||
If your model isn't supported by the package, but you know which BPE encoding it uses, you can load the encoding directly, e.g.: | ||
```ts | ||
import { | ||
encode, | ||
decode, | ||
isWithinTokenLimit, | ||
// etc... | ||
} from 'gpt-tokenizer/encoding/cl100k_base' | ||
``` | ||
### Supported models and their encodings | ||
- `gpt-4o` (`o200k_base`) | ||
- `gpt-4-*` (`cl100k_base`) | ||
- `gpt-3.5-turbo` (`cl100k_base`) | ||
- `text-davinci-003` (`p50k_base`) | ||
- `text-davinci-002` (`p50k_base`) | ||
- `text-davinci-001` (`r50k_base`) | ||
- `text-curie-001` (`r50k_base`) | ||
- `text-babbage-001` (`r50k_base`) | ||
- `text-ada-001` (`r50k_base`) | ||
- `davinci` (`r50k_base`) | ||
- `curie` (`r50k_base`) | ||
- `babbage` (`r50k_base`) | ||
- `ada` (`r50k_base`) | ||
- ...and many other models, see [mapping](./src/mapping.ts) for an up-to-date list of supported models and their encodings. | ||
code: | ||
Note: if you're using `gpt-3.5-*` or `gpt-4-*` and don't see the model you're looking for, use the `cl100k_base` encoding directly. | ||
- `code-davinci-002` (`p50k_base`) | ||
- `code-davinci-001` (`p50k_base`) | ||
- `code-cushman-002` (`p50k_base`) | ||
- `code-cushman-001` (`p50k_base`) | ||
- `davinci-codex` (`p50k_base`) | ||
- `cushman-codex` (`p50k_base`) | ||
edit: | ||
- `text-davinci-edit-001` (`p50k_edit`) | ||
- `code-davinci-edit-001` (`p50k_edit`) | ||
embeddings: | ||
- `text-embedding-ada-002` (`cl100k_base`) | ||
old embeddings: | ||
- `text-similarity-davinci-001` (`r50k_base`) | ||
- `text-similarity-curie-001` (`r50k_base`) | ||
- `text-similarity-babbage-001` (`r50k_base`) | ||
- `text-similarity-ada-001` (`r50k_base`) | ||
- `text-search-davinci-doc-001` (`r50k_base`) | ||
- `text-search-curie-doc-001` (`r50k_base`) | ||
- `text-search-babbage-doc-001` (`r50k_base`) | ||
- `text-search-ada-doc-001` (`r50k_base`) | ||
- `code-search-babbage-code-001` (`r50k_base`) | ||
- `code-search-ada-code-001` (`r50k_base`) | ||
## API | ||
@@ -268,2 +252,4 @@ | ||
Note that if you encode an empty chat, it will still contain the minimum number of special tokens. | ||
### `encodeGenerator(text: string): Generator<number[], void, undefined>` | ||
@@ -361,6 +347,6 @@ | ||
```ts | ||
import { encode } from 'gpt-tokenizer' | ||
import { encode, EndOfText } from 'gpt-tokenizer' | ||
const inputText = `Some Text` | ||
const disallowedSpecial = new Set(['Some']) | ||
const inputText = `Some Text ${EndOfText}` | ||
const disallowedSpecial = new Set([EndOfText]) | ||
// throws an error: | ||
@@ -367,0 +353,0 @@ const encoded = encode(inputText, undefined, disallowedSpecial) |
@@ -1,36 +0,54 @@ | ||
import { EncoderMap } from './EncoderMap.js' | ||
import { escapeRegExp } from './escapeRegExp.js' | ||
/* eslint-disable no-continue */ | ||
export class BytePairEncodingCore { | ||
encoder: EncoderMap | ||
decoder: Map<number, Uint8Array> | ||
import { compareUint8Arrays, isAscii, tryConvertToString } from './utfUtil.js' | ||
import { escapeRegExp } from './util.js' | ||
export type RawBytePairRanks = readonly (string | readonly number[])[] | ||
export interface BytePairEncodingConfig { | ||
mergeableBytePairRanks: RawBytePairRanks | ||
specialTokenMapping?: Map<string, number> | ||
tokenSplitRegex: RegExp | ||
specialTokensEncoder: Map<string, number> | ||
specialTokensDecoder: Map<number, Uint8Array> | ||
specialTokenPatternRegex: RegExp | ||
} | ||
textEncoder = new TextEncoder() | ||
export class BytePairEncodingCore { | ||
readonly bytePairEncoderSize: number | ||
private bytePairEncoder: RawBytePairRanks | ||
private bytePairEncoderSortedLookup: readonly [Uint8Array, number][] | ||
private bytePairRanksDecoder = new Map<number, Uint8Array>() | ||
private tokenSplitRegex: RegExp | ||
private specialTokensEncoder: Map<string, number> | ||
private specialTokensDecoder: Map<number, string> | ||
private specialTokenPatternRegex: RegExp | ||
private stringDecoder: Map<string, number> | ||
private textEncoder = new TextEncoder() | ||
constructor({ | ||
bytePairEncoder, | ||
specialTokenEncoder, | ||
mergeableBytePairRanks: bytePairEncoder, | ||
specialTokenMapping: specialTokenEncoder, | ||
tokenSplitRegex, | ||
}: { | ||
bytePairEncoder: EncoderMap | ||
specialTokenEncoder?: Map<string, number> | ||
tokenSplitRegex: RegExp | ||
}) { | ||
this.encoder = bytePairEncoder ?? new EncoderMap() | ||
this.decoder = bytePairEncoder | ||
? new Map([...bytePairEncoder].map(([key, value]) => [value, key])) | ||
: new Map<number, Uint8Array>() | ||
}: BytePairEncodingConfig) { | ||
this.bytePairEncoder = bytePairEncoder | ||
this.stringDecoder = new Map<string, number>() | ||
// size without array holes (which may be present in the encoder) | ||
this.bytePairEncoderSize = Object.keys(bytePairEncoder).length | ||
const binaryLookup: [Uint8Array, number][] = [] | ||
// forEach skips array holes: | ||
bytePairEncoder.forEach((value, rank) => { | ||
if (typeof value === 'string') { | ||
this.stringDecoder.set(value, rank) | ||
return | ||
} | ||
const byteArray = new Uint8Array(value) | ||
binaryLookup.push([byteArray, rank]) | ||
this.bytePairRanksDecoder.set(rank, byteArray) | ||
}) | ||
this.bytePairEncoderSortedLookup = binaryLookup.sort((a, b) => | ||
compareUint8Arrays(a[0], b[0]), | ||
) | ||
this.specialTokensEncoder = specialTokenEncoder ?? new Map<string, number>() | ||
this.specialTokensDecoder = specialTokenEncoder | ||
? new Map( | ||
[...specialTokenEncoder].map(([key, value]) => [ | ||
value, | ||
this.textEncoder.encode(key), | ||
]), | ||
) | ||
: new Map<number, Uint8Array>() | ||
? new Map([...specialTokenEncoder].map(([key, value]) => [value, key])) | ||
: new Map<number, string>() | ||
this.tokenSplitRegex = tokenSplitRegex | ||
@@ -47,5 +65,74 @@ | ||
// Looks up the BPE rank for a token whose bytes form a valid UTF-8 string
// (these are stored in stringDecoder by the constructor). Returns undefined
// when the string is not a known mergeable token.
getBpeRankFromString(key: string): number | undefined {
  return this.stringDecoder.get(key)
}
// Like getBpeRankFromString, but throws when the key is not a known token.
getBpeRankFromStringOrThrow(key: string): number {
  const rank = this.getBpeRankFromString(key)
  if (rank !== undefined) {
    return rank
  }
  throw new Error(
    `The byte-pair encoding does not contain a value for: ${key}`,
  )
}
// Looks up the BPE rank for an arbitrary byte sequence.
// Valid-UTF-8 keys take the fast path through the string map; binary-only
// keys fall back to a binary search over the sorted byte-array lookup.
getBpeRankFromBytes(key: Uint8Array): number | undefined {
  const keyAsString = tryConvertToString(key)
  if (keyAsString !== undefined) {
    return this.getBpeRankFromString(keyAsString)
  }
  const foundAt = this.binarySearch(key)
  return foundAt === -1
    ? undefined
    : this.bytePairEncoderSortedLookup[foundAt]![1]
}
// Like getBpeRankFromBytes, but throws when the key is not a known token.
getBpeRankFromBytesOrThrow(key: Uint8Array): number {
  const rank = this.getBpeRankFromBytes(key)
  if (rank !== undefined) {
    return rank
  }
  throw new Error(
    `The byte-pair encoding does not contain a value for: ${key.toString()}`,
  )
}
// Binary search on the binary keys.
// Returns the index of `key` within bytePairEncoderSortedLookup (which the
// constructor sorts with compareUint8Arrays), or -1 when absent.
binarySearch(key: Uint8Array): number {
  let low = 0
  let high = this.bytePairEncoderSortedLookup.length - 1
  while (low <= high) {
    // eslint-disable-next-line no-bitwise
    const mid = (low + high) >>> 1
    // Reuse the shared comparator instead of duplicating the
    // byte-by-byte comparison + length tie-break logic inline; it must
    // match the ordering used to sort the lookup in the constructor.
    const cmp = compareUint8Arrays(
      this.bytePairEncoderSortedLookup[mid]![0],
      key,
    )
    if (cmp === 0) {
      return mid
    }
    if (cmp < 0) {
      low = mid + 1
    } else {
      high = mid - 1
    }
  }
  return -1
}
*encodeNative( | ||
text: string, | ||
allowedSpecial: Set<string>, | ||
allowedSpecial?: Set<string>, | ||
): Generator<number[], number, undefined> { | ||
@@ -71,12 +158,11 @@ let startIndex = 0 | ||
for (const [match] of textSegment.matchAll(this.tokenSplitRegex)) { | ||
const encodedPiece = this.textEncoder.encode(match) | ||
const token = this.encoder.get(encodedPiece) | ||
const token = this.getBpeRankFromString(match) | ||
if (token !== undefined) { | ||
lastTokenLength = 1 | ||
yield [token] | ||
// eslint-disable-next-line no-continue | ||
continue | ||
} | ||
const tokens = this.bytePairEncode(encodedPiece, this.encoder) | ||
const tokens = this.bytePairEncode(match) | ||
lastTokenLength = tokens.length | ||
@@ -107,3 +193,3 @@ yield tokens | ||
text: string, | ||
allowedSpecial: Set<string>, | ||
allowedSpecial: Set<string> | undefined, | ||
startIndex: number, | ||
@@ -126,3 +212,3 @@ specialRegex: RegExp, | ||
if (allowedSpecial.has(specialToken)) { | ||
if (allowedSpecial?.has(specialToken)) { | ||
return nextSpecialMatch.index + searchIndex | ||
@@ -135,3 +221,5 @@ } | ||
*decodeNative(tokens: Iterable<number>): Generator<Uint8Array> { | ||
*decodeNative( | ||
tokens: Iterable<number>, | ||
): Generator<Uint8Array | string, void, void> { | ||
for (const token of tokens) { | ||
@@ -147,7 +235,7 @@ const tokenBytes = this.tryDecodeToken(token) | ||
tokens: AsyncIterable<number>, | ||
): AsyncGenerator<Uint8Array> { | ||
): AsyncGenerator<Uint8Array | string> { | ||
for await (const token of tokens) { | ||
const tokenBytes = this.tryDecodeToken(token) | ||
if (tokenBytes) { | ||
yield tokenBytes | ||
const tokenBytesOrString = this.tryDecodeToken(token) | ||
if (tokenBytesOrString) { | ||
yield tokenBytesOrString | ||
} | ||
@@ -157,14 +245,26 @@ } | ||
tryDecodeToken(token: number): Uint8Array | undefined { | ||
return this.decoder.get(token) ?? this.specialTokensDecoder.get(token) | ||
// Resolves a token rank to its decoded form: the raw string for
// string-typed ranks, the byte array for binary ranks (materialized into
// bytePairRanksDecoder by the constructor), or the special-token string.
// Returns undefined for unknown ranks.
tryDecodeToken(tokenRank: number): Uint8Array | string | undefined {
  const entry = this.bytePairEncoder[tokenRank]
  switch (typeof entry) {
    case 'string':
      return entry
    case 'object': {
      const bytes = this.bytePairRanksDecoder.get(tokenRank)
      if (bytes) {
        return bytes
      }
      break
    }
    default:
      break
  }
  // Not a mergeable rank: fall back to the special-token decoder
  return this.specialTokensDecoder.get(tokenRank)
}
bytePairEncode(inputBytes: Uint8Array, bytePairRanks: EncoderMap): number[] { | ||
if (inputBytes.length === 1) { | ||
return [bytePairRanks.getOrThrow(inputBytes)] | ||
bytePairEncode(input: string): number[] { | ||
if (input.length === 1 && isAscii(input.codePointAt(0)!)) { | ||
return [this.getBpeRankFromStringOrThrow(input)] | ||
} | ||
return this.bytePairMerge(inputBytes, bytePairRanks, (pair) => { | ||
const key = inputBytes.slice(pair.start, pair.end) | ||
return bytePairRanks.getOrThrow(key) | ||
const inputBytes = this.textEncoder.encode(input) | ||
return this.bytePairMerge(inputBytes, (start, end) => { | ||
const key = inputBytes.subarray(start, end) | ||
return this.getBpeRankFromBytesOrThrow(key) | ||
}) | ||
@@ -174,26 +274,39 @@ } | ||
bytePairMerge( | ||
// Input array of bytes to process | ||
piece: Uint8Array, | ||
bytePairRanks: EncoderMap, | ||
transform: (pair: { start: number; end: number }) => number, | ||
// Function to apply to each final segment after merging | ||
getByteForRange: (start: number, end: number) => number, | ||
): number[] { | ||
// Create an array of partition objects. Each partition tracks the start index in 'piece' | ||
// and a rank value for adjacent pairs (initially set to positive infinity). | ||
const partitions = Array.from({ length: piece.length + 1 }, (_, i) => ({ | ||
start: i, | ||
rank: Number.POSITIVE_INFINITY, | ||
rank: Number.POSITIVE_INFINITY, // Rank starts at infinity (unmerged) | ||
})) | ||
// Helper function to get the rank of a byte pair starting at 'startIndex'. | ||
// 'skip' determines how far we look ahead (usually 0, for consecutive pairs). | ||
const getRank = (startIndex: number, skip: number): number | undefined => { | ||
if (startIndex + skip + 2 >= partitions.length) { | ||
// Avoid out-of-bounds errors, return undefined when no valid pair exists | ||
return undefined | ||
} | ||
const key = piece.slice( | ||
// Get the byte pair by extracting a subarray starting at 'startIndex' and ending at | ||
// the start of the partition after 'skip + 2'. | ||
const key = piece.subarray( | ||
partitions[startIndex]!.start, | ||
partitions[startIndex + skip + 2]!.start, | ||
) | ||
return bytePairRanks.get(key) | ||
// Retrieve the rank of this byte pair from the BPE rank function | ||
return this.getBpeRankFromBytes(key) | ||
} | ||
// Initialize the ranks for all adjacent pairs in the array | ||
for (let i = 0; i < partitions.length - 2; i++) { | ||
// Get the rank for the pair starting at index 'i' | ||
const rank = getRank(i, 0) | ||
if (rank !== undefined) { | ||
// Assign the rank to the partition at index 'i' | ||
partitions[i]!.rank = rank | ||
@@ -203,2 +316,3 @@ } | ||
// Iteratively merge byte pairs until no more useful merges can be done | ||
while (partitions.length > 1) { | ||
@@ -208,2 +322,3 @@ let minRank = Number.POSITIVE_INFINITY | ||
// Find the partition with the minimum rank, i.e., the most important pair to merge next | ||
let i = 0 | ||
@@ -218,2 +333,3 @@ for (const partition of partitions) { | ||
// If no valid pair is left to merge, exit the loop | ||
if (minRank === Number.POSITIVE_INFINITY) { | ||
@@ -223,5 +339,7 @@ break | ||
// Update the rank of the partition after the merged one | ||
partitions[minRankIdx]!.rank = | ||
getRank(minRankIdx, 1) ?? Number.POSITIVE_INFINITY | ||
// Update the rank of the partition before the merged one (if exists) | ||
if (minRankIdx > 0) { | ||
@@ -232,12 +350,16 @@ partitions[minRankIdx - 1]!.rank = | ||
// Merge by removing the partition after the one we just merged | ||
partitions.splice(minRankIdx + 1, 1) | ||
} | ||
// Create the final output by applying the transform function to each partitioned range | ||
const output: number[] = [] | ||
for (let i = 0; i < partitions.length - 1; i++) { | ||
output.push( | ||
transform({ | ||
start: partitions[i]!.start, | ||
end: partitions[i + 1]!.start, | ||
}), | ||
getByteForRange( | ||
// start index | ||
partitions[i]!.start, | ||
// end index | ||
partitions[i + 1]!.start, | ||
), | ||
) | ||
@@ -244,0 +366,0 @@ } |
@@ -27,2 +27,3 @@ import * as fs from 'fs/promises' | ||
) | ||
.replace('\nconst api =', '// prettier-ignore\nconst api =') | ||
.replaceAll(`cl100k_base.js`, `${encoding}.js`) | ||
@@ -29,0 +30,0 @@ : `// eslint-disable-next-line no-restricted-exports, import/no-default-export\nexport { default } from '../encoding/${encoding}.js'\nexport * from '../encoding/${encoding}.js'\n` |
@@ -0,1 +1,2 @@ | ||
/* eslint-disable no-console */ | ||
import * as fs from 'fs/promises' | ||
@@ -6,2 +7,3 @@ import * as path from 'path' | ||
type CallbackFunction = (filename: string) => Promise<void> | void | ||
const DEBUG = process.env.DEBUG === 'true' | ||
@@ -23,3 +25,2 @@ const processFilesInDirectory = async ( | ||
} catch (error) { | ||
// eslint-disable-next-line no-console | ||
console.error('An error occurred:', error) | ||
@@ -31,3 +32,20 @@ } | ||
const __dirname = path.dirname(fileURLToPath(import.meta.url)) | ||
const textDecoder = new TextDecoder('utf8', { fatal: true }) | ||
const textEncoder = new TextEncoder() | ||
function safeDecodeUtf8(bytes: Buffer): string | undefined { | ||
try { | ||
const v = textDecoder.decode(bytes) | ||
const encoded = textEncoder.encode(v) | ||
if (encoded.byteLength !== bytes.byteLength) { | ||
console.log('Mismatch:', new Uint8Array(bytes), encoded) | ||
return undefined | ||
} | ||
return v | ||
} catch { | ||
return undefined | ||
} | ||
} | ||
await processFilesInDirectory( | ||
@@ -39,2 +57,3 @@ path.join(__dirname, '../../data'), | ||
const modelName = path.basename(filePath, '.tiktoken') | ||
console.log(`Processing ${modelName}`) | ||
const bpeFile = await fs.readFile(filePath, 'utf8') | ||
@@ -44,16 +63,38 @@ const lines = bpeFile.split('\n') | ||
const [token, rank] = x.split(' ') | ||
return [token, Number.parseInt(rank!, 10)] | ||
if (!token || token.length === 0 || !rank || rank.length === 0) { | ||
throw new Error(`Invalid token encoding: ${x}`) | ||
} | ||
const tokenArray = Buffer.from(token, 'base64') | ||
return [tokenArray, Number.parseInt(rank, 10)] as const | ||
}) | ||
const jsCodeBpeArray = encoder.reduce( | ||
(acc, [token, rank]) => { | ||
const decoded = safeDecodeUtf8(token) ?? token | ||
return { | ||
string: `${acc.string}${','.repeat(rank - acc.lastRank)}${ | ||
DEBUG ? `\n/** ${rank} = */` : '' | ||
}${ | ||
typeof decoded === 'string' | ||
? JSON.stringify(decoded) | ||
: `[${token.join(',')}]` | ||
}`, | ||
lastRank: rank, | ||
} | ||
}, | ||
{ string: '', lastRank: 0 }, | ||
).string | ||
const firstTokenRank = encoder[0]?.[1] ?? 0 | ||
await fs.mkdir(path.join(__dirname, '../encodings'), { recursive: true }) | ||
await fs.writeFile( | ||
path.join(__dirname, `../encodings/${modelName}.js`), | ||
`/* eslint-disable */\n// @ts-nocheck\n// prettier-ignore\n/** @type {[string, number][]} */\nconst encoder = ${JSON.stringify( | ||
encoder, | ||
)};\nexport default encoder;`, | ||
`/* eslint-disable */\n// @ts-nocheck\n// prettier-ignore\n/** @type {(string | number[])[]} */\nconst encoder = [${','.repeat( | ||
firstTokenRank, | ||
)}${jsCodeBpeArray}];\nexport default encoder;`, | ||
) | ||
// eslint-disable-next-line no-console | ||
console.log(`Wrote ${modelName}.js`) | ||
}, | ||
) |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -8,5 +7,3 @@ import { GptEncoding } from '../GptEncoding.js' | ||
const api = GptEncoding.getEncodingApi('cl100k_base', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
const api = GptEncoding.getEncodingApi('cl100k_base', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/o200k_base.js' | ||
@@ -8,5 +7,3 @@ import { GptEncoding } from '../GptEncoding.js' | ||
const api = GptEncoding.getEncodingApi('o200k_base', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
const api = GptEncoding.getEncodingApi('o200k_base', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/p50k_base.js' | ||
@@ -8,5 +7,3 @@ import { GptEncoding } from '../GptEncoding.js' | ||
const api = GptEncoding.getEncodingApi('p50k_base', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
const api = GptEncoding.getEncodingApi('p50k_base', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/p50k_base.js' | ||
@@ -8,5 +7,3 @@ import { GptEncoding } from '../GptEncoding.js' | ||
const api = GptEncoding.getEncodingApi('p50k_edit', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
const api = GptEncoding.getEncodingApi('p50k_edit', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/r50k_base.js' | ||
@@ -8,5 +7,3 @@ import { GptEncoding } from '../GptEncoding.js' | ||
const api = GptEncoding.getEncodingApi('r50k_base', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
const api = GptEncoding.getEncodingApi('r50k_base', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
@@ -221,3 +221,3 @@ import fs from 'fs' | ||
? 127 | ||
: modelName === 'gpt-4o' | ||
: modelName.startsWith('gpt-4o') | ||
? 120 | ||
@@ -224,0 +224,0 @@ : 121 |
/* eslint-disable no-param-reassign */ | ||
import { BytePairEncodingCore } from './BytePairEncodingCore.js' | ||
import { | ||
type ChatModelName, | ||
type ChatParameters, | ||
type EncodingName, | ||
@@ -53,9 +55,10 @@ type ModelName, | ||
decoder = new TextDecoder('utf8') | ||
modelName?: ModelName | ||
private decoder = new TextDecoder('utf8') | ||
private bytePairEncodingCoreProcessor: BytePairEncodingCore | ||
private specialTokenMapping: Map<string, number> | ||
private specialTokensSet: Set<string> | ||
private allSpecialTokenRegex: RegExp | ||
private constructor({ | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
@@ -65,12 +68,22 @@ specialTokenMapping, | ||
modelName, | ||
...rest | ||
}: EncodingParams) { | ||
this.specialTokenMapping = specialTokenMapping | ||
this.specialTokensSet = new Set<string>(this.specialTokenMapping.keys()) | ||
this.allSpecialTokenRegex = getSpecialTokenRegex(this.specialTokensSet) | ||
this.bytePairEncodingCoreProcessor = new BytePairEncodingCore({ | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
...rest, | ||
}) | ||
const maxTokenValue = Math.max( | ||
getMaxValueFromMap(mergeableBytePairRanks), | ||
mergeableBytePairRanks.length - 1, | ||
getMaxValueFromMap(specialTokenMapping), | ||
) | ||
this.specialTokenMapping = specialTokenMapping | ||
if (expectedVocabularySize !== undefined) { | ||
if ( | ||
mergeableBytePairRanks.size + specialTokenMapping.size !== | ||
this.bytePairEncodingCoreProcessor.bytePairEncoderSize + | ||
specialTokenMapping.size !== | ||
expectedVocabularySize | ||
@@ -85,3 +98,5 @@ ) { | ||
throw new Error( | ||
'The maximum token value must be equal to explicit_n_vocab - 1.', | ||
`The model encodings are invalid. The maximum token value must be equal to expectedVocabularySize - 1. Currently ${maxTokenValue}, expected ${ | ||
expectedVocabularySize - 1 | ||
}`, | ||
) | ||
@@ -91,8 +106,2 @@ } | ||
this.bytePairEncodingCoreProcessor = new BytePairEncodingCore({ | ||
bytePairEncoder: mergeableBytePairRanks, | ||
specialTokenEncoder: specialTokenMapping, | ||
tokenSplitRegex, | ||
}) | ||
this.encode = this.encode.bind(this) | ||
@@ -151,21 +160,23 @@ this.decode = this.decode.bind(this) | ||
lineToEncode: string, | ||
{ | ||
allowedSpecial = new Set<string>(), | ||
disallowedSpecial = new Set<string>([ALL_SPECIAL_TOKENS]), | ||
}: EncodeOptions = {}, | ||
{ allowedSpecial, disallowedSpecial }: EncodeOptions = {}, | ||
): Generator<number[], number, undefined> { | ||
const specialTokensSet = new Set<string>(this.specialTokenMapping.keys()) | ||
let regexPattern: RegExp | undefined | ||
if (disallowedSpecial.has(ALL_SPECIAL_TOKENS)) { | ||
disallowedSpecial = new Set<string>(specialTokensSet) | ||
allowedSpecial.forEach((val) => disallowedSpecial.delete(val)) | ||
disallowedSpecial.forEach((val) => allowedSpecial.delete(val)) | ||
if (allowedSpecial?.has(ALL_SPECIAL_TOKENS)) { | ||
allowedSpecial = new Set(this.specialTokensSet) | ||
} | ||
if (allowedSpecial.has(ALL_SPECIAL_TOKENS)) { | ||
allowedSpecial = specialTokensSet | ||
if (!disallowedSpecial || disallowedSpecial.has(ALL_SPECIAL_TOKENS)) { | ||
// by default, all special tokens are disallowed | ||
disallowedSpecial = new Set(this.specialTokensSet) | ||
if (allowedSpecial?.size) { | ||
allowedSpecial.forEach((val) => disallowedSpecial!.delete(val)) | ||
disallowedSpecial.forEach((val) => allowedSpecial.delete(val)) | ||
regexPattern = getSpecialTokenRegex(disallowedSpecial) | ||
} else { | ||
regexPattern = this.allSpecialTokenRegex | ||
} | ||
} | ||
if (disallowedSpecial.size > 0) { | ||
const regexPattern = getSpecialTokenRegex(disallowedSpecial) | ||
if (regexPattern) { | ||
const match = lineToEncode.match(regexPattern) | ||
@@ -203,3 +214,5 @@ if (match !== null) { | ||
} | ||
const params = chatModelParams[model] | ||
const params: ChatParameters | undefined = | ||
chatModelParams[model as ChatModelName] | ||
const chatStartToken = this.specialTokenMapping.get(ImStart) | ||
@@ -287,3 +300,6 @@ const chatEndToken = this.specialTokenMapping.get(ImEnd) | ||
for (const decodedPart of decodedByteGenerator) { | ||
buffer += this.decoder.decode(decodedPart, { stream: true }) | ||
buffer += | ||
typeof decodedPart === 'string' | ||
? decodedPart | ||
: this.decoder.decode(decodedPart, { stream: true }) | ||
@@ -316,3 +332,6 @@ if (buffer.length === 0 || endsWithIncompleteUtfPairSurrogate(buffer)) { | ||
for await (const decodedPart of decodedByteGenerator) { | ||
buffer += this.decoder.decode(decodedPart, { stream: true }) | ||
buffer += | ||
typeof decodedPart === 'string' | ||
? decodedPart | ||
: this.decoder.decode(decodedPart, { stream: true }) | ||
@@ -319,0 +338,0 @@ if (buffer.length === 0 || endsWithIncompleteUtfPairSurrogate(buffer)) { |
@@ -19,13 +19,34 @@ /* eslint-disable camelcase */ | ||
export const modelToEncodingMap = { | ||
// chat | ||
const chatEnabledModelsMap = { | ||
'gpt-4': cl100k_base, | ||
'gpt-4-0314': cl100k_base, | ||
'gpt-4-0613': cl100k_base, | ||
'gpt-4-32k': cl100k_base, | ||
'gpt-4-0314': cl100k_base, | ||
'gpt-4-32k-0314': cl100k_base, | ||
'gpt-4-32k-0613': cl100k_base, | ||
'gpt-4-turbo': cl100k_base, | ||
'gpt-4-turbo-2024-04-09': cl100k_base, | ||
'gpt-4-turbo-preview': cl100k_base, | ||
'gpt-4-1106-preview': cl100k_base, | ||
'gpt-4-0125-preview': cl100k_base, | ||
'gpt-4-vision-preview': cl100k_base, | ||
'gpt-4o': o200k_base, | ||
'gpt-4o-2024-05-13': o200k_base, | ||
'gpt-4o-2024-08-06': o200k_base, | ||
'gpt-4o-mini-2024-07-18': o200k_base, | ||
'gpt-4o-mini': o200k_base, | ||
'gpt-3.5-turbo': cl100k_base, | ||
'gpt-3.5-turbo-0301': cl100k_base, | ||
'gpt-3.5-turbo-0613': cl100k_base, | ||
'gpt-3.5-turbo-1106': cl100k_base, | ||
'gpt-3.5-turbo-0125': cl100k_base, | ||
'gpt-3.5-turbo-16k': cl100k_base, | ||
'gpt-3.5-turbo-16k-0613': cl100k_base, | ||
'gpt-4o': o200k_base, | ||
'gpt-3.5-turbo-instruct': cl100k_base, | ||
'gpt-3.5-turbo-instruct-0914': cl100k_base, | ||
} as const | ||
export const modelToEncodingMap = { | ||
// chat | ||
...chatEnabledModelsMap, | ||
// text | ||
@@ -54,2 +75,4 @@ 'text-davinci-003': p50k_base, | ||
'text-embedding-ada-002': cl100k_base, | ||
'text-embedding-3-small': cl100k_base, | ||
'text-embedding-3-large': cl100k_base, | ||
// old embeddings | ||
@@ -73,45 +96,28 @@ 'text-similarity-davinci-001': r50k_base, | ||
const internalChatModelParams = { | ||
'gpt-3.5-turbo': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-0301': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-0613': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-3.5-turbo-16k-0613': { | ||
messageSeparator: '\n', | ||
roleSeparator: '\n', | ||
}, | ||
'gpt-4': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
'gpt-4-0314': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
'gpt-4-32k': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
'gpt-4-32k-0314': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
'gpt-4o': { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
}, | ||
// Chat-serialization separators shared by the gpt-3.5-turbo* models:
// a newline between messages and a newline after the role header.
const gpt3params = {
  messageSeparator: '\n',
  roleSeparator: '\n',
}
export const chatModelParams: Partial<Record<ModelName, ChatParameters>> = | ||
internalChatModelParams | ||
const gpt4params = { | ||
messageSeparator: '', | ||
roleSeparator: ImSep, | ||
} | ||
export type ModelName = keyof typeof modelToEncodingMap | ||
export type ChatModelName = keyof typeof internalChatModelParams | ||
export type ChatModelName = keyof typeof chatEnabledModelsMap | ||
export type EncodingName = (typeof modelToEncodingMap)[ModelName] | ||
export const chatModelParams = Object.fromEntries( | ||
Object.keys(chatEnabledModelsMap).flatMap((modelName) => | ||
modelName.startsWith('gpt-4') | ||
? ([[modelName, gpt4params] as const] as const) | ||
: modelName.startsWith('gpt-3.5-turbo') | ||
? ([[modelName, gpt3params] as const] as const) | ||
: [], | ||
), | ||
) as Record<ChatModelName, ChatParameters> | ||
export const chatEnabledModels = Object.keys( | ||
chatEnabledModelsMap, | ||
) as ChatModelName[] |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0301', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0301', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0613', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-0613', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-16k-0613', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo-16k-0613', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-3.5-turbo', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-0314', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-0314', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-32k-0314', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-32k-0314', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-32k', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4-32k', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/cl100k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from '../convertTokenBytePairEncodingFromTuples.js' | ||
import encoder from '../encodings/o200k_base.js' | ||
@@ -7,6 +6,4 @@ import { GptEncoding } from '../GptEncoding.js' | ||
export * from '../specialTokens.js' | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4o', () => | ||
convertTokenBytePairEncodingFromTuples(encoder), | ||
) | ||
// prettier-ignore | ||
const api = GptEncoding.getEncodingApiForModel('gpt-4o', () => encoder) | ||
const { | ||
@@ -13,0 +10,0 @@ decode, |
@@ -1,16 +0,13 @@ | ||
/* eslint-disable no-magic-numbers */ | ||
import { EncoderMap } from './EncoderMap.js' | ||
import type { | ||
BytePairEncodingConfig, | ||
RawBytePairRanks, | ||
} from './BytePairEncodingCore.js' | ||
import { Cl100KBase } from './encodingParams/Cl100KBase.js' | ||
import { O200KBase } from './encodingParams/O200KBase.js' | ||
import { P50KBase } from './encodingParams/P50KBase.js' | ||
import { P50KEdit } from './encodingParams/P50KEdit.js' | ||
import { R50KBase } from './encodingParams/R50KBase.js' | ||
import type { EncodingName, ModelName } from './mapping.js' | ||
import { | ||
EndOfPrompt, | ||
EndOfText, | ||
FimMiddle, | ||
FimPrefix, | ||
FimSuffix, | ||
ImEnd, | ||
ImSep, | ||
ImStart, | ||
} from './specialTokens.js' | ||
export interface EncodingParams { | ||
export interface EncodingParams extends BytePairEncodingConfig { | ||
/** | ||
@@ -28,87 +25,18 @@ * The expected total number of tokens in the vocabulary, including both regular and special tokens. | ||
tokenSplitRegex: RegExp | ||
mergeableBytePairRanks: EncoderMap | ||
specialTokenMapping: Map<string, number> | ||
modelName?: ModelName | ||
/** increases memory consumption, but speeds up subsequent decoding */ | ||
enableCache?: boolean | ||
} | ||
const tokenSplitRegex = | ||
export const tokenSplitRegex = | ||
/'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu | ||
function R50KBase(mergeableBytePairRanks: EncoderMap): EncodingParams { | ||
return { | ||
expectedVocabularySize: 50_257, | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping: new Map<string, number>([[EndOfText, 50_256]]), | ||
} | ||
} | ||
function P50KBase(mergeableBytePairRanks: EncoderMap): EncodingParams { | ||
return { | ||
expectedVocabularySize: 50_281, | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping: new Map<string, number>([[EndOfText, 50_256]]), | ||
} | ||
} | ||
function P50KEdit(mergeableBytePairRanks: EncoderMap): EncodingParams { | ||
const specialTokenMapping = new Map<string, number>([ | ||
[EndOfText, 50_256], | ||
[FimPrefix, 50_281], | ||
[FimMiddle, 50_282], | ||
[FimSuffix, 50_283], | ||
]) | ||
return { | ||
tokenSplitRegex, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
} | ||
} | ||
function Cl100KBase(mergeableBytePairRanks: EncoderMap): EncodingParams { | ||
const specialTokenMapping = new Map<string, number>([ | ||
[EndOfText, 100_257], | ||
[FimPrefix, 100_258], | ||
[FimMiddle, 100_259], | ||
[FimSuffix, 100_260], | ||
[ImStart, 100_264], | ||
[ImEnd, 100_265], | ||
[ImSep, 100_266], | ||
[EndOfPrompt, 100_276], | ||
]) | ||
return { | ||
tokenSplitRegex: | ||
/(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
} | ||
} | ||
function O200KBase(mergeableBytePairRanks: EncoderMap): EncodingParams { | ||
const specialTokenMapping = new Map<string, number>([ | ||
[EndOfText, 199_999], | ||
[FimPrefix, 200_000], | ||
[FimMiddle, 200_001], | ||
[FimSuffix, 200_002], | ||
[ImStart, 200_003], | ||
[ImEnd, 200_004], | ||
[ImSep, 200_005], | ||
[EndOfPrompt, 200_006], | ||
]) | ||
return { | ||
tokenSplitRegex: | ||
/(?:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+/giu, | ||
mergeableBytePairRanks, | ||
specialTokenMapping, | ||
} | ||
} | ||
export type GetMergeableRanksFn = (encodingName: EncodingName) => EncoderMap | ||
export type GetMergeableRanksFn = ( | ||
encodingName: EncodingName, | ||
) => RawBytePairRanks | ||
export type GetMergeableRanksAsyncFn = ( | ||
encodingName: EncodingName, | ||
) => Promise<EncoderMap> | ||
) => Promise<RawBytePairRanks> | ||
@@ -115,0 +43,0 @@ export function getEncodingParams( |
{ | ||
"name": "gpt-tokenizer", | ||
"type": "module", | ||
"dependencies": { | ||
"rfc4648": "^1.5.3" | ||
} | ||
"dependencies": {} | ||
} |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from './convertTokenBytePairEncodingFromTuples.js' | ||
import type { EncoderMap } from './EncoderMap.js' | ||
import type { RawBytePairRanks } from './BytePairEncodingCore.js' | ||
import cl100k from './encodings/cl100k_base.js' | ||
@@ -10,13 +9,13 @@ import o200k from './encodings/o200k_base.js' | ||
export const resolveEncoding = (encoding: EncodingName): EncoderMap => { | ||
export const resolveEncoding = (encoding: EncodingName): RawBytePairRanks => { | ||
switch (encoding) { | ||
case 'r50k_base': | ||
return convertTokenBytePairEncodingFromTuples(r50k) | ||
return r50k | ||
case 'p50k_base': | ||
case 'p50k_edit': | ||
return convertTokenBytePairEncodingFromTuples(p50k) | ||
return p50k | ||
case 'cl100k_base': | ||
return convertTokenBytePairEncodingFromTuples(cl100k) | ||
return cl100k | ||
case 'o200k_base': | ||
return convertTokenBytePairEncodingFromTuples(o200k) | ||
return o200k | ||
default: { | ||
@@ -23,0 +22,0 @@ throw new Error(`Unknown encoding name: ${encoding}`) |
/* eslint-disable import/extensions */ | ||
import { convertTokenBytePairEncodingFromTuples } from './convertTokenBytePairEncodingFromTuples.js' | ||
import type { EncoderMap } from './EncoderMap.js' | ||
import type { RawBytePairRanks } from './BytePairEncodingCore.js' | ||
import type { EncodingName } from './mapping.js' | ||
@@ -8,28 +7,20 @@ | ||
encoding: EncodingName, | ||
): Promise<EncoderMap> => { | ||
): Promise<RawBytePairRanks> => { | ||
switch (encoding) { | ||
case 'r50k_base': | ||
return convertTokenBytePairEncodingFromTuples( | ||
await import('./encodings/r50k_base.js').then( | ||
({ default: encodingTuples }) => encodingTuples, | ||
), | ||
return import('./encodings/r50k_base.js').then( | ||
({ default: rawBytePairRanks }) => rawBytePairRanks, | ||
) | ||
case 'p50k_base': | ||
case 'p50k_edit': | ||
return convertTokenBytePairEncodingFromTuples( | ||
await import('./encodings/p50k_base.js').then( | ||
({ default: encodingTuples }) => encodingTuples, | ||
), | ||
return import('./encodings/p50k_base.js').then( | ||
({ default: rawBytePairRanks }) => rawBytePairRanks, | ||
) | ||
case 'cl100k_base': | ||
return convertTokenBytePairEncodingFromTuples( | ||
await import('./encodings/cl100k_base.js').then( | ||
({ default: encodingTuples }) => encodingTuples, | ||
), | ||
return import('./encodings/cl100k_base.js').then( | ||
({ default: rawBytePairRanks }) => rawBytePairRanks, | ||
) | ||
case 'o200k_base': | ||
return convertTokenBytePairEncodingFromTuples( | ||
await import('./encodings/o200k_base.js').then( | ||
({ default: encodingTuples }) => encodingTuples, | ||
), | ||
return import('./encodings/o200k_base.js').then( | ||
({ default: rawBytePairRanks }) => rawBytePairRanks, | ||
) | ||
@@ -36,0 +27,0 @@ default: { |
@@ -0,1 +1,6 @@ | ||
/* eslint-disable no-bitwise */ | ||
/* eslint-disable no-magic-numbers */ | ||
export const isAscii = (codePoint: number) => codePoint <= 0x7f | ||
const HIGH_SURROGATE_START = 55_296 | ||
@@ -13,1 +18,80 @@ const HIGH_SURROGATE_END = 56_319 | ||
} | ||
function isValidUTF8(bytes: Uint8Array): boolean { | ||
let i = 0 | ||
while (i < bytes.length) { | ||
const byte1 = bytes[i]! | ||
let numBytes = 0 | ||
let codePoint = 0 | ||
// Determine the number of bytes in the current UTF-8 character | ||
if (byte1 <= 0x7f) { | ||
// 1-byte character (ASCII) | ||
numBytes = 1 | ||
codePoint = byte1 | ||
} else if ((byte1 & 0xe0) === 0xc0) { | ||
// 2-byte character | ||
numBytes = 2 | ||
codePoint = byte1 & 0x1f | ||
if (byte1 <= 0xc1) return false // Overlong encoding not allowed | ||
} else if ((byte1 & 0xf0) === 0xe0) { | ||
// 3-byte character | ||
numBytes = 3 | ||
codePoint = byte1 & 0x0f | ||
} else if ((byte1 & 0xf8) === 0xf0) { | ||
// 4-byte character | ||
numBytes = 4 | ||
codePoint = byte1 & 0x07 | ||
if (byte1 > 0xf4) return false // Code points above U+10FFFF not allowed | ||
} else { | ||
// Invalid first byte of UTF-8 character | ||
return false | ||
} | ||
// Ensure there are enough continuation bytes | ||
if (i + numBytes > bytes.length) return false | ||
// Process the continuation bytes | ||
for (let j = 1; j < numBytes; j++) { | ||
const byte = bytes[i + j] | ||
if (byte === undefined || (byte & 0xc0) !== 0x80) return false // Continuation bytes must start with '10' | ||
codePoint = (codePoint << 6) | (byte & 0x3f) | ||
} | ||
// Check for overlong encodings | ||
if (numBytes === 2 && codePoint < 0x80) return false // Overlong 2-byte sequence | ||
if (numBytes === 3 && codePoint < 2_048) return false // Overlong 3-byte sequence | ||
if (numBytes === 4 && codePoint < 65_536) return false // Overlong 4-byte sequence | ||
// Check for surrogate halves (U+D800 to U+DFFF) | ||
if (codePoint >= 55_296 && codePoint <= 57_343) return false | ||
// Check for code points above U+10FFFF | ||
if (codePoint > 1_114_111) return false | ||
// Move to the next character | ||
i += numBytes | ||
} | ||
return true | ||
} | ||
const textDecoder = new TextDecoder('utf8', { fatal: false }) | ||
export function tryConvertToString(arr: Uint8Array): string | undefined { | ||
if (!isValidUTF8(arr)) { | ||
return undefined | ||
} | ||
return textDecoder.decode(arr) | ||
} | ||
// Helper function to compare two Uint8Arrays lexicographically | ||
export function compareUint8Arrays(a: Uint8Array, b: Uint8Array): number { | ||
const len = Math.min(a.length, b.length) | ||
for (let i = 0; i < len; i++) { | ||
if (a[i] !== b[i]) { | ||
return a[i]! - b[i]! | ||
} | ||
} | ||
return a.length - b.length | ||
} |
@@ -1,3 +0,1 @@ | ||
import { escapeRegExp } from './escapeRegExp.js' | ||
export function getMaxValueFromMap(map: Map<unknown, number>): number { | ||
@@ -11,2 +9,6 @@ let max = 0 | ||
export function escapeRegExp(string: string) { | ||
return string.replace(/[$()*+.?[\\\]^{|}]/g, '\\$&') // $& means the whole matched string | ||
} | ||
export function getSpecialTokenRegex(tokens: Set<string>): RegExp { | ||
@@ -13,0 +15,0 @@ const escapedTokens = [...tokens].map(escapeRegExp) |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Environment variable access
Supply chain riskPackage accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
0
603
40424742
68448
367
5
1
- Removedrfc4648@^1.5.3
- Removedrfc4648@1.5.3(transitive)