@microsoft/tiktokenizer
Advanced tools
Comparing version 1.0.7 to 1.0.8
@@ -22,3 +22,3 @@ import { ILRUCache } from './lru'; | ||
* https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken | ||
* @param tikTokenBpeFile BPE rank file path | ||
* @param tikTokenBpeFileOrDict BPE rank file path or parsed dictionary | ||
* @param specialTokensEncoder special tokens encoder | ||
@@ -28,3 +28,3 @@ * @param regexPattern regex pattern to split the input text | ||
*/ | ||
constructor(tikTokenBpeFile: string, specialTokensEncoder: ReadonlyMap<string, number>, regexPattern: string, cacheSize?: number); | ||
constructor(tikTokenBpeFileOrDict: string | Map<Uint8Array, number>, specialTokensEncoder: ReadonlyMap<string, number>, regexPattern: string, cacheSize?: number); | ||
protected init(bpeDict: ReadonlyMap<Uint8Array, number>, specialTokensEncoder: ReadonlyMap<string, number>, regexPattern: string): void; | ||
@@ -31,0 +31,0 @@ private findNextSpecialToken; |
@@ -64,3 +64,3 @@ "use strict"; | ||
* https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken | ||
* @param tikTokenBpeFile BPE rank file path | ||
* @param tikTokenBpeFileOrDict BPE rank file path or parsed dictionary | ||
* @param specialTokensEncoder special tokens encoder | ||
@@ -70,7 +70,7 @@ * @param regexPattern regex pattern to split the input text | ||
*/ | ||
constructor(tikTokenBpeFile, specialTokensEncoder, regexPattern, cacheSize = 8192) { | ||
constructor(tikTokenBpeFileOrDict, specialTokensEncoder, regexPattern, cacheSize = 8192) { | ||
this.textEncoder = (0, textEncoder_1.makeTextEncoder)(); | ||
this.textDecoder = new util_1.TextDecoder("utf-8"); | ||
this.cache = new lru_1.LRUCache(cacheSize); | ||
const bpeDict = loadTikTokenBpe(tikTokenBpeFile); | ||
const bpeDict = typeof tikTokenBpeFileOrDict === 'string' ? loadTikTokenBpe(tikTokenBpeFileOrDict) : tikTokenBpeFileOrDict; | ||
this.init(bpeDict, specialTokensEncoder, regexPattern); | ||
@@ -77,0 +77,0 @@ } |
@@ -42,3 +42,3 @@ import { TikTokenizer } from "./tikTokenizer"; | ||
* Create a tokenizer from a file | ||
* @param tikTokenBpeFile BPE rank file in tiktoken format | ||
* @param tikTokenBpeFileOrDict BPE rank file in tiktoken format or parsed dictionary | ||
* @param specialTokensEncoder special tokens mapping | ||
@@ -49,2 +49,2 @@ * @param regexPattern regex pattern | ||
*/ | ||
export declare function createTokenizer(tikTokenBpeFile: string, specialTokensEncoder: ReadonlyMap<string, number>, regexPattern: string, cacheSize?: number): TikTokenizer; | ||
export declare function createTokenizer(tikTokenBpeFileOrDict: string | Map<Uint8Array, number>, specialTokensEncoder: ReadonlyMap<string, number>, regexPattern: string, cacheSize?: number): TikTokenizer; |
@@ -225,3 +225,3 @@ "use strict"; | ||
regexPattern = REGEX_PATTERN_1; | ||
mergeableRanksFileUrl = `https://pythia.blob.core.windows.net/public/encoding/gpt2.tiktoken`; | ||
mergeableRanksFileUrl = `https://raw.githubusercontent.com/microsoft/Tokenizer/main/model/gpt2.tiktoken`; | ||
break; | ||
@@ -251,3 +251,3 @@ default: | ||
* Create a tokenizer from a file | ||
* @param tikTokenBpeFile BPE rank file in tiktoken format | ||
* @param tikTokenBpeFileOrDict BPE rank file in tiktoken format or parsed dictionary | ||
* @param specialTokensEncoder special tokens mapping | ||
@@ -258,4 +258,4 @@ * @param regexPattern regex pattern | ||
*/ | ||
function createTokenizer(tikTokenBpeFile, specialTokensEncoder, regexPattern, cacheSize = 8192) { | ||
const tikTokenizer = new tikTokenizer_1.TikTokenizer(tikTokenBpeFile, specialTokensEncoder, regexPattern, cacheSize); | ||
function createTokenizer(tikTokenBpeFileOrDict, specialTokensEncoder, regexPattern, cacheSize = 8192) { | ||
const tikTokenizer = new tikTokenizer_1.TikTokenizer(tikTokenBpeFileOrDict, specialTokensEncoder, regexPattern, cacheSize); | ||
return tikTokenizer; | ||
@@ -262,0 +262,0 @@ } |
@@ -5,3 +5,3 @@ { | ||
"description": "Tokenizer for OpenAI large language models.", | ||
"version": "1.0.7", | ||
"version": "1.0.8", | ||
"author": { | ||
@@ -8,0 +8,0 @@ "name": "Microsoft Corporation" |
47505