llama3-tokenizer-js
Comparing version 1.1.2 to 1.1.3
{
  "name": "llama3-tokenizer-js",
- "version": "1.1.2",
+ "version": "1.1.3",
  "description": "JS tokenizer for LLaMA 3",
@@ -5,0 +5,0 @@ "main": "bundle/llama3-tokenizer-with-baked-data.js",
@@ -103,4 +103,5 @@ # 🦙 llama3-tokenizer-js 🦙
- 1) If you need exact token counts, you can work around this issue by using this library to tokenize _only_ user input text (which shouldn't contain any special tokens) and then programmatically adding the relevant counts for the special tokens that you are using to wrap the input text.
- 2) Alternatively, you can choose to ignore this issue, in which case you will be overcounting tokens by a little bit, which is not too bad (in typical use cases, undercounting can lead to more severe quality issues than overcounting).
+ 1) Instead of calling `.encode(str).length`, you can call `.optimisticCount(str)`. Optimistic count is a convenience function which parses the text with the assumption that anything that looks like a special token (e.g. `<|boom|>`) is actually a special token.
+ 2) If you need exact token counts, you can work around this issue by using this library to tokenize _only_ user input text (which shouldn't contain any special tokens) and then programmatically adding the relevant counts for the special tokens that you are using to wrap the input text.
+ 3) Alternatively, you can choose to ignore this issue, in which case you will be overcounting tokens by a little bit, which is not too bad (in typical use cases, undercounting can lead to more severe quality issues than overcounting).
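For illustration, a minimal sketch of the second workaround, assuming a chat-style wrapper around the user text (the wrapper layout and its token allowance below are assumptions about your own prompt template, not something the library prescribes):

```js
import llama3Tokenizer from 'llama3-tokenizer-js'

// Count only the user text with the library (no bos/eos), then add a fixed
// allowance for the wrapper you use, e.g.
// <|start_header_id|>user<|end_header_id|>\n\n ... <|eot_id|>
function countWrappedUserTurn(userText) {
    const textTokens = llama3Tokenizer.encode(userText, { bos: false, eos: false }).length
    const wrapperAllowance = 5 // assumed: 3 special tokens plus roughly 2 tokens for "user" and the blank line
    return textTokens + wrapperAllowance
}

console.log(countWrappedUserTurn("Hello there!"))
```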
@@ -107,0 +108,0 @@ ## Tests
@@ -329,2 +329,5 @@ /**
}
+ // Allow passing an alternative special token regex. Needed for optimisticCount.
+ const specialTokenRegex = options.specialTokenRegex ?? /<\|(?:begin_of_text|end_of_text|start_header_id|end_header_id|eot_id|reserved_special_token_(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|250))\|>/g
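As an illustration of what this new option enables, a hypothetical call that passes a custom special-token regex straight to `encode` (whether these options are intended as public API, rather than internal plumbing for `optimisticCount`, is an assumption):

```js
import llama3Tokenizer from 'llama3-tokenizer-js'

// Treat anything shaped like <|...|> as a special token, and map token-shaped
// strings that are missing from the vocabulary to an arbitrary placeholder id,
// so only the resulting length is meaningful.
const ids = llama3Tokenizer.encode("<|my_custom_token|>Hello", {
    bos: false,
    eos: false,
    specialTokenRegex: /<\|[a-zA-Z0-9_]+\|>/g,
    unexpectedSpecialTokenFallback: 1,
})
console.log(ids.length)
```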
@@ -367,147 +370,154 @@ if (!this.vocabById || !this.vocabByString || !this.merges) {
// Split prompt by special tokens, e.g. <|eot_id|>
- const regexSpecialToken = /<\|(?:begin_of_text|end_of_text|start_header_id|end_header_id|eot_id|reserved_special_token_(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|250))\|>/g
- const splittedBySpecialTokens = regexSplit(prompt, regexSpecialToken)
+ const splittedBySpecialTokens = regexSplit(prompt, specialTokenRegex)
for (let specialSplitIndex=0; specialSplitIndex<splittedBySpecialTokens.length; specialSplitIndex++) {
    const specialSplitString = splittedBySpecialTokens[specialSplitIndex]
-     if (specialSplitString.match(regexSpecialToken) && this.vocabByString.has(specialSplitString)) {
+     if (specialSplitString.match(specialTokenRegex) && this.vocabByString.has(specialSplitString)) {
        // If this string is a special token according to our regex AND is found from vocabulary, output the corresponding special token id.
        output.push(this.vocabByString.get(specialSplitString))
-     } else if (specialSplitString.match(regexSpecialToken)) {
-         // If this string is a special token according to our regex BUT is not found from vocabulary, there is a mismatch between our regex
-         // and our vocabulary. The input has been incorrectly split and we cannot guarantee correctness if we continued, so we throw an error.
+         continue
+     }
+     if (specialSplitString.match(specialTokenRegex) && options.unexpectedSpecialTokenFallback) {
+         // This is used in optimisticCount to map each unexpected special token into some fallback token.
+         output.push(options.unexpectedSpecialTokenFallback)
+         continue
+     }
+     if (specialSplitString.match(specialTokenRegex)) {
+         // This string is a special token according to our regex BUT is not found from vocabulary.
+         // We don't have options.unexpectedSpecialTokenFallback that we could use.
+         // This situation is unexpected and indicates a mismatch between our regex and our vocabulary.
+         // The input has been incorrectly split and we cannot guarantee correctness if we continue, so we throw an error.
        throw new Error('Internal error occurred in llama3-tokenizer-js while processing special tokens in input.')
-     } else {
-         // Else treat this string like normal text
    }
+     // If we are here then we treat this string like normal text, not like a special token.
    // SplitPretokenizer
    const splittedByPretokenizer = regexSplit(specialSplitString, CLEAN_LLAMA3_REGEX_STRING)
    // ByteLevelPretokenizer maps all our bytes to unicode strings, this also does the mapping from space to Ġ (charCode 32 -> 32+256)
    const bytemappedSplit = splittedByPretokenizer.map(pretoken => Array.from(this.utf8Encoder.encode(pretoken), byte => BYTES_TO_UNICODE[byte]).join(''))
    // The results from pretokenizer are handled one by one
    for (let pretokenIndex=0; pretokenIndex<bytemappedSplit.length; pretokenIndex++) {
        const pretoken = bytemappedSplit[pretokenIndex]
        // Because LLaMA 3 tokenizer is configured with ignore_merges,
        // we check if the pretoken is found in our vocabulary,
        // and if it is, we map it to tokenId directly from vocabulary
        // (instead of normal BPE processing, like in LLaMA 1 tokenizer,
        // where the BPE process sometimes leads to different tokens).
        if (this.vocabByString.has(pretoken)) {
            output.push(this.vocabByString.get(pretoken))
            continue
        }
        // Pretoken was not found from vocabulary, so we proceed with normal BPE processing,
        // which will result in a sequence of at least 2 tokens that represent the pretoken.
        // Cache used for performance
        if (cache.has(pretoken)) {
            output.push(...(cache.get(pretoken)))
            continue
        }
        // Initially each character is transformed to a tokenId, later there will be merges of these.
        // Note that this array represents the tokenIds of the pretoken, not the entire sequence (there are typically multiple pretokens).
        const tokenIds = []
        // Transform each character to its corresponding token
        const charArray = Array.from(pretoken)
        for (let i=0; i<charArray.length; i++) {
            const c = charArray[i]
            if (!this.vocabByString.has(c)) {
                throw Error(`Character ${c} not found from vocabulary. This is not supposed to happen (vocab is supposed to cover everything that comes out of pretokenization).`)
            }
            tokenIds.push(this.vocabByString.get(c))
        }
        // Fill merge queue from initial merge possibilities and construct linked list
        let firstTokenNode = {
            origPos: 0,
            tokenId: tokenIds[0],
            prev: null,
            next: null,
        }
        let prevTokenNode = firstTokenNode
        for (let i=1; i<tokenIds.length; i++) {
            const currTokenNode = {
                origPos: i,
                tokenId: tokenIds[i],
                prev: prevTokenNode,
                next: null
            }
            prevTokenNode.next = currTokenNode
            addToMergeQueue(prevTokenNode)
            prevTokenNode = currTokenNode
        }
        // Perform merges in priority order
        while (!mergeQueue.isEmpty()) {
            const leftOfMerge = mergeQueue.pop()
            // Check that this merge is still possible
            if (leftOfMerge.deleted) continue
            if (!leftOfMerge.next) continue
            if (leftOfMerge.next.deleted) continue
            // Mark leftOfMerge and rightOfMerge as being deleted, because they are actually being replaced by a merged token.
            leftOfMerge.deleted = true
            leftOfMerge.next.deleted = true
            // It's a little bit more complicated to fix the prev of leftOfMerge.
            if (leftOfMerge.prev) {
                const oldPrev = leftOfMerge.prev
                // Mark oldPrev as deleted, to avoid erroneous merges later (ref to this node might exist in priorityqueue)
                oldPrev.deleted = true
                // Replace oldPrev within the linked list with a copy of itself
                const newPrev = {
                    origPos: oldPrev.origPos,
                    tokenId: oldPrev.tokenId,
                    prev: oldPrev.prev,
                    next: oldPrev.next
                }
                leftOfMerge.prev = newPrev
                // Update linked list reference of "prev of prev"
                if (newPrev.prev) {
                    newPrev.prev.next = newPrev
                } else {
                    // If "prev of prev" does not exist, that means newPrev must be the new firstNode
                    firstTokenNode = newPrev
                }
            }
            // Create node representing merge result
            const resultOfMerge = {
                origPos: leftOfMerge.origPos,
                tokenId: this.vocabByString.get(leftOfMerge.mergeToString),
                prev: leftOfMerge.prev,
                next: leftOfMerge.next.next
            }
            // Consider adding to merge queue: prev--resultOfMerge
            if (resultOfMerge.prev) {
                resultOfMerge.prev.next = resultOfMerge
                resultOfMerge.prev
                addToMergeQueue(resultOfMerge.prev)
            } else {
                // If prev does not exist then this is the new firstNode
                firstTokenNode = resultOfMerge
            }
            // Consider adding to merge queue: resultOfMerge--next
            if (resultOfMerge.next) {
                resultOfMerge.next.prev = resultOfMerge
                addToMergeQueue(resultOfMerge)
            }
        }
        // Get final tokenIds for this pretoken by traversing the linked list
        const mergedTokenIds = []
        for (let currTokenNode = firstTokenNode; currTokenNode !== null; currTokenNode = currTokenNode.next) {
            mergedTokenIds.push(currTokenNode.tokenId)
        }
        // Add to cache
        cache.set(pretoken, mergedTokenIds)
        // Add to output the tokenIds that correspond to this pretoken
        output.push(...mergedTokenIds)
    }
@@ -536,2 +546,15 @@ }
+ // This is named "optimistic" in the sense that it optimistically assumes that
+ // anything that looks like a special token is actually a special token (as opposed to normal text).
+ // Intent is to use this when working with various fine tunes which have modified special tokens.
+ optimisticCount(prompt) {
+     const options = {
+         bos: true,
+         eos: true,
+         specialTokenRegex: /<\|[a-zA-Z0-9_]+\|>/g,
+         unexpectedSpecialTokenFallback: 1, // Don't care which token we map to, we only care about count
+     }
+     return this.encode(prompt, options).length
+ }
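A hypothetical usage sketch of the new method (the fine-tune marker below is made up, and the exact counts depend on the input):

```js
import llama3Tokenizer from 'llama3-tokenizer-js'

// A fine tune might wrap turns with a custom marker such as <|assistant_turn|>.
// Plain encode() treats the unknown marker as ordinary text (several tokens),
// while optimisticCount() assumes it is a single special token.
const prompt = "<|assistant_turn|>Sure, here is the answer.<|eot_id|>"
console.log(llama3Tokenizer.encode(prompt).length)   // marker counted as normal text
console.log(llama3Tokenizer.optimisticCount(prompt)) // marker counted as one special token
```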
defaultTests(tokenizer) {
@@ -572,2 +595,11 @@
}
+ function testOptimisticCount(inputString, expectedCount) {
+     let startTime = performance.now()
+     const actualCount = tokenizer.optimisticCount(inputString)
+     if (actualCount !== expectedCount) {
+         throw Error(`Optimistic count test failed: expected ${expectedCount}, actual was: ${actualCount}`)
+     }
+     console.log(`#${testNum++} test successful (optimisticCount test, running time ${Math.round(10 * (performance.now() - startTime)) / 10} ms)`)
+ }
@@ -628,3 +660,3 @@ // Simple test case
// Test for regex errors in the regex that is used to split input by special tokens, it has complicated regex to validate that reserved special tokens only go from 0 to 250
- testEncodeAndDecode([
+ const stringsThatLookLikeSpecialTokens = [
// These are real special tokens
@@ -643,3 +675,3 @@ '<|reserved_special_token_0|>',
'<|reserved_special_token_250|>',
- // These are not real special tokens and should be processed as normal text
+ // These are not real special tokens and should be processed as normal text (unless calling optimisticCount)
'<|reserved_special_token_00|>',
@@ -650,5 +682,16 @@ '<|reserved_special_token_09|>',
'<|reserved_special_token_666|>',
- ].join(""),
+ ]
+ testEncodeAndDecode(stringsThatLookLikeSpecialTokens.join(""),
[128002, 128014, 128015, 128024, 128058, 128104, 128105, 128183, 128204, 128205, 128254, 128255, 27, 91, 52202, 42729, 6594, 62, 410, 91, 1822, 91, 52202, 42729, 6594, 62, 2545, 91, 1822, 91, 52202, 42729, 6594, 62, 7755, 91, 1822, 91, 52202, 42729, 6594, 62, 13860, 91, 1822, 91, 52202, 42729, 6594, 62, 10943, 91, 29])
+ testOptimisticCount([
+     ...stringsThatLookLikeSpecialTokens,
+     // These should be assumed to be real tokens
+     '<|new_tok|>',
+     '<|t|>',
+     // These should NOT be parsed as real tokens, but as normal text
+     '<||>',
+     '<|hello world|>',
+     '<|what!|>'
+ ].join(""), 34)
@@ -655,0 +698,0 @@ console.log('LLaMA 3 Tokenizer tests passed successfully.')
Sorry, the diff of this file is too big to display
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package