llama3-tokenizer-js - npm Package Compare versions

Comparing version 1.1.2 to 1.1.3


package.json
 {
   "name": "llama3-tokenizer-js",
-  "version": "1.1.2",
+  "version": "1.1.3",
   "description": "JS tokenizer for LLaMA 3",

@@ -5,0 +5,0 @@ "main": "bundle/llama3-tokenizer-with-baked-data.js",

@@ -103,4 +103,5 @@ # 🦙 llama3-tokenizer-js 🦙

- 1) If you need exact token counts, you can work around this issue by using this library to tokenize _only_ user input text (which shouldn't contain any special tokens) and then programmatically adding the relevant counts for the special tokens that you are using to wrap the input text.
- 2) Alternatively, you can choose to ignore this issue, in which case you will be overcounting tokens by a little bit, which is not too bad (in typical use cases, undercounting can lead to more severe quality issues than overcounting).
+ 1) Instead of calling `.encode(str).length`, you can call `.optimisticCount(str)`. Optimistic count is a convenience function which parses the text with the assumption that anything that looks like a special token (e.g. `<|boom|>`) is actually a special token.
+ 2) If you need exact token counts, you can work around this issue by using this library to tokenize _only_ user input text (which shouldn't contain any special tokens) and then programmatically adding the relevant counts for the special tokens that you are using to wrap the input text.
+ 3) Alternatively, you can choose to ignore this issue, in which case you will be overcounting tokens by a little bit, which is not too bad (in typical use cases, undercounting can lead to more severe quality issues than overcounting).
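A minimal sketch of workaround (2) above, assuming the package's default-export tokenizer object and a chat template that wraps each message in four special tokens (both of those are illustrative assumptions, not something this release prescribes; the `{ bos, eos }` options are the same ones the new `optimisticCount` passes to `encode`):

```js
import llama3Tokenizer from 'llama3-tokenizer-js'

// Illustrative only: how many special tokens your own template wraps around each message,
// e.g. <|begin_of_text|> + <|start_header_id|> + <|end_header_id|> + <|eot_id|>.
const WRAPPER_SPECIAL_TOKENS_PER_MESSAGE = 4

function countMessageTokens(userText) {
  // Tokenize only the user-provided text, which should not contain special tokens...
  const textTokens = llama3Tokenizer.encode(userText, { bos: false, eos: false }).length
  // ...then add the special tokens you wrap it with programmatically.
  return textTokens + WRAPPER_SPECIAL_TOKENS_PER_MESSAGE
}

console.log(countMessageTokens('Hello world!'))
```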

@@ -107,0 +108,0 @@ ## Tests

@@ -329,2 +329,5 @@ /**

}
+ // Allow passing an alternative special token regex. Needed for optimisticCount.
+ const specialTokenRegex = options.specialTokenRegex ?? /<\|(?:begin_of_text|end_of_text|start_header_id|end_header_id|eot_id|reserved_special_token_(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|250))\|>/g
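As a quick sanity check of what the default regex admits (a standalone snippet, not part of the library; the `g` flag is dropped here so `.test()` carries no `lastIndex` state between calls):

```js
const defaultSpecialTokenRegex = /<\|(?:begin_of_text|end_of_text|start_header_id|end_header_id|eot_id|reserved_special_token_(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|250))\|>/

console.log(defaultSpecialTokenRegex.test('<|eot_id|>'))                     // true
console.log(defaultSpecialTokenRegex.test('<|reserved_special_token_250|>')) // true
console.log(defaultSpecialTokenRegex.test('<|reserved_special_token_251|>')) // false: only 0-250 are reserved
console.log(defaultSpecialTokenRegex.test('<|reserved_special_token_00|>'))  // false: no leading zeros
```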

@@ -367,147 +370,154 @@ if (!this.vocabById || !this.vocabByString || !this.merges) {

// Split prompt by special tokens, e.g. <|eot_id|>
- const regexSpecialToken = /<\|(?:begin_of_text|end_of_text|start_header_id|end_header_id|eot_id|reserved_special_token_(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|250))\|>/g
- const splittedBySpecialTokens = regexSplit(prompt, regexSpecialToken)
+ const splittedBySpecialTokens = regexSplit(prompt, specialTokenRegex)
for (let specialSplitIndex=0; specialSplitIndex<splittedBySpecialTokens.length; specialSplitIndex++) {
  const specialSplitString = splittedBySpecialTokens[specialSplitIndex]
-   if (specialSplitString.match(regexSpecialToken) && this.vocabByString.has(specialSplitString)) {
+   if (specialSplitString.match(specialTokenRegex) && this.vocabByString.has(specialSplitString)) {
    // If this string is a special token according to our regex AND is found from vocabulary, output the corresponding special token id.
    output.push(this.vocabByString.get(specialSplitString))
+     continue
+   }
-   } else if (specialSplitString.match(regexSpecialToken)) {
-     // If this string is a special token according to our regex BUT is not found from vocabulary, there is a mismatch between our regex
-     // and our vocabulary. The input has been incorrectly split and we cannot guarantee correctness if we continued, so we throw an error.
+   if (specialSplitString.match(specialTokenRegex) && options.unexpectedSpecialTokenFallback) {
+     // This is used in optimisticCount to map each unexpected special token into some fallback token.
+     output.push(options.unexpectedSpecialTokenFallback)
+     continue
+   }
+   if (specialSplitString.match(specialTokenRegex)) {
+     // This string is a special token according to our regex BUT is not found from vocabulary.
+     // We don't have options.unexpectedSpecialTokenFallback that we could use.
+     // This situation is unexpected and indicates a mismatch between our regex and our vocabulary.
+     // The input has been incorrectly split and we cannot guarantee correctness if we continue, so we throw an error.
    throw new Error('Internal error occurred in llama3-tokenizer-js while processing special tokens in input.')
-   } else {
-     // Else treat this string like normal text
+   }
+   // If we are here then we treat this string like normal text, not like a special token.
  // SplitPretokenizer
  const splittedByPretokenizer = regexSplit(specialSplitString, CLEAN_LLAMA3_REGEX_STRING)
  // ByteLevelPretokenizer maps all our bytes to unicode strings, this also does the mapping from space to Ġ (charCode 32 -> 32+256)
  const bytemappedSplit = splittedByPretokenizer.map(pretoken => Array.from(this.utf8Encoder.encode(pretoken), byte => BYTES_TO_UNICODE[byte]).join(''))
  // The results from pretokenizer are handled one by one
  for (let pretokenIndex=0; pretokenIndex<bytemappedSplit.length; pretokenIndex++) {
    const pretoken = bytemappedSplit[pretokenIndex]
    // Because LLaMA 3 tokenizer is configured with ignore_merges,
    // we check if the pretoken is found in our vocabulary,
    // and if it is, we map it to tokenId directly from vocabulary
    // (instead of normal BPE processing, like in LLaMA 1 tokenizer,
    // where the BPE process sometimes leads to different tokens).
    if (this.vocabByString.has(pretoken)) {
      output.push(this.vocabByString.get(pretoken))
      continue
    }
    // Pretoken was not found from vocabulary, so we proceed with normal BPE processing,
    // which will result in a sequence of at least 2 tokens that represent the pretoken.
    // Cache used for performance
    if (cache.has(pretoken)) {
      output.push(...(cache.get(pretoken)))
      continue
    }
    // Initially each character is transformed to a tokenId, later there will be merges of these.
    // Note that this array represents the tokenIds of the pretoken, not the entire sequence (there are typically multiple pretokens).
    const tokenIds = []
    // Transform each character to its corresponding token
    const charArray = Array.from(pretoken)
    for (let i=0; i<charArray.length; i++) {
      const c = charArray[i]
      if (!this.vocabByString.has(c)) {
        throw Error(`Character ${c} not found from vocabulary. This is not supposed to happen (vocab is supposed to cover everything that comes out of pretokenization).`)
      }
      tokenIds.push(this.vocabByString.get(c))
    }
    // Fill merge queue from initial merge possibilities and construct linked list
    let firstTokenNode = {
      origPos: 0,
      tokenId: tokenIds[0],
      prev: null,
      next: null,
    }
    let prevTokenNode = firstTokenNode
    for (let i=1; i<tokenIds.length; i++) {
      const currTokenNode = {
        origPos: i,
        tokenId: tokenIds[i],
        prev: prevTokenNode,
        next: null
      }
      prevTokenNode.next = currTokenNode
      addToMergeQueue(prevTokenNode)
      prevTokenNode = currTokenNode
    }
    // Perform merges in priority order
    while (!mergeQueue.isEmpty()) {
      const leftOfMerge = mergeQueue.pop()
      // Check that this merge is still possible
      if (leftOfMerge.deleted) continue
      if (!leftOfMerge.next) continue
      if (leftOfMerge.next.deleted) continue
      // Mark leftOfMerge and rightOfMerge as being deleted, because they are actually being replaced by a merged token.
      leftOfMerge.deleted = true
      leftOfMerge.next.deleted = true
      // It's a little bit more complicated to fix the prev of leftOfMerge.
      if (leftOfMerge.prev) {
        const oldPrev = leftOfMerge.prev
        // Mark oldPrev as deleted, to avoid erroneous merges later (ref to this node might exist in priorityqueue)
        oldPrev.deleted = true
        // Replace oldPrev within the linked list with a copy of itself
        const newPrev = {
          origPos: oldPrev.origPos,
          tokenId: oldPrev.tokenId,
          prev: oldPrev.prev,
          next: oldPrev.next
        }
        leftOfMerge.prev = newPrev
        // Update linked list reference of "prev of prev"
        if (newPrev.prev) {
          newPrev.prev.next = newPrev
        } else {
          // If "prev of prev" does not exist, that means newPrev must be the new firstNode
          firstTokenNode = newPrev
        }
      }
      // Create node representing merge result
      const resultOfMerge = {
        origPos: leftOfMerge.origPos,
        tokenId: this.vocabByString.get(leftOfMerge.mergeToString),
        prev: leftOfMerge.prev,
        next: leftOfMerge.next.next
      }
      // Consider adding to merge queue: prev--resultOfMerge
      if (resultOfMerge.prev) {
        resultOfMerge.prev.next = resultOfMerge
        addToMergeQueue(resultOfMerge.prev)
      } else {
        // If prev does not exist then this is the new firstNode
        firstTokenNode = resultOfMerge
      }
      // Consider adding to merge queue: resultOfMerge--next
      if (resultOfMerge.next) {
        resultOfMerge.next.prev = resultOfMerge
        addToMergeQueue(resultOfMerge)
      }
    }
    // Get final tokenIds for this pretoken by traversing the linked list
    const mergedTokenIds = []
    for (let currTokenNode = firstTokenNode; currTokenNode !== null; currTokenNode = currTokenNode.next) {
      mergedTokenIds.push(currTokenNode.tokenId)
    }
    // Add to cache
    cache.set(pretoken, mergedTokenIds)
    // Add to output the tokenIds that correspond to this pretoken
    output.push(...mergedTokenIds)
  }
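The merge loop above is the standard BPE procedure, driven by a priority queue over a linked list so each merge costs roughly O(log n) instead of a full rescan of all adjacent pairs. A deliberately naive toy version of the same idea, with made-up merge ranks (lower rank merges first), is sketched below; it is not the library's code:

```js
// Toy BPE: repeatedly merge the adjacent pair with the lowest (best) merge rank.
function toyBpe(symbols, mergeRanks) {
  symbols = symbols.slice()
  while (true) {
    let best = null
    for (let i = 0; i < symbols.length - 1; i++) {
      const rank = mergeRanks.get(symbols[i] + ' ' + symbols[i + 1])
      if (rank !== undefined && (best === null || rank < best.rank)) {
        best = { i, rank, merged: symbols[i] + symbols[i + 1] }
      }
    }
    if (best === null) return symbols // no listed merge applies anymore
    symbols.splice(best.i, 2, best.merged)
  }
}

// Hypothetical merge ranks (lower = higher priority), keyed by "left right" pair.
const ranks = new Map([['l l', 0], ['he ll', 1], ['h e', 2], ['ll o', 3]])
console.log(toyBpe(['h', 'e', 'l', 'l', 'o'], ranks)) // [ 'hell', 'o' ]
```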

@@ -536,2 +546,15 @@ }

// This is named "optimistic" in the sense that it optimistically assumes that
// anything that looks like a special token is actually a special token (as opposed to normal text).
// Intent is to use this when working with various fine tunes which have modified special tokens.
optimisticCount(prompt) {
  const options = {
    bos: true,
    eos: true,
    specialTokenRegex: /<\|[a-zA-Z0-9_]+\|>/g,
    unexpectedSpecialTokenFallback: 1, // Don't care which token we map to, we only care about count
  }
  return this.encode(prompt, options).length
}
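For context on how this plays out in practice: the method is just `encode()` with a permissive `specialTokenRegex` and a fallback token id, so the two counts only diverge on markers that the stock vocabulary does not contain. In the sketch below, `<|tool_call|>` is a hypothetical fine-tune marker and the default-export object name is an assumption about the package's usual usage:

```js
import llama3Tokenizer from 'llama3-tokenizer-js'

const prompt = '<|tool_call|>{"name":"search"}<|eot_id|>'

// encode() only recognizes the stock special tokens, so <|tool_call|> is tokenized as ordinary text.
const strictCount = llama3Tokenizer.encode(prompt, { bos: false, eos: false }).length

// optimisticCount() assumes anything shaped like <|...|> is a single special token (and adds bos/eos).
const optimistic = llama3Tokenizer.optimisticCount(prompt)

console.log({ strictCount, optimistic })
```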
defaultTests(tokenizer) {

@@ -572,2 +595,11 @@

}
function testOptimisticCount(inputString, expectedCount) {
  let startTime = performance.now()
  const actualCount = tokenizer.optimisticCount(inputString)
  if (actualCount !== expectedCount) {
    throw Error(`Optimistic count test failed: expected ${expectedCount}, actual was: ${actualCount}`)
  }
  console.log(`#${testNum++} test successful (optimisticCount test, running time ${Math.round(10 * (performance.now() - startTime)) / 10} ms)`)
}

@@ -628,3 +660,3 @@ // Simple test case

// Test for regex errors in the regex that is used to split input by special tokens, it has complicated regex to validate that reserved special tokens only go from 0 to 250
- testEncodeAndDecode([
+ const stringsThatLookLikeSpecialTokens = [
// These are real special tokens

@@ -643,3 +675,3 @@ '<|reserved_special_token_0|>',

'<|reserved_special_token_250|>',
- // These are not real special tokens and should be processed as normal text
+ // These are not real special tokens and should be processed as normal text (unless calling optimisticCount)
'<|reserved_special_token_00|>',

@@ -650,5 +682,16 @@ '<|reserved_special_token_09|>',

'<|reserved_special_token_666|>',
].join(""),
]
testEncodeAndDecode(stringsThatLookLikeSpecialTokens.join(""),
[128002, 128014, 128015, 128024, 128058, 128104, 128105, 128183, 128204, 128205, 128254, 128255, 27, 91, 52202, 42729, 6594, 62, 410, 91, 1822, 91, 52202, 42729, 6594, 62, 2545, 91, 1822, 91, 52202, 42729, 6594, 62, 7755, 91, 1822, 91, 52202, 42729, 6594, 62, 13860, 91, 1822, 91, 52202, 42729, 6594, 62, 10943, 91, 29])
testOptimisticCount([
...stringsThatLookLikeSpecialTokens,
// These should be assumed to be real tokens
'<|new_tok|>',
'<|t|>',
// These should NOT be parsed as real tokens, but as normal text
'<||>',
'<|hello world|>',
'<|what!|>'
].join(""), 34)

@@ -655,0 +698,0 @@ console.log('LLaMA 3 Tokenizer tests passed successfully.')

Sorry, the diff of this file is too big to display
