@lenml/tokenizers
Advanced tools
+10
-1
@@ -173,2 +173,6 @@ /** | ||
| unk_token_id: number | undefined; | ||
| bos_token: string | null; | ||
| bos_token_id: number | undefined; | ||
| eos_token: string | null; | ||
| eos_token_id: number | undefined; | ||
| model_max_length: any; | ||
@@ -689,2 +693,4 @@ /** @type {boolean} Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). */ | ||
| } | ||
| declare class MgpstrTokenizer extends PreTrainedTokenizer { | ||
| } | ||
| /** | ||
@@ -743,2 +749,3 @@ * Helper class which is used to instantiate pretrained tokenizers with the `from_pretrained` function. | ||
| CohereTokenizer: typeof CohereTokenizer; | ||
| MgpstrTokenizer: typeof MgpstrTokenizer; | ||
| PreTrainedTokenizer: typeof PreTrainedTokenizer; | ||
@@ -1109,2 +1116,4 @@ }; | ||
| type tokenizers_Message = Message; | ||
| type tokenizers_MgpstrTokenizer = MgpstrTokenizer; | ||
| declare const tokenizers_MgpstrTokenizer: typeof MgpstrTokenizer; | ||
| type tokenizers_MobileBertTokenizer = MobileBertTokenizer; | ||
@@ -1150,3 +1159,3 @@ declare const tokenizers_MobileBertTokenizer: typeof MobileBertTokenizer; | ||
| declare namespace tokenizers { | ||
| export { tokenizers_AlbertTokenizer as AlbertTokenizer, tokenizers_AutoTokenizer as AutoTokenizer, type tokenizers_BPENode as BPENode, tokenizers_BartTokenizer as BartTokenizer, tokenizers_BertTokenizer as BertTokenizer, tokenizers_BlenderbotSmallTokenizer as BlenderbotSmallTokenizer, tokenizers_BlenderbotTokenizer as BlenderbotTokenizer, tokenizers_BloomTokenizer as BloomTokenizer, tokenizers_CLIPTokenizer as CLIPTokenizer, tokenizers_CamembertTokenizer as CamembertTokenizer, tokenizers_CodeGenTokenizer as CodeGenTokenizer, tokenizers_CodeLlamaTokenizer as CodeLlamaTokenizer, tokenizers_CohereTokenizer as CohereTokenizer, tokenizers_ConvBertTokenizer as ConvBertTokenizer, tokenizers_DebertaTokenizer as DebertaTokenizer, tokenizers_DebertaV2Tokenizer as DebertaV2Tokenizer, tokenizers_DistilBertTokenizer as DistilBertTokenizer, tokenizers_ElectraTokenizer as ElectraTokenizer, type tokenizers_EncodingSingle as EncodingSingle, tokenizers_EsmTokenizer as EsmTokenizer, tokenizers_FalconTokenizer as FalconTokenizer, tokenizers_GPT2Tokenizer as GPT2Tokenizer, tokenizers_GPTNeoXTokenizer as GPTNeoXTokenizer, tokenizers_GemmaTokenizer as GemmaTokenizer, tokenizers_Grok1Tokenizer as Grok1Tokenizer, tokenizers_HerbertTokenizer as HerbertTokenizer, tokenizers_LlamaTokenizer as LlamaTokenizer, tokenizers_M2M100Tokenizer as M2M100Tokenizer, tokenizers_MBart50Tokenizer as MBart50Tokenizer, tokenizers_MBartTokenizer as MBartTokenizer, tokenizers_MPNetTokenizer as MPNetTokenizer, tokenizers_MarianTokenizer as MarianTokenizer, type tokenizers_Message as Message, tokenizers_MobileBertTokenizer as MobileBertTokenizer, tokenizers_NllbTokenizer as NllbTokenizer, tokenizers_NougatTokenizer as NougatTokenizer, type tokenizers_PostProcessedOutput as PostProcessedOutput, tokenizers_PreTrainedTokenizer as PreTrainedTokenizer, type tokenizers_PretrainedTokenizerOptions as PretrainedTokenizerOptions, tokenizers_Qwen2Tokenizer as Qwen2Tokenizer, tokenizers_RoFormerTokenizer as 
RoFormerTokenizer, tokenizers_RobertaTokenizer as RobertaTokenizer, tokenizers_SiglipTokenizer as SiglipTokenizer, tokenizers_SpeechT5Tokenizer as SpeechT5Tokenizer, type tokenizers_SplitDelimiterBehavior as SplitDelimiterBehavior, tokenizers_SqueezeBertTokenizer as SqueezeBertTokenizer, tokenizers_T5Tokenizer as T5Tokenizer, tokenizers_TokenizerModel as TokenizerModel, type tokenizers_TokenizerProperties as TokenizerProperties, tokenizers_VitsTokenizer as VitsTokenizer, tokenizers_Wav2Vec2CTCTokenizer as Wav2Vec2CTCTokenizer, tokenizers_WhisperTokenizer as WhisperTokenizer, tokenizers_XLMRobertaTokenizer as XLMRobertaTokenizer, tokenizers_XLMTokenizer as XLMTokenizer, tokenizers_is_chinese_char as is_chinese_char }; | ||
| export { tokenizers_AlbertTokenizer as AlbertTokenizer, tokenizers_AutoTokenizer as AutoTokenizer, type tokenizers_BPENode as BPENode, tokenizers_BartTokenizer as BartTokenizer, tokenizers_BertTokenizer as BertTokenizer, tokenizers_BlenderbotSmallTokenizer as BlenderbotSmallTokenizer, tokenizers_BlenderbotTokenizer as BlenderbotTokenizer, tokenizers_BloomTokenizer as BloomTokenizer, tokenizers_CLIPTokenizer as CLIPTokenizer, tokenizers_CamembertTokenizer as CamembertTokenizer, tokenizers_CodeGenTokenizer as CodeGenTokenizer, tokenizers_CodeLlamaTokenizer as CodeLlamaTokenizer, tokenizers_CohereTokenizer as CohereTokenizer, tokenizers_ConvBertTokenizer as ConvBertTokenizer, tokenizers_DebertaTokenizer as DebertaTokenizer, tokenizers_DebertaV2Tokenizer as DebertaV2Tokenizer, tokenizers_DistilBertTokenizer as DistilBertTokenizer, tokenizers_ElectraTokenizer as ElectraTokenizer, type tokenizers_EncodingSingle as EncodingSingle, tokenizers_EsmTokenizer as EsmTokenizer, tokenizers_FalconTokenizer as FalconTokenizer, tokenizers_GPT2Tokenizer as GPT2Tokenizer, tokenizers_GPTNeoXTokenizer as GPTNeoXTokenizer, tokenizers_GemmaTokenizer as GemmaTokenizer, tokenizers_Grok1Tokenizer as Grok1Tokenizer, tokenizers_HerbertTokenizer as HerbertTokenizer, tokenizers_LlamaTokenizer as LlamaTokenizer, tokenizers_M2M100Tokenizer as M2M100Tokenizer, tokenizers_MBart50Tokenizer as MBart50Tokenizer, tokenizers_MBartTokenizer as MBartTokenizer, tokenizers_MPNetTokenizer as MPNetTokenizer, tokenizers_MarianTokenizer as MarianTokenizer, type tokenizers_Message as Message, tokenizers_MgpstrTokenizer as MgpstrTokenizer, tokenizers_MobileBertTokenizer as MobileBertTokenizer, tokenizers_NllbTokenizer as NllbTokenizer, tokenizers_NougatTokenizer as NougatTokenizer, type tokenizers_PostProcessedOutput as PostProcessedOutput, tokenizers_PreTrainedTokenizer as PreTrainedTokenizer, type tokenizers_PretrainedTokenizerOptions as PretrainedTokenizerOptions, tokenizers_Qwen2Tokenizer as Qwen2Tokenizer, 
tokenizers_RoFormerTokenizer as RoFormerTokenizer, tokenizers_RobertaTokenizer as RobertaTokenizer, tokenizers_SiglipTokenizer as SiglipTokenizer, tokenizers_SpeechT5Tokenizer as SpeechT5Tokenizer, type tokenizers_SplitDelimiterBehavior as SplitDelimiterBehavior, tokenizers_SqueezeBertTokenizer as SqueezeBertTokenizer, tokenizers_T5Tokenizer as T5Tokenizer, tokenizers_TokenizerModel as TokenizerModel, type tokenizers_TokenizerProperties as TokenizerProperties, tokenizers_VitsTokenizer as VitsTokenizer, tokenizers_Wav2Vec2CTCTokenizer as Wav2Vec2CTCTokenizer, tokenizers_WhisperTokenizer as WhisperTokenizer, tokenizers_XLMRobertaTokenizer as XLMRobertaTokenizer, tokenizers_XLMTokenizer as XLMTokenizer, tokenizers_is_chinese_char as is_chinese_char }; | ||
| } | ||
@@ -1153,0 +1162,0 @@ |
+1
-1
| { | ||
| "name": "@lenml/tokenizers", | ||
| "version": "3.0.3", | ||
| "version": "3.4.0", | ||
| "description": "a lightweight no-dependency fork of transformers.js (only tokenizers)", | ||
@@ -5,0 +5,0 @@ "source": "src/main.ts", |
+7
-1
@@ -45,5 +45,11 @@ # @lenml/tokenizers | ||
| import { TokenizerLoader } from "@lenml/tokenizers"; | ||
| const tokenizer = await TokenizerLoader.fromPreTrainedUrls({ | ||
| const sourceUrls = { | ||
| tokenizerJSON: "https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1/resolve/main/tokenizer.json?download=true", | ||
| tokenizerConfig: "https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1/resolve/main/tokenizer_config.json?download=true" | ||
| } | ||
| const tokenizer = await TokenizerLoader.fromPreTrainedUrls(sourceUrls); | ||
| // or from fetch | ||
| const tokenizer = TokenizerLoader.fromPreTrained({ | ||
| tokenizerJSON: await fetch(sourceUrls.tokenizerJSON).then(r => r.json()), | ||
| tokenizerConfig: await fetch(sourceUrls.tokenizerConfig).then(r => r.json()) | ||
| }); | ||
@@ -50,0 +56,0 @@ ``` |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
URL strings
Supply chain risk: Package contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
2091973
0.55%8610
0.5%127
4.96%