gptoken - npm Package Compare versions

gptoken

Package Overview

Dependencies

Maintainers

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 0.0.1 to 0.0.3

bpe_data/encoderMapToTokens.js

171

Encoder.js



		const encoder = require("./bpe_data/encoder");
		const encoder = require("./bpe_data/encoderMapToTokens");

		@@ -8,2 +8,9 @@ // This file includes code which was modified from https://github.com/openai/gpt-2

		/**
		* Returns an array of consecutive integers starting at x and ending at y - 1
		* (x is included, y is excluded; callers pass `last + 1` to include `last`).
		*
		* @param {number} x - The starting number (included).
		* @param {number} y - The end of the range (excluded).
		* @returns {number[]} - The integers x, x + 1, ..., y - 1.
		*/
		const range = (x, y) => {
		@@ -14,2 +21,8 @@ const res = Array.from(Array(y).keys()).slice(x)

		/**
		* Returns the Unicode code point of the first character in a string.
		* In computer science, the term "ord" is short for "ordinal" or "order"
		* @param {string} x - The string to get the code point of.
		* @returns {number} - The Unicode code point of the first character in the string.
		*/
		const ord = x => {
		@@ -19,2 +32,8 @@ return x.charCodeAt(0)

		/**
		* Returns the character corresponding to a Unicode code point.
		* inverse of ord
		* @param {number} x - The Unicode code point to get the corresponding character for.
		* @returns {string} - The character corresponding to the given Unicode code point.
		*/
		const chr = x => {
		@@ -24,2 +43,8 @@ return String.fromCharCode(x)

		/**
		* Encodes a given string as an array of string representations of its UTF-8 encoded bytes.
		*
		* @param {string} str - The string to encode.
		* @returns {string[]} - An array of string representations of the UTF-8 encoded bytes of the input string.
		*/
		const encodeStr = str => {
		@@ -33,2 +58,8 @@ return Array.from(Buffer.from(str, 'utf-8')).map(x => x.toString());


		/**
		* Returns a mapping of byte values to their corresponding Unicode characters.
		*
		* @returns {Object.<number, string>} - A mapping of byte values to Unicode characters.
		*/
		function bytes_to_unicode() {
		@@ -56,2 +87,8 @@ const bs = range(ord('!'), ord('~') + 1).concat(range(ord('¡'), ord('¬') + 1), range(ord('®'), ord('ÿ') + 1))

		/**
		* Returns a set of all the pairs of adjacent characters in a given string.
		*
		* @param {string} word - The string to get pairs of adjacent characters from.
		* @returns {Set.<Array.<string>>} - A set of all the pairs of adjacent characters in the string.
		*/
		function get_pairs(word) {
		@@ -88,2 +125,15 @@ const pairs = new Set()


		/**
		* This code sets up a decoder object to convert encoded tokens back to their original string form,
		* by creating a mapping of the keys and values in the encoder object.
		* It also sets up a byte encoder and decoder object for converting bytes to unicode characters and vice versa.
		* Finally, it initializes a cache Map to store previously processed inputs for faster encoding.
		*
		* @member {Object} encoder - An object containing mappings of strings to numerical token values.
		* @member {Object} decoder - An object containing mappings of numerical token values to their original string form.
		* @member {Object} byte_encoder - An object containing mappings of byte values to their corresponding unicode characters.
		* @member {Object} byte_decoder - An object containing mappings of unicode characters to their corresponding byte values.
		* @member {Map} cache - A Map object used to cache previously processed inputs for faster encoding.
		*/
		const decoder = {}
		@@ -328,2 +378,103 @@ Object.keys(encoder).map(x => {


		/**
		* Module for encoding and decoding text using byte pair encoding (BPE).
		*
		* @module gptoken
		* @property {Function} encode - Function for encoding text using BPE.
		* @property {Function} decode - Function for decoding text using BPE.
		* @property {Function} countTokens - Function for counting the number of tokens in a BPE encoding.
		* @property {Function} tokenStats - Function for computing statistics on the tokens in a BPE encoding.
		* @property {Object} util - Utility functions used by the main functions.
		* @property {Function} util.ord - Function for getting the Unicode code point of a character.
		* @property {Function} util.chr - Function for getting the character corresponding to a Unicode code point.
		* @property {Function} util.bpe - Implements the Byte Pair Encoding (BPE) algorithm for subword tokenization.
		* @property {Function} util.range - Function for generating a range of numbers.
		* @property {RegExp} util.pat - Regular expression for matching token words in text.
		* @property {Function} util.get_pairs - Function for getting all pairs of adjacent characters in a string.
		* @property {function} encodeStr - Encodes a string as an array of UTF-8 byte values.
		* @property {function} decodeStr - Decodes an array of UTF-8 byte values as a string.
		* @property {Object} maps - Objects containing various maps used by the BPE algorithm.
		* @property {Object.<string, number>} maps.bpe_ranks - Object mapping BPE tokens to their ranks.
		* @property {Object.<string, number>} maps.encoder - Object mapping tokens to their integer representations.
		* @property {Object.<string, string>} maps.decoder - Object mapping integer representations to their tokens.
		* @property {Object.<string, string>} maps.byte_decoder - Object mapping Unicode characters to their corresponding byte values.
		* @property {Object.<number, string>} maps.byte_encoder - Object mapping byte values to their corresponding Unicode characters.
		*
		* @example
		* const gptoken = require('gptoken');
		*
		* const text = 'The quick brown fox jumps over the lazy dog';
		* const encoded = gptoken.encode(text);
		* console.log('Encoded:', encoded.slice(0, 5).concat(['...']).concat(encoded.slice(-5)));
		*
		* const decoded = gptoken.decode(encoded);
		* console.log('Decoded:', decoded);
		*
		* const count = gptoken.countTokens(text);
		* console.log('Token Count:', count);
		*
		* const stats = gptoken.tokenStats(encoded);
		* console.log('Token Stats:', JSON.stringify(stats, null, 4));
		*
		*
		* //OUTPUT
		* Encoded: [ 464, 2068, 7586, 21831, 18045, '...', 262, 16931, 3290 ]
		* Decoded: The quick brown fox jumps over the lazy dog
		* Token Count: 9
		* Token Stats: {
		* "count": 9,
		* "unique": 9,
		* "frequency": {
		* "262": 1,
		* "464": 1,
		* "625": 1,
		* "2068": 1,
		* "3290": 1,
		* "7586": 1,
		* "16931": 1,
		* "18045": 1,
		* "21831": 1
		* },
		* "positions": {
		* "262": [
		* 6
		* ],
		* "464": [
		* 0
		* ],
		* "625": [
		* 5
		* ],
		* "2068": [
		* 1
		* ],
		* "3290": [
		* 8
		* ],
		* "7586": [
		* 2
		* ],
		* "16931": [
		* 7
		* ],
		* "18045": [
		* 4
		* ],
		* "21831": [
		* 3
		* ]
		* },
		* "tokens": [
		* 464,
		* 2068,
		* 7586,
		* 21831,
		* 18045,
		* 625,
		* 262,
		* 16931,
		* 3290
		* ]
		* }
		*/
		module.exports = {
		@@ -333,3 +484,17 @@ encode,
		countTokens,
		tokenStats
		};
		tokenStats,
		util: {
		ord, chr, bpe, range, pat,
		get_pairs,
		encodeStr, decodeStr,
		},
		maps: {
		encoder,
		decoder,
		byte_decoder,
		byte_encoder,
		bpe_ranks,
		cache,
		}

		};

115

index.d.ts

		@@ -1,10 +0,77 @@
		// declare module "gpt-3-encoder"

		// declare module "gptoken" {
		//
		// export function encode(text: string): number[];
		//
		// export function decode(tokens: number[]): string;
		//
		// export function countTokens(text: string): number;
		//
		// export function tokenStats(input: string | number[]): TokenStats;
		//
		// export interface TokenStats {
		// count: number;
		// unique: number;
		// frequency: Record<string, number>;
		// positions: Record<string, number[]>;
		// tokens: string[];
		// }
		//
		// export interface Utils {
		// ord: (x: string) => number;
		// char: (x: number) => string;
		// bpe: number;
		// range: (start: number, end: number) => number[];
		// pat: RegExp;
		// get_pairs: (word: string) => Set<string[]>;
		// bpe_ranks: Record<string, number>;
		// }
		//
		// export const util: Utils;
		// }
		// declare module "@syonfox/gpt-3-encoder" GPToken
		// this doesn't seem to work :(
		// export function extendModule<T extends Record<string, any>>(moduleName: string, moduleToExtend: T): T {
		// return {
		// ...moduleToExtend,
		// __esModule: true,
		// default: moduleToExtend,
		// } as unknown as T & { default: T };
		// }






		/**
		* Module for encoding and decoding text using byte pair encoding (BPE).
		*
		* @module gptoken
		* @property {Function} encode - Function for encoding text using BPE.
		* @property {Function} decode - Function for decoding text using BPE.
		* @property {Function} countTokens - Function for counting the number of tokens in a BPE encoding.
		* @property {Function} tokenStats - Function for computing statistics on the tokens in a BPE encoding.
		* @property {Object} util - Utility functions used by the main functions.
		* @property {Function} util.ord - Function for getting the Unicode code point of a character.
		* @property {Function} util.chr - Function for getting the character corresponding to a Unicode code point.
		* @property {Function} util.bpe - Implements the Byte Pair Encoding (BPE) algorithm for subword tokenization.
		* @property {Function} util.range - Function for generating a range of numbers.
		* @property {RegExp} util.pat - Regular expression for matching token words in text.
		* @property {Function} util.get_pairs - Function for getting all pairs of adjacent characters in a string.
		* @property {function} encodeStr - Encodes a string as an array of UTF-8 byte values.
		* @property {function} decodeStr - Decodes an array of UTF-8 byte values as a string.
		* @property {Object} maps - Objects containing various maps used by the BPE algorithm.
		* @property {Object.<string, number>} maps.encoder - Object mapping characters to their BPE encodings.
		* @property {Object.<number, string>} maps.decoder - Object mapping BPE encodings to their characters.
		* @property {Object.<number, string>} maps.byte_decoder - Object mapping byte values to their Unicode characters.
		* @property {Object.<string, number>} maps.byte_encoder - Object mapping Unicode characters to their byte values.
		* @property {Object.<string, number>} maps.bpe_ranks - Object mapping BPE tokens to their ranks.
		* @property {Map} maps.cache - Map for caching BPE encodings.
		*/
		declare module "gptoken" {

		/**
		 * Encodes text using byte pair encoding (BPE).
		 *
		 * @param text - The text to encode.
		 * @returns The encoding as an array of numeric token ids.
		 */
		export function encode(text: string): number[];

		/**
		 * Decodes a BPE encoding back into text.
		 *
		 * @param tokens - Numeric token ids, as declared for the result of `encode`.
		 * @returns The decoded text.
		 */
		export function decode(tokens: number[]): string;

		/**
		 * Counts the number of tokens in the BPE encoding of a text.
		 *
		 * @param text - The text whose tokens are counted.
		 * @returns The token count.
		 */
		export function countTokens(text: string): number;

		/**
		 * Computes statistics on the tokens in a BPE encoding.
		 *
		 * @param input - Either raw text or an array of numeric token ids
		 *   (the declared parameter type is the union of the two).
		 * @returns A `TokenStats` object describing the tokens.
		 */
		export function tokenStats(input: string | number[]): TokenStats;
		@@ -19,13 +86,29 @@
		}
		/**
		 * Type of the `util` export: the helper functions and the token regular
		 * expression that the module exposes alongside its main functions.
		 *
		 * NOTE(review): some members are declared differently from what the
		 * Encoder.js JSDoc and hunk-context lines in this diff show. Each such case
		 * is flagged on the member below — confirm against the implementation.
		 */
		export interface Utils {
		/** Numeric code of a character. Encoder.js shows `x.charCodeAt(0)`, so only the first character is read. */
		ord: (char: string) => number;
		/** Inverse of `ord`. Encoder.js shows `String.fromCharCode(x)`. */
		chr: (code: number) => string;
		/** BPE routine. NOTE(review): the implementation is not visible here, so this signature and `BpeOptions` are unverified — confirm. */
		bpe: (codes: number[], options?: BpeOptions) => number[];
		/** NOTE(review): Encoder.js declares `range = (x, y) =>` with no third parameter, and its body `Array.from(Array(y).keys()).slice(x)` stops at `end - 1` — confirm that `step` is supported. */
		range: (start: number, end: number, step?: number) => number[];
		/** Regular expression for matching token words in text. */
		pat: RegExp;
		/** NOTE(review): Encoder.js documents the result as a Set of adjacent-character pairs and builds it with `new Set()`, not `string[]` — confirm. */
		get_pairs: (word: string) => string[];
		/** NOTE(review): Encoder.js returns `Array.from(Buffer.from(str, 'utf-8')).map(x => x.toString())`, i.e. byte values as strings, not `number[]` — confirm. */
		encodeStr: (str: string) => number[];
		/** Decodes an array of UTF-8 byte values as a string (per the module JSDoc); the implementation is not visible here. */
		decodeStr: (arr: number[]) => string;
		}

		}
		// declare module "@syonfox/gpt-3-encoder" GPToken
		// this doesn't seem to work :(
		// export function extendModule<T extends Record<string, any>>(moduleName: string, moduleToExtend: T): T {
		// return {
		// ...moduleToExtend,
		// __esModule: true,
		// default: moduleToExtend,
		// } as unknown as T & { default: T };
		// }
		/** Utility helpers exported by the module; the members are described by the `Utils` interface. */
		export const util: Utils;

		/**
		 * Lookup tables exported by the module under `maps`.
		 *
		 * NOTE(review): the member documentation in Encoder.js describes
		 * `byte_encoder` as byte value -> Unicode character and `byte_decoder` as
		 * Unicode character -> byte value. That is the opposite key/value direction
		 * from the types declared here — confirm against the implementation.
		 */
		export const maps: {
		/** Token string -> numeric token id. */
		encoder: Record<string, number>;
		/** Numeric token id -> token string. */
		decoder: Record<number, string>;
		/** Declared as numeric key -> string value; see the review note above. */
		byte_decoder: Record<number, string>;
		/** Declared as string key -> numeric value; see the review note above. */
		byte_encoder: Record<string, number>;
		/** BPE token -> rank. */
		bpe_ranks: Record<string, number>;
		/** Cache of previously processed inputs, kept for faster encoding. */
		cache: Map<string, number[]>;
		};
		/**
		 * Options bag declared as the optional second argument of `Utils.bpe`.
		 * Every field is optional.
		 *
		 * NOTE(review): no code visible in this diff reads these fields, so their
		 * meaning and defaults are unverified — confirm they are consumed by the
		 * implementation before relying on them.
		 */
		export interface BpeOptions {
		isDict?: boolean;
		vocab?: Record<string, number>;
		merges?: Record<string, string>;
		separator?: string;
		}
		}

index.js

		@@ -1,2 +0,2 @@
		const { encode, decode, countTokens, tokenStats } = require("./Encoder");
		const { encode, decode, countTokens, tokenStats, util, maps } = require("./Encoder");

		@@ -7,3 +7,5 @@ module.exports = {
		countTokens,
		tokenStats
		tokenStats,
		util,
		maps
		};

package.json

		{
		"name": "gptoken",
		"version": "0.0.1",
		"version": "0.0.3",
		"description": "Javascript BPE Encoder Decoder for GPT-2 / GPT-3. The \"gpt-3-encoder\" module provides functions for encoding and decoding text using the Byte Pair Encoding (BPE) algorithm. It can be used to process text data for input into machine learning models, or to convert tokenized text back into human-readable format. It also includes functions for counting tokens in a given text and generating statistics about the tokens in a string or array.",
		@@ -11,3 +11,3 @@ "main": "index.js",
		"Encoder.js",
		"bpe_data/encoder.js",
		"bpe_data/encoderMapToTokens.js",
		"bpe_data/bpe_ranks.js",
		@@ -14,0 +14,0 @@ "browser.js",

bpe_data/encoder.js

browser.html

Sorry, the diff of this file is not supported yet

browser.js

Sorry, the diff of this file is too big to display

Improved metrics