embeddings-splitter
Advanced tools
import { TiktokenEmbedding } from '@dqbd/tiktoken';
export declare function getChunksByMaxToken(text: string, callback: (chunk: string) => void, { maxTokens, encoding_name }: {
    maxTokens?: number;
    encoding_name?: TiktokenEmbedding;
}): Promise<void>;
export declare function splitText(text: string, { maxTokens, chunkOverlap, encodingName, }: {
    maxTokens?: number;
    chunkOverlap?: number;
    encodingName?: TiktokenEmbedding;
}, callback?: (chunk: string) => void): string[];
| "use strict"; | ||
// TypeScript-emitted helper: runs a generator function as an async function,
// resolving the returned Promise when the generator completes.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in the Promise implementation P unless it already is one.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the awaited value, or throw into it on rejection.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Drive the generator: settle on completion, otherwise await the yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByMaxToken = void 0; | ||
| exports.splitText = void 0; | ||
| const tiktoken_1 = require("@dqbd/tiktoken"); | ||
// Maximum token context accepted by the OpenAI embeddings endpoint.
const EMBEDDING_CTX_LENGTH = 8191;
// Default maximum tokens per chunk for splitText.
const MAX_CHUNK_LENGTH = 8191;
// Default tiktoken encoding used by OpenAI embedding models.
const EMBEDDING_ENCODING = 'cl100k_base';
// Default overlap (in tokens) between consecutive chunks.
const CHUNK_OVERLAP = 0;
/**
 * Batch data into arrays of length n. The last batch may be shorter.
 * Throws if n < 1.
 */
function* batched(iterable, n) {
    if (n < 1) {
        throw new Error('n must be at least one');
    }
    const it = iterable[Symbol.iterator]();
    while (true) {
        const batch = [...Array(n)].map(() => it.next().value).filter((x) => x !== undefined);
        if (batch.length === 0) {
            break;
        }
        yield batch;
    }
}
/**
 * Splits text into chunks of at most maxTokens tokens, with chunkOverlap
 * tokens shared between consecutive chunks. Invokes the optional callback
 * with each chunk as it is produced and returns the full list of chunks.
 * Throws when chunkOverlap >= maxTokens (the loop could not advance).
 */
function splitText(text, { maxTokens = MAX_CHUNK_LENGTH, chunkOverlap = CHUNK_OVERLAP, encodingName = EMBEDDING_ENCODING, }, callback) {
    if (chunkOverlap >= maxTokens) {
        throw new Error('Cannot have chunkOverlap >= chunkSize');
    }
    const tokenizer = (0, tiktoken_1.get_encoding)(encodingName);
    try {
        const input_ids = tokenizer.encode(text);
        const chunkSize = maxTokens;
        const decoder = new TextDecoder();
        const chunks = [];
        let start_idx = 0;
        let cur_idx = Math.min(start_idx + chunkSize, input_ids.length);
        let chunk_ids = input_ids.slice(start_idx, cur_idx);
        while (start_idx < input_ids.length) {
            const chunk = decoder.decode(tokenizer.decode(chunk_ids));
            // Advance by chunkSize - chunkOverlap so consecutive chunks share
            // chunkOverlap tokens; guaranteed positive by the guard above.
            start_idx += chunkSize - chunkOverlap;
            cur_idx = Math.min(start_idx + chunkSize, input_ids.length);
            chunk_ids = input_ids.slice(start_idx, cur_idx);
            chunks.push(chunk);
            callback && callback(chunk);
        }
        return chunks;
    }
    finally {
        // Free the wasm-backed tokenizer even if encode/decode throws.
        tokenizer.free();
    }
}
/**
 * Generator yielding batches of at most chunk_length tokens from text,
 * encoded with the given tiktoken encoding name.
 */
function* chunked_tokens(text, encoding_name, chunk_length) {
    const encoding = (0, tiktoken_1.get_encoding)(encoding_name);
    try {
        const tokens = encoding.encode(text);
        yield* batched(tokens, chunk_length);
    }
    finally {
        // Release the wasm-backed encoder even if the consumer abandons the generator.
        encoding.free();
    }
}
/**
 * Splits text into chunks of at most maxTokens tokens and invokes callback
 * with each decoded chunk string.
 */
function getChunksByMaxToken(text, callback, { maxTokens = EMBEDDING_CTX_LENGTH, encoding_name = EMBEDDING_ENCODING }) {
    return __awaiter(this, void 0, void 0, function* () {
        // Reuse a single encoder/decoder instead of re-creating an encoding on
        // every loop iteration (the originals were never freed — wasm leak).
        const enc = (0, tiktoken_1.get_encoding)(encoding_name);
        const decoder = new TextDecoder();
        try {
            for (const chunk of chunked_tokens(text, encoding_name, maxTokens)) {
                const _chunk = decoder.decode(enc.decode(chunk));
                callback(_chunk);
            }
        }
        finally {
            enc.free();
        }
        // removing for now but would be cool to add it as a separate function
        // if (average) {
        //   let chunk_embeddings_array = np.array(chunk_embeddings);
        //   chunk_embeddings_array = np.average(chunk_embeddings_array, (axis = 0), (weights = chunk_lens));
        //   chunk_embeddings_array = chunk_embeddings_array / np.linalg.norm(chunk_embeddings_array); // normalizes length to 1
        //   chunk_embeddings_array = chunk_embeddings_array.tolist();
        //   return chunk_embeddings_array;
        // }
    });
}
| exports.getChunksByMaxToken = getChunksByMaxToken; | ||
| exports.splitText = splitText; | ||
// removing for now but would be cool to add it as a separate function
| // if (average) { | ||
| // let chunk_embeddings_array = np.array(chunk_embeddings); | ||
| // chunk_embeddings_array = np.average(chunk_embeddings_array, (axis = 0), (weights = chunk_lens)); | ||
| // chunk_embeddings_array = chunk_embeddings_array / np.linalg.norm(chunk_embeddings_array); // normalizes length to 1 | ||
| // chunk_embeddings_array = chunk_embeddings_array.tolist(); | ||
| // return chunk_embeddings_array; | ||
| // } |
+2
-2
| import { getChunksSimple } from './helpers/getChunksSimple'; | ||
| import { getChunksByMaxToken } from './helpers/getChunksByMaxToken'; | ||
| import { splitText } from './helpers/getChunksByMaxToken'; | ||
| import { getChunksByNewLine } from './helpers/getChunksByNewLine'; | ||
| import { getChunksByPython } from './helpers/getChunksByPython'; | ||
| import { getAllFilesFromGithubRepo } from './helpers/github'; | ||
export { getChunksSimple, getChunksByMaxToken, splitText, getChunksByNewLine, getChunksByPython, getAllFilesFromGithubRepo };
+2
-2
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.getChunksByMaxToken = exports.getChunksSimple = void 0; | ||
| exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.splitText = exports.getChunksSimple = void 0; | ||
| const getChunksSimple_1 = require("./helpers/getChunksSimple"); | ||
| Object.defineProperty(exports, "getChunksSimple", { enumerable: true, get: function () { return getChunksSimple_1.getChunksSimple; } }); | ||
| const getChunksByMaxToken_1 = require("./helpers/getChunksByMaxToken"); | ||
| Object.defineProperty(exports, "getChunksByMaxToken", { enumerable: true, get: function () { return getChunksByMaxToken_1.getChunksByMaxToken; } }); | ||
| Object.defineProperty(exports, "splitText", { enumerable: true, get: function () { return getChunksByMaxToken_1.splitText; } }); | ||
| const getChunksByNewLine_1 = require("./helpers/getChunksByNewLine"); | ||
Object.defineProperty(exports, "getChunksByNewLine", { enumerable: true, get: function () { return getChunksByNewLine_1.getChunksByNewLine; } });
+3
-2
| { | ||
| "name": "embeddings-splitter", | ||
| "version": "0.2.0", | ||
| "version": "0.2.1", | ||
| "description": "A typescript library to split your long texts into smaller chunks to send them to OpenAI Embeddings API", | ||
@@ -9,6 +9,7 @@ "main": "lib/index.js", | ||
| "build": "tsc", | ||
| "format": "prettier --write \"src/**/*.(js|ts)\"", | ||
| "lint": "eslint src --ext .js,.ts", | ||
| "lint:fix": "eslint src --fix --ext .js,.ts", | ||
| "test": "jest --config jest.config.js", | ||
| "test": "jest --runInBand --config jest.config.js --silent=false", | ||
| "test:watch": "jest --config jest.config.js --watch", | ||
@@ -15,0 +16,0 @@ "prepare": "npm run build", |
Network access
Supply chain riskThis module accesses the network.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Network access
Supply chain riskThis module accesses the network.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
220954
-0.43%1928
-0.72%