embeddings-splitter
Advanced tools
import { TiktokenEmbedding } from '@dqbd/tiktoken';
export declare function getChunksByMaxToken(text: string, callback: (chunk: string) => void, { maxTokens, encoding_name }: {
    maxTokens?: number;
    encoding_name?: TiktokenEmbedding;
}): Promise<void>;
export declare function splitText(text: string, { maxTokens, chunkOverlap, encodingName, }: {
    maxTokens?: number;
    chunkOverlap?: number;
    encodingName?: TiktokenEmbedding;
}, callback?: (chunk: string) => void): string[];
| "use strict"; | ||
// TypeScript-emitted helper: runs a generator function as an async function,
// resolving the returned Promise when the generator completes.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap a yielded value in the Promise implementation P unless it already is one.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the awaited value, or throw into it on rejection.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Drive the generator: settle on completion, otherwise await the yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByMaxToken = void 0; | ||
| exports.splitText = void 0; | ||
| const tiktoken_1 = require("@dqbd/tiktoken"); | ||
// Maximum token context accepted by the OpenAI embeddings endpoint.
const EMBEDDING_CTX_LENGTH = 8191;
// Default maximum tokens per chunk for splitText.
const MAX_CHUNK_LENGTH = 8191;
// Default tiktoken encoding used by OpenAI embedding models.
const EMBEDDING_ENCODING = 'cl100k_base';
// Default overlap (in tokens) between consecutive chunks.
const CHUNK_OVERLAP = 0;
/**
 * Batch data into arrays of length n. The last batch may be shorter.
 * Throws if n < 1.
 */
function* batched(iterable, n) {
    if (n < 1) {
        throw new Error('n must be at least one');
    }
    const it = iterable[Symbol.iterator]();
    while (true) {
        const batch = [...Array(n)].map(() => it.next().value).filter((x) => x !== undefined);
        if (batch.length === 0) {
            break;
        }
        yield batch;
    }
}
/**
 * Splits text into chunks of at most maxTokens tokens, with chunkOverlap
 * tokens shared between consecutive chunks. Invokes the optional callback
 * with each chunk as it is produced and returns the full list of chunks.
 * Throws when chunkOverlap >= maxTokens (the loop could not advance).
 */
function splitText(text, { maxTokens = MAX_CHUNK_LENGTH, chunkOverlap = CHUNK_OVERLAP, encodingName = EMBEDDING_ENCODING, }, callback) {
    if (chunkOverlap >= maxTokens) {
        throw new Error('Cannot have chunkOverlap >= chunkSize');
    }
    const tokenizer = (0, tiktoken_1.get_encoding)(encodingName);
    try {
        const input_ids = tokenizer.encode(text);
        const chunkSize = maxTokens;
        const decoder = new TextDecoder();
        const chunks = [];
        let start_idx = 0;
        let cur_idx = Math.min(start_idx + chunkSize, input_ids.length);
        let chunk_ids = input_ids.slice(start_idx, cur_idx);
        while (start_idx < input_ids.length) {
            const chunk = decoder.decode(tokenizer.decode(chunk_ids));
            // Advance by chunkSize - chunkOverlap so consecutive chunks share
            // chunkOverlap tokens; guaranteed positive by the guard above.
            start_idx += chunkSize - chunkOverlap;
            cur_idx = Math.min(start_idx + chunkSize, input_ids.length);
            chunk_ids = input_ids.slice(start_idx, cur_idx);
            chunks.push(chunk);
            callback && callback(chunk);
        }
        return chunks;
    }
    finally {
        // Free the wasm-backed tokenizer even if encode/decode throws.
        tokenizer.free();
    }
}
/**
 * Generator yielding batches of at most chunk_length tokens from text,
 * encoded with the given tiktoken encoding name.
 */
function* chunked_tokens(text, encoding_name, chunk_length) {
    const encoding = (0, tiktoken_1.get_encoding)(encoding_name);
    try {
        const tokens = encoding.encode(text);
        yield* batched(tokens, chunk_length);
    }
    finally {
        // Release the wasm-backed encoder even if the consumer abandons the generator.
        encoding.free();
    }
}
/**
 * Splits text into chunks of at most maxTokens tokens and invokes callback
 * with each decoded chunk string.
 */
function getChunksByMaxToken(text, callback, { maxTokens = EMBEDDING_CTX_LENGTH, encoding_name = EMBEDDING_ENCODING }) {
    return __awaiter(this, void 0, void 0, function* () {
        // Reuse a single encoder/decoder instead of re-creating an encoding on
        // every loop iteration (the originals were never freed — wasm leak).
        const enc = (0, tiktoken_1.get_encoding)(encoding_name);
        const decoder = new TextDecoder();
        try {
            for (const chunk of chunked_tokens(text, encoding_name, maxTokens)) {
                const _chunk = decoder.decode(enc.decode(chunk));
                callback(_chunk);
            }
        }
        finally {
            enc.free();
        }
        // removing for now but would be cool to add it as a separate function
        // if (average) {
        //   let chunk_embeddings_array = np.array(chunk_embeddings);
        //   chunk_embeddings_array = np.average(chunk_embeddings_array, (axis = 0), (weights = chunk_lens));
        //   chunk_embeddings_array = chunk_embeddings_array / np.linalg.norm(chunk_embeddings_array); // normalizes length to 1
        //   chunk_embeddings_array = chunk_embeddings_array.tolist();
        //   return chunk_embeddings_array;
        // }
    });
}
| exports.getChunksByMaxToken = getChunksByMaxToken; | ||
| exports.splitText = splitText; | ||
// removing for now but would be cool to add it as a separate function
| // if (average) { | ||
| // let chunk_embeddings_array = np.array(chunk_embeddings); | ||
| // chunk_embeddings_array = np.average(chunk_embeddings_array, (axis = 0), (weights = chunk_lens)); | ||
| // chunk_embeddings_array = chunk_embeddings_array / np.linalg.norm(chunk_embeddings_array); // normalizes length to 1 | ||
| // chunk_embeddings_array = chunk_embeddings_array.tolist(); | ||
| // return chunk_embeddings_array; | ||
| // } |
+2
-2
| import { getChunksSimple } from './helpers/getChunksSimple'; | ||
| import { getChunksByMaxToken } from './helpers/getChunksByMaxToken'; | ||
| import { splitText } from './helpers/getChunksByMaxToken'; | ||
| import { getChunksByNewLine } from './helpers/getChunksByNewLine'; | ||
| import { getChunksByPython } from './helpers/getChunksByPython'; | ||
| import { getAllFilesFromGithubRepo } from './helpers/github'; | ||
export { getChunksSimple, getChunksByMaxToken, splitText, getChunksByNewLine, getChunksByPython, getAllFilesFromGithubRepo };
+2
-2
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.getChunksByMaxToken = exports.getChunksSimple = void 0; | ||
| exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.splitText = exports.getChunksSimple = void 0; | ||
| const getChunksSimple_1 = require("./helpers/getChunksSimple"); | ||
| Object.defineProperty(exports, "getChunksSimple", { enumerable: true, get: function () { return getChunksSimple_1.getChunksSimple; } }); | ||
| const getChunksByMaxToken_1 = require("./helpers/getChunksByMaxToken"); | ||
| Object.defineProperty(exports, "getChunksByMaxToken", { enumerable: true, get: function () { return getChunksByMaxToken_1.getChunksByMaxToken; } }); | ||
| Object.defineProperty(exports, "splitText", { enumerable: true, get: function () { return getChunksByMaxToken_1.splitText; } }); | ||
| const getChunksByNewLine_1 = require("./helpers/getChunksByNewLine"); | ||
Object.defineProperty(exports, "getChunksByNewLine", { enumerable: true, get: function () { return getChunksByNewLine_1.getChunksByNewLine; } });
+3
-2
| { | ||
| "name": "embeddings-splitter", | ||
| "version": "0.2.0", | ||
| "version": "0.2.1", | ||
| "description": "A typescript library to split your long texts into smaller chunks to send them to OpenAI Embeddings API", | ||
@@ -9,6 +9,7 @@ "main": "lib/index.js", | ||
| "build": "tsc", | ||
| "format": "prettier --write \"src/**/*.(js|ts)\"", | ||
| "lint": "eslint src --ext .js,.ts", | ||
| "lint:fix": "eslint src --fix --ext .js,.ts", | ||
| "test": "jest --config jest.config.js", | ||
| "test": "jest --runInBand --config jest.config.js --silent=false", | ||
| "test:watch": "jest --config jest.config.js --watch", | ||
@@ -15,0 +16,0 @@ "prepare": "npm run build", |
Network access
Supply chain riskThis module accesses the network.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Network access
Supply chain riskThis module accesses the network.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
220954
-0.43%1928
-0.72%