You're Invited: Meet the Socket Team at RSAC and BSidesSF 2026, March 23–26. RSVP
Socket
Book a DemoSign in
Socket

embeddings-splitter

Package Overview
Dependencies
Maintainers
1
Versions
7
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

embeddings-splitter - npm Package Compare versions

Comparing version
0.2.0
to
0.2.1
+4
-3
lib/helpers/getChunksByMaxToken.d.ts
import { TiktokenEmbedding } from '@dqbd/tiktoken';
export declare function getChunksByMaxToken(text: string, callback: (chunk: string) => void, { maxTokens, encoding_name }: {
export declare function splitText(text: string, { maxTokens, chunkOverlap, encodingName, }: {
maxTokens?: number;
encoding_name?: TiktokenEmbedding;
}): Promise<void>;
chunkOverlap?: number;
encodingName?: TiktokenEmbedding;
}, callback?: (chunk: string) => void): string[];
"use strict";
// Standard TypeScript down-level emit helper for `async`/`await` when
// targeting pre-ES2017: it drives a generator produced from an async function
// body, adopting each awaited value into a Promise and settling the outer
// Promise when the generator completes or throws.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a plain awaited value in a Promise so `.then` is always available.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the resolved value of the previous `await`.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-throw a rejected `await` inside the generator so user `try/catch` sees it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Either finish (generator done) or chain onto the next awaited value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
// NOTE(review): both export initializers below appear because this text is a
// 0.2.0 -> 0.2.1 diff view (getChunksByMaxToken was the old export, splitText
// the new one); a single compiled version would declare only its own.
exports.getChunksByMaxToken = void 0;
exports.splitText = void 0;
const tiktoken_1 = require("@dqbd/tiktoken");
// 8191 appears to be the embedding-model context limit implied by the
// constant names (cl100k_base embedding input limit) — TODO confirm against
// the embedding model actually used.
const EMBEDDING_CTX_LENGTH = 8191;
const MAX_CHUNK_LENGTH = 8191;
// Default BPE encoding name passed to tiktoken's get_encoding.
const EMBEDDING_ENCODING = 'cl100k_base';
/**
 * Batch `iterable` into arrays of length `n`; the last batch may be shorter.
 *
 * NOTE(review): items that are literally `undefined` are dropped, because
 * `undefined` is also how iterator exhaustion is detected here — confirm the
 * token streams fed in never contain `undefined`.
 *
 * @param {Iterable<*>} iterable - Source of items to batch.
 * @param {number} n - Maximum batch size; must be >= 1.
 * @throws {Error} If `n` is less than one.
 */
function* batched(iterable, n) {
    /* Batch data into tuples of length n. The last batch may be shorter. */
    if (n < 1) {
        throw new Error('n must be at least one');
    }
    const it = iterable[Symbol.iterator]();
    while (true) {
        // Pull up to n items; exhausted iterators yield `undefined`, which is
        // filtered out so the final batch is simply shorter.
        const batch = [...Array(n)].map(() => it.next().value).filter((x) => x !== undefined);
        if (batch.length === 0) {
            break;
        }
        yield batch;
    }
}
// Default: consecutive chunks share no tokens.
const CHUNK_OVERLAP = 0;
/**
 * Splits `text` into chunks of at most `maxTokens` tokens, with
 * `chunkOverlap` tokens repeated between consecutive chunks.
 *
 * (Reconstructed: the diff view had interleaved this function's body with
 * `batched` above. The leftover debug `console.log('starting while loop')`
 * has been removed.)
 *
 * @param {string} text - The input text to chunk.
 * @param {object} options
 * @param {number} [options.maxTokens] - Maximum tokens per chunk.
 * @param {number} [options.chunkOverlap] - Tokens shared between adjacent chunks.
 * @param {string} [options.encodingName] - tiktoken encoding name.
 * @param {(chunk: string) => void} [callback] - Optional per-chunk callback.
 * @returns {string[]} All decoded chunks, in order.
 * @throws {Error} If `chunkOverlap >= maxTokens` (the loop could not advance).
 */
function splitText(text, { maxTokens = MAX_CHUNK_LENGTH, chunkOverlap = CHUNK_OVERLAP, encodingName = EMBEDDING_ENCODING, }, callback) {
    if (chunkOverlap >= maxTokens) {
        throw new Error('Cannot have chunkOverlap >= chunkSize');
    }
    const tokenizer = (0, tiktoken_1.get_encoding)(encodingName);
    const input_ids = tokenizer.encode(text);
    const chunkSize = maxTokens;
    let start_idx = 0;
    let cur_idx = Math.min(start_idx + chunkSize, input_ids.length);
    let chunk_ids = input_ids.slice(start_idx, cur_idx);
    const decoder = new TextDecoder();
    const chunks = [];
    while (start_idx < input_ids.length) {
        // Decode the current window, then slide it forward by
        // (chunkSize - chunkOverlap) tokens for the next iteration.
        const chunk = decoder.decode(tokenizer.decode(chunk_ids));
        start_idx += chunkSize - chunkOverlap;
        cur_idx = Math.min(start_idx + chunkSize, input_ids.length);
        chunk_ids = input_ids.slice(start_idx, cur_idx);
        chunks.push(chunk);
        callback && callback(chunk);
    }
    // Release the WASM-side tokenizer resources.
    tokenizer.free();
    return chunks;
}
/**
 * Tokenizes `text` with the named encoding and yields the token ids in
 * batches of at most `chunk_length` (batching is delegated to `batched`).
 *
 * @param {string} text - Input text to tokenize.
 * @param {string} encoding_name - tiktoken encoding name.
 * @param {number} chunk_length - Maximum tokens per yielded batch.
 */
function* chunked_tokens(text, encoding_name, chunk_length) {
    const enc = (0, tiktoken_1.get_encoding)(encoding_name);
    const all_tokens = enc.encode(text);
    yield* batched(all_tokens, chunk_length);
}
/**
 * Streams `text` through `callback` in chunks of at most `maxTokens` tokens.
 * Each token batch from `chunked_tokens` is decoded back to a string before
 * the callback is invoked.
 *
 * NOTE(review): a fresh encoder is constructed per chunk via get_encoding and
 * never `free()`d here — potential WASM-resource leak; confirm against the
 * tiktoken binding's lifecycle requirements.
 *
 * @param {string} text - The input text to chunk.
 * @param {(chunk: string) => void} callback - Invoked once per decoded chunk.
 * @param {object} options
 * @param {number} [options.maxTokens] - Maximum tokens per chunk.
 * @param {string} [options.encoding_name] - tiktoken encoding name.
 * @returns {Promise<void>} Resolves after every chunk has been delivered.
 */
function getChunksByMaxToken(text, callback, { maxTokens = EMBEDDING_CTX_LENGTH, encoding_name = EMBEDDING_ENCODING }) {
return __awaiter(this, void 0, void 0, function* () {
for (const chunk of chunked_tokens(text, encoding_name, maxTokens)) {
const enc = (0, tiktoken_1.get_encoding)(encoding_name);
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
const _chunk = new TextDecoder().decode(enc.decode(chunk));
callback(_chunk);
}
// removing for now but would be cool to add it as a separate function
// if (average) {
// let chunk_embeddings_array = np.array(chunk_embeddings);
// chunk_embeddings_array = np.average(chunk_embeddings_array, (axis = 0), (weights = chunk_lens));
// chunk_embeddings_array = chunk_embeddings_array / np.linalg.norm(chunk_embeddings_array); // normalizes length to 1
// chunk_embeddings_array = chunk_embeddings_array.tolist();
// return chunk_embeddings_array;
// }
});
}
exports.getChunksByMaxToken = getChunksByMaxToken;
exports.splitText = splitText;
// removing for now but would be cool to add it as a separate function
// if (average) {
// let chunk_embeddings_array = np.array(chunk_embeddings);
// chunk_embeddings_array = np.average(chunk_embeddings_array, (axis = 0), (weights = chunk_lens));
// chunk_embeddings_array = chunk_embeddings_array / np.linalg.norm(chunk_embeddings_array); // normalizes length to 1
// chunk_embeddings_array = chunk_embeddings_array.tolist();
// return chunk_embeddings_array;
// }
import { getChunksSimple } from './helpers/getChunksSimple';
import { getChunksByMaxToken } from './helpers/getChunksByMaxToken';
import { splitText } from './helpers/getChunksByMaxToken';
import { getChunksByNewLine } from './helpers/getChunksByNewLine';
import { getChunksByPython } from './helpers/getChunksByPython';
import { getAllFilesFromGithubRepo } from './helpers/github';
// NOTE(review): the two export statements below come from the two sides of
// the 0.2.0 -> 0.2.1 diff (getChunksByMaxToken was replaced by splitText in
// the public surface); a real module keeps only one of them — re-exporting
// the same name twice is a compile error.
export { getChunksSimple, getChunksByMaxToken, getChunksByNewLine, getChunksByPython, getAllFilesFromGithubRepo };
export { getChunksSimple, splitText, getChunksByNewLine, getChunksByPython, getAllFilesFromGithubRepo };
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.getChunksByMaxToken = exports.getChunksSimple = void 0;
exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.splitText = exports.getChunksSimple = void 0;
const getChunksSimple_1 = require("./helpers/getChunksSimple");
Object.defineProperty(exports, "getChunksSimple", { enumerable: true, get: function () { return getChunksSimple_1.getChunksSimple; } });
const getChunksByMaxToken_1 = require("./helpers/getChunksByMaxToken");
Object.defineProperty(exports, "getChunksByMaxToken", { enumerable: true, get: function () { return getChunksByMaxToken_1.getChunksByMaxToken; } });
Object.defineProperty(exports, "splitText", { enumerable: true, get: function () { return getChunksByMaxToken_1.splitText; } });
const getChunksByNewLine_1 = require("./helpers/getChunksByNewLine");

@@ -9,0 +9,0 @@
Object.defineProperty(exports, "getChunksByNewLine", { enumerable: true, get: function () { return getChunksByNewLine_1.getChunksByNewLine; } });

{
"name": "embeddings-splitter",
"version": "0.2.0",
"version": "0.2.1",
"description": "A typescript library to split your long texts into smaller chunks to send them to OpenAI Embeddings API",

@@ -9,6 +9,7 @@
"main": "lib/index.js",

"build": "tsc",
"format": "prettier --write \"src/**/*.(js|ts)\"",
"lint": "eslint src --ext .js,.ts",
"lint:fix": "eslint src --fix --ext .js,.ts",
"test": "jest --config jest.config.js",
"test": "jest --runInBand --config jest.config.js --silent=false",
"test:watch": "jest --config jest.config.js --watch",

@@ -15,0 +16,0 @@ "prepare": "npm run build",