You're Invited: Meet the Socket Team at RSAC and BSidesSF 2026, March 23–26. RSVP
Socket
Book a DemoSign in
Socket

embeddings-splitter

Package Overview
Dependencies
Maintainers
1
Versions
7
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

embeddings-splitter - npm Package Compare versions

Comparing version
0.0.5
to
0.1.0
+5
lib/chunkText.d.ts
/**
 * Splits `text` into sentence-aligned chunks of roughly `maxCharLength`
 * characters (default 1000 ≈ 250 tokens at ~4 chars/token).
 */
export declare function chunkText({ text, // The input text to be split
maxCharLength, }: {
text: string;
maxCharLength?: number;
}): string[];
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.chunkText = void 0;
// A function that splits a text into smaller pieces of roughly equal length
// The pieces are delimited by sentences and try to avoid breaking words or punctuation
// This can be useful for processing long texts with natural language models that have a limited input size
// Splits a long text into sentence-delimited pieces of roughly maxCharLength
// characters, useful for models with a limited input size.
// NOTE(review): when appending a sentence would land the running piece inside
// the accepted size window, that sentence itself is discarded rather than
// appended — possible data loss; confirm this is intended.
function chunkText({ text, // The input text to be split
// Desired maximum piece length in characters; 250 * 4 approximates 250 tokens
// at ~4 chars/token, since there is no good JS tokenizer at the moment.
maxCharLength = 250 * 4, }) {
    const pieces = [];
    let buffer = '';
    // Emit the cleaned-up buffer (leading dots/spaces stripped) when non-empty.
    const flushBuffer = () => {
        const cleaned = buffer.replace(/^[. ]+/, '').trim();
        if (cleaned) {
            pieces.push(cleaned);
        }
        buffer = '';
    };
    // Collapse newlines to spaces and split on periods, keeping each period as
    // its own array entry (capture group in the split regex). This assumes
    // periods end sentences, which may not hold for every language.
    const parts = text.replace(/\n/g, ' ').split(/([.])/);
    // Acceptance window: ±50% around the target length.
    const minSize = maxCharLength * 0.5;
    const maxSize = maxCharLength * 1.5;
    for (const part of parts) {
        const sentence = part.trim();
        if (!sentence) {
            continue;
        }
        const prospectiveLength = buffer.length + sentence.length + 1;
        if (prospectiveLength > maxSize) {
            // Too long: emit what we have and restart from this sentence.
            flushBuffer();
            buffer = sentence;
        }
        else if (prospectiveLength >= minSize && buffer) {
            // Within the window: emit the buffer. The triggering sentence is
            // intentionally not appended (preserved original behavior).
            flushBuffer();
        }
        else {
            // Too short: keep accumulating; no space before a bare period.
            buffer += `${sentence === '.' ? '' : ' '}${sentence}`;
        }
    }
    // Any remainder is emitted verbatim — note: no leading-dot/space cleanup
    // here, matching the original's final push.
    if (buffer) {
        pieces.push(buffer);
    }
    return pieces;
}
exports.chunkText = chunkText;
export {};
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const cross_fetch_1 = __importDefault(require("cross-fetch"));
// SECURITY FIX: the original committed a live GitHub personal access token
// (github_pat_...) to the repository. A committed credential must be treated
// as leaked and revoked; read it from the environment instead.
const githubToken = process.env.GITHUB_TOKEN || '';
// Recursively fetch all file entries of a GitHub repo via the contents API.
// `path` is a GitHub API contents URL; resolves to a flat array of entries.
const getAllFilesFromGithubRepo = (path) => __awaiter(void 0, void 0, void 0, function* () {
    const response = yield (0, cross_fetch_1.default)(path, {
        headers: {
            Authorization: `token ${githubToken}`,
        },
    });
    const data = yield response.json();
    const dataList = [];
    // BUG FIX: the original used data.forEach(async ...), which fired the
    // recursive calls without awaiting them and discarded their results, so
    // sub-directories were never actually flattened into the returned list.
    for (const item of data) {
        if (item.type === 'dir') {
            // Descend into the sub-directory listing and merge its files.
            const nested = yield getAllFilesFromGithubRepo(item._links.self);
            dataList.push(...nested);
        }
        else {
            dataList.push(item);
        }
    }
    return dataList;
});
const main = (repo) => __awaiter(void 0, void 0, void 0, function* () {
    // get all files from the github repo
    const files = yield getAllFilesFromGithubRepo(`https://api.github.com/repos/${repo}/contents/`);
    // TODO: keep only python files (filtering was never implemented)
    console.log(files);
});
// Surface async failures instead of leaving an unhandled rejection.
main('different-ai/embedbase').catch((err) => console.error(err));
// split the files in token chunks of 1000
//
declare function extractFunctions(filename: string): string[];
export { extractFunctions as getChunksByJavascript };
"use strict";
// function extractFunctions(code) {
// const functionRegex = /(?:async\s+)?function\s+(\w+)\s*\(([\w\s,]*)\)\s*{([\S\s]*?)}/g;
// const arrowFunctionRegex = /const\s+(\w+)\s*=\s*\(([\w\s,]*)\)\s*=>\s*{([\S\s]*?)}/g;
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getChunksByJavascript = void 0;
// const functions = [];
// let match;
// while ((match = functionRegex.exec(code))) {
// const [, name, parameters, body] = match;
// functions.push({ name, parameters, body });
// }
// while ((match = arrowFunctionRegex.exec(code))) {
// const [, name, parameters, body] = match;
// functions.push({ name, parameters, body });
// }
// return functions;
// }
const typescript_1 = __importDefault(require("typescript"));
// Collects the names of all function declarations in a TypeScript/JavaScript
// source file by walking its AST with the TypeScript compiler API.
// filename: path to the source file; returns the declared function names.
function extractFunctions(filename) {
    const program = typescript_1.default.createProgram([filename], {});
    const sourceFile = program.getSourceFile(filename);
    // BUG FIX: createProgram yields no source file for unreadable paths; the
    // original passed `undefined` to visit() and crashed deep in forEachChild.
    if (!sourceFile) {
        throw new Error(`Could not load source file: ${filename}`);
    }
    const collectedFunctions = [];
    function visit(node) {
        // BUG FIX: anonymous declarations (e.g. `export default function () {}`)
        // have no `.name`; the original crashed reading `.text` on undefined.
        if (typescript_1.default.isFunctionDeclaration(node) && node.name) {
            collectedFunctions.push(node.name.text);
        }
        typescript_1.default.forEachChild(node, visit);
    }
    visit(sourceFile);
    return collectedFunctions;
}
exports.getChunksByJavascript = extractFunctions;
import { TiktokenEmbedding } from '@dqbd/tiktoken';
/**
 * Splits `text` into chunks of at most `max_tokens` tokens each, tokenized
 * with the tiktoken encoding named by `encoding_name`.
 */
export declare function getChunksByMaxToken(text: string, max_tokens?: number, encoding_name?: TiktokenEmbedding): Promise<any[]>;
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getChunksByMaxToken = void 0;
const tiktoken_1 = require("@dqbd/tiktoken");
const EMBEDDING_CTX_LENGTH = 8191;
const EMBEDDING_ENCODING = 'cl100k_base';
// Batch data into arrays of length n. The last batch may be shorter.
// Throws if n < 1 (would never make progress).
function* batched(iterable, n) {
    if (n < 1) {
        throw new Error('n must be at least one');
    }
    const it = iterable[Symbol.iterator]();
    // BUG FIX: the original drew n values per round with
    // [...Array(n)].map(() => it.next().value) and filtered out `undefined`,
    // which kept calling next() past exhaustion and silently dropped any
    // legitimate `undefined` items in the data. Pull one item at a time and
    // stop exactly when the iterator reports done.
    let batch = [];
    let result = it.next();
    while (!result.done) {
        batch.push(result.value);
        if (batch.length === n) {
            yield batch;
            batch = [];
        }
        result = it.next();
    }
    if (batch.length > 0) {
        yield batch;
    }
}
// Lazily yields the token ids of `text` in arrays of at most `chunk_length`
// tokens, using the tiktoken encoding named by `encoding_name`.
function* chunked_tokens(text, encoding_name, chunk_length) {
    const encoding = (0, tiktoken_1.get_encoding)(encoding_name);
    const tokens = encoding.encode(text);
    // Delegate the windowing to the sibling batched() generator.
    const chunks_iterator = batched(tokens, chunk_length);
    yield* chunks_iterator;
}
// Splits `text` into string chunks of at most `max_tokens` tokens each.
// Tokenization uses the tiktoken encoding named by `encoding_name`
// (default cl100k_base, ctx length 8191).
function getChunksByMaxToken(text, max_tokens = EMBEDDING_CTX_LENGTH, encoding_name = EMBEDDING_ENCODING) {
    return __awaiter(this, void 0, void 0, function* () {
        const chunks = [];
        // PERF FIX: build the encoding and decoder once — the original
        // re-created the tiktoken encoding and a TextDecoder on every loop
        // iteration.
        const enc = (0, tiktoken_1.get_encoding)(encoding_name);
        const decoder = new TextDecoder();
        for (const chunk of chunked_tokens(text, encoding_name, max_tokens)) {
            // eslint-disable-next-line @typescript-eslint/ban-ts-comment
            // @ts-ignore
            chunks.push(decoder.decode(enc.decode(chunk)));
        }
        // Weighted averaging of chunk embeddings (as in the OpenAI cookbook)
        // was removed; it would make a nice separate function.
        return chunks;
    });
}
exports.getChunksByMaxToken = getChunksByMaxToken;
/** Splits `text` into lines, accepting both LF and CRLF line endings. */
export declare function getChunksByNewLine(text: string): string[];
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.getChunksByNewLine = void 0;
// Splits `text` into lines, accepting both Unix (LF) and Windows (CRLF)
// line endings.
function getChunksByNewLine(text) {
    const lineBreak = /\r?\n/;
    return text.split(lineBreak);
}
exports.getChunksByNewLine = getChunksByNewLine;
/** A Python function extracted from source: its full text and its name. */
interface FunctionData {
code: string;
function_name: string;
}
/** Extracts every top-level `def` block from a Python source string. */
declare function getFunctions(code: string): FunctionData[];
export { getFunctions as getChunksByPython };
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.getChunksByPython = void 0;
function get_function_name(code) {
    /**
    Extract the function name from a line that begins with "def ".
    Throws if the line is not a Python function definition.
    **/
    const prefix = 'def ';
    if (!code.startsWith(prefix)) {
        throw new Error("Code does not start with 'def'");
    }
    // The name is everything between "def " and the opening parenthesis.
    const openParen = code.indexOf('(');
    return code.slice(prefix.length, openParen);
}
function get_until_no_space(all_lines, i) {
    /**
    Collect lines starting at index i until the first line that is not part
    of the definition body (a non-empty line that is not indented and does
    not start with ')'), capped at 10000 lines as before.
    **/
    const ret = [all_lines[i]];
    // PERF FIX: the original looped all the way to i + 10000 regardless of
    // array length, spinning through thousands of no-op iterations after the
    // end of the file. Bounding by the array length as well is behaviorally
    // identical and cheaper.
    for (let j = i + 1; j < all_lines.length && j < i + 10000; j++) {
        const line = all_lines[j];
        if (line.length === 0 || line[0] === ' ' || line[0] === '\t' || line[0] === ')') {
            ret.push(line);
        }
        else {
            break;
        }
    }
    return ret.join('\n');
}
function getFunctions(code) {
    /**
    Return every top-level function defined in a Python source string, as
    { code, function_name } records. Throws on empty input.
    **/
    if (code.length === 0) {
        throw new Error('Code is empty');
    }
    const all_lines = code.split('\n');
    const functions = [];
    all_lines.forEach((line, i) => {
        // Only lines that open a definition start a new record.
        if (!line.startsWith('def ')) {
            return;
        }
        functions.push({
            code: get_until_no_space(all_lines, i),
            function_name: get_function_name(line),
        });
    });
    return functions;
}
exports.getChunksByPython = getFunctions;
/**
 * Splits `text` into sentence-aligned chunks of roughly `maxCharLength`
 * characters (default 1000 ≈ 250 tokens at ~4 chars/token).
 */
export declare function getChunksSimple({ text, // The input text to be split
maxCharLength, }: {
text: string;
maxCharLength?: number;
}): string[];
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.getChunksSimple = void 0;
// A function that splits a text into smaller pieces of roughly equal length
// The pieces are delimited by sentences and try to avoid breaking words or punctuation
// This can be useful for processing long texts with natural language models that have a limited input size
// Sentence-based chunker: returns pieces of roughly maxCharLength characters,
// delimited at periods, for models with a limited input size.
// NOTE(review): a sentence whose addition lands the running piece inside the
// accepted size window is dropped, not appended — possible data loss; confirm
// this is intended.
function getChunksSimple({ text, // The input text to be split
// Target piece size in characters (~250 tokens at ~4 chars each, as there is
// no good JS tokenizer at the moment).
maxCharLength = 250 * 4, }) {
    const result = [];
    let working = '';
    // Acceptance window: ±50% around the target size.
    const lowerBound = maxCharLength * 0.5;
    const upperBound = maxCharLength * 1.5;
    // Normalize newlines to spaces, then split on '.', keeping each period as
    // its own element via the capture group. Assumes periods end sentences.
    const segments = text.replace(/\n/g, ' ').split(/([.])/);
    for (const segment of segments) {
        const sentence = segment.trim();
        if (sentence === '') {
            continue;
        }
        const candidateLength = working.length + sentence.length + 1;
        const fitsWindow = candidateLength >= lowerBound && candidateLength <= upperBound;
        if (fitsWindow && working !== '') {
            // Piece is "just right": clean and emit it. The triggering
            // sentence is discarded (preserved original behavior).
            const finished = working.replace(/^[. ]+/, '').trim();
            if (finished !== '') {
                result.push(finished);
            }
            working = '';
        }
        else if (candidateLength > upperBound) {
            // Piece would be too long: emit it and restart from this sentence.
            const finished = working.replace(/^[. ]+/, '').trim();
            if (finished !== '') {
                result.push(finished);
            }
            working = sentence;
        }
        else {
            // Still too short: accumulate (no space before a lone period).
            working += sentence === '.' ? sentence : ` ${sentence}`;
        }
    }
    // Any remainder is emitted verbatim — no leading-dot cleanup here,
    // matching the original's final push.
    if (working !== '') {
        result.push(working);
    }
    return result;
}
exports.getChunksSimple = getChunksSimple;
/** Shape of one entry returned by the GitHub repository-contents API. */
interface GithubFile {
name: string;
path: string;
sha: string;
size: number;
url: string;
html_url: string;
git_url: string;
download_url: string;
type: 'file' | 'dir';
_links: {
self: string;
git: string;
html: string;
};
}
/**
 * Recursively lists every file of a repository directory tree.
 * `url` is a GitHub contents-API URL; `githubToken` a personal access token.
 */
export declare const getAllFilesFromGithubRepo: (url: string, githubToken: string) => Promise<GithubFile[]>;
export {};
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getAllFilesFromGithubRepo = void 0;
const cross_fetch_1 = __importDefault(require("cross-fetch"));
// get all files from a github repo
// Recursively collects every file entry of a GitHub repository directory
// listing, descending into sub-directories.
// url: a GitHub contents-API URL; githubToken: a personal access token.
// Resolves to a flat array of file objects (type === 'file' only).
const getAllFilesFromGithubRepo = (url, githubToken) => __awaiter(void 0, void 0, void 0, function* () {
    if (!url) {
        throw new Error('No url provided');
    }
    if (!githubToken) {
        throw new Error('No github token provided');
    }
    const response = yield (0, cross_fetch_1.default)(url, {
        headers: {
            Authorization: `token ${githubToken}`,
        },
    });
    // ROBUSTNESS FIX: on HTTP errors (403 rate limit, 404, ...) the JSON body
    // is an error object, and iterating it below raised a confusing
    // TypeError. Fail loudly with the status instead.
    if (!response.ok) {
        throw new Error(`GitHub API request failed (${response.status}) for ${url}`);
    }
    const data = yield response.json();
    const dataList = [];
    for (const item of data) {
        if (item.type === 'file') {
            dataList.push(item);
        }
        else if (item.type === 'dir') {
            // Recurse through the exports binding so mocks/overrides apply.
            const subdirFiles = yield (0, exports.getAllFilesFromGithubRepo)(item._links.self, githubToken);
            dataList.push(...subdirFiles);
        }
    }
    return dataList;
});
exports.getAllFilesFromGithubRepo = getAllFilesFromGithubRepo;
import type * as tiktoken from "@dqbd/tiktoken";
import { Document } from "./document.js";
/** Common knobs shared by every splitter. */
interface TextSplitterParams {
chunkSize: number;
chunkOverlap: number;
}
/**
 * Base class for all splitters: subclasses implement `splitText`, and the
 * base provides document plumbing plus greedy chunk merging with overlap.
 */
export declare abstract class TextSplitter implements TextSplitterParams {
chunkSize: number;
chunkOverlap: number;
constructor(fields?: Partial<TextSplitterParams>);
/** Split raw text into chunk strings. */
abstract splitText(text: string): Promise<string[]>;
/** Wrap split chunks of each text into Documents, copying metadata per text. */
createDocuments(texts: string[], metadatas?: Record<string, any>[]): Promise<Document[]>;
/** Re-split existing Documents, preserving their metadata. */
splitDocuments(documents: Document[]): Promise<Document[]>;
private joinDocs;
/** Greedily merge small splits into chunks of at most `chunkSize`, keeping `chunkOverlap` of trailing context. */
mergeSplits(splits: string[], separator: string): string[];
}
export interface CharacterTextSplitterParams extends TextSplitterParams {
separator: string;
}
/** Splits on a single fixed separator (default: blank line). */
export declare class CharacterTextSplitter extends TextSplitter implements CharacterTextSplitterParams {
separator: string;
constructor(fields?: Partial<CharacterTextSplitterParams>);
splitText(text: string): Promise<string[]>;
}
export interface RecursiveCharacterTextSplitterParams extends TextSplitterParams {
separators: string[];
}
/** Tries a list of separators in order, recursing on oversized fragments. */
export declare class RecursiveCharacterTextSplitter extends TextSplitter implements RecursiveCharacterTextSplitterParams {
separators: string[];
constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>);
splitText(text: string): Promise<string[]>;
}
export interface TokenTextSplitterParams extends TextSplitterParams {
encodingName: tiktoken.TiktokenEmbedding;
allowedSpecial: "all" | Array<string>;
disallowedSpecial: "all" | Array<string>;
}
/**
 * Implementation of splitter which looks at tokens.
 */
export declare class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams {
encodingName: tiktoken.TiktokenEmbedding;
allowedSpecial: "all" | Array<string>;
disallowedSpecial: "all" | Array<string>;
private tokenizer;
constructor(fields?: Partial<TokenTextSplitterParams>);
splitText(text: string): Promise<string[]>;
static imports(): Promise<typeof tiktoken>;
}
export {};
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.TokenTextSplitter = exports.RecursiveCharacterTextSplitter = exports.CharacterTextSplitter = exports.TextSplitter = void 0;
const document_js_1 = require("./document.js");
// Base splitter: subclasses provide splitText(); this class provides the
// Document plumbing and greedy merge-with-overlap logic.
class TextSplitter {
    constructor(fields) {
        var _a, _b;
        // Defaults: 1000-char chunks with 200 chars of trailing overlap.
        this.chunkSize = 1000;
        this.chunkOverlap = 200;
        this.chunkSize = (_a = fields === null || fields === void 0 ? void 0 : fields.chunkSize) !== null && _a !== void 0 ? _a : this.chunkSize;
        this.chunkOverlap = (_b = fields === null || fields === void 0 ? void 0 : fields.chunkOverlap) !== null && _b !== void 0 ? _b : this.chunkOverlap;
        // An overlap >= the chunk size could never terminate the merge loop.
        if (this.chunkOverlap >= this.chunkSize) {
            throw new Error("Cannot have chunkOverlap >= chunkSize");
        }
    }
    // Split each text and wrap every chunk in a Document carrying that
    // text's metadata entry (empty objects when no metadata is supplied).
    createDocuments(texts,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    metadatas = []) {
        return __awaiter(this, void 0, void 0, function* () {
            const _metadatas = metadatas.length > 0 ? metadatas : new Array(texts.length).fill({});
            const documents = new Array();
            for (let i = 0; i < texts.length; i += 1) {
                const text = texts[i];
                for (const chunk of yield this.splitText(text)) {
                    documents.push(new document_js_1.Document({ pageContent: chunk, metadata: _metadatas[i] }));
                }
            }
            return documents;
        });
    }
    // Re-split existing Documents, preserving their metadata.
    splitDocuments(documents) {
        return __awaiter(this, void 0, void 0, function* () {
            const texts = documents.map((doc) => doc.pageContent);
            const metadatas = documents.map((doc) => doc.metadata);
            return this.createDocuments(texts, metadatas);
        });
    }
    // Join pieces with the separator; null signals "nothing worth keeping".
    joinDocs(docs, separator) {
        const text = docs.join(separator).trim();
        return text === "" ? null : text;
    }
    // Greedily pack `splits` into chunks of at most chunkSize characters,
    // carrying over up to chunkOverlap characters of trailing context
    // between consecutive chunks.
    mergeSplits(splits, separator) {
        const docs = [];
        const currentDoc = [];
        let total = 0;
        for (const d of splits) {
            const _len = d.length;
            if (total + _len >= this.chunkSize) {
                if (total > this.chunkSize) {
                    // BUG FIX: the original template literal contained a stray
                    // "+" and line break, producing a garbled warning message.
                    console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.chunkSize}`);
                }
                if (currentDoc.length > 0) {
                    const doc = this.joinDocs(currentDoc, separator);
                    if (doc !== null) {
                        docs.push(doc);
                    }
                    // Keep on popping if:
                    // - we have a larger chunk than in the chunk overlap
                    // - or if we still have any chunks and the length is long
                    while (total > this.chunkOverlap ||
                        (total + _len > this.chunkSize && total > 0)) {
                        total -= currentDoc[0].length;
                        currentDoc.shift();
                    }
                }
            }
            currentDoc.push(d);
            total += _len;
        }
        // Flush whatever remains as the final chunk.
        const doc = this.joinDocs(currentDoc, separator);
        if (doc !== null) {
            docs.push(doc);
        }
        return docs;
    }
}
exports.TextSplitter = TextSplitter;
// Splits on a single fixed separator string (default: blank line), then
// re-merges the pieces via the base class's chunk-size/overlap logic.
class CharacterTextSplitter extends TextSplitter {
    constructor(fields) {
        super(fields);
        // Default to paragraph breaks unless the caller supplies a separator.
        const providedSeparator = fields === null || fields === void 0 ? void 0 : fields.separator;
        this.separator = providedSeparator !== null && providedSeparator !== void 0 ? providedSeparator : "\n\n";
    }
    async splitText(text) {
        // An empty separator means "split into individual characters".
        const naiveSplits = this.separator ? text.split(this.separator) : text.split("");
        return this.mergeSplits(naiveSplits, this.separator);
    }
}
exports.CharacterTextSplitter = CharacterTextSplitter;
// Tries a list of separators in order (coarsest first) and recursively
// re-splits any fragment that is still larger than chunkSize.
class RecursiveCharacterTextSplitter extends TextSplitter {
constructor(fields) {
var _a;
super(fields);
// Default ladder: paragraphs -> lines -> words -> characters.
this.separators = ["\n\n", "\n", " ", ""];
this.separators = (_a = fields === null || fields === void 0 ? void 0 : fields.separators) !== null && _a !== void 0 ? _a : this.separators;
}
splitText(text) {
return __awaiter(this, void 0, void 0, function* () {
const finalChunks = [];
// Get appropriate separator to use: the first one present in the text,
// falling back to the last (finest) separator in the list.
let separator = this.separators[this.separators.length - 1];
for (const s of this.separators) {
// "" always matches — it means character-level splitting.
if (s === "") {
separator = s;
break;
}
if (text.includes(s)) {
separator = s;
break;
}
}
// Now that we have the separator, split the text
let splits;
if (separator) {
splits = text.split(separator);
}
else {
// Empty separator: split into individual characters.
splits = text.split("");
}
// Now go merging things, recursively splitting longer texts.
// Small fragments accumulate in goodSplits and are merged together;
// oversized fragments are recursively split with the next separators.
let goodSplits = [];
for (const s of splits) {
if (s.length < this.chunkSize) {
goodSplits.push(s);
}
else {
// Flush accumulated small fragments before recursing, so output
// order matches input order.
if (goodSplits.length) {
const mergedText = this.mergeSplits(goodSplits, separator);
finalChunks.push(...mergedText);
goodSplits = [];
}
const otherInfo = yield this.splitText(s);
finalChunks.push(...otherInfo);
}
}
// Flush any trailing small fragments.
if (goodSplits.length) {
const mergedText = this.mergeSplits(goodSplits, separator);
finalChunks.push(...mergedText);
}
return finalChunks;
});
}
}
exports.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter;
/**
 * Implementation of splitter which looks at tokens: encodes the text with a
 * tiktoken encoding, then emits windows of chunkSize tokens that advance by
 * chunkSize - chunkOverlap tokens each step.
 */
class TokenTextSplitter extends TextSplitter {
constructor(fields) {
var _a, _b, _c;
super(fields);
// Defaults: gpt2 encoding, no special tokens allowed, all disallowed.
this.encodingName = (_a = fields === null || fields === void 0 ? void 0 : fields.encodingName) !== null && _a !== void 0 ? _a : "gpt2";
this.allowedSpecial = (_b = fields === null || fields === void 0 ? void 0 : fields.allowedSpecial) !== null && _b !== void 0 ? _b : [];
this.disallowedSpecial = (_c = fields === null || fields === void 0 ? void 0 : fields.disallowedSpecial) !== null && _c !== void 0 ? _c : "all";
}
splitText(text) {
return __awaiter(this, void 0, void 0, function* () {
// Lazily load and cache the tokenizer on first use.
if (!this.tokenizer) {
const tiktoken = yield TokenTextSplitter.imports();
this.tokenizer = tiktoken.get_encoding(this.encodingName);
}
const splits = [];
const input_ids = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
// Sliding token window: each chunk starts chunkSize - chunkOverlap
// tokens after the previous one, so consecutive chunks share
// chunkOverlap tokens of context.
let start_idx = 0;
let cur_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
let chunk_ids = input_ids.slice(start_idx, cur_idx);
const decoder = new TextDecoder();
while (start_idx < input_ids.length) {
// decode() returns bytes; TextDecoder turns them back into a string.
splits.push(decoder.decode(this.tokenizer.decode(chunk_ids)));
start_idx += this.chunkSize - this.chunkOverlap;
cur_idx = Math.min(start_idx + this.chunkSize, input_ids.length);
chunk_ids = input_ids.slice(start_idx, cur_idx);
}
return splits;
});
}
// Dynamically import @dqbd/tiktoken, with an actionable install hint.
static imports() {
return __awaiter(this, void 0, void 0, function* () {
try {
return yield Promise.resolve().then(() => __importStar(require("@dqbd/tiktoken")));
}
catch (err) {
console.error(err);
throw new Error("Please install @dqbd/tiktoken as a dependency with, e.g. `npm install -S @dqbd/tiktoken`");
}
});
}
}
exports.TokenTextSplitter = TokenTextSplitter;
+6
-12

@@ -1,12 +0,6 @@

export declare function split(prompt: string, maxTokens?: number): string[];
type Batches = {
data: string;
}[];
export declare function index(chunks: string[], embedCallback: (batch: Batches) => void): Promise<void[]>;
export declare const merge: (chunks: string[], maxLen?: number) => Promise<string>;
declare const _default: {
split: typeof split;
index: typeof index;
merge: (chunks: string[], maxLen?: number) => Promise<string>;
};
export default _default;
import { getChunksSimple } from './helpers/getChunksSimple';
import { getChunksByMaxToken } from './helpers/getChunksByMaxToken';
import { getChunksByNewLine } from './helpers/getChunksByNewLine';
import { getChunksByPython } from './helpers/getChunksByPython';
import { getAllFilesFromGithubRepo } from './helpers/github';
export { getChunksSimple, getChunksByMaxToken, getChunksByNewLine, getChunksByPython, getAllFilesFromGithubRepo };
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.merge = exports.index = exports.split = void 0;
const tiktoken_1 = require("@dqbd/tiktoken");
const tokenizer = (0, tiktoken_1.get_encoding)('cl100k_base');
// Splits `text` into chunks of at most roughly `maxTokens` tokens, breaking
// only at sentence boundaries ('. '). Sentences that alone exceed maxTokens
// are skipped entirely (preserved original behavior).
function splitIntoMany(text, maxTokens) {
    // Split the text into sentences
    const sentences = text.split('. ');
    // Token count per sentence (leading space matches how it will be joined)
    const nTokens = sentences.map((sentence) => tokenizer.encode(' ' + sentence).length);
    const chunks = [];
    let tokensSoFar = 0;
    let chunk = [];
    for (let i = 0; i < sentences.length; i++) {
        const sentence = sentences[i];
        const token = nTokens[i];
        // If adding this sentence would exceed the budget, close the current
        // chunk and start a fresh one.
        if (tokensSoFar + token > maxTokens) {
            // BUG FIX: the original pushed unconditionally, emitting a bare
            // "." when the very first sentence already exceeded maxTokens.
            if (chunk.length > 0) {
                chunks.push(chunk.join('. ') + '.');
            }
            chunk = [];
            tokensSoFar = 0;
        }
        // A single sentence longer than the budget cannot fit anywhere: skip.
        if (token > maxTokens) {
            continue;
        }
        chunk.push(sentence);
        tokensSoFar += token + 1;
    }
    // BUG FIX: the original discarded whatever remained in `chunk` after the
    // loop, silently dropping the tail of the input text.
    if (chunk.length > 0) {
        chunks.push(chunk.join('. ') + '.');
    }
    return chunks;
}
// Splits `prompt` into chunks of at most `maxTokens` tokens each.
// Throws when there is nothing to split.
function split(prompt, maxTokens = 500) {
    const isEmpty = !prompt || prompt.length === 0;
    if (isEmpty) {
        throw new Error('Nothing to embeddify');
    }
    return splitIntoMany(prompt, maxTokens);
}
exports.split = split;
// Wraps each chunk as { data } and hands them to `embedCallback` in batches
// of 100, all batches dispatched in parallel.
async function index(chunks, embedCallback) {
    const BATCH_SIZE = 100;
    const batches = [];
    for (let start = 0; start < chunks.length; start += BATCH_SIZE) {
        const batch = chunks.slice(start, start + BATCH_SIZE).map((text) => ({ data: text }));
        batches.push(batch);
    }
    return Promise.all(batches.map((batch) => embedCallback(batch)));
}
exports.index = index;
// should index chunks
// Joins as many leading chunks as fit within a `maxLen` token budget into a
// single context string, separated by '\n\n###\n\n'. Stops at the first
// chunk that would exceed the budget.
const merge = async (chunks, maxLen = 1800) => {
    const selected = [];
    let runningLength = 0;
    for (const chunk of chunks) {
        // +4 per chunk — presumably the separator's token cost; confirm.
        runningLength += tokenizer.encode(chunk).length + 4;
        if (runningLength > maxLen) {
            break;
        }
        selected.push(chunk);
    }
    return selected.join('\n\n###\n\n');
};
exports.merge = merge;
exports.default = { split, index, merge: exports.merge };
exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.getChunksByMaxToken = exports.getChunksSimple = void 0;
const getChunksSimple_1 = require("./helpers/getChunksSimple");
Object.defineProperty(exports, "getChunksSimple", { enumerable: true, get: function () { return getChunksSimple_1.getChunksSimple; } });
const getChunksByMaxToken_1 = require("./helpers/getChunksByMaxToken");
Object.defineProperty(exports, "getChunksByMaxToken", { enumerable: true, get: function () { return getChunksByMaxToken_1.getChunksByMaxToken; } });
const getChunksByNewLine_1 = require("./helpers/getChunksByNewLine");
Object.defineProperty(exports, "getChunksByNewLine", { enumerable: true, get: function () { return getChunksByNewLine_1.getChunksByNewLine; } });
const getChunksByPython_1 = require("./helpers/getChunksByPython");
Object.defineProperty(exports, "getChunksByPython", { enumerable: true, get: function () { return getChunksByPython_1.getChunksByPython; } });
const github_1 = require("./helpers/github");
Object.defineProperty(exports, "getAllFilesFromGithubRepo", { enumerable: true, get: function () { return github_1.getAllFilesFromGithubRepo; } });
{
"name": "embeddings-splitter",
"version": "0.0.5",
"version": "0.1.0",
"description": "A typescript library to split your long texts into smaller chunks to send them to OpenAI Embeddings API",

@@ -22,3 +22,3 @@ "main": "lib/index.js",

"type": "git",
"url": "git+https://github.com/another-ai/embeddings-splitter.git"
"url": "git+https://github.com/different-ai/embeddings-splitter.git"
},

@@ -30,8 +30,8 @@ "keywords": [

],
"author": "another AI",
"author": "Different AI",
"license": "MIT",
"bugs": {
"url": "https://github.com/another-ai/embeddings-splitter/issues"
"url": "https://github.com/different-ai/embeddings-splitter/issues"
},
"homepage": "https://github.com/another-ai/embeddings-splitter#readme",
"homepage": "https://github.com/different-ai/embeddings-splitter#readme",
"devDependencies": {

@@ -52,4 +52,5 @@ "@types/jest": "29.2.4",

"dependencies": {
"@dqbd/tiktoken": "^0.2.1"
"@dqbd/tiktoken": "^0.4.0",
"cross-fetch": "^3.1.5"
}
}
+13
-25

@@ -15,3 +15,3 @@ <p align="center">embeddings-splitter</p>

`split` makes sure your string will are short enough to be embedded
`split` makes sure your strings are short enough to be embedded. (The default split size is 500 tokens, but OpenAI embeddings allow you to go up to 8191.)

@@ -21,30 +21,11 @@ ```js

// chunks to iterate on and send to a server
const chunks = split('somVeryLongText...');
```
### Batch send (experimental)
// example with biggest chunk size
const chunks = split('someVeryLongText', 8191)
```js
import {index} from 'embeddings-splitter';
// now you can send these chunks to be embedded
```
// used to send batches to a server in parallel
index(chunks, (batch) => {
// this example is using Embedbase, but it can be replaced with openai.createEmbeddings
const vaultId = 'youtube video id';
await fetch(url + '/v1/' + 'your api key', {
method: 'POST',
headers: {
Authorization: 'Bearer ' + apiKey,
'Content-Type': 'application/json',
},
body: JSON.stringify({
documents: batch,
}),
});
});
```
### Merge chunks into single string

@@ -58,3 +39,10 @@

const chunks = ['i am a text', 'that needs to be interpreted as one ', 'for a prompt to make sense'];
const merged = merge(chunks);
const context = merge(chunks);
// e.g. of what to do with merged array
const question = 'what is this text about?';
const prompt = `Answer the question based on the context below, and if the question can't be answered based on the context, say "I don't know"\n\nContext: ${context}\n\n---\n\nQuestion: ${question}\nAnswer:`;
createCompletion(prompt)
```

@@ -61,0 +49,0 @@