embeddings-splitter
Advanced tools
| export declare function chunkText({ text, // The input text to be split | ||
| maxCharLength, }: { | ||
| text: string; | ||
| maxCharLength?: number; | ||
| }): string[]; |
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.chunkText = void 0; | ||
| // A function that splits a text into smaller pieces of roughly equal length | ||
| // The pieces are delimited by sentences and try to avoid breaking words or punctuation | ||
| // This can be useful for processing long texts with natural language models that have a limited input size | ||
// Splits `text` into chunks of roughly `maxCharLength` characters, cutting at
// sentence boundaries (periods). Useful for feeding long documents to models
// with a limited input size.
// NOTE: sentence detection is a naive "." split, so abbreviations, decimals
// and languages without periods are not handled.
function chunkText({ text, // The input text to be split
// The desired maximum length of each piece in characters.
// This uses 4 characters as an approximation of the average token length
// since there isn't a good JS tokenizer at the moment.
maxCharLength = 250 * 4, }) {
    const chunks = [];
    let currentChunk = '';
    // Strip newlines and split on periods, keeping the "." tokens so they can
    // be re-attached to their sentence.
    const sentences = text.replace(/\n/g, ' ').split(/([.])/);
    // Accept a chunk once it lands within +/-50% of the target length.
    const lowerBound = maxCharLength * 0.5;
    const upperBound = maxCharLength * 1.5;
    // Drop leading periods/spaces and surrounding whitespace.
    const clean = (chunk) => chunk.replace(/^[. ]+/, '').trim();
    for (const sentence of sentences) {
        const trimmedSentence = sentence.trim();
        if (!trimmedSentence)
            continue;
        const chunkLength = currentChunk.length + trimmedSentence.length + 1;
        if (chunkLength > upperBound) {
            // Adding the sentence would overshoot: flush the current chunk and
            // start a new one with this sentence.
            const cleaned = clean(currentChunk);
            if (cleaned)
                chunks.push(cleaned);
            currentChunk = trimmedSentence;
        }
        else {
            // Append the sentence (no space before a bare period).
            // BUG FIX: previously, when the chunk reached the tolerance range,
            // it was pushed WITHOUT the sentence that completed it, so that
            // sentence was silently dropped from the output.
            currentChunk += `${trimmedSentence === '.' ? '' : ' '}${trimmedSentence}`;
            if (chunkLength >= lowerBound) {
                // The chunk is within tolerance: flush it.
                const cleaned = clean(currentChunk);
                if (cleaned)
                    chunks.push(cleaned);
                currentChunk = '';
            }
        }
    }
    // Flush any remainder.
    // BUG FIX: the trailing chunk is now cleaned like every other chunk
    // (previously it could keep a leading space/period).
    const rest = clean(currentChunk);
    if (rest) {
        chunks.push(rest);
    }
    return chunks;
}
| exports.chunkText = chunkText; |
| export {}; |
| "use strict"; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| var __importDefault = (this && this.__importDefault) || function (mod) { | ||
| return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| const cross_fetch_1 = __importDefault(require("cross-fetch")); | ||
// SECURITY: a GitHub personal access token was previously hard-coded and
// committed here — that token must be considered leaked and revoked
// immediately. Read the token from the environment instead.
const githubToken = process.env.GITHUB_TOKEN;
// Recursively fetch the metadata of every file in a GitHub repo directory.
const getAllFilesFromGithubRepo = async (path) => {
    const response = await (0, cross_fetch_1.default)(path, {
        headers: {
            Authorization: `token ${githubToken}`,
        },
    });
    if (!response.ok) {
        throw new Error(`GitHub API request failed (${response.status}): ${path}`);
    }
    const data = await response.json();
    const dataList = [];
    // BUG FIX: the previous version used an async forEach callback, whose
    // awaited recursive results were discarded, and then ran Promise.all over
    // an array of plain objects. Iterate sequentially and flatten instead.
    for (const item of data) {
        if (item.type === 'dir') {
            dataList.push(...(await getAllFilesFromGithubRepo(item._links.self)));
        }
        else {
            dataList.push(item);
        }
    }
    return dataList;
};
const main = async (repo) => {
    if (!githubToken) {
        throw new Error('GITHUB_TOKEN environment variable is not set');
    }
    // get all files from the github repo
    const files = await getAllFilesFromGithubRepo(`https://api.github.com/repos/${repo}/contents/`);
    // keep only python files
    console.log(files);
};
// BUG FIX: the promise was previously left floating with no rejection handler.
main('different-ai/embedbase').catch(console.error);
// split the files in token chunks of 1000
//
| declare function extractFunctions(filename: string): string[]; | ||
| export { extractFunctions as getChunksByJavascript }; |
| "use strict"; | ||
| // function extractFunctions(code) { | ||
| // const functionRegex = /(?:async\s+)?function\s+(\w+)\s*\(([\w\s,]*)\)\s*{([\S\s]*?)}/g; | ||
| // const arrowFunctionRegex = /const\s+(\w+)\s*=\s*\(([\w\s,]*)\)\s*=>\s*{([\S\s]*?)}/g; | ||
| var __importDefault = (this && this.__importDefault) || function (mod) { | ||
| return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByJavascript = void 0; | ||
| // const functions = []; | ||
| // let match; | ||
| // while ((match = functionRegex.exec(code))) { | ||
| // const [, name, parameters, body] = match; | ||
| // functions.push({ name, parameters, body }); | ||
| // } | ||
| // while ((match = arrowFunctionRegex.exec(code))) { | ||
| // const [, name, parameters, body] = match; | ||
| // functions.push({ name, parameters, body }); | ||
| // } | ||
| // return functions; | ||
| // } | ||
| const typescript_1 = __importDefault(require("typescript")); | ||
// Collect the names of all named function declarations in a
// TypeScript/JavaScript file, walking the entire AST.
// Returns an array of function names; throws if the file cannot be loaded.
function extractFunctions(filename) {
    const program = typescript_1.default.createProgram([filename], {});
    const sourceFile = program.getSourceFile(filename);
    // BUG FIX: a missing/unreadable file used to produce an opaque crash
    // inside forEachChild (sourceFile undefined); fail with a clear message.
    if (!sourceFile) {
        throw new Error(`Could not load source file: ${filename}`);
    }
    const collectedFunctions = [];
    function visit(node) {
        // BUG FIX: anonymous declarations (e.g. `export default function () {}`)
        // have no `name` node — the old code crashed dereferencing it.
        if (typescript_1.default.isFunctionDeclaration(node) && node.name) {
            collectedFunctions.push(node.name.text);
        }
        typescript_1.default.forEachChild(node, visit);
    }
    visit(sourceFile);
    return collectedFunctions;
}
| exports.getChunksByJavascript = extractFunctions; |
| import { TiktokenEmbedding } from '@dqbd/tiktoken'; | ||
| export declare function getChunksByMaxToken(text: string, max_tokens?: number, encoding_name?: TiktokenEmbedding): Promise<any[]>; |
| "use strict"; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByMaxToken = void 0; | ||
| const tiktoken_1 = require("@dqbd/tiktoken"); | ||
| const EMBEDDING_CTX_LENGTH = 8191; | ||
| const EMBEDDING_ENCODING = 'cl100k_base'; | ||
function* batched(iterable, n) {
    /* Batch data into arrays of length n. The last batch may be shorter. */
    if (n < 1) {
        throw new Error('n must be at least one');
    }
    const it = iterable[Symbol.iterator]();
    while (true) {
        // BUG FIX: the previous implementation took `it.next().value` blindly
        // and filtered out `undefined`, which both dropped legitimate
        // `undefined` elements and conflated them with iterator exhaustion.
        // Check the iterator's `done` flag explicitly instead.
        const batch = [];
        for (let i = 0; i < n; i++) {
            const result = it.next();
            if (result.done) {
                break;
            }
            batch.push(result.value);
        }
        if (batch.length === 0) {
            return;
        }
        yield batch;
    }
}
function* chunked_tokens(text, encoding_name, chunk_length) {
    // Tokenize the whole text once with the requested tiktoken encoding,
    // then hand out batches of at most `chunk_length` token ids.
    const tokenizer = (0, tiktoken_1.get_encoding)(encoding_name);
    yield* batched(tokenizer.encode(text), chunk_length);
}
// Split `text` into decoded string chunks of at most `max_tokens` tokens,
// using the given tiktoken encoding (cl100k_base by default).
// Returns a promise of the array of chunk strings.
async function getChunksByMaxToken(text, max_tokens = EMBEDDING_CTX_LENGTH, encoding_name = EMBEDDING_ENCODING) {
    const chunks = [];
    // PERF FIX: the encoding and TextDecoder were previously re-created on
    // every loop iteration; build them once up front.
    const enc = (0, tiktoken_1.get_encoding)(encoding_name);
    const decoder = new TextDecoder();
    for (const chunk of chunked_tokens(text, encoding_name, max_tokens)) {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore
        chunks.push(decoder.decode(enc.decode(chunk)));
    }
    // A weighted-average-of-chunk-embeddings mode (as in the OpenAI cookbook)
    // was removed for now; it would make a good separate function.
    return chunks;
}
| exports.getChunksByMaxToken = getChunksByMaxToken; |
| export declare function getChunksByNewLine(text: string): string[]; |
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByNewLine = void 0; | ||
// Split `text` into chunks on line breaks, handling both LF and CRLF endings.
function getChunksByNewLine(text) {
    const lineBreak = /\r?\n/;
    return text.split(lineBreak);
}
| exports.getChunksByNewLine = getChunksByNewLine; |
| interface FunctionData { | ||
| code: string; | ||
| function_name: string; | ||
| } | ||
| declare function getFunctions(code: string): FunctionData[]; | ||
| export { getFunctions as getChunksByPython }; |
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByPython = void 0; | ||
function get_function_name(code) {
    /**
    Extract the function name from a line beginning with "def ".
    Throws if the line does not start with "def " or has no "(".
    **/
    if (!code.startsWith('def '))
        throw new Error("Code does not start with 'def'");
    const parenIndex = code.indexOf('(');
    // BUG FIX: without this guard, indexOf() returning -1 made
    // slice('def '.length, -1) silently produce a truncated, wrong name.
    if (parenIndex === -1)
        throw new Error("Function definition has no '('");
    return code.slice('def '.length, parenIndex);
}
function get_until_no_space(all_lines, i) {
    /**
    Collect line i plus every following line that is blank, indented,
    or starts with ')': i.e. lines still inside the function definition.
    Stops at the first line that leaves the definition (capped at 10000 lines).
    **/
    const collected = [all_lines[i]];
    const limit = Math.min(all_lines.length, i + 10000);
    for (let j = i + 1; j < limit; j++) {
        const line = all_lines[j];
        const insideDefinition = line.length === 0 || line[0] === ' ' || line[0] === '\t' || line[0] === ')';
        if (!insideDefinition) {
            break;
        }
        collected.push(line);
    }
    return collected.join('\n');
}
function getFunctions(code) {
    /**
    Scan a Python source string and return { code, function_name } for
    every top-level function definition. Throws on empty input.
    **/
    if (code.length === 0)
        throw new Error('Code is empty');
    const all_lines = code.split('\n');
    const found = [];
    all_lines.forEach((line, idx) => {
        if (!line.startsWith('def '))
            return;
        found.push({
            code: get_until_no_space(all_lines, idx),
            function_name: get_function_name(line),
        });
    });
    return found;
}
| exports.getChunksByPython = getFunctions; |
| export declare function getChunksSimple({ text, // The input text to be split | ||
| maxCharLength, }: { | ||
| text: string; | ||
| maxCharLength?: number; | ||
| }): string[]; |
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksSimple = void 0; | ||
| // A function that splits a text into smaller pieces of roughly equal length | ||
| // The pieces are delimited by sentences and try to avoid breaking words or punctuation | ||
| // This can be useful for processing long texts with natural language models that have a limited input size | ||
// Splits `text` into chunks of roughly `maxCharLength` characters, cutting at
// sentence boundaries (periods). Useful for feeding long documents to models
// with a limited input size.
// NOTE: sentence detection is a naive "." split, so abbreviations, decimals
// and languages without periods are not handled.
function getChunksSimple({ text, // The input text to be split
// The desired maximum length of each piece in characters.
// This uses 4 characters as an approximation of the average token length
// since there isn't a good JS tokenizer at the moment.
maxCharLength = 250 * 4, }) {
    const chunks = [];
    let currentChunk = '';
    // Strip newlines and split on periods, keeping the "." tokens so they can
    // be re-attached to their sentence.
    const sentences = text.replace(/\n/g, ' ').split(/([.])/);
    // Accept a chunk once it lands within +/-50% of the target length.
    const lowerBound = maxCharLength * 0.5;
    const upperBound = maxCharLength * 1.5;
    // Drop leading periods/spaces and surrounding whitespace.
    const clean = (chunk) => chunk.replace(/^[. ]+/, '').trim();
    for (const sentence of sentences) {
        const trimmedSentence = sentence.trim();
        if (!trimmedSentence)
            continue;
        const chunkLength = currentChunk.length + trimmedSentence.length + 1;
        if (chunkLength > upperBound) {
            // Adding the sentence would overshoot: flush the current chunk and
            // start a new one with this sentence.
            const cleaned = clean(currentChunk);
            if (cleaned)
                chunks.push(cleaned);
            currentChunk = trimmedSentence;
        }
        else {
            // Append the sentence (no space before a bare period).
            // BUG FIX: previously, when the chunk reached the tolerance range,
            // it was pushed WITHOUT the sentence that completed it, so that
            // sentence was silently dropped from the output.
            currentChunk += `${trimmedSentence === '.' ? '' : ' '}${trimmedSentence}`;
            if (chunkLength >= lowerBound) {
                // The chunk is within tolerance: flush it.
                const cleaned = clean(currentChunk);
                if (cleaned)
                    chunks.push(cleaned);
                currentChunk = '';
            }
        }
    }
    // Flush any remainder.
    // BUG FIX: the trailing chunk is now cleaned like every other chunk
    // (previously it could keep a leading space/period).
    const rest = clean(currentChunk);
    if (rest) {
        chunks.push(rest);
    }
    return chunks;
}
| exports.getChunksSimple = getChunksSimple; |
| interface GithubFile { | ||
| name: string; | ||
| path: string; | ||
| sha: string; | ||
| size: number; | ||
| url: string; | ||
| html_url: string; | ||
| git_url: string; | ||
| download_url: string; | ||
| type: 'file' | 'dir'; | ||
| _links: { | ||
| self: string; | ||
| git: string; | ||
| html: string; | ||
| }; | ||
| } | ||
| export declare const getAllFilesFromGithubRepo: (url: string, githubToken: string) => Promise<GithubFile[]>; | ||
| export {}; |
| "use strict"; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| var __importDefault = (this && this.__importDefault) || function (mod) { | ||
| return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getAllFilesFromGithubRepo = void 0; | ||
| const cross_fetch_1 = __importDefault(require("cross-fetch")); | ||
// get all files from a GitHub repo
// Recursively fetch the metadata of every file in a GitHub repo via the
// contents API, flattening subdirectory results into a single array.
// Throws on missing arguments or a non-2xx API response.
const getAllFilesFromGithubRepo = async (url, githubToken) => {
    if (!url) {
        throw new Error('No url provided');
    }
    if (!githubToken) {
        throw new Error('No github token provided');
    }
    const response = await (0, cross_fetch_1.default)(url, {
        headers: {
            Authorization: `token ${githubToken}`,
        },
    });
    // ROBUSTNESS FIX: a non-2xx response (bad token, missing repo, rate
    // limit) returns a JSON object instead of an array, which previously
    // caused an opaque iteration error. Fail with a clear message instead.
    if (!response.ok) {
        throw new Error(`GitHub API request failed (${response.status}): ${url}`);
    }
    const data = await response.json();
    const dataList = [];
    for (const item of data) {
        if (item.type === 'file') {
            dataList.push(item);
        }
        else if (item.type === 'dir') {
            // Recurse into subdirectories and flatten their files.
            const subdirFiles = await getAllFilesFromGithubRepo(item._links.self, githubToken);
            dataList.push(...subdirFiles);
        }
    }
    return dataList;
};
| exports.getAllFilesFromGithubRepo = getAllFilesFromGithubRepo; |
| import type * as tiktoken from "@dqbd/tiktoken"; | ||
| import { Document } from "./document.js"; | ||
| interface TextSplitterParams { | ||
| chunkSize: number; | ||
| chunkOverlap: number; | ||
| } | ||
| export declare abstract class TextSplitter implements TextSplitterParams { | ||
| chunkSize: number; | ||
| chunkOverlap: number; | ||
| constructor(fields?: Partial<TextSplitterParams>); | ||
| abstract splitText(text: string): Promise<string[]>; | ||
| createDocuments(texts: string[], metadatas?: Record<string, any>[]): Promise<Document[]>; | ||
| splitDocuments(documents: Document[]): Promise<Document[]>; | ||
| private joinDocs; | ||
| mergeSplits(splits: string[], separator: string): string[]; | ||
| } | ||
| export interface CharacterTextSplitterParams extends TextSplitterParams { | ||
| separator: string; | ||
| } | ||
| export declare class CharacterTextSplitter extends TextSplitter implements CharacterTextSplitterParams { | ||
| separator: string; | ||
| constructor(fields?: Partial<CharacterTextSplitterParams>); | ||
| splitText(text: string): Promise<string[]>; | ||
| } | ||
| export interface RecursiveCharacterTextSplitterParams extends TextSplitterParams { | ||
| separators: string[]; | ||
| } | ||
| export declare class RecursiveCharacterTextSplitter extends TextSplitter implements RecursiveCharacterTextSplitterParams { | ||
| separators: string[]; | ||
| constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>); | ||
| splitText(text: string): Promise<string[]>; | ||
| } | ||
| export interface TokenTextSplitterParams extends TextSplitterParams { | ||
| encodingName: tiktoken.TiktokenEmbedding; | ||
| allowedSpecial: "all" | Array<string>; | ||
| disallowedSpecial: "all" | Array<string>; | ||
| } | ||
| /** | ||
| * Implementation of splitter which looks at tokens. | ||
| */ | ||
| export declare class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams { | ||
| encodingName: tiktoken.TiktokenEmbedding; | ||
| allowedSpecial: "all" | Array<string>; | ||
| disallowedSpecial: "all" | Array<string>; | ||
| private tokenizer; | ||
| constructor(fields?: Partial<TokenTextSplitterParams>); | ||
| splitText(text: string): Promise<string[]>; | ||
| static imports(): Promise<typeof tiktoken>; | ||
| } | ||
| export {}; |
| "use strict"; | ||
| var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
| if (k2 === undefined) k2 = k; | ||
| var desc = Object.getOwnPropertyDescriptor(m, k); | ||
| if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
| desc = { enumerable: true, get: function() { return m[k]; } }; | ||
| } | ||
| Object.defineProperty(o, k2, desc); | ||
| }) : (function(o, m, k, k2) { | ||
| if (k2 === undefined) k2 = k; | ||
| o[k2] = m[k]; | ||
| })); | ||
| var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
| Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
| }) : function(o, v) { | ||
| o["default"] = v; | ||
| }); | ||
| var __importStar = (this && this.__importStar) || function (mod) { | ||
| if (mod && mod.__esModule) return mod; | ||
| var result = {}; | ||
| if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
| __setModuleDefault(result, mod); | ||
| return result; | ||
| }; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.TokenTextSplitter = exports.RecursiveCharacterTextSplitter = exports.CharacterTextSplitter = exports.TextSplitter = void 0; | ||
| const document_js_1 = require("./document.js"); | ||
/**
 * Base text splitter: holds the chunk sizing configuration and the shared
 * logic for merging raw splits into (optionally overlapping) chunks.
 * Subclasses implement splitText().
 */
class TextSplitter {
    constructor(fields) {
        var _a, _b;
        // Defaults: 1000-char chunks overlapping by 200 chars.
        this.chunkSize = 1000;
        this.chunkOverlap = 200;
        this.chunkSize = (_a = fields === null || fields === void 0 ? void 0 : fields.chunkSize) !== null && _a !== void 0 ? _a : this.chunkSize;
        this.chunkOverlap = (_b = fields === null || fields === void 0 ? void 0 : fields.chunkOverlap) !== null && _b !== void 0 ? _b : this.chunkOverlap;
        if (this.chunkOverlap >= this.chunkSize) {
            throw new Error("Cannot have chunkOverlap >= chunkSize");
        }
    }
    /**
     * Split each text and wrap every chunk in a Document, pairing texts with
     * their metadata by index (empty metadata when none is given).
     */
    createDocuments(texts,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    metadatas = []) {
        return __awaiter(this, void 0, void 0, function* () {
            const _metadatas = metadatas.length > 0 ? metadatas : new Array(texts.length).fill({});
            const documents = new Array();
            for (let i = 0; i < texts.length; i += 1) {
                const text = texts[i];
                for (const chunk of yield this.splitText(text)) {
                    documents.push(new document_js_1.Document({ pageContent: chunk, metadata: _metadatas[i] }));
                }
            }
            return documents;
        });
    }
    /** Re-split existing Documents, preserving each document's metadata. */
    splitDocuments(documents) {
        return __awaiter(this, void 0, void 0, function* () {
            const texts = documents.map((doc) => doc.pageContent);
            const metadatas = documents.map((doc) => doc.metadata);
            return this.createDocuments(texts, metadatas);
        });
    }
    /** Join pieces with the separator; null when the joined text is empty. */
    joinDocs(docs, separator) {
        const text = docs.join(separator).trim();
        return text === "" ? null : text;
    }
    /**
     * Greedily pack `splits` into chunks of at most ~chunkSize characters,
     * carrying roughly chunkOverlap characters over between chunks.
     * NOTE: separator length is not counted toward the running total.
     */
    mergeSplits(splits, separator) {
        const docs = [];
        const currentDoc = [];
        let total = 0;
        for (const d of splits) {
            const _len = d.length;
            if (total + _len >= this.chunkSize) {
                if (total > this.chunkSize) {
                    // BUG FIX: the warning template previously contained a
                    // stray "+" and a raw line break in the emitted message.
                    console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.chunkSize}`);
                }
                if (currentDoc.length > 0) {
                    const doc = this.joinDocs(currentDoc, separator);
                    if (doc !== null) {
                        docs.push(doc);
                    }
                    // Keep on popping if:
                    // - we have a larger chunk than in the chunk overlap
                    // - or if we still have any chunks and the length is long
                    while (total > this.chunkOverlap ||
                        (total + _len > this.chunkSize && total > 0)) {
                        total -= currentDoc[0].length;
                        currentDoc.shift();
                    }
                }
            }
            currentDoc.push(d);
            total += _len;
        }
        const doc = this.joinDocs(currentDoc, separator);
        if (doc !== null) {
            docs.push(doc);
        }
        return docs;
    }
}
| exports.TextSplitter = TextSplitter; | ||
/**
 * Splits text on a single fixed separator ("\n\n" by default), then merges
 * the pieces back into sized chunks via the base class.
 */
class CharacterTextSplitter extends TextSplitter {
    constructor(fields) {
        super(fields);
        // Paragraph break by default; any explicit separator supplied via
        // `fields` (including the empty string) takes precedence.
        this.separator = fields?.separator ?? "\n\n";
    }
    async splitText(text) {
        // First pass: naive split on the separator, or into single characters
        // when the separator is empty. Then merge into sized chunks.
        const rawSplits = this.separator ? text.split(this.separator) : text.split("");
        return this.mergeSplits(rawSplits, this.separator);
    }
}
| exports.CharacterTextSplitter = CharacterTextSplitter; | ||
/**
 * Tries a list of separators in order (coarsest first) and recursively
 * re-splits any piece that is still longer than chunkSize.
 */
class RecursiveCharacterTextSplitter extends TextSplitter {
    constructor(fields) {
        super(fields);
        // From coarsest (paragraph) to finest (single characters).
        this.separators = fields?.separators ?? ["\n\n", "\n", " ", ""];
    }
    async splitText(text) {
        // Pick the first separator that appears in the text (the empty
        // separator always matches); fall back to the last one.
        const chosen = this.separators.find((s) => s === "" || text.includes(s)) ?? this.separators[this.separators.length - 1];
        const rawSplits = chosen ? text.split(chosen) : text.split("");
        const finalChunks = [];
        let pending = [];
        // Merge the accumulated small pieces into chunks and clear the buffer.
        const flush = () => {
            if (pending.length) {
                finalChunks.push(...this.mergeSplits(pending, chosen));
                pending = [];
            }
        };
        for (const piece of rawSplits) {
            if (piece.length < this.chunkSize) {
                pending.push(piece);
            }
            else {
                // Piece is still too large: flush what we have, then recurse
                // so a finer separator can break it down.
                flush();
                finalChunks.push(...(await this.splitText(piece)));
            }
        }
        flush();
        return finalChunks;
    }
}
| exports.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter; | ||
| /** | ||
| * Implementation of splitter which looks at tokens. | ||
| */ | ||
/**
 * Implementation of splitter which looks at tokens, measuring chunk size in
 * @dqbd/tiktoken tokens rather than characters.
 */
class TokenTextSplitter extends TextSplitter {
    constructor(fields) {
        super(fields);
        // Encoding defaults to GPT-2's BPE; special-token handling mirrors
        // tiktoken's API ("all" or an explicit list).
        this.encodingName = fields?.encodingName ?? "gpt2";
        this.allowedSpecial = fields?.allowedSpecial ?? [];
        this.disallowedSpecial = fields?.disallowedSpecial ?? "all";
    }
    async splitText(text) {
        // Lazily build the tokenizer on first use.
        if (!this.tokenizer) {
            const tiktoken = await TokenTextSplitter.imports();
            this.tokenizer = tiktoken.get_encoding(this.encodingName);
        }
        const tokenIds = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
        const decoder = new TextDecoder();
        const pieces = [];
        // Slide a chunkSize-token window forward, overlapping consecutive
        // windows by chunkOverlap tokens.
        const stride = this.chunkSize - this.chunkOverlap;
        for (let start = 0; start < tokenIds.length; start += stride) {
            const end = Math.min(start + this.chunkSize, tokenIds.length);
            pieces.push(decoder.decode(this.tokenizer.decode(tokenIds.slice(start, end))));
        }
        return pieces;
    }
    static imports() {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                // Dynamic require keeps tiktoken an optional dependency.
                return yield Promise.resolve().then(() => __importStar(require("@dqbd/tiktoken")));
            }
            catch (err) {
                console.error(err);
                throw new Error("Please install @dqbd/tiktoken as a dependency with, e.g. `npm install -S @dqbd/tiktoken`");
            }
        });
    }
}
| exports.TokenTextSplitter = TokenTextSplitter; |
+6
-12
@@ -1,12 +0,6 @@ | ||
| export declare function split(prompt: string, maxTokens?: number): string[]; | ||
| type Batches = { | ||
| data: string; | ||
| }[]; | ||
| export declare function index(chunks: string[], embedCallback: (batch: Batches) => void): Promise<void[]>; | ||
| export declare const merge: (chunks: string[], maxLen?: number) => Promise<string>; | ||
| declare const _default: { | ||
| split: typeof split; | ||
| index: typeof index; | ||
| merge: (chunks: string[], maxLen?: number) => Promise<string>; | ||
| }; | ||
| export default _default; | ||
| import { getChunksSimple } from './helpers/getChunksSimple'; | ||
| import { getChunksByMaxToken } from './helpers/getChunksByMaxToken'; | ||
| import { getChunksByNewLine } from './helpers/getChunksByNewLine'; | ||
| import { getChunksByPython } from './helpers/getChunksByPython'; | ||
| import { getAllFilesFromGithubRepo } from './helpers/github'; | ||
| export { getChunksSimple, getChunksByMaxToken, getChunksByNewLine, getChunksByPython, getAllFilesFromGithubRepo }; |
+11
-78
| "use strict"; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.merge = exports.index = exports.split = void 0; | ||
| const tiktoken_1 = require("@dqbd/tiktoken"); | ||
| const tokenizer = (0, tiktoken_1.get_encoding)('cl100k_base'); | ||
/**
 * Splits `text` into chunks of at most `maxTokens` tokens each.
 * Sentences (delimited by '. ') are kept whole; a single sentence that is by
 * itself longer than `maxTokens` can never fit and is skipped entirely.
 *
 * @param {string} text - The text to split.
 * @param {number} maxTokens - Maximum token budget per chunk.
 * @returns {string[]} The chunks, each re-terminated with a period.
 */
function splitIntoMany(text, maxTokens) {
    // Split the text into sentences; '. ' is assumed to end a sentence,
    // which may not hold for all languages or abbreviations.
    const sentences = text.split('. ');
    // Token count per sentence. The leading space mirrors how a sentence is
    // tokenized when it appears mid-text rather than at position 0.
    const nTokens = sentences.map((sentence) => tokenizer.encode(' ' + sentence).length);
    const chunks = [];
    let tokensSoFar = 0;
    let chunk = [];
    for (let i = 0; i < sentences.length; i++) {
        const sentence = sentences[i];
        const token = nTokens[i];
        // A sentence longer than the whole budget can never fit in any chunk.
        // BUG FIX: check this BEFORE flushing — the original flushed first,
        // which could emit an empty chunk ('.') or a prematurely short one.
        if (token > maxTokens) {
            continue;
        }
        // Flush the current chunk when this sentence would push it over budget.
        if (chunk.length > 0 && tokensSoFar + token > maxTokens) {
            chunks.push(chunk.join('. ') + '.');
            chunk = [];
            tokensSoFar = 0;
        }
        chunk.push(sentence);
        // +1 approximates the cost of the '. ' separator restored on join.
        tokensSoFar += token + 1;
    }
    // BUG FIX: the original dropped whatever remained in `chunk` after the
    // loop, silently losing the tail of the input. Flush the trailing chunk.
    if (chunk.length > 0) {
        chunks.push(chunk.join('. ') + '.');
    }
    return chunks;
}
/**
 * Splits a long prompt into embedding-sized chunks of at most `maxTokens`
 * tokens each. Throws when there is nothing to split.
 *
 * @param {string} prompt - The text to chunk.
 * @param {number} [maxTokens=500] - Token budget per chunk.
 * @returns {string[]} The resulting chunks.
 */
function split(prompt, maxTokens = 500) {
    const nothingToEmbed = !prompt || prompt.length === 0;
    if (nothingToEmbed) {
        throw new Error('Nothing to embeddify');
    }
    return splitIntoMany(prompt, maxTokens);
}
| exports.split = split; | ||
/**
 * Sends `chunks` to the caller-supplied embedding callback in batches.
 * Each batch holds up to 100 chunks, each wrapped as `{ data: text }`.
 * All batches are dispatched in parallel via Promise.all.
 *
 * @param {string[]} chunks - The text chunks to embed.
 * @param {(batch: {data: string}[]) => void} embedCallback - Called once per batch.
 * @returns {Promise<void[]>} Resolves when every batch callback has settled.
 */
function index(chunks, embedCallback) {
    return __awaiter(this, void 0, void 0, function* () {
        const batches = [];
        // Group chunks into batches of at most 100 items.
        for (let i = 0; i < chunks.length; i += 100) {
            batches.push(chunks.slice(i, i + 100).map((text) => ({ data: text })));
        }
        return yield Promise.all(batches.map((batch) => embedCallback(batch)));
    });
}
exports.index = index;
/**
 * Joins as many chunks as fit within `maxLen` tokens into one context string,
 * separated by '\n\n###\n\n'. Chunks past the budget are dropped, not truncated.
 *
 * @param {string[]} chunks - Candidate context chunks, in priority order.
 * @param {number} [maxLen=1800] - Token budget for the merged context.
 * @returns {Promise<string>} The merged context string.
 */
const merge = (chunks, maxLen = 1800) => __awaiter(void 0, void 0, void 0, function* () {
    let curLen = 0;
    const context = [];
    for (const chunk of chunks) {
        const nTokens = tokenizer.encode(chunk).length;
        // +4 approximates the token cost of the '\n\n###\n\n' separator.
        curLen += nTokens + 4;
        // Stop at the first chunk that would exceed the budget.
        if (curLen > maxLen) {
            break;
        }
        context.push(chunk);
    }
    return context.join('\n\n###\n\n');
});
exports.merge = merge;
exports.default = { split, index, merge: exports.merge };
// Re-export the chunking helpers so consumers can import them from the package root.
exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.getChunksByMaxToken = exports.getChunksSimple = void 0;
const getChunksSimple_1 = require("./helpers/getChunksSimple");
Object.defineProperty(exports, "getChunksSimple", { enumerable: true, get: function () { return getChunksSimple_1.getChunksSimple; } });
const getChunksByMaxToken_1 = require("./helpers/getChunksByMaxToken");
Object.defineProperty(exports, "getChunksByMaxToken", { enumerable: true, get: function () { return getChunksByMaxToken_1.getChunksByMaxToken; } });
const getChunksByNewLine_1 = require("./helpers/getChunksByNewLine");
Object.defineProperty(exports, "getChunksByNewLine", { enumerable: true, get: function () { return getChunksByNewLine_1.getChunksByNewLine; } });
const getChunksByPython_1 = require("./helpers/getChunksByPython");
Object.defineProperty(exports, "getChunksByPython", { enumerable: true, get: function () { return getChunksByPython_1.getChunksByPython; } });
const github_1 = require("./helpers/github");
Object.defineProperty(exports, "getAllFilesFromGithubRepo", { enumerable: true, get: function () { return github_1.getAllFilesFromGithubRepo; } });
+7
-6
| { | ||
| "name": "embeddings-splitter", | ||
| "version": "0.0.5", | ||
| "version": "0.1.0", | ||
| "description": "A typescript library to split your long texts into smaller chunks to send them to OpenAI Embeddings API", | ||
@@ -22,3 +22,3 @@ "main": "lib/index.js", | ||
| "type": "git", | ||
| "url": "git+https://github.com/another-ai/embeddings-splitter.git" | ||
| "url": "git+https://github.com/different-ai/embeddings-splitter.git" | ||
| }, | ||
@@ -30,8 +30,8 @@ "keywords": [ | ||
| ], | ||
| "author": "another AI", | ||
| "author": "Different AI", | ||
| "license": "MIT", | ||
| "bugs": { | ||
| "url": "https://github.com/another-ai/embeddings-splitter/issues" | ||
| "url": "https://github.com/different-ai/embeddings-splitter/issues" | ||
| }, | ||
| "homepage": "https://github.com/another-ai/embeddings-splitter#readme", | ||
| "homepage": "https://github.com/different-ai/embeddings-splitter#readme", | ||
| "devDependencies": { | ||
@@ -52,4 +52,5 @@ "@types/jest": "29.2.4", | ||
| "dependencies": { | ||
| "@dqbd/tiktoken": "^0.2.1" | ||
| "@dqbd/tiktoken": "^0.4.0", | ||
| "cross-fetch": "^3.1.5" | ||
| } | ||
| } |
+13
-25
@@ -15,3 +15,3 @@ <p align="center">embeddings-splitter</p> | ||
| `split` makes sure your string will are short enough to be embedded | ||
| `split` makes sure your strings are short enough to be embedded. (The default split size is 500 tokens, but OpenAI embeddings allow you to go up to 8,191.) | ||
@@ -21,30 +21,11 @@ ```js | ||
| // chunks to iterate on and send to a server | ||
| const chunks = split('someVeryLongText...'); | ||
| ``` | ||
| ### Batch send (experimental) | ||
| // example with biggest chunk size | ||
| const chunks = split('someVeryLongText', 8191) | ||
| ```js | ||
| import {index} from 'embeddings-splitter'; | ||
| // now you can send these chunks to be embedded | ||
| ``` | ||
| // used to send batches to a server in parallel | ||
| index(chunks, (batch) => { | ||
| // this example is using Embedbase, but it can be replaced with openai.createEmbeddings | ||
| const vaultId = 'youtube video id'; | ||
| await fetch(url + '/v1/' + 'your api key', { | ||
| method: 'POST', | ||
| headers: { | ||
| Authorization: 'Bearer ' + apiKey, | ||
| 'Content-Type': 'application/json', | ||
| }, | ||
| body: JSON.stringify({ | ||
| documents: batch, | ||
| }), | ||
| }); | ||
| }); | ||
| ``` | ||
| ### Merge chunks into single string | ||
@@ -58,3 +39,10 @@ | ||
| const chunks = ['i am a text', 'that needs to be interpreted as one ', 'for a prompt to make sense']; | ||
| const merged = merge(chunks); | ||
| const context = merge(chunks); | ||
| // e.g. of what to do with merged array | ||
| const question = 'what is this text about?'; | ||
| const prompt = `Answer the question based on the context below, and if the question can't be answered based on the context, say "I don't know"\n\nContext: ${context}\n\n---\n\nQuestion: ${question}\nAnswer:`; | ||
| createCompletion(prompt) | ||
| ``` | ||
@@ -61,0 +49,0 @@ |
Network access
Supply chain riskThis module accesses the network.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
URL strings
Supply chain riskPackage contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
221871
14.65%31
138.46%1941
46.49%2
100%58
-17.14%17
21.43%2
Infinity%+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
- Removed
Updated