embeddings-splitter
Advanced tools
| export declare function chunkText({ text, // The input text to be split | ||
| maxCharLength, }: { | ||
| text: string; | ||
| maxCharLength?: number; | ||
| }): string[]; |
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.chunkText = void 0; | ||
| // A function that splits a text into smaller pieces of roughly equal length | ||
| // The pieces are delimited by sentences and try to avoid breaking words or punctuation | ||
| // This can be useful for processing long texts with natural language models that have a limited input size | ||
// Splits `text` into chunks of roughly `maxCharLength` characters, cutting at
// sentence boundaries (periods). Useful for feeding long documents to models
// with a limited input size.
// NOTE: sentence detection is a naive "." split, so abbreviations, decimals
// and languages without periods are not handled.
function chunkText({ text, // The input text to be split
// The desired maximum length of each piece in characters.
// This uses 4 characters as an approximation of the average token length
// since there isn't a good JS tokenizer at the moment.
maxCharLength = 250 * 4, }) {
    const chunks = [];
    let currentChunk = '';
    // Strip newlines and split on periods, keeping the "." tokens so they can
    // be re-attached to their sentence.
    const sentences = text.replace(/\n/g, ' ').split(/([.])/);
    // Accept a chunk once it lands within +/-50% of the target length.
    const lowerBound = maxCharLength * 0.5;
    const upperBound = maxCharLength * 1.5;
    // Drop leading periods/spaces and surrounding whitespace.
    const clean = (chunk) => chunk.replace(/^[. ]+/, '').trim();
    for (const sentence of sentences) {
        const trimmedSentence = sentence.trim();
        if (!trimmedSentence)
            continue;
        const chunkLength = currentChunk.length + trimmedSentence.length + 1;
        if (chunkLength > upperBound) {
            // Adding the sentence would overshoot: flush the current chunk and
            // start a new one with this sentence.
            const cleaned = clean(currentChunk);
            if (cleaned)
                chunks.push(cleaned);
            currentChunk = trimmedSentence;
        }
        else {
            // Append the sentence (no space before a bare period).
            // BUG FIX: previously, when the chunk reached the tolerance range,
            // it was pushed WITHOUT the sentence that completed it, so that
            // sentence was silently dropped from the output.
            currentChunk += `${trimmedSentence === '.' ? '' : ' '}${trimmedSentence}`;
            if (chunkLength >= lowerBound) {
                // The chunk is within tolerance: flush it.
                const cleaned = clean(currentChunk);
                if (cleaned)
                    chunks.push(cleaned);
                currentChunk = '';
            }
        }
    }
    // Flush any remainder.
    // BUG FIX: the trailing chunk is now cleaned like every other chunk
    // (previously it could keep a leading space/period).
    const rest = clean(currentChunk);
    if (rest) {
        chunks.push(rest);
    }
    return chunks;
}
| exports.chunkText = chunkText; |
| export {}; |
| "use strict"; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| var __importDefault = (this && this.__importDefault) || function (mod) { | ||
| return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| const cross_fetch_1 = __importDefault(require("cross-fetch")); | ||
// SECURITY: a GitHub personal access token was previously hard-coded and
// committed here — that token must be considered leaked and revoked
// immediately. Read the token from the environment instead.
const githubToken = process.env.GITHUB_TOKEN;
// Recursively fetch the metadata of every file in a GitHub repo directory.
const getAllFilesFromGithubRepo = async (path) => {
    const response = await (0, cross_fetch_1.default)(path, {
        headers: {
            Authorization: `token ${githubToken}`,
        },
    });
    if (!response.ok) {
        throw new Error(`GitHub API request failed (${response.status}): ${path}`);
    }
    const data = await response.json();
    const dataList = [];
    // BUG FIX: the previous version used an async forEach callback, whose
    // awaited recursive results were discarded, and then ran Promise.all over
    // an array of plain objects. Iterate sequentially and flatten instead.
    for (const item of data) {
        if (item.type === 'dir') {
            dataList.push(...(await getAllFilesFromGithubRepo(item._links.self)));
        }
        else {
            dataList.push(item);
        }
    }
    return dataList;
};
const main = async (repo) => {
    if (!githubToken) {
        throw new Error('GITHUB_TOKEN environment variable is not set');
    }
    // get all files from the github repo
    const files = await getAllFilesFromGithubRepo(`https://api.github.com/repos/${repo}/contents/`);
    // keep only python files
    console.log(files);
};
// BUG FIX: the promise was previously left floating with no rejection handler.
main('different-ai/embedbase').catch(console.error);
// split the files in token chunks of 1000
//
| declare function extractFunctions(filename: string): string[]; | ||
| export { extractFunctions as getChunksByJavascript }; |
| "use strict"; | ||
| // function extractFunctions(code) { | ||
| // const functionRegex = /(?:async\s+)?function\s+(\w+)\s*\(([\w\s,]*)\)\s*{([\S\s]*?)}/g; | ||
| // const arrowFunctionRegex = /const\s+(\w+)\s*=\s*\(([\w\s,]*)\)\s*=>\s*{([\S\s]*?)}/g; | ||
| var __importDefault = (this && this.__importDefault) || function (mod) { | ||
| return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByJavascript = void 0; | ||
| // const functions = []; | ||
| // let match; | ||
| // while ((match = functionRegex.exec(code))) { | ||
| // const [, name, parameters, body] = match; | ||
| // functions.push({ name, parameters, body }); | ||
| // } | ||
| // while ((match = arrowFunctionRegex.exec(code))) { | ||
| // const [, name, parameters, body] = match; | ||
| // functions.push({ name, parameters, body }); | ||
| // } | ||
| // return functions; | ||
| // } | ||
| const typescript_1 = __importDefault(require("typescript")); | ||
// Collect the names of all named function declarations in a
// TypeScript/JavaScript file, walking the entire AST.
// Returns an array of function names; throws if the file cannot be loaded.
function extractFunctions(filename) {
    const program = typescript_1.default.createProgram([filename], {});
    const sourceFile = program.getSourceFile(filename);
    // BUG FIX: a missing/unreadable file used to produce an opaque crash
    // inside forEachChild (sourceFile undefined); fail with a clear message.
    if (!sourceFile) {
        throw new Error(`Could not load source file: ${filename}`);
    }
    const collectedFunctions = [];
    function visit(node) {
        // BUG FIX: anonymous declarations (e.g. `export default function () {}`)
        // have no `name` node — the old code crashed dereferencing it.
        if (typescript_1.default.isFunctionDeclaration(node) && node.name) {
            collectedFunctions.push(node.name.text);
        }
        typescript_1.default.forEachChild(node, visit);
    }
    visit(sourceFile);
    return collectedFunctions;
}
| exports.getChunksByJavascript = extractFunctions; |
| import { TiktokenEmbedding } from '@dqbd/tiktoken'; | ||
| export declare function getChunksByMaxToken(text: string, max_tokens?: number, encoding_name?: TiktokenEmbedding): Promise<any[]>; |
| "use strict"; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByMaxToken = void 0; | ||
| const tiktoken_1 = require("@dqbd/tiktoken"); | ||
| const EMBEDDING_CTX_LENGTH = 8191; | ||
| const EMBEDDING_ENCODING = 'cl100k_base'; | ||
function* batched(iterable, n) {
    /* Batch data into arrays of length n. The last batch may be shorter. */
    if (n < 1) {
        throw new Error('n must be at least one');
    }
    const it = iterable[Symbol.iterator]();
    while (true) {
        // BUG FIX: the previous implementation took `it.next().value` blindly
        // and filtered out `undefined`, which both dropped legitimate
        // `undefined` elements and conflated them with iterator exhaustion.
        // Check the iterator's `done` flag explicitly instead.
        const batch = [];
        for (let i = 0; i < n; i++) {
            const result = it.next();
            if (result.done) {
                break;
            }
            batch.push(result.value);
        }
        if (batch.length === 0) {
            return;
        }
        yield batch;
    }
}
function* chunked_tokens(text, encoding_name, chunk_length) {
    // Tokenize the whole text once with the requested tiktoken encoding,
    // then hand out batches of at most `chunk_length` token ids.
    const tokenizer = (0, tiktoken_1.get_encoding)(encoding_name);
    yield* batched(tokenizer.encode(text), chunk_length);
}
// Split `text` into decoded string chunks of at most `max_tokens` tokens,
// using the given tiktoken encoding (cl100k_base by default).
// Returns a promise of the array of chunk strings.
async function getChunksByMaxToken(text, max_tokens = EMBEDDING_CTX_LENGTH, encoding_name = EMBEDDING_ENCODING) {
    const chunks = [];
    // PERF FIX: the encoding and TextDecoder were previously re-created on
    // every loop iteration; build them once up front.
    const enc = (0, tiktoken_1.get_encoding)(encoding_name);
    const decoder = new TextDecoder();
    for (const chunk of chunked_tokens(text, encoding_name, max_tokens)) {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore
        chunks.push(decoder.decode(enc.decode(chunk)));
    }
    // A weighted-average-of-chunk-embeddings mode (as in the OpenAI cookbook)
    // was removed for now; it would make a good separate function.
    return chunks;
}
| exports.getChunksByMaxToken = getChunksByMaxToken; |
| export declare function getChunksByNewLine(text: string): string[]; |
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByNewLine = void 0; | ||
// Split `text` into chunks on line breaks, handling both LF and CRLF endings.
function getChunksByNewLine(text) {
    const lineBreak = /\r?\n/;
    return text.split(lineBreak);
}
| exports.getChunksByNewLine = getChunksByNewLine; |
| interface FunctionData { | ||
| code: string; | ||
| function_name: string; | ||
| } | ||
| declare function getFunctions(code: string): FunctionData[]; | ||
| export { getFunctions as getChunksByPython }; |
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksByPython = void 0; | ||
function get_function_name(code) {
    /**
    Extract the function name from a line beginning with "def ".
    Throws if the line does not start with "def " or has no "(".
    **/
    if (!code.startsWith('def '))
        throw new Error("Code does not start with 'def'");
    const parenIndex = code.indexOf('(');
    // BUG FIX: without this guard, indexOf() returning -1 made
    // slice('def '.length, -1) silently produce a truncated, wrong name.
    if (parenIndex === -1)
        throw new Error("Function definition has no '('");
    return code.slice('def '.length, parenIndex);
}
function get_until_no_space(all_lines, i) {
    /**
    Collect line i plus every following line that is blank, indented,
    or starts with ')': i.e. lines still inside the function definition.
    Stops at the first line that leaves the definition (capped at 10000 lines).
    **/
    const collected = [all_lines[i]];
    const limit = Math.min(all_lines.length, i + 10000);
    for (let j = i + 1; j < limit; j++) {
        const line = all_lines[j];
        const insideDefinition = line.length === 0 || line[0] === ' ' || line[0] === '\t' || line[0] === ')';
        if (!insideDefinition) {
            break;
        }
        collected.push(line);
    }
    return collected.join('\n');
}
function getFunctions(code) {
    /**
    Scan a Python source string and return { code, function_name } for
    every top-level function definition. Throws on empty input.
    **/
    if (code.length === 0)
        throw new Error('Code is empty');
    const all_lines = code.split('\n');
    const found = [];
    all_lines.forEach((line, idx) => {
        if (!line.startsWith('def '))
            return;
        found.push({
            code: get_until_no_space(all_lines, idx),
            function_name: get_function_name(line),
        });
    });
    return found;
}
| exports.getChunksByPython = getFunctions; |
| export declare function getChunksSimple({ text, // The input text to be split | ||
| maxCharLength, }: { | ||
| text: string; | ||
| maxCharLength?: number; | ||
| }): string[]; |
| "use strict"; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getChunksSimple = void 0; | ||
| // A function that splits a text into smaller pieces of roughly equal length | ||
| // The pieces are delimited by sentences and try to avoid breaking words or punctuation | ||
| // This can be useful for processing long texts with natural language models that have a limited input size | ||
// Splits `text` into chunks of roughly `maxCharLength` characters, cutting at
// sentence boundaries (periods). Useful for feeding long documents to models
// with a limited input size.
// NOTE: sentence detection is a naive "." split, so abbreviations, decimals
// and languages without periods are not handled.
function getChunksSimple({ text, // The input text to be split
// The desired maximum length of each piece in characters.
// This uses 4 characters as an approximation of the average token length
// since there isn't a good JS tokenizer at the moment.
maxCharLength = 250 * 4, }) {
    const chunks = [];
    let currentChunk = '';
    // Strip newlines and split on periods, keeping the "." tokens so they can
    // be re-attached to their sentence.
    const sentences = text.replace(/\n/g, ' ').split(/([.])/);
    // Accept a chunk once it lands within +/-50% of the target length.
    const lowerBound = maxCharLength * 0.5;
    const upperBound = maxCharLength * 1.5;
    // Drop leading periods/spaces and surrounding whitespace.
    const clean = (chunk) => chunk.replace(/^[. ]+/, '').trim();
    for (const sentence of sentences) {
        const trimmedSentence = sentence.trim();
        if (!trimmedSentence)
            continue;
        const chunkLength = currentChunk.length + trimmedSentence.length + 1;
        if (chunkLength > upperBound) {
            // Adding the sentence would overshoot: flush the current chunk and
            // start a new one with this sentence.
            const cleaned = clean(currentChunk);
            if (cleaned)
                chunks.push(cleaned);
            currentChunk = trimmedSentence;
        }
        else {
            // Append the sentence (no space before a bare period).
            // BUG FIX: previously, when the chunk reached the tolerance range,
            // it was pushed WITHOUT the sentence that completed it, so that
            // sentence was silently dropped from the output.
            currentChunk += `${trimmedSentence === '.' ? '' : ' '}${trimmedSentence}`;
            if (chunkLength >= lowerBound) {
                // The chunk is within tolerance: flush it.
                const cleaned = clean(currentChunk);
                if (cleaned)
                    chunks.push(cleaned);
                currentChunk = '';
            }
        }
    }
    // Flush any remainder.
    // BUG FIX: the trailing chunk is now cleaned like every other chunk
    // (previously it could keep a leading space/period).
    const rest = clean(currentChunk);
    if (rest) {
        chunks.push(rest);
    }
    return chunks;
}
| exports.getChunksSimple = getChunksSimple; |
| interface GithubFile { | ||
| name: string; | ||
| path: string; | ||
| sha: string; | ||
| size: number; | ||
| url: string; | ||
| html_url: string; | ||
| git_url: string; | ||
| download_url: string; | ||
| type: 'file' | 'dir'; | ||
| _links: { | ||
| self: string; | ||
| git: string; | ||
| html: string; | ||
| }; | ||
| } | ||
| export declare const getAllFilesFromGithubRepo: (url: string, githubToken: string) => Promise<GithubFile[]>; | ||
| export {}; |
| "use strict"; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| var __importDefault = (this && this.__importDefault) || function (mod) { | ||
| return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.getAllFilesFromGithubRepo = void 0; | ||
| const cross_fetch_1 = __importDefault(require("cross-fetch")); | ||
// get all files from a GitHub repo
// Recursively fetch the metadata of every file in a GitHub repo via the
// contents API, flattening subdirectory results into a single array.
// Throws on missing arguments or a non-2xx API response.
const getAllFilesFromGithubRepo = async (url, githubToken) => {
    if (!url) {
        throw new Error('No url provided');
    }
    if (!githubToken) {
        throw new Error('No github token provided');
    }
    const response = await (0, cross_fetch_1.default)(url, {
        headers: {
            Authorization: `token ${githubToken}`,
        },
    });
    // ROBUSTNESS FIX: a non-2xx response (bad token, missing repo, rate
    // limit) returns a JSON object instead of an array, which previously
    // caused an opaque iteration error. Fail with a clear message instead.
    if (!response.ok) {
        throw new Error(`GitHub API request failed (${response.status}): ${url}`);
    }
    const data = await response.json();
    const dataList = [];
    for (const item of data) {
        if (item.type === 'file') {
            dataList.push(item);
        }
        else if (item.type === 'dir') {
            // Recurse into subdirectories and flatten their files.
            const subdirFiles = await getAllFilesFromGithubRepo(item._links.self, githubToken);
            dataList.push(...subdirFiles);
        }
    }
    return dataList;
};
| exports.getAllFilesFromGithubRepo = getAllFilesFromGithubRepo; |
| import type * as tiktoken from "@dqbd/tiktoken"; | ||
| import { Document } from "./document.js"; | ||
| interface TextSplitterParams { | ||
| chunkSize: number; | ||
| chunkOverlap: number; | ||
| } | ||
| export declare abstract class TextSplitter implements TextSplitterParams { | ||
| chunkSize: number; | ||
| chunkOverlap: number; | ||
| constructor(fields?: Partial<TextSplitterParams>); | ||
| abstract splitText(text: string): Promise<string[]>; | ||
| createDocuments(texts: string[], metadatas?: Record<string, any>[]): Promise<Document[]>; | ||
| splitDocuments(documents: Document[]): Promise<Document[]>; | ||
| private joinDocs; | ||
| mergeSplits(splits: string[], separator: string): string[]; | ||
| } | ||
| export interface CharacterTextSplitterParams extends TextSplitterParams { | ||
| separator: string; | ||
| } | ||
| export declare class CharacterTextSplitter extends TextSplitter implements CharacterTextSplitterParams { | ||
| separator: string; | ||
| constructor(fields?: Partial<CharacterTextSplitterParams>); | ||
| splitText(text: string): Promise<string[]>; | ||
| } | ||
| export interface RecursiveCharacterTextSplitterParams extends TextSplitterParams { | ||
| separators: string[]; | ||
| } | ||
| export declare class RecursiveCharacterTextSplitter extends TextSplitter implements RecursiveCharacterTextSplitterParams { | ||
| separators: string[]; | ||
| constructor(fields?: Partial<RecursiveCharacterTextSplitterParams>); | ||
| splitText(text: string): Promise<string[]>; | ||
| } | ||
| export interface TokenTextSplitterParams extends TextSplitterParams { | ||
| encodingName: tiktoken.TiktokenEmbedding; | ||
| allowedSpecial: "all" | Array<string>; | ||
| disallowedSpecial: "all" | Array<string>; | ||
| } | ||
| /** | ||
| * Implementation of splitter which looks at tokens. | ||
| */ | ||
| export declare class TokenTextSplitter extends TextSplitter implements TokenTextSplitterParams { | ||
| encodingName: tiktoken.TiktokenEmbedding; | ||
| allowedSpecial: "all" | Array<string>; | ||
| disallowedSpecial: "all" | Array<string>; | ||
| private tokenizer; | ||
| constructor(fields?: Partial<TokenTextSplitterParams>); | ||
| splitText(text: string): Promise<string[]>; | ||
| static imports(): Promise<typeof tiktoken>; | ||
| } | ||
| export {}; |
| "use strict"; | ||
| var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
| if (k2 === undefined) k2 = k; | ||
| var desc = Object.getOwnPropertyDescriptor(m, k); | ||
| if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
| desc = { enumerable: true, get: function() { return m[k]; } }; | ||
| } | ||
| Object.defineProperty(o, k2, desc); | ||
| }) : (function(o, m, k, k2) { | ||
| if (k2 === undefined) k2 = k; | ||
| o[k2] = m[k]; | ||
| })); | ||
| var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
| Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
| }) : function(o, v) { | ||
| o["default"] = v; | ||
| }); | ||
| var __importStar = (this && this.__importStar) || function (mod) { | ||
| if (mod && mod.__esModule) return mod; | ||
| var result = {}; | ||
| if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
| __setModuleDefault(result, mod); | ||
| return result; | ||
| }; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.TokenTextSplitter = exports.RecursiveCharacterTextSplitter = exports.CharacterTextSplitter = exports.TextSplitter = void 0; | ||
| const document_js_1 = require("./document.js"); | ||
/**
 * Base text splitter: holds the chunk sizing configuration and the shared
 * logic for merging raw splits into (optionally overlapping) chunks.
 * Subclasses implement splitText().
 */
class TextSplitter {
    constructor(fields) {
        var _a, _b;
        // Defaults: 1000-char chunks overlapping by 200 chars.
        this.chunkSize = 1000;
        this.chunkOverlap = 200;
        this.chunkSize = (_a = fields === null || fields === void 0 ? void 0 : fields.chunkSize) !== null && _a !== void 0 ? _a : this.chunkSize;
        this.chunkOverlap = (_b = fields === null || fields === void 0 ? void 0 : fields.chunkOverlap) !== null && _b !== void 0 ? _b : this.chunkOverlap;
        if (this.chunkOverlap >= this.chunkSize) {
            throw new Error("Cannot have chunkOverlap >= chunkSize");
        }
    }
    /**
     * Split each text and wrap every chunk in a Document, pairing texts with
     * their metadata by index (empty metadata when none is given).
     */
    createDocuments(texts,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    metadatas = []) {
        return __awaiter(this, void 0, void 0, function* () {
            const _metadatas = metadatas.length > 0 ? metadatas : new Array(texts.length).fill({});
            const documents = new Array();
            for (let i = 0; i < texts.length; i += 1) {
                const text = texts[i];
                for (const chunk of yield this.splitText(text)) {
                    documents.push(new document_js_1.Document({ pageContent: chunk, metadata: _metadatas[i] }));
                }
            }
            return documents;
        });
    }
    /** Re-split existing Documents, preserving each document's metadata. */
    splitDocuments(documents) {
        return __awaiter(this, void 0, void 0, function* () {
            const texts = documents.map((doc) => doc.pageContent);
            const metadatas = documents.map((doc) => doc.metadata);
            return this.createDocuments(texts, metadatas);
        });
    }
    /** Join pieces with the separator; null when the joined text is empty. */
    joinDocs(docs, separator) {
        const text = docs.join(separator).trim();
        return text === "" ? null : text;
    }
    /**
     * Greedily pack `splits` into chunks of at most ~chunkSize characters,
     * carrying roughly chunkOverlap characters over between chunks.
     * NOTE: separator length is not counted toward the running total.
     */
    mergeSplits(splits, separator) {
        const docs = [];
        const currentDoc = [];
        let total = 0;
        for (const d of splits) {
            const _len = d.length;
            if (total + _len >= this.chunkSize) {
                if (total > this.chunkSize) {
                    // BUG FIX: the warning template previously contained a
                    // stray "+" and a raw line break in the emitted message.
                    console.warn(`Created a chunk of size ${total}, which is longer than the specified ${this.chunkSize}`);
                }
                if (currentDoc.length > 0) {
                    const doc = this.joinDocs(currentDoc, separator);
                    if (doc !== null) {
                        docs.push(doc);
                    }
                    // Keep on popping if:
                    // - we have a larger chunk than in the chunk overlap
                    // - or if we still have any chunks and the length is long
                    while (total > this.chunkOverlap ||
                        (total + _len > this.chunkSize && total > 0)) {
                        total -= currentDoc[0].length;
                        currentDoc.shift();
                    }
                }
            }
            currentDoc.push(d);
            total += _len;
        }
        const doc = this.joinDocs(currentDoc, separator);
        if (doc !== null) {
            docs.push(doc);
        }
        return docs;
    }
}
| exports.TextSplitter = TextSplitter; | ||
/**
 * Splits text on a single fixed separator ("\n\n" by default), then merges
 * the pieces back into sized chunks via the base class.
 */
class CharacterTextSplitter extends TextSplitter {
    constructor(fields) {
        super(fields);
        // Paragraph break by default; any explicit separator supplied via
        // `fields` (including the empty string) takes precedence.
        this.separator = fields?.separator ?? "\n\n";
    }
    async splitText(text) {
        // First pass: naive split on the separator, or into single characters
        // when the separator is empty. Then merge into sized chunks.
        const rawSplits = this.separator ? text.split(this.separator) : text.split("");
        return this.mergeSplits(rawSplits, this.separator);
    }
}
| exports.CharacterTextSplitter = CharacterTextSplitter; | ||
/**
 * Tries a list of separators in order (coarsest first) and recursively
 * re-splits any piece that is still longer than chunkSize.
 */
class RecursiveCharacterTextSplitter extends TextSplitter {
    constructor(fields) {
        super(fields);
        // From coarsest (paragraph) to finest (single characters).
        this.separators = fields?.separators ?? ["\n\n", "\n", " ", ""];
    }
    async splitText(text) {
        // Pick the first separator that appears in the text (the empty
        // separator always matches); fall back to the last one.
        const chosen = this.separators.find((s) => s === "" || text.includes(s)) ?? this.separators[this.separators.length - 1];
        const rawSplits = chosen ? text.split(chosen) : text.split("");
        const finalChunks = [];
        let pending = [];
        // Merge the accumulated small pieces into chunks and clear the buffer.
        const flush = () => {
            if (pending.length) {
                finalChunks.push(...this.mergeSplits(pending, chosen));
                pending = [];
            }
        };
        for (const piece of rawSplits) {
            if (piece.length < this.chunkSize) {
                pending.push(piece);
            }
            else {
                // Piece is still too large: flush what we have, then recurse
                // so a finer separator can break it down.
                flush();
                finalChunks.push(...(await this.splitText(piece)));
            }
        }
        flush();
        return finalChunks;
    }
}
| exports.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter; | ||
| /** | ||
| * Implementation of splitter which looks at tokens. | ||
| */ | ||
/**
 * Implementation of splitter which looks at tokens, measuring chunk size in
 * @dqbd/tiktoken tokens rather than characters.
 */
class TokenTextSplitter extends TextSplitter {
    constructor(fields) {
        super(fields);
        // Encoding defaults to GPT-2's BPE; special-token handling mirrors
        // tiktoken's API ("all" or an explicit list).
        this.encodingName = fields?.encodingName ?? "gpt2";
        this.allowedSpecial = fields?.allowedSpecial ?? [];
        this.disallowedSpecial = fields?.disallowedSpecial ?? "all";
    }
    async splitText(text) {
        // Lazily build the tokenizer on first use.
        if (!this.tokenizer) {
            const tiktoken = await TokenTextSplitter.imports();
            this.tokenizer = tiktoken.get_encoding(this.encodingName);
        }
        const tokenIds = this.tokenizer.encode(text, this.allowedSpecial, this.disallowedSpecial);
        const decoder = new TextDecoder();
        const pieces = [];
        // Slide a chunkSize-token window forward, overlapping consecutive
        // windows by chunkOverlap tokens.
        const stride = this.chunkSize - this.chunkOverlap;
        for (let start = 0; start < tokenIds.length; start += stride) {
            const end = Math.min(start + this.chunkSize, tokenIds.length);
            pieces.push(decoder.decode(this.tokenizer.decode(tokenIds.slice(start, end))));
        }
        return pieces;
    }
    static imports() {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                // Dynamic require keeps tiktoken an optional dependency.
                return yield Promise.resolve().then(() => __importStar(require("@dqbd/tiktoken")));
            }
            catch (err) {
                console.error(err);
                throw new Error("Please install @dqbd/tiktoken as a dependency with, e.g. `npm install -S @dqbd/tiktoken`");
            }
        });
    }
}
| exports.TokenTextSplitter = TokenTextSplitter; |
+6
-12
@@ -1,12 +0,6 @@ | ||
| export declare function split(prompt: string, maxTokens?: number): string[]; | ||
| type Batches = { | ||
| data: string; | ||
| }[]; | ||
| export declare function index(chunks: string[], embedCallback: (batch: Batches) => void): Promise<void[]>; | ||
| export declare const merge: (chunks: string[], maxLen?: number) => Promise<string>; | ||
| declare const _default: { | ||
| split: typeof split; | ||
| index: typeof index; | ||
| merge: (chunks: string[], maxLen?: number) => Promise<string>; | ||
| }; | ||
| export default _default; | ||
| import { getChunksSimple } from './helpers/getChunksSimple'; | ||
| import { getChunksByMaxToken } from './helpers/getChunksByMaxToken'; | ||
| import { getChunksByNewLine } from './helpers/getChunksByNewLine'; | ||
| import { getChunksByPython } from './helpers/getChunksByPython'; | ||
| import { getAllFilesFromGithubRepo } from './helpers/github'; | ||
| export { getChunksSimple, getChunksByMaxToken, getChunksByNewLine, getChunksByPython, getAllFilesFromGithubRepo }; |
+11
-78
| "use strict"; | ||
| var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
| function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
| return new (P || (P = Promise))(function (resolve, reject) { | ||
| function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
| function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
| function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
| step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
| }); | ||
| }; | ||
| Object.defineProperty(exports, "__esModule", { value: true }); | ||
| exports.merge = exports.index = exports.split = void 0; | ||
| const tiktoken_1 = require("@dqbd/tiktoken"); | ||
| const tokenizer = (0, tiktoken_1.get_encoding)('cl100k_base'); | ||
/**
 * Splits `text` into chunks of at most `maxTokens` tokens each.
 * Sentences (delimited by '. ') are kept whole; a single sentence that is by
 * itself longer than `maxTokens` can never fit and is skipped entirely.
 *
 * @param {string} text - The text to split.
 * @param {number} maxTokens - Maximum token budget per chunk.
 * @returns {string[]} The chunks, each re-terminated with a period.
 */
function splitIntoMany(text, maxTokens) {
    // Split the text into sentences; '. ' is assumed to end a sentence,
    // which may not hold for all languages or abbreviations.
    const sentences = text.split('. ');
    // Token count per sentence. The leading space mirrors how a sentence is
    // tokenized when it appears mid-text rather than at position 0.
    const nTokens = sentences.map((sentence) => tokenizer.encode(' ' + sentence).length);
    const chunks = [];
    let tokensSoFar = 0;
    let chunk = [];
    for (let i = 0; i < sentences.length; i++) {
        const sentence = sentences[i];
        const token = nTokens[i];
        // A sentence longer than the whole budget can never fit in any chunk.
        // BUG FIX: check this BEFORE flushing — the original flushed first,
        // which could emit an empty chunk ('.') or a prematurely short one.
        if (token > maxTokens) {
            continue;
        }
        // Flush the current chunk when this sentence would push it over budget.
        if (chunk.length > 0 && tokensSoFar + token > maxTokens) {
            chunks.push(chunk.join('. ') + '.');
            chunk = [];
            tokensSoFar = 0;
        }
        chunk.push(sentence);
        // +1 approximates the cost of the '. ' separator restored on join.
        tokensSoFar += token + 1;
    }
    // BUG FIX: the original dropped whatever remained in `chunk` after the
    // loop, silently losing the tail of the input. Flush the trailing chunk.
    if (chunk.length > 0) {
        chunks.push(chunk.join('. ') + '.');
    }
    return chunks;
}
/**
 * Splits a long prompt into embedding-sized chunks of at most `maxTokens`
 * tokens each. Throws when there is nothing to split.
 *
 * @param {string} prompt - The text to chunk.
 * @param {number} [maxTokens=500] - Token budget per chunk.
 * @returns {string[]} The resulting chunks.
 */
function split(prompt, maxTokens = 500) {
    const nothingToEmbed = !prompt || prompt.length === 0;
    if (nothingToEmbed) {
        throw new Error('Nothing to embeddify');
    }
    return splitIntoMany(prompt, maxTokens);
}
| exports.split = split; | ||
/**
 * Sends `chunks` to the caller-supplied embedding callback in batches.
 * Each batch holds up to 100 chunks, each wrapped as `{ data: text }`.
 * All batches are dispatched in parallel via Promise.all.
 *
 * @param {string[]} chunks - The text chunks to embed.
 * @param {(batch: {data: string}[]) => void} embedCallback - Called once per batch.
 * @returns {Promise<void[]>} Resolves when every batch callback has settled.
 */
function index(chunks, embedCallback) {
    return __awaiter(this, void 0, void 0, function* () {
        const batches = [];
        // Group chunks into batches of at most 100 items.
        for (let i = 0; i < chunks.length; i += 100) {
            batches.push(chunks.slice(i, i + 100).map((text) => ({ data: text })));
        }
        return yield Promise.all(batches.map((batch) => embedCallback(batch)));
    });
}
exports.index = index;
/**
 * Joins as many chunks as fit within `maxLen` tokens into one context string,
 * separated by '\n\n###\n\n'. Chunks past the budget are dropped, not truncated.
 *
 * @param {string[]} chunks - Candidate context chunks, in priority order.
 * @param {number} [maxLen=1800] - Token budget for the merged context.
 * @returns {Promise<string>} The merged context string.
 */
const merge = (chunks, maxLen = 1800) => __awaiter(void 0, void 0, void 0, function* () {
    let curLen = 0;
    const context = [];
    for (const chunk of chunks) {
        const nTokens = tokenizer.encode(chunk).length;
        // +4 approximates the token cost of the '\n\n###\n\n' separator.
        curLen += nTokens + 4;
        // Stop at the first chunk that would exceed the budget.
        if (curLen > maxLen) {
            break;
        }
        context.push(chunk);
    }
    return context.join('\n\n###\n\n');
});
exports.merge = merge;
exports.default = { split, index, merge: exports.merge };
// Re-export the chunking helpers so consumers can import them from the package root.
exports.getAllFilesFromGithubRepo = exports.getChunksByPython = exports.getChunksByNewLine = exports.getChunksByMaxToken = exports.getChunksSimple = void 0;
const getChunksSimple_1 = require("./helpers/getChunksSimple");
Object.defineProperty(exports, "getChunksSimple", { enumerable: true, get: function () { return getChunksSimple_1.getChunksSimple; } });
const getChunksByMaxToken_1 = require("./helpers/getChunksByMaxToken");
Object.defineProperty(exports, "getChunksByMaxToken", { enumerable: true, get: function () { return getChunksByMaxToken_1.getChunksByMaxToken; } });
const getChunksByNewLine_1 = require("./helpers/getChunksByNewLine");
Object.defineProperty(exports, "getChunksByNewLine", { enumerable: true, get: function () { return getChunksByNewLine_1.getChunksByNewLine; } });
const getChunksByPython_1 = require("./helpers/getChunksByPython");
Object.defineProperty(exports, "getChunksByPython", { enumerable: true, get: function () { return getChunksByPython_1.getChunksByPython; } });
const github_1 = require("./helpers/github");
Object.defineProperty(exports, "getAllFilesFromGithubRepo", { enumerable: true, get: function () { return github_1.getAllFilesFromGithubRepo; } });
+7
-6
| { | ||
| "name": "embeddings-splitter", | ||
| "version": "0.0.5", | ||
| "version": "0.1.0", | ||
| "description": "A typescript library to split your long texts into smaller chunks to send them to OpenAI Embeddings API", | ||
@@ -22,3 +22,3 @@ "main": "lib/index.js", | ||
| "type": "git", | ||
| "url": "git+https://github.com/another-ai/embeddings-splitter.git" | ||
| "url": "git+https://github.com/different-ai/embeddings-splitter.git" | ||
| }, | ||
@@ -30,8 +30,8 @@ "keywords": [ | ||
| ], | ||
| "author": "another AI", | ||
| "author": "Different AI", | ||
| "license": "MIT", | ||
| "bugs": { | ||
| "url": "https://github.com/another-ai/embeddings-splitter/issues" | ||
| "url": "https://github.com/different-ai/embeddings-splitter/issues" | ||
| }, | ||
| "homepage": "https://github.com/another-ai/embeddings-splitter#readme", | ||
| "homepage": "https://github.com/different-ai/embeddings-splitter#readme", | ||
| "devDependencies": { | ||
@@ -52,4 +52,5 @@ "@types/jest": "29.2.4", | ||
| "dependencies": { | ||
| "@dqbd/tiktoken": "^0.2.1" | ||
| "@dqbd/tiktoken": "^0.4.0", | ||
| "cross-fetch": "^3.1.5" | ||
| } | ||
| } |
+13
-25
@@ -15,3 +15,3 @@ <p align="center">embeddings-splitter</p> | ||
| `split` makes sure your string will are short enough to be embedded | ||
| `split` makes sure your strings are short enough to be embedded. (The default split size is 500 tokens, but OpenAI embeddings allow you to go up to 8,191.) | ||
@@ -21,30 +21,11 @@ ```js | ||
| // chunks to iterate on and send to a server | ||
| const chunks = split('someVeryLongText...'); | ||
| ``` | ||
| ### Batch send (experimental) | ||
| // example with biggest chunk size | ||
| const chunks = split('someVeryLongText', 8191) | ||
| ```js | ||
| import {index} from 'embeddings-splitter'; | ||
| // now you can send these chunks to be embedded | ||
| ``` | ||
| // used to send batches to a server in parallel | ||
| index(chunks, (batch) => { | ||
| // this example is using Embedbase, but it can be replaced with openai.createEmbeddings | ||
| const vaultId = 'youtube video id'; | ||
| await fetch(url + '/v1/' + 'your api key', { | ||
| method: 'POST', | ||
| headers: { | ||
| Authorization: 'Bearer ' + apiKey, | ||
| 'Content-Type': 'application/json', | ||
| }, | ||
| body: JSON.stringify({ | ||
| documents: batch, | ||
| }), | ||
| }); | ||
| }); | ||
| ``` | ||
| ### Merge chunks into single string | ||
@@ -58,3 +39,10 @@ | ||
| const chunks = ['i am a text', 'that needs to be interpreted as one ', 'for a prompt to make sense']; | ||
| const merged = merge(chunks); | ||
| const context = merge(chunks); | ||
| // e.g. of what to do with merged array | ||
| const question = 'what is this text about?'; | ||
| const prompt = `Answer the question based on the context below, and if the question can't be answered based on the context, say "I don't know"\n\nContext: ${context}\n\n---\n\nQuestion: ${question}\nAnswer:`; | ||
| createCompletion(prompt) | ||
| ``` | ||
@@ -61,0 +49,0 @@ |
Network access
Supply chain riskThis module accesses the network.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
URL strings
Supply chain riskPackage contains fragments of external URLs or IP addresses, which the package may be accessing at runtime.
Found 1 instance in 1 package
Long strings
Supply chain riskContains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
221871
14.65%31
138.46%1941
46.49%2
100%58
-17.14%17
21.43%2
Infinity%+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
+ Added
- Removed
Updated