@orama/chunker
Advanced tools
Comparing version 0.0.2 to 0.0.3
@@ -0,8 +1,14 @@ | ||
/** | ||
* Represents a Chunker object that can be used to tokenize input strings and count the number of tokens. | ||
*/ | ||
export declare class Chunker { | ||
protected verbose: boolean; | ||
protected ready: Promise<boolean>; | ||
private tokenizer; | ||
constructor(); | ||
private init; | ||
getNumberOfTokens(input: string): Promise<number>; | ||
/** | ||
* Gets the number of tokens in the input string. | ||
* @param input - The input string to tokenize. | ||
* @returns The number of tokens in the input string. | ||
*/ | ||
getNumberOfTokens(input: string): number; | ||
} |
@@ -1,4 +0,5 @@ | ||
import { AutoTokenizer, env } from './deps/cdn.jsdelivr.net/npm/@xenova/transformers@2.15.0.js'; | ||
env.useBrowserCache = false; | ||
env.allowLocalModels = false; | ||
import { getEncoding } from 'js-tiktoken'; | ||
/** | ||
* Represents a Chunker object that can be used to tokenize input strings and count the number of tokens. | ||
*/ | ||
export class Chunker { | ||
@@ -12,9 +13,2 @@ constructor() { | ||
}); | ||
Object.defineProperty(this, "ready", { | ||
enumerable: true, | ||
configurable: true, | ||
writable: true, | ||
value: void 0 | ||
}); | ||
// deno-lint-ignore no-explicit-any | ||
Object.defineProperty(this, "tokenizer", { | ||
@@ -26,14 +20,12 @@ enumerable: true, | ||
}); | ||
this.ready = this.init() | ||
.then(() => true) | ||
.catch(() => false); | ||
this.tokenizer = getEncoding('gpt2'); | ||
} | ||
async init() { | ||
this.tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased'); | ||
/** | ||
* Gets the number of tokens in the input string. | ||
* @param input - The input string to tokenize. | ||
* @returns The number of tokens in the input string. | ||
*/ | ||
getNumberOfTokens(input) { | ||
return this.tokenizer.encode(input).length; | ||
} | ||
async getNumberOfTokens(input) { | ||
await this.ready; | ||
const result = await this.tokenizer(input); | ||
return result.input_ids.size; | ||
} | ||
} |
import { Chunker } from './common.js'; | ||
/** | ||
* Represents a fixed chunker that splits a string into chunks based on a maximum number of tokens per chunk. | ||
*/ | ||
export declare class FixedChunker extends Chunker { | ||
chunk(input: string, maxTokensPerChunk: number): Promise<string[]>; | ||
/** | ||
* Splits the input string into chunks based on the maximum number of tokens per chunk. | ||
* @param {String} input - The input string to be chunked. | ||
* @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk. | ||
* @returns An array of strings representing the chunks. | ||
*/ | ||
chunk(input: string, maxTokensPerChunk: number): string[]; | ||
} |
import { Chunker } from './common.js'; | ||
/** | ||
* Represents a fixed chunker that splits a string into chunks based on a maximum number of tokens per chunk. | ||
*/ | ||
export class FixedChunker extends Chunker { | ||
async chunk(input, maxTokensPerChunk) { | ||
/** | ||
* Splits the input string into chunks based on the maximum number of tokens per chunk. | ||
* @param {String} input - The input string to be chunked. | ||
* @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk. | ||
* @returns An array of strings representing the chunks. | ||
*/ | ||
chunk(input, maxTokensPerChunk) { | ||
const words = input.split(/\s+/); | ||
@@ -15,3 +24,3 @@ const chunks = []; | ||
const testChunk = words.slice(start, mid + 1).join(' '); | ||
const tokenCount = await this.getNumberOfTokens(testChunk); | ||
const tokenCount = this.getNumberOfTokens(testChunk); | ||
if (tokenCount <= maxTokensPerChunk) { | ||
@@ -18,0 +27,0 @@ validChunk = testChunk; |
@@ -1,3 +0,2 @@ | ||
import "./_dnt.polyfills.js"; | ||
export { FixedChunker } from './fixed.js'; | ||
export { NLPChunker } from './nlp.js'; |
@@ -1,3 +0,2 @@ | ||
import "./_dnt.polyfills.js"; | ||
export { FixedChunker } from './fixed.js'; | ||
export { NLPChunker } from './nlp.js'; |
import { Chunker } from './common.js'; | ||
export declare class NLPChunker extends Chunker { | ||
chunk(input: string, maxTokensPerChunk: number): Promise<string[]>; | ||
/** | ||
* Splits the input text into chunks based on the maximum number of tokens per chunk. | ||
* @param {String} input - The input text to be chunked. | ||
* @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk. | ||
* @returns An array of chunks. | ||
*/ | ||
chunk(input: string, maxTokensPerChunk: number): string[]; | ||
} |
@@ -0,5 +1,15 @@ | ||
/** | ||
* Represents a chunker that uses natural language processing (NLP) to split text into chunks. | ||
* This chunker extends the base `Chunker` class. | ||
*/ | ||
import nlp from 'compromise/one'; | ||
import { Chunker } from './common.js'; | ||
export class NLPChunker extends Chunker { | ||
async chunk(input, maxTokensPerChunk) { | ||
/** | ||
* Splits the input text into chunks based on the maximum number of tokens per chunk. | ||
* @param {String} input - The input text to be chunked. | ||
* @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk. | ||
* @returns An array of chunks. | ||
*/ | ||
chunk(input, maxTokensPerChunk) { | ||
const sentences = nlp.tokenize(input).fullSentences().out('array'); | ||
@@ -9,8 +19,6 @@ const chunks = []; | ||
for (const sentence of sentences) { | ||
const [sentenceTokenCount, currentChunkTokenCount] = await Promise.all([ | ||
this.getNumberOfTokens(sentence), | ||
this.getNumberOfTokens(currentChunk), | ||
]); | ||
const sentenceTokenCount = this.getNumberOfTokens(sentence); | ||
const currentChunkTokenCount = this.getNumberOfTokens(currentChunk); | ||
if (sentenceTokenCount + currentChunkTokenCount <= maxTokensPerChunk) { | ||
currentChunk += (currentChunk ? ' ' : '') + sentence; // Ensure space between sentences | ||
currentChunk += (currentChunk ? ' ' : '') + sentence; | ||
} | ||
@@ -17,0 +25,0 @@ else { |
@@ -5,3 +5,3 @@ { | ||
"name": "@orama/chunker", | ||
"version": "0.0.2", | ||
"version": "0.0.3", | ||
"description": "Split large texts into chunks with a maximum number of tokens. Split by fixed size or by sentence.", | ||
@@ -27,4 +27,5 @@ "license": "Apache 2.0", | ||
"dependencies": { | ||
"compromise": "14.11.2" | ||
"compromise": "14.11.2", | ||
"js-tiktoken": "1.0.10" | ||
} | ||
} |
# Orama Chunker | ||
[![Tests](https://github.com/oramasearch/chunker/actions/workflows/deno.yml/badge.svg)](https://github.com/oramasearch/chunker/actions/workflows/deno.yml) | ||
[![Node.js Tests](https://github.com/askorama/chunker/actions/workflows/nodejs.yml/badge.svg)](https://github.com/askorama/chunker/actions/workflows/nodejs.yml) | ||
[![Deno Tests](https://github.com/oramasearch/chunker/actions/workflows/deno.yml/badge.svg)](https://github.com/oramasearch/chunker/actions/workflows/deno.yml) | ||
@@ -27,3 +28,3 @@ When engaging with ChatGPT or other Large Language Models (LLMs), breaking down your input into smaller chunks is a strategy that significantly enhances the interaction experience. This approach is not just about managing the technical constraints of these models, such as input length limitations, but also about improving the quality of the dialogue. By dividing a complex query or detailed discussion into more digestible parts, users can guide the model through the conversation in a step-by-step manner. This method allows for a more nuanced understanding of the context and the specifics of each query, leading to responses that are not only accurate but also highly relevant to the user's needs. | ||
![Chunking Strategies](/misc/chunking-strategies.png) | ||
![Chunking Strategies](https://raw.githubusercontent.com/askorama/chunker/main/misc/chunking-strategies.png) | ||
@@ -30,0 +31,0 @@ The **Fixed Chunker** will divide your input text into several pieces of a specified size. It does not consider the semantics of your input text, as its sole purpose is to divide the text and ensure that each piece contains a maximum number of tokens. It is slightly faster and lighter as it requires fewer computations to determine the chunking strategy. |
@@ -0,8 +1,14 @@ | ||
/** | ||
* Represents a Chunker object that can be used to tokenize input strings and count the number of tokens. | ||
*/ | ||
export declare class Chunker { | ||
protected verbose: boolean; | ||
protected ready: Promise<boolean>; | ||
private tokenizer; | ||
constructor(); | ||
private init; | ||
getNumberOfTokens(input: string): Promise<number>; | ||
/** | ||
* Gets the number of tokens in the input string. | ||
* @param input - The input string to tokenize. | ||
* @returns The number of tokens in the input string. | ||
*/ | ||
getNumberOfTokens(input: string): number; | ||
} |
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.Chunker = void 0; | ||
const transformers_2_15_0_js_1 = require("./deps/cdn.jsdelivr.net/npm/@xenova/transformers@2.15.0.js"); | ||
transformers_2_15_0_js_1.env.useBrowserCache = false; | ||
transformers_2_15_0_js_1.env.allowLocalModels = false; | ||
const js_tiktoken_1 = require("js-tiktoken"); | ||
/** | ||
* Represents a Chunker object that can be used to tokenize input strings and count the number of tokens. | ||
*/ | ||
class Chunker { | ||
@@ -15,9 +16,2 @@ constructor() { | ||
}); | ||
Object.defineProperty(this, "ready", { | ||
enumerable: true, | ||
configurable: true, | ||
writable: true, | ||
value: void 0 | ||
}); | ||
// deno-lint-ignore no-explicit-any | ||
Object.defineProperty(this, "tokenizer", { | ||
@@ -29,15 +23,13 @@ enumerable: true, | ||
}); | ||
this.ready = this.init() | ||
.then(() => true) | ||
.catch(() => false); | ||
this.tokenizer = (0, js_tiktoken_1.getEncoding)('gpt2'); | ||
} | ||
async init() { | ||
this.tokenizer = await transformers_2_15_0_js_1.AutoTokenizer.from_pretrained('Xenova/bert-base-uncased'); | ||
/** | ||
* Gets the number of tokens in the input string. | ||
* @param input - The input string to tokenize. | ||
* @returns The number of tokens in the input string. | ||
*/ | ||
getNumberOfTokens(input) { | ||
return this.tokenizer.encode(input).length; | ||
} | ||
async getNumberOfTokens(input) { | ||
await this.ready; | ||
const result = await this.tokenizer(input); | ||
return result.input_ids.size; | ||
} | ||
} | ||
exports.Chunker = Chunker; |
import { Chunker } from './common.js'; | ||
/** | ||
* Represents a fixed chunker that splits a string into chunks based on a maximum number of tokens per chunk. | ||
*/ | ||
export declare class FixedChunker extends Chunker { | ||
chunk(input: string, maxTokensPerChunk: number): Promise<string[]>; | ||
/** | ||
* Splits the input string into chunks based on the maximum number of tokens per chunk. | ||
* @param {String} input - The input string to be chunked. | ||
* @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk. | ||
* @returns An array of strings representing the chunks. | ||
*/ | ||
chunk(input: string, maxTokensPerChunk: number): string[]; | ||
} |
@@ -5,4 +5,13 @@ "use strict"; | ||
const common_js_1 = require("./common.js"); | ||
/** | ||
* Represents a fixed chunker that splits a string into chunks based on a maximum number of tokens per chunk. | ||
*/ | ||
class FixedChunker extends common_js_1.Chunker { | ||
async chunk(input, maxTokensPerChunk) { | ||
/** | ||
* Splits the input string into chunks based on the maximum number of tokens per chunk. | ||
* @param {String} input - The input string to be chunked. | ||
* @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk. | ||
* @returns An array of strings representing the chunks. | ||
*/ | ||
chunk(input, maxTokensPerChunk) { | ||
const words = input.split(/\s+/); | ||
@@ -19,3 +28,3 @@ const chunks = []; | ||
const testChunk = words.slice(start, mid + 1).join(' '); | ||
const tokenCount = await this.getNumberOfTokens(testChunk); | ||
const tokenCount = this.getNumberOfTokens(testChunk); | ||
if (tokenCount <= maxTokensPerChunk) { | ||
@@ -22,0 +31,0 @@ validChunk = testChunk; |
@@ -1,3 +0,2 @@ | ||
import "./_dnt.polyfills.js"; | ||
export { FixedChunker } from './fixed.js'; | ||
export { NLPChunker } from './nlp.js'; |
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.NLPChunker = exports.FixedChunker = void 0; | ||
require("./_dnt.polyfills.js"); | ||
var fixed_js_1 = require("./fixed.js"); | ||
@@ -6,0 +5,0 @@ Object.defineProperty(exports, "FixedChunker", { enumerable: true, get: function () { return fixed_js_1.FixedChunker; } }); |
import { Chunker } from './common.js'; | ||
export declare class NLPChunker extends Chunker { | ||
chunk(input: string, maxTokensPerChunk: number): Promise<string[]>; | ||
/** | ||
* Splits the input text into chunks based on the maximum number of tokens per chunk. | ||
* @param {String} input - The input text to be chunked. | ||
* @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk. | ||
* @returns An array of chunks. | ||
*/ | ||
chunk(input: string, maxTokensPerChunk: number): string[]; | ||
} |
@@ -7,6 +7,16 @@ "use strict"; | ||
exports.NLPChunker = void 0; | ||
/** | ||
* Represents a chunker that uses natural language processing (NLP) to split text into chunks. | ||
* This chunker extends the base `Chunker` class. | ||
*/ | ||
const one_1 = __importDefault(require("compromise/one")); | ||
const common_js_1 = require("./common.js"); | ||
class NLPChunker extends common_js_1.Chunker { | ||
async chunk(input, maxTokensPerChunk) { | ||
/** | ||
* Splits the input text into chunks based on the maximum number of tokens per chunk. | ||
* @param {String} input - The input text to be chunked. | ||
* @param {Number} maxTokensPerChunk - The maximum number of tokens allowed per chunk. | ||
* @returns An array of chunks. | ||
*/ | ||
chunk(input, maxTokensPerChunk) { | ||
const sentences = one_1.default.tokenize(input).fullSentences().out('array'); | ||
@@ -16,8 +26,6 @@ const chunks = []; | ||
for (const sentence of sentences) { | ||
const [sentenceTokenCount, currentChunkTokenCount] = await Promise.all([ | ||
this.getNumberOfTokens(sentence), | ||
this.getNumberOfTokens(currentChunk), | ||
]); | ||
const sentenceTokenCount = this.getNumberOfTokens(sentence); | ||
const currentChunkTokenCount = this.getNumberOfTokens(currentChunk); | ||
if (sentenceTokenCount + currentChunkTokenCount <= maxTokensPerChunk) { | ||
currentChunk += (currentChunk ? ' ' : '') + sentence; // Ensure space between sentences | ||
currentChunk += (currentChunk ? ' ' : '') + sentence; | ||
} | ||
@@ -24,0 +32,0 @@ else { |
Major refactor
Supply chain risk: This package has recently undergone a major refactor. It may be unstable, or the refactor may indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
64
17008
2
21
326
1
+ Added js-tiktoken@1.0.10
+ Added base64-js@1.5.1 (transitive)
+ Added js-tiktoken@1.0.10 (transitive)