semantic-chunking
Comparing version 1.1.0 to 1.2.0
chunkit.js
@@ -60,61 +60,98 @@ // ===========================

    // Load the tokenizer
    tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel);

    // Create the embedding pipeline
    generateEmbedding = await pipeline('feature-extraction', onnxEmbeddingModel, {
        quantized: onnxEmbeddingModelQuantized,
    });

    // Split the text into sentences
    const sentences = sentencize(text);

    // Compute the similarities between sentences
    const { similarities, average, variance } = await computeAdvancedSimilarities(
        sentences,
        {
            numSimilaritySentencesLookahead: numSimilaritySentencesLookahead,
            logging: logging,
        }
    );

    // Dynamically adjust the similarity threshold based on variance and average
    let dynamicThreshold = similarityThreshold;
    if (average != null && variance != null) {
        dynamicThreshold = adjustThreshold(average, variance, similarityThreshold, dynamicThresholdLowerBound, dynamicThresholdUpperBound);
    }

    // Create the initial chunks using the adjusted threshold
    const initialChunks = createChunks(sentences, similarities, maxTokenSize, dynamicThreshold, logging);

    if (logging) {
        console.log('\n=============\ninitialChunks\n=============');
        initialChunks.forEach((chunk, index) => {
            console.log("\n");
            console.log(`--------------`);
            console.log(`-- Chunk ${(index + 1)} --`);
            console.log(`--------------`);
            console.log(chunk);
        });
    }

    // Combine similar chunks and balance sizes
    if (combineChunks) {
        const combinedChunks = await optimizeAndRebalanceChunks(initialChunks, tokenizer, maxTokenSize, combineChunksSimilarityThreshold);
        if (logging) {
            console.log('\n\n=============\ncombinedChunks\n=============');
            combinedChunks.forEach((chunk, index) => {
                console.log("\n\n\n");
                console.log("--------------------");
                console.log("Chunk " + (index + 1));
                console.log("--------------------");
                console.log(chunk);
            });
        }
        // Return the combined chunks
        return combinedChunks;
    } else {
        // Return the initial chunks
        return initialChunks;
    }
}

// --------------------------
// -- Main cramit function --
// --------------------------
export async function cramit(
    text,
    {
        logging = LOGGING,
        maxTokenSize = MAX_TOKEN_SIZE,
        onnxEmbeddingModel = ONNX_EMBEDDING_MODEL,
        onnxEmbeddingModelQuantized = ONNX_EMBEDDING_MODEL_QUANTIZED,
    } = {}) {

    // Load the tokenizer
    tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel);

    // Split the text into sentences
    const sentences = sentencize(text);

    // Create chunks
    const chunks = createChunks(sentences, null, maxTokenSize, 0, logging);

    if (logging) {
        console.log('\n=============\nChunks\n=============');
        chunks.forEach((chunk, index) => {
            console.log("\n");
            console.log(`--------------`);
            console.log(`-- Chunk ${(index + 1)} --`);
            console.log(`--------------`);
            console.log(chunk);
        });
    }

    // Return chunks
    return chunks;
}
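The dynamic-threshold call above relies on an `adjustThreshold` helper whose implementation is outside this hunk. As a rough mental model only (a hypothetical sketch inferred from the call site; the actual implementation in `chunkit.js` may differ), it takes the observed average and variance of sentence similarities and clamps the working threshold between the configured bounds:

```javascript
// HYPOTHETICAL sketch of adjustThreshold -- not code from this diff.
// Inferred from the call site: adjustThreshold(average, variance,
// similarityThreshold, dynamicThresholdLowerBound, dynamicThresholdUpperBound)
function adjustThreshold(average, variance, baseThreshold, lowerBound, upperBound) {
    // When similarities cluster tightly (low variance), let the threshold
    // follow the observed average; otherwise keep the configured baseline.
    const candidate = variance < 0.01 ? average : baseThreshold;
    // Clamp to the configured bounds so the threshold never drifts too far.
    return Math.min(Math.max(candidate, lowerBound), upperBound);
}
```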
@@ -152,4 +189,4 @@

    for (let i = 0; i < embeddings.length - 1; i++) {
        const sim = cosineSimilarity(embeddings[i], embeddings[i + 1]);
        similarities.push(sim);
    }
@@ -166,3 +203,3 @@

    if (logging) console.log('numSimilaritySentencesLookahead', numSimilaritySentencesLookahead);
    const embeddings = await Promise.all(sentences.map(sentence => createEmbedding(sentence)));
@@ -174,3 +211,3 @@ let similarities = [];

    let maxSimilarity = cosineSimilarity(embeddings[i], embeddings[i + 1]);
    for (let j = i + 2; j <= i + numSimilaritySentencesLookahead && j < embeddings.length; j++) {
@@ -229,16 +266,16 @@ const sim = cosineSimilarity(embeddings[i], embeddings[j]);

    let normB = 0.0;
    for (let i = 0; i < vecA.length; i++) {
        dotProduct += vecA[i] * vecB[i];
        normA += vecA[i] ** 2;
        normB += vecB[i] ** 2;
    }
    normA = Math.sqrt(normA);
    normB = Math.sqrt(normB);
    if (normA === 0 || normB === 0) {
        return 0; // To avoid division by zero
    } else {
        return dotProduct / (normA * normB);
    }
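As a quick sanity check on the function above (standard cosine-similarity behavior, not part of the diff): identical directions score 1, orthogonal vectors score 0, and the zero-vector guard returns 0 instead of dividing by zero.

```javascript
// Expected behavior of cosineSimilarity (values follow from the definition):
console.log(cosineSimilarity([1, 0], [1, 0])); // 1 (same direction)
console.log(cosineSimilarity([1, 0], [0, 1])); // 0 (orthogonal)
console.log(cosineSimilarity([1, 0], [1, 1])); // ~0.7071 (1 / sqrt(2))
console.log(cosineSimilarity([0, 0], [1, 1])); // 0 (zero-vector guard)
```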
@@ -258,36 +295,55 @@ }

    if (logging) { console.log(`!! new chunk !! --> 1`) }

    for (let i = 1; i < sentences.length; i++) {
        currentChunkSize = tokenizer(currentChunk.join(" ")).input_ids.size;
        sentenceTokenCount = tokenizer(sentences[i]).input_ids.size;

        if (logging) {
            console.log('sentenceTokenCount', sentenceTokenCount);
            console.log('currentChunkSize', currentChunkSize);
            console.log('maxTokenSize', maxTokenSize);
            if (similarities) {
                console.log('similarity', similarities[i - 1]);
                console.log('similarityThreshold', similarityThreshold);
            }
        }

        if (similarities) {
            if (similarities[i - 1] >= similarityThreshold && currentChunkSize + sentenceTokenCount <= maxTokenSize) {
                currentChunk.push(sentences[i]);
                if (logging) { console.log('keep going...') }
            } else {
                chunks.push(currentChunk.join(" "));
                currentChunk = [sentences[i]];
                if (logging) {
                    console.log('stop...');
                    console.log('\n');
                    console.log(`!! new chunk !! --> ${chunks.length + 1}`);
                }
            }
        } else {
            if (currentChunkSize + sentenceTokenCount <= maxTokenSize) {
                currentChunk.push(sentences[i]);
                if (logging) { console.log('keep going...') }
            } else {
                chunks.push(currentChunk.join(" "));
                currentChunk = [sentences[i]];
                if (logging) {
                    console.log('stop...');
                    console.log('\n');
                    console.log(`!! new chunk !! --> ${chunks.length + 1}`);
                }
            }
        }
    }

    // Add the last chunk if it's not empty
    if (currentChunk.length > 0 && currentChunk[0] !== "") {
        chunks.push(currentChunk.join(" "));
    }

    return chunks;
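The behavioral change worth noting in this hunk: `createChunks` now tolerates a `null` similarities argument, falling back to packing sentences by token budget alone. Both call shapes appear verbatim elsewhere in this diff:

```javascript
// Semantic path (used by chunkit): split on similarity drops AND token budget.
const initialChunks = createChunks(sentences, similarities, maxTokenSize, dynamicThreshold, logging);

// Cram path (used by the new cramit): no similarities, so only the token budget applies.
const chunks = createChunks(sentences, null, maxTokenSize, 0, logging);
```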
@@ -313,3 +369,3 @@ }

    const similarity = currentEmbedding ? cosineSimilarity(currentEmbedding, nextEmbedding) : 0;
    if (similarity >= combineChunksSimilarityThreshold) {
@@ -316,0 +372,0 @@ currentChunkText += " " + chunk;
package.json

{
  "name": "semantic-chunking",
  "version": "1.2.0",
  "description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).",
  "repository": {
@@ -6,0 +6,0 @@ "type": "git",
README.md

# 🍱 semantic-chunking

Semantically create chunks from large texts.
Useful for workflows involving large language models (LLMs).
@@ -89,2 +90,5 @@ ## Install

Look at the `example.js` file in the root of this project for a more complex example of using all the optional parameters.
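For a quicker orientation, here is a minimal sketch of a call that sets several of the options (the option names come from the `chunkit` signature in this release; the values shown are illustrative, not recommended defaults):

```javascript
import { chunkit } from 'semantic-chunking';

async function main() {
    const text = "...a long document...";
    const myChunks = await chunkit(text, {
        logging: true,                      // print debug output while chunking
        maxTokenSize: 500,                  // hard cap on tokens per chunk
        similarityThreshold: 0.5,           // illustrative value; see Tuning below
        numSimilaritySentencesLookahead: 2, // illustrative value
        combineChunks: true,                // merge similar chunks and rebalance sizes
        onnxEmbeddingModel: "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
        onnxEmbeddingModelQuantized: true,
    });
    console.log(myChunks);
}

main();
```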
## Tuning
@@ -170,2 +174,56 @@

Each of these parameters allows you to customize the `chunkit` function to better fit the text size, content complexity, and performance requirements of your application.
---

## `cramit` - 🧼 The Quick & Dirty

There is an additional function you can import that simply "crams" sentences together until they meet your target token size, for when you just need quick, high-density chunks.
Basic usage:

```javascript
import { cramit } from 'semantic-chunking';

let frogText = "A frog hops into a deli and croaks to the cashier, \"I'll have a sandwich, please.\" The cashier, surprised, quickly makes the sandwich and hands it over. The frog takes a big bite, looks around, and then asks, \"Do you have any flies to go with this?\" The cashier, taken aback, replies, \"Sorry, we're all out of flies today.\" The frog shrugs and continues munching on its sandwich, clearly unfazed by the lack of fly toppings. Just another day in the life of a sandwich-loving amphibian! 🐸🥪";

async function main() {
    let myFrogChunks = await cramit(frogText, { maxTokenSize: 65 });
    console.log("myFrogChunks", myFrogChunks);
}

main();
```
Look at the `example2.js` file in the root of this project for a more complex example of using all the optional parameters.

### Tuning

The behavior of the `cramit` function can be finely tuned using several optional parameters in the options object. Understanding how each parameter affects the function can help you optimize the chunking process for your specific requirements. A consolidated example follows the parameter list below.
#### `logging`

- **Type**: Boolean
- **Default**: `false`
- **Description**: Enables detailed debug output during the chunking process. Turning this on can help in diagnosing how chunks are formed or why certain chunks are combined.

#### `maxTokenSize`

- **Type**: Integer
- **Default**: `500`
- **Description**: Sets the maximum number of tokens allowed in a single chunk. Smaller values result in smaller, more numerous chunks, while larger values can create fewer, larger chunks. It’s crucial for maintaining manageable chunk sizes when processing large texts.

#### `onnxEmbeddingModel`

- **Type**: String
- **Default**: `Xenova/paraphrase-multilingual-MiniLM-L12-v2`
- **Description**: Specifies the model used to generate sentence embeddings. Different models may yield different qualities of embeddings, affecting the chunking quality, especially in multilingual contexts.
- **Resource Link**: [ONNX Embedding Models](https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending)
  A filtered list of embedding models converted to ONNX format by Xenova.
  Refer to the model table below for a list of suggested models and their sizes (choose a multilingual model if you need to chunk text other than English).

#### `onnxEmbeddingModelQuantized`

- **Type**: Boolean
- **Default**: `true`
- **Description**: Indicates whether to use a quantized version of the specified model. Quantized models generally offer faster performance with a slight trade-off in accuracy, which can be beneficial when processing very large datasets.
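Putting the four parameters above together (a minimal sketch; the model name is the default noted above, and the other values are illustrative):

```javascript
import { cramit } from 'semantic-chunking';

async function main() {
    const chunks = await cramit("...your long text here...", {
        logging: true,      // watch how sentences get packed
        maxTokenSize: 300,  // illustrative target chunk size
        onnxEmbeddingModel: "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
        onnxEmbeddingModelQuantized: true,
    });
    console.log(chunks);
}

main();
```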
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.