semantic-chunking - npm Package Compare versions

Comparing version 1.1.0 to 1.2.0

chunkit.js

@@ -60,61 +60,98 @@ // ===========================

// Load the tokenizer
tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel);

// Create the embedding pipeline
generateEmbedding = await pipeline('feature-extraction', onnxEmbeddingModel, {
    quantized: onnxEmbeddingModelQuantized,
});

// Split the text into sentences
const sentences = sentencize(text);

// Compute the similarities between sentences
const { similarities, average, variance } = await computeAdvancedSimilarities(
    sentences,
    {
        numSimilaritySentencesLookahead: numSimilaritySentencesLookahead,
        logging: logging,
    }
);

// Dynamically adjust the similarity threshold based on variance and average
let dynamicThreshold = similarityThreshold;
if (average != null && variance != null) {
    dynamicThreshold = adjustThreshold(average, variance, similarityThreshold, dynamicThresholdLowerBound, dynamicThresholdUpperBound);
}

// Create the initial chunks using the adjusted threshold
const initialChunks = createChunks(sentences, similarities, maxTokenSize, dynamicThreshold, logging);

if (logging) {
    console.log('\n=============\ninitialChunks\n=============');
    initialChunks.forEach((chunk, index) => {
        console.log("\n");
        console.log(`--------------`);
        console.log(`-- Chunk ${(index + 1)} --`);
        console.log(`--------------`);
        console.log(chunk);
    });
}

// combine similar chunks and balance sizes
if (combineChunks) {
    const combinedChunks = await optimizeAndRebalanceChunks(initialChunks, tokenizer, maxTokenSize, combineChunksSimilarityThreshold);
    if (logging) {
        console.log('\n\n=============\ncombinedChunks\n=============');
        combinedChunks.forEach((chunk, index) => {
            console.log("\n\n\n");
            console.log("--------------------");
            console.log("Chunk " + (index + 1));
            console.log("--------------------");
            console.log(chunk);
        });
    }
    // Return the combined chunks
    return combinedChunks;
} else {
    // Return the initial chunks
    return initialChunks;
}
}
// --------------------------
// -- Main cramit function --
// --------------------------
export async function cramit(
    text,
    {
        logging = LOGGING,
        maxTokenSize = MAX_TOKEN_SIZE,
        onnxEmbeddingModel = ONNX_EMBEDDING_MODEL,
        onnxEmbeddingModelQuantized = ONNX_EMBEDDING_MODEL_QUANTIZED,
    } = {}) {

    // Load the tokenizer
    tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel);

    // Split the text into sentences
    const sentences = sentencize(text);

    // Create chunks
    const chunks = createChunks(sentences, null, maxTokenSize, 0, logging);

    if (logging) {
        console.log('\n=============\nChunks\n=============');
        chunks.forEach((chunk, index) => {
            console.log("\n");
            console.log(`--------------`);
            console.log(`-- Chunk ${(index + 1)} --`);
            console.log(`--------------`);
            console.log(chunk);
        });
    }

    // Return chunks
    return chunks;
}

@@ -152,4 +189,4 @@

for (let i = 0; i < embeddings.length - 1; i++) {
    const sim = cosineSimilarity(embeddings[i], embeddings[i + 1]);
    similarities.push(sim);
}

@@ -166,3 +203,3 @@

if (logging) console.log('numSimilaritySentencesLookahead', numSimilaritySentencesLookahead);
const embeddings = await Promise.all(sentences.map(sentence => createEmbedding(sentence)));

@@ -174,3 +211,3 @@ let similarities = [];

let maxSimilarity = cosineSimilarity(embeddings[i], embeddings[i + 1]);
for (let j = i + 2; j <= i + numSimilaritySentencesLookahead && j < embeddings.length; j++) {
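
The fragment above is the lookahead comparison: each sentence is scored against up to `numSimilaritySentencesLookahead` following sentences. A minimal standalone sketch of that pattern follows; the `Math.max` aggregation is an assumption consistent with the `maxSimilarity` variable shown, and the `cosineSimilarity` parameter stands in for the helper defined later in this file.

```javascript
// Standalone sketch of the lookahead max-similarity pattern above:
// each sentence is scored against up to numSimilaritySentencesLookahead
// following sentences, and the highest score is kept.
function lookaheadSimilarities(embeddings, numSimilaritySentencesLookahead, cosineSimilarity) {
    const similarities = [];
    for (let i = 0; i < embeddings.length - 1; i++) {
        let maxSimilarity = cosineSimilarity(embeddings[i], embeddings[i + 1]);
        for (let j = i + 2; j <= i + numSimilaritySentencesLookahead && j < embeddings.length; j++) {
            maxSimilarity = Math.max(maxSimilarity, cosineSimilarity(embeddings[i], embeddings[j]));
        }
        similarities.push(maxSimilarity);
    }
    return similarities;
}
```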

@@ -229,16 +266,16 @@ const sim = cosineSimilarity(embeddings[i], embeddings[j]);

let normB = 0.0;
for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    normA += vecA[i] ** 2;
    normB += vecB[i] ** 2;
}
normA = Math.sqrt(normA);
normB = Math.sqrt(normB);
if (normA === 0 || normB === 0) {
    return 0; // To avoid division by zero
} else {
    return dotProduct / (normA * normB);
}
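
For reference outside the diff context, this is the standard cosine-similarity formula, dot(A, B) / (||A|| · ||B||). A self-contained copy of the helper with a quick sanity check:

```javascript
// Standalone version of the cosine-similarity helper shown above.
function cosineSimilarity(vecA, vecB) {
    let dotProduct = 0.0;
    let normA = 0.0;
    let normB = 0.0;
    for (let i = 0; i < vecA.length; i++) {
        dotProduct += vecA[i] * vecB[i];
        normA += vecA[i] ** 2;
        normB += vecB[i] ** 2;
    }
    normA = Math.sqrt(normA);
    normB = Math.sqrt(normB);
    if (normA === 0 || normB === 0) return 0; // avoid division by zero
    return dotProduct / (normA * normB);
}

// Parallel vectors score 1, orthogonal vectors score 0.
console.log(cosineSimilarity([1, 0], [2, 0])); // 1
console.log(cosineSimilarity([1, 0], [0, 1])); // 0
```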

@@ -258,36 +295,55 @@ }

if (logging) { console.log(`!! new chunk !! --> 1`) }
for (let i = 1; i < sentences.length; i++) {
    currentChunkSize = tokenizer(currentChunk.join(" ")).input_ids.size;
    sentenceTokenCount = tokenizer(sentences[i]).input_ids.size;

    if (logging) {
        console.log('sentenceTokenCount', sentenceTokenCount);
        console.log('currentChunkSize', currentChunkSize);
        console.log('maxTokenSize', maxTokenSize);
        if (similarities) {
            console.log('similarity', similarities[i - 1]);
            console.log('similarityThreshold', similarityThreshold);
        }
    }

    if (similarities) {
        if (similarities[i - 1] >= similarityThreshold && currentChunkSize + sentenceTokenCount <= maxTokenSize) {
            currentChunk.push(sentences[i]);
            if (logging) { console.log('keep going...') }
        } else {
            chunks.push(currentChunk.join(" "));
            currentChunk = [sentences[i]];
            if (logging) {
                console.log('stop...');
                console.log('\n');
                console.log(`!! new chunk !! --> ${chunks.length + 1}`);
            }
        }
    } else {
        if (currentChunkSize + sentenceTokenCount <= maxTokenSize) {
            currentChunk.push(sentences[i]);
            if (logging) { console.log('keep going...') }
        } else {
            chunks.push(currentChunk.join(" "));
            currentChunk = [sentences[i]];
            if (logging) {
                console.log('stop...');
                console.log('\n');
                console.log(`!! new chunk !! --> ${chunks.length + 1}`);
            }
        }
    }
}

// Add the last chunk if it's not empty
if (currentChunk.length > 0 && currentChunk[0] !== "") {
    chunks.push(currentChunk.join(" "));
}

return chunks;
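
The new `else` branch above (taken when `similarities` is null) is what lets `cramit` reuse this function as a pure token-packing pass. A minimal standalone sketch of that fallback behavior; `countTokens` here is a hypothetical word-count stand-in for the real ONNX tokenizer:

```javascript
// Standalone sketch of the no-similarities packing path above.
// countTokens is a hypothetical stand-in for the real tokenizer.
const countTokens = (text) => text.split(/\s+/).filter(Boolean).length;

function packSentences(sentences, maxTokenSize) {
    const chunks = [];
    let currentChunk = [sentences[0]];
    for (let i = 1; i < sentences.length; i++) {
        const currentChunkSize = countTokens(currentChunk.join(" "));
        const sentenceTokenCount = countTokens(sentences[i]);
        if (currentChunkSize + sentenceTokenCount <= maxTokenSize) {
            currentChunk.push(sentences[i]);    // still fits: keep packing
        } else {
            chunks.push(currentChunk.join(" ")); // full: start a new chunk
            currentChunk = [sentences[i]];
        }
    }
    if (currentChunk.length > 0 && currentChunk[0] !== "") {
        chunks.push(currentChunk.join(" "));
    }
    return chunks;
}

console.log(packSentences(["One two.", "Three four five.", "Six."], 5));
// -> ["One two. Three four five.", "Six."]
```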

@@ -313,3 +369,3 @@ }

const similarity = currentEmbedding ? cosineSimilarity(currentEmbedding, nextEmbedding) : 0;
if (similarity >= combineChunksSimilarityThreshold) {

@@ -316,0 +372,0 @@ currentChunkText += " " + chunk;

package.json

{
    "name": "semantic-chunking",
    "version": "1.2.0",
    "description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).",
    "repository": {
        "type": "git",

README.md

# 🍱 semantic-chunking

Semantically create chunks from large texts.
Useful for workflows involving large language models (LLMs).

@@ -89,2 +90,5 @@ ## Install

Look at the `example.js` file in the root of this project for a more complex example of using all the optional parameters.
## Tuning

@@ -170,2 +174,56 @@

Each of these parameters allows you to customize the `chunkit` function to better fit the text size, content complexity, and performance requirements of your application.
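
As a quick illustration of these options together, here is a hedged `chunkit` sketch; the option names appear in the diff above, while the values are placeholder assumptions rather than recommendations:

```javascript
import { chunkit } from 'semantic-chunking';

async function main() {
    // Illustrative values only; option names come from the chunkit code above.
    const chunks = await chunkit("Your long document text goes here...", {
        logging: false,
        maxTokenSize: 500,
        similarityThreshold: 0.5,           // starting point before dynamic adjustment
        dynamicThresholdLowerBound: 0.4,    // assumed floor for the adjusted threshold
        dynamicThresholdUpperBound: 0.8,    // assumed ceiling for the adjusted threshold
        numSimilaritySentencesLookahead: 3,
        combineChunks: true,
    });
    console.log(chunks);
}

main();
```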
---
## `cramit` - 🧼 The Quick & Dirty

There is an additional function you can import to just "cram" sentences together until they meet your target token size, for when you just need quick, high-density chunks.
Basic usage:

```javascript
import { cramit } from 'semantic-chunking';

let frogText = "A frog hops into a deli and croaks to the cashier, \"I'll have a sandwich, please.\" The cashier, surprised, quickly makes the sandwich and hands it over. The frog takes a big bite, looks around, and then asks, \"Do you have any flies to go with this?\" The cashier, taken aback, replies, \"Sorry, we're all out of flies today.\" The frog shrugs and continues munching on its sandwich, clearly unfazed by the lack of fly toppings. Just another day in the life of a sandwich-loving amphibian! 🐸🥪";

async function main() {
    let myFrogChunks = await cramit(frogText, { maxTokenSize: 65 });
    console.log("myFrogChunks", myFrogChunks);
}

main();
```
Look at the `example2.js` file in the root of this project for a more complex example of using all the optional parameters.
### Tuning

The behavior of the `cramit` function can be fine-tuned using several optional parameters in the options object. Understanding how each parameter affects the function can help you optimize the chunking process for your specific requirements.
#### `logging`
- **Type**: Boolean
- **Default**: `false`
- **Description**: Enables detailed debug output during the chunking process. Turning this on can help in diagnosing how chunks are formed or why certain chunks are combined.
#### `maxTokenSize`
- **Type**: Integer
- **Default**: `500`
- **Description**: Sets the maximum number of tokens allowed in a single chunk. Smaller values result in smaller, more numerous chunks, while larger values can create fewer, larger chunks. It’s crucial for maintaining manageable chunk sizes when processing large texts.
#### `onnxEmbeddingModel`
- **Type**: String
- **Default**: `Xenova/paraphrase-multilingual-MiniLM-L12-v2`
- **Description**: Specifies the model used to generate sentence embeddings. Different models may yield different qualities of embeddings, affecting the chunking quality, especially in multilingual contexts.
- **Resource Link**: [ONNX Embedding Models](https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending)
Link to a filtered list of embedding models converted to ONNX library format by Xenova.
Refer to the Model table below for a list of suggested models and their sizes (choose a multilingual model if you need to chunk text other than English).
#### `onnxEmbeddingModelQuantized`
- **Type**: Boolean
- **Default**: `true`
- **Description**: Indicates whether to use a quantized version of the specified model. Quantized models generally offer faster performance with a slight trade-off in accuracy, which can be beneficial when processing very large datasets.
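
Putting these parameters together, a hedged example of a fully configured `cramit` call; all four option names come from this section, and the values shown are illustrative:

```javascript
import { cramit } from 'semantic-chunking';

async function main() {
    // All four tuning options from this section; values are illustrative.
    const chunks = await cramit("Some long text to pack into dense chunks...", {
        logging: true,
        maxTokenSize: 300,
        onnxEmbeddingModel: 'Xenova/paraphrase-multilingual-MiniLM-L12-v2',
        onnxEmbeddingModelQuantized: true,
    });
    console.log(chunks);
}

main();
```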