semantic-chunking
Comparing version 1.1.0 to 1.2.0
chunkit.js
@@ -60,61 +60,98 @@ // ===========================

    // Load the tokenizer
    tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel);

    // Create the embedding pipeline
    generateEmbedding = await pipeline('feature-extraction', onnxEmbeddingModel, {
        quantized: onnxEmbeddingModelQuantized,
    });

    // Split the text into sentences
    const sentences = sentencize(text);

    // Compute the similarities between sentences
    const { similarities, average, variance } = await computeAdvancedSimilarities(
        sentences,
        {
            numSimilaritySentencesLookahead: numSimilaritySentencesLookahead,
            logging: logging,
        }
    );

    // Dynamically adjust the similarity threshold based on variance and average
    let dynamicThreshold = similarityThreshold;
    if (average != null && variance != null) {
        dynamicThreshold = adjustThreshold(average, variance, similarityThreshold, dynamicThresholdLowerBound, dynamicThresholdUpperBound);
    }

    // Create the initial chunks using the adjusted threshold
    const initialChunks = createChunks(sentences, similarities, maxTokenSize, dynamicThreshold, logging);

    if (logging) {
        console.log('\n=============\ninitialChunks\n=============');
        initialChunks.forEach((chunk, index) => {
            console.log("\n");
            console.log(`--------------`);
            console.log(`-- Chunk ${(index + 1)} --`);
            console.log(`--------------`);
            console.log(chunk);
        });
    }

    // Combine similar chunks and balance sizes
    if (combineChunks) {
        const combinedChunks = await optimizeAndRebalanceChunks(initialChunks, tokenizer, maxTokenSize, combineChunksSimilarityThreshold);
        if (logging) {
            console.log('\n\n=============\ncombinedChunks\n=============');
            combinedChunks.forEach((chunk, index) => {
                console.log("\n\n\n");
                console.log("--------------------");
                console.log("Chunk " + (index + 1));
                console.log("--------------------");
                console.log(chunk);
            });
        }
        // Return the combined chunks
        return combinedChunks;
    } else {
        // Return the initial chunks
        return initialChunks;
    }
}

// --------------------------
// -- Main cramit function --
// --------------------------
export async function cramit(
    text,
    {
        logging = LOGGING,
        maxTokenSize = MAX_TOKEN_SIZE,
        onnxEmbeddingModel = ONNX_EMBEDDING_MODEL,
        onnxEmbeddingModelQuantized = ONNX_EMBEDDING_MODEL_QUANTIZED,
    } = {}) {

    // Load the tokenizer
    tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel);

    // Split the text into sentences
    const sentences = sentencize(text);

    // Create chunks
    const chunks = createChunks(sentences, null, maxTokenSize, 0, logging);

    if (logging) {
        console.log('\n=============\nChunks\n=============');
        chunks.forEach((chunk, index) => {
            console.log("\n");
            console.log(`--------------`);
            console.log(`-- Chunk ${(index + 1)} --`);
            console.log(`--------------`);
            console.log(chunk);
        });
    }

    // Return chunks
    return chunks;
}
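The dynamic-threshold call above relies on an `adjustThreshold` helper whose implementation is outside this hunk. As a rough mental model only (a hypothetical sketch inferred from the call site; the actual implementation in `chunkit.js` may differ), it takes the observed average and variance of sentence similarities and clamps the working threshold between the configured bounds:

```javascript
// HYPOTHETICAL sketch of adjustThreshold -- not code from this diff.
// Inferred from the call site: adjustThreshold(average, variance,
// similarityThreshold, dynamicThresholdLowerBound, dynamicThresholdUpperBound)
function adjustThreshold(average, variance, baseThreshold, lowerBound, upperBound) {
    // When similarities cluster tightly (low variance), let the threshold
    // follow the observed average; otherwise keep the configured baseline.
    const candidate = variance < 0.01 ? average : baseThreshold;
    // Clamp to the configured bounds so the threshold never drifts too far.
    return Math.min(Math.max(candidate, lowerBound), upperBound);
}
```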
@@ -152,4 +189,4 @@

    for (let i = 0; i < embeddings.length - 1; i++) {
        const sim = cosineSimilarity(embeddings[i], embeddings[i + 1]);
        similarities.push(sim);
    }
@@ -166,3 +203,3 @@

    if (logging) console.log('numSimilaritySentencesLookahead', numSimilaritySentencesLookahead);
    const embeddings = await Promise.all(sentences.map(sentence => createEmbedding(sentence)));
@@ -174,3 +211,3 @@ let similarities = [];

    let maxSimilarity = cosineSimilarity(embeddings[i], embeddings[i + 1]);
    for (let j = i + 2; j <= i + numSimilaritySentencesLookahead && j < embeddings.length; j++) {
@@ -229,16 +266,16 @@ const sim = cosineSimilarity(embeddings[i], embeddings[j]);

    let normB = 0.0;
    for (let i = 0; i < vecA.length; i++) {
        dotProduct += vecA[i] * vecB[i];
        normA += vecA[i] ** 2;
        normB += vecB[i] ** 2;
    }
    normA = Math.sqrt(normA);
    normB = Math.sqrt(normB);
    if (normA === 0 || normB === 0) {
        return 0; // To avoid division by zero
    } else {
        return dotProduct / (normA * normB);
    }
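As a quick sanity check on the function above (standard cosine-similarity behavior, not part of the diff): identical directions score 1, orthogonal vectors score 0, and the zero-vector guard returns 0 instead of dividing by zero.

```javascript
// Expected behavior of cosineSimilarity (values follow from the definition):
console.log(cosineSimilarity([1, 0], [1, 0])); // 1 (same direction)
console.log(cosineSimilarity([1, 0], [0, 1])); // 0 (orthogonal)
console.log(cosineSimilarity([1, 0], [1, 1])); // ~0.7071 (1 / sqrt(2))
console.log(cosineSimilarity([0, 0], [1, 1])); // 0 (zero-vector guard)
```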
@@ -258,36 +295,55 @@ }

    if (logging) { console.log(`!! new chunk !! --> 1`) }

    for (let i = 1; i < sentences.length; i++) {
        currentChunkSize = tokenizer(currentChunk.join(" ")).input_ids.size;
        sentenceTokenCount = tokenizer(sentences[i]).input_ids.size;

        if (logging) {
            console.log('sentenceTokenCount', sentenceTokenCount);
            console.log('currentChunkSize', currentChunkSize);
            console.log('maxTokenSize', maxTokenSize);
            if (similarities) {
                console.log('similarity', similarities[i - 1]);
                console.log('similarityThreshold', similarityThreshold);
            }
        }

        if (similarities) {
            if (similarities[i - 1] >= similarityThreshold && currentChunkSize + sentenceTokenCount <= maxTokenSize) {
                currentChunk.push(sentences[i]);
                if (logging) { console.log('keep going...') }
            } else {
                chunks.push(currentChunk.join(" "));
                currentChunk = [sentences[i]];
                if (logging) {
                    console.log('stop...');
                    console.log('\n');
                    console.log(`!! new chunk !! --> ${chunks.length + 1}`);
                }
            }
        } else {
            if (currentChunkSize + sentenceTokenCount <= maxTokenSize) {
                currentChunk.push(sentences[i]);
                if (logging) { console.log('keep going...') }
            } else {
                chunks.push(currentChunk.join(" "));
                currentChunk = [sentences[i]];
                if (logging) {
                    console.log('stop...');
                    console.log('\n');
                    console.log(`!! new chunk !! --> ${chunks.length + 1}`);
                }
            }
        }
    }

    // Add the last chunk if it's not empty
    if (currentChunk.length > 0 && currentChunk[0] !== "") {
        chunks.push(currentChunk.join(" "));
    }

    return chunks;
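The behavioral change worth noting in this hunk: `createChunks` now tolerates a `null` similarities argument, falling back to packing sentences by token budget alone. Both call shapes appear verbatim elsewhere in this diff:

```javascript
// Semantic path (used by chunkit): split on similarity drops AND token budget.
const initialChunks = createChunks(sentences, similarities, maxTokenSize, dynamicThreshold, logging);

// Cram path (used by the new cramit): no similarities, so only the token budget applies.
const chunks = createChunks(sentences, null, maxTokenSize, 0, logging);
```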
@@ -313,3 +369,3 @@ }

    const similarity = currentEmbedding ? cosineSimilarity(currentEmbedding, nextEmbedding) : 0;
    if (similarity >= combineChunksSimilarityThreshold) {
@@ -316,0 +372,0 @@ currentChunkText += " " + chunk;
package.json

{
  "name": "semantic-chunking",
  "version": "1.2.0",
  "description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).",
  "repository": {
@@ -6,0 +6,0 @@ "type": "git",
README.md

# 🍱 semantic-chunking

Semantically create chunks from large texts.
Useful for workflows involving large language models (LLMs).
@@ -89,2 +90,5 @@ ## Install

Look at the `example.js` file in the root of this project for a more complex example of using all the optional parameters.
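For a quicker orientation, here is a minimal sketch of a call that sets several of the options (the option names come from the `chunkit` signature in this release; the values shown are illustrative, not recommended defaults):

```javascript
import { chunkit } from 'semantic-chunking';

async function main() {
    const text = "...a long document...";
    const myChunks = await chunkit(text, {
        logging: true,                      // print debug output while chunking
        maxTokenSize: 500,                  // hard cap on tokens per chunk
        similarityThreshold: 0.5,           // illustrative value; see Tuning below
        numSimilaritySentencesLookahead: 2, // illustrative value
        combineChunks: true,                // merge similar chunks and rebalance sizes
        onnxEmbeddingModel: "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
        onnxEmbeddingModelQuantized: true,
    });
    console.log(myChunks);
}

main();
```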
## Tuning
@@ -170,2 +174,56 @@

Each of these parameters allows you to customize the `chunkit` function to better fit the text size, content complexity, and performance requirements of your application.
---

## `cramit` - 🧼 The Quick & Dirty

There is an additional function you can import that simply "crams" sentences together until they meet your target token size, for when you just need quick, high-density chunks.
Basic usage:

```javascript
import { cramit } from 'semantic-chunking';

let frogText = "A frog hops into a deli and croaks to the cashier, \"I'll have a sandwich, please.\" The cashier, surprised, quickly makes the sandwich and hands it over. The frog takes a big bite, looks around, and then asks, \"Do you have any flies to go with this?\" The cashier, taken aback, replies, \"Sorry, we're all out of flies today.\" The frog shrugs and continues munching on its sandwich, clearly unfazed by the lack of fly toppings. Just another day in the life of a sandwich-loving amphibian! 🐸🥪";

async function main() {
    let myFrogChunks = await cramit(frogText, { maxTokenSize: 65 });
    console.log("myFrogChunks", myFrogChunks);
}

main();
```
Look at the `example2.js` file in the root of this project for a more complex example of using all the optional parameters.

### Tuning

The behavior of the `cramit` function can be finely tuned using several optional parameters in the options object. Understanding how each parameter affects the function can help you optimize the chunking process for your specific requirements. A consolidated example follows the parameter list below.
#### `logging`

- **Type**: Boolean
- **Default**: `false`
- **Description**: Enables detailed debug output during the chunking process. Turning this on can help in diagnosing how chunks are formed or why certain chunks are combined.

#### `maxTokenSize`

- **Type**: Integer
- **Default**: `500`
- **Description**: Sets the maximum number of tokens allowed in a single chunk. Smaller values result in smaller, more numerous chunks, while larger values can create fewer, larger chunks. It’s crucial for maintaining manageable chunk sizes when processing large texts.

#### `onnxEmbeddingModel`

- **Type**: String
- **Default**: `Xenova/paraphrase-multilingual-MiniLM-L12-v2`
- **Description**: Specifies the model used to generate sentence embeddings. Different models may yield different qualities of embeddings, affecting the chunking quality, especially in multilingual contexts.
- **Resource Link**: [ONNX Embedding Models](https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending)
  A filtered list of embedding models converted to ONNX format by Xenova.
  Refer to the model table below for a list of suggested models and their sizes (choose a multilingual model if you need to chunk text other than English).

#### `onnxEmbeddingModelQuantized`

- **Type**: Boolean
- **Default**: `true`
- **Description**: Indicates whether to use a quantized version of the specified model. Quantized models generally offer faster performance with a slight trade-off in accuracy, which can be beneficial when processing very large datasets.
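Putting the four parameters above together (a minimal sketch; the model name is the default noted above, and the other values are illustrative):

```javascript
import { cramit } from 'semantic-chunking';

async function main() {
    const chunks = await cramit("...your long text here...", {
        logging: true,      // watch how sentences get packed
        maxTokenSize: 300,  // illustrative target chunk size
        onnxEmbeddingModel: "Xenova/paraphrase-multilingual-MiniLM-L12-v2",
        onnxEmbeddingModelQuantized: true,
    });
    console.log(chunks);
}

main();
```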
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.