semantic-chunking
Advanced tools
Comparing version 0.1.0 to 1.0.0
@@ -9,2 +9,6 @@ import { env, pipeline, AutoTokenizer } from '@xenova/transformers'; | ||
// tokenizer and generateEmbedding global variables | ||
let tokenizer; | ||
let generateEmbedding; | ||
// default parameters | ||
@@ -16,7 +20,4 @@ const LOGGING = false; | ||
const ONNX_EMBEDDING_MODEL_QUANTIZED = true; | ||
const COMBINE_SIMILARITY_CHUNKS = true; | ||
// tokenizer and generateEmbedding global variables | ||
let tokenizer; | ||
let generateEmbedding; | ||
// --------------------------- | ||
@@ -26,9 +27,12 @@ // -- Main chunkit function -- | ||
export async function chunkit( | ||
text = "", | ||
logging = LOGGING, | ||
maxTokenSize = MAX_TOKEN_SIZE, | ||
similarityThreshold = SIMILARITY_THRESHOLD, | ||
onnxEmbeddingModel = ONNX_EMBEDDING_MODEL, | ||
onnxEmbeddingModelQuantized = ONNX_EMBEDDING_MODEL_QUANTIZED | ||
) { | ||
text, | ||
{ | ||
logging = LOGGING, | ||
maxTokenSize = MAX_TOKEN_SIZE, | ||
similarityThreshold = SIMILARITY_THRESHOLD, | ||
onnxEmbeddingModel = ONNX_EMBEDDING_MODEL, | ||
onnxEmbeddingModelQuantized = ONNX_EMBEDDING_MODEL_QUANTIZED, | ||
combineSimilarityChunks = COMBINE_SIMILARITY_CHUNKS | ||
} = {}) { | ||
// Load the tokenizer | ||
@@ -62,16 +66,21 @@ tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel); | ||
// Combine initial chunks into larger ones without exceeding maxTokenSize | ||
const combinedChunks = combineChunks(initialChunks, maxTokenSize, tokenizer, logging); | ||
if (logging) { | ||
console.log('\n\n=============\ncombinedChunks\n============='); | ||
combinedChunks.forEach((chunk, index) => { | ||
console.log("\n\n\n"); | ||
console.log("--------------------"); | ||
console.log("Chunk " + (index + 1)); | ||
console.log("--------------------"); | ||
console.log(chunk); | ||
}); | ||
if (combineSimilarityChunks) { | ||
const combinedChunks = combineChunks(initialChunks, maxTokenSize, tokenizer, logging); | ||
if (logging) { | ||
console.log('\n\n=============\ncombinedChunks\n============='); | ||
combinedChunks.forEach((chunk, index) => { | ||
console.log("\n\n\n"); | ||
console.log("--------------------"); | ||
console.log("Chunk " + (index + 1)); | ||
console.log("--------------------"); | ||
console.log(chunk); | ||
}); | ||
} | ||
// Return the combined chunks | ||
return combinedChunks; | ||
} else { | ||
// Return the initial chunks | ||
return initialChunks; | ||
} | ||
// Return the combined chunks | ||
return combinedChunks; | ||
} | ||
@@ -84,2 +93,8 @@ | ||
export async function test() { | ||
console.log('\n\n'); | ||
console.log('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'); | ||
console.log('!!! Running test function... !!!'); | ||
console.log('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'); | ||
console.log('\n\n'); | ||
const text = await fs.promises.readFile('./example.txt', 'utf8'); | ||
@@ -89,3 +104,3 @@ | ||
try { | ||
await chunkit(text, true); | ||
await chunkit(text, { logging: true, similarityThreshold: .7 }); | ||
} catch (error) { | ||
@@ -95,6 +110,6 @@ console.error(error); | ||
} | ||
// await test() | ||
// ********************** | ||
@@ -101,0 +116,0 @@ // ** HELPER FUNCTIONS ** |
{ | ||
"name": "semantic-chunking", | ||
"version": "0.1.0", | ||
"version": "1.0.0", | ||
"description": "semantically create chunks from large text (useful for passing to LLM workflows)", | ||
@@ -19,2 +19,6 @@ "repository": { | ||
"license": "ISC", | ||
"scripts": { | ||
"clean-models": "find ./models -type f ! -name '*.url' -delete", | ||
"clean-models-win": "powershell -Command \"Get-ChildItem -Path ./models -Recurse | Where-Object { !$_.PSIsContainer -and $_.Extension -ne '.url' } | Remove-Item\"" | ||
}, | ||
"dependencies": { | ||
@@ -21,0 +25,0 @@ "@xenova/transformers": "^2.15.1", |
@@ -1,2 +0,2 @@ | ||
# semantic-chunking | ||
# 🍱 semantic-chunking | ||
semantically create chunks from large text (useful for passing to LLM workflows) | ||
@@ -11,3 +11,4 @@ | ||
### Usage: | ||
### Usage | ||
``` | ||
@@ -19,1 +20,71 @@ import { chunkit } from 'semantic-chunking'; | ||
``` | ||
### All Parameters | ||
``` | ||
chunkit( | ||
text, | ||
{ // options object | ||
logging, | ||
maxTokenSize, | ||
similarityThreshold, | ||
onnxEmbeddingModel, | ||
onnxEmbeddingModelQuantized, | ||
combineSimilarityChunks | ||
} | ||
) | ||
``` | ||
- `text` | ||
full string to split into chunks | ||
- options object [optional] | ||
- `logging` [optional | boolean | default `false` ] | ||
- `maxTokenSize` [optional | int | default `500`] | ||
max possible token size of each chunk | ||
- `similarityThreshold` [optional | float | default `.567`] | ||
threshold value used to determine if paired sentences are semantically close enough to be included in the same chunk | ||
- `onnxEmbeddingModel` [optional | string | default `Xenova/paraphrase-multilingual-MiniLM-L12-v2`] | ||
ONNX model to use for creating embeddings for similarity comparison (model name on huggingface) | ||
- `onnxEmbeddingModelQuantized` [optional | boolean | default `true`] | ||
if the quantized version of the model should be used | ||
- `combineSimilarityChunks` [optional | boolean | default `true`] | ||
if true, the initial round of smaller semantic similar chunks are combined to make larger chunks up to the defined max token limit | ||
--- | ||
### Workflow | ||
- `text` is split into an array of `sentences` | ||
- a `vector` is created for each `sentence` | ||
- a `cosine similarity` score is created for each `sentence pair` | ||
- each `sentence` is added to a chunk until the `similarity threshold` or `max token size` for the `chunk` is exceeded | ||
- after all `similarity chunks` are created, combine `similarity chunks` into `larger chunks` up to the `max token size`, unless `combineSimilarityChunks` was set to false | ||
--- | ||
### Example Calls | ||
``` | ||
import { chunkit } from 'semantic-chunking'; | ||
const text = await fs.promises.readFile('./example.txt', 'utf8'); | ||
let myChunks = await chunkit(text, { logging: true, similarityThreshold: .9 }); | ||
myChunks.forEach((chunk, index) => { | ||
console.log("--------------------"); | ||
console.log("Chunk " + (index + 1)); | ||
console.log("--------------------"); | ||
console.log(chunk); | ||
console.log("\n\n"); | ||
}); | ||
``` | ||
``` | ||
import { chunkit } from 'semantic-chunking'; | ||
let frogText = "A frog hops into a deli and croaks to the cashier, \"I'll have a sandwich, please.\" The cashier, surprised, quickly makes the sandwich and hands it over. The frog takes a big bite, looks around, and then asks, \"Do you have any flies to go with this?\" The cashier, taken aback, replies, \"Sorry, we're all out of flies today.\" The frog shrugs and continues munching on its sandwich, clearly unfazed by the lack of fly toppings. Just another day in the life of a sandwich-loving amphibian! 🐸🥪"; | ||
let myFrogChunks = await chunkit(frogText, { maxTokenSize: 65 }); | ||
console.log("myFrogChunks", myFrogChunks); | ||
``` |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No v1
Quality: Package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
17904
222
1
89