semantic-chunking
Advanced tools
Comparing version 0.1.0 to 1.0.0
@@ -9,2 +9,6 @@ import { env, pipeline, AutoTokenizer } from '@xenova/transformers'; | ||
// tokenizer and generateEmbedding global variables | ||
let tokenizer; | ||
let generateEmbedding; | ||
// default parameters | ||
@@ -16,7 +20,4 @@ const LOGGING = false; | ||
const ONNX_EMBEDDING_MODEL_QUANTIZED = true; | ||
const COMBINE_SIMILARITY_CHUNKS = true; | ||
// tokenizer and generateEmbedding global variables | ||
let tokenizer; | ||
let generateEmbedding; | ||
// --------------------------- | ||
@@ -26,9 +27,12 @@ // -- Main chunkit function -- | ||
export async function chunkit( | ||
text = "", | ||
logging = LOGGING, | ||
maxTokenSize = MAX_TOKEN_SIZE, | ||
similarityThreshold = SIMILARITY_THRESHOLD, | ||
onnxEmbeddingModel = ONNX_EMBEDDING_MODEL, | ||
onnxEmbeddingModelQuantized = ONNX_EMBEDDING_MODEL_QUANTIZED | ||
) { | ||
text, | ||
{ | ||
logging = LOGGING, | ||
maxTokenSize = MAX_TOKEN_SIZE, | ||
similarityThreshold = SIMILARITY_THRESHOLD, | ||
onnxEmbeddingModel = ONNX_EMBEDDING_MODEL, | ||
onnxEmbeddingModelQuantized = ONNX_EMBEDDING_MODEL_QUANTIZED, | ||
combineSimilarityChunks = COMBINE_SIMILARITY_CHUNKS | ||
} = {}) { | ||
// Load the tokenizer | ||
@@ -62,16 +66,21 @@ tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel); | ||
// Combine initial chunks into larger ones without exceeding maxTokenSize | ||
const combinedChunks = combineChunks(initialChunks, maxTokenSize, tokenizer, logging); | ||
if (logging) { | ||
console.log('\n\n=============\ncombinedChunks\n============='); | ||
combinedChunks.forEach((chunk, index) => { | ||
console.log("\n\n\n"); | ||
console.log("--------------------"); | ||
console.log("Chunk " + (index + 1)); | ||
console.log("--------------------"); | ||
console.log(chunk); | ||
}); | ||
if (combineSimilarityChunks) { | ||
const combinedChunks = combineChunks(initialChunks, maxTokenSize, tokenizer, logging); | ||
if (logging) { | ||
console.log('\n\n=============\ncombinedChunks\n============='); | ||
combinedChunks.forEach((chunk, index) => { | ||
console.log("\n\n\n"); | ||
console.log("--------------------"); | ||
console.log("Chunk " + (index + 1)); | ||
console.log("--------------------"); | ||
console.log(chunk); | ||
}); | ||
} | ||
// Return the combined chunks | ||
return combinedChunks; | ||
} else { | ||
// Return the initial chunks | ||
return initialChunks; | ||
} | ||
// Return the combined chunks | ||
return combinedChunks; | ||
} | ||
@@ -84,2 +93,8 @@ | ||
export async function test() { | ||
console.log('\n\n'); | ||
console.log('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'); | ||
console.log('!!! Running test function... !!!'); | ||
console.log('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'); | ||
console.log('\n\n'); | ||
const text = await fs.promises.readFile('./example.txt', 'utf8'); | ||
@@ -89,3 +104,3 @@ | ||
try { | ||
await chunkit(text, true); | ||
await chunkit(text, { logging: true, similarityThreshold: .7 }); | ||
} catch (error) { | ||
@@ -95,6 +110,6 @@ console.error(error); | ||
} | ||
// await test() | ||
// ********************** | ||
@@ -101,0 +116,0 @@ // ** HELPER FUNCTIONS ** |
{ | ||
"name": "semantic-chunking", | ||
"version": "0.1.0", | ||
"version": "1.0.0", | ||
"description": "semantically create chunks from large text (useful for passing to LLM workflows)", | ||
@@ -19,2 +19,6 @@ "repository": { | ||
"license": "ISC", | ||
"scripts": { | ||
"clean-models": "find ./models -type f ! -name '*.url' -delete", | ||
"clean-models-win": "powershell -Command \"Get-ChildItem -Path ./models -Recurse | Where-Object { !$_.PSIsContainer -and $_.Extension -ne '.url' } | Remove-Item\"" | ||
}, | ||
"dependencies": { | ||
@@ -21,0 +25,0 @@ "@xenova/transformers": "^2.15.1", |
@@ -1,2 +0,2 @@ | ||
# semantic-chunking | ||
# 🍱 semantic-chunking | ||
semantically create chunks from large text (useful for passing to LLM workflows) | ||
@@ -11,3 +11,4 @@ | ||
### Usage: | ||
### Usage | ||
``` | ||
@@ -19,1 +20,71 @@ import { chunkit } from 'semantic-chunking'; | ||
``` | ||
### All Parameters | ||
``` | ||
chunkit( | ||
text, | ||
{ // options object | ||
logging, | ||
maxTokenSize, | ||
similarityThreshold, | ||
onnxEmbeddingModel, | ||
onnxEmbeddingModelQuantized, | ||
combineSimilarityChunks | ||
} | ||
) | ||
``` | ||
- `text` | ||
full string to split into chunks | ||
- options object [optional] | ||
- `logging` [optional | boolean | default `false` ] | ||
- `maxTokenSize` [optional | int | default `500`] | ||
max possible token size of each chunk | ||
- `similarityThreshold` [optional | float | default `.567`] | ||
threshold value used to determine if paired sentences are semantically close enough to be included in the same chunk | ||
- `onnxEmbeddingModel` [optional | string | default `Xenova/paraphrase-multilingual-MiniLM-L12-v2`] | ||
ONNX model to use for creating embeddings for similarity comparison (model name on huggingface) | ||
- `onnxEmbeddingModelQuantized` [optional | boolean | default `true`] | ||
if the quantized version of the model should be used | ||
- `combineSimilarityChunks` [optional | boolean | default `true`] | ||
if true, the initial round of smaller semantic similar chunks are combined to make larger chunks up to the defined max token limit | ||
--- | ||
### Workflow | ||
- `text` is split into an array of `sentences` | ||
- a `vector` is created for each `sentence` | ||
- a `cosine similarity` score is created for each `sentence pair` | ||
- each `sentence` is added to a chunk until the `similarity threshold` or `max token size` for the `chunk` is exceeded | ||
- after all `similarity chunks` are created, combine `similarity chunks` into `larger chunks` up to the `max token size`, unless `combineSimilarityChunks` was set to false | ||
--- | ||
### Example Calls | ||
``` | ||
import { chunkit } from 'semantic-chunking'; | ||
const text = await fs.promises.readFile('./example.txt', 'utf8'); | ||
let myChunks = await chunkit(text, { logging: true, similarityThreshold: .9 }); | ||
myChunks.forEach((chunk, index) => { | ||
console.log("--------------------"); | ||
console.log("Chunk " + (index + 1)); | ||
console.log("--------------------"); | ||
console.log(chunk); | ||
console.log("\n\n"); | ||
}); | ||
``` | ||
``` | ||
import { chunkit } from 'semantic-chunking'; | ||
let frogText = "A frog hops into a deli and croaks to the cashier, \"I'll have a sandwich, please.\" The cashier, surprised, quickly makes the sandwich and hands it over. The frog takes a big bite, looks around, and then asks, \"Do you have any flies to go with this?\" The cashier, taken aback, replies, \"Sorry, we're all out of flies today.\" The frog shrugs and continues munching on its sandwich, clearly unfazed by the lack of fly toppings. Just another day in the life of a sandwich-loving amphibian! 🐸🥪"; | ||
let myFrogChunks = await chunkit(frogText, { maxTokenSize: 65 }); | ||
console.log("myFrogChunks", myFrogChunks); | ||
``` |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No v1
Quality: Package is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
17904
222
1
89