semantic-chunking - npm Package Compare versions

Comparing version 2.2.4 to 2.3.0

.aider.chat.history.md

CHANGELOG.md

@@ -5,9 +5,19 @@ # Changelog

## [2.3.0] - 2024-11-11
### Updated
- Updated `transformers.js` from v2 to v3
- Migrated quantization option from `onnxEmbeddingModelQuantized` (boolean) to `dtype` ('fp32', 'fp16', 'q8', 'q4')
- Updated Web UI to use new `dtype` option
## [2.2.5] - 2024-11-08
### Updated
- Updated Web UI styles for smaller screens
## [2.2.4] - 2024-11-08
### Fixed
- Fixed issue with Web UI embedding cache not being cleared when a new model is initialized.
- Fixed issue with Web UI embedding cache not being cleared when a new model is initialized
## [2.2.3] - 2024-11-07
### Added
- Web UI adjustments for display of truncated JSON results on screen but still allowing download of full results.
- Web UI adjustments for display of truncated JSON results on screen but still allowing download of full results

@@ -14,0 +24,0 @@ ## [2.2.2] - 2024-11-07
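
The headline change in 2.3.0 is the migration from the boolean `onnxEmbeddingModelQuantized` option to the string `dtype` option. A minimal migration sketch (the document text and option values here are illustrative):

```javascript
// Minimal migration sketch for the 2.3.0 dtype change.
import { chunkit } from 'semantic-chunking';

const documents = [{ document_name: 'sample', document_text: 'Some text to chunk.' }];

// Before (<= 2.2.x): quantization was a boolean toggle.
// const chunks = await chunkit(documents, { onnxEmbeddingModelQuantized: true });

// After (>= 2.3.0): choose an explicit precision level instead.
const chunks = await chunkit(documents, { dtype: 'q8' });
```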

chunkit.js

@@ -11,3 +11,2 @@ // ===========================

import { env } from '@xenova/transformers';
import { splitBySentence } from "string-segmenter"

@@ -34,3 +33,4 @@ import { DEFAULT_CONFIG } from './config.js';

onnxEmbeddingModel = DEFAULT_CONFIG.ONNX_EMBEDDING_MODEL,
onnxEmbeddingModelQuantized = DEFAULT_CONFIG.ONNX_EMBEDDING_MODEL_QUANTIZED,
dtype = DEFAULT_CONFIG.DTYPE,
onnxEmbeddingModelQuantized,
localModelPath = DEFAULT_CONFIG.LOCAL_MODEL_PATH,

@@ -49,8 +49,12 @@ modelCacheDir = DEFAULT_CONFIG.MODEL_CACHE_DIR,

// Set env variables if provided
if (localModelPath) env.localModelPath = localModelPath;
if (modelCacheDir) env.cacheDir = modelCacheDir;
// if legacy boolean is used (onnxEmbeddingModelQuantized), set dtype (model precision) to 'q8'
if (onnxEmbeddingModelQuantized === true) { dtype = 'q8'; }
// Initialize embedding utilities
const { modelName, isQuantized } = await initializeEmbeddingUtils(onnxEmbeddingModel, onnxEmbeddingModelQuantized);
// Initialize embedding utilities and set optional paths
const { modelName, dtype: usedDtype } = await initializeEmbeddingUtils(
onnxEmbeddingModel,
dtype,
localModelPath,
modelCacheDir
);
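
For backward compatibility, the legacy flag is still honored: when `onnxEmbeddingModelQuantized` is `true`, chunkit.js maps it to `dtype: 'q8'` before initializing, so existing callers keep working. A hedged illustration:

```javascript
// Both calls resolve to q8 precision in 2.3.0:
await chunkit(documents, { onnxEmbeddingModelQuantized: true }); // legacy flag, mapped to 'q8' internally
await chunkit(documents, { dtype: 'q8' });                       // preferred form
```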

@@ -102,3 +106,3 @@ // Process each document

console.log(`--------------`);
console.log(chunk);
console.log(chunk.substring(0, 50) + '...');
});

@@ -119,3 +123,3 @@ }

console.log("--------------------");
console.log(chunk);
console.log(chunk.substring(0, 50) + '...');
});

@@ -139,3 +143,3 @@ }

model_name: modelName,
is_model_quantized: isQuantized,
dtype: usedDtype,
text: prefixedChunk

@@ -186,3 +190,4 @@ };

onnxEmbeddingModel = DEFAULT_CONFIG.ONNX_EMBEDDING_MODEL,
onnxEmbeddingModelQuantized = DEFAULT_CONFIG.ONNX_EMBEDDING_MODEL_QUANTIZED,
onnxEmbeddingModelQuantized,
dtype = DEFAULT_CONFIG.DTYPE,
localModelPath = DEFAULT_CONFIG.LOCAL_MODEL_PATH,

@@ -201,8 +206,12 @@ modelCacheDir = DEFAULT_CONFIG.MODEL_CACHE_DIR,

// Set env variables if provided
if (localModelPath) env.localModelPath = localModelPath;
if (modelCacheDir) env.cacheDir = modelCacheDir;
// if legacy boolean is used (onnxEmbeddingModelQuantized), set dtype (model precision) to 'q8'
if (onnxEmbeddingModelQuantized === true) { dtype = 'q8'; }
// Initialize embedding utilities
const { modelName, isQuantized } = await initializeEmbeddingUtils(onnxEmbeddingModel, onnxEmbeddingModelQuantized);
// Initialize embedding utilities with paths
const { modelName, isQuantized } = await initializeEmbeddingUtils(
onnxEmbeddingModel,
onnxEmbeddingModelQuantized,
localModelPath,
modelCacheDir
);

@@ -232,3 +241,3 @@ // Process each document

console.log(`--------------`);
console.log(chunk);
console.log(chunk.substring(0, 50) + '...');
});

@@ -235,0 +244,0 @@ }

@@ -11,3 +11,3 @@ export const DEFAULT_CONFIG = {

ONNX_EMBEDDING_MODEL: "Xenova/all-MiniLM-L6-v2",
ONNX_EMBEDDING_MODEL_QUANTIZED: true,
DTYPE: 'fp32',
LOCAL_MODEL_PATH: null,

@@ -14,0 +14,0 @@ MODEL_CACHE_DIR: null,

@@ -1,2 +0,2 @@

import { env, pipeline, AutoTokenizer } from '@xenova/transformers';
import { env, pipeline, AutoTokenizer } from '@huggingface/transformers';

@@ -10,10 +10,18 @@ let tokenizer;

// --------------------------------------------
export async function initializeEmbeddingUtils(onnxEmbeddingModel, onnxEmbeddingModelQuantized) {
export async function initializeEmbeddingUtils(
onnxEmbeddingModel,
dtype = 'fp32',
localModelPath = null,
modelCacheDir = null
) {
// Configure environment
env.allowRemoteModels = true;
if (localModelPath) env.localModelPath = localModelPath;
if (modelCacheDir) env.cacheDir = modelCacheDir;
tokenizer = await AutoTokenizer.from_pretrained(onnxEmbeddingModel);
generateEmbedding = await pipeline('feature-extraction', onnxEmbeddingModel, {
quantized: onnxEmbeddingModelQuantized,
dtype: dtype,
});
// Clear the embedding cache when initializing with a new model
embeddingCache.clear();

@@ -23,3 +31,3 @@

modelName: onnxEmbeddingModel,
isQuantized: onnxEmbeddingModelQuantized
dtype: dtype
};

@@ -26,0 +34,0 @@ }
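
The initializer now takes the precision and optional paths directly and reports back the `dtype` it used. A sketch of calling the exported function on its own (it is normally invoked internally by `chunkit`/`cramit`; the argument values here are examples, not requirements):

```javascript
import { initializeEmbeddingUtils } from './embeddingUtils.js';

const { modelName, dtype } = await initializeEmbeddingUtils(
    'Xenova/all-MiniLM-L6-v2', // onnxEmbeddingModel
    'fp16',                    // dtype: 'fp32' | 'fp16' | 'q8' | 'q4'
    null,                      // localModelPath (optional)
    null                       // modelCacheDir (optional)
);
console.log(modelName, dtype); // 'Xenova/all-MiniLM-L6-v2' 'fp16'
```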

@@ -16,3 +16,3 @@ // ------------------------

let documents = [];
let textFiles = ['./different.txt', './similar.txt'];
let textFiles = ['./example.txt', './different.txt', './similar.txt'];

@@ -42,3 +42,3 @@ // read each text file and add it to the documents array

onnxEmbeddingModel: "nomic-ai/nomic-embed-text-v1.5",
onnxEmbeddingModelQuantized: true,
dtype: "q8",
localModelPath: "../models",

@@ -60,5 +60,5 @@ modelCacheDir: "../models",

console.log("\n\n\n");
console.log("myTestChunks:");
console.log(myTestChunks);
// console.log("myTestChunks:");
// console.log(myTestChunks);
console.log("length: " + myTestChunks.length);
console.log("trackedTimeSeconds: " + trackedTimeSeconds);
{
"name": "semantic-chunking",
"version": "2.2.4",
"version": "2.3.0",
"description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).",

@@ -39,3 +39,3 @@ "repository": {

"dependencies": {
"@xenova/transformers": "^2.17.2",
"@huggingface/transformers": "^3.0.2",
"cli-progress": "^3.12.0",

@@ -42,0 +42,0 @@ "fs": "^0.0.1-security",

@@ -74,3 +74,3 @@ # 🍱 semantic-chunking

- `onnxEmbeddingModel`: String (optional, default `Xenova/all-MiniLM-L6-v2`) - ONNX model used for creating embeddings.
- `onnxEmbeddingModelQuantized`: Boolean (optional, default `true`) - Indicates whether to use a quantized version of the embedding model.
- `dtype`: String (optional, default `fp32`) - Precision of the embedding model (options: `fp32`, `fp16`, `q8`, `q4`).
- `localModelPath`: String (optional, default `null`) - Local path to save and load models (example: `./models`).

@@ -92,3 +92,3 @@ - `modelCacheDir`: String (optional, default `null`) - Directory to cache downloaded models (example: `./models`).

- `model_name`: String - The name of the embedding model used.
- `is_model_quantized`: Boolean - Indicates whether the embedding model is quantized.
- `dtype`: String - The precision of the embedding model used (options: `fp32`, `fp16`, `q8`, `q4`).
- `text`: String - The chunked text.
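
Putting the updated option and output fields together, a hedged end-to-end sketch (field names taken from the README above; the sample text is illustrative):

```javascript
import { chunkit } from 'semantic-chunking';

const documents = [{ document_name: 'doc', document_text: 'Some long text to split.' }];

const chunks = await chunkit(documents, {
    dtype: 'fp32',          // 'fp32' | 'fp16' | 'q8' | 'q4'
    returnEmbedding: true,  // include embedding vectors in the output
});

// Each chunk records the model and precision that were actually used.
console.log(chunks[0].model_name); // e.g. 'Xenova/all-MiniLM-L6-v2'
console.log(chunks[0].dtype);      // 'fp32'
```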

@@ -203,3 +203,3 @@ - `embedding`: Array - The embedding vector (if `returnEmbedding` is `true`).

- **Type**: String
- **Default**: `Xenova/paraphrase-multilingual-MiniLM-L12-v2`
- **Default**: `Xenova/all-MiniLM-L6-v2`
- **Description**: Specifies the model used to generate sentence embeddings. Different models may yield different qualities of embeddings, affecting the chunking quality, especially in multilingual contexts.

@@ -219,14 +219,12 @@ - **Resource Link**: [ONNX Embedding Models](https://huggingface.co/models?pipeline_tag=feature-extraction&library=onnx&sort=trending)

| Model | Quantized | Link | Size |
| -------------------------------------------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------- |
| nomic-ai/nomic-embed-text-v1.5 | true | [https://huggingface.co/nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | 138 MB |
| nomic-ai/nomic-embed-text-v1.5 | false | [https://huggingface.co/nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | 548 MB |
| Xenova/all-MiniLM-L6-v2 | true | [https://huggingface.co/Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) | 23 MB |
| Xenova/all-MiniLM-L6-v2 | false | [https://huggingface.co/Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) | 90.4 MB |
| Xenova/paraphrase-multilingual-MiniLM-L12-v2 | true | [https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2) | 118 MB |
| thenlper/gte-base | false | [https://huggingface.co/thenlper/gte-base](https://huggingface.co/thenlper/gte-base) | 436 MB |
| Xenova/all-distilroberta-v1 | true | [https://huggingface.co/Xenova/all-distilroberta-v1](https://huggingface.co/Xenova/all-distilroberta-v1) | 82.1 MB |
| Xenova/all-distilroberta-v1 | false | [https://huggingface.co/Xenova/all-distilroberta-v1](https://huggingface.co/Xenova/all-distilroberta-v1) | 326 MB |
| BAAI/bge-base-en-v1.5 | false | [https://huggingface.co/BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 436 MB |
| BAAI/bge-small-en-v1.5 | false | [https://huggingface.co/BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | 133 MB |
| Model | Precision | Link | Size |
| -------------------------------------------- | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------- |
| nomic-ai/nomic-embed-text-v1.5 | fp32, q8 | [https://huggingface.co/nomic-ai/nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | 548 MB, 138 MB |
| thenlper/gte-base | fp32 | [https://huggingface.co/thenlper/gte-base](https://huggingface.co/thenlper/gte-base) | 436 MB |
| Xenova/all-MiniLM-L6-v2 | fp32, fp16, q8 | [https://huggingface.co/Xenova/all-MiniLM-L6-v2](https://huggingface.co/Xenova/all-MiniLM-L6-v2) | 23 MB, 45 MB, 90 MB |
| Xenova/paraphrase-multilingual-MiniLM-L12-v2 | fp32, fp16, q8 | [https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/Xenova/paraphrase-multilingual-MiniLM-L12-v2) | 470 MB, 235 MB, 118 MB |
| Xenova/all-distilroberta-v1 | fp32, fp16, q8 | [https://huggingface.co/Xenova/all-distilroberta-v1](https://huggingface.co/Xenova/all-distilroberta-v1) | 326 MB, 163 MB, 82 MB |
| BAAI/bge-base-en-v1.5 | fp32 | [https://huggingface.co/BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | 436 MB |
| BAAI/bge-small-en-v1.5 | fp32 | [https://huggingface.co/BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | 133 MB |
| yashvardhan7/snowflake-arctic-embed-m-onnx | fp32 | [https://huggingface.co/yashvardhan7/snowflake-arctic-embed-m-onnx](https://huggingface.co/yashvardhan7/snowflake-arctic-embed-m-onnx) | 436 MB |
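
For instance, the updated example.js in this release picks the q8 variant of the nomic model from the table above:

```javascript
// From the updated example.js: q8 precision (the 138 MB variant per the table).
const chunks = await chunkit(documents, {
    onnxEmbeddingModel: 'nomic-ai/nomic-embed-text-v1.5',
    dtype: 'q8',
    localModelPath: '../models',
});
```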

@@ -337,3 +335,3 @@ Each of these parameters allows you to customize the `chunkit` function to better fit the text size, content complexity, and performance requirements of your application.

];
const myDocumentChunks = await chunkit(documents, { chunkPrefix: "search_document" });
const myDocumentChunks = await chunkit(documents, { chunkPrefix: "search_document", returnEmbedding: true });
```

@@ -346,3 +344,3 @@

];
const mySearchQueryChunk = await chunkit(documents, { chunkPrefix: "search_query" });
const mySearchQueryChunk = await cramit(documents, { chunkPrefix: "search_query", returnEmbedding: true });
```

@@ -349,0 +347,0 @@

@@ -0,0 +0,0 @@ import { createEmbedding } from './embeddingUtils.js';

{
"name": "semantic-chunking-webui",
"version": "1.1.4",
"version": "1.3.0",
"lockfileVersion": 3,

@@ -9,8 +9,8 @@ "requires": true,

"name": "semantic-chunking-webui",
"version": "1.1.4",
"version": "1.3.0",
"license": "ISC",
"dependencies": {
"cors": "^2.8.5",
"express": "^4.18.3",
"highlight.js": "^11.10.0"
"dotenv": "^16.4.5",
"express": "^4.18.3"
}

@@ -169,2 +169,13 @@ },

},
"node_modules/dotenv": {
"version": "16.4.5",
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://dotenvx.com"
}
},
"node_modules/ee-first": {

@@ -370,10 +381,2 @@ "version": "1.1.1",

},
"node_modules/highlight.js": {
"version": "11.10.0",
"resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.10.0.tgz",
"integrity": "sha512-SYVnVFswQER+zu1laSya563s+F8VDGt7o35d4utbamowvUNLLMovFqwCLSocpZTz3MgaSRA1IbqRWZv97dtErQ==",
"engines": {
"node": ">=12.0.0"
}
},
"node_modules/http-errors": {

@@ -494,5 +497,5 @@ "version": "2.0.0",

"node_modules/object-inspect": {
"version": "1.13.2",
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.2.tgz",
"integrity": "sha512-IRZSRuzJiynemAXPYtPe5BoI/RESNYR7TYm50MC5Mqbd3Jmw5y790sErYw3V6SryFJD64b74qQQs9wn5Bg/k3g==",
"version": "1.13.3",
"resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.3.tgz",
"integrity": "sha512-kDCGIbxkDSXE3euJZZXzc6to7fCrKHNI/hSRQnRuQ+BWjFNzZwiFF8fj/6o2t2G9/jTj8PSIYTfCLelLZEeRpA==",
"engines": {

@@ -499,0 +502,0 @@ "node": ">= 0.4"

{
"name": "semantic-chunking-webui",
"version": "1.1.4",
"version": "1.3.0",
"type": "module",

@@ -16,5 +16,5 @@ "description": "Web UI for semantic-chunking library",

"cors": "^2.8.5",
"express": "^4.18.3",
"highlight.js": "^11.10.0"
"dotenv": "^16.4.5",
"express": "^4.18.3"
}
}

@@ -136,2 +136,12 @@ // Load sample text on page load

// Add this function near the top of the file
function scrollToResults() {
if (window.innerWidth <= 800) {
const resultsWrapper = document.querySelector('.results-wrapper');
if (resultsWrapper) {
resultsWrapper.scrollIntoView({ behavior: 'smooth' });
}
}
}
// Process form handler

@@ -151,2 +161,5 @@ form.addEventListener('submit', async (e) => {

// Scroll to results as soon as we show the spinner
scrollToResults();
// Get form data and convert checkbox values to boolean

@@ -260,2 +273,5 @@ const formData = new FormData(form);

// After results are displayed, scroll to them on mobile
scrollToResults();
} catch (error) {

@@ -266,4 +282,4 @@ console.error('Error:', error);

if (errorMessage.includes('Could not locate file:')) {
errorMessage += '<br><br>Some models may not have both a quantized & non-quantized version,';
errorMessage += '<br>please toggle this option and try again, or choose a different model';
errorMessage += '<br><br>Not all models have all precision options available.';
errorMessage += '<br>Please try a different precision level and/or model and try again.';
}

@@ -326,3 +342,3 @@

formData[element.name] = element.checked;
} else if (element.name) { // Only process elements with names
} else if (element.name) {
formData[element.name] = element.value;

@@ -332,7 +348,5 @@ }

// No need for additional processing in generateCode since we're already
// getting the actual boolean values here
codeExample.textContent = generateCode(formData);
modal.style.display = "block";
// Clear the highlighted state before highlighting again
document.body.style.overflow = 'hidden'; // Prevent body scrolling
delete codeExample.dataset.highlighted;

@@ -344,3 +358,5 @@ hljs.highlightElement(codeExample);

function generateCode(formData) {
// No need to convert checkbox values since they're already booleans
const dtypeValues = ['fp32', 'fp16', 'q8', 'q4'];
const dtype = dtypeValues[parseInt(formData.dtype)];
return `// import the semantic-chunking library

@@ -370,3 +386,3 @@ import { chunkit } from 'semantic-chunking';

onnxEmbeddingModel: "${formData.onnxEmbeddingModel}",
onnxEmbeddingModelQuantized: ${formData.onnxEmbeddingModelQuantized},
dtype: "${dtype}",
localModelPath: "./models",

@@ -388,2 +404,3 @@ modelCacheDir: "./models",

modal.style.display = "none";
document.body.style.overflow = ''; // Restore body scrolling
};

@@ -395,2 +412,3 @@

modal.style.display = "none";
document.body.style.overflow = ''; // Restore body scrolling
}

@@ -419,2 +437,3 @@ };

modal.style.display = "none";
document.body.style.overflow = ''; // Restore body scrolling
};

@@ -490,2 +509,30 @@

resizeToggle.classList.toggle('wrapped');
});
});
// Add this to your existing range input handlers
const dtypeInput = document.getElementById('dtype');
const dtypeDisplay = dtypeInput.nextElementSibling;
function updateDtypeDisplay(value) {
const dtypeValues = {
0: { text: 'fp32 - Full Precision', class: 'precision-full' },
1: { text: 'fp16 - Half Precision', class: 'precision-half' },
2: { text: 'q8 - 8-bit Quantized', class: 'precision-q8' },
3: { text: 'q4 - 4-bit Quantized', class: 'precision-q4' }
};
const dtype = dtypeValues[value];
const number = dtypeDisplay.querySelector('.number');
const description = dtypeDisplay.querySelector('.description');
number.className = `number ${dtype.class}`;
number.textContent = value;
description.className = `description ${dtype.class}`;
description.textContent = dtype.text;
}
// Initial update
updateDtypeDisplay(dtypeInput.value);
// Update on change
dtypeInput.addEventListener('input', (e) => updateDtypeDisplay(e.target.value));

@@ -30,4 +30,8 @@ {

"label": "BAAI/bge-small-en-v1.5"
},
{
"value": "yashvardhan7/snowflake-arctic-embed-m-onnx",
"label": "yashvardhan7/snowflake-arctic-embed-m-onnx"
}
]
}

@@ -6,3 +6,6 @@ import express from 'express';

import { chunkit } from '../chunkit.js';
import dotenv from 'dotenv';
dotenv.config();
const __filename = fileURLToPath(import.meta.url);

@@ -34,12 +37,17 @@ const __dirname = path.dirname(__filename);

try {
const { documentText, documentName, ...options } = req.body;
const { documentText, documentName, dtype, onnxEmbeddingModelQuantized, ...options } = req.body;
// Convert dtype value to string mapping
const dtypeValues = ['fp32', 'fp16', 'q8', 'q4'];
const dtypeString = dtypeValues[parseInt(dtype)] || 'fp32';
// Input validation
if (!documentText) {
return res.status(400).json({ error: 'Document text is required' });
}
// Process the text with new dtype option
const documents = [{
document_name: documentName || 'sample text',
document_text: documentText
}];
// Convert string values to appropriate types
const processedOptions = {
...options,
dtype: dtypeString,
maxTokenSize: parseInt(options.maxTokenSize),

@@ -52,3 +60,2 @@ similarityThreshold: parseFloat(options.similarityThreshold),

combineChunksSimilarityThreshold: parseFloat(options.combineChunksSimilarityThreshold),
onnxEmbeddingModelQuantized: options.onnxEmbeddingModelQuantized === true,
returnEmbedding: options.returnEmbedding === true,

@@ -61,9 +68,3 @@ returnTokenLength: options.returnTokenLength === true,

// Process the text
const documents = [{
document_name: documentName || 'sample text',
document_text: documentText
}];
const result = await chunkit(documents, processedOptions);
res.json(result);

@@ -70,0 +71,0 @@ } catch (error) {

