Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign inDemoInstall
Socket

semantic-chunking

Package Overview
Dependencies
Maintainers
0
Versions
30
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

semantic-chunking - npm Package Compare versions

Comparing version 2.3.3 to 2.3.4

webui/public/default-form-values.js

8

CHANGELOG.md

@@ -5,2 +5,10 @@ # Changelog

## [2.3.4] - 2024-11-12
### Updated
- Updated Web UI to v1.3.1
- Updated README with Web UI usage examples
- Updated default values in both the library and Web UI
- Web UI defaults can be set in `webui/public/default-form-values.js`
- Misc cleanup and optimizations
## [2.3.0] - 2024-11-11

@@ -7,0 +15,0 @@ ### Updated

@@ -16,3 +16,13 @@ // ===========================

import { createChunks, optimizeAndRebalanceChunks, applyPrefixToChunk } from './chunkingUtils.js';
import { readFileSync } from 'fs';
const packageJson = JSON.parse(readFileSync(new URL('./package.json', import.meta.url)));
const VERSION = packageJson.version;
/**
 * Log a three-line version banner to the console.
 *
 * The banner is the text `-- semantic-chunking v<VERSION> --` framed above
 * and below by a row of dashes of the same length, preceded by a blank line.
 * `VERSION` is the module-level constant read from package.json.
 *
 * @returns {Promise<void>} Resolves once the banner has been logged; the
 *   logging itself happens synchronously during the call.
 */
export async function printVersion() {
    const banner = `-- semantic-chunking v${VERSION} --`;
    // The dashed rule is sized to the banner so the frame lines up exactly.
    const rule = '-'.repeat(banner.length);
    console.log(['', rule, banner, rule].join('\n'));
}
// ---------------------------

@@ -43,2 +53,4 @@ // -- Main chunkit function --

printVersion();
// Input validation

@@ -196,2 +208,4 @@ if (!Array.isArray(documents)) {

printVersion();
// Input validation

@@ -198,0 +212,0 @@ if (!Array.isArray(documents)) {

12

config.js

@@ -5,11 +5,11 @@ export const DEFAULT_CONFIG = {

SIMILARITY_THRESHOLD: 0.5,
DYNAMIC_THRESHOLD_LOWER_BOUND: 0.475,
DYNAMIC_THRESHOLD_LOWER_BOUND: 0.4,
DYNAMIC_THRESHOLD_UPPER_BOUND: 0.8,
NUM_SIMILARITY_SENTENCES_LOOKAHEAD: 2,
NUM_SIMILARITY_SENTENCES_LOOKAHEAD: 3,
COMBINE_CHUNKS: true,
COMBINE_CHUNKS_SIMILARITY_THRESHOLD: 0.6,
COMBINE_CHUNKS_SIMILARITY_THRESHOLD: 0.5,
ONNX_EMBEDDING_MODEL: "Xenova/all-MiniLM-L6-v2",
DTYPE: 'fp32',
LOCAL_MODEL_PATH: null,
MODEL_CACHE_DIR: null,
DTYPE: 'q8',
LOCAL_MODEL_PATH: "./models",
MODEL_CACHE_DIR: "./models",
RETURN_EMBEDDING: false,

@@ -16,0 +16,0 @@ RETURN_TOKEN_LENGTH: true,

@@ -34,15 +34,14 @@ // ------------------------

maxTokenSize: 300,
similarityThreshold: 0.65,
dynamicThresholdLowerBound: 0.5,
dynamicThresholdUpperBound: 0.8,
similarityThreshold: 0.500,
dynamicThresholdLowerBound: 0.400,
dynamicThresholdUpperBound: 0.800,
numSimilaritySentencesLookahead: 3,
combineChunks: true, // enable rebalancing
combineChunksSimilarityThreshold: 0.6,
onnxEmbeddingModel: "nomic-ai/nomic-embed-text-v1.5",
combineChunksSimilarityThreshold: 0.700,
onnxEmbeddingModel: "Xenova/all-MiniLM-L6-v2",
dtype: "q8",
localModelPath: "../models",
modelCacheDir: "../models",
returnTokenLength: true,
returnEmbedding: false,
returnTokenLength: true,
// chunkPrefix: "search_document",
}

@@ -58,3 +57,3 @@ );

console.log("\n\n\n");
console.log("\n\n");
// console.log("myTestChunks:");

@@ -61,0 +60,0 @@ // console.log(myTestChunks);

{
"name": "semantic-chunking",
"version": "2.3.3",
"version": "2.3.4",
"description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).",

@@ -5,0 +5,0 @@ "repository": {

@@ -68,5 +68,5 @@ # 🍱 semantic-chunking

- `similarityThreshold`: Float (optional, default `0.5`) - Threshold to determine if sentences are similar enough to be in the same chunk. A higher value demands higher similarity.
- `dynamicThresholdLowerBound`: Float (optional, default `0.45`) - Minimum possible dynamic similarity threshold.
- `dynamicThresholdUpperBound`: Float (optional, default `0.75`) - Maximum possible dynamic similarity threshold.
- `numSimilaritySentencesLookahead`: Integer (optional, default `2`) - Number of sentences to look ahead for calculating similarity.
- `dynamicThresholdLowerBound`: Float (optional, default `0.4`) - Minimum possible dynamic similarity threshold.
- `dynamicThresholdUpperBound`: Float (optional, default `0.8`) - Maximum possible dynamic similarity threshold.
- `numSimilaritySentencesLookahead`: Integer (optional, default `3`) - Number of sentences to look ahead for calculating similarity.
- `combineChunks`: Boolean (optional, default `true`) - Determines whether to rebalance and combine chunks into larger ones up to the max token limit.

@@ -97,2 +97,7 @@ - `combineChunksSimilarityThreshold`: Float (optional, default `0.5`) - Threshold for combining chunks based on similarity during the rebalance and combining phase.

## **NOTE** 🚨 Every Embedding Model behaves differently!
It is important to understand how the model you choose behaves when chunking your text.
It is highly recommended to tweak all the parameters using the Web UI to get the best results for your use case.
[Web UI README](webui/README.md)
## Examples

@@ -99,0 +104,0 @@

{
"name": "semantic-chunking-webui",
"version": "1.3.0",
"version": "1.3.1",
"lockfileVersion": 3,

@@ -9,3 +9,3 @@ "requires": true,

"name": "semantic-chunking-webui",
"version": "1.3.0",
"version": "1.3.1",
"license": "ISC",

@@ -12,0 +12,0 @@ "dependencies": {

{
"name": "semantic-chunking-webui",
"version": "1.3.0",
"version": "1.3.1",
"type": "module",

@@ -5,0 +5,0 @@ "description": "Web UI for semantic-chunking library",

@@ -0,3 +1,43 @@

// set form default values
import defaultFormValues from './default-form-values.js';
// Set default values for all form controls
/**
 * Populate every form control from `defaultFormValues`.
 *
 * Each control is looked up by an element id that matches the corresponding
 * key in `defaultFormValues`. After the values are written, an `input` event
 * is dispatched on every range input, and `updateDependentControls()` is
 * called last.
 */
function setDefaultFormValues() {
    const byId = (id) => document.getElementById(id);

    // Controls whose `.value` is copied straight from the defaults
    const valueControlIds = [
        'maxTokenSize',
        'similarityThreshold',
        'dynamicThresholdLowerBound',
        'dynamicThresholdUpperBound',
        'numSimilaritySentencesLookahead',
        'combineChunksSimilarityThreshold',
    ];
    for (const id of valueControlIds) {
        byId(id).value = defaultFormValues[id];
    }

    // Checkboxes: the default drives `.checked`
    const checkboxIds = [
        'combineChunks',
        'returnEmbedding',
        'returnTokenLength',
        'excludeChunkPrefixInResults',
    ];
    for (const id of checkboxIds) {
        byId(id).checked = defaultFormValues[id];
    }

    // Text input: a missing/empty prefix becomes an empty string
    const prefixField = byId('chunkPrefix');
    prefixField.value = defaultFormValues.chunkPrefix || '';

    // The dtype control takes a numeric index; unrecognised names map to 0
    const dtypeIndexByName = { 'fp32': 0, 'fp16': 1, 'q8': 2, 'q4': 3 };
    byId('dtype').value = dtypeIndexByName[defaultFormValues.dtype] || 0;

    // Fire `input` on each range input so its listeners run with the new value
    document.querySelectorAll('input[type="range"]').forEach((rangeInput) => {
        rangeInput.dispatchEvent(new Event('input'));
    });

    // Finally sync the controls that depend on combineChunks
    updateDependentControls();
}
// Call setDefaultFormValues after the DOM is loaded
document.addEventListener('DOMContentLoaded', setDefaultFormValues);
// Load sample text on page load
fetch('sample.txt')
fetch('./documents/sample.txt')
.then(response => response.text())

@@ -20,2 +60,5 @@ .then(text => {

});
// Set default model after options are loaded
select.value = defaultFormValues.onnxEmbeddingModel;
})

@@ -308,3 +351,3 @@ .catch(error => console.error('Error loading models:', error));

const fileType = button.dataset.file;
const fileName = `${fileType}.txt`;
const fileName = `./documents/${fileType}.txt`;

@@ -469,3 +512,3 @@ try {

// Add this with your other event listeners
// info icon event listener
document.querySelector('.info-icon').addEventListener('click', () => {

@@ -475,9 +518,7 @@ showToast('More model choices can be added by updating the "models.json" file in the "webui" directory.', 'info', 7000);

// Add after other initialization code
const resultsContent = document.querySelector('.results-content');
// Create and add the resize toggle button
const processingTimeSpan = document.getElementById('processingTime');
// Create and add the resize toggle button
// resize toggle button
const resizeToggle = document.createElement('button');

@@ -504,3 +545,3 @@ resizeToggle.className = 'resize-toggle';

// Add this to your existing range input handlers
// dtype display
const dtypeInput = document.getElementById('dtype');

@@ -531,2 +572,10 @@ const dtypeDisplay = dtypeInput.nextElementSibling;

// Update on change
dtypeInput.addEventListener('input', (e) => updateDtypeDisplay(e.target.value));
dtypeInput.addEventListener('input', (e) => updateDtypeDisplay(e.target.value));
// version display
fetch('/version')
.then(response => response.json())
.then(data => {
document.getElementById('version').textContent = `v${data.version}`;
})
.catch(error => console.error('Error fetching version:', error));

@@ -7,2 +7,3 @@ import express from 'express';

import dotenv from 'dotenv';
import { readFileSync } from 'fs';

@@ -14,2 +15,7 @@ dotenv.config();

// Read package.json
const packageJson = JSON.parse(readFileSync(path.join(__dirname, 'package.json'), 'utf8'));
const VERSION = packageJson.version;
// Initialize Express app
const app = express();

@@ -25,7 +31,2 @@ const PORT = process.env.PORT || 3000;

// Serve node_modules directory (only for highlight.js)
app.use('/node_modules/highlight.js', express.static(
path.join(__dirname, 'node_modules/highlight.js')
));
// Basic route

@@ -36,2 +37,7 @@ app.get('/', (req, res) => {

// GET /version: respond with a JSON body of the form { version: VERSION }.
// The request object is not used.
app.get('/version', (req, res) => {
res.json({ version: VERSION });
});
// Chunking API endpoint

@@ -65,4 +71,4 @@ app.post('/api/chunk', async (req, res) => {

logging: options.logging === true,
localModelPath: path.join(__dirname, 'models'),
modelCacheDir: path.join(__dirname, 'models')
localModelPath: path.join(__dirname, '../models'),
modelCacheDir: path.join(__dirname, '../models')
};

@@ -69,0 +75,0 @@

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc