semantic-chunking
Advanced tools
Comparing version 2.3.3 to 2.3.4
@@ -5,2 +5,10 @@ # Changelog | ||
## [2.3.4] - 2024-11-12 | ||
### Updated | ||
- Updated Web UI to v1.3.1 | ||
- Updated README with Web UI usage examples | ||
- Updated default values in both the library and Web UI | ||
- Web UI default can be set in `webui/public/default-form-values.js` | ||
- Misc cleanup and optimizations | ||
## [2.3.0] - 2024-11-11 | ||
@@ -7,0 +15,0 @@ ### Updated |
@@ -16,3 +16,13 @@ // =========================== | ||
import { createChunks, optimizeAndRebalanceChunks, applyPrefixToChunk } from './chunkingUtils.js'; | ||
import { readFileSync } from 'fs'; | ||
// Read the library version from package.json, resolved relative to this
// module's URL. The file is decoded explicitly as UTF-8 so JSON.parse receives
// a string rather than relying on implicit Buffer-to-string coercion.
const packageJson = JSON.parse(readFileSync(new URL('./package.json', import.meta.url), 'utf8'));
const VERSION = packageJson.version;
/**
 * Log a version banner for the library to the console.
 *
 * The banner is a blank line followed by the version text framed above and
 * below by a row of dashes of the same length as the text.
 *
 * @returns {Promise<void>} Resolves after the banner has been logged.
 */
export async function printVersion() {
  const banner = `-- semantic-chunking v${VERSION} --`;
  const rule = '-'.repeat(banner.length);
  // Leading empty entry yields the newline that precedes the first rule
  console.log(['', rule, banner, rule].join('\n'));
}
// --------------------------- | ||
@@ -43,2 +53,4 @@ // -- Main chunkit function -- | ||
printVersion(); | ||
// Input validation | ||
@@ -196,2 +208,4 @@ if (!Array.isArray(documents)) { | ||
printVersion(); | ||
// Input validation | ||
@@ -198,0 +212,0 @@ if (!Array.isArray(documents)) { |
@@ -5,11 +5,11 @@ export const DEFAULT_CONFIG = { | ||
SIMILARITY_THRESHOLD: 0.5, | ||
DYNAMIC_THRESHOLD_LOWER_BOUND: 0.475, | ||
DYNAMIC_THRESHOLD_LOWER_BOUND: 0.4, | ||
DYNAMIC_THRESHOLD_UPPER_BOUND: 0.8, | ||
NUM_SIMILARITY_SENTENCES_LOOKAHEAD: 2, | ||
NUM_SIMILARITY_SENTENCES_LOOKAHEAD: 3, | ||
COMBINE_CHUNKS: true, | ||
COMBINE_CHUNKS_SIMILARITY_THRESHOLD: 0.6, | ||
COMBINE_CHUNKS_SIMILARITY_THRESHOLD: 0.5, | ||
ONNX_EMBEDDING_MODEL: "Xenova/all-MiniLM-L6-v2", | ||
DTYPE: 'fp32', | ||
LOCAL_MODEL_PATH: null, | ||
MODEL_CACHE_DIR: null, | ||
DTYPE: 'q8', | ||
LOCAL_MODEL_PATH: "./models", | ||
MODEL_CACHE_DIR: "./models", | ||
RETURN_EMBEDDING: false, | ||
@@ -16,0 +16,0 @@ RETURN_TOKEN_LENGTH: true, |
@@ -34,15 +34,14 @@ // ------------------------ | ||
maxTokenSize: 300, | ||
similarityThreshold: 0.65, | ||
dynamicThresholdLowerBound: 0.5, | ||
dynamicThresholdUpperBound: 0.8, | ||
similarityThreshold: 0.500, | ||
dynamicThresholdLowerBound: 0.400, | ||
dynamicThresholdUpperBound: 0.800, | ||
numSimilaritySentencesLookahead: 3, | ||
combineChunks: true, // enable rebalancing | ||
combineChunksSimilarityThreshold: 0.6, | ||
onnxEmbeddingModel: "nomic-ai/nomic-embed-text-v1.5", | ||
combineChunksSimilarityThreshold: 0.700, | ||
onnxEmbeddingModel: "Xenova/all-MiniLM-L6-v2", | ||
dtype: "q8", | ||
localModelPath: "../models", | ||
modelCacheDir: "../models", | ||
returnTokenLength: true, | ||
returnEmbedding: false, | ||
returnTokenLength: true, | ||
// chunkPrefix: "search_document", | ||
} | ||
@@ -58,3 +57,3 @@ ); | ||
console.log("\n\n\n"); | ||
console.log("\n\n"); | ||
// console.log("myTestChunks:"); | ||
@@ -61,0 +60,0 @@ // console.log(myTestChunks); |
{ | ||
"name": "semantic-chunking", | ||
"version": "2.3.3", | ||
"version": "2.3.4", | ||
"description": "Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).", | ||
@@ -5,0 +5,0 @@ "repository": { |
@@ -68,5 +68,5 @@ # 🍱 semantic-chunking | ||
- `similarityThreshold`: Float (optional, default `0.5`) - Threshold to determine if sentences are similar enough to be in the same chunk. A higher value demands higher similarity. | ||
- `dynamicThresholdLowerBound`: Float (optional, default `0.45`) - Minimum possible dynamic similarity threshold. | ||
- `dynamicThresholdUpperBound`: Float (optional, default `0.75`) - Maximum possible dynamic similarity threshold. | ||
- `numSimilaritySentencesLookahead`: Integer (optional, default `2`) - Number of sentences to look ahead for calculating similarity. | ||
- `dynamicThresholdLowerBound`: Float (optional, default `0.4`) - Minimum possible dynamic similarity threshold. | ||
- `dynamicThresholdUpperBound`: Float (optional, default `0.8`) - Maximum possible dynamic similarity threshold. | ||
- `numSimilaritySentencesLookahead`: Integer (optional, default `3`) - Number of sentences to look ahead for calculating similarity. | ||
- `combineChunks`: Boolean (optional, default `true`) - Determines whether to rebalance and combine chunks into larger ones up to the max token limit. | ||
@@ -97,2 +97,7 @@ - `combineChunksSimilarityThreshold`: Float (optional, default `0.5`) - Threshold for combining chunks based on similarity during the rebalance and combining phase. | ||
## **NOTE** 🚨 Every Embedding Model behaves differently! | ||
It is important to understand how the model you choose behaves when chunking your text. | ||
It is highly recommended to tweak all the parameters using the Web UI to get the best results for your use case. | ||
[Web UI README](webui/README.md) | ||
## Examples | ||
@@ -99,0 +104,0 @@ |
{ | ||
"name": "semantic-chunking-webui", | ||
"version": "1.3.0", | ||
"version": "1.3.1", | ||
"lockfileVersion": 3, | ||
@@ -9,3 +9,3 @@ "requires": true, | ||
"name": "semantic-chunking-webui", | ||
"version": "1.3.0", | ||
"version": "1.3.1", | ||
"license": "ISC", | ||
@@ -12,0 +12,0 @@ "dependencies": { |
{ | ||
"name": "semantic-chunking-webui", | ||
"version": "1.3.0", | ||
"version": "1.3.1", | ||
"type": "module", | ||
@@ -5,0 +5,0 @@ "description": "Web UI for semantic-chunking library", |
@@ -0,3 +1,43 @@ | ||
// set form default values | ||
import defaultFormValues from './default-form-values.js'; | ||
/**
 * Populate every form control from `defaultFormValues`.
 *
 * Assigns the range inputs, checkboxes, chunk prefix text box and dtype
 * control, then dispatches an `input` event on each range input and calls
 * `updateDependentControls()`.
 */
function setDefaultFormValues() {
  const byId = (id) => document.getElementById(id);

  // Range inputs: each element id is also the key in defaultFormValues
  const rangeIds = [
    'maxTokenSize',
    'similarityThreshold',
    'dynamicThresholdLowerBound',
    'dynamicThresholdUpperBound',
    'numSimilaritySentencesLookahead',
    'combineChunksSimilarityThreshold',
  ];
  for (const id of rangeIds) {
    byId(id).value = defaultFormValues[id];
  }

  // Checkboxes: same id-equals-key convention, but the `checked` property is set
  const checkboxIds = [
    'combineChunks',
    'returnEmbedding',
    'returnTokenLength',
    'excludeChunkPrefixInResults',
  ];
  for (const id of checkboxIds) {
    byId(id).checked = defaultFormValues[id];
  }

  // Text input: a falsy chunkPrefix falls back to an empty string
  byId('chunkPrefix').value = defaultFormValues.chunkPrefix || '';

  // The dtype control holds a numeric index, so translate the dtype name;
  // names missing from the table fall back to index 0
  const dtypeIndexByName = { 'fp32': 0, 'fp16': 1, 'q8': 2, 'q4': 3 };
  byId('dtype').value = dtypeIndexByName[defaultFormValues.dtype] || 0;

  // Trigger an input event on all range inputs so they show their values
  for (const rangeInput of document.querySelectorAll('input[type="range"]')) {
    rangeInput.dispatchEvent(new Event('input'));
  }

  // Update dependent controls based on combineChunks
  updateDependentControls();
}
// Call setDefaultFormValues after the DOM is loaded | ||
document.addEventListener('DOMContentLoaded', setDefaultFormValues); | ||
// Load sample text on page load | ||
fetch('sample.txt') | ||
fetch('./documents/sample.txt') | ||
.then(response => response.text()) | ||
@@ -20,2 +60,5 @@ .then(text => { | ||
}); | ||
// Set default model after options are loaded | ||
select.value = defaultFormValues.onnxEmbeddingModel; | ||
}) | ||
@@ -308,3 +351,3 @@ .catch(error => console.error('Error loading models:', error)); | ||
const fileType = button.dataset.file; | ||
const fileName = `${fileType}.txt`; | ||
const fileName = `./documents/${fileType}.txt`; | ||
@@ -469,3 +512,3 @@ try { | ||
// Add this with your other event listeners | ||
// info icon event listener | ||
document.querySelector('.info-icon').addEventListener('click', () => { | ||
@@ -475,9 +518,7 @@ showToast('More model choices can be added by updating the "models.json" file in the "webui" directory.', 'info', 7000); | ||
// Add after other initialization code | ||
const resultsContent = document.querySelector('.results-content'); | ||
// Create and add the resize toggle button | ||
const processingTimeSpan = document.getElementById('processingTime'); | ||
// Create and add the resize toggle button | ||
// resize toggle button | ||
const resizeToggle = document.createElement('button'); | ||
@@ -504,3 +545,3 @@ resizeToggle.className = 'resize-toggle'; | ||
// Add this to your existing range input handlers | ||
// dtype display | ||
const dtypeInput = document.getElementById('dtype'); | ||
@@ -531,2 +572,10 @@ const dtypeDisplay = dtypeInput.nextElementSibling; | ||
// Update on change | ||
dtypeInput.addEventListener('input', (e) => updateDtypeDisplay(e.target.value)); | ||
dtypeInput.addEventListener('input', (e) => updateDtypeDisplay(e.target.value)); | ||
// version display: request the version from the server's /version route and
// write it into the element with id "version"
fetch('/version')
  .then(response => {
    // A non-2xx reply can still carry a JSON body that lacks `version`, which
    // would otherwise be rendered as "vundefined"; treat it as a failure.
    if (!response.ok) {
      throw new Error(`Unexpected response status: ${response.status}`);
    }
    return response.json();
  })
  .then(data => {
    document.getElementById('version').textContent = `v${data.version}`;
  })
  .catch(error => console.error('Error fetching version:', error));
@@ -7,2 +7,3 @@ import express from 'express'; | ||
import dotenv from 'dotenv'; | ||
import { readFileSync } from 'fs'; | ||
@@ -14,2 +15,7 @@ dotenv.config(); | ||
// Load package.json (resolved against __dirname) and pull out its version
const packageJson = JSON.parse(
  readFileSync(path.join(__dirname, 'package.json'), { encoding: 'utf8' })
);
const { version: VERSION } = packageJson;
// Initialize Express app | ||
const app = express(); | ||
@@ -25,7 +31,2 @@ const PORT = process.env.PORT || 3000; | ||
// Serve node_modules directory (only for highlight.js) | ||
app.use('/node_modules/highlight.js', express.static( | ||
path.join(__dirname, 'node_modules/highlight.js') | ||
)); | ||
// Basic route | ||
@@ -36,2 +37,7 @@ app.get('/', (req, res) => { | ||
// Version route: replies to GET /version with { version: VERSION } as JSON
app.get('/version', (_req, res) => {
  const payload = { version: VERSION };
  res.json(payload);
});
// Chunking API endpoint | ||
@@ -65,4 +71,4 @@ app.post('/api/chunk', async (req, res) => { | ||
logging: options.logging === true, | ||
localModelPath: path.join(__dirname, 'models'), | ||
modelCacheDir: path.join(__dirname, 'models') | ||
localModelPath: path.join(__dirname, '../models'), | ||
modelCacheDir: path.join(__dirname, '../models') | ||
}; | ||
@@ -69,0 +75,0 @@ |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
3511048
115
3441
359
8
6