Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

clustring

Package Overview
Dependencies
Maintainers
1
Versions
10
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

clustring - npm Package Compare versions

Comparing version 0.0.9 to 0.0.10

221

knn/levenshtein.js
'use strict';
var jsLevenshtein = (function()
{
function _min(d0, d1, d2, bx, ay)
{
return d0 < d1 || d2 < d1
? d0 > d2
? d2 + 1
: d0 + 1
: bx === ay
? d1
: d1 + 1;
function nEqualCharsAtStart(a, b, max) {
for (let i = 0; i < max; i++) {
if (a.charCodeAt(i) !== b.charCodeAt(i)) {
return i
}
}
return function(a, b)
{
if (a === b) {
return 0;
}
return max
}
if (a.length > b.length) {
var tmp = a;
a = b;
b = tmp;
function nEqualCharsAtEnd(a, b, max) {
const aLen = a.length;
const bLen = b.length;
for (let i = 0; i < max; i++) {
if (a.charCodeAt(aLen - i) !== b.charCodeAt(bLen - i)) {
return i
}
}
var la = a.length;
var lb = b.length;
return max
}
while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
la--;
lb--;
}
// Since JS is synchronous, we can allocate the buffer once and keep it around
// forever. This will "leak" an array of maximum length maxDistance ... but
// it will prevent a bunch of array resizing and it will negate garbage
// collection entirely.
const _buffer = [];
const _bChars = [];
var offset = 0;
/**
* Returns the Levenshtein edit distance between a and b, to a maximum of
* `maxDistance`.
*
* If the distance is greater than `maxDistance`, returns Infinity.
*
* This special optimization speeds up calculations when the desired distance
* is small and the string lengths aren't -- which is often, in practice.
*/
function boundedLevenshtein (a, b, maxDistance) {
// https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
// is the simple idea. Then go to
// https://bitbucket.org/clearer/iosifovich/src/1d27393502137b1ba788822f62f7155eb0c37744/levenshtein.h
// for the best implementation.
while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
offset++;
}
// Early optimizations
if (a === b) return 0
if (!a.length) return b.length > maxDistance ? Infinity : b.length
if (!b.length) return a.length > maxDistance ? Infinity : a.length
la -= offset;
lb -= offset;
// Early, redundant but super-fast optimization
if (Math.abs(a.length - b.length) > maxDistance) {
return Infinity
}
if (la === 0 || lb === 1) {
return lb;
}
// Simplify: ensure a is always the shorter string.
// This means we can avoid some Math.max, Math.min or Math.abs calls; it
// also makes fewer outer-loop iterations in the actual Levenshtein part
// of the algorithm.
if (a.length > b.length) {
const t = a;
a = b;
b = t;
}
var x = 0;
var y;
var d0;
var d1;
var d2;
var d3;
var dd;
var dy;
var ay;
var bx0;
var bx1;
var bx2;
var bx3;
let aLen = a.length;
let bLen = b.length;
var vector = [];
// Ignore common suffix -- it does not contribute to distance
const nSuffix = nEqualCharsAtEnd(a, b, aLen);
aLen -= nSuffix;
bLen -= nSuffix;
for (y = 0; y < la; y++) {
vector.push(y + 1);
vector.push(a.charCodeAt(offset + y));
}
// Exit really quickly if B is just A plus a prefix
if (aLen === 0) {
return bLen > maxDistance ? Infinity : bLen
}
for (; (x + 3) < lb;) {
bx0 = b.charCodeAt(offset + (d0 = x));
bx1 = b.charCodeAt(offset + (d1 = x + 1));
bx2 = b.charCodeAt(offset + (d2 = x + 2));
bx3 = b.charCodeAt(offset + (d3 = x + 3));
dd = (x += 4);
for (y = 0; y < vector.length; y += 2) {
dy = vector[y];
ay = vector[y + 1];
d0 = _min(dy, d0, d1, bx0, ay);
d1 = _min(d0, d1, d2, bx1, ay);
d2 = _min(d1, d2, d3, bx2, ay);
dd = _min(d2, d3, dd, bx3, ay);
vector[y] = dd;
d3 = d2;
d2 = d1;
d1 = d0;
d0 = dy;
// Slice off matching prefix -- they don't add to distance
const nPrefix = nEqualCharsAtStart(a, b, aLen);
aLen -= nPrefix;
bLen -= nPrefix;
// Exit really quickly if B is just A plus a prefix and suffix
if (aLen === 0) {
return bLen > maxDistance ? Infinity : bLen
}
// Run .charCodeAt() once, instead of in the inner loop
for (let i = 0; i < bLen; i++) {
_bChars[i] = b.charCodeAt(nPrefix + i);
}
// Initialize buffer
// Unlike in Wikipedia's Levenshtein algorithm, we set buffer to length bLen,
// not bLen+1. We dont insert the first element because we can infer it from
// `i` in our main Levenshtein loop
for (let i = 0; i < bLen; i++) {
_buffer[i] = i + 1;
}
// Idea copied from talisman's max-distance Levenshtein: we can avoid some
// comparisons that would only ever lead to distance > maxDistance.
if (maxDistance > bLen) maxDistance = bLen;
const offset = maxDistance - (bLen - aLen);
let jStart = 0;
let jEnd = maxDistance;
for (let i = 0; i < aLen; i++) {
let rowMinimum = bLen;
const ac = a.charCodeAt(nPrefix + i);
// We don't need to compare the _entire_
if (i > offset) jStart += 1;
if (jEnd < bLen) jEnd += 1;
// Calculate v1 (current distances) from previous row v0
// First distance is delete (i + 1) chars from a to match empty b
let current = i;
let left = i + 1;
for (let j = jStart; j < jEnd; j++) {
const bc = _bChars[j];
const insertDeleteCost = Math.min(left, _buffer[j]) + 1;
const substituteCost = (ac === bc) ? 0 : 1;
const d = Math.min(insertDeleteCost, current + substituteCost);
if (d < rowMinimum) {
rowMinimum = d;
}
current = _buffer[j];
left = _buffer[j] = d;
}
for (; x < lb;) {
bx0 = b.charCodeAt(offset + (d0 = x));
dd = ++x;
for (y = 0; y < vector.length; y += 2) {
dy = vector[y];
vector[y] = dd = dy < d0 || dd < d0
? dy > dd ? dd + 1 : dy + 1
: bx0 === vector[y + 1]
? d0
: d0 + 1;
d0 = dy;
}
if (rowMinimum > maxDistance) {
// We never _subtract_ from any row values, so it's impossible for the
// edit distance to be smaller than or equal to maxDistance.
return Infinity
}
}
return dd;
};
})();
return _buffer[bLen - 1]
}
const ret = () => jsLevenshtein;
var boundedLevenshtein_1 = boundedLevenshtein;
module.exports = ret;
function levenshtein(d = Infinity) {
return (a, b) => boundedLevenshtein_1(a, b, d);
}
module.exports = levenshtein;
//# sourceMappingURL=levenshtein.js.map
{
"name": "clustring",
"version": "0.0.9",
"version": "0.0.10",
"description": "Algorithms for clustering strings",

@@ -36,2 +36,3 @@ "main": "index.js",

"babel-jest": "^23.4.2",
"bounded-levenshtein": "0.0.1",
"jest": "^23.5.0",

@@ -50,6 +51,3 @@ "rollup": "^0.64.1",

]
},
"dependencies": {
"js-levenshtein": "^1.1.3"
}
}

@@ -63,3 +63,5 @@ clustring

const clusterer = clusterByKnn(bucket, levenshtein(), 2, { blockSize: 5 })
// levenshtein(2) is an optimization of levenshtein() that returns 0, 1, 2, or
// Infinity. You may use levenshtein(), but it's not recommended.
const clusterer = clusterByKnn(bucket, levenshtein(2), 2, { blockSize: 5 })
clusterer.cluster()

@@ -66,0 +68,0 @@ .then(bins => { ... })

@@ -1,5 +0,5 @@

import levenshtein from 'js-levenshtein'
import boundedLevenshtein from 'bounded-levenshtein'
const ret = () => levenshtein
export default ret
export default function levenshtein (d=Infinity) {
return (a, b) => boundedLevenshtein(a, b, d)
}
import levenshtein from './levenshtein'
const distance = levenshtein()
const distance = levenshtein(12)

@@ -27,2 +27,6 @@ const Tests = [

}
it('should work when above maxDistance', () => {
expect(levenshtein(3)('abcdef', 'zyxwvut')).toEqual(Infinity)
})
})

@@ -31,3 +31,3 @@ import { clusterByKey, clusterByKnn } from './main'

const bins = await clusterByKnn(bucket, levenshtein(), 2).cluster()
const bins = await clusterByKnn(bucket, levenshtein(2), 2).cluster()
expect(bins).toEqual([

@@ -34,0 +34,0 @@ {

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc