Comparing version 0.0.9 to 0.0.10
'use strict'; | ||
var jsLevenshtein = (function() | ||
{ | ||
function _min(d0, d1, d2, bx, ay) | ||
{ | ||
return d0 < d1 || d2 < d1 | ||
? d0 > d2 | ||
? d2 + 1 | ||
: d0 + 1 | ||
: bx === ay | ||
? d1 | ||
: d1 + 1; | ||
function nEqualCharsAtStart(a, b, max) { | ||
for (let i = 0; i < max; i++) { | ||
if (a.charCodeAt(i) !== b.charCodeAt(i)) { | ||
return i | ||
} | ||
} | ||
return function(a, b) | ||
{ | ||
if (a === b) { | ||
return 0; | ||
} | ||
return max | ||
} | ||
if (a.length > b.length) { | ||
var tmp = a; | ||
a = b; | ||
b = tmp; | ||
function nEqualCharsAtEnd(a, b, max) { | ||
const aLen = a.length; | ||
const bLen = b.length; | ||
for (let i = 0; i < max; i++) { | ||
if (a.charCodeAt(aLen - i) !== b.charCodeAt(bLen - i)) { | ||
return i | ||
} | ||
} | ||
var la = a.length; | ||
var lb = b.length; | ||
return max | ||
} | ||
while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) { | ||
la--; | ||
lb--; | ||
} | ||
// Since JS is synchronous, we can allocate the buffer once and keep it around | ||
// forever. This will "leak" an array of maximum length maxDistance ... but | ||
// it will prevent a bunch of array resizing and it will negate garbage | ||
// collection entirely. | ||
const _buffer = []; | ||
const _bChars = []; | ||
var offset = 0; | ||
/** | ||
* Returns the Levenshtein edit distance between a and b, to a maximum of | ||
* `maxDistance`. | ||
* | ||
* If the distance is greater than `maxDistance`, returns Infinity. | ||
* | ||
* This special optimization speeds up calculations when the desired distance | ||
* is small and the string lengths aren't -- which is often, in practice. | ||
*/ | ||
function boundedLevenshtein (a, b, maxDistance) { | ||
// https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows | ||
// is the simple idea. Then go to | ||
// https://bitbucket.org/clearer/iosifovich/src/1d27393502137b1ba788822f62f7155eb0c37744/levenshtein.h | ||
// for the best implementation. | ||
while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) { | ||
offset++; | ||
} | ||
// Early optimizations | ||
if (a === b) return 0 | ||
if (!a.length) return b.length > maxDistance ? Infinity : b.length | ||
if (!b.length) return a.length > maxDistance ? Infinity : a.length | ||
la -= offset; | ||
lb -= offset; | ||
// Early, redundant but super-fast optimization | ||
if (Math.abs(a.length - b.length) > maxDistance) { | ||
return Infinity | ||
} | ||
if (la === 0 || lb === 1) { | ||
return lb; | ||
} | ||
// Simplify: ensure a is always the shorter string. | ||
// This means we can avoid some Math.max, Math.min or Math.abs calls; it | ||
// also makes fewer outer-loop iterations in the actual Levenshtein part | ||
// of the algorithm. | ||
if (a.length > b.length) { | ||
const t = a; | ||
a = b; | ||
b = t; | ||
} | ||
var x = 0; | ||
var y; | ||
var d0; | ||
var d1; | ||
var d2; | ||
var d3; | ||
var dd; | ||
var dy; | ||
var ay; | ||
var bx0; | ||
var bx1; | ||
var bx2; | ||
var bx3; | ||
let aLen = a.length; | ||
let bLen = b.length; | ||
var vector = []; | ||
// Ignore common suffix -- it does not contribute to distance | ||
const nSuffix = nEqualCharsAtEnd(a, b, aLen); | ||
aLen -= nSuffix; | ||
bLen -= nSuffix; | ||
for (y = 0; y < la; y++) { | ||
vector.push(y + 1); | ||
vector.push(a.charCodeAt(offset + y)); | ||
} | ||
// Exit really quickly if B is just A plus a prefix | ||
if (aLen === 0) { | ||
return bLen > maxDistance ? Infinity : bLen | ||
} | ||
for (; (x + 3) < lb;) { | ||
bx0 = b.charCodeAt(offset + (d0 = x)); | ||
bx1 = b.charCodeAt(offset + (d1 = x + 1)); | ||
bx2 = b.charCodeAt(offset + (d2 = x + 2)); | ||
bx3 = b.charCodeAt(offset + (d3 = x + 3)); | ||
dd = (x += 4); | ||
for (y = 0; y < vector.length; y += 2) { | ||
dy = vector[y]; | ||
ay = vector[y + 1]; | ||
d0 = _min(dy, d0, d1, bx0, ay); | ||
d1 = _min(d0, d1, d2, bx1, ay); | ||
d2 = _min(d1, d2, d3, bx2, ay); | ||
dd = _min(d2, d3, dd, bx3, ay); | ||
vector[y] = dd; | ||
d3 = d2; | ||
d2 = d1; | ||
d1 = d0; | ||
d0 = dy; | ||
// Slice off matching prefix -- they don't add to distance | ||
const nPrefix = nEqualCharsAtStart(a, b, aLen); | ||
aLen -= nPrefix; | ||
bLen -= nPrefix; | ||
// Exit really quickly if B is just A plus a prefix and suffix | ||
if (aLen === 0) { | ||
return bLen > maxDistance ? Infinity : bLen | ||
} | ||
// Run .charCodeAt() once, instead of in the inner loop | ||
for (let i = 0; i < bLen; i++) { | ||
_bChars[i] = b.charCodeAt(nPrefix + i); | ||
} | ||
// Initialize buffer | ||
// Unlike in Wikipedia's Levenshtein algorithm, we set buffer to length bLen, | ||
// not bLen+1. We dont insert the first element because we can infer it from | ||
// `i` in our main Levenshtein loop | ||
for (let i = 0; i < bLen; i++) { | ||
_buffer[i] = i + 1; | ||
} | ||
// Idea copied from talisman's max-distance Levenshtein: we can avoid some | ||
// comparisons that would only ever lead to distance > maxDistance. | ||
if (maxDistance > bLen) maxDistance = bLen; | ||
const offset = maxDistance - (bLen - aLen); | ||
let jStart = 0; | ||
let jEnd = maxDistance; | ||
for (let i = 0; i < aLen; i++) { | ||
let rowMinimum = bLen; | ||
const ac = a.charCodeAt(nPrefix + i); | ||
// We don't need to compare the _entire_ | ||
if (i > offset) jStart += 1; | ||
if (jEnd < bLen) jEnd += 1; | ||
// Calculate v1 (current distances) from previous row v0 | ||
// First distance is delete (i + 1) chars from a to match empty b | ||
let current = i; | ||
let left = i + 1; | ||
for (let j = jStart; j < jEnd; j++) { | ||
const bc = _bChars[j]; | ||
const insertDeleteCost = Math.min(left, _buffer[j]) + 1; | ||
const substituteCost = (ac === bc) ? 0 : 1; | ||
const d = Math.min(insertDeleteCost, current + substituteCost); | ||
if (d < rowMinimum) { | ||
rowMinimum = d; | ||
} | ||
current = _buffer[j]; | ||
left = _buffer[j] = d; | ||
} | ||
for (; x < lb;) { | ||
bx0 = b.charCodeAt(offset + (d0 = x)); | ||
dd = ++x; | ||
for (y = 0; y < vector.length; y += 2) { | ||
dy = vector[y]; | ||
vector[y] = dd = dy < d0 || dd < d0 | ||
? dy > dd ? dd + 1 : dy + 1 | ||
: bx0 === vector[y + 1] | ||
? d0 | ||
: d0 + 1; | ||
d0 = dy; | ||
} | ||
if (rowMinimum > maxDistance) { | ||
// We never _subtract_ from any row values, so it's impossible for the | ||
// edit distance to be smaller than or equal to maxDistance. | ||
return Infinity | ||
} | ||
} | ||
return dd; | ||
}; | ||
})(); | ||
return _buffer[bLen - 1] | ||
} | ||
const ret = () => jsLevenshtein; | ||
var boundedLevenshtein_1 = boundedLevenshtein; | ||
module.exports = ret; | ||
function levenshtein(d = Infinity) { | ||
return (a, b) => boundedLevenshtein_1(a, b, d); | ||
} | ||
module.exports = levenshtein; | ||
//# sourceMappingURL=levenshtein.js.map |
{ | ||
"name": "clustring", | ||
"version": "0.0.9", | ||
"version": "0.0.10", | ||
"description": "Algorithms for clustering strings", | ||
@@ -36,2 +36,3 @@ "main": "index.js", | ||
"babel-jest": "^23.4.2", | ||
"bounded-levenshtein": "0.0.1", | ||
"jest": "^23.5.0", | ||
@@ -50,6 +51,3 @@ "rollup": "^0.64.1", | ||
] | ||
}, | ||
"dependencies": { | ||
"js-levenshtein": "^1.1.3" | ||
} | ||
} |
@@ -63,3 +63,5 @@ clustring | ||
const clusterer = clusterByKnn(bucket, levenshtein(), 2, { blockSize: 5 }) | ||
// levenshtein(2) is an optimization of levenshtein() that returns 0, 1, 2, or | ||
// Infinity. You may use levenshtein(), but it's not recommended. | ||
const clusterer = clusterByKnn(bucket, levenshtein(2), 2, { blockSize: 5 }) | ||
clusterer.cluster() | ||
@@ -66,0 +68,0 @@ .then(bins => { ... }) |
@@ -1,5 +0,5 @@ | ||
import levenshtein from 'js-levenshtein' | ||
import boundedLevenshtein from 'bounded-levenshtein' | ||
const ret = () => levenshtein | ||
export default ret | ||
export default function levenshtein (d=Infinity) { | ||
return (a, b) => boundedLevenshtein(a, b, d) | ||
} |
import levenshtein from './levenshtein' | ||
const distance = levenshtein() | ||
const distance = levenshtein(12) | ||
@@ -27,2 +27,6 @@ const Tests = [ | ||
} | ||
it('should work when above maxDistance', () => { | ||
expect(levenshtein(3)('abcdef', 'zyxwvut')).toEqual(Infinity) | ||
}) | ||
}) |
@@ -31,3 +31,3 @@ import { clusterByKey, clusterByKnn } from './main' | ||
const bins = await clusterByKnn(bucket, levenshtein(), 2).cluster() | ||
const bins = await clusterByKnn(bucket, levenshtein(2), 2).cluster() | ||
expect(bins).toEqual([ | ||
@@ -34,0 +34,0 @@ { |
Sorry, the diff of this file is not supported yet
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
100296
0
1202
130
11
- Removedjs-levenshtein@^1.1.3
- Removedjs-levenshtein@1.1.6(transitive)