'use strict';

		var jsLevenshtein = (function()
		{
		function _min(d0, d1, d2, bx, ay)
		{
		return d0 < d1 \|\| d2 < d1
		? d0 > d2
		? d2 + 1
		: d0 + 1
		: bx === ay
		? d1
		: d1 + 1;
		function nEqualCharsAtStart(a, b, max) {
		for (let i = 0; i < max; i++) {
		if (a.charCodeAt(i) !== b.charCodeAt(i)) {
		return i
		}
		}

		return function(a, b)
		{
		if (a === b) {
		return 0;
		}
		return max
		}

		if (a.length > b.length) {
		var tmp = a;
		a = b;
		b = tmp;
		function nEqualCharsAtEnd(a, b, max) {
		const aLen = a.length;
		const bLen = b.length;
		for (let i = 0; i < max; i++) {
		if (a.charCodeAt(aLen - i) !== b.charCodeAt(bLen - i)) {
		return i
		}
		}

		var la = a.length;
		var lb = b.length;
		return max
		}

		while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
		la--;
		lb--;
		}
		// Since JS is synchronous, we can allocate the buffer once and keep it around
		// forever. This will "leak" an array of maximum length maxDistance ... but
		// it will prevent a bunch of array resizing and it will negate garbage
		// collection entirely.
		const _buffer = [];
		const _bChars = [];

		var offset = 0;
		/**
		* Returns the Levenshtein edit distance between a and b, to a maximum of
		* `maxDistance`.
		*
		* If the distance is greater than `maxDistance`, returns Infinity.
		*
		* This special optimization speeds up calculations when the desired distance
		* is small and the string lengths aren't -- which is often, in practice.
		*/
		function boundedLevenshtein (a, b, maxDistance) {
		// https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
		// is the simple idea. Then go to
		// https://bitbucket.org/clearer/iosifovich/src/1d27393502137b1ba788822f62f7155eb0c37744/levenshtein.h
		// for the best implementation.

		while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
		offset++;
		}
		// Early optimizations
		if (a === b) return 0
		if (!a.length) return b.length > maxDistance ? Infinity : b.length
		if (!b.length) return a.length > maxDistance ? Infinity : a.length

		la -= offset;
		lb -= offset;
		// Early, redundant but super-fast optimization
		if (Math.abs(a.length - b.length) > maxDistance) {
		return Infinity
		}

		if (la === 0 \|\| lb === 1) {
		return lb;
		}
		// Simplify: ensure a is always the shorter string.
		// This means we can avoid some Math.max, Math.min or Math.abs calls; it
		// also makes fewer outer-loop iterations in the actual Levenshtein part
		// of the algorithm.
		if (a.length > b.length) {
		const t = a;
		a = b;
		b = t;
		}

		var x = 0;
		var y;
		var d0;
		var d1;
		var d2;
		var d3;
		var dd;
		var dy;
		var ay;
		var bx0;
		var bx1;
		var bx2;
		var bx3;
		let aLen = a.length;
		let bLen = b.length;

		var vector = [];
		// Ignore common suffix -- it does not contribute to distance
		const nSuffix = nEqualCharsAtEnd(a, b, aLen);
		aLen -= nSuffix;
		bLen -= nSuffix;

		for (y = 0; y < la; y++) {
		vector.push(y + 1);
		vector.push(a.charCodeAt(offset + y));
		}
		// Exit really quickly if B is just A plus a prefix
		if (aLen === 0) {
		return bLen > maxDistance ? Infinity : bLen
		}

		for (; (x + 3) < lb;) {
		bx0 = b.charCodeAt(offset + (d0 = x));
		bx1 = b.charCodeAt(offset + (d1 = x + 1));
		bx2 = b.charCodeAt(offset + (d2 = x + 2));
		bx3 = b.charCodeAt(offset + (d3 = x + 3));
		dd = (x += 4);
		for (y = 0; y < vector.length; y += 2) {
		dy = vector[y];
		ay = vector[y + 1];
		d0 = _min(dy, d0, d1, bx0, ay);
		d1 = _min(d0, d1, d2, bx1, ay);
		d2 = _min(d1, d2, d3, bx2, ay);
		dd = _min(d2, d3, dd, bx3, ay);
		vector[y] = dd;
		d3 = d2;
		d2 = d1;
		d1 = d0;
		d0 = dy;
		// Slice off matching prefix -- they don't add to distance
		const nPrefix = nEqualCharsAtStart(a, b, aLen);
		aLen -= nPrefix;
		bLen -= nPrefix;

		// Exit really quickly if B is just A plus a prefix and suffix
		if (aLen === 0) {
		return bLen > maxDistance ? Infinity : bLen
		}

		// Run .charCodeAt() once, instead of in the inner loop
		for (let i = 0; i < bLen; i++) {
		_bChars[i] = b.charCodeAt(nPrefix + i);
		}

		// Initialize buffer
		// Unlike in Wikipedia's Levenshtein algorithm, we set buffer to length bLen,
		// not bLen+1. We dont insert the first element because we can infer it from
		// `i` in our main Levenshtein loop
		for (let i = 0; i < bLen; i++) {
		_buffer[i] = i + 1;
		}

		// Idea copied from talisman's max-distance Levenshtein: we can avoid some
		// comparisons that would only ever lead to distance > maxDistance.
		if (maxDistance > bLen) maxDistance = bLen;
		const offset = maxDistance - (bLen - aLen);
		let jStart = 0;
		let jEnd = maxDistance;

		for (let i = 0; i < aLen; i++) {
		let rowMinimum = bLen;
		const ac = a.charCodeAt(nPrefix + i);

		// We don't need to compare the _entire_
		if (i > offset) jStart += 1;
		if (jEnd < bLen) jEnd += 1;

		// Calculate v1 (current distances) from previous row v0
		// First distance is delete (i + 1) chars from a to match empty b
		let current = i;
		let left = i + 1;

		for (let j = jStart; j < jEnd; j++) {
		const bc = _bChars[j];

		const insertDeleteCost = Math.min(left, _buffer[j]) + 1;
		const substituteCost = (ac === bc) ? 0 : 1;

		const d = Math.min(insertDeleteCost, current + substituteCost);

		if (d < rowMinimum) {
		rowMinimum = d;
		}

		current = _buffer[j];
		left = _buffer[j] = d;
		}
		for (; x < lb;) {
		bx0 = b.charCodeAt(offset + (d0 = x));
		dd = ++x;
		for (y = 0; y < vector.length; y += 2) {
		dy = vector[y];
		vector[y] = dd = dy < d0 \|\| dd < d0
		? dy > dd ? dd + 1 : dy + 1
		: bx0 === vector[y + 1]
		? d0
		: d0 + 1;
		d0 = dy;
		}

		if (rowMinimum > maxDistance) {
		// We never _subtract_ from any row values, so it's impossible for the
		// edit distance to be smaller than or equal to maxDistance.
		return Infinity
		}
		}

		return dd;
		};
		})();
		return _buffer[bLen - 1]
		}

		const ret = () => jsLevenshtein;
		var boundedLevenshtein_1 = boundedLevenshtein;

		module.exports = ret;
		function levenshtein(d = Infinity) {
		return (a, b) => boundedLevenshtein_1(a, b, d);
		}

		module.exports = levenshtein;
		//# sourceMappingURL=levenshtein.js.map

package.json

		{
		"name": "clustring",
		"version": "0.0.9",
		"version": "0.0.10",
		"description": "Algorithms for clustering strings",
		@@ -36,2 +36,3 @@ "main": "index.js",
		"babel-jest": "^23.4.2",
		"bounded-levenshtein": "0.0.1",
		"jest": "^23.5.0",
		@@ -50,6 +51,3 @@ "rollup": "^0.64.1",
		]
		},
		"dependencies": {
		"js-levenshtein": "^1.1.3"
		}
		}

README.md

		@@ -63,3 +63,5 @@ clustring

		const clusterer = clusterByKnn(bucket, levenshtein(), 2, { blockSize: 5 })
		// levenshtein(2) is an optimization of levenshtein() that returns 0, 1, 2, or
		// Infinity. You may use levenshtein(), but it's not recommended.
		const clusterer = clusterByKnn(bucket, levenshtein(2), 2, { blockSize: 5 })
		clusterer.cluster()
		@@ -66,0 +68,0 @@ .then(bins => { ... })

src/knn/levenshtein.js

		@@ -1,5 +0,5 @@
		import levenshtein from 'js-levenshtein'
		import boundedLevenshtein from 'bounded-levenshtein'

		const ret = () => levenshtein

		export default ret
		export default function levenshtein (d=Infinity) {
		return (a, b) => boundedLevenshtein(a, b, d)
		}

src/knn/levenshtein.test.js

		import levenshtein from './levenshtein'

		const distance = levenshtein()
		const distance = levenshtein(12)

		@@ -27,2 +27,6 @@ const Tests = [
		}

		it('should work when above maxDistance', () => {
		expect(levenshtein(3)('abcdef', 'zyxwvut')).toEqual(Infinity)
		})
		})

src/main.test.js

		@@ -31,3 +31,3 @@ import { clusterByKey, clusterByKnn } from './main'

		const bins = await clusterByKnn(bucket, levenshtein(), 2).cluster()
		const bins = await clusterByKnn(bucket, levenshtein(2), 2).cluster()
		expect(bins).toEqual([
		@@ -34,0 +34,0 @@ {

knn/levenshtein.js.map

Sorry, the diff of this file is not supported yet

clustring - npm Package Compare versions

Improved metrics

Worsened metrics

Dependency changes