Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

clustring

Package Overview
Dependencies
Maintainers
1
Versions
10
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

clustring - npm Package Compare versions

Comparing version 0.0.2 to 0.0.3

src/knn/clusterByKnn.js

442

index.js

@@ -1,39 +0,425 @@

'use strict';
/**
 * Advance a generator by one step on behalf of an async-to-generator wrapper.
 *
 * Calls `gen[key](arg)` (key is "next" or "throw"). If that call throws, the
 * outer promise is rejected. If the generator is done, the outer promise is
 * resolved with its final value. Otherwise the yielded value is treated as a
 * promise and `_next` / `_throw` are scheduled to continue the generator once
 * it settles.
 *
 * @param {Object} gen - the generator being driven
 * @param {Function} resolve - resolves the outer promise
 * @param {Function} reject - rejects the outer promise
 * @param {Function} _next - continuation invoked with the fulfilled value
 * @param {Function} _throw - continuation invoked with the rejection reason
 * @param {string} key - generator method to invoke
 * @param {*} arg - argument passed to that method
 */
function asyncGeneratorStep(gen, resolve, reject, _next, _throw, key, arg) {
  try {
    var info = gen[key](arg);
    var value = info.value;
  } catch (error) {
    reject(error);
    return;
  }

  // No module bookkeeping belongs here: this file is an ES module, so there is
  // no `exports` object to touch between stepping and settling.
  if (info.done) {
    resolve(value);
  } else {
    Promise.resolve(value).then(_next, _throw);
  }
}
/**
 * Wrap a generator function so that calling the wrapper returns a Promise.
 *
 * The wrapper forwards its `this` and arguments to `fn`, then drives the
 * resulting generator with asyncGeneratorStep: each yielded value is awaited,
 * fulfilled values are sent back in with "next" and rejections are injected
 * with "throw". The returned Promise settles with the generator's return
 * value, or rejects with whatever the generator throws.
 *
 * @param {Function} fn - generator function to adapt
 * @returns {Function} function returning a Promise
 */
function _asyncToGenerator(fn) {
  return function () {
    var self = this,
        args = arguments;
    return new Promise(function (resolve, reject) {
      var gen = fn.apply(self, args);

      function _next(value) {
        asyncGeneratorStep(gen, resolve, reject, _next, _throw, "next", value);
      }

      function _throw(err) {
        asyncGeneratorStep(gen, resolve, reject, _next, _throw, "throw", err);
      }

      // Kick off the generator; the first "next" carries no value.
      _next(undefined);
    });
  };
}
/**
 * Guard used at the top of a transpiled class constructor: throws unless
 * `instance` is an instance of `Constructor`.
 *
 * @param {*} instance - the `this` value the constructor received
 * @param {Function} Constructor - the class being constructed
 * @throws {TypeError} when `instance` is not an instance of `Constructor`
 */
function _classCallCheck(instance, Constructor) {
  var builtWithNew = instance instanceof Constructor;
  if (builtWithNew) {
    return;
  }
  throw new TypeError("Cannot call a class as a function");
}
/**
 * Define each descriptor in `props` on `target` under its `key`.
 *
 * Every descriptor is normalised in place first: `enumerable` falls back to
 * false when falsy, `configurable` is forced to true, and descriptors that
 * carry a `value` are made writable.
 *
 * @param {Object} target - object receiving the properties
 * @param {Array} props - descriptors, each with a `key` field
 */
function _defineProperties(target, props) {
  var index = 0;
  while (index < props.length) {
    var entry = props[index];
    var requestedEnumerable = entry.enumerable;
    entry.enumerable = requestedEnumerable ? requestedEnumerable : false;
    entry.configurable = true;
    if ("value" in entry) {
      entry.writable = true;
    }
    Object.defineProperty(target, entry.key, entry);
    index += 1;
  }
}
/**
 * Attach members to a transpiled class and hand the constructor back.
 *
 * @param {Function} Constructor - the class constructor
 * @param {Array} [protoProps] - descriptors installed on `Constructor.prototype`
 * @param {Array} [staticProps] - descriptors installed on `Constructor` itself
 * @returns {Function} the same `Constructor`
 */
function _createClass(Constructor, protoProps, staticProps) {
  // Instance members first, then statics; either list may be omitted.
  if (protoProps) {
    _defineProperties(Constructor.prototype, protoProps);
  }

  if (staticProps) {
    _defineProperties(Constructor, staticProps);
  }

  return Constructor;
}
/**
 * Set `obj[key]` to `value` and return `obj`.
 *
 * When `key` is not yet visible on `obj` a plain assignment is used. When it
 * already is (own or inherited), an own enumerable, configurable, writable
 * data property is defined instead.
 *
 * @param {Object} obj - object to modify
 * @param {string|symbol} key - property name
 * @param {*} value - value to store
 * @returns {Object} `obj`
 */
function _defineProperty(obj, key, value) {
  var alreadyVisible = key in obj;

  if (!alreadyVisible) {
    obj[key] = value;
    return obj;
  }

  var descriptor = {
    value: value,
    enumerable: true,
    configurable: true,
    writable: true
  };
  Object.defineProperty(obj, key, descriptor);
  return obj;
}
/**
 * Copy the own enumerable properties (string keys and, where supported,
 * symbol keys) of every argument after the first onto `target`.
 *
 * `null` and `undefined` sources are treated as empty objects. Later sources
 * win over earlier ones because each key is written with _defineProperty in
 * argument order.
 *
 * @param {Object} target - object that receives the properties (mutated)
 * @returns {Object} `target`
 */
function _objectSpread(target) {
  for (var i = 1; i < arguments.length; i++) {
    var source = arguments[i] != null ? arguments[i] : {};
    var ownKeys = Object.keys(source);

    // Object.keys omits symbols; add the enumerable ones when the runtime
    // exposes them.
    if (typeof Object.getOwnPropertySymbols === 'function') {
      ownKeys = ownKeys.concat(Object.getOwnPropertySymbols(source).filter(function (sym) {
        return Object.getOwnPropertyDescriptor(source, sym).enumerable;
      }));
    }

    ownKeys.forEach(function (key) {
      _defineProperty(target, key, source[key]);
    });
  }

  return target;
}
exports.clusterByKey = clusterByKey;
/**
 * Yield to the event loop once.
 *
 * Returns a Promise that is resolved from a zero-delay `setTimeout`, so
 * awaiting it inside an async function lets other queued work run first:
 *
 *     // ... slow stuff ...
 *     await tick() // cedes control
 *     // ... slow stuff ...
 *
 * @returns {Promise} resolved (with no value) by the timer callback
 */
function tick() {
  function resolveFromTimer(done) {
    setTimeout(done, 0);
  }

  return new Promise(resolveFromTimer);
}
/**
 * Groups the strings of a bucket into bins that share the same key, pausing
 * periodically so the event loop can run.
 *
 * Instance fields:
 *   - bucket, keyer, options: stored as passed to the constructor
 *   - progress: 0 initially, updated before each pause, 1 when finished
 *   - canceled: set to true by cancel()
 *
 * This is transpiled (regenerator) output; the numbered `case` labels are the
 * resume points of the original async function and must stay in step with the
 * `_context.next` assignments.
 */
var KeyClusterer =
/*#__PURE__*/
function () {
  /**
   * @param {Object} bucket - maps each string to its count
   * @param {Function} keyer - called with a string; its result is the bin key
   * @param {Object} options - read for `tickMs` and `nIterationsBetweenTickChecks`
   */
  function KeyClusterer(bucket, keyer, options) {
    _classCallCheck(this, KeyClusterer);

    this.bucket = bucket;
    this.keyer = keyer;
    this.options = options;
    this.progress = 0;
    this.canceled = false;
  }

  _createClass(KeyClusterer, [{
    key: "cancel",
    // Request cancellation; cluster() notices the flag after its next pause.
    value: function cancel() {
      this.canceled = true;
    }
  }, {
    key: "cluster",
    // Resolves with the bins holding more than one string; rejects with
    // Error('canceled') if cancel() was called while paused.
    value: function () {
      var _cluster = _asyncToGenerator(
      /*#__PURE__*/
      regeneratorRuntime.mark(function _callee() {
        var bucket, keyer, _this$options, tickMs, nIterationsBetweenTickChecks, bins, keyToBin, i, t1, strs, _i, str, t2, count, key, bin, maxCount;

        return regeneratorRuntime.wrap(function _callee$(_context) {
          while (1) {
            switch (_context.prev = _context.next) {
              case 0:
                bucket = this.bucket, keyer = this.keyer;
                _this$options = this.options, tickMs = _this$options.tickMs, nIterationsBetweenTickChecks = _this$options.nIterationsBetweenTickChecks;
                bins = [];
                // Prototype-less lookup table: keys come from keyer() and may
                // equal names such as "constructor" or "toString". With a
                // plain {} those would read back an inherited member instead
                // of undefined and be mistaken for an existing bin.
                keyToBin = Object.create(null);
                i = 0;
                t1 = new Date();
                strs = Object.keys(bucket);
                _i = 0;

              case 8:
                // Loop head: iterate over every string in the bucket.
                if (!(_i < strs.length)) {
                  _context.next = 29;
                  break;
                }

                str = strs[_i];
                i += 1;

                // Only look at the clock when the masked iteration counter is
                // zero, to keep the per-iteration overhead low.
                if (!((i & nIterationsBetweenTickChecks) === 0)) {
                  _context.next = 20;
                  break;
                }

                t2 = new Date();

                if (!(t2 - t1 >= tickMs)) {
                  _context.next = 20;
                  break;
                }

                this.progress = (i - 1) / strs.length;
                _context.next = 17;
                return tick();

              case 17:
                // Back from the pause: this is the only point where another
                // caller could have set the canceled flag.
                if (!this.canceled) {
                  _context.next = 19;
                  break;
                }

                throw new Error('canceled');

              case 19:
                t1 = new Date();

              case 20:
                count = bucket[str];
                key = keyer(str);
                bin = keyToBin[key];

                if (!bin) {
                  bin = {
                    key: key,
                    name: str,
                    count: 0,
                    bucket: {}
                  };
                  keyToBin[key] = bin;
                  bins.push(bin);
                } else {
                  // Maybe change name. We do it in this loop so we're O(n)
                  maxCount = bin.bucket[bin.name];

                  // Highest count wins; ties go to the string that sorts
                  // first under localeCompare.
                  if (count > maxCount || count === maxCount && str.localeCompare(bin.name) < 0) {
                    bin.name = str;
                  }
                }

                bin.count += count;
                bin.bucket[str] = count;

              case 26:
                _i++;
                _context.next = 8;
                break;

              case 29:
                this.progress = 1;
                // Bins with a single string are not clusters; drop them.
                return _context.abrupt("return", bins.filter(function (b) {
                  return Object.keys(b.bucket).length > 1;
                }));

              case 31:
              case "end":
                return _context.stop();
            }
          }
        }, _callee, this);
      }));

      return function cluster() {
        return _cluster.apply(this, arguments);
      };
    }()
  }]);

  return KeyClusterer;
}();
/**
 * Build a KeyClusterer for `bucket` using `keyer`.
 *
 * An optional third argument supplies option overrides; it is merged over the
 * defaults `tickMs: 8` and `nIterationsBetweenTickChecks: 0xfff`.
 *
 * @param {Object} bucket - passed through to KeyClusterer
 * @param {Function} keyer - passed through to KeyClusterer
 * @returns {KeyClusterer} a new clusterer
 */
function clusterByKey(bucket, keyer) {
  var overrides = arguments[2];

  if (overrides === undefined) {
    overrides = {};
  }

  var defaults = {
    tickMs: 8,
    nIterationsBetweenTickChecks: 0xfff
  };
  var merged = _objectSpread(defaults, overrides);
  return new KeyClusterer(bucket, keyer, merged);
}
/**
 * Groups the strings of a bucket by pairwise distance, pausing periodically
 * so the event loop can run.
 *
 * For every ordered pair (a, b) of distinct strings, `distance(a, b)` is
 * computed unless b is already in a's bin or a is already in b's bin. When
 * the result is <= `radius`, b is added to the bin that belongs to a.
 *
 * Instance fields:
 *   - bucket, distance, radius, options: stored as passed to the constructor
 *   - progress: 0 initially, updated before each pause, 1 when finished
 *   - canceled: set to true by cancel()
 *
 * This is transpiled (regenerator) output; the numbered `case` labels are the
 * resume points of the original async function and must stay in step with the
 * `_context.next` assignments and the "continue" targets.
 */
var KnnClusterer =
/*#__PURE__*/
function () {
  /**
   * @param {Object} bucket - maps each string to its count
   * @param {Function} distance - called with two strings, returns a number
   * @param {number} radius - largest distance that still joins two strings
   * @param {Object} options - read for `tickMs` and `nIterationsBetweenTickChecks`
   */
  function KnnClusterer(bucket, distance, radius, options) {
    _classCallCheck(this, KnnClusterer);

    this.bucket = bucket;
    this.distance = distance;
    this.radius = radius;
    this.options = options;
    this.progress = 0;
    this.canceled = false;
  }

  _createClass(KnnClusterer, [{
    key: "cancel",
    // Request cancellation; cluster() notices the flag after its next pause.
    value: function cancel() {
      this.canceled = true;
    }
  }, {
    key: "cluster",
    // Resolves with every bin that was created; rejects with
    // Error('canceled') if cancel() was called while paused.
    value: function () {
      var _cluster = _asyncToGenerator(
      /*#__PURE__*/
      regeneratorRuntime.mark(function _callee() {
        var bucket, distance, radius, _this$options, tickMs, nIterationsBetweenTickChecks, strs, bins, strToBin, hasOwn, t1, i, _i, a, aCount, _i2, b, t2, d, bCount, bin, maxCount;

        return regeneratorRuntime.wrap(function _callee$(_context) {
          while (1) {
            switch (_context.prev = _context.next) {
              case 0:
                bucket = this.bucket, distance = this.distance, radius = this.radius;
                _this$options = this.options, tickMs = _this$options.tickMs, nIterationsBetweenTickChecks = _this$options.nIterationsBetweenTickChecks;
                strs = Object.keys(bucket);
                bins = [];
                // a => bin, only if buckets has >1 element.
                // Prototype-less so that strings such as "constructor" or
                // "toString" are not mistaken for existing entries through
                // members inherited from Object.prototype.
                strToBin = Object.create(null);
                // Bin buckets are plain objects that get returned to the
                // caller, so membership is tested with an own-property check
                // rather than `in`.
                hasOwn = Object.prototype.hasOwnProperty;
                t1 = new Date();
                i = 0;
                _i = 0;

              case 8:
                // Outer loop head: a ranges over every string.
                if (!(_i < strs.length)) {
                  _context.next = 38;
                  break;
                }

                a = strs[_i];
                aCount = bucket[a];
                _i2 = 0;

              case 12:
                // Inner loop head: b ranges over every string.
                if (!(_i2 < strs.length)) {
                  _context.next = 35;
                  break;
                }

                b = strs[_i2];
                i += 1;

                // Only look at the clock when the masked iteration counter is
                // zero, to keep the per-iteration overhead low.
                if (!((i & nIterationsBetweenTickChecks) === 0)) {
                  _context.next = 24;
                  break;
                }

                t2 = new Date();

                if (!(t2 - t1 >= tickMs)) {
                  _context.next = 24;
                  break;
                }

                this.progress = (i - 1) / (strs.length * strs.length);
                _context.next = 21;
                return tick();

              case 21:
                // Back from the pause: this is the only point where another
                // caller could have set the canceled flag.
                if (!this.canceled) {
                  _context.next = 23;
                  break;
                }

                throw new Error('canceled');

              case 23:
                t1 = new Date();

              case 24:
                // A string is never compared with itself.
                if (!(a === b)) {
                  _context.next = 26;
                  break;
                }

                return _context.abrupt("continue", 32);

              case 26:
                // Skip the distance call when b is already in a's bin.
                if (!(a in strToBin && hasOwn.call(strToBin[a].bucket, b))) {
                  _context.next = 28;
                  break;
                }

                return _context.abrupt("continue", 32);

              case 28:
                // Skip the distance call when a is already in b's bin.
                if (!(b in strToBin && hasOwn.call(strToBin[b].bucket, a))) {
                  _context.next = 30;
                  break;
                }

                return _context.abrupt("continue", 32);

              case 30:
                d = distance(a, b);

                if (d <= radius) {
                  bCount = bucket[b];
                  bin = strToBin[a];

                  if (!bin) {
                    // First neighbour found for a: create its bin seeded with
                    // a itself.
                    bin = {
                      name: a,
                      count: aCount,
                      bucket: _defineProperty({}, a, aCount)
                    };
                    strToBin[a] = bin;
                    bins.push(bin);
                  }

                  maxCount = bin.bucket[bin.name];

                  // Highest count wins; ties go to the string that sorts
                  // first under localeCompare.
                  if (bCount > maxCount || bCount === maxCount && b.localeCompare(bin.name) < 0) {
                    bin.name = b;
                  }

                  bin.count += bCount;
                  bin.bucket[b] = bCount;
                }

              case 32:
                _i2++;
                _context.next = 12;
                break;

              case 35:
                _i++;
                _context.next = 8;
                break;

              case 38:
                this.progress = 1;
                return _context.abrupt("return", bins);

              case 40:
              case "end":
                return _context.stop();
            }
          }
        }, _callee, this);
      }));

      return function cluster() {
        return _cluster.apply(this, arguments);
      };
    }()
  }]);

  return KnnClusterer;
}();
/**
 * Build a KnnClusterer for `bucket` using `distance` and `radius`.
 *
 * An optional fourth argument supplies option overrides; it is merged over
 * the defaults `tickMs: 8` and `nIterationsBetweenTickChecks: 0xfff`.
 *
 * @param {Object} bucket - passed through to KnnClusterer
 * @param {Function} distance - passed through to KnnClusterer
 * @param {number} radius - passed through to KnnClusterer
 * @returns {KnnClusterer} a new clusterer
 */
function clusterByKnn(bucket, distance, radius) {
  var overrides = arguments[3];

  if (overrides === undefined) {
    overrides = {};
  }

  var defaults = {
    tickMs: 8,
    nIterationsBetweenTickChecks: 0xfff
  };
  var merged = _objectSpread(defaults, overrides);
  return new KnnClusterer(bucket, distance, radius, merged);
}
export { clusterByKey, clusterByKnn };
//# sourceMappingURL=index.js.map

10

package.json
{
"name": "clustring",
"version": "0.0.2",
"version": "0.0.3",
"description": "Algorithms for clustering strings",

@@ -8,2 +8,3 @@ "main": "index.js",

"build": "rollup -c",
"prepublish": "npm run build",
"test": "jest"

@@ -36,3 +37,5 @@ },

"rollup": "^0.64.1",
"rollup-plugin-babel": "^4.0.0-beta.8"
"rollup-plugin-babel": "^4.0.0-beta.8",
"rollup-plugin-commonjs": "^9.1.5",
"rollup-plugin-node-resolve": "^3.3.0"
},

@@ -46,3 +49,6 @@ "jest": {

]
},
"dependencies": {
"js-levenshtein": "^1.1.3"
}
}

@@ -31,3 +31,6 @@ clustring

const bins = clusterByKey(bucket, fingerprint())
const clusterer = clusterByKey(bucket, fingerprint())
clusterer.cluster()
.then(bins => { ... })
// bins is:

@@ -40,8 +43,2 @@ // [

// "bucket": { "commonWord": 3, "CommonWord": 20 }
// },
// {
// "name": "SuperRareWord",
// "key": "superrareword",
// "count": 1,
// "bucket": { "SuperRareWord": 1}
// }

@@ -60,3 +57,3 @@ // ]

characters. (Effectively, this skips comparisons by assuming infinite distance
if there is no such sequence).
if there is no such sequence). **TODO: implement this**

@@ -69,6 +66,49 @@ Here's some sample code:

const bins = clusterByKnn(bucket, levenshtein(2), { blockSize: 5 })
const clusterer = clusterByKnn(bucket, levenshtein(), 2, { blockSize: 5 })
clusterer.cluster()
.then(bins => { ... })
// bins will be same as in previous example.
```
Progress reporting
------------------
`cluster()` returns a
[Promise](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise)
immediately and processes in the background (in the current thread). It cedes
control to the event loop every few milliseconds so your app remains
responsive.
To track progress, try something like this:
```javascript
const clusterer = clusterByKey(bucket, fingerprint(), { tickMs: 8 })
let timeout = null
function reportProgressAndReschedule () {
console.log('Progress: ', clusterer.progress)
timeout = setTimeout(reportProgressAndReschedule, 1)
}
// start progress-report loop
timeout = setTimeout(reportProgressAndReschedule, 1)
clusterer.cluster()
.then(bins => {
clearTimeout(timeout)
// ... handle bins
})
```
During `cluster()`, clustring will periodically check whether it has blocked
the main thread for more than `tickMs` milliseconds. If it has, it will cede
control to the event loop for one event-loop "tick" before resuming. Your
`setTimeout()` callback will only be called once `cluster()` cedes control,
even though it requests to be called after `1` millisecond.
Cancellation
------------
If you wish to stop clustering, run `clusterer.cancel()`. The Promise returned
by `cluster()` will then reject with an `Error` whose message is `canceled`.
Of course, you can only execute `clusterer.cancel()` during a tick, so consider
your `tickMs`.
Developing

@@ -75,0 +115,0 @@ ==========

import babel from 'rollup-plugin-babel'
import commonjs from 'rollup-plugin-commonjs'
import nodeResolve from 'rollup-plugin-node-resolve'
import pkg from './package.json'

@@ -7,3 +9,5 @@

exclude: ['node_modules/**']
})
}),
nodeResolve(),
commonjs()
]

@@ -17,3 +21,4 @@

name: 'clustring',
format: 'cjs',
format: 'es',
sourcemap: true,
file: 'index.js'

@@ -24,10 +29,11 @@ }

for (const keyFunction of [ 'fingerprint' ]) {
for (const fn of [ 'key/fingerprint', 'knn/levenshtein' ]) {
entries.push({
input: `src/key/${keyFunction}.js`,
input: `src/${fn}.js`,
plugins,
output: {
name: `key/${keyFunction}.js`,
format: 'cjs',
file: `key/${keyFunction}.js`
name: `${fn}.js`,
format: 'es',
sourcemap: true,
file: `${fn}.js`
}

@@ -34,0 +40,0 @@ })

@@ -1,31 +0,85 @@

export default function clusterByKey (bucket, keyer) {
const bins = []
const keyToBin = {}
import { tick } from '../util'
for (const str in bucket) {
const count = bucket[str]
const key = keyer(str)
class KeyClusterer {
constructor (bucket, keyer, options) {
this.bucket = bucket
this.keyer = keyer
this.options = options
let bin = keyToBin[key]
if (!bin) {
bin = {
key: key,
name: str,
count: 0,
bucket: {}
this.progress = 0
this.canceled = false
}
cancel () {
this.canceled = true
}
async cluster () {
const { bucket, keyer } = this
const { tickMs, nIterationsBetweenTickChecks } = this.options
const bins = []
const keyToBin = {}
let i = 0
let t1 = new Date()
const strs = Object.keys(bucket)
for (const str of strs) {
i += 1
if ((i & nIterationsBetweenTickChecks) === 0) {
const t2 = new Date()
if (t2 - t1 >= tickMs) {
this.progress = (i - 1) / strs.length
await tick()
// We can only be canceled while we aren't executing. So now that
// we're back from our tick is the only time we need to check.
if (this.canceled) {
throw new Error('canceled')
}
t1 = new Date()
}
}
keyToBin[key] = bin
bins.push(bin)
} else {
// Maybe change name. We do it in this loop so we're O(n)
const maxCount = bin.bucket[bin.name]
if (count > maxCount || (count === maxCount && str.localeCompare(bin.name) < 0)) {
bin.name = str
const count = bucket[str]
const key = keyer(str)
let bin = keyToBin[key]
if (!bin) {
bin = {
key: key,
name: str,
count: 0,
bucket: {}
}
keyToBin[key] = bin
bins.push(bin)
} else {
// Maybe change name. We do it in this loop so we're O(n)
const maxCount = bin.bucket[bin.name]
if (count > maxCount || (count === maxCount && str.localeCompare(bin.name) < 0)) {
bin.name = str
}
}
bin.count += count
bin.bucket[str] = count
}
bin.count += count
bin.bucket[str] = count
this.progress = 1
return bins
.filter(b => Object.keys(b.bucket).length > 1)
}
}
return bins
/**
 * Create a KeyClusterer for `bucket` and `keyer`.
 *
 * Caller-supplied `options` are layered over the defaults below.
 *
 * @param {Object} bucket - passed through to KeyClusterer
 * @param {Function} keyer - passed through to KeyClusterer
 * @param {Object} [options] - overrides for `tickMs` / `nIterationsBetweenTickChecks`
 * @returns {KeyClusterer} a new clusterer
 */
export default function clusterByKey (bucket, keyer, options={}) {
  const defaults = {
    tickMs: 8,
    // must be power of two, minus one
    nIterationsBetweenTickChecks: 0xfff
  }
  const merged = Object.assign(defaults, options)

  return new KeyClusterer(bucket, keyer, merged)
}
import clusterByKey from './clusterByKey'
describe('clusterByKey', () => {
it('should cluster by key', () => {
it('should cluster by key', async () => {
const bucket = {

@@ -13,11 +13,5 @@ a: 3,

const result = clusterByKey(bucket, fn)
expect(result.sort((x, y) => x.count - y.count)).toEqual([
const result = await clusterByKey(bucket, fn).cluster()
expect(result).toEqual([
{
name: 'a',
key: 'x',
count: 3,
bucket: { a: 3 }
},
{
key: 'y',

@@ -31,3 +25,3 @@ name: 'b',

it('should pick highest-count name', () => {
it('should pick highest-count name', async () => {
const bucket = {

@@ -40,7 +34,7 @@ a: 3,

const result = clusterByKey(bucket, fn)
const result = await clusterByKey(bucket, fn).cluster()
expect(result[0].name).toEqual('b')
})
it('should localeCompare if counts are equal', () => {
it('should localeCompare if counts are equal', async () => {
const bucket = {

@@ -54,5 +48,42 @@ a: 3,

const result = clusterByKey(bucket, fn)
const result = await clusterByKey(bucket, fn).cluster()
expect(result[0].name).toEqual('a')
})
it('should cede control to the main loop after too many iterations', async () => {
const bucket = {
a: 3,
c: 3,
b: 3
}
const fn = _ => 'x'
let progressReport = null
setTimeout((() => progressReport = clusterer.progress), 0)
const clusterer = clusterByKey(bucket, fn, {
tickMs: 0,
nIterationsBetweenTickChecks: 0x1
})
await clusterer.cluster()
expect(progressReport).toEqual(1.0 / 3)
expect(clusterer.progress).toEqual(1)
})
it('should cancel', async () => {
const bucket = {
a: 3,
c: 3,
b: 3
}
const fn = _ => 'x'
const clusterer = clusterByKey(bucket, fn, {
tickMs: 0,
nIterationsBetweenTickChecks: 0x1
})
setTimeout((() => clusterer.cancel()), 0)
await expect(clusterer.cluster()).rejects.toThrow('canceled')
})
})
import clusterByKey from './key/clusterByKey'
import clusterByKnn from './knn/clusterByKnn'
export { clusterByKey }
export { clusterByKey, clusterByKnn }

@@ -1,6 +0,7 @@

import { clusterByKey } from './main'
import { clusterByKey, clusterByKnn } from './main'
import fingerprint from './key/fingerprint'
import levenshtein from './knn/levenshtein'
describe('main.js', () => {
it('should handle README clusterByKey example', () => {
it('should handle README clusterByKey example', async () => {
const bucket = {

@@ -12,4 +13,4 @@ "commonWord": 3,

const bins = clusterByKey(bucket, fingerprint())
expect(bins.sort((a, b) => a.key.localeCompare(b.key))).toEqual([
const bins = await clusterByKey(bucket, fingerprint()).cluster()
expect(bins).toEqual([
{

@@ -20,8 +21,19 @@ "name": "CommonWord",

"bucket": { "commonWord": 3, "CommonWord": 20 }
},
}
])
})
it('should handle README clusterByKnn example', async () => {
const bucket = {
"commonWord": 3,
"CommonWord": 20,
"SuperRareWord": 1
}
const bins = await clusterByKnn(bucket, levenshtein(), 2).cluster()
expect(bins).toEqual([
{
"name": "SuperRareWord",
"key": "superrareword",
"count": 1,
"bucket": { "SuperRareWord": 1}
"name": "CommonWord",
"count": 23,
"bucket": { "commonWord": 3, "CommonWord": 20 }
}

@@ -28,0 +40,0 @@ ])

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc