
tdigest

Dependencies: 0 · Maintainers: 1 · Versions: 11
Comparing version 0.0.0 to 0.0.1

.travis.yml


package.json
{
  "name": "tdigest",
-  "version": "0.0.0",
+  "version": "0.0.1",
  "description": "javascript implementation of Dunning's T-Digest for streaming quantile approximation",
  "main": "tdigest.js",
  "scripts": {
-    "test": "mocha -b test.js"
+    "test": "mocha test.js"
  },

@@ -30,4 +30,5 @@ "repository": {

  "devDependencies": {
-    "better-assert": "^1.0.2"
+    "better-assert": "^1.0.2",
+    "mocha": "^2.2.5"
  }
}
# tdigest
- tdigest: javascript implementation of Dunning's T-Digest for streaming quantile approximation
+ Javascript implementation of Dunning's T-Digest for streaming quantile approximation
The T-Digest is a data structure and algorithm for constructing an
approximate distribution for a set of real numbers presented as a
stream. The algorithm makes no guarantees, but behaves well enough in
practice that implementations have been included in Apache Mahout and
ElasticSearch for computing fast summaries and approximate order
statistics over a stream.
For a pleasant overview of T-Digest's behavior, see Davidson-Pilon's
[blog post](http://dataorigami.net/blogs/napkin-folding/19055451-percentile-and-quantile-estimation-of-big-data-the-t-digest) regarding a python implementation. For more details,
there are the [tdigest paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) and [reference implementation](https://github.com/tdunning/t-digest) (Java).
This javascript implementation is based on a reading of the paper.
## Example
```
var TDigest = require('tdigest').TDigest;
var x=[], N = 100000;
for (var i = 0 ; i < N ; i += 1) {
    x.push(Math.random() * 10 - 5);
}
var tdigest = new TDigest();
tdigest.digest(x);
console.log(tdigest.summary());
console.log("median ~ "+tdigest.percentile(0.5));
```
See also [example.js](https://github.com/welch/tdigest/blob/master/example.js) in this package.
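The digest also answers the inverse question: `quantile(x)` returns the approximate cumulative fraction of samples at or below `x` (this is how the tests below exercise it). A minimal sketch, assuming the 0.0.1 API shown in this comparison:

```
var TDigest = require('tdigest').TDigest;
var td = new TDigest();
for (var i = 0; i < 100000; i += 1) {
    td.digest(Math.random());           // uniform samples on [0, 1)
}
console.log("P(x <= 0.5) ~ " + td.quantile(0.5));        // roughly 0.5
console.log("95th percentile ~ " + td.percentile(0.95)); // roughly 0.95
```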
## Dependencies
`bintrees`: package.json specifies a fork of [https://github.com/vadimg/js_bintrees](https://github.com/vadimg/js_bintrees)
that corrects a tree-traversal bug. You'll need it until PR#14 is merged.

tdigest.js
var RBTree = require('bintrees').RBTree;
function TDigest(delta, K) {
function TDigest(delta, K, CX) {
// allocate a TDigest structure.

@@ -13,4 +13,9 @@ //

//
// CX specifies how often to update cached cumulative totals used
// for quantile estimation during ingest (see cumulate()). Set to
// 0 to use exact quantiles for each new point.
//
this.delta = (delta === undefined) ? 0.01 : delta;
this.K = (K === undefined) ? 25 : K;
this.CX = (CX === undefined) ? 1.1 : CX;
this.centroids = new RBTree(compare_centroids);

@@ -21,2 +26,13 @@ this.nreset = 0;

TDigest.prototype.summary = function() {
var s = ["approximating "+this.n+" samples using "+
this.size()+" centroids, delta="+this.delta+", CX="+this.CX,
"min = "+this.percentile(0),
"Q1 = "+this.percentile(0.25),
"Q2 = "+this.percentile(0.5),
"Q3 = "+this.percentile(0.75),
"max = "+this.percentile(1.0)];
return s.join('\n');
};
TDigest.prototype.reset = function() {

@@ -28,4 +44,9 @@ // prepare to digest new points.

this.nreset += 1;
this.last_cumulate = 0;
};
TDigest.prototype.size = function() {
return this.centroids.size;
};
function compare_centroids(a, b) {

@@ -36,5 +57,5 @@ // order two centroids by mean.

return (a.mean - b.mean);
} else if (a.id !== undefined && b.id !== undefined) {
// RBtrees can't accommodate duplicates, use id as tiebreaker.
return (a.id - b.id);
} else if (a.cumn !== undefined && b.cumn !== undefined) {
// RBtrees can't accommodate duplicates, use cumn as tiebreaker.
return (a.cumn - b.cumn);
} else {

@@ -58,2 +79,3 @@ return 0 ; // equality is ok during searches

if (everything) {
this._cumulate(true); // be sure cumns are exact
this.centroids.each(function(c) { result.push(c); });

@@ -66,13 +88,36 @@ } else {

TDigest.prototype.insert = function(x, n) {
// insert a new centroid into the digest
TDigest.prototype.digest = function(x, n) {
// incorporate value or array of values x, having count n into the
// TDigest. n defaults to 1. If called with an array, recompress after
// digesting the items.
n = n || 1;
if (Array.isArray(x)) {
for (var i = 0 ; i < x.length ; i++) {
this._digest(x[i], n);
}
this.redigest();
} else {
this._digest(x, n);
}
};
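In 0.0.1, `digest()` accepts either a single value (with an optional count) or an array of values, recompressing via `redigest()` after an array, and the constructor gains the `CX` argument described above. A usage sketch based on those signatures (the values are illustrative):

```
var TDigest = require('tdigest').TDigest;

// delta = 0.01, K = 25, CX = 1.1 are the constructor defaults;
// CX = 0 keeps cumulative counts exact for every ingested point.
var td = new TDigest(0.01, 25, 0);

td.digest(42);            // one value, count defaults to 1
td.digest(42, 3);         // one value with count 3
td.digest([1, 2, 3, 4]);  // an array: each item digested, then recompressed
console.log(td.summary());
```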
TDigest.prototype._cumulate = function(exact) {
// update cumulative counts for each centroid
//
this.centroids.insert({mean:x, n:n, id:this.n});
this.n = 0;
var self = this;
// exact: falsey means only cumulate after sufficient
// growth. During ingest, these counts are used as quantile
// estimates, and they work well even when somewhat out of
// date. (this is a departure from the publication, you may set CX
// to 0 to disable).
//
if (this.n === this.last_cumulate ||
!exact && this.CX && this.CX > (this.n / this.last_cumulate)) {
return;
}
var cumn = 0;
this.centroids.each(function(c) {
c.cumn = self.n + c.n/2 ; // at the mean, we've accumulated half the n.
self.n += c.n;
c.cumn = cumn + c.n/2 ; // at the mean, we've accumulated half the n.
cumn += c.n;
});
this.n = this.last_cumulate = cumn;
};
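The `CX` test above defers recomputing cumulative counts until the total count has grown by the factor `CX` since the last pass (an exact pass, or a `CX` of 0, always recomputes). A standalone sketch of that gate; the helper name is illustrative, not part of the package:

```
function shouldCumulate(n, lastCumulate, CX, exact) {
    if (n === lastCumulate) {
        return false;                 // nothing new since the last pass
    }
    // exact passes, CX === 0, or growth beyond the CX factor trigger a recount.
    return exact || !CX || CX <= (n / lastCumulate);
}

console.log(shouldCumulate(100, 100, 1.1, false)); // false: no growth
console.log(shouldCumulate(105, 100, 1.1, false)); // false: only 5% growth
console.log(shouldCumulate(111, 100, 1.1, false)); // true: grew past 10%
console.log(shouldCumulate(105, 100, 1.1, true));  // true: exact pass requested
```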

@@ -82,10 +127,15 @@

// find the centroid(s) whose means are closest to x
// (there can be multiples because of per-centroid count limits)
// return a list of centroids.
// (there can be multiples because of per-centroid count limits).
//
var iter = this.centroids.upperBound({mean:x});
var prev = (iter.data() === null) ? iter.prev() : iter.data();
var iter = this.centroids.upperBound({mean:x}); // x < iter || iter==null
var upper = iter.data();
if (upper !== null) {
// jump to the end of string of duplicates
iter = this.centroids.upperBound({mean:upper.mean});
}
var prev = (iter.data() === null) ? iter.prev() : iter.data();
if (prev === null) {
return [];
}
// walk backwards looking for closer centroids
var min = Math.abs(prev.mean - x);

@@ -104,24 +154,73 @@ var nearest = [prev];

TDigest.prototype.digest = function(x, n) {
// incorporate value x having count n into the TDigest. n defaults to 1
TDigest.prototype._new_centroid = function(x, n, cumn) {
// create and insert a new centroid into the digest (don't update
// cumulatives).
//
// XXX for inserting, cumn needs to be a unique tiebreaker (the
// RBTree implementation doesn't accept duplicates). After
// inserting, set cumn to its given value since we never rely on
// cumn order among equivalent means.
//
var c = {mean:x, n:n, cumn:this.n};
this.centroids.insert(c);
c.cumn = cumn;
this.n += n;
};
TDigest.prototype._addweight = function(c, x, n, inplace) {
// add weight at location x to centroid c
//
var newmean = c.mean + n * (x - c.mean) / (c.n + n);
if (inplace) {
c.mean = newmean;
c.n += n;
c.cumn += n / 2;
this.n += n;
} else {
this.centroids.remove(c);
this.n -= n;
this._new_centroid(newmean, c.n + n, c.cumn + n / 2);
}
};
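`_addweight` folds `n` points at location `x` into centroid `c` with an incremental weighted mean. A quick numeric check of the update formula:

```
// A centroid at mean 10 with count 3 absorbs 2 points at x = 16.
var c = {mean: 10, n: 3};
var x = 16, n = 2;
var newmean = c.mean + n * (x - c.mean) / (c.n + n);
console.log(newmean); // 12.4, the same as (3*10 + 2*16) / 5
```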
TDigest.prototype._centroid_quantile = function(c) {
// quick-and-dirty quantile estimate for centroid's mean.
//
if (this.size() === 0) {
return NaN;
} else if (c === this.centroids.min()) {
return 0.0;
} else if (c === this.centroids.max()) {
return 1.0;
} else {
return c.cumn / this.n;
}
};
TDigest.prototype._digest = function(x, n) {
// incorporate value x, having count n into the TDigest.
//
var nearest = this.find_nearest(x);
n = n || 1;
var inplace = (nearest.length <= 1 || nearest[0].mean === x);
var cumn = (nearest.length > 0) ? nearest[0].cumn : 0;
while (nearest.length > 0 && n > 0) {
var c = pop_random(nearest);
var q = this.quantile(c.mean);
var room = 4 * this.n * this.delta * q * (1 - q) - c.n;
// even though all c's are the same distance, they have different
// cumn's. get a fresh q, it affects boundary preservation
var q = (cumn > 0) ? this._centroid_quantile(c) : 0;
var max_n = Math.floor(4 * this.n * this.delta * q * (1 - q));
var room = max_n - c.n;
if (room <= 0) {
continue;
}
var dn = (room > n) ? n : room;
c.n += dn;
c.mean += dn * (x - c.mean) / c.n;
var dn = Math.min(n, room);
this._addweight(c, x, dn, inplace);
n -= dn;
}
if (n > 0) {
// create a new centroid at x for the undigested counts
this.insert(x, n);
// create a new centroid at x for the undigested counts.
this._new_centroid(x, n, cumn + n / 2); // approximate cumn
}
if (this.K && this.centroids.size > this.K / this.delta) {
this._cumulate(false);
if (this.K && this.size() > this.K / this.delta) {
// re-process the centroids and hope for some compression.
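The capacity bound in `_digest`, `max_n = floor(4 * n * delta * q * (1 - q))`, lets centroids near the median absorb far more weight than centroids near the tails. A small illustration with assumed values `n = 10000` and `delta = 0.01`:

```
var n = 10000, delta = 0.01;
[0.01, 0.25, 0.5, 0.99].forEach(function(q) {
    var max_n = Math.floor(4 * n * delta * q * (1 - q));
    console.log("q = " + q + " -> max_n = " + max_n);
});
// Prints 3, 75, 100, 3: tail centroids stay tiny, which preserves accuracy
// at extreme quantiles, while mid-range centroids compress hard.
```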

@@ -138,3 +237,3 @@ this.redigest();

//
if (this.centroids.size === 0) {
if (this.size() === 0) {
return NaN;

@@ -146,5 +245,6 @@ } else if (x <= this.centroids.min().mean) {

}
this._cumulate(true); // be sure cumns are exact
// find centroids that bracket x and interpolate x's cumn from
// their cumn's.
var iter = this.centroids.upperBound({mean:x});
var iter = this.centroids.upperBound({mean:x}); // x.mean < iter
var c = iter.prev(); // c.mean <= x.mean

@@ -170,3 +270,3 @@ var cnext = iter.next() ; // x.mean < cnext.mean

//
if (this.centroids.size === 0) {
if (this.size() === 0) {
return NaN;

@@ -178,2 +278,3 @@ } else if (p <= 0) {

}
this._cumulate(true); // be sure cumns are exact
var target = p * (this.n - 1) + 0.5; // correct for endweight truncation

@@ -212,2 +313,3 @@ // find centroids whose cumns bracket target, then interpolate x

}
this._cumulate(true);
};

@@ -214,0 +316,0 @@

test.js

@@ -64,3 +64,3 @@ var deepEqual = require('assert').deepEqual;

it('puts singletons on the ends and piles into the middle', function(){
var tdigest = new TDigest(10, 0); // plenty of room in the center
var tdigest = new TDigest(10, 0, 0); // plenty of room in the center
var i, N = 10;

@@ -79,2 +79,29 @@ tdigest.digest(0);

});
it('preserves order as centroids merge and shift', function(){
var tdigest = new TDigest(0,0); // suppress merging
var i, N = 10;
tdigest.digest(0);
tdigest.digest(1);
tdigest.digest(0.5);
tdigest.digest(0.5);
tdigest.digest(0.5);
deepEqual( // no surprise here
tdigest.asArray(true),
[{mean:0, n:1, cumn:0.5},
{mean:0.5, n:1, cumn:1.5},
{mean:0.5, n:1, cumn:2.5},
{mean:0.5, n:1, cumn:3.5},
{mean:1, n:1, cumn:4.5}]
);
tdigest.delta = 10; // encourage merging (don't do this!)
tdigest.digest(0.6);// 2/3 of the time merge will violate the ordering
deepEqual(
tdigest.asArray(),
[{mean:0, n:1},
{mean:0.5, n:1},
{mean:0.5, n:1},
{mean:0.55, n:2},
{mean:1, n:1}]
);
});
});

@@ -84,3 +111,3 @@

it('compresses increasing-valued points', function(){
var tdigest = new TDigest(1, 0);
var tdigest = new TDigest(0, 0);
var i, N = 100;

@@ -90,3 +117,4 @@ for (i = 0 ; i < N ; i += 1) {

}
assert(tdigest.centroids.size === 100);
assert(tdigest.size() === 100);
tdigest.delta = 0.1; // encourage merging (don't do this!)
tdigest.redigest();

@@ -139,13 +167,13 @@ var points = tdigest.asArray();

var tdigest = new TDigest();
var i, N = 100000;
var i, x=[], N = 100000;
for (i = 0 ; i < N ; i += 1) {
tdigest.digest(Math.random());
x.push(Math.random());
}
tdigest.redigest();
tdigest.digest(x);
maxerr = 0;
for (var i = .01 ; i <= 1 ; i += .01) {
for (i = 0.01 ; i <= 1 ; i += 0.01) {
var q = tdigest.quantile(i);
maxerr = Math.max(maxerr, Math.abs(i-q));
}
assert(maxerr < .01);
assert(maxerr < 0.01);
});

@@ -181,13 +209,13 @@ });

var tdigest = new TDigest();
var i, N = 100000;
var i, x=[], N = 100000;
for (i = 0 ; i < N ; i += 1) {
tdigest.digest(Math.random());
x.push(Math.random());
}
tdigest.redigest();
tdigest.digest(x);
maxerr = 0;
for (var i = .01 ; i <= 1 ; i += .01) {
var x = tdigest.percentile(i);
maxerr = Math.max(maxerr, Math.abs(i-x));
for (i = 0.01 ; i <= 1 ; i += 0.01) {
var q = tdigest.percentile(i);
maxerr = Math.max(maxerr, Math.abs(i-q));
}
assert(maxerr < .01);
assert(maxerr < 0.01);
});

@@ -194,0 +222,0 @@ });
