tdigest
Advanced tools
Comparing version 0.0.0 to 0.0.1
{ | ||
"name": "tdigest", | ||
"version": "0.0.0", | ||
"version": "0.0.1", | ||
"description": "javascript implementation of Dunning's T-Digest for streaming quantile approximation", | ||
"main": "tdigest.js", | ||
"scripts": { | ||
"test": "mocha -b test.js" | ||
"test": "mocha test.js" | ||
}, | ||
@@ -30,4 +30,5 @@ "repository": { | ||
"devDependencies": { | ||
"better-assert": "^1.0.2" | ||
"better-assert": "^1.0.2", | ||
"mocha": "^2.2.5" | ||
} | ||
} |
# tdigest

Javascript implementation of Dunning's T-Digest for streaming quantile approximation
The T-Digest is a data structure and algorithm for constructing an | ||
approximate distribution for a set of real numbers presented as a | ||
stream. The algorithm makes no guarantees, but behaves well enough in | ||
practice that implementations have been included in Apache Mahout and | ||
ElasticSearch for computing fast summaries and approximate order | ||
statistics over a stream. | ||
For a pleasant overview of T-Digest's behavior, see Davidson-Pilon's | ||
[blog post](http://dataorigami.net/blogs/napkin-folding/19055451-percentile-and-quantile-estimation-of-big-data-the-t-digest) regarding a python implementation. For more details, | ||
there are the [tdigest paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) and [reference implementation](https://github.com/tdunning/t-digest) (Java). | ||
This javascript implementation is based on a reading of the paper. | ||
## Example | ||
``` | ||
var TDigest = require('tdigest').TDigest; | ||
var x=[], N = 100000; | ||
for (var i = 0 ; i < N ; i += 1) { | ||
x.push(Math.random() * 10 - 5); | ||
}
var tdigest = new TDigest();
tdigest.digest(x); | ||
console.log(tdigest.summary()); | ||
console.log("median ~ "+tdigest.percentile(0.5)); | ||
``` | ||
See also [example.js](https://github.com/welch/tdigest/blob/master/example.js) in this package. | ||
## Dependencies | ||
`bintrees`: package.json specifies a fork of [https://github.com/vadimg/js_bintrees](https://github.com/vadimg/js_bintrees)
that corrects a tree-traversal bug. You'll need it until PR#14 is merged. | ||
162
tdigest.js
var RBTree = require('bintrees').RBTree; | ||
function TDigest(delta, K) { | ||
function TDigest(delta, K, CX) { | ||
// allocate a TDigest structure. | ||
@@ -13,4 +13,9 @@ // | ||
// | ||
// CX specifies how often to update cached cumulative totals used | ||
// for quantile estimation during ingest (see cumulate()). Set to | ||
// 0 to use exact quantiles for each new point. | ||
// | ||
this.delta = (delta === undefined) ? 0.01 : delta; | ||
this.K = (K === undefined) ? 25 : K; | ||
this.CX = (CX === undefined) ? 1.1 : CX; | ||
this.centroids = new RBTree(compare_centroids); | ||
@@ -21,2 +26,13 @@ this.nreset = 0; | ||
TDigest.prototype.summary = function() {
    // return a human-readable synopsis of the digest: how many samples
    // are approximated by how many centroids, plus the five-number
    // summary (min, quartiles, max).
    var lines = ["approximating "+this.n+" samples using "+
                 this.size()+" centroids, delta="+this.delta+", CX="+this.CX];
    var labels = ["min", "Q1", "Q2", "Q3", "max"];
    var probs = [0, 0.25, 0.5, 0.75, 1.0];
    for (var i = 0 ; i < labels.length ; i++) {
        lines.push(labels[i]+" = "+this.percentile(probs[i]));
    }
    return lines.join('\n');
};
TDigest.prototype.reset = function() { | ||
@@ -28,4 +44,9 @@ // prepare to digest new points. | ||
this.nreset += 1; | ||
this.last_cumulate = 0; | ||
}; | ||
TDigest.prototype.size = function() {
    // return the number of centroids currently held in the digest's
    // red-black tree.
    return this.centroids.size;
};
function compare_centroids(a, b) { | ||
@@ -36,5 +57,5 @@ // order two centroids by mean. | ||
return (a.mean - b.mean); | ||
} else if (a.id !== undefined && b.id !== undefined) { | ||
// RBtrees can't accommodate duplicates, use id as tiebreaker. | ||
return (a.id - b.id); | ||
} else if (a.cumn !== undefined && b.cumn !== undefined) { | ||
// RBtrees can't accommodate duplicates, use cumn as tiebreaker. | ||
return (a.cumn - b.cumn); | ||
} else { | ||
@@ -58,2 +79,3 @@ return 0 ; // equality is ok during searches | ||
if (everything) { | ||
this._cumulate(true); // be sure cumns are exact | ||
this.centroids.each(function(c) { result.push(c); }); | ||
@@ -66,13 +88,36 @@ } else { | ||
TDigest.prototype.digest = function(x, n) {
    // incorporate a value or an array of values x, each having count n,
    // into the TDigest. n defaults to 1. When called with an array,
    // recompress (redigest) after all items have been ingested.
    //
    // NOTE(review): the two leading lines of this span were stray
    // removed-version lines (`insert`'s header) fused in by the diff
    // rendering; they are dropped here so the function parses.
    n = n || 1;
    if (Array.isArray(x)) {
        for (var i = 0 ; i < x.length ; i++) {
            this._digest(x[i], n);
        }
        this.redigest();
    } else {
        this._digest(x, n);
    }
};
TDigest.prototype._cumulate = function(exact) {
    // update the cached cumulative count (cumn) of each centroid by
    // walking the tree in mean order.
    //
    // exact: falsey means only cumulate after sufficient growth since
    // the last cumulate. During ingest, these counts are used as
    // quantile estimates, and they work well even when somewhat out of
    // date. (this is a departure from the publication; set CX to 0 to
    // disable the caching and always cumulate exactly).
    //
    // NOTE(review): this span contained stray removed-version diff
    // lines (the old `insert` body and the old `self.n` accumulation);
    // only the new-version logic is kept here.
    if (this.n === this.last_cumulate ||
        (!exact && this.CX && this.CX > (this.n / this.last_cumulate))) {
        // nothing new since last cumulate, or insufficient growth.
        return;
    }
    var cumn = 0;
    this.centroids.each(function(c) {
        c.cumn = cumn + c.n / 2; // at the mean, half of c's weight is below.
        cumn += c.n;
    });
    this.n = this.last_cumulate = cumn;
};
@@ -82,10 +127,15 @@ | ||
// find the centroid(s) whose means are closest to x | ||
// (there can be multiples because of per-centroid count limits) | ||
// return a list of centroids. | ||
// (there can be multiples because of per-centroid count limits). | ||
// | ||
var iter = this.centroids.upperBound({mean:x}); | ||
var prev = (iter.data() === null) ? iter.prev() : iter.data(); | ||
var iter = this.centroids.upperBound({mean:x}); // x < iter || iter==null | ||
var upper = iter.data(); | ||
if (upper !== null) { | ||
// jump to the end of string of duplicates | ||
iter = this.centroids.upperBound({mean:upper.mean}); | ||
} | ||
var prev = (iter.data() === null) ? iter.prev() : iter.data(); | ||
if (prev === null) { | ||
return []; | ||
} | ||
// walk backwards looking for closer centroids | ||
var min = Math.abs(prev.mean - x); | ||
@@ -104,24 +154,73 @@ var nearest = [prev]; | ||
TDigest.prototype.digest = function(x, n) { | ||
// incorporate value x having count n into the TDigest. n defaults to 1 | ||
TDigest.prototype._new_centroid = function(x, n, cumn) { | ||
// create and insert a new centroid into the digest (don't update | ||
// cumulatives). | ||
// | ||
// XXX for inserting, cumn needs to be a unique tiebreaker (the | ||
// RBTree implementation doesn't accept duplicates). After | ||
// inserting, set cumn to its given value since we never rely on | ||
// cumn order among equivalent means. | ||
// | ||
var c = {mean:x, n:n, cumn:this.n}; | ||
this.centroids.insert(c); | ||
c.cumn = cumn; | ||
this.n += n; | ||
}; | ||
TDigest.prototype._addweight = function(c, x, n, inplace) {
    // add weight of count n at location x to centroid c.
    //
    // c: the centroid absorbing the weight
    // x: location (mean) of the incoming weight
    // n: count being added
    // inplace: when truthy, c's mean may be updated in place; otherwise
    //   c must be removed and reinserted so the RBTree's mean ordering
    //   stays valid after the mean moves.
    //
    var newmean = c.mean + n * (x - c.mean) / (c.n + n);
    if (inplace) {
        c.mean = newmean;
        c.n += n;
        c.cumn += n / 2;
        this.n += n;
    } else {
        this.centroids.remove(c);
        // BUGFIX: subtract the removed centroid's count c.n (not n).
        // _new_centroid() adds back c.n + n, so the old `this.n -= n`
        // made the total grow by c.n instead of n, double-counting the
        // removed centroid's weight.
        this.n -= c.n;
        this._new_centroid(newmean, c.n + n, c.cumn + n / 2);
    }
};
TDigest.prototype._centroid_quantile = function(c) {
    // quick-and-dirty quantile estimate for centroid c's mean, using
    // its cached cumulative count. The extreme centroids are pinned to
    // the 0 and 1 quantiles.
    if (this.size() === 0) {
        return NaN;
    }
    if (c === this.centroids.min()) {
        return 0.0;
    }
    if (c === this.centroids.max()) {
        return 1.0;
    }
    return c.cumn / this.n;
};
TDigest.prototype._digest = function(x, n) { | ||
// incorporate value x, having count n into the TDigest. | ||
// | ||
var nearest = this.find_nearest(x); | ||
n = n || 1; | ||
var inplace = (nearest.length <= 1 || nearest[0].mean === x); | ||
var cumn = (nearest.length > 0) ? nearest[0].cumn : 0; | ||
while (nearest.length > 0 && n > 0) { | ||
var c = pop_random(nearest); | ||
var q = this.quantile(c.mean); | ||
var room = 4 * this.n * this.delta * q * (1 - q) - c.n; | ||
// even though all c's are the same distance, they have different | ||
// cumn's. get a fresh q, it affects boundary preservation | ||
var q = (cumn > 0) ? this._centroid_quantile(c) : 0; | ||
var max_n = Math.floor(4 * this.n * this.delta * q * (1 - q)); | ||
var room = max_n - c.n; | ||
if (room <= 0) { | ||
continue; | ||
} | ||
var dn = (room > n) ? n : room; | ||
c.n += dn; | ||
c.mean += dn * (x - c.mean) / c.n; | ||
var dn = Math.min(n, room); | ||
this._addweight(c, x, dn, inplace); | ||
n -= dn; | ||
} | ||
if (n > 0) { | ||
// create a new centroid at x for the undigested counts | ||
this.insert(x, n); | ||
// create a new centroid at x for the undigested counts. | ||
this._new_centroid(x, n, cumn + n / 2); // approximate cumn | ||
} | ||
if (this.K && this.centroids.size > this.K / this.delta) { | ||
this._cumulate(false); | ||
if (this.K && this.size() > this.K / this.delta) { | ||
// re-process the centroids and hope for some compression. | ||
@@ -138,3 +237,3 @@ this.redigest(); | ||
// | ||
if (this.centroids.size === 0) { | ||
if (this.size() === 0) { | ||
return NaN; | ||
@@ -146,5 +245,6 @@ } else if (x <= this.centroids.min().mean) { | ||
} | ||
this._cumulate(true); // be sure cumns are exact | ||
// find centroids that bracket x and interpolate x's cumn from | ||
// their cumn's. | ||
var iter = this.centroids.upperBound({mean:x}); | ||
var iter = this.centroids.upperBound({mean:x}); // x.mean < iter | ||
var c = iter.prev(); // c.mean <= x.mean | ||
@@ -170,3 +270,3 @@ var cnext = iter.next() ; // x.mean < cnext.mean | ||
// | ||
if (this.centroids.size === 0) { | ||
if (this.size() === 0) { | ||
return NaN; | ||
@@ -178,2 +278,3 @@ } else if (p <= 0) { | ||
} | ||
this._cumulate(true); // be sure cumns are exact | ||
var target = p * (this.n - 1) + 0.5; // correct for endweight truncation | ||
@@ -212,2 +313,3 @@ // find centroids whose cumns bracket target, then interpolate x | ||
} | ||
this._cumulate(true); | ||
}; | ||
@@ -214,0 +316,0 @@ |
58
test.js
@@ -64,3 +64,3 @@ var deepEqual = require('assert').deepEqual; | ||
it('puts singletons on the ends and piles into the middle', function(){ | ||
var tdigest = new TDigest(10, 0); // plenty of room in the center | ||
var tdigest = new TDigest(10, 0, 0); // plenty of room in the center | ||
var i, N = 10; | ||
@@ -79,2 +79,29 @@ tdigest.digest(0); | ||
}); | ||
it('preserves order as centroids merge and shift', function(){
    // delta=0 suppresses merging, so each digested point becomes its
    // own centroid with an exact cumn (duplicates at 0.5 included).
    var tdigest = new TDigest(0,0); // suppress merging
    var i, N = 10;
    tdigest.digest(0);
    tdigest.digest(1);
    tdigest.digest(0.5);
    tdigest.digest(0.5);
    tdigest.digest(0.5);
    deepEqual( // no surprise here
        tdigest.asArray(true),
        [{mean:0, n:1, cumn:0.5},
         {mean:0.5, n:1, cumn:1.5},
         {mean:0.5, n:1, cumn:2.5},
         {mean:0.5, n:1, cumn:3.5},
         {mean:1, n:1, cumn:4.5}]
    );
    // now force merging mid-stream and check that the mean ordering of
    // the centroids survives the shift of the merged centroid's mean.
    tdigest.delta = 10; // encourage merging (don't do this!)
    tdigest.digest(0.6);// 2/3 of the time merge will violate the ordering
    deepEqual(
        tdigest.asArray(),
        [{mean:0, n:1},
         {mean:0.5, n:1},
         {mean:0.5, n:1},
         {mean:0.55, n:2},
         {mean:1, n:1}]
    );
});
}); | ||
@@ -84,3 +111,3 @@ | ||
it('compresses increasing-valued points', function(){ | ||
var tdigest = new TDigest(1, 0); | ||
var tdigest = new TDigest(0, 0); | ||
var i, N = 100; | ||
@@ -90,3 +117,4 @@ for (i = 0 ; i < N ; i += 1) { | ||
} | ||
assert(tdigest.centroids.size === 100); | ||
assert(tdigest.size() === 100); | ||
tdigest.delta = 0.1; // encourage merging (don't do this!) | ||
tdigest.redigest(); | ||
@@ -139,13 +167,13 @@ var points = tdigest.asArray(); | ||
var tdigest = new TDigest(); | ||
var i, N = 100000; | ||
var i, x=[], N = 100000; | ||
for (i = 0 ; i < N ; i += 1) { | ||
tdigest.digest(Math.random()); | ||
x.push(Math.random()); | ||
} | ||
tdigest.redigest(); | ||
tdigest.digest(x); | ||
maxerr = 0; | ||
for (var i = .01 ; i <= 1 ; i += .01) { | ||
for (i = 0.01 ; i <= 1 ; i += 0.01) { | ||
var q = tdigest.quantile(i); | ||
maxerr = Math.max(maxerr, Math.abs(i-q)); | ||
} | ||
assert(maxerr < .01); | ||
assert(maxerr < 0.01); | ||
}); | ||
@@ -181,13 +209,13 @@ }); | ||
var tdigest = new TDigest(); | ||
var i, N = 100000; | ||
var i, x=[], N = 100000; | ||
for (i = 0 ; i < N ; i += 1) { | ||
tdigest.digest(Math.random()); | ||
x.push(Math.random()); | ||
} | ||
tdigest.redigest(); | ||
tdigest.digest(x); | ||
maxerr = 0; | ||
for (var i = .01 ; i <= 1 ; i += .01) { | ||
var x = tdigest.percentile(i); | ||
maxerr = Math.max(maxerr, Math.abs(i-x)); | ||
for (i = 0.01 ; i <= 1 ; i += 0.01) { | ||
var q = tdigest.percentile(i); | ||
maxerr = Math.max(maxerr, Math.abs(i-q)); | ||
} | ||
assert(maxerr < .01); | ||
assert(maxerr < 0.01); | ||
}); | ||
@@ -194,0 +222,0 @@ }); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
47427
15
537
37
2