tdigest
Advanced tools
Comparing version 0.0.0 to 0.0.1
{ | ||
"name": "tdigest", | ||
"version": "0.0.0", | ||
"version": "0.0.1", | ||
"description": "javascript implementation of Dunning's T-Digest for streaming quantile approximation", | ||
"main": "tdigest.js", | ||
"scripts": { | ||
"test": "mocha -b test.js" | ||
"test": "mocha test.js" | ||
}, | ||
@@ -30,4 +30,5 @@ "repository": { | ||
"devDependencies": { | ||
"better-assert": "^1.0.2" | ||
"better-assert": "^1.0.2", | ||
"mocha": "^2.2.5" | ||
} | ||
} |
# tdigest

Javascript implementation of Dunning's T-Digest for streaming quantile approximation
The T-Digest is a data structure and algorithm for constructing an | ||
approximate distribution for a set of real numbers presented as a | ||
stream. The algorithm makes no guarantees, but behaves well enough in | ||
practice that implementations have been included in Apache Mahout and | ||
ElasticSearch for computing fast summaries and approximate order | ||
statistics over a stream. | ||
For a pleasant overview of T-Digest's behavior, see Davidson-Pilon's | ||
[blog post](http://dataorigami.net/blogs/napkin-folding/19055451-percentile-and-quantile-estimation-of-big-data-the-t-digest) regarding a python implementation. For more details, | ||
there are the [tdigest paper](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) and [reference implementation](https://github.com/tdunning/t-digest) (Java). | ||
This javascript implementation is based on a reading of the paper. | ||
## Example | ||
``` | ||
var TDigest = require('tdigest').TDigest; | ||
var x=[], N = 100000; | ||
for (var i = 0 ; i < N ; i += 1) { | ||
x.push(Math.random() * 10 - 5); | ||
}
var tdigest = new TDigest();
tdigest.digest(x); | ||
console.log(tdigest.summary()); | ||
console.log("median ~ "+tdigest.percentile(0.5)); | ||
``` | ||
See also [example.js](https://github.com/welch/tdigest/blob/master/example.js) in this package. | ||
## Dependencies | ||
`bintrees`: package.json specifies a fork of [https://github.com/vadimg/js_bintrees](https://github.com/vadimg/js_bintrees)
that corrects a tree-traversal bug. You'll need it until PR#14 is merged. | ||
162
tdigest.js
var RBTree = require('bintrees').RBTree; | ||
function TDigest(delta, K) { | ||
function TDigest(delta, K, CX) { | ||
// allocate a TDigest structure. | ||
@@ -13,4 +13,9 @@ // | ||
// | ||
// CX specifies how often to update cached cumulative totals used | ||
// for quantile estimation during ingest (see cumulate()). Set to | ||
// 0 to use exact quantiles for each new point. | ||
// | ||
this.delta = (delta === undefined) ? 0.01 : delta; | ||
this.K = (K === undefined) ? 25 : K; | ||
this.CX = (CX === undefined) ? 1.1 : CX; | ||
this.centroids = new RBTree(compare_centroids); | ||
@@ -21,2 +26,13 @@ this.nreset = 0; | ||
TDigest.prototype.summary = function() {
    // return a human-readable synopsis of the digest: how many samples
    // are approximated by how many centroids, plus the five-number
    // summary (min, quartiles, max).
    var lines = ["approximating "+this.n+" samples using "+
                 this.size()+" centroids, delta="+this.delta+", CX="+this.CX];
    var labels = ["min", "Q1", "Q2", "Q3", "max"];
    var probs = [0, 0.25, 0.5, 0.75, 1.0];
    for (var i = 0 ; i < labels.length ; i++) {
        lines.push(labels[i]+" = "+this.percentile(probs[i]));
    }
    return lines.join('\n');
};
TDigest.prototype.reset = function() { | ||
@@ -28,4 +44,9 @@ // prepare to digest new points. | ||
this.nreset += 1; | ||
this.last_cumulate = 0; | ||
}; | ||
TDigest.prototype.size = function() {
    // return the number of centroids currently held in the digest's
    // red-black tree.
    return this.centroids.size;
};
function compare_centroids(a, b) { | ||
@@ -36,5 +57,5 @@ // order two centroids by mean. | ||
return (a.mean - b.mean); | ||
} else if (a.id !== undefined && b.id !== undefined) { | ||
// RBtrees can't accommodate duplicates, use id as tiebreaker. | ||
return (a.id - b.id); | ||
} else if (a.cumn !== undefined && b.cumn !== undefined) { | ||
// RBtrees can't accommodate duplicates, use cumn as tiebreaker. | ||
return (a.cumn - b.cumn); | ||
} else { | ||
@@ -58,2 +79,3 @@ return 0 ; // equality is ok during searches | ||
if (everything) { | ||
this._cumulate(true); // be sure cumns are exact | ||
this.centroids.each(function(c) { result.push(c); }); | ||
@@ -66,13 +88,36 @@ } else { | ||
TDigest.prototype.digest = function(x, n) {
    // incorporate a value or an array of values x, each having count n,
    // into the TDigest. n defaults to 1. When called with an array,
    // recompress (redigest) after all items have been ingested.
    //
    // NOTE(review): the two leading lines of this span were stray
    // removed-version lines (`insert`'s header) fused in by the diff
    // rendering; they are dropped here so the function parses.
    n = n || 1;
    if (Array.isArray(x)) {
        for (var i = 0 ; i < x.length ; i++) {
            this._digest(x[i], n);
        }
        this.redigest();
    } else {
        this._digest(x, n);
    }
};
TDigest.prototype._cumulate = function(exact) {
    // update the cached cumulative count (cumn) of each centroid by
    // walking the tree in mean order.
    //
    // exact: falsey means only cumulate after sufficient growth since
    // the last cumulate. During ingest, these counts are used as
    // quantile estimates, and they work well even when somewhat out of
    // date. (this is a departure from the publication; set CX to 0 to
    // disable the caching and always cumulate exactly).
    //
    // NOTE(review): this span contained stray removed-version diff
    // lines (the old `insert` body and the old `self.n` accumulation);
    // only the new-version logic is kept here.
    if (this.n === this.last_cumulate ||
        (!exact && this.CX && this.CX > (this.n / this.last_cumulate))) {
        // nothing new since last cumulate, or insufficient growth.
        return;
    }
    var cumn = 0;
    this.centroids.each(function(c) {
        c.cumn = cumn + c.n / 2; // at the mean, half of c's weight is below.
        cumn += c.n;
    });
    this.n = this.last_cumulate = cumn;
};
@@ -82,10 +127,15 @@ | ||
// find the centroid(s) whose means are closest to x | ||
// (there can be multiples because of per-centroid count limits) | ||
// return a list of centroids. | ||
// (there can be multiples because of per-centroid count limits). | ||
// | ||
var iter = this.centroids.upperBound({mean:x}); | ||
var prev = (iter.data() === null) ? iter.prev() : iter.data(); | ||
var iter = this.centroids.upperBound({mean:x}); // x < iter || iter==null | ||
var upper = iter.data(); | ||
if (upper !== null) { | ||
// jump to the end of string of duplicates | ||
iter = this.centroids.upperBound({mean:upper.mean}); | ||
} | ||
var prev = (iter.data() === null) ? iter.prev() : iter.data(); | ||
if (prev === null) { | ||
return []; | ||
} | ||
// walk backwards looking for closer centroids | ||
var min = Math.abs(prev.mean - x); | ||
@@ -104,24 +154,73 @@ var nearest = [prev]; | ||
TDigest.prototype.digest = function(x, n) { | ||
// incorporate value x having count n into the TDigest. n defaults to 1 | ||
TDigest.prototype._new_centroid = function(x, n, cumn) { | ||
// create and insert a new centroid into the digest (don't update | ||
// cumulatives). | ||
// | ||
// XXX for inserting, cumn needs to be a unique tiebreaker (the | ||
// RBTree implementation doesn't accept duplicates). After | ||
// inserting, set cumn to its given value since we never rely on | ||
// cumn order among equivalent means. | ||
// | ||
var c = {mean:x, n:n, cumn:this.n}; | ||
this.centroids.insert(c); | ||
c.cumn = cumn; | ||
this.n += n; | ||
}; | ||
TDigest.prototype._addweight = function(c, x, n, inplace) {
    // add weight of count n at location x to centroid c.
    //
    // c: the centroid absorbing the weight
    // x: location (mean) of the incoming weight
    // n: count being added
    // inplace: when truthy, c's mean may be updated in place; otherwise
    //   c must be removed and reinserted so the RBTree's mean ordering
    //   stays valid after the mean moves.
    //
    var newmean = c.mean + n * (x - c.mean) / (c.n + n);
    if (inplace) {
        c.mean = newmean;
        c.n += n;
        c.cumn += n / 2;
        this.n += n;
    } else {
        this.centroids.remove(c);
        // BUGFIX: subtract the removed centroid's count c.n (not n).
        // _new_centroid() adds back c.n + n, so the old `this.n -= n`
        // made the total grow by c.n instead of n, double-counting the
        // removed centroid's weight.
        this.n -= c.n;
        this._new_centroid(newmean, c.n + n, c.cumn + n / 2);
    }
};
TDigest.prototype._centroid_quantile = function(c) {
    // quick-and-dirty quantile estimate for centroid c's mean, using
    // its cached cumulative count. The extreme centroids are pinned to
    // the 0 and 1 quantiles.
    if (this.size() === 0) {
        return NaN;
    }
    if (c === this.centroids.min()) {
        return 0.0;
    }
    if (c === this.centroids.max()) {
        return 1.0;
    }
    return c.cumn / this.n;
};
TDigest.prototype._digest = function(x, n) { | ||
// incorporate value x, having count n into the TDigest. | ||
// | ||
var nearest = this.find_nearest(x); | ||
n = n || 1; | ||
var inplace = (nearest.length <= 1 || nearest[0].mean === x); | ||
var cumn = (nearest.length > 0) ? nearest[0].cumn : 0; | ||
while (nearest.length > 0 && n > 0) { | ||
var c = pop_random(nearest); | ||
var q = this.quantile(c.mean); | ||
var room = 4 * this.n * this.delta * q * (1 - q) - c.n; | ||
// even though all c's are the same distance, they have different | ||
// cumn's. get a fresh q, it affects boundary preservation | ||
var q = (cumn > 0) ? this._centroid_quantile(c) : 0; | ||
var max_n = Math.floor(4 * this.n * this.delta * q * (1 - q)); | ||
var room = max_n - c.n; | ||
if (room <= 0) { | ||
continue; | ||
} | ||
var dn = (room > n) ? n : room; | ||
c.n += dn; | ||
c.mean += dn * (x - c.mean) / c.n; | ||
var dn = Math.min(n, room); | ||
this._addweight(c, x, dn, inplace); | ||
n -= dn; | ||
} | ||
if (n > 0) { | ||
// create a new centroid at x for the undigested counts | ||
this.insert(x, n); | ||
// create a new centroid at x for the undigested counts. | ||
this._new_centroid(x, n, cumn + n / 2); // approximate cumn | ||
} | ||
if (this.K && this.centroids.size > this.K / this.delta) { | ||
this._cumulate(false); | ||
if (this.K && this.size() > this.K / this.delta) { | ||
// re-process the centroids and hope for some compression. | ||
@@ -138,3 +237,3 @@ this.redigest(); | ||
// | ||
if (this.centroids.size === 0) { | ||
if (this.size() === 0) { | ||
return NaN; | ||
@@ -146,5 +245,6 @@ } else if (x <= this.centroids.min().mean) { | ||
} | ||
this._cumulate(true); // be sure cumns are exact | ||
// find centroids that bracket x and interpolate x's cumn from | ||
// their cumn's. | ||
var iter = this.centroids.upperBound({mean:x}); | ||
var iter = this.centroids.upperBound({mean:x}); // x.mean < iter | ||
var c = iter.prev(); // c.mean <= x.mean | ||
@@ -170,3 +270,3 @@ var cnext = iter.next() ; // x.mean < cnext.mean | ||
// | ||
if (this.centroids.size === 0) { | ||
if (this.size() === 0) { | ||
return NaN; | ||
@@ -178,2 +278,3 @@ } else if (p <= 0) { | ||
} | ||
this._cumulate(true); // be sure cumns are exact | ||
var target = p * (this.n - 1) + 0.5; // correct for endweight truncation | ||
@@ -212,2 +313,3 @@ // find centroids whose cumns bracket target, then interpolate x | ||
} | ||
this._cumulate(true); | ||
}; | ||
@@ -214,0 +316,0 @@ |
58
test.js
@@ -64,3 +64,3 @@ var deepEqual = require('assert').deepEqual; | ||
it('puts singletons on the ends and piles into the middle', function(){ | ||
var tdigest = new TDigest(10, 0); // plenty of room in the center | ||
var tdigest = new TDigest(10, 0, 0); // plenty of room in the center | ||
var i, N = 10; | ||
@@ -79,2 +79,29 @@ tdigest.digest(0); | ||
}); | ||
it('preserves order as centroids merge and shift', function(){
    // delta=0 suppresses merging, so each digested point becomes its
    // own centroid with an exact cumn (duplicates at 0.5 included).
    var tdigest = new TDigest(0,0); // suppress merging
    var i, N = 10;
    tdigest.digest(0);
    tdigest.digest(1);
    tdigest.digest(0.5);
    tdigest.digest(0.5);
    tdigest.digest(0.5);
    deepEqual( // no surprise here
        tdigest.asArray(true),
        [{mean:0, n:1, cumn:0.5},
         {mean:0.5, n:1, cumn:1.5},
         {mean:0.5, n:1, cumn:2.5},
         {mean:0.5, n:1, cumn:3.5},
         {mean:1, n:1, cumn:4.5}]
    );
    // now force merging mid-stream and check that the mean ordering of
    // the centroids survives the shift of the merged centroid's mean.
    tdigest.delta = 10; // encourage merging (don't do this!)
    tdigest.digest(0.6);// 2/3 of the time merge will violate the ordering
    deepEqual(
        tdigest.asArray(),
        [{mean:0, n:1},
         {mean:0.5, n:1},
         {mean:0.5, n:1},
         {mean:0.55, n:2},
         {mean:1, n:1}]
    );
});
}); | ||
@@ -84,3 +111,3 @@ | ||
it('compresses increasing-valued points', function(){ | ||
var tdigest = new TDigest(1, 0); | ||
var tdigest = new TDigest(0, 0); | ||
var i, N = 100; | ||
@@ -90,3 +117,4 @@ for (i = 0 ; i < N ; i += 1) { | ||
} | ||
assert(tdigest.centroids.size === 100); | ||
assert(tdigest.size() === 100); | ||
tdigest.delta = 0.1; // encourage merging (don't do this!) | ||
tdigest.redigest(); | ||
@@ -139,13 +167,13 @@ var points = tdigest.asArray(); | ||
var tdigest = new TDigest(); | ||
var i, N = 100000; | ||
var i, x=[], N = 100000; | ||
for (i = 0 ; i < N ; i += 1) { | ||
tdigest.digest(Math.random()); | ||
x.push(Math.random()); | ||
} | ||
tdigest.redigest(); | ||
tdigest.digest(x); | ||
maxerr = 0; | ||
for (var i = .01 ; i <= 1 ; i += .01) { | ||
for (i = 0.01 ; i <= 1 ; i += 0.01) { | ||
var q = tdigest.quantile(i); | ||
maxerr = Math.max(maxerr, Math.abs(i-q)); | ||
} | ||
assert(maxerr < .01); | ||
assert(maxerr < 0.01); | ||
}); | ||
@@ -181,13 +209,13 @@ }); | ||
var tdigest = new TDigest(); | ||
var i, N = 100000; | ||
var i, x=[], N = 100000; | ||
for (i = 0 ; i < N ; i += 1) { | ||
tdigest.digest(Math.random()); | ||
x.push(Math.random()); | ||
} | ||
tdigest.redigest(); | ||
tdigest.digest(x); | ||
maxerr = 0; | ||
for (var i = .01 ; i <= 1 ; i += .01) { | ||
var x = tdigest.percentile(i); | ||
maxerr = Math.max(maxerr, Math.abs(i-x)); | ||
for (i = 0.01 ; i <= 1 ; i += 0.01) { | ||
var q = tdigest.percentile(i); | ||
maxerr = Math.max(maxerr, Math.abs(i-q)); | ||
} | ||
assert(maxerr < .01); | ||
assert(maxerr < 0.01); | ||
}); | ||
@@ -194,0 +222,0 @@ }); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
47427
15
537
37
2