tdigest
Advanced tools
Comparing version 0.0.4 to 0.0.5
@@ -10,3 +10,4 @@ var TDigest = require('./tdigest').TDigest; | ||
}; | ||
tdigest.digest(x); | ||
tdigest.push(x); | ||
tdigest.compress(x); | ||
console.log(tdigest.summary()); | ||
@@ -13,0 +14,0 @@ for (var p = 0 ; p <= 1.0 ; p += 0.1) { |
{ | ||
"name": "tdigest", | ||
"version": "0.0.4", | ||
"version": "0.0.5", | ||
"description": "javascript implementation of Dunning's T-Digest for streaming quantile approximation", | ||
"main": "tdigest.js", | ||
"scripts": { | ||
"test": "mocha test.js" | ||
"test": "mocha specs" | ||
}, | ||
@@ -30,5 +30,8 @@ "repository": { | ||
"devDependencies": { | ||
"assert": "*", | ||
"better-assert": "^1.0.2", | ||
"mocha": "^2.2.5" | ||
"grunt": "^0.4.5", | ||
"grunt-dry": "^0.3.0", | ||
"mocha": "^2.1.0" | ||
} | ||
} |
@@ -1,3 +0,3 @@ | ||
# tdigest | ||
tdigest | ||
============ | ||
[![Build Status](https://travis-ci.org/welch/tdigest.svg?branch=master)](https://travis-ci.org/welch/tdigest) [![NPM version](http://img.shields.io/npm/v/tdigest.svg)](https://www.npmjs.org/package/tdigest) | ||
@@ -20,4 +20,24 @@ | ||
## Example | ||
changes since 0.0.4: | ||
-------------------- | ||
#### API Overhaul: | ||
* asArray() -> toArray() | ||
* redigest() -> compact() | ||
* digest() -> push() | ||
* pushing an array no longer triggers compaction | ||
#### UMD wrappers | ||
A grunt build task has been added to create a UMD-wrapped version of tdigest | ||
and dependencies for importing as a standalone module in client-side javascript. | ||
quickstart | ||
------------ | ||
#### node.js: | ||
``` | ||
npm install tdigest | ||
``` | ||
```javascript | ||
var TDigest = require('tdigest').TDigest; | ||
@@ -29,3 +49,4 @@ var x=[], N = 100000; | ||
tdigest = new TDigest(); | ||
tdigest.digest(x); | ||
tdigest.push(x); | ||
tdigest.compress(x); | ||
console.log(tdigest.summary()); | ||
@@ -37,3 +58,19 @@ console.log("median ~ "+tdigest.percentile(0.5)); | ||
## Dependencies | ||
#### In the browser: | ||
The following grunt tasks are configured (via the [grunt-dry](https://www.npmjs.com/package/grunt-dry) dev dependency) to generate | ||
a [UMD-wrapped](https://github.com/umdjs/umd) version of tdigest that can be loaded through a variety of front-end | ||
module systems: | ||
**grunt build** | ||
Uses [grunt-pure-cjs](https://github.com/RReverser/grunt-pure-cjs) to generate browser/tdigest.js and browser/specs/*.spec.js by bundling the commonjs files into a single file for both the module itself and any mocha spec files. | ||
**grunt test** | ||
Runs unit tests using server-side mocha in node.js from `specs/*.js` and in browser using `browser/specs/*.js`. | ||
(the npm test script is configured to run this as well) | ||
dependencies | ||
------------- | ||
`bintrees`: [https://www.npmjs.com/package/bintrees](https://www.npmjs.com/package/bintrees) | ||
@@ -40,0 +77,0 @@ |
329
tdigest.js
@@ -25,13 +25,2 @@ var RBTree = require('bintrees').RBTree; | ||
TDigest.prototype.summary = function() { | ||
var s = ["approximating "+this.n+" samples using "+ | ||
this.size()+" centroids, delta="+this.delta+", CX="+this.CX, | ||
"min = "+this.percentile(0), | ||
"Q1 = "+this.percentile(0.25), | ||
"Q2 = "+this.percentile(0.5), | ||
"Q3 = "+this.percentile(0.75), | ||
"max = "+this.percentile(1.0)]; | ||
return s.join('\n'); | ||
}; | ||
TDigest.prototype.reset = function() { | ||
@@ -50,2 +39,26 @@ // prepare to digest new points. | ||
TDigest.prototype.toArray = function(everything) { | ||
// return {mean,n} of centroids as an array ordered by mean. | ||
// | ||
var result = []; | ||
if (everything) { | ||
this._cumulate(true); // be sure cumns are exact | ||
this.centroids.each(function(c) { result.push(c); }); | ||
} else { | ||
this.centroids.each(function(c) { result.push({mean:c.mean, n:c.n}); }); | ||
} | ||
return result; | ||
}; | ||
TDigest.prototype.summary = function() { | ||
var s = ["approximating "+this.n+" samples using "+ | ||
this.size()+" centroids, delta="+this.delta+", CX="+this.CX, | ||
"min = "+this.percentile(0), | ||
"Q1 = "+this.percentile(0.25), | ||
"Q2 = "+this.percentile(0.5), | ||
"Q3 = "+this.percentile(0.75), | ||
"max = "+this.percentile(1.0)]; | ||
return s.join('\n'); | ||
}; | ||
function compare_centroid_means(a, b) { | ||
@@ -55,14 +68,6 @@ // order two centroids by mean. | ||
if (a === null) { | ||
// XXX super-narrow workaround for | ||
// https://github.com/vadimg/js_bintrees/pull/14 | ||
// XXX super-narrow workaround for https://github.com/vadimg/js_bintrees/pull/14 | ||
return NaN; | ||
} | ||
if (a.mean !== b.mean) { | ||
return (a.mean - b.mean); | ||
} else if (a.cumn !== undefined && b.cumn !== undefined) { | ||
// RBtrees can't accommodate duplicates, use cumn as tiebreaker. | ||
return (a.cumn - b.cumn); | ||
} else { | ||
return 0 ; // equality is ok during searches | ||
} | ||
return a.mean - b.mean; | ||
} | ||
@@ -74,4 +79,3 @@ | ||
if (a === null) { | ||
// XXX super-narrow workaround for | ||
// https://github.com/vadimg/js_bintrees/pull/14 | ||
// XXX super-narrow workaround for https://github.com/vadimg/js_bintrees/pull/14 | ||
return NaN; | ||
@@ -90,30 +94,22 @@ } | ||
TDigest.prototype.asArray = function(everything) { | ||
// return {mean,n} of centroids as an array ordered by mean. | ||
TDigest.prototype.push = function(x, n) { | ||
// incorporate value or array of values x, having count n into the | ||
// TDigest. n defaults to 1. | ||
// | ||
var result = []; | ||
if (everything) { | ||
this._cumulate(true); // be sure cumns are exact | ||
this.centroids.each(function(c) { result.push(c); }); | ||
} else { | ||
this.centroids.each(function(c) { result.push({mean:c.mean, n:c.n}); }); | ||
} | ||
return result; | ||
}; | ||
TDigest.prototype.digest = function(x, n) { | ||
// incorporate value or array of values x, having count n into the | ||
// TDigest. n defaults to 1. If called with an array, recompress after | ||
// digesting the items. | ||
n = n || 1; | ||
if (Array.isArray(x)) { | ||
for (var i = 0 ; i < x.length ; i++) { | ||
this._digest(x[i], n); | ||
} | ||
this.redigest(); | ||
} else { | ||
this._digest(x, n); | ||
x = Array.isArray(x) ? x : [x]; | ||
for (var i = 0 ; i < x.length ; i++) { | ||
this._digest(x[i], n); | ||
} | ||
}; | ||
TDigest.prototype.push_centroid = function(c) { | ||
// incorporate centroid or array of centroids c | ||
// | ||
c = Array.isArray(c) ? c : [c]; | ||
for (var i = 0 ; i < c.length ; i++) { | ||
this._digest(c[i].mean, c[i].n); | ||
} | ||
} | ||
TDigest.prototype._cumulate = function(exact) { | ||
@@ -141,25 +137,18 @@ // update cumulative counts for each centroid | ||
TDigest.prototype.find_nearest = function(x) { | ||
// find the centroid(s) whose means are closest to x (there can be | ||
// multiples because of per-centroid count limits, particularly in | ||
// categorical streams). | ||
// find the centroid closest to x. The assumption of | ||
// unique means and a unique nearest centroid differs from the | ||
// paper, see _digest() below | ||
// | ||
if (this.size() === 0) { | ||
return null; | ||
} | ||
var iter = this.centroids.upperBound({mean:x}); // x < iter || iter==null | ||
var c = (iter.data() === null) ? iter.prev() : iter.data(); | ||
if (c === null) { | ||
return []; | ||
} | ||
// walk the duplicates to find rightmost bound | ||
while (iter.next() !== null && iter.data().mean === c.mean) { | ||
} | ||
// walk backwards looking for closest centroids. | ||
c = iter.prev(); | ||
var min = Math.abs(c.mean - x); | ||
var nearest = [c]; | ||
// walk backwards looking for closest centroid. | ||
var nearest = c; | ||
var mindist = Math.abs(nearest.mean - x); | ||
var dx; | ||
while ((c = iter.prev()) && (dx = Math.abs(c.mean - x)) <= min) { | ||
if (dx < min) { | ||
min = dx; | ||
nearest = []; | ||
} | ||
nearest.push(c); | ||
while ((c = iter.prev()) && (dx = Math.abs(c.mean - x)) < mindist) { | ||
mindist = dx; | ||
nearest = c; | ||
} | ||
@@ -178,66 +167,69 @@ return nearest; | ||
// | ||
var c = {mean:x, n:n, cumn:this.n}; | ||
var c = {mean:x, n:n, cumn:this.n+n}; | ||
this.centroids.insert(c); | ||
c.cumn = cumn; | ||
this.n += n; | ||
return c; | ||
}; | ||
TDigest.prototype._addweight = function(c, x, n, inplace) { | ||
// add weight at location x to centroid c | ||
TDigest.prototype._addweight = function(nearest, x, n) { | ||
// add weight at location x to nearest centroid. adding x to | ||
// nearest will not shift its relative position in the tree and | ||
// require reinsertion. | ||
// | ||
var newmean = c.mean + n * (x - c.mean) / (c.n + n); | ||
if (inplace) { | ||
c.mean = newmean; | ||
c.n += n; | ||
c.cumn += n / 2; | ||
this.n += n; | ||
} else { | ||
this.centroids.remove(c); | ||
this.n -= n; | ||
this._new_centroid(newmean, c.n + n, c.cumn + n / 2); | ||
} | ||
var newmean = nearest.mean + n * (x - nearest.mean) / (nearest.n + n); | ||
nearest.mean = newmean; | ||
nearest.n += n; | ||
nearest.cumn += n / 2; | ||
this.n += n; | ||
}; | ||
TDigest.prototype._centroid_quantile = function(c) { | ||
// quantile estimate for a centroid's mean is a simple special case | ||
// | ||
if (this.size() === 0) { | ||
return NaN; | ||
} else if (c === this.centroids.min()) { | ||
return 0.0; | ||
} else if (c === this.centroids.max()) { | ||
return 1.0; | ||
} else { | ||
return c.cumn / this.n; | ||
} | ||
}; | ||
TDigest.prototype._digest = function(x, n) { | ||
// incorporate value x, having count n into the TDigest. | ||
// | ||
var nearest = this.find_nearest(x); | ||
var inplace = (nearest.length <= 1 || nearest[0].mean === x); | ||
var cumn = (nearest.length > 0) ? nearest[0].cumn : 0; | ||
while (nearest.length > 0 && n > 0) { | ||
var c = pop_random(nearest); | ||
// even though all c's are the same distance, they have different | ||
// cumn's. get a fresh q, it affects boundary preservation | ||
var q = (cumn > 0) ? this._centroid_quantile(c) : 0; | ||
var max_n = Math.floor(4 * this.n * this.delta * q * (1 - q)); | ||
var room = max_n - c.n; | ||
if (room <= 0) { | ||
continue; | ||
// explicitly test for new max/min, rather than relying on 0/1 | ||
// quantile value in the max_n calculation. quantile does not | ||
// achieve 0 or 1 at bounding centroid because cumn at centroid | ||
// median is given half the centroid mass. | ||
var min = this.centroids.min(); | ||
var max = this.centroids.max(); | ||
if (this.size() === 0 || x < min.mean) { | ||
this._new_centroid(x, n, n / 2); // first or new min point | ||
} else if (max.mean < x ) { | ||
this._new_centroid(x, n, this.n + n / 2); // new max point | ||
} else { | ||
var nearest = this.find_nearest(x); | ||
if (nearest.mean === x) { | ||
// accumulate exact matches into the centroid without | ||
// limit. this is a change from the paper, made so that | ||
// centroid means remain unique. although the paper | ||
// allows for the creation of duplicate centroid means | ||
// (when a centroid is too full to absorb a new point) and | ||
// iterates over such duplicates, its quantile() and | ||
// icdf() do not properly account for them (and they turn | ||
// out to be messy to accommodate, for no apparent | ||
// benefit). this is much tidier. | ||
this._addweight(nearest, x, n); | ||
return; | ||
} | ||
var dn = Math.min(n, room); | ||
this._addweight(c, x, dn, inplace); | ||
n -= dn; | ||
var p = nearest.cumn / this.n; | ||
var max_n = Math.floor(4 * this.n * this.delta * p * (1 - p)); | ||
if (nearest != min && nearest != max && max_n - nearest.n >= n) { | ||
// only merge when nearest is not a boundary point and | ||
// there is room inside nearest to absorb all of x. if | ||
// there's not room, don't bother splitting some of x into | ||
// nearest, as we'll have to make a new centroid anyway | ||
// for the remainder (departure from the paper suggested | ||
// by uniqueness of centroids). | ||
this._addweight(nearest, x, n); | ||
} else { | ||
// create a new centroid at x | ||
this._new_centroid(x, n, nearest.cumn); // approximate cumn | ||
} | ||
this._cumulate(false); | ||
} | ||
if (n > 0) { | ||
// create a new centroid at x for the undigested counts. | ||
this._new_centroid(x, n, cumn + n / 2); // approximate cumn | ||
} | ||
this._cumulate(false); | ||
if (this.K && this.size() > this.K / this.delta) { | ||
// re-process the centroids and hope for some compression. | ||
this.redigest(); | ||
this.compress(); | ||
} | ||
@@ -247,42 +239,29 @@ }; | ||
TDigest.prototype.bound_mean = function(x) { | ||
// find rightmost centroids (in case of duplicate means) lower and | ||
// upper such that lower.mean <= x < upper.mean | ||
// find centroids lower and upper such that lower.mean < x < | ||
// upper.mean or lower.mean === x === upper.mean. Don't call | ||
// this for x out of bounds. | ||
// | ||
var iter = this.centroids.upperBound({mean:x}); // x < iter | ||
var c = iter.prev(); // c <= x | ||
var cnext = iter.next() ; // x < cnext | ||
while (iter.next() !== null && iter.data().mean === cnext.mean) { | ||
cnext = iter.data(); // walk the duplicates to find rightmost | ||
} | ||
return [c, cnext]; | ||
var lower = iter.prev(); // lower <= x | ||
var upper = (lower.mean === x) ? lower : iter.next(); | ||
return [lower, upper]; | ||
}; | ||
TDigest.prototype.bound_cumn = function(cumn) { | ||
// find centroids lower and upper such that lower.cumn <= cumn < upper.cumn | ||
// | ||
// XXX because the centroids have the same sort order by mean or | ||
// by cumn, swap out the comparator in our balanced tree then | ||
// search. sleazy! | ||
this.centroids._comparator = compare_centroid_cumns; | ||
var iter = this.centroids.upperBound({cumn:cumn}); // cumn < iter | ||
var c = iter.prev(); // c <= cumn | ||
var cnext = iter.next() ; // cumn < cnext, no worries about duplicates | ||
this.centroids._comparator = compare_centroid_means; | ||
return [c, cnext]; | ||
}; | ||
TDigest.prototype.quantile = function(x) { | ||
// return approximate quantile for value x, in the range 0..1. In | ||
// a small departure from the published algorithm, don't extrapolate | ||
// beyond endpoints (we do not expect c.n > 1 in the extreme | ||
// centroids, and don't want to invent density beyond them) | ||
// return approximate quantile (0..1) for data value x. The | ||
// handling of max and min data values differs from the paper, | ||
// which snaps them to 0 and 1. Here their centroids are not | ||
// treated specially, and the boundary centroids will report half | ||
// their weight. data values outside the observed range return 0 | ||
// or 1. | ||
// | ||
// this triggers cumulate() if cumn's are out of date. | ||
// | ||
if (this.size() === 0) { | ||
return NaN; | ||
} else if (x <= this.centroids.min().mean) { | ||
} else if (x < this.centroids.min().mean) { | ||
return 0.0; | ||
} else if (x >= this.centroids.max().mean) { | ||
} else if (x > this.centroids.max().mean) { | ||
return 1.0; | ||
} | ||
this._cumulate(true); // be sure cumns are exact | ||
// find centroids that bracket x and interpolate x's cumn from | ||
@@ -292,6 +271,8 @@ // their cumn's. | ||
var lower = bound[0], upper = bound[1]; | ||
var dxn = (upper.cumn - lower.cumn) * (x - lower.mean) / (upper.mean - lower.mean); | ||
// correct for endpoint weight truncation. Since we expect the extremes | ||
// to have a single point each, expect to lose a half from each. | ||
return (lower.cumn + dxn - 0.5) / (this.n - 1); | ||
this._cumulate(true); // be sure cumns are exact | ||
var cumn = lower.cumn; | ||
if (lower !== upper) { | ||
cumn += (upper.cumn - lower.cumn) * (x - lower.mean) / (upper.mean - lower.mean); | ||
} | ||
return cumn / this.n; | ||
}; | ||
@@ -304,15 +285,34 @@ | ||
TDigest.prototype.bound_cumn = function(cumn) { | ||
// find centroids lower and upper such that lower.cumn < cumn < | ||
// upper.cumn or lower.cumn === cumn === upper.cumn. Don't call | ||
// this for cumn out of bounds. | ||
// | ||
// XXX because mean and cumn give rise to the same sort order | ||
// (up to identical means), use the mean rbtree for our cumn search. | ||
this.centroids._comparator = compare_centroid_cumns; | ||
var iter = this.centroids.upperBound({cumn:cumn}); // cumn < iter | ||
this.centroids._comparator = compare_centroid_means; | ||
var lower = iter.prev(); // lower <= cumn | ||
var upper = (lower.cumn === cumn) ? lower : iter.next(); | ||
return [lower, upper]; | ||
}; | ||
TDigest.prototype.percentile = function(p) { | ||
// return the approximate x value for the specified percentile. | ||
// As with quantiles, map between [0..1] and the observed range of data. | ||
// return the approximate data value q at the specified percentile (0..1). | ||
// also known as the inverse cdf. | ||
// | ||
var cumn = p * (this.n - 1) + 0.5; // correct for endweight truncation | ||
// this triggers cumulate() if cumn's are out of date. | ||
// | ||
this._cumulate(true); // be sure cumns are exact | ||
var cumn = p * this.n; | ||
var min = this.centroids.min(); | ||
var max = this.centroids.max(); | ||
if (this.size() === 0) { | ||
return NaN; | ||
} else if (cumn <= 0.5) { | ||
return this.centroids.min().mean; | ||
} else if (cumn >= this.n - 0.5) { | ||
return this.centroids.max().mean; | ||
} else if (cumn <= min.cumn) { | ||
return min.mean; | ||
} else if (cumn >= max.cumn) { | ||
return max.mean; | ||
} | ||
this._cumulate(true); // be sure cumns are exact | ||
// find centroids whose cumns bracket cumn, then interpolate x | ||
@@ -322,4 +322,7 @@ // from their means. | ||
var lower = bound[0], upper = bound[1]; | ||
var dx = (upper.mean - lower.mean) * (cumn - lower.cumn) / (upper.cumn - lower.cumn); | ||
return lower.mean + dx; | ||
var q = lower.mean; | ||
if (lower !== upper) { | ||
q += (upper.mean - lower.mean) * (cumn - lower.cumn) / (upper.cumn - lower.cumn); | ||
} | ||
return q; | ||
}; | ||
@@ -332,3 +335,3 @@ | ||
TDigest.prototype.redigest = function() { | ||
TDigest.prototype.compress = function() { | ||
// TDigests experience worst case compression (none) when input | ||
@@ -339,9 +342,15 @@ // increases monotonically. Improve on any bad luck by | ||
// | ||
var points = this.asArray(); | ||
if (this.compressing) { | ||
return; | ||
} | ||
var points = this.toArray(); | ||
this.reset(); | ||
this.compressing = true; | ||
this.push_centroid(points.shift()); // min | ||
this.push_centroid(points.pop()); // max | ||
while (points.length > 0) { | ||
var c = pop_random(points); | ||
this.digest(c.mean, c.n); | ||
this.push_centroid(pop_random(points)); | ||
} | ||
this._cumulate(true); | ||
this.compressing = false; | ||
}; | ||
@@ -348,0 +357,0 @@ |
Sorry, the diff of this file is not supported yet
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
27410
9
598
77
5
1