tdigest
Advanced tools
Comparing version 0.0.4 to 0.0.5
@@ -10,3 +10,4 @@ var TDigest = require('./tdigest').TDigest; | ||
}; | ||
tdigest.digest(x); | ||
tdigest.push(x); | ||
tdigest.compress(x); | ||
console.log(tdigest.summary()); | ||
@@ -13,0 +14,0 @@ for (var p = 0 ; p <= 1.0 ; p += 0.1) { |
{ | ||
"name": "tdigest", | ||
"version": "0.0.4", | ||
"version": "0.0.5", | ||
"description": "javascript implementation of Dunning's T-Digest for streaming quantile approximation", | ||
"main": "tdigest.js", | ||
"scripts": { | ||
"test": "mocha test.js" | ||
"test": "mocha specs" | ||
}, | ||
@@ -30,5 +30,8 @@ "repository": { | ||
"devDependencies": { | ||
"assert": "*", | ||
"better-assert": "^1.0.2", | ||
"mocha": "^2.2.5" | ||
"grunt": "^0.4.5", | ||
"grunt-dry": "^0.3.0", | ||
"mocha": "^2.1.0" | ||
} | ||
} |
@@ -1,3 +0,3 @@ | ||
# tdigest | ||
tdigest | ||
============ | ||
[![Build Status](https://travis-ci.org/welch/tdigest.svg?branch=master)](https://travis-ci.org/welch/tdigest) [![NPM version](http://img.shields.io/npm/v/tdigest.svg)](https://www.npmjs.org/package/tdigest) | ||
@@ -20,4 +20,24 @@ | ||
## Example | ||
changes since 0.0.4: | ||
-------------------- | ||
#### API Overhaul: | ||
* asArray() -> toArray() | ||
* redigest() -> compact() | ||
* digest() -> push() | ||
* pushing an array no longer triggers compaction | ||
#### UMD wrappers | ||
A grunt build task has been added to create a UMD-wrapped version of tdigest | ||
and dependencies for importing as a standalone module in client-side javascript. | ||
quickstart | ||
------------ | ||
#### node.js: | ||
``` | ||
npm install tdigest | ||
``` | ||
```javascript | ||
var TDigest = require('tdigest').TDigest; | ||
@@ -29,3 +49,4 @@ var x=[], N = 100000; | ||
tdigest = new TDigest(); | ||
tdigest.digest(x); | ||
tdigest.push(x); | ||
tdigest.compress(x); | ||
console.log(tdigest.summary()); | ||
@@ -37,3 +58,19 @@ console.log("median ~ "+tdigest.percentile(0.5)); | ||
## Dependencies | ||
#### In the browser: | ||
The following grunt tasks are configured (via the [grunt-dry](https://www.npmjs.com/package/grunt-dry) dev dependency) to generate | ||
a [UMD-wrapped](https://github.com/umdjs/umd) version of tdigest that can be loaded through a variety of front-end | ||
module systems: | ||
**grunt build** | ||
Uses [grunt-pure-cjs](https://github.com/RReverser/grunt-pure-cjs) to generate browser/tdigest.js and browser/specs/*.spec.js by bundling the commonjs files into a single file for both the module itself and any mocha spec files. | ||
**grunt test** | ||
Runs unit tests using server-side mocha in node.js from `specs/*.js` and in browser using `browser/specs/*.js`. | ||
(the npm test script is configured to run this as well) | ||
dependencies | ||
------------- | ||
`bintrees`: [https://www.npmjs.com/package/bintrees](https://www.npmjs.com/package/bintrees) | ||
@@ -40,0 +77,0 @@ |
329
tdigest.js
@@ -25,13 +25,2 @@ var RBTree = require('bintrees').RBTree; | ||
TDigest.prototype.summary = function() { | ||
var s = ["approximating "+this.n+" samples using "+ | ||
this.size()+" centroids, delta="+this.delta+", CX="+this.CX, | ||
"min = "+this.percentile(0), | ||
"Q1 = "+this.percentile(0.25), | ||
"Q2 = "+this.percentile(0.5), | ||
"Q3 = "+this.percentile(0.75), | ||
"max = "+this.percentile(1.0)]; | ||
return s.join('\n'); | ||
}; | ||
TDigest.prototype.reset = function() { | ||
@@ -50,2 +39,26 @@ // prepare to digest new points. | ||
TDigest.prototype.toArray = function(everything) { | ||
// return {mean,n} of centroids as an array ordered by mean. | ||
// | ||
var result = []; | ||
if (everything) { | ||
this._cumulate(true); // be sure cumns are exact | ||
this.centroids.each(function(c) { result.push(c); }); | ||
} else { | ||
this.centroids.each(function(c) { result.push({mean:c.mean, n:c.n}); }); | ||
} | ||
return result; | ||
}; | ||
TDigest.prototype.summary = function() { | ||
var s = ["approximating "+this.n+" samples using "+ | ||
this.size()+" centroids, delta="+this.delta+", CX="+this.CX, | ||
"min = "+this.percentile(0), | ||
"Q1 = "+this.percentile(0.25), | ||
"Q2 = "+this.percentile(0.5), | ||
"Q3 = "+this.percentile(0.75), | ||
"max = "+this.percentile(1.0)]; | ||
return s.join('\n'); | ||
}; | ||
function compare_centroid_means(a, b) { | ||
@@ -55,14 +68,6 @@ // order two centroids by mean. | ||
if (a === null) { | ||
// XXX super-narrow workaround for | ||
// https://github.com/vadimg/js_bintrees/pull/14 | ||
// XXX super-narrow workaround for https://github.com/vadimg/js_bintrees/pull/14 | ||
return NaN; | ||
} | ||
if (a.mean !== b.mean) { | ||
return (a.mean - b.mean); | ||
} else if (a.cumn !== undefined && b.cumn !== undefined) { | ||
// RBtrees can't accommodate duplicates, use cumn as tiebreaker. | ||
return (a.cumn - b.cumn); | ||
} else { | ||
return 0 ; // equality is ok during searches | ||
} | ||
return a.mean - b.mean; | ||
} | ||
@@ -74,4 +79,3 @@ | ||
if (a === null) { | ||
// XXX super-narrow workaround for | ||
// https://github.com/vadimg/js_bintrees/pull/14 | ||
// XXX super-narrow workaround for https://github.com/vadimg/js_bintrees/pull/14 | ||
return NaN; | ||
@@ -90,30 +94,22 @@ } | ||
TDigest.prototype.asArray = function(everything) { | ||
// return {mean,n} of centroids as an array ordered by mean. | ||
TDigest.prototype.push = function(x, n) { | ||
// incorporate value or array of values x, having count n into the | ||
// TDigest. n defaults to 1. | ||
// | ||
var result = []; | ||
if (everything) { | ||
this._cumulate(true); // be sure cumns are exact | ||
this.centroids.each(function(c) { result.push(c); }); | ||
} else { | ||
this.centroids.each(function(c) { result.push({mean:c.mean, n:c.n}); }); | ||
} | ||
return result; | ||
}; | ||
TDigest.prototype.digest = function(x, n) { | ||
// incorporate value or array of values x, having count n into the | ||
// TDigest. n defaults to 1. If called with an array, recompress after | ||
// digesting the items. | ||
n = n || 1; | ||
if (Array.isArray(x)) { | ||
for (var i = 0 ; i < x.length ; i++) { | ||
this._digest(x[i], n); | ||
} | ||
this.redigest(); | ||
} else { | ||
this._digest(x, n); | ||
x = Array.isArray(x) ? x : [x]; | ||
for (var i = 0 ; i < x.length ; i++) { | ||
this._digest(x[i], n); | ||
} | ||
}; | ||
TDigest.prototype.push_centroid = function(c) { | ||
// incorporate centroid or array of centroids c | ||
// | ||
c = Array.isArray(c) ? c : [c]; | ||
for (var i = 0 ; i < c.length ; i++) { | ||
this._digest(c[i].mean, c[i].n); | ||
} | ||
} | ||
TDigest.prototype._cumulate = function(exact) { | ||
@@ -141,25 +137,18 @@ // update cumulative counts for each centroid | ||
TDigest.prototype.find_nearest = function(x) { | ||
// find the centroid(s) whose means are closest to x (there can be | ||
// multiples because of per-centroid count limits, particularly in | ||
// categorical streams). | ||
// find the centroid closest to x. The assumption of | ||
// unique means and a unique nearest centroid differs from the | ||
// paper, see _digest() below | ||
// | ||
if (this.size() === 0) { | ||
return null; | ||
} | ||
var iter = this.centroids.upperBound({mean:x}); // x < iter || iter==null | ||
var c = (iter.data() === null) ? iter.prev() : iter.data(); | ||
if (c === null) { | ||
return []; | ||
} | ||
// walk the duplicates to find rightmost bound | ||
while (iter.next() !== null && iter.data().mean === c.mean) { | ||
} | ||
// walk backwards looking for closest centroids. | ||
c = iter.prev(); | ||
var min = Math.abs(c.mean - x); | ||
var nearest = [c]; | ||
// walk backwards looking for closest centroid. | ||
var nearest = c; | ||
var mindist = Math.abs(nearest.mean - x); | ||
var dx; | ||
while ((c = iter.prev()) && (dx = Math.abs(c.mean - x)) <= min) { | ||
if (dx < min) { | ||
min = dx; | ||
nearest = []; | ||
} | ||
nearest.push(c); | ||
while ((c = iter.prev()) && (dx = Math.abs(c.mean - x)) < mindist) { | ||
mindist = dx; | ||
nearest = c; | ||
} | ||
@@ -178,66 +167,69 @@ return nearest; | ||
// | ||
var c = {mean:x, n:n, cumn:this.n}; | ||
var c = {mean:x, n:n, cumn:this.n+n}; | ||
this.centroids.insert(c); | ||
c.cumn = cumn; | ||
this.n += n; | ||
return c; | ||
}; | ||
TDigest.prototype._addweight = function(c, x, n, inplace) { | ||
// add weight at location x to centroid c | ||
TDigest.prototype._addweight = function(nearest, x, n) { | ||
// add weight at location x to nearest centroid. adding x to | ||
// nearest will not shift its relative position in the tree and | ||
// require reinsertion. | ||
// | ||
var newmean = c.mean + n * (x - c.mean) / (c.n + n); | ||
if (inplace) { | ||
c.mean = newmean; | ||
c.n += n; | ||
c.cumn += n / 2; | ||
this.n += n; | ||
} else { | ||
this.centroids.remove(c); | ||
this.n -= n; | ||
this._new_centroid(newmean, c.n + n, c.cumn + n / 2); | ||
} | ||
var newmean = nearest.mean + n * (x - nearest.mean) / (nearest.n + n); | ||
nearest.mean = newmean; | ||
nearest.n += n; | ||
nearest.cumn += n / 2; | ||
this.n += n; | ||
}; | ||
TDigest.prototype._centroid_quantile = function(c) { | ||
// quantile estimate for a centroid's mean is a simple special case | ||
// | ||
if (this.size() === 0) { | ||
return NaN; | ||
} else if (c === this.centroids.min()) { | ||
return 0.0; | ||
} else if (c === this.centroids.max()) { | ||
return 1.0; | ||
} else { | ||
return c.cumn / this.n; | ||
} | ||
}; | ||
TDigest.prototype._digest = function(x, n) { | ||
// incorporate value x, having count n into the TDigest. | ||
// | ||
var nearest = this.find_nearest(x); | ||
var inplace = (nearest.length <= 1 || nearest[0].mean === x); | ||
var cumn = (nearest.length > 0) ? nearest[0].cumn : 0; | ||
while (nearest.length > 0 && n > 0) { | ||
var c = pop_random(nearest); | ||
// even though all c's are the same distance, they have different | ||
// cumn's. get a fresh q, it affects boundary preservation | ||
var q = (cumn > 0) ? this._centroid_quantile(c) : 0; | ||
var max_n = Math.floor(4 * this.n * this.delta * q * (1 - q)); | ||
var room = max_n - c.n; | ||
if (room <= 0) { | ||
continue; | ||
// explicitly test for new max/min, rather than relying on 0/1 | ||
// quantile value in the max_n calculation. quantile does not | ||
// achieve 0 or 1 at bounding centroid because cumn at centroid | ||
// median is given half the centroid mass. | ||
var min = this.centroids.min(); | ||
var max = this.centroids.max(); | ||
if (this.size() === 0 || x < min.mean) { | ||
this._new_centroid(x, n, n / 2); // first or new min point | ||
} else if (max.mean < x ) { | ||
this._new_centroid(x, n, this.n + n / 2); // new max point | ||
} else { | ||
var nearest = this.find_nearest(x); | ||
if (nearest.mean === x) { | ||
// accumulate exact matches into the centroid without | ||
// limit. this is a change from the paper, made so that | ||
// centroid means remain unique. although the paper | ||
// allows for the creation of duplicate centroid means | ||
// (when a centroid is too full to absorb a new point) and | ||
// iterates over such duplicates, its quantile() and | ||
// icdf() do not properly account for them (and they turn | ||
// out to be messy to accommodate, for no apparent | ||
// benefit). this is much tidier. | ||
this._addweight(nearest, x, n); | ||
return; | ||
} | ||
var dn = Math.min(n, room); | ||
this._addweight(c, x, dn, inplace); | ||
n -= dn; | ||
var p = nearest.cumn / this.n; | ||
var max_n = Math.floor(4 * this.n * this.delta * p * (1 - p)); | ||
if (nearest != min && nearest != max && max_n - nearest.n >= n) { | ||
// only merge when nearest is not a boundary point and | ||
// there is room inside nearest to absorb all of x. if | ||
// there's not room, don't bother splitting some of x into | ||
// nearest, as we'll have to make a new centroid anyway | ||
// for the remainder (departure from the paper suggested | ||
// by uniqueness of centroids). | ||
this._addweight(nearest, x, n); | ||
} else { | ||
// create a new centroid at x | ||
this._new_centroid(x, n, nearest.cumn); // approximate cumn | ||
} | ||
this._cumulate(false); | ||
} | ||
if (n > 0) { | ||
// create a new centroid at x for the undigested counts. | ||
this._new_centroid(x, n, cumn + n / 2); // approximate cumn | ||
} | ||
this._cumulate(false); | ||
if (this.K && this.size() > this.K / this.delta) { | ||
// re-process the centroids and hope for some compression. | ||
this.redigest(); | ||
this.compress(); | ||
} | ||
@@ -247,42 +239,29 @@ }; | ||
TDigest.prototype.bound_mean = function(x) { | ||
// find rightmost centroids (in case of duplicate means) lower and | ||
// upper such that lower.mean <= x < upper.mean | ||
// find centroids lower and upper such that lower.mean < x < | ||
// upper.mean or lower.mean === x === upper.mean. Don't call | ||
// this for x out of bounds. | ||
// | ||
var iter = this.centroids.upperBound({mean:x}); // x < iter | ||
var c = iter.prev(); // c <= x | ||
var cnext = iter.next() ; // x < cnext | ||
while (iter.next() !== null && iter.data().mean === cnext.mean) { | ||
cnext = iter.data(); // walk the duplicates to find rightmost | ||
} | ||
return [c, cnext]; | ||
var lower = iter.prev(); // lower <= x | ||
var upper = (lower.mean === x) ? lower : iter.next(); | ||
return [lower, upper]; | ||
}; | ||
TDigest.prototype.bound_cumn = function(cumn) { | ||
// find centroids lower and upper such that lower.cumn <= cumn < upper.cumn | ||
// | ||
// XXX because the centroids have the same sort order by mean or | ||
// by cumn, swap out the comparator in our balanced tree then | ||
// search. sleazy! | ||
this.centroids._comparator = compare_centroid_cumns; | ||
var iter = this.centroids.upperBound({cumn:cumn}); // cumn < iter | ||
var c = iter.prev(); // c <= cumn | ||
var cnext = iter.next() ; // cumn < cnext, no worries about duplicates | ||
this.centroids._comparator = compare_centroid_means; | ||
return [c, cnext]; | ||
}; | ||
TDigest.prototype.quantile = function(x) { | ||
// return approximate quantile for value x, in the range 0..1. In | ||
// a small departure from the published algorithm, don't extrapolate | ||
// beyond endpoints (we do not expect c.n > 1 in the extreme | ||
// centroids, and don't want to invent density beyond them) | ||
// return approximate quantile (0..1) for data value x. The | ||
// handling of max and min data values differs from the paper, | ||
// which snaps them to 0 and 1. Here their centroids are not | ||
// treated specially, and the boundary centroids will report half | ||
// their weight. data values outside the observed range return 0 | ||
// or 1. | ||
// | ||
// this triggers cumulate() if cumn's are out of date. | ||
// | ||
if (this.size() === 0) { | ||
return NaN; | ||
} else if (x <= this.centroids.min().mean) { | ||
} else if (x < this.centroids.min().mean) { | ||
return 0.0; | ||
} else if (x >= this.centroids.max().mean) { | ||
} else if (x > this.centroids.max().mean) { | ||
return 1.0; | ||
} | ||
this._cumulate(true); // be sure cumns are exact | ||
// find centroids that bracket x and interpolate x's cumn from | ||
@@ -292,6 +271,8 @@ // their cumn's. | ||
var lower = bound[0], upper = bound[1]; | ||
var dxn = (upper.cumn - lower.cumn) * (x - lower.mean) / (upper.mean - lower.mean); | ||
// correct for endpoint weight truncation. Since we expect the extremes | ||
// to have a single point each, expect to lose a half from each. | ||
return (lower.cumn + dxn - 0.5) / (this.n - 1); | ||
this._cumulate(true); // be sure cumns are exact | ||
var cumn = lower.cumn; | ||
if (lower !== upper) { | ||
cumn += (upper.cumn - lower.cumn) * (x - lower.mean) / (upper.mean - lower.mean); | ||
} | ||
return cumn / this.n; | ||
}; | ||
@@ -304,15 +285,34 @@ | ||
TDigest.prototype.bound_cumn = function(cumn) { | ||
// find centroids lower and upper such that lower.cumn < cumn < | ||
// upper.cumn or lower.cumn === cumn === upper.cumn. Don't call | ||
// this for cumn out of bounds. | ||
// | ||
// XXX because mean and cumn give rise to the same sort order | ||
// (up to identical means), use the mean rbtree for our cumn search. | ||
this.centroids._comparator = compare_centroid_cumns; | ||
var iter = this.centroids.upperBound({cumn:cumn}); // cumn < iter | ||
this.centroids._comparator = compare_centroid_means; | ||
var lower = iter.prev(); // lower <= cumn | ||
var upper = (lower.cumn === cumn) ? lower : iter.next(); | ||
return [lower, upper]; | ||
}; | ||
TDigest.prototype.percentile = function(p) { | ||
// return the approximate x value for the specified percentile. | ||
// As with quantiles, map between [0..1] and the observed range of data. | ||
// return the approximate data value q at the specified percentile (0..1). | ||
// also known as the inverse cdf. | ||
// | ||
var cumn = p * (this.n - 1) + 0.5; // correct for endweight truncation | ||
// this triggers cumulate() if cumn's are out of date. | ||
// | ||
this._cumulate(true); // be sure cumns are exact | ||
var cumn = p * this.n; | ||
var min = this.centroids.min(); | ||
var max = this.centroids.max(); | ||
if (this.size() === 0) { | ||
return NaN; | ||
} else if (cumn <= 0.5) { | ||
return this.centroids.min().mean; | ||
} else if (cumn >= this.n - 0.5) { | ||
return this.centroids.max().mean; | ||
} else if (cumn <= min.cumn) { | ||
return min.mean; | ||
} else if (cumn >= max.cumn) { | ||
return max.mean; | ||
} | ||
this._cumulate(true); // be sure cumns are exact | ||
// find centroids whose cumns bracket cumn, then interpolate x | ||
@@ -322,4 +322,7 @@ // from their means. | ||
var lower = bound[0], upper = bound[1]; | ||
var dx = (upper.mean - lower.mean) * (cumn - lower.cumn) / (upper.cumn - lower.cumn); | ||
return lower.mean + dx; | ||
var q = lower.mean; | ||
if (lower !== upper) { | ||
q += (upper.mean - lower.mean) * (cumn - lower.cumn) / (upper.cumn - lower.cumn); | ||
} | ||
return q; | ||
}; | ||
@@ -332,3 +335,3 @@ | ||
TDigest.prototype.redigest = function() { | ||
TDigest.prototype.compress = function() { | ||
// TDigests experience worst case compression (none) when input | ||
@@ -339,9 +342,15 @@ // increases monotonically. Improve on any bad luck by | ||
// | ||
var points = this.asArray(); | ||
if (this.compressing) { | ||
return; | ||
} | ||
var points = this.toArray(); | ||
this.reset(); | ||
this.compressing = true; | ||
this.push_centroid(points.shift()); // min | ||
this.push_centroid(points.pop()); // max | ||
while (points.length > 0) { | ||
var c = pop_random(points); | ||
this.digest(c.mean, c.n); | ||
this.push_centroid(pop_random(points)); | ||
} | ||
this._cumulate(true); | ||
this.compressing = false; | ||
}; | ||
@@ -348,0 +357,0 @@ |
Sorry, the diff of this file is not supported yet
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
27410
9
598
77
5
1