tdigest
Advanced tools
Comparing version 0.0.1 to 0.0.2
var TDigest = require('./tdigest').TDigest; | ||
function errtest() { | ||
var tdigest = new TDigest(.01,25,1.1); | ||
var i, x=[], N = 100000; | ||
var i, x=[], N = 1000000; | ||
// draw N from [0..10] | ||
@@ -6,0 +6,0 @@ for (i = 0 ; i < N ; i += 1) { |
{ | ||
"name": "tdigest", | ||
"version": "0.0.1", | ||
"version": "0.0.2", | ||
"description": "javascript implementation of Dunning's T-Digest for streaming quantile approximation", | ||
@@ -5,0 +5,0 @@ "main": "tdigest.js", |
@@ -1,2 +0,3 @@ | ||
# tdigest | ||
tdigest [![Build Status](https://secure.travis-ci.org/welch/tdigest.png?branch=master)](http://travis-ci.org/welch/tdigest) | ||
======== | ||
Javascript implementation of Dunning's T-Digest for streaming quantile approximation | ||
@@ -3,0 +4,0 @@ |
@@ -20,3 +20,3 @@ var RBTree = require('bintrees').RBTree; | ||
this.CX = (CX === undefined) ? 1.1 : CX; | ||
this.centroids = new RBTree(compare_centroids); | ||
this.centroids = new RBTree(compare_centroid_means); | ||
this.nreset = 0; | ||
@@ -50,3 +50,3 @@ this.reset(); | ||
function compare_centroids(a, b) { | ||
function compare_centroid_means(a, b) { | ||
// order two centroids by mean. | ||
@@ -64,2 +64,8 @@ // | ||
function compare_centroid_cumns(a, b) { | ||
// order two centroids by cumn. | ||
// | ||
return (a.cumn - b.cumn); | ||
} | ||
function pop_random(choices) { | ||
@@ -123,20 +129,20 @@ // remove and return an item randomly chosen from the array of choices | ||
TDigest.prototype.find_nearest = function(x) { | ||
// find the centroid(s) whose means are closest to x | ||
// (there can be multiples because of per-centroid count limits). | ||
// find the centroid(s) whose means are closest to x (there can be | ||
// multiples because of per-centroid count limits, particularly in | ||
// categorical streams). | ||
// | ||
var iter = this.centroids.upperBound({mean:x}); // x < iter || iter==null | ||
var upper = iter.data(); | ||
if (upper !== null) { | ||
// jump to the end of string of duplicates | ||
iter = this.centroids.upperBound({mean:upper.mean}); | ||
} | ||
var prev = (iter.data() === null) ? iter.prev() : iter.data(); | ||
if (prev === null) { | ||
var c = (iter.data() === null) ? iter.prev() : iter.data(); | ||
if (c === null) { | ||
return []; | ||
} | ||
// walk backwards looking for closer centroids | ||
var min = Math.abs(prev.mean - x); | ||
var nearest = [prev]; | ||
// walk the duplicates to find rightmost bound | ||
while (iter.next() !== null && iter.data().mean === c.mean) { | ||
} | ||
// walk backwards looking for closest centroids. | ||
c = iter.prev(); | ||
var min = Math.abs(c.mean - x); | ||
var nearest = [c]; | ||
var dx; | ||
while ((prev = iter.prev()) && (dx = Math.abs(prev.mean - x)) <= min) { | ||
while ((c = iter.prev()) && (dx = Math.abs(c.mean - x)) <= min) { | ||
if (dx < min) { | ||
@@ -146,3 +152,3 @@ min = dx; | ||
} | ||
nearest.push(prev); | ||
nearest.push(c); | ||
} | ||
@@ -184,3 +190,3 @@ return nearest; | ||
TDigest.prototype._centroid_quantile = function(c) { | ||
// quick-and-dirty quantile estimate for centroid's mean. | ||
// quantile estimate for a centroid's mean is a simple special case | ||
// | ||
@@ -229,2 +235,29 @@ if (this.size() === 0) { | ||
TDigest.prototype.bound_mean = function(x) { | ||
// find rightmost centroids (in case of duplicate means) lower and | ||
// upper such that lower.mean <= x < upper.mean | ||
// | ||
var iter = this.centroids.upperBound({mean:x}); // x < iter | ||
var c = iter.prev(); // c <= x | ||
var cnext = iter.next() ; // x < cnext | ||
while (iter.next() !== null && iter.data().mean === cnext.mean) { | ||
cnext = iter.data(); // walk the duplicates to find rightmost | ||
} | ||
return [c, cnext]; | ||
}; | ||
TDigest.prototype.bound_cumn = function(cumn) { | ||
// find centroids lower and upper such that lower.cumn <= cumn < upper.cumn | ||
// | ||
// XXX because the centroids have the same sort order by mean or | ||
// by cumn, swap out the comparator in our balanced tree then | ||
// search. sleazy! | ||
this.centroids._comparator = compare_centroid_cumns; | ||
var iter = this.centroids.upperBound({cumn:cumn}); // cumn < iter | ||
var c = iter.prev(); // c <= cumn | ||
var cnext = iter.next() ; // cumn < cnext, no worries about duplicates | ||
this.centroids._comparator = compare_centroid_means; | ||
return [c, cnext]; | ||
}; | ||
TDigest.prototype.quantile = function(x) { | ||
@@ -246,12 +279,8 @@ // return approximate quantile for value x, in the range 0..1. In | ||
// their cumn's. | ||
var iter = this.centroids.upperBound({mean:x}); // x.mean < iter | ||
var c = iter.prev(); // c.mean <= x.mean | ||
var cnext = iter.next() ; // x.mean < cnext.mean | ||
while (iter.next() !== null && iter.data().mean === cnext.mean) { | ||
cnext = iter.data(); // walk the duplicates to find max cumn | ||
} | ||
var dxn = (cnext.cumn - c.cumn) * (x - c.mean) / (cnext.mean - c.mean); | ||
var bound = this.bound_mean(x); | ||
var lower = bound[0], upper = bound[1]; | ||
var dxn = (upper.cumn - lower.cumn) * (x - lower.mean) / (upper.mean - lower.mean); | ||
// correct for endpoint weight truncation. Since we expect the extremes | ||
// to have a single point each, expect to lose a half from each. | ||
return (c.cumn + dxn - 0.5) / (this.n - 1); | ||
return (lower.cumn + dxn - 0.5) / (this.n - 1); | ||
}; | ||
@@ -278,14 +307,10 @@ | ||
// find centroids whose cumns bracket target, then interpolate x | ||
// from their means. you could binary search since cumns are sorted | ||
// and distinct, but let's just walk. | ||
var iter = this.centroids.iterator(); | ||
var c = iter.data(), cprev, cnext; | ||
while ((cnext = iter.next()) && cnext.cumn <= target) { | ||
c = cnext; | ||
// from their means. | ||
var bound = this.bound_cumn(target); | ||
var lower = bound[0], upper = bound[1]; | ||
if (upper === null) { | ||
return this.centroids.max().mean ; // ran off the edge | ||
} | ||
if (cnext === null) { | ||
return this.centroids.max().mean; // ran off the edge | ||
} | ||
var dx = (cnext.mean - c.mean) * (target - c.cumn) / (cnext.cumn - c.cumn); | ||
return c.mean + dx; | ||
var dx = (upper.mean - lower.mean) * (target - lower.cumn) / (upper.cumn - lower.cumn); | ||
return lower.mean + dx; | ||
}; | ||
@@ -292,0 +317,0 @@ |
17
test.js
@@ -105,2 +105,19 @@ var deepEqual = require('assert').deepEqual; | ||
}); | ||
it('handles duplicates', function(){ | ||
var tdigest = new TDigest(1,0,0); | ||
var i, N = 10; | ||
for (i = 0 ; i < N ; i++) { | ||
tdigest.digest(0.0); | ||
tdigest.digest(1.0); | ||
tdigest.digest(0.5); | ||
}; | ||
deepEqual( | ||
tdigest.asArray(), | ||
[{mean:0.0, n:1}, | ||
{mean:0.0, n:N-1}, | ||
{mean:0.5, n:N}, | ||
{mean:1.0, n:N-1}, | ||
{mean:1.0, n:1}] | ||
); | ||
}); | ||
}); | ||
@@ -107,0 +124,0 @@ |
Sorry, the diff of this file is not supported yet
48905
576
38