Comparing version 2.0.0 to 3.0.0
@@ -0,1 +1,21 @@ | ||
<a name="3.0.0"></a> | ||
# [3.0.0](https://github.com/mljs/kmeans/compare/v2.0.0...v3.0.0) (2016-10-07) | ||
### Bug Fixes | ||
* remove unneeded array creations in updateClusterID ([b2957d5](https://github.com/mljs/kmeans/commit/b2957d5)) | ||
### Features | ||
* accepts distance function as parameter ([68993a1](https://github.com/mljs/kmeans/commit/68993a1)) | ||
* allows to compute a new array of points their cluster id ([254f9a9](https://github.com/mljs/kmeans/commit/254f9a9)) | ||
* return a generator in the withIterations option ([3b4c7c6](https://github.com/mljs/kmeans/commit/3b4c7c6)) | ||
* return the number of iteration done ([75fd42e](https://github.com/mljs/kmeans/commit/75fd42e)) | ||
* returns the error and the size of each cluster ([e85aab7](https://github.com/mljs/kmeans/commit/e85aab7)) | ||
* when maxIterations is 0, iterates until converge ([220c828](https://github.com/mljs/kmeans/commit/220c828)) | ||
<a name="2.0.0"></a> | ||
@@ -2,0 +22,0 @@ # [2.0.0](https://github.com/mljs/kmeans/compare/v1.0.0...v2.0.0) (2016-09-27) |
{ | ||
"name": "ml-kmeans", | ||
"version": "2.0.0", | ||
"version": "3.0.0", | ||
"description": "K-means in Javascript", | ||
"main": "src/kmeans.js", | ||
"directories": { | ||
"lib": "src", | ||
"test": "test" | ||
}, | ||
"files": [ | ||
"src", | ||
"tonic.js" | ||
], | ||
"scripts": { | ||
@@ -49,4 +49,5 @@ "eslint": "eslint src test", | ||
"RandomSelection": "^1.0.1", | ||
"ml-distance-euclidean": "^1.0.0" | ||
"ml-distance-euclidean": "^1.0.0", | ||
"ml-nearest-vector": "^1.0.0" | ||
} | ||
} |
@@ -1,2 +0,2 @@ | ||
# clustering | ||
# kmeans | ||
@@ -9,4 +9,6 @@ [![NPM version][npm-image]][npm-url] | ||
[K-means](https://en.wikipedia.org/wiki/K-means_clustering) in JavaScript | ||
> [K-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) in JavaScript | ||
K-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean. | ||
## Installation | ||
@@ -18,2 +20,24 @@ | ||
## Example | ||
```js | ||
const kmeans = require('ml-kmeans'); | ||
let data = [[1, 1, 1], [1, 2, 1], [-1, -1, -1], [-1, -1, -1.5]]; | ||
let centers = [[1, 2, 1], [-1, -1, -1]]; | ||
let ans = kmeans(data, 2, {initialization: centers}); | ||
console.log(ans); | ||
/* | ||
KMeansResult { | ||
clusters: [ 0, 0, 1, 1 ], | ||
centroids: | ||
[ { centroid: [ 1, 1.5, 1 ], error: 0.25, size: 2 }, | ||
{ centroid: [ -1, -1, -1.25 ], error: 0.0625, size: 2 } ], | ||
converged: true, | ||
iterations: 1 | ||
} | ||
*/ | ||
``` | ||
## Test | ||
@@ -20,0 +44,0 @@ |
'use strict'; | ||
const distance = require('ml-distance-euclidean'); | ||
const Picker = require('RandomSelection').Picker; | ||
@@ -28,5 +27,6 @@ | ||
* @param {Number} K - Number of clusters | ||
* @param {Array<Array<Number>>} distanceMatrix - matrix with the distance values | ||
* @return {Array<Array<Number>>} - Initial random points | ||
*/ | ||
function mostDistant(data, K) { | ||
function mostDistant(data, K, distanceMatrix) { | ||
var ans = new Array(K); | ||
@@ -37,18 +37,2 @@ | ||
// calculate distance matrix | ||
var distanceMatrix = new Array(data.length); | ||
for (var i = 0; i < data.length; ++i) { | ||
for (var j = i; j < data.length; ++j) { | ||
if (!distanceMatrix[i]) { | ||
distanceMatrix[i] = new Array(data.length); | ||
} | ||
if (!distanceMatrix[j]) { | ||
distanceMatrix[j] = new Array(data.length); | ||
} | ||
const dist = distance(data[i], data[j]); | ||
distanceMatrix[i][j] = dist; | ||
distanceMatrix[j][i] = dist; | ||
} | ||
} | ||
if (K > 1) { | ||
@@ -55,0 +39,0 @@ // chooses the more distant point |
@@ -5,2 +5,4 @@ 'use strict'; | ||
const init = require('./initialization'); | ||
const KMeansResult = require('./KMeansResult'); | ||
const squaredDistance = require('ml-distance-euclidean').squared; | ||
@@ -11,6 +13,46 @@ const defaultOptions = { | ||
withIterations: false, | ||
initialization: 'mostDistant' | ||
initialization: 'mostDistant', | ||
distanceFunction: squaredDistance | ||
}; | ||
/** | ||
* Each step operation for kmeans | ||
* @ignore | ||
* @param {Array<Array<Number>>} centers - the K centers in format [x,y,z,...] | ||
* @param {Array<Array<Number>>} data - the [x,y,z,...] points to cluster | ||
* @param {Array<Number>} clusterID - the cluster identifier for each data dot | ||
* @param {Number} K - Number of clusters | ||
* @param {Object} [options] - Option object | ||
* @param {Number} iterations - Current number of iterations | ||
* @return {KMeansResult} | ||
*/ | ||
function step(centers, data, clusterID, K, options, iterations) { | ||
clusterID = utils.updateClusterID(data, centers, clusterID, options.distanceFunction); | ||
var newCenters = utils.updateCenters(data, clusterID, K); | ||
var converged = utils.converged(newCenters, centers, options.distanceFunction, options.tolerance); | ||
return new KMeansResult(clusterID, newCenters, converged, iterations, options.distanceFunction); | ||
} | ||
/** | ||
* Generator version for the algorithm | ||
* @ignore | ||
* @param {Array<Array<Number>>} centers - the K centers in format [x,y,z,...] | ||
* @param {Array<Array<Number>>} data - the [x,y,z,...] points to cluster | ||
* @param {Array<Number>} clusterID - the cluster identifier for each data dot | ||
* @param {Number} K - Number of clusters | ||
* @param {Object} [options] - Option object | ||
*/ | ||
function* kmeansGenerator(centers, data, clusterID, K, options) { | ||
var converged = false; | ||
var stepNumber = 0; | ||
var stepResult; | ||
while (!converged && (stepNumber < options.maxIterations)) { | ||
stepResult = step(centers, data, clusterID, K, options, ++stepNumber); | ||
yield stepResult.computeInformation(data); | ||
converged = stepResult.converged; | ||
centers = stepResult.centroids; | ||
} | ||
} | ||
/** | ||
* K-means algorithm | ||
@@ -23,9 +65,10 @@ * @param {Array<Array<Number>>} data - Points in the format to cluster [x,y,z,...] | ||
* @param {Boolean} [options.withIterations = false] - Store clusters and centroids for each iteration | ||
* @param {Function} [options.distanceFunction = squaredDistance] - Distance function to use between the points | ||
* @param {String|Array<Array<Number>>} [options.initialization = 'moreDistant'] - K centers in format [x,y,z,...] or a method for initialize the data: | ||
* * `'random'` will choose K random different values. | ||
* * `'mostDistant'` will choose the more distant points to a first random pick | ||
* @returns {Object} - Cluster identifier for each data dot and centroids with the following fields: | ||
* @returns {KMeansResult} - Cluster identifier for each data dot and centroids with the following fields: | ||
* * `'clusters'`: Array of indexes for the clusters. | ||
* * `'centroids'`: Array with the resulting centroids. | ||
* * `'iterations'`: Array with the state of 'clusters' and 'centroids' for each iteration. | ||
* * `'iterations'`: Number of iterations that took to converge | ||
*/ | ||
@@ -35,4 +78,4 @@ function kmeans(data, K, options) { | ||
if (K > data.length) { | ||
throw new Error('Data length should be bigger than K'); | ||
if (K <= 0 || K > data.length || !Number.isInteger(K)) { | ||
throw new Error('K should be a positive integer bigger than the number of points'); | ||
} | ||
@@ -53,3 +96,3 @@ | ||
case 'mostDistant': | ||
centers = init.mostDistant(data, K); | ||
centers = init.mostDistant(data, K, utils.calculateDistanceMatrix(data, options.distanceFunction)); | ||
break; | ||
@@ -61,49 +104,20 @@ default: | ||
var clusterID = new Array(data.length); | ||
for (var i = 0; i < data.length; ++i) { | ||
clusterID[i] = 0; | ||
// infinite loop until convergence | ||
if (options.maxIterations === 0) { | ||
options.maxIterations = Number.MAX_VALUE; | ||
} | ||
var lastDistance = 1e100; | ||
var curDistance = 0; | ||
var iterations = []; | ||
for (var iter = 0; iter < options.maxIterations; ++iter) { | ||
clusterID = utils.updateClusterID(data, centers); | ||
centers = utils.updateCenters(data, clusterID, K); | ||
curDistance = utils.computeSSE(data, centers, clusterID); | ||
if (options.withIterations) { | ||
iterations.push({ | ||
'clusters': clusterID, | ||
'centroids': centers | ||
}); | ||
} | ||
if ((lastDistance - curDistance < options.tolerance) || ((lastDistance - curDistance) / lastDistance < options.tolerance)) { | ||
if (options.withIterations) { | ||
return { | ||
'clusters': clusterID, | ||
'centroids': centers, | ||
'iterations': iterations | ||
}; | ||
} else { | ||
return { | ||
'clusters': clusterID, | ||
'centroids': centers | ||
}; | ||
} | ||
} | ||
lastDistance = curDistance; | ||
} | ||
// exceed number of iterations | ||
var clusterID = new Array(data.length); | ||
if (options.withIterations) { | ||
return { | ||
'clusters': clusterID, | ||
'centroids': centers, | ||
'iterations': iterations | ||
}; | ||
return kmeansGenerator(centers, data, clusterID, K, options); | ||
} else { | ||
return { | ||
'clusters': clusterID, | ||
'centroids': centers | ||
}; | ||
var converged = false; | ||
var stepNumber = 0; | ||
var stepResult; | ||
while (!converged && (stepNumber < options.maxIterations)) { | ||
stepResult = step(centers, data, clusterID, K, options, ++stepNumber); | ||
converged = stepResult.converged; | ||
centers = stepResult.centroids; | ||
} | ||
return stepResult.computeInformation(data); | ||
} | ||
@@ -110,0 +124,0 @@ } |
108
src/utils.js
'use strict'; | ||
const squaredDistance = require('ml-distance-euclidean').squared; | ||
const nearestVector = require('ml-nearest-vector'); | ||
/** | ||
* Calculates the sum of squared errors | ||
* Calculates the distance matrix for a given array of points | ||
* @ignore | ||
* @param {Array<Array<Number>>} data - the [x,y,z,...] points to cluster | ||
* @param {Array<Array<Number>>} centers - the K centers in format [x,y,z,...] | ||
* @param {Array<Number>} clusterID - the cluster identifier for each data dot | ||
* @returns {Number} the sum of squared errors | ||
* @param {Function} distance - Distance function to use between the points | ||
* @return {Array<Array<Number>>} - matrix with the distance values | ||
*/ | ||
function computeSSE(data, centers, clusterID) { | ||
var sse = 0; | ||
var c = 0; | ||
for (var i = 0; i < data.length; i++) { | ||
c = clusterID[i]; | ||
sse += squaredDistance(data[i], centers[c]); | ||
function calculateDistanceMatrix(data, distance) { | ||
var distanceMatrix = new Array(data.length); | ||
for (var i = 0; i < data.length; ++i) { | ||
for (var j = i; j < data.length; ++j) { | ||
if (!distanceMatrix[i]) { | ||
distanceMatrix[i] = new Array(data.length); | ||
} | ||
if (!distanceMatrix[j]) { | ||
distanceMatrix[j] = new Array(data.length); | ||
} | ||
const dist = distance(data[i], data[j]); | ||
distanceMatrix[i][j] = dist; | ||
distanceMatrix[j][i] = dist; | ||
} | ||
} | ||
return sse; | ||
return distanceMatrix; | ||
} | ||
@@ -28,26 +35,9 @@ | ||
* @param {Array<Array<Number>>} centers - the K centers in format [x,y,z,...] | ||
* @param {Array <Number>} clusterID - the cluster identifier for each data dot | ||
* @param {Function} distance - Distance function to use between the points | ||
* @returns {Array} the cluster identifier for each data dot | ||
*/ | ||
function updateClusterID(data, centers) { | ||
const k = centers.length; | ||
var aux = 0; | ||
var clusterID = new Array(data.length); | ||
for (var index = 0; index < data.length; index++) | ||
clusterID[index] = 0; | ||
var d = new Array(data.length); | ||
function updateClusterID(data, centers, clusterID, distance) { | ||
for (var i = 0; i < data.length; i++) { | ||
d[i] = new Array(k); | ||
for (var j = 0; j < k; j++) { | ||
aux = squaredDistance(data[i], centers[j]); | ||
d[i][j] = [aux, j]; | ||
} | ||
var min = d[i][0][0]; | ||
var id = 0; | ||
for (var l = 0; l < k; l++) { | ||
if (d[i][l][0] < min) { | ||
min = d[i][l][0]; | ||
id = d[i][l][1]; | ||
} | ||
} | ||
clusterID[i] = id; | ||
clusterID[i] = nearestVector(centers, data[i], {distanceFunction: distance}); | ||
} | ||
@@ -67,5 +57,9 @@ return clusterID; | ||
const nDim = data[0].length; | ||
// creates empty centers with 0 size | ||
var centers = new Array(K); | ||
var centersLen = new Array(K); | ||
for (var i = 0; i < K; i++) { | ||
centers[i] = new Array(nDim); | ||
centersLen[i] = 0; | ||
for (var j = 0; j < nDim; j++) { | ||
@@ -76,21 +70,14 @@ centers[i][j] = 0; | ||
for (var k = 0; k < K; k++) { | ||
var cluster = []; | ||
for (var l = 0; l < data.length; l++) { | ||
if (clusterID[l] === k) { | ||
cluster.push(data[l]); | ||
} | ||
// add the value for all dimensions of the point | ||
for (var l = 0; l < data.length; l++) { | ||
centersLen[clusterID[l]]++; | ||
for (var dim = 0; dim < nDim; dim++) { | ||
centers[clusterID[l]][dim] += data[l][dim]; | ||
} | ||
} | ||
// divides by length | ||
for (var id = 0; id < K; id++) { | ||
for (var d = 0; d < nDim; d++) { | ||
var x = []; | ||
for (var m = 0; m < data.length; m++) { | ||
if (clusterID[m] === k) { | ||
x.push(data[m][d]); | ||
} | ||
} | ||
var sum = 0; | ||
for (var n = 0; n < x.length; n++) { | ||
sum += x[n]; | ||
} | ||
centers[k][d] = sum / x.length; | ||
centers[id][d] /= centersLen[id]; | ||
} | ||
@@ -101,4 +88,23 @@ } | ||
exports.computeSSE = computeSSE; | ||
/** | ||
* The centers have moved more than the tolerance value? | ||
* @ignore | ||
* @param {Array<Array<Number>>} centers - the K centers in format [x,y,z,...] | ||
* @param {Array<Array<Number>>} oldCenters - the K old centers in format [x,y,z,...] | ||
* @param {Function} distanceFunction - Distance function to use between the points | ||
* @param {Number} tolerance - Allowed distance for the centroids to move | ||
* @return {boolean} | ||
*/ | ||
function converged(centers, oldCenters, distanceFunction, tolerance) { | ||
for (var i = 0; i < centers.length; i++) { | ||
if (distanceFunction(centers[i], oldCenters[i]) > tolerance) { | ||
return false; | ||
} | ||
} | ||
return true; | ||
} | ||
exports.updateClusterID = updateClusterID; | ||
exports.updateCenters = updateCenters; | ||
exports.calculateDistanceMatrix = calculateDistanceMatrix; | ||
exports.converged = converged; |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain riskPackage has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Native code
Supply chain riskContains native code (e.g., compiled binaries or shared libraries). Including native code can obscure malicious behavior.
Found 4 instances in 1 package
Mixed license
License(Experimental) Package contains multiple licenses.
Found 1 instance in 1 package
Non-permissive License
License(Experimental) A license not known to be considered permissive was found.
Found 1 instance in 1 package
0
100
66
1
19675
3
9
335
+ Addedml-nearest-vector@^1.0.0
+ Addedml-nearest-vector@1.0.1(transitive)