@thi.ng/k-means
Advanced tools
Comparing version 0.6.48 to 0.6.49
# Change Log | ||
- **Last updated**: 2023-12-09T19:12:03Z | ||
- **Last updated**: 2023-12-11T10:07:09Z | ||
- **Generator**: [thi.ng/monopub](https://thi.ng/monopub) | ||
@@ -5,0 +5,0 @@ |
287
kmeans.js
@@ -10,184 +10,127 @@ import { argmin } from "@thi.ng/distance/argmin"; | ||
import { zeroes } from "@thi.ng/vectors/setn"; | ||
/** | ||
* Takes an array of n-dimensional `samples` and attempts to assign them to up | ||
* to `k` clusters (might produce less), using the behavior defined by | ||
* (optionally) given `opts`. | ||
* | ||
* @remarks | ||
* https://en.wikipedia.org/wiki/K-medians_clustering | ||
* | ||
* @param k - | ||
* @param samples - | ||
* @param opts - | ||
*/ | ||
export const kmeans = (k, samples, opts) => { | ||
let { dist, initial, maxIter, rnd, strategy } = { | ||
dist: DIST_SQ, | ||
maxIter: 32, | ||
strategy: means, | ||
...opts, | ||
}; | ||
const num = samples.length; | ||
const dim = samples[0].length; | ||
const centroidIDs = initial || initKmeanspp(k, samples, dist, rnd); | ||
assert(centroidIDs.length > 0, `missing initial centroids`); | ||
k = centroidIDs.length; | ||
const centroids = centroidIDs.map((i) => samples[i]); | ||
const clusters = new Uint32Array(num).fill(k); | ||
let update = true; | ||
while (update && maxIter-- > 0) { | ||
update = assign(samples, centroids, clusters, dist); | ||
if (!update) | ||
break; | ||
for (let i = 0; i < k; i++) { | ||
const impl = strategy(dim); | ||
for (let j = 0; j < num; j++) { | ||
i === clusters[j] && impl.update(samples[j]); | ||
} | ||
const centroid = impl.finish(); | ||
if (centroid) | ||
centroids[i] = centroid; | ||
} | ||
const kmeans = (k, samples, opts) => { | ||
let { dist, initial, maxIter, rnd, strategy } = { | ||
dist: DIST_SQ, | ||
maxIter: 32, | ||
strategy: means, | ||
...opts | ||
}; | ||
const num = samples.length; | ||
const dim = samples[0].length; | ||
const centroidIDs = initial || initKmeanspp(k, samples, dist, rnd); | ||
assert(centroidIDs.length > 0, `missing initial centroids`); | ||
k = centroidIDs.length; | ||
const centroids = centroidIDs.map((i) => samples[i]); | ||
const clusters = new Uint32Array(num).fill(k); | ||
let update = true; | ||
while (update && maxIter-- > 0) { | ||
update = assign(samples, centroids, clusters, dist); | ||
if (!update) | ||
break; | ||
for (let i = 0; i < k; i++) { | ||
const impl = strategy(dim); | ||
for (let j = 0; j < num; j++) { | ||
i === clusters[j] && impl.update(samples[j]); | ||
} | ||
const centroid = impl.finish(); | ||
if (centroid) | ||
centroids[i] = centroid; | ||
} | ||
return buildClusters(centroids, clusters); | ||
} | ||
return buildClusters(centroids, clusters); | ||
}; | ||
/** | ||
* k-means++ initialization / selection of initial cluster centroids. Default | ||
* centroid initialization method for {@link kmeans}. | ||
* | ||
* @remarks | ||
* Might return fewer than `k` centroid IDs if the requested number cannot be | ||
* fulfilled (e.g. due to lower number of samples and/or distance metric). | ||
* Throws an error if `samples` are empty. | ||
* | ||
* @remarks | ||
* References: | ||
* - https://en.wikipedia.org/wiki/K-means%2B%2B | ||
* - http://ilpubs.stanford.edu:8090/778/1/2006-13.pdf | ||
* - http://vldb.org/pvldb/vol5/p622_bahmanbahmani_vldb2012.pdf (TODO) | ||
* | ||
* @param k - | ||
* @param samples - | ||
* @param dist - | ||
* @param rnd - | ||
*/ | ||
export const initKmeanspp = (k, samples, dist = DIST_SQ, rnd = SYSTEM) => { | ||
const num = samples.length; | ||
assert(num > 0, `missing samples`); | ||
k = Math.min(k, num); | ||
const centroidIDs = [rnd.int() % num]; | ||
const centroids = [samples[centroidIDs[0]]]; | ||
const indices = new Array(num).fill(0).map((_, i) => i); | ||
const metric = dist.metric; | ||
while (centroidIDs.length < k) { | ||
let psum = 0; | ||
const probs = samples.map((p) => { | ||
const d = dist.from(metric(p, centroids[argmin(p, centroids, dist)])) ** | ||
2; | ||
psum += d; | ||
return d; | ||
}); | ||
if (!psum) | ||
break; | ||
let id; | ||
do { | ||
id = weightedRandom(indices, probs, rnd)(); | ||
} while (centroidIDs.includes(id)); | ||
centroidIDs.push(id); | ||
centroids.push(samples[id]); | ||
} | ||
return centroidIDs; | ||
const initKmeanspp = (k, samples, dist = DIST_SQ, rnd = SYSTEM) => { | ||
const num = samples.length; | ||
assert(num > 0, `missing samples`); | ||
k = Math.min(k, num); | ||
const centroidIDs = [rnd.int() % num]; | ||
const centroids = [samples[centroidIDs[0]]]; | ||
const indices = new Array(num).fill(0).map((_, i) => i); | ||
const metric = dist.metric; | ||
while (centroidIDs.length < k) { | ||
let psum = 0; | ||
const probs = samples.map((p) => { | ||
const d = dist.from(metric(p, centroids[argmin(p, centroids, dist)])) ** 2; | ||
psum += d; | ||
return d; | ||
}); | ||
if (!psum) | ||
break; | ||
let id; | ||
do { | ||
id = weightedRandom(indices, probs, rnd)(); | ||
} while (centroidIDs.includes(id)); | ||
centroidIDs.push(id); | ||
centroids.push(samples[id]); | ||
} | ||
return centroidIDs; | ||
}; | ||
const assign = (samples, centroids, assignments, dist) => { | ||
let update = false; | ||
for (let i = samples.length; i-- > 0;) { | ||
const id = argmin(samples[i], centroids, dist); | ||
if (id !== assignments[i]) { | ||
assignments[i] = id; | ||
update = true; | ||
} | ||
let update = false; | ||
for (let i = samples.length; i-- > 0; ) { | ||
const id = argmin(samples[i], centroids, dist); | ||
if (id !== assignments[i]) { | ||
assignments[i] = id; | ||
update = true; | ||
} | ||
return update; | ||
} | ||
return update; | ||
}; | ||
const buildClusters = (centroids, assignments) => { | ||
const clusters = []; | ||
for (let i = 0, n = assignments.length; i < n; i++) { | ||
const id = assignments[i]; | ||
(clusters[id] || | ||
(clusters[id] = { | ||
id, | ||
centroid: centroids[id], | ||
items: [], | ||
})).items.push(i); | ||
} | ||
return clusters.filter((x) => !!x); | ||
const clusters = []; | ||
for (let i = 0, n = assignments.length; i < n; i++) { | ||
const id = assignments[i]; | ||
(clusters[id] || (clusters[id] = { | ||
id, | ||
centroid: centroids[id], | ||
items: [] | ||
})).items.push(i); | ||
} | ||
return clusters.filter((x) => !!x); | ||
}; | ||
/** | ||
* Default centroid strategy forming new centroids by averaging the position of | ||
* participating samples. | ||
* | ||
* @param dim - | ||
*/ | ||
export const means = (dim) => { | ||
const acc = zeroes(dim); | ||
let n = 0; | ||
return { | ||
update: (p) => { | ||
add(acc, acc, p); | ||
n++; | ||
}, | ||
finish: () => (n ? mulN(acc, acc, 1 / n) : undefined), | ||
}; | ||
const means = (dim) => { | ||
const acc = zeroes(dim); | ||
let n = 0; | ||
return { | ||
update: (p) => { | ||
add(acc, acc, p); | ||
n++; | ||
}, | ||
finish: () => n ? mulN(acc, acc, 1 / n) : void 0 | ||
}; | ||
}; | ||
/** | ||
* Centroid strategy forming new centroids via componentwise medians. | ||
* | ||
* @remarks | ||
* https://en.wikipedia.org/wiki/K-medians_clustering | ||
*/ | ||
export const medians = () => { | ||
const acc = []; | ||
return { | ||
update: (p) => acc.push(p), | ||
finish: () => (acc.length ? median([], acc) : undefined), | ||
}; | ||
const medians = () => { | ||
const acc = []; | ||
return { | ||
update: (p) => acc.push(p), | ||
finish: () => acc.length ? median([], acc) : void 0 | ||
}; | ||
}; | ||
/** | ||
* Means centroid strategy for decimal degree lat/lon positions (e.g. WGS84). | ||
* Unlike the default {@link means} strategy, this one treats latitude values | ||
* correctly in terms of the ±180 deg boundary and ensures samples on either | ||
* side of the Pacific are forming correct centroids. | ||
* | ||
* @remarks | ||
* When using this strategy, you should also use the | ||
* [`HAVERSINE_LATLON`](https://docs.thi.ng/umbrella/distance/variables/HAVERSINE_LATLON.html) | ||
* distance metric for {@link KMeansOpts.distance}. | ||
* | ||
* @example | ||
* ```ts | ||
* kmeans(3, [...], { strategy: meansLatLon, dist: HAVERSINE_LATLON }) | ||
* ``` | ||
* | ||
* https://en.wikipedia.org/wiki/World_Geodetic_System | ||
*/ | ||
export const meansLatLon = () => { | ||
let lat = 0; | ||
let lon = 0; | ||
let n = 0; | ||
return { | ||
update: ([$lat, $lon]) => { | ||
lat += $lat < 0 ? $lat + 360 : $lat; | ||
lon += $lon; | ||
n++; | ||
}, | ||
finish: () => { | ||
if (!n) | ||
return; | ||
lat /= n; | ||
if (lat > 180) | ||
lat -= 360; | ||
lon /= n; | ||
return [lat, lon]; | ||
}, | ||
}; | ||
const meansLatLon = () => { | ||
let lat = 0; | ||
let lon = 0; | ||
let n = 0; | ||
return { | ||
update: ([$lat, $lon]) => { | ||
lat += $lat < 0 ? $lat + 360 : $lat; | ||
lon += $lon; | ||
n++; | ||
}, | ||
finish: () => { | ||
if (!n) | ||
return; | ||
lat /= n; | ||
if (lat > 180) | ||
lat -= 360; | ||
lon /= n; | ||
return [lat, lon]; | ||
} | ||
}; | ||
}; | ||
export { | ||
initKmeanspp, | ||
kmeans, | ||
means, | ||
meansLatLon, | ||
medians | ||
}; |
{ | ||
"name": "@thi.ng/k-means", | ||
"version": "0.6.48", | ||
"version": "0.6.49", | ||
"description": "Configurable k-means & k-medians (with k-means++ initialization) for n-D vectors", | ||
@@ -27,3 +27,5 @@ "type": "module", | ||
"scripts": { | ||
"build": "yarn clean && tsc --declaration", | ||
"build": "yarn build:esbuild && yarn build:decl", | ||
"build:decl": "tsc --declaration --emitDeclarationOnly", | ||
"build:esbuild": "esbuild --format=esm --platform=neutral --target=es2022 --tsconfig=tsconfig.json --outdir=. src/**/*.ts", | ||
"clean": "rimraf --glob '*.js' '*.d.ts' '*.map' doc", | ||
@@ -37,10 +39,11 @@ "doc": "typedoc --excludePrivate --excludeInternal --out doc src/index.ts", | ||
"dependencies": { | ||
"@thi.ng/api": "^8.9.11", | ||
"@thi.ng/distance": "^2.4.33", | ||
"@thi.ng/errors": "^2.4.5", | ||
"@thi.ng/random": "^3.6.17", | ||
"@thi.ng/vectors": "^7.8.8" | ||
"@thi.ng/api": "^8.9.12", | ||
"@thi.ng/distance": "^2.4.34", | ||
"@thi.ng/errors": "^2.4.6", | ||
"@thi.ng/random": "^3.6.18", | ||
"@thi.ng/vectors": "^7.8.9" | ||
}, | ||
"devDependencies": { | ||
"@microsoft/api-extractor": "^7.38.3", | ||
"esbuild": "^0.19.8", | ||
"rimraf": "^5.0.5", | ||
@@ -85,3 +88,3 @@ "tools": "^0.0.1", | ||
}, | ||
"gitHead": "25f2ac8ff795a432a930119661b364d4d93b59a0\n" | ||
"gitHead": "5e7bafedfc3d53bc131469a28de31dd8e5b4a3ff\n" | ||
} |
@@ -52,3 +52,3 @@ <!-- This file is generated - DO NOT EDIT! --> | ||
Package sizes (brotli'd, pre-treeshake): ESM: 878 bytes | ||
Package sizes (brotli'd, pre-treeshake): ESM: 868 bytes | ||
@@ -55,0 +55,0 @@ ## Dependencies |
32802
6
254
Updated@thi.ng/api@^8.9.12
Updated@thi.ng/distance@^2.4.34
Updated@thi.ng/errors@^2.4.6
Updated@thi.ng/random@^3.6.18
Updated@thi.ng/vectors@^7.8.9