@wholebuzz/cluster
SimHash text clustering with OutRank outlier removal and Variation of Information analysis.
References
- [1] Ghoche, 2016. Real-Time Tf-Idf Clustering Using Simhash, Approximate Nearest Neighbors, and DBSCAN
- [2] Moonesinghe, Tan. 2008. OutRank: A Graph-Based Outlier Detection Framework Using Random Walk
- [3] Ester, Kriegel, Sander, Xu. 1996. A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise (DBSCAN)
Example
import { clustersFromLabels } from '@wholebuzz/cluster/lib/cluster'
import { zero } from '@wholebuzz/cluster/lib/hamming'
import { LocalFileSystem } from '@wholebuzz/fs/lib/fs'
import { readLines } from '@wholebuzz/fs/lib/json'
import { simhashClusterText, findOutliersByTFIDFCentrality } from '@wholebuzz/cluster/lib/text'
import { newLexicon } from '@wholebuzz/search/lib/lexicon'
import { searchConfig } from '@wholebuzz/search/lib/search'
import { FingerprintedLabeledLexiconDataset } from '@wholebuzz/search/lib/types'
/**
 * One record of the news-category dataset used by this example.
 * Source: https://www.kaggle.com/rmisra/news-category-dataset
 */
interface Headline {
  authors: string
  date: string
  category: string
  /** Returned by `getItemLabel` in this example. */
  link: string
  /** Returned by `getItemText` in this example, i.e. the text that gets clustered. */
  headline: string
  short_description: string
  /** Not part of the dataset file; stored on the item through the dataset's `setItemFingerprint`. */
  fingerprint?: bigint
}
// Load the dataset file; each line handed to the callback is parsed as one JSON record.
// NOTE(review): the parsed records are not validated against `Headline`.
const fileSystem = new LocalFileSystem()
const items: Headline[] = await readLines<Headline>(
  fileSystem,
  'News_Category_Dataset_v2.json.gz',
  (line) => JSON.parse(line)
)

// Accessors shared by the lexicon, the dataset and the outlier detection below.
const getItemText = (item: Headline): string => item.headline
const getItemLabel = (item: Headline): string => item.link
// Building the lexicon from these items alone is a shortcut for the example;
// a real lexicon needs more data. See readLexicon:
// https://github.com/wholebuzz/search/blob/master/docs/modules/lexicon.md#readlexicon
const lexicon = newLexicon({ items, getItemText }, searchConfig)

// Bundle the items with their accessors and the lexicon.
const dataset: FingerprintedLabeledLexiconDataset<Headline> = {
  items,
  getItemText,
  getItemLabel,
  // Items without a stored fingerprint report `zero`.
  getItemFingerprint: (item) => item.fingerprint ?? zero,
  // `undefined` removes the stored fingerprint; the same item is returned either way.
  setItemFingerprint: (item, fingerprint) => {
    if (fingerprint !== undefined) {
      item.fingerprint = fingerprint
    } else {
      delete item.fingerprint
    }
    return item
  },
  lexicon,
}
// Temporal filtering would need additional information such as Headline.date;
// none is applied here.
// NOTE(review): `clusterLabels` is presumably one label per item; confirm against simhashClusterText.
const clusterLabels = simhashClusterText(dataset)
const clusters: Headline[][] = clustersFromLabels(
  dataset,
  clusterLabels,
  dataset.setItemFingerprint
)
// Remove the flagged outliers from every cluster, in place.
for (const [clusterIndex, members] of clusters.entries()) {
  const centrality = findOutliersByTFIDFCentrality({
    items: members,
    getItemText,
    lexicon: dataset.lexicon,
  })
  // Keep only members whose entry in `centrality.outliers` is falsy. Any further item
  // filtering, cluster sorting or cluster filtering is application-specific.
  const cluster = members.filter((_, memberIndex) => !centrality.outliers[memberIndex])
  clusters[clusterIndex] = cluster
  // Hierarchical clustering needs additional information such as Headline.category, e.g.:
  // for (const c of parentCategories(cluster)) (hc[c] ?? (hc[c] = [])).push(cluster)
}

// Final sorting and filtering are application-specific.
console.log(clusters)

// Previous clusters should be combined with these using mapClusters:
// https://github.com/wholebuzz/cluster/blob/master/docs/modules/mapping.md#mapclusters
Table of contents
Modules