natural-content
Comparing version 1.0.0 to 1.0.1
index.js
@@ -46,6 +46,6 @@ var _ = require('underscore');
   if (withStopWords) {
-    return _.filter(words, function(word){return (word !== ''); });
+    return _.filter(words, function(word){return (word !== '') && ! _.isNumber(word); });
   }
   else {
-    return _.filter(words, function(word){return (word !== '' && stopwords.indexOf(word) === -1); });
+    return _.filter(words, function(word){return (word !== '' && ! _.isNumber(word) && stopwords.indexOf(word) === -1); });
   }
@@ -87,3 +87,3 @@
  */
-function getTf(words, n, nbrDocs) {
+function getTf(words, n, stats) {
@@ -98,6 +98,20 @@ var ws = words;
-  _.keys(count).forEach(function(key) {
-    tfs[key] = count[key]/max;
-    if (nbrDocs) {
-      nbrDocs[key] = nbrDocs[key] ? ++nbrDocs[key] : 1;
+  _.keys(count).forEach(function(word) {
+    // Calculate the tf for this word
+    tfs[word] = count[word]/max;
+    // Update stats
+    if (stats) {
+      // update the number of documents for this word
+      stats.nbrDocsByWords[word] = stats.nbrDocsByWords[word] ? ++stats.nbrDocsByWords[word] : 1;
+      // Calculate sum & register the tf for min & max computation
+      if (stats.words[word]) {
+        stats.words[word].tfs.push(tfs[word]);
+        stats.words[word].tfSum += tfs[word];
+      }
+      else {
+        stats.words[word] = initWordStat(tfs[word]);
+      }
+    }
@@ -115,2 +129,3 @@
 /**
@@ -121,3 +136,3 @@ * Get the tfIdf for each word of a document
  * @param the document represented by an term frequency array.
- * the function getTf can be used for generating the term frequency array
+ * the function getTf can be used for generating the term frequency array
  * @param the number of document per word (index = word)
@@ -128,16 +143,16 @@ * @param the number of documents
  */
-function geTfIdf(document, nbrDocsByWords, nbrDocs, stats) {
+function geTfIdf(document, nbrDocs, stats) {
   var tfIdf = {};
-  _.keys(document.tfs).forEach(function(word){
-    tfIdf[word] = document.tfs[word] * (Math.log(nbrDocs/nbrDocsByWords[word]) + 1);
-    if (stats[word]) {
-      stats[word].min = _.min([tfIdf[word], stats[word].min]);
-      stats[word].max = _.max([tfIdf[word], stats[word].max]);
-      stats[word].sum += tfIdf[word];
+  _.keys(document.tfs).forEach(function(word) {
+    var idf = Math.log(nbrDocs/stats.nbrDocsByWords[word]) + 1;
+    tfIdf[word] = document.tfs[word] * idf;
+    if (stats.words[word]) {
+      stats.words[word].tfIdfs.push(tfIdf[word]);
+      stats.words[word].tfIdfSum += tfIdf[word];
+      stats.words[word].idfs.push(idf);
+      stats.words[word].idfSum += idf;
     }
-    else {
-      stats[word] = { min : tfIdf[word], max : tfIdf[word], sum : tfIdf[word] };
-    }
   });
@@ -149,2 +164,18 @@
+function initWordStat(tf) {
+  return {
+    tfSum : tf,
+    tfs : [tf],
+    idfSum : 0,
+    idfs : [],
+    tfIdfSum : 0,
+    tfIdfs : []
+  };
+}
 /**
@@ -160,15 +191,23 @@ * Get the TF.IDF for each words found in several documents
   var result = {};
-  var nbrDocsByWords = {};
-  var stats = {};
+  var stats = createEmptyStat();
   // Calculate the TF of each words for each docs
-  var tfs = _.map(documents, function(content){ return getTf(getWords(content, withStopWords), n, nbrDocsByWords);});
+  var tfs = _.map(documents, function(document){ return getTf(getWords(document, withStopWords), n, stats);});
-  // Calculate the tf.idf for each each docs & produce stat per word
-  var data = _.map(tfs, function(docTfs) {return geTfIdf(docTfs, nbrDocsByWords, documents.length, stats );});
+  // Calculate the tf.idf for each each docs & produce stats per word
+  var data = _.map(tfs, function(docTfs) { return geTfIdf(docTfs, documents.length, stats );});
-  // Calculate the average tf.idf for each word
-  stats = _.mapObject(stats, function(val, key) {
-    val.avg = val.sum/nbrDocsByWords[key];
-    return val;
+  // Calculate min, max, avg for tf, idf & tf.idf
+  stats.words = _.mapObject(stats.words, function(word, key){
+    word.tfMin = _.min(word.tfs);
+    word.tfMax = _.max(word.tfs);
+    word.tfAvg = word.tfSum / stats.nbrDocsByWords[key];
+    word.idfMax = _.max(word.idfs);
+    word.idfAvg = word.idfSum / stats.nbrDocsByWords[key];
+    word.tfIdfMin = _.min(word.tfIdfs);
+    word.tfIdfMax = _.max(word.tfIdfs);
+    word.tfIdfAvg = word.tfIdfSum / stats.nbrDocsByWords[key];
+    return word;
   });
@@ -179,3 +218,2 @@
   result.stats = stats;
-  result.nbrDocsByWords = nbrDocsByWords;
@@ -185,8 +223,9 @@ return result;
+function createEmptyStat() {
+  return {
+    nbrDocsByWords : [],
+    words : []
+  };
+}
 /*
  tf = number of occurrences of the term / number of occurrences of the most frequent term
  idf = log(number of docs / number of docs containing the term) + 1
 */
 exports.getStatements = getStatements;
@@ -193,0 +232,0 @@ exports.getWords = getWords;
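The comment near the exports spells out the two formulas this module relies on: tf = occurrences of the term / occurrences of the most frequent term in the document, and idf = log(number of docs / number of docs containing the term) + 1. A minimal standalone sketch of those formulas, for orientation only; the helper names tf and idf below are illustrative and not part of the package's API:

// Sketch of the formulas from the source comment; not the package's actual API.
// "counts" maps each word of one document to its occurrence count.
function tf(counts, word) {
  var max = Math.max.apply(null, Object.keys(counts).map(function(w) { return counts[w]; }));
  return counts[word] / max;                      // occurrences / most frequent term
}
function idf(nbrDocs, nbrDocsWithWord) {
  return Math.log(nbrDocs / nbrDocsWithWord) + 1; // log(total docs / docs containing the term) + 1
}
// The tf.idf of a word in one document is tf * idf, as computed by geTfIdf above.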
package.json
 {
   "name": "natural-content",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "A set of natural functions like tf.idf, extract words & n-grams, ... (experimental project)",
@@ -5,0 +5,0 @@ "main": "index.js",
 var assert = require("assert");
+var _ = require("underscore");
 var natural = require("../index.js");
@@ -7,8 +8,8 @@
 var documents = [
-  "word1 word2 word3 word4 word5 word6. word7 word1 word8 word9 word10 word11 word6. word1 word12 word13. word1 word1",
-  "word2 word7 word8 word9 word10 word11.",
+  "word1 word2 word3 word4 word5 word6. word7 word1 word8 word9 word10 word11 word6. word1 word12 word13. word1 word1 ",
+  "word2 word7 word8 word9 word10 word7 word11 word7 word11 word11 word11 word11.",
   " word7 word2" ];
 it('Statements', function() {
-  var stats = natural.getStatements("word1 word2 word3 word4 word5 word6. word7 word1 word8 word9 word10 word11 word6. word1 word12 word13");
+  var stats = natural.getStatements("word1 word2 word3 word4 :word5 word6. word7 word1, word8 word9 word10 word11 word6. word1 word12 word13");
   assert(stats.length === 3);
@@ -58,4 +59,12 @@
-  var info = natural.getTfIdfs(documents, 1, false);
-  console.log(info);
+  var info = natural.getTfIdfs(documents, 1, false);
+  //console.log(info);
+  //console.log(info.stats.words['word7']);
+  console.log("Word,TF Avg,TF Min,TF Max,IDF Avg,TF.IDF Sum,TF.IDF Avg");
+  _.keys(info.stats.words).forEach(function(word) {
+    //console.log(">> ", info.stats.words[word]);
+    console.log(word + "," + info.stats.words[word].tfAvg + "," + info.stats.words[word].tfMin + "," + info.stats.words[word].tfMax + "," +
+                info.stats.words[word].idfAvg + ',' + info.stats.words[word].tfIdfSum + ',' + info.stats.words[word].tfIdfAvg);
+  });
 });
@@ -62,0 +71,0 @@
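Taken together, the 1.0.1 changes replace the flat per-word stats object with a richer stats structure (stats.nbrDocsByWords plus per-word tf, idf and tf.idf aggregates). A minimal consumer sketch, assuming the package is installed under its published name natural-content and using only fields that appear in this diff; the docs variable is illustrative:

// Sketch based on the fields added in 1.0.1; not taken from the package's docs.
var natural = require('natural-content');

var docs = ['word1 word2 word2', 'word2 word3'];
var info = natural.getTfIdfs(docs, 1, false);

// info.stats.nbrDocsByWords[word] : number of documents containing the word
// info.stats.words[word]          : tfMin/tfMax/tfAvg, idfMax/idfAvg,
//                                   tfIdfMin/tfIdfMax/tfIdfAvg, plus raw tfs/idfs/tfIdfs arrays
Object.keys(info.stats.words).forEach(function(word) {
  var s = info.stats.words[word];
  console.log(word, s.tfAvg, s.idfAvg, s.tfIdfAvg, info.stats.nbrDocsByWords[word]);
});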