natural-content
Comparing version 1.0.0 to 1.0.1
index.js
@@ -46,6 +46,6 @@ var _ = require('underscore');
   if (withStopWords) {
-    return _.filter(words, function(word){return (word !== ''); });
+    return _.filter(words, function(word){return (word !== '') && ! _.isNumber(word); });
   }
   else {
-    return _.filter(words, function(word){return (word !== '' && stopwords.indexOf(word) === -1); });
+    return _.filter(words, function(word){return (word !== '' && ! _.isNumber(word) && stopwords.indexOf(word) === -1); });
   }
@@ -87,3 +87,3 @@
  */
-function getTf(words, n, nbrDocs) {
+function getTf(words, n, stats) {
@@ -98,6 +98,20 @@ var ws = words;
-  _.keys(count).forEach(function(key) {
-    tfs[key] = count[key]/max;
-    if (nbrDocs) {
-      nbrDocs[key] = nbrDocs[key] ? ++nbrDocs[key] : 1;
+  _.keys(count).forEach(function(word) {
+    // Calculate the tf for this word
+    tfs[word] = count[word]/max;
+    // Update stats
+    if (stats) {
+      // update the number of documents for this word
+      stats.nbrDocsByWords[word] = stats.nbrDocsByWords[word] ? ++stats.nbrDocsByWords[word] : 1;
+      // Calculate sum & register the tf for min & max computation
+      if (stats.words[word]) {
+        stats.words[word].tfs.push(tfs[word]);
+        stats.words[word].tfSum += tfs[word];
+      }
+      else {
+        stats.words[word] = initWordStat(tfs[word]);
+      }
+    }
@@ -115,2 +129,3 @@
 /**
@@ -121,3 +136,3 @@ * Get the tfIdf for each word of a document
  * @param the document represented by an term frequency array.
- * the function getTf can be used for generating the term frequency array
+ * the function getTf can be used for generating the term frequency array
  * @param the number of document per word (index = word)
@@ -128,16 +143,16 @@ * @param the number of documents
  */
-function geTfIdf(document, nbrDocsByWords, nbrDocs, stats) {
+function geTfIdf(document, nbrDocs, stats) {
   var tfIdf = {};
-  _.keys(document.tfs).forEach(function(word){
-    tfIdf[word] = document.tfs[word] * (Math.log(nbrDocs/nbrDocsByWords[word]) + 1);
-    if (stats[word]) {
-      stats[word].min = _.min([tfIdf[word], stats[word].min]);
-      stats[word].max = _.max([tfIdf[word], stats[word].max]);
-      stats[word].sum += tfIdf[word];
+  _.keys(document.tfs).forEach(function(word) {
+    var idf = Math.log(nbrDocs/stats.nbrDocsByWords[word]) + 1;
+    tfIdf[word] = document.tfs[word] * idf;
+    if (stats.words[word]) {
+      stats.words[word].tfIdfs.push(tfIdf[word]);
+      stats.words[word].tfIdfSum += tfIdf[word];
+      stats.words[word].idfs.push(idf);
+      stats.words[word].idfSum += idf;
     }
-    else {
-      stats[word] = { min : tfIdf[word], max : tfIdf[word], sum : tfIdf[word] };
-    }
   });
@@ -149,2 +164,18 @@
+function initWordStat(tf) {
+  return {
+    tfSum : tf,
+    tfs : [tf],
+    idfSum : 0,
+    idfs : [],
+    tfIdfSum : 0,
+    tfIdfs : []
+  };
+}
 /**
@@ -160,15 +191,23 @@ * Get the TF.IDF for each words found in several documents
   var result = {};
-  var nbrDocsByWords = {};
-  var stats = {};
+  var stats = createEmptyStat();
   // Calculate the TF of each words for each docs
-  var tfs = _.map(documents, function(content){ return getTf(getWords(content, withStopWords), n, nbrDocsByWords);});
+  var tfs = _.map(documents, function(document){ return getTf(getWords(document, withStopWords), n, stats);});
-  // Calculate the tf.idf for each each docs & produce stat per word
-  var data = _.map(tfs, function(docTfs) {return geTfIdf(docTfs, nbrDocsByWords, documents.length, stats );});
+  // Calculate the tf.idf for each each docs & produce stats per word
+  var data = _.map(tfs, function(docTfs) { return geTfIdf(docTfs, documents.length, stats );});
-  // Calculate the average tf.idf for each word
-  stats = _.mapObject(stats, function(val, key) {
-    val.avg = val.sum/nbrDocsByWords[key];
-    return val;
+  // Calculate min, max, avg for tf, idf & tf.idf
+  stats.words = _.mapObject(stats.words, function(word, key){
+    word.tfMin = _.min(word.tfs);
+    word.tfMax = _.max(word.tfs);
+    word.tfAvg = word.tfSum / stats.nbrDocsByWords[key];
+    word.idfMax = _.max(word.idfs);
+    word.idfAvg = word.idfSum / stats.nbrDocsByWords[key];
+    word.tfIdfMin = _.min(word.tfIdfs);
+    word.tfIdfMax = _.max(word.tfIdfs);
+    word.tfIdfAvg = word.tfIdfSum / stats.nbrDocsByWords[key];
+    return word;
   });
@@ -179,3 +218,2 @@
   result.stats = stats;
-  result.nbrDocsByWords = nbrDocsByWords;
@@ -185,8 +223,9 @@ return result;
+function createEmptyStat() {
+  return {
+    nbrDocsByWords : [],
+    words : []
+  };
+}
 /*
  tf = number of occurrences of the term / number of occurrences of the most frequent term
  idf = log(number of docs / number of docs containing the term) + 1
 */
 exports.getStatements = getStatements;
@@ -193,0 +232,0 @@ exports.getWords = getWords;
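The comment near the exports spells out the two formulas this module relies on: tf = occurrences of the term / occurrences of the most frequent term in the document, and idf = log(number of docs / number of docs containing the term) + 1. A minimal standalone sketch of those formulas, for orientation only; the helper names tf and idf below are illustrative and not part of the package's API:

// Sketch of the formulas from the source comment; not the package's actual API.
// "counts" maps each word of one document to its occurrence count.
function tf(counts, word) {
  var max = Math.max.apply(null, Object.keys(counts).map(function(w) { return counts[w]; }));
  return counts[word] / max;                      // occurrences / most frequent term
}
function idf(nbrDocs, nbrDocsWithWord) {
  return Math.log(nbrDocs / nbrDocsWithWord) + 1; // log(total docs / docs containing the term) + 1
}
// The tf.idf of a word in one document is tf * idf, as computed by geTfIdf above.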
package.json
 {
   "name": "natural-content",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "A set of natural functions like tf.idf, extract words & n-grams, ... (experimental project)",
@@ -5,0 +5,0 @@ "main": "index.js",
 var assert = require("assert");
+var _ = require("underscore");
 var natural = require("../index.js");
@@ -7,8 +8,8 @@
 var documents = [
-  "word1 word2 word3 word4 word5 word6. word7 word1 word8 word9 word10 word11 word6. word1 word12 word13. word1 word1",
-  "word2 word7 word8 word9 word10 word11.",
+  "word1 word2 word3 word4 word5 word6. word7 word1 word8 word9 word10 word11 word6. word1 word12 word13. word1 word1 ",
+  "word2 word7 word8 word9 word10 word7 word11 word7 word11 word11 word11 word11.",
   " word7 word2" ];
 it('Statements', function() {
-  var stats = natural.getStatements("word1 word2 word3 word4 word5 word6. word7 word1 word8 word9 word10 word11 word6. word1 word12 word13");
+  var stats = natural.getStatements("word1 word2 word3 word4 :word5 word6. word7 word1, word8 word9 word10 word11 word6. word1 word12 word13");
   assert(stats.length === 3);
@@ -58,4 +59,12 @@
-  var info = natural.getTfIdfs(documents, 1, false);
-  console.log(info);
+  var info = natural.getTfIdfs(documents, 1, false);
+  //console.log(info);
+  //console.log(info.stats.words['word7']);
+  console.log("Word,TF Avg,TF Min,TF Max,IDF Avg,TF.IDF Sum,TF.IDF Avg");
+  _.keys(info.stats.words).forEach(function(word) {
+    //console.log(">> ", info.stats.words[word]);
+    console.log(word + "," + info.stats.words[word].tfAvg + "," + info.stats.words[word].tfMin + "," + info.stats.words[word].tfMax + "," +
+                info.stats.words[word].idfAvg + ',' + info.stats.words[word].tfIdfSum + ',' + info.stats.words[word].tfIdfAvg);
+  });
 });
@@ -62,0 +71,0 @@
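Taken together, the 1.0.1 changes replace the flat per-word stats object with a richer stats structure (stats.nbrDocsByWords plus per-word tf, idf and tf.idf aggregates). A minimal consumer sketch, assuming the package is installed under its published name natural-content and using only fields that appear in this diff; the docs variable is illustrative:

// Sketch based on the fields added in 1.0.1; not taken from the package's docs.
var natural = require('natural-content');

var docs = ['word1 word2 word2', 'word2 word3'];
var info = natural.getTfIdfs(docs, 1, false);

// info.stats.nbrDocsByWords[word] : number of documents containing the word
// info.stats.words[word]          : tfMin/tfMax/tfAvg, idfMax/idfAvg,
//                                   tfIdfMin/tfIdfMax/tfIdfAvg, plus raw tfs/idfs/tfIdfs arrays
Object.keys(info.stats.words).forEach(function(word) {
  var s = info.stats.words[word];
  console.log(word, s.tfAvg, s.idfAvg, s.tfIdfAvg, info.stats.nbrDocsByWords[word]);
});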