Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

natural-content

Package Overview
Dependencies
Maintainers
1
Versions
23
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

natural-content - npm Package Compare versions

Comparing version 1.0.10 to 1.0.11

189

index.js

@@ -1,3 +0,1 @@

const _ = require('underscore');
const diacritics = require('./lib/diacritics.js');

@@ -33,2 +31,6 @@

function removeSpecials(text) {
if (!text) {
return '';
}
const cleanText = text.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs

@@ -54,6 +56,20 @@ .replace(/[\n\r]/g, WORD_SEPARATOR)

return result;
return result.trim();
}
/**
 * removeLineBreaks - Remove line breaks in a text
 *
 * @param {string} text the text
 * @returns {string} the text without line breaks, trimmed
 */
function removeLineBreaks(text) {
  if (!text) {
    return '';
  }
  const joined = text.replace(/(\r\n|\n|\r)/gm, '');
  return joined.trim();
}
/**
* removeDiacritics - Remove all diacritics from a text

@@ -126,3 +142,3 @@ *

const count = _.max([ 0, words.length - n + 1 ]);
const count = Math.max(0, words.length - n + 1);

@@ -133,3 +149,3 @@ for (let i = 0; i < count; i++) {

// Convert the ngram array into a ngram string and add it in the result list
result.push(_.reduce(slice, (memo, word) => memo ? `${ memo } ${ word }` : word));
result.push(slice.reduce((memo, word) => memo ? `${ memo } ${ word }` : word));
}

@@ -140,159 +156,2 @@

/**
 * getTf - Get the term frequency (tf) of each word of a document
 *
 * @param {Array<string>} words array of words (String) matching to the document content
 * @param {number} n cardinality of the ngrams (optional). Has to be > 0
 * @param {Map} stats the current stats about the words or ngrams (could be null). This is used to compute the tf/idf
 * @returns {object} a json structure :
 * {
 *   count : a map of word -> number of occurrences in the document
 *   tfs : a map of word -> tf value
 *   max : the number of occurrences of the most frequent word (0 when there are no words)
 * }
 */
function getTf(words, n, stats) {
  let ws = words;
  if (n && n > 1) {
    ws = getNgrams(words, n);
  }
  // Count occurrences natively (replaces _.countBy, consistent with the
  // underscore removal elsewhere in this file)
  const count = {};
  for (const word of ws) {
    count[word] = (count[word] || 0) + 1;
  }
  // _.max on an empty collection returned -Infinity; 0 is a saner "no words" value
  const occurrences = Object.values(count);
  const max = occurrences.length > 0 ? Math.max(...occurrences) : 0;
  const tfs = {};
  Object.keys(count).forEach((word) => {
    // Calculate the tf for this word (normalized by the most frequent word)
    tfs[word] = count[word] / max;
    // Update stats
    if (stats) {
      // Calculate sum & register the tf for min & max computation
      if (stats.has(word) && stats.get(word).tfs) {
        const wordStat = stats.get(word);
        // update the number of documents for this word
        wordStat.nbrDocs++;
        // Add the tf in the list of all tfs for this word
        wordStat.tfs.push(tfs[word]);
        wordStat.tfSum += tfs[word];
      } else {
        const newWordStat = initWordStat(word, tfs[word]);
        stats.set(word, newWordStat);
      }
    }
  });
  return {
    count,
    tfs,
    max
  };
}
/**
 * geTfIdf - Get the tfIdf for each word of a document
 *
 * @param {object} document the document represented by a term frequency structure (see getTf)
 * @param {number} nbrDocs the number of documents
 * @param {Map} stats stats about the words for the full set of documents
 * @returns {object} the document, augmented with a tfIdf map (word -> tf.idf value)
 */
function geTfIdf(document, nbrDocs, stats) {
  const tfIdf = {};
  // Object.keys replaces _.keys, consistent with the underscore removal in this file
  Object.keys(document.tfs).forEach((word) => {
    // idf smoothed with +1 so words occurring in every document keep a non-zero weight
    const idf = Math.log(nbrDocs / stats.get(word).nbrDocs) + 1;
    tfIdf[word] = document.tfs[word] * idf;
    // Accumulate per-word corpus stats when the stat entry tracks them
    if (stats.has(word) && stats.get(word).tfIdfs && stats.get(word).idfs) {
      const wordStat = stats.get(word);
      wordStat.tfIdfs.push(tfIdf[word]);
      wordStat.tfIdfSum += tfIdf[word];
      wordStat.idfs.push(idf);
      wordStat.idfSum += idf;
    }
  });
  document.tfIdf = tfIdf;
  return document;
}
/**
 * getTfIdfs - Get the TF.IDF for each word found in several documents
 * (the older duplicate JSDoc block that contradicted this one has been removed)
 *
 * @param {Array} documents arrays of String matching to the document content. It could be Text or HTML
 * @param {number} n ngram cardinality (optional). Has to be > 0
 * @param {boolean} withStopWords stop-word handling flag, passed through to getWords
 *                  (NOTE(review): the two original doc blocks disagreed on its polarity — confirm in getWords)
 * @param {string} language the language code (fr, en, ... )
 * @returns {object} the tf/idf for each word/ngram, plus per-word stats
 */
function getTfIdfs(documents, n, withStopWords, language) {
  const result = {};
  const stats = new Map();
  // Calculate the TF of each word for each doc
  // (Array.prototype.map replaces _.map, consistent with the underscore removal)
  const tfs = documents.map((document) => getTf(getWords(document, withStopWords, language), n, stats));
  // Calculate the tf.idf for each doc & produce stats per word
  const data = tfs.map((docTfs) => geTfIdf(docTfs, documents.length, stats));
  // Calculate stats : min, max, avg for tf, idf & tf.idf
  // (Math.min/Math.max with spread replace _.min/_.max; same Infinity edge semantics)
  for (const wordStat of stats.values()) {
    wordStat.tfMin = Math.min(...wordStat.tfs);
    wordStat.tfMax = Math.max(...wordStat.tfs);
    wordStat.tfAvg = wordStat.tfSum / wordStat.nbrDocs;
    wordStat.idfMax = Math.max(...wordStat.idfs);
    wordStat.idfAvg = wordStat.idfSum / wordStat.nbrDocs;
    wordStat.tfIdfMin = Math.min(...wordStat.tfIdfs);
    wordStat.tfIdfMax = Math.max(...wordStat.tfIdfs);
    wordStat.tfIdfAvg = wordStat.tfIdfSum / wordStat.nbrDocs;
  }
  result.documents = data;
  result.numberOfDocs = documents.length;
  result.stats = stats;
  return result;
}
/**
 * initWordStat - Create a new Stat object for one word
 *
 * @param {string} word the word
 * @param {number} tf the tf value
 * @returns {object} the stat about the word, seen in exactly one document so far
 */
function initWordStat(word, tf) {
  // idf/tfIdf accumulators start empty; they are filled in later by geTfIdf
  const accumulators = {
    idfSum: 0,
    idfs: [],
    tfIdfSum: 0,
    tfIdfs: []
  };
  return {
    word,
    nbrDocs: 1,
    tfSum: tf,
    tfs: [ tf ],
    ...accumulators
  };
}
exports.getStatements = getStatements;

@@ -304,8 +163,6 @@

// Public API: text helpers and tf/idf computations defined above
exports.removeLineBreaks = removeLineBreaks;
exports.getWords = getWords;
exports.getNgrams = getNgrams;
exports.getTf = getTf;
exports.getTfIdfs = getTfIdfs;

2

package.json
{
"name": "natural-content",
"version": "1.0.10",
"version": "1.0.11",
"description": "A set of natural functions like tf.idf, extract words & n-grams, remove diacritics, ... (experimental project)",

@@ -5,0 +5,0 @@ "main": "index.js",

const assert = require('assert');
const _ = require('underscore');
const natural = require('../index.js');

@@ -90,31 +89,2 @@

});
it('term frequency', () => {
const info = natural.getTf(natural.getWords(documents[0], false, 'fr'));
// console.log(info);
assert(info.max === 5);
assert(info.count.word1 === 5);
assert(info.count.word6 === 2);
assert(info.tfs.word1 === 1);
assert(info.tfs.word2 === 0.2);
assert(info.tfs.word6 === 0.4);
});
it('tf.idf for a set of document ', () => {
const info = natural.getTfIdfs(documents, 1, false, 'fr');
const sorted = _.sortBy(Array.from(info.stats.values()), (word) => -word.tfIdfSum);
assert(sorted[0].word === 'word1');
// console.log(sorted);
});
// it.only("tf.idf for a set of document in french", function() {
// var info = natural.getTfIdfs(documentsFr, 3, true);
// var sorted = _.sortBy(Array.from(info.stats.values()), function(word) { return -word.tfIdfSum;});
// //assert(sorted[0].word === "word1");
// console.log(sorted);
// });
});
Socket — SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc