Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

natural-content

Package Overview
Dependencies
Maintainers
1
Versions
23
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

natural-content - npm Package Compare versions

Comparing version 1.0.10 to 1.0.11

189

index.js

@@ -1,3 +0,1 @@

const _ = require('underscore');
const diacritics = require('./lib/diacritics.js');

@@ -33,2 +31,6 @@

function removeSpecials(text) {
if (!text) {
return '';
}
const cleanText = text.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs

@@ -54,6 +56,20 @@ .replace(/[\n\r]/g, WORD_SEPARATOR)

return result;
return result.trim();
}
/**
 * removeLineBreaks - Remove line breaks in a text
 *
 * @param {string} text the text
 * @returns {string} the text without line breaks, trimmed
 */
function removeLineBreaks(text) {
  if (!text) {
    return '';
  }
  const joined = text.replace(/(\r\n|\n|\r)/gm, '');
  return joined.trim();
}
/**
* removeDiacritics - Remove all diacritics from a text

@@ -126,3 +142,3 @@ *

const count = _.max([ 0, words.length - n + 1 ]);
const count = Math.max(0, words.length - n + 1);

@@ -133,3 +149,3 @@ for (let i = 0; i < count; i++) {

// Convert the ngram array into a ngram string and add it in the result list
result.push(_.reduce(slice, (memo, word) => memo ? `${ memo } ${ word }` : word));
result.push(slice.reduce((memo, word) => memo ? `${ memo } ${ word }` : word));
}

@@ -140,159 +156,2 @@

/**
 * getTf - Get the term frequency (tf) of each word of a document
 *
 * @param {Array<string>} words array of words (String) matching to the document content
 * @param {number} n cardinality of the ngrams (optional). Has to be > 0
 * @param {Map} stats the current stats about the words or ngrams (could be null). This is used to compute the tf/idf
 * @returns {object} a json structure :
 * {
 *   count : a map of word -> number of occurrences in the document
 *   tfs : a map of word -> tf value
 *   max : the number of occurrences of the most frequent word (0 when there are no words)
 * }
 */
function getTf(words, n, stats) {
  let ws = words;
  if (n && n > 1) {
    ws = getNgrams(words, n);
  }
  // Count occurrences natively (replaces _.countBy, consistent with the
  // underscore removal elsewhere in this file)
  const count = {};
  for (const word of ws) {
    count[word] = (count[word] || 0) + 1;
  }
  // _.max on an empty collection returned -Infinity; 0 is a saner "no words" value
  const occurrences = Object.values(count);
  const max = occurrences.length > 0 ? Math.max(...occurrences) : 0;
  const tfs = {};
  Object.keys(count).forEach((word) => {
    // Calculate the tf for this word (normalized by the most frequent word)
    tfs[word] = count[word] / max;
    // Update stats
    if (stats) {
      // Calculate sum & register the tf for min & max computation
      if (stats.has(word) && stats.get(word).tfs) {
        const wordStat = stats.get(word);
        // update the number of documents for this word
        wordStat.nbrDocs++;
        // Add the tf in the list of all tfs for this word
        wordStat.tfs.push(tfs[word]);
        wordStat.tfSum += tfs[word];
      } else {
        const newWordStat = initWordStat(word, tfs[word]);
        stats.set(word, newWordStat);
      }
    }
  });
  return {
    count,
    tfs,
    max
  };
}
/**
 * geTfIdf - Get the tfIdf for each word of a document
 *
 * @param {object} document the document represented by a term frequency structure (see getTf)
 * @param {number} nbrDocs the number of documents
 * @param {Map} stats stats about the words for the full set of documents
 * @returns {object} the document, augmented with a tfIdf map (word -> tf.idf value)
 */
function geTfIdf(document, nbrDocs, stats) {
  const tfIdf = {};
  // Object.keys replaces _.keys, consistent with the underscore removal in this file
  Object.keys(document.tfs).forEach((word) => {
    // idf smoothed with +1 so words occurring in every document keep a non-zero weight
    const idf = Math.log(nbrDocs / stats.get(word).nbrDocs) + 1;
    tfIdf[word] = document.tfs[word] * idf;
    // Accumulate per-word corpus stats when the stat entry tracks them
    if (stats.has(word) && stats.get(word).tfIdfs && stats.get(word).idfs) {
      const wordStat = stats.get(word);
      wordStat.tfIdfs.push(tfIdf[word]);
      wordStat.tfIdfSum += tfIdf[word];
      wordStat.idfs.push(idf);
      wordStat.idfSum += idf;
    }
  });
  document.tfIdf = tfIdf;
  return document;
}
/**
 * getTfIdfs - Get the TF.IDF for each word found in several documents
 * (the older duplicate JSDoc block that contradicted this one has been removed)
 *
 * @param {Array} documents arrays of String matching to the document content. It could be Text or HTML
 * @param {number} n ngram cardinality (optional). Has to be > 0
 * @param {boolean} withStopWords stop-word handling flag, passed through to getWords
 *                  (NOTE(review): the two original doc blocks disagreed on its polarity — confirm in getWords)
 * @param {string} language the language code (fr, en, ... )
 * @returns {object} the tf/idf for each word/ngram, plus per-word stats
 */
function getTfIdfs(documents, n, withStopWords, language) {
  const result = {};
  const stats = new Map();
  // Calculate the TF of each word for each doc
  // (Array.prototype.map replaces _.map, consistent with the underscore removal)
  const tfs = documents.map((document) => getTf(getWords(document, withStopWords, language), n, stats));
  // Calculate the tf.idf for each doc & produce stats per word
  const data = tfs.map((docTfs) => geTfIdf(docTfs, documents.length, stats));
  // Calculate stats : min, max, avg for tf, idf & tf.idf
  // (Math.min/Math.max with spread replace _.min/_.max; same Infinity edge semantics)
  for (const wordStat of stats.values()) {
    wordStat.tfMin = Math.min(...wordStat.tfs);
    wordStat.tfMax = Math.max(...wordStat.tfs);
    wordStat.tfAvg = wordStat.tfSum / wordStat.nbrDocs;
    wordStat.idfMax = Math.max(...wordStat.idfs);
    wordStat.idfAvg = wordStat.idfSum / wordStat.nbrDocs;
    wordStat.tfIdfMin = Math.min(...wordStat.tfIdfs);
    wordStat.tfIdfMax = Math.max(...wordStat.tfIdfs);
    wordStat.tfIdfAvg = wordStat.tfIdfSum / wordStat.nbrDocs;
  }
  result.documents = data;
  result.numberOfDocs = documents.length;
  result.stats = stats;
  return result;
}
/**
 * initWordStat - Create a new Stat object for one word
 *
 * @param {string} word the word
 * @param {number} tf the tf value
 * @returns {object} the stat about the word, seen in exactly one document so far
 */
function initWordStat(word, tf) {
  // idf/tfIdf accumulators start empty; they are filled in later by geTfIdf
  const accumulators = {
    idfSum: 0,
    idfs: [],
    tfIdfSum: 0,
    tfIdfs: []
  };
  return {
    word,
    nbrDocs: 1,
    tfSum: tf,
    tfs: [ tf ],
    ...accumulators
  };
}
exports.getStatements = getStatements;

@@ -304,8 +163,6 @@

// Public API: text helpers and tf/idf computations defined above
exports.removeLineBreaks = removeLineBreaks;
exports.getWords = getWords;
exports.getNgrams = getNgrams;
exports.getTf = getTf;
exports.getTfIdfs = getTfIdfs;

2

package.json
{
"name": "natural-content",
"version": "1.0.10",
"version": "1.0.11",
"description": "A set of natural functions like tf.idf, extract words & n-grams, remove diacritics, ... (experimental project)",

@@ -5,0 +5,0 @@ "main": "index.js",

const assert = require('assert');
const _ = require('underscore');
const natural = require('../index.js');

@@ -90,31 +89,2 @@

});
it('term frequency', () => {
const info = natural.getTf(natural.getWords(documents[0], false, 'fr'));
// console.log(info);
assert(info.max === 5);
assert(info.count.word1 === 5);
assert(info.count.word6 === 2);
assert(info.tfs.word1 === 1);
assert(info.tfs.word2 === 0.2);
assert(info.tfs.word6 === 0.4);
});
it('tf.idf for a set of document ', () => {
const info = natural.getTfIdfs(documents, 1, false, 'fr');
const sorted = _.sortBy(Array.from(info.stats.values()), (word) => -word.tfIdfSum);
assert(sorted[0].word === 'word1');
// console.log(sorted);
});
// it.only("tf.idf for a set of document in french", function() {
// var info = natural.getTfIdfs(documentsFr, 3, true);
// var sorted = _.sortBy(Array.from(info.stats.values()), function(word) { return -word.tfIdfSum;});
// //assert(sorted[0].word === "word1");
// console.log(sorted);
// });
});
Socket — SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc