natural-content
Advanced tools
Comparing version 1.0.10 to 1.0.11
189
index.js
@@ -1,3 +0,1 @@ | ||
const _ = require('underscore'); | ||
const diacritics = require('./lib/diacritics.js'); | ||
@@ -33,2 +31,6 @@ | ||
function removeSpecials(text) { | ||
if (!text) { | ||
return ''; | ||
} | ||
const cleanText = text.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
@@ -54,6 +56,20 @@ .replace(/[\n\r]/g, WORD_SEPARATOR) | ||
return result; | ||
return result.trim(); | ||
} | ||
/**
 * removeLineBreaks - Strip every line break (\r\n, \n or \r) from a text.
 *
 * @param {string} text the input text (may be null, undefined or empty)
 * @returns {string} the trimmed text with all line breaks removed; '' for falsy input
 */
function removeLineBreaks(text) {
  if (!text) {
    return '';
  }
  // Splitting on every break sequence then re-joining is equivalent to a
  // global replace of /(\r\n|\n|\r)/ with the empty string.
  return text
    .split(/\r\n|\n|\r/)
    .join('')
    .trim();
}
/** | ||
* removeDiacritics - Remove all diacritics from a text | ||
@@ -126,3 +142,3 @@ * | ||
const count = _.max([ 0, words.length - n + 1 ]); | ||
const count = Math.max(0, words.length - n + 1); | ||
@@ -133,3 +149,3 @@ for (let i = 0; i < count; i++) { | ||
// Convert the ngram array into a ngram string and add it in the result list | ||
result.push(_.reduce(slice, (memo, word) => memo ? `${ memo } ${ word }` : word)); | ||
result.push(slice.reduce((memo, word) => memo ? `${ memo } ${ word }` : word)); | ||
} | ||
@@ -140,159 +156,2 @@ | ||
/**
 * getTf - Get the term frequency (tf) of each word of a document.
 *
 * @param {Array<string>} words array of words matching the document content
 * @param {number} n cardinality of the ngrams (optional). Has to be > 0; if > 1
 *                   the words are first converted into n-grams via getNgrams
 * @param {Map} stats the current stats about the words/ngrams (may be null).
 *                    Used to accumulate per-word data for later tf/idf computation
 * @returns {object} a json structure:
 * {
 *   count : occurrences of each word in the document (indexed by word),
 *   tfs   : tf value of each word (indexed by word),
 *   max   : the occurrence count of the most frequent word
 * }
 */
function getTf(words, n, stats) {
  let ws = words;
  if (n && n > 1) {
    ws = getNgrams(words, n);
  }
  // Count occurrences. A prototype-less object avoids collisions with
  // inherited keys such as 'constructor' (same safety _.countBy provided).
  const count = Object.create(null);
  for (const word of ws) {
    count[word] = (count[word] || 0) + 1;
  }
  // Highest occurrence count; -Infinity for an empty input, matching _.max.
  const max = Math.max(...Object.values(count));
  const tfs = {};
  Object.keys(count).forEach((word) => {
    // Calculate the tf for this word (relative to the most frequent word)
    tfs[word] = count[word] / max;
    // Update stats
    if (stats) {
      // Calculate sum & register the tf for min & max computation
      if (stats.has(word) && stats.get(word).tfs) {
        const wordStat = stats.get(word);
        // update the number of documents for this word
        wordStat.nbrDocs++;
        // Add the tf in the list of all tfs for this word
        wordStat.tfs.push(tfs[word]);
        wordStat.tfSum += tfs[word];
      } else {
        const newWordStat = initWordStat(word, tfs[word]);
        stats.set(word, newWordStat);
      }
    }
  });
  return {
    count,
    tfs,
    max
  };
}
/**
 * geTfIdf - Get the tfIdf for each word of a document.
 * (Name kept as-is — it is part of the module's public surface.)
 *
 * @param {object} document the document represented by a term frequency structure
 *                          (as returned by getTf: { count, tfs, max })
 * @param {number} nbrDocs the total number of documents
 * @param {Map} stats stats about the words for the full set of documents;
 *                    entries with tfIdfs/idfs arrays are updated in place
 * @returns {object} the same document object, with a tfIdf property added
 */
function geTfIdf(document, nbrDocs, stats) {
  const tfIdf = {};
  Object.keys(document.tfs).forEach((word) => {
    // Smoothed idf: log(N / df) + 1, so a word present in every doc still counts
    const idf = Math.log(nbrDocs / stats.get(word).nbrDocs) + 1;
    tfIdf[word] = document.tfs[word] * idf;
    if (stats.has(word) && stats.get(word).tfIdfs && stats.get(word).idfs) {
      const wordStat = stats.get(word);
      wordStat.tfIdfs.push(tfIdf[word]);
      wordStat.tfIdfSum += tfIdf[word];
      wordStat.idfs.push(idf);
      wordStat.idfSum += idf;
    }
  });
  document.tfIdf = tfIdf;
  return document;
}
/**
 * getTfIdfs - Get the TF.IDF for each word found in several documents.
 *
 * @param {Array} documents arrays of String matching to the document content. It could be Text or HTML
 * @param {number} n ngram cardinality (optional). Has to be > 0
 * @param {boolean} withStopWords if true, remove the stopwords
 * @param {string} language the language code (fr, en, ... )
 * @returns {object} the tf/idf for each word/ngram:
 *                   { documents, numberOfDocs, stats }
 */
function getTfIdfs(documents, n, withStopWords, language) {
  const stats = new Map();
  // Calculate the TF of each word for each doc (also seeds per-word stats)
  const tfs = documents.map((document) => getTf(getWords(document, withStopWords, language), n, stats));
  // Calculate the tf.idf for each doc & accumulate stats per word
  const data = tfs.map((docTfs) => geTfIdf(docTfs, documents.length, stats));
  // Calculate stats : min, max, avg for tf, idf & tf.idf
  for (const wordStat of stats.values()) {
    wordStat.tfMin = Math.min(...wordStat.tfs);
    wordStat.tfMax = Math.max(...wordStat.tfs);
    wordStat.tfAvg = wordStat.tfSum / wordStat.nbrDocs;
    wordStat.idfMax = Math.max(...wordStat.idfs);
    wordStat.idfAvg = wordStat.idfSum / wordStat.nbrDocs;
    wordStat.tfIdfMin = Math.min(...wordStat.tfIdfs);
    wordStat.tfIdfMax = Math.max(...wordStat.tfIdfs);
    wordStat.tfIdfAvg = wordStat.tfIdfSum / wordStat.nbrDocs;
  }
  return {
    documents: data,
    numberOfDocs: documents.length,
    stats
  };
}
/**
 * initWordStat - Create a new stat object for one word.
 *
 * @param {string} word the word
 * @param {number} tf the tf value of the word in its first document
 * @returns {object} the initial stat record about the word
 */
function initWordStat(word, tf) {
  // The word starts with exactly one document; idf/tfIdf accumulators are
  // filled in later, once the whole corpus has been processed.
  const stat = {
    word,
    nbrDocs: 1,
    tfSum: tf,
    tfs: [ tf ]
  };
  stat.idfSum = 0;
  stat.idfs = [];
  stat.tfIdfSum = 0;
  stat.tfIdfs = [];
  return stat;
}
exports.getStatements = getStatements; | ||
@@ -304,8 +163,6 @@ | ||
exports.removeLineBreaks = removeLineBreaks; | ||
exports.getWords = getWords; | ||
exports.getNgrams = getNgrams; | ||
exports.getTf = getTf; | ||
exports.getTfIdfs = getTfIdfs; |
{ | ||
"name": "natural-content", | ||
"version": "1.0.10", | ||
"version": "1.0.11", | ||
"description": "A set of natural functions like tf.idf, extract words & n-grams, remove diacritics, ... (experimental project)", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
const assert = require('assert'); | ||
const _ = require('underscore'); | ||
const natural = require('../index.js'); | ||
@@ -90,31 +89,2 @@ | ||
}); | ||
// Verifies getTf: occurrence counts and tf values for the first fixture document.
it('term frequency', () => {
  const tf = natural.getTf(natural.getWords(documents[0], false, 'fr'));
  assert(tf.max === 5);
  assert(tf.count.word1 === 5);
  assert(tf.count.word6 === 2);
  assert(tf.tfs.word1 === 1);
  assert(tf.tfs.word2 === 0.2);
  assert(tf.tfs.word6 === 0.4);
});
// Verifies getTfIdfs: the word with the highest cumulative tf.idf should rank first.
it('tf.idf for a set of document ', () => {
  const result = natural.getTfIdfs(documents, 1, false, 'fr');
  const ranked = _.sortBy(Array.from(result.stats.values()), (word) => -word.tfIdfSum);
  assert(ranked[0].word === 'word1');
});
// it.only("tf.idf for a set of document in french", function() { | ||
// var info = natural.getTfIdfs(documentsFr, 3, true); | ||
// var sorted = _.sortBy(Array.from(info.stats.values()), function(word) { return -word.tfIdfSum;}); | ||
// //assert(sorted[0].word === "word1"); | ||
// console.log(sorted); | ||
// }); | ||
}); |
License Policy Violation
License — This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License — This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
27456
1175