natural-content
Advanced tools
Comparing version 1.2.2 to 1.2.3
79
index.js
@@ -18,2 +18,11 @@ const { PorterStemmerFr, TfIdf, AggressiveTokenizerFr } = require('natural'); | ||
/**
 * Check whether a statement contains an acronym, i.e. a run of at least two
 * consecutive ASCII uppercase letters (A-Z) anywhere in the text.
 *
 * The search is unanchored on purpose: an anchored `^(.*)` prefix would stop
 * at the first line terminator (`.` does not match newlines) and miss
 * acronyms located on later lines of a multi-line statement.
 *
 * @param {string} statement the statement to inspect
 * @returns {boolean} true if the statement contains an acronym
 */
function containsAcronym(statement) {
  // No `g` flag, so the literal carries no `lastIndex` state between calls.
  return /[A-Z]{2,}/.test(statement);
}
/** | ||
* getStatements - Get all statements from a text | ||
@@ -26,21 +35,21 @@ * | ||
return text.replace(/[\n\r]/g, WORD_SEPARATOR) // Convert end of line | ||
.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/ /gi, WORD_SEPARATOR)// remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig, WORD_SEPARATOR) // remove HTML tags | ||
.replace(/ +/g, WORD_SEPARATOR) // remove multiple spaces | ||
.replace('...', STATEMENT_SEPARATOR) | ||
.replace(/[.]{3}/g, `.${ STATEMENT_SEPARATOR }`) | ||
.replace(/[.]/g, `.${ STATEMENT_SEPARATOR }`) | ||
.replace(/[!]/g, `!${ STATEMENT_SEPARATOR }`) | ||
.replace(/[?]/g, `?${ STATEMENT_SEPARATOR }`) | ||
.split(STATEMENT_SEPARATOR) | ||
.reduce((result, t) => { | ||
if (t.trim() === '') { | ||
return result; | ||
} | ||
.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/ /gi, WORD_SEPARATOR)// remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig, WORD_SEPARATOR) // remove HTML tags | ||
.replace(/ +/g, WORD_SEPARATOR) // remove multiple spaces | ||
.replace('...', STATEMENT_SEPARATOR) | ||
.replace(/[.]{3}/g, `.${STATEMENT_SEPARATOR}`) | ||
.replace(/[.]/g, `.${STATEMENT_SEPARATOR}`) | ||
.replace(/[!]/g, `!${STATEMENT_SEPARATOR}`) | ||
.replace(/[?]/g, `?${STATEMENT_SEPARATOR}`) | ||
.split(STATEMENT_SEPARATOR) | ||
.reduce((result, t) => { | ||
if (t.trim() === '') { | ||
return result; | ||
} | ||
result.push(t.trim()); | ||
result.push(t.trim()); | ||
return result; | ||
}, []); | ||
return result; | ||
}, []); | ||
} | ||
@@ -60,6 +69,6 @@ | ||
const cleanText = text.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/[\n\r]/g, WORD_SEPARATOR) | ||
.replace(/ /gi, WORD_SEPARATOR)// remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig, WORD_SEPARATOR) // remove HTML tags | ||
.replace(/[|&’«»'"\/(\/)\/!\/?\\-]/g, WORD_SEPARATOR); | ||
.replace(/[\n\r]/g, WORD_SEPARATOR) | ||
.replace(/ /gi, WORD_SEPARATOR)// remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig, WORD_SEPARATOR) // remove HTML tags | ||
.replace(/[|&’«»'"\/(\/)\/!\/?\\-]/g, WORD_SEPARATOR); | ||
@@ -95,6 +104,6 @@ const lower = cleanText.toLowerCase(); | ||
return text.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/[\n\r]/g, WORD_SEPARATOR) | ||
.replace(/[\n]/g, WORD_SEPARATOR) | ||
.replace(/\s+/g, WORD_SEPARATOR) | ||
.trim(); | ||
.replace(/[\n\r]/g, WORD_SEPARATOR) | ||
.replace(/[\n]/g, WORD_SEPARATOR) | ||
.replace(/\s+/g, WORD_SEPARATOR) | ||
.trim(); | ||
} | ||
@@ -132,9 +141,9 @@ | ||
const words = text.replace(/[\n\r]/g, WORD_SEPARATOR) // Convert end of line | ||
.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/ /gi, WORD_SEPARATOR) // remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig, WORD_SEPARATOR) // remove HTML tags | ||
.replace(/['’«»";:,.\/(\/)\/!\/?\\-]/g, WORD_SEPARATOR) // Remove punctuations | ||
.replace(/\s+/g, WORD_SEPARATOR) // remove multiple spaces | ||
.toLowerCase() | ||
.split(WORD_SEPARATOR); | ||
.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/ /gi, WORD_SEPARATOR) // remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig, WORD_SEPARATOR) // remove HTML tags | ||
.replace(/['’«»";:,.\/(\/)\/!\/?\\-]/g, WORD_SEPARATOR) // Remove punctuations | ||
.replace(/\s+/g, WORD_SEPARATOR) // remove multiple spaces | ||
.toLowerCase() | ||
.split(WORD_SEPARATOR); | ||
@@ -148,3 +157,3 @@ // Remove empty string | ||
const { stopwords } = require(`./lib/stopwords-${ language.toLowerCase() }`); | ||
const { stopwords } = require(`./lib/stopwords-${language.toLowerCase()}`); | ||
@@ -178,3 +187,3 @@ return words.filter((word) => word !== '' && stopwords.indexOf(removeDiacritics(word)) === -1); | ||
// Convert the ngram array into a ngram string and add it in the result list | ||
result.push(slice.reduce((memo, word) => memo ? `${ memo } ${ word }` : word)); | ||
result.push(slice.reduce((memo, word) => memo ? `${memo} ${word}` : word)); | ||
} | ||
@@ -222,2 +231,4 @@ | ||
exports.containsAcronym = containsAcronym; | ||
exports.getStatements = getStatements; | ||
@@ -224,0 +235,0 @@ |
{ | ||
"name": "natural-content", | ||
"version": "1.2.2", | ||
"version": "1.2.3", | ||
"description": "A set of natural functions like extracting words & n-grams, remove diacritics, get top keywords, ... (experimental project)", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
@@ -23,3 +23,4 @@ const assert = require('assert'); | ||
// console.log(natural.getStatements(txt)); | ||
assert(natural.containsAcronym('this is mister LOL')); | ||
assert(!natural.containsAcronym('this is mister John Smith')); | ||
}); | ||
@@ -26,0 +27,0 @@ |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Dynamic require
Supply chain risk: Dynamic require can indicate that the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Dynamic require
Supply chain risk: Dynamic require can indicate that the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package
4868
73080