natural-content
Advanced tools
Comparing version 1.0.8 to 1.0.9
36
index.js
@@ -17,5 +17,5 @@ var _ = require('underscore'); | ||
return text.replace(/[\n\r]/g, WORD_SEPARATOR) // Convert end of line | ||
.replace(/[\t]/g, EMPTY) // Remove Tabs | ||
.replace(/ /gi,'')// remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig,EMPTY) // remove HTML tags | ||
.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/ /gi,WORD_SEPARATOR)// remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig, WORD_SEPARATOR) // remove HTML tags | ||
.replace(/ +/g, WORD_SEPARATOR) // remove multiple spaces | ||
@@ -35,13 +35,23 @@ .replace(/[.]/g, "." + STATEMENT_SEPARATOR) | ||
function removeSpecials(text) { | ||
var lower = text.toLowerCase(); | ||
var upper = text.toUpperCase(); | ||
var cleanText = text.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/[\n\r]/g, WORD_SEPARATOR) | ||
.replace(/ /gi,WORD_SEPARATOR)// remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig,WORD_SEPARATOR) // remove HTML tags | ||
.replace(/[|&’«»'"\/(\/)\/!\/?\\-]/g, WORD_SEPARATOR); | ||
var resut = ""; | ||
var lower = cleanText.toLowerCase(); | ||
var upper = cleanText.toUpperCase(); | ||
var result = ""; | ||
for(var i=0; i<lower.length; ++i) { | ||
if(lower[i] !== upper[i] || lower[i].trim() === ''){ | ||
resut += text[i]; | ||
result += cleanText[i]; | ||
} | ||
} | ||
return resut; | ||
result = result.replace(/\s+/g, WORD_SEPARATOR); // remove multiple spaces | ||
return result; | ||
} | ||
@@ -72,7 +82,7 @@ | ||
var words = text.replace(/[\n\r]/g, WORD_SEPARATOR) // Convert end of line | ||
.replace(/[\t]/g, EMPTY) // Remove Tabs | ||
.replace(/ /gi,'') // remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig,EMPTY) // remove HTML tags | ||
.replace(/[’«»'";:,.\/(\/)\/!\/?\\-]/g, WORD_SEPARATOR) // Remove punctuations | ||
.replace(/ +/g, WORD_SEPARATOR) // remove multiple spaces | ||
.replace(/[\t]/g, WORD_SEPARATOR) // Remove Tabs | ||
.replace(/ /gi,WORD_SEPARATOR) // remove HTML entities, only non breaking space | ||
.replace(/(<([^>]+)>)/ig,WORD_SEPARATOR) // remove HTML tags | ||
.replace(/['’«»";:,.\/(\/)\/!\/?\\-]/g, WORD_SEPARATOR) // Remove punctuations | ||
.replace(/\s+/g, WORD_SEPARATOR) // remove multiple spaces | ||
.toLowerCase() | ||
@@ -79,0 +89,0 @@ .split(WORD_SEPARATOR); |
{ | ||
"name": "natural-content", | ||
"version": "1.0.8", | ||
"version": "1.0.9", | ||
"description": "A set of natural functions like tf.idf, extract words & n-grams, remove diacritics, ... (experimental project)", | ||
"main": "index.js", | ||
"scripts": { | ||
"test": "echo \"Error: no test specified\" && exit 1" | ||
"test": "mocha" | ||
}, | ||
@@ -9,0 +9,0 @@ "author": "", |
@@ -12,2 +12,7 @@ var assert = require("assert"); | ||
var documentsFr = [ | ||
"Les conditions d'utilisations de l'objet doivent se faire dans de bonnes conditions. Sinon l'objet ne peut pas bien être utilisé.", | ||
"Les conditions d'emploi de la chose doivent se faire dans de bonne condition. Sinon l'objet n'est pas utilisable.", | ||
"Pour éviter une mauvaise utilisation, les conditions d'utilisations doivent être faite correctement." | ||
]; | ||
it('Statements', function() { | ||
@@ -22,6 +27,14 @@ var stats = natural.getStatements("word1 word2 word3 word4 :word5 word6. word7 word1, word8 word9 word10 word11 word6. word1 word12 word13"); | ||
var result = natural.removeSpecials(text); | ||
//console.log(result); | ||
assert( result === 'ceci est un texte en français sans caractères spéciaux avanthier'); | ||
assert( result === 'ceci est un texte en français sans caractères spéciaux avant hier'); | ||
}); | ||
it('apostrophe', function(){ | ||
var text = "ceci est un texte en français. l'été sera chaud. Les conditions d'utilisation de l'objet"; | ||
var result = natural.removeSpecials(text); | ||
assert( result === 'ceci est un texte en français l été sera chaud Les conditions d utilisation de l objet'); | ||
}); | ||
it('diacritics', function(){ | ||
@@ -50,2 +63,7 @@ var text = "ceci est un texte en français ! sans diacritiques çàoözęùô"; | ||
words = natural.getWords("l'été sera chaud. Les conditions d'utilisation de l'objet", false, "fr"); | ||
assert(words.length === 4); | ||
words = natural.getWords("l'été sera chaud. Les conditions d'utilisation de l'objet", true); | ||
assert(words.length === 11); | ||
}); | ||
@@ -66,2 +84,6 @@ | ||
grams = natural.getNgrams(natural.getWords("l'été sera chaud. Les conditions d'utilisation de l'objet", false, "fr"), 1); | ||
assert(grams.length === 4); | ||
}); | ||
@@ -92,2 +114,15 @@ | ||
/* | ||
it.only("tf.idf for a set of document in french", function() { | ||
var info = natural.getTfIdfs(documentsFr, 3, true); | ||
var sorted = _.sortBy(Array.from(info.stats.values()), function(word) { return -word.tfIdfSum;}); | ||
//assert(sorted[0].word === "word1"); | ||
console.log(sorted); | ||
}); | ||
*/ | ||
}); |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
No tests
Quality: Package does not have any tests. This is a strong signal of a poorly maintained or low-quality package.
Found 1 instance in 1 package
32329
1296
1