nlp-toolkit
Advanced tools
Comparing version 0.2.2 to 0.2.3
@@ -20,3 +20,5 @@ /** | ||
.pipe(nlp.stopwords({ | ||
words: ['mr', 'mrs', 'miss'] | ||
additionalWords: { | ||
all: ['mr', 'mrs', 'miss'] | ||
} | ||
})) | ||
@@ -23,0 +25,0 @@ .pipe(nlp.stemmer()) |
@@ -52,2 +52,3 @@ /** | ||
var DEFAULT_STEMMER = 'en'; | ||
var stemmerCache = {}; | ||
@@ -58,10 +59,15 @@ | ||
*/ | ||
function stemmer(options) { | ||
function stemmer(text, options) { | ||
if (!options && Object.prototype.toString.call(text) !== '[object Array]') { | ||
options = text; | ||
text = ''; | ||
} | ||
options = options || {}; | ||
options.defaultStemmer = options.defaultStemmer || DEFAULT_STEMMER; | ||
var stemmerCache = { | ||
default: new stemmerLookup[options.defaultStemmer]() | ||
}; | ||
if (!stemmerCache.default) { | ||
stemmerCache.default = new stemmerLookup[options.defaultStemmer](); | ||
} | ||
@@ -82,2 +88,19 @@ var getLang = function () { | ||
if (text) { | ||
return stemWords(text, options.lang || options.defaultStemmer); | ||
} | ||
/**
 * Stems every token of `text` using the stemmer registered for `lang`.
 *
 * When `lang` is not an own key of `stemmerLookup`, a message is logged and
 * `text` is handed back untouched. Otherwise the stemmer instance for `lang`
 * is created on first use, kept in `stemmerCache`, and applied to each token.
 *
 * @param {Array} text - tokens to stem.
 * @param {string} lang - key identifying the stemmer in `stemmerLookup`.
 * @returns {Array} the stemmed tokens, or `text` itself for an unsupported language.
 */
function stemWords(text, lang) {
  var isSupported = stemmerLookup.hasOwnProperty(lang);

  if (isSupported) {
    // Build the stemmer lazily; later calls for the same language reuse it.
    var isCached = stemmerCache.hasOwnProperty(lang);
    if (!isCached) {
      stemmerCache[lang] = new stemmerLookup[lang]();
    }

    var stemmed = text.map(function (word) {
      return stemmerCache[lang].stemWord(word);
    });
    return stemmed;
  }

  console.log(lang + ' is not a supported language for stemming.');
  return text;
}
debug('defaultStemmer', options.defaultStemmer); | ||
@@ -92,11 +115,4 @@ | ||
var lang = getLang(chunk); | ||
if (!stemmerCache.hasOwnProperty(lang)) { | ||
stemmerCache[lang] = new stemmerLookup[lang](); | ||
} | ||
var tokens = stemWords(_chunk, getLang(chunk)); | ||
var tokens = _chunk.map(function (token) { | ||
return stemmerCache[lang].stemWord(token); | ||
}); | ||
var response; | ||
@@ -103,0 +119,0 @@ if (Object.prototype.toString.call(chunk) !== '[object Array]') { |
@@ -24,2 +24,3 @@ /** | ||
var DEFAULT_LANG = 'en'; | ||
var stopwordsCache = {}; | ||
@@ -30,4 +31,9 @@ | ||
*/ | ||
function stopwords(options) { | ||
function stopwords(text, options) { | ||
if (!options && Object.prototype.toString.call(text) !== '[object Array]') { | ||
options = text; | ||
text = ''; | ||
} | ||
options = options || {}; | ||
@@ -47,6 +53,2 @@ options.defaultLang = options.defaultLang || DEFAULT_LANG; | ||
var stopwordsCache = { | ||
default: getStopwordsWrapper(options.defaultFilename, options.additionalWords, 'default') | ||
}; | ||
var getLang = function () { | ||
@@ -66,5 +68,24 @@ return 'default'; | ||
stopwordsCache.default = getStopwordsWrapper(options.defaultFilename, options.additionalWords, 'default'); | ||
if (text) { | ||
return removeStopwords(text, options.lang || options.defaultLang); | ||
} | ||
debug('defaultLang', options.defaultLang); | ||
debug('defaultFilename', options.defaultFilename); | ||
/**
 * Filters the stopwords for `lang` out of a list of tokens.
 *
 * The loader for `lang` is created through `getStopwordsWrapper` the first
 * time the language is seen and stored in `stopwordsCache`. Calling the
 * loader yields a promise for a lookup object; any token that is an own key
 * of that object is dropped.
 *
 * @param {Array} text - tokens to filter.
 * @param {string} lang - key under which the language's loader is cached.
 * @returns {Promise<Array>} resolves with the tokens that are not stopwords.
 */
function removeStopwords(text, lang) {
  var hasLoader = stopwordsCache.hasOwnProperty(lang);
  if (!hasLoader) {
    var filename = getFilename(lang);
    stopwordsCache[lang] = getStopwordsWrapper(filename, options.additionalWords, lang);
  }

  // Invoked through the cache entry so the call receiver stays the cache object.
  var pendingLookup = stopwordsCache[lang]();

  return pendingLookup.then(function (stopwordLookup) {
    return text.filter(function (candidate) {
      var isStopword = stopwordLookup.hasOwnProperty(candidate);
      return !isStopword;
    });
  });
}
return through2.obj(function (chunk, enc, callback) { | ||
@@ -77,14 +98,4 @@ | ||
var lang = getLang(chunk); | ||
if (!stopwordsCache.hasOwnProperty(lang)) { | ||
stopwordsCache[lang] = getStopwordsWrapper(getFilename(lang), options.additionalWords, lang); | ||
} | ||
stopwordsCache[lang]() | ||
.then(function (stopwordsCache) { | ||
var tokens = _chunk.filter(function (token) { | ||
return !stopwordsCache.hasOwnProperty(token); | ||
}); | ||
removeStopwords(_chunk, getLang(chunk)) | ||
.then(function (tokens) { | ||
var response; | ||
@@ -97,14 +108,12 @@ if (Object.prototype.toString.call(chunk) !== '[object Array]') { | ||
} | ||
return callback(null, response); | ||
}) | ||
.catch(callback); | ||
}) | ||
.catch(function (err) { | ||
return callback(err); | ||
}); | ||
}); | ||
} | ||
function getStopwordsWrapper(filename, additionalWords, lang) { | ||
var stopwordsCache; | ||
var singleStopwordsCache; | ||
var _words = [].concat(additionalWords.all); | ||
@@ -114,4 +123,4 @@ _words = _words.concat(additionalWords[lang] || additionalWords.default); | ||
return new Promise(function (resolve, reject) { | ||
if (stopwordsCache) { | ||
return resolve(stopwordsCache); | ||
if (singleStopwordsCache) { | ||
return resolve(singleStopwordsCache); | ||
} | ||
@@ -125,7 +134,7 @@ var _stopwordsCache = {}; | ||
.on('end', function () { | ||
stopwordsCache = _stopwordsCache; | ||
singleStopwordsCache = _stopwordsCache; | ||
_words.forEach(function (_word) { | ||
stopwordsCache[_word] = 1; | ||
singleStopwordsCache[_word] = 1; | ||
}) | ||
return resolve(stopwordsCache); | ||
return resolve(singleStopwordsCache); | ||
}) | ||
@@ -132,0 +141,0 @@ .on('error', function (err) { |
@@ -29,4 +29,9 @@ /** | ||
*/ | ||
function tokenizer(options) { | ||
function tokenizer(text, options) { | ||
if (!options && typeof text === 'object') { | ||
options = text; | ||
text = ''; | ||
} | ||
options = options || {}; | ||
@@ -45,2 +50,6 @@ options.characters = (options.characters instanceof RegExp) ? options.characters : DEFAULT_CHARACTERS; | ||
if (text) { | ||
return tokenize(text, options); | ||
} | ||
return through2.obj(function (chunk, enc, callback) { | ||
@@ -54,14 +63,3 @@ | ||
var tokens = _chunk.split(options.separator).map(function (token) { | ||
token = token.replace(options.characters, ''); | ||
if (options.eliminateNumbers) { | ||
token = token.replace(/^\d+$/, ''); | ||
} | ||
if (options.toLowerCase) { | ||
token = token.toLowerCase(); | ||
} | ||
return token; | ||
}).filter(function (token) { | ||
return !!token || options.emptyStrings; | ||
}); | ||
var tokens = tokenize(_chunk, options); | ||
@@ -83,3 +81,22 @@ var response; | ||
/**
 * Splits `text` into cleaned tokens.
 *
 * Each piece produced by splitting on `options.separator` has everything
 * matching `options.characters` removed. Purely numeric tokens are blanked
 * when `options.eliminateNumbers` is set, and tokens are lower-cased when
 * `options.toLowerCase` is set. Empty tokens are kept only when
 * `options.emptyStrings` is truthy.
 *
 * @param {string} text - the text to split.
 * @param {Object} options - reads `separator`, `characters`,
 *   `eliminateNumbers`, `toLowerCase` and `emptyStrings`.
 * @returns {Array} the resulting tokens, in their original order.
 */
function tokenize(text, options) {
  var pieces = text.split(options.separator);
  var kept = [];

  for (var i = 0; i < pieces.length; i += 1) {
    var cleaned = pieces[i].replace(options.characters, '');

    if (options.eliminateNumbers) {
      // A token made up only of digits collapses to the empty string.
      cleaned = cleaned.replace(/^\d+$/, '');
    }

    if (options.toLowerCase) {
      cleaned = cleaned.toLowerCase();
    }

    if (cleaned || options.emptyStrings) {
      kept.push(cleaned);
    }
  }

  return kept;
}
/** | ||
@@ -86,0 +103,0 @@ * EXPORTS. |
@@ -26,3 +26,3 @@ { | ||
}, | ||
"version": "0.2.2", | ||
"version": "0.2.3", | ||
"keywords": [ | ||
@@ -29,0 +29,0 @@ "nlp", |
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
755185
47
930
7