nlp-toolkit
Advanced tools
Comparing version 0.2.3 to 0.2.4
@@ -34,4 +34,7 @@ /** | ||
})) | ||
.on('data', function (sentence) { | ||
console.log(sentence); | ||
.pipe(nlp.crossValidation({ | ||
classifiers: [nlp.classifiers.bayes] | ||
})) | ||
.on('data', function (data) { | ||
console.log(nlp.calculate.accuracy(data)); | ||
}) | ||
@@ -38,0 +41,0 @@ .on('error', function (err) { |
@@ -1,1 +0,4 @@ | ||
positive|amazing, awesome movie!! Yeah!! Oh boy. | ||
positive|amazing, awesome movie!! Yeah!! Oh boy. | ||
positive|Sweet, this is incredibly, amazing, perfect, great!! | ||
negative|terrible, shitty thing. Damn. Sucks!! | ||
positive|awesome, cool, amazing!! Yay. |
@@ -91,3 +91,3 @@ /** | ||
function stemWords(text, lang) { | ||
if (!stemmerLookup.hasOwnProperty(lang)) { | ||
if (!stemmerLookup.hasOwnProperty(lang) && lang !== 'default') { | ||
console.log(lang + ' is not a supported language for stemming.'); | ||
@@ -94,0 +94,0 @@ return text; |
@@ -20,4 +20,2 @@ /** | ||
debug('validateClassifier'); | ||
options = options || {}; | ||
@@ -29,7 +27,15 @@ var trainingSet = options.trainingSet || []; | ||
debug('trainingSet.length', trainingSet.length); | ||
debug('testSet.length', testSet.length); | ||
trainingSet.forEach(function (sentence) { | ||
_classifier.learn(sentence, sentence.feature); | ||
if (sentence) { | ||
_classifier.learn(sentence, sentence.feature); | ||
} | ||
}); | ||
return testSet.map(function (sentence) { | ||
if (!sentence) { | ||
return false; | ||
} | ||
return { | ||
@@ -39,2 +45,4 @@ feature: sentence.feature, | ||
}; | ||
}).filter(function (sentence) { | ||
return !!sentence; | ||
}); | ||
@@ -41,0 +49,0 @@ |
@@ -26,3 +26,3 @@ { | ||
}, | ||
"version": "0.2.3", | ||
"version": "0.2.4", | ||
"keywords": [ | ||
@@ -29,0 +29,0 @@ "nlp", |
@@ -34,1 +34,83 @@ # Natural Language Processing Toolkit for node.js # | ||
## Modules ## | ||
### Tokenizer ### | ||
```javascript | ||
.pipe(nlp.tokenizer(options)) | ||
``` | ||
`options`: | ||
| attribute | type | description | | ||
|-----------|------|-------------| | ||
| characters | RegExp | regular expression that describes which characters to strip off (default `/[^\w]/g`). | | ||
| separator | RegExp | regular expression that describes where to split words (default `/\s/g`). | | ||
| eliminateNumbers| boolean | discard tokens that only contain numbers (default `false`). | | ||
| toLowerCase | boolean | transform every token to lower case (default `true`). | | ||
| emptyStrings | boolean | keep empty strings when earlier processing steps reduce a token to length 0 (default `false`). | | ||
The tokenizer also works in a non-stream context: | ||
```javascript | ||
var tokens = nlp.tokenizer(string, options); | ||
``` | ||
### Stopwords ### | ||
```javascript | ||
.pipe(nlp.stopwords(options)) | ||
``` | ||
`options`: | ||
| attribute | type | description | | ||
|-----------|------|-------------| | ||
| defaultLang | string | default language if processed object does not provide a `lang` attribute (default `en`). | | ||
| additionalWords | object | add additional stopwords to the list of stopwords | | ||
`additionalWords`: | ||
| attribute | type | description | | ||
|-----------|------|-------------| | ||
| all | array | list of stopwords to add to every language | | ||
| default | array | list of stopwords if language is not supported | | ||
| _lang_ | array | list of stopwords specific to _lang_ | | ||
Supported languages: `da, de, en, es, fi, fr, hu, it, nl, no, pt, ro, ru, se, tr`. | ||
Stopwords also work in a non-stream context: | ||
```javascript | ||
nlp.stopwords(sentence, options) | ||
.then(function (tokens) {}) | ||
.catch(function (err) { console.error(err); }); | ||
``` | ||
### Stemmer ### | ||
```javascript | ||
.pipe(nlp.stemmer(options)) | ||
``` | ||
`options`: | ||
| attribute | type | description | | ||
|-----------|------|-------------| | ||
| defaultStemmer | string | default stemmer for language if processed object does not provide a `lang` attribute (default `en`). | | ||
Supported languages: `da, de, en, es, fi, fr, hu, it, nl, no, pt, ro, ru, se, tr`. | ||
The stemmer also works in a non-stream context: | ||
```javascript | ||
var tokens = nlp.stemmer(sentence, options); | ||
``` | ||
This module uses the stemmer implementation of [Snowball-Stemmer](https://github.com/shibukawa/snowball-stemmer.jsx). | ||
### Frequency Distribution ### | ||
```javascript | ||
.pipe(nlp.frequency()) | ||
``` |
757981
941
115