wink-nlp-utils
Advanced tools
Comparing version 1.8.0 to 1.9.0
{ | ||
"name": "wink-nlp-utils", | ||
"version": "1.8.0", | ||
"version": "1.9.0", | ||
"description": "NLP Functions for amplifying negations, managing elisions, creating ngrams, stems, phonetic codes to tokens and more.", | ||
@@ -39,15 +39,16 @@ "keywords": [ | ||
"chai": "^4.1.2", | ||
"coveralls": "^3.0.0", | ||
"coveralls": "^3.0.1", | ||
"docker": "^1.0.0", | ||
"documentation": "^6.1.0", | ||
"documentation": "^6.3.3", | ||
"eslint": "^4.19.1", | ||
"istanbul": "^0.4.5", | ||
"jshint": "^2.9.5", | ||
"mocha": "^5.0.5", | ||
"mocha": "^5.1.1", | ||
"mocha-lcov-reporter": "^1.3.0" | ||
}, | ||
"dependencies": { | ||
"wink-helpers": "^1.3.0", | ||
"wink-porter2-stemmer": "^1.0.8" | ||
"wink-helpers": "^1.4.0", | ||
"wink-porter2-stemmer": "^1.0.8", | ||
"wink-tokenizer": "^4.0.0" | ||
} | ||
} |
@@ -50,2 +50,16 @@ | ||
// Tokenize a sentence. | ||
var s = 'For details on wink, check out http://winkjs.org/ URL!'; | ||
console.log( nlp.string.tokenize( s, true ) ); | ||
// -> [ { value: 'For', tag: 'word' }, | ||
// { value: 'details', tag: 'word' }, | ||
// { value: 'on', tag: 'word' }, | ||
// { value: 'wink', tag: 'word' }, | ||
// { value: ',', tag: 'punctuation' }, | ||
// { value: 'check', tag: 'word' }, | ||
// { value: 'out', tag: 'word' }, | ||
// { value: 'http://winkjs.org/', tag: 'url' }, | ||
// { value: 'URL', tag: 'word' }, | ||
// { value: '!', tag: 'punctuation' } ] | ||
// Remove stop words: | ||
@@ -52,0 +66,0 @@ var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] ); |
@@ -25,5 +25,3 @@ // wink-nlp-utils | ||
// | ||
var splitElisions = require( './string-split-elisions.js' ); | ||
var amplifyNotElision = require( './string-amplify-not-elision.js' ); | ||
var rgx = require( './util_regexes.js' ); | ||
var winkTokenize = require( 'wink-tokenizer' )().tokenize; | ||
@@ -35,51 +33,43 @@ // ## string | ||
* | ||
* The function uses the following set of rules to tokenize: | ||
* Tokenizes the input `sentence` according to the value of `detailed` flag. | ||
* Any occurrence of `...` in the `sentence` is | ||
* converted to ellipses. In `detailed = true` mode, it | ||
* tags every token with its type; the supported tags are currency, email, | ||
* emoji, emoticon, hashtag, number, ordinal, punctuation, quoted_phrase, symbol, | ||
* time, mention, url, and word. | ||
* | ||
* 1. Single quotes are processed first as they may be part of elisions; and | ||
* `...` are converted to ellipses. | ||
* 2. `Not` elisions are amplified and then split on elisions. Thus words with elisions get tokenized. | ||
* 3. The word `cannot` is split into `can not`. | ||
* 4. `. , -` punctuations that are commonly embedded in numbers are left intact, | ||
* 5. All other punctuations are tokenized. | ||
* 6. The currency symbols are padded by space i.e. become separate tokens. | ||
* 7. Underscore (`_`) embedded in the word is preserved. | ||
* 8. Special characters are left untouched and may/may not become separate token. | ||
* 9. Finally after removing extra/leading/trailing spaces, split on space to tokenize. | ||
* | ||
* @name string.tokenize | ||
* @param {string} str — the input string. | ||
* @return {string[]} of tokens. | ||
* @param {string} sentence — the input string. | ||
* @param {boolean} [detailed=false] — if true, each token is an object containing | ||
* `value` and `tag` of each token; otherwise each token is a string. Its default | ||
* value of **false** ensures compatibility with previous version. | ||
* @return {(string[]|object[])} an array of strings if `detailed` is false otherwise | ||
* an array of objects. | ||
* @example | ||
* tokenize( "someone's wallet, isn't it? I'll return!" ); | ||
* // -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it', | ||
* // '?', 'i', '\'ll', 'return', '!' ] | ||
* // -> [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?', | ||
* // 'I', '\'ll', 'return', '!' ] | ||
* | ||
* tokenize( 'For details on wink, check out http://winkjs.org/ URL!', true ); | ||
* // -> [ { value: 'For', tag: 'word' }, | ||
* // { value: 'details', tag: 'word' }, | ||
* // { value: 'on', tag: 'word' }, | ||
* // { value: 'wink', tag: 'word' }, | ||
* // { value: ',', tag: 'punctuation' }, | ||
* // { value: 'check', tag: 'word' }, | ||
* // { value: 'out', tag: 'word' }, | ||
* // { value: 'http://winkjs.org/', tag: 'url' }, | ||
* // { value: 'URL', tag: 'word' }, | ||
* // { value: '!', tag: 'punctuation' } ] | ||
*/ | ||
var tokenize = function ( str ) { | ||
// Handle single quotes first & ellipses. | ||
var su = str | ||
// > TODO: promote to regex utils after adding more test cases | ||
.replace( /(^|[^a-z0-9])(\’|\')/gi, '$1 $2 ') | ||
.replace( /([a-z0-9])(\’|\')(\W)/gi, '$1 $2 $3') | ||
.replace( '...', '…' ) | ||
.replace( '…', ' … ' ); | ||
var tokens = splitElisions( amplifyNotElision( su ) ) | ||
// Handle cannot. | ||
.replace( rgx.cannot, '$1 $2' ) | ||
// Separate out punctuations that are not part of a number. | ||
.replace( rgx.nonNumPunctuations, ' $& ' ) | ||
// Separate out all other punctuations. | ||
.replace( /[\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig, ' $& ' ) | ||
// Separate out currency symbol; all separated stuff becomes a token. | ||
.replace( rgx.currency, ' $& ') | ||
.replace( rgx.spaces, ' ' ) | ||
.trim() | ||
// Handle period sign in the end specially. | ||
.replace( /\.$/, ' .' ) | ||
// Now tokenize on space! | ||
.split( ' ' ); | ||
// Splitting an empty string on space leaves an empty string in the array, | ||
// get rid of it. | ||
return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens ); | ||
var tokenize = function ( sentence, detailed ) {
  // Convert every `...` to a single ellipsis character, as documented; a string
  // pattern would replace only the first occurrence, hence the global regex.
  var tokens = winkTokenize( sentence.replace( /\.\.\./g, '…' ) );
  var i;
  // In the default (non-detailed) mode, reduce each token to its `value`
  // string, which keeps the return shape compatible with previous versions.
  if ( !detailed ) {
    for ( i = 0; i < tokens.length; i += 1 ) tokens[ i ] = tokens[ i ].value;
  }
  return tokens;
}; // tokenize()
module.exports = tokenize; |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
83
135382
3
2649
+ Added wink-tokenizer@^4.0.0
+ Added wink-tokenizer@4.1.0 (transitive)
Updated wink-helpers@^1.4.0