Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

wink-nlp-utils

Package Overview
Dependencies
Maintainers
3
Versions
25
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

wink-nlp-utils - npm Package Compare versions

Comparing version 1.8.0 to 1.9.0

13

package.json
{
"name": "wink-nlp-utils",
"version": "1.8.0",
"version": "1.9.0",
"description": "NLP Functions for amplifying negations, managing elisions, creating ngrams, stems, phonetic codes to tokens and more.",

@@ -39,15 +39,16 @@ "keywords": [

"chai": "^4.1.2",
"coveralls": "^3.0.0",
"coveralls": "^3.0.1",
"docker": "^1.0.0",
"documentation": "^6.1.0",
"documentation": "^6.3.3",
"eslint": "^4.19.1",
"istanbul": "^0.4.5",
"jshint": "^2.9.5",
"mocha": "^5.0.5",
"mocha": "^5.1.1",
"mocha-lcov-reporter": "^1.3.0"
},
"dependencies": {
"wink-helpers": "^1.3.0",
"wink-porter2-stemmer": "^1.0.8"
"wink-helpers": "^1.4.0",
"wink-porter2-stemmer": "^1.0.8",
"wink-tokenizer": "^4.0.0"
}
}

@@ -50,2 +50,16 @@

// Tokenize a sentence.
var s = 'For details on wink, check out http://winkjs.org/ URL!';
console.log( nlp.string.tokenize( s, true ) );
// -> [ { value: 'For', tag: 'word' },
// { value: 'details', tag: 'word' },
// { value: 'on', tag: 'word' },
// { value: 'wink', tag: 'word' },
// { value: ',', tag: 'punctuation' },
// { value: 'check', tag: 'word' },
// { value: 'out', tag: 'word' },
// { value: 'http://winkjs.org/', tag: 'url' },
// { value: 'URL', tag: 'word' },
// { value: '!', tag: 'punctuation' } ]
// Remove stop words:

@@ -52,0 +66,0 @@ var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] );

@@ -25,5 +25,3 @@ // wink-nlp-utils

//
var splitElisions = require( './string-split-elisions.js' );
var amplifyNotElision = require( './string-amplify-not-elision.js' );
var rgx = require( './util_regexes.js' );
var winkTokenize = require( 'wink-tokenizer' )().tokenize;

@@ -35,51 +33,43 @@ // ## string

*
* The function uses the following set of rules to tokenize:
* Tokenizes the input `sentence` according to the value of `detailed` flag.
* Any occurrence of `...` in the `sentence` is
* converted to ellipses. In `detailed = true` mode, it
* tags every token with its type; the supported tags are currency, email,
* emoji, emoticon, hashtag, number, ordinal, punctuation, quoted_phrase, symbol,
* time, mention, url, and word.
*
* 1. Single quotes are processed first as they may be part of elisions; and
* `...` are converted to ellipses.
* 2. `Not` elisions are amplified and then split on elisions. Thus words with elisions get tokenized.
* 3. The word `cannot` is split in to `can not`.
* 4. `. , -` punctuations that are commonly embedded in numbers are left intact,
* 5. All other punctuations are tokenized.
* 6. The currency symbols are padded by space i.e. become separate tokens.
* 7. Underscore (`_`) embedded in the word is preserved.
* 8. Special characters are left untouched and may/may not become separate tokens.
* 9. Finally after removing extra/leading/trailing spaces, split on space to tokenize.
*
* @name string.tokenize
* @param {string} str — the input string.
* @return {string[]} of tokens.
* @param {string} sentence — the input string.
* @param {boolean} [detailed=false] — if true, each token is an object containing
* `value` and `tag` of each token; otherwise each token is a string. Its default
* value of **false** ensures compatibility with the previous version.
* @return {(string[]|object[])} an array of strings if `detailed` is false otherwise
* an array of objects.
* @example
* tokenize( "someone's wallet, isn't it? I'll return!" );
* // -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',
* // '?', 'i', '\'ll', 'return', '!' ]
* // -> [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?',
* // 'I', '\'ll', 'return', '!' ]
*
* tokenize( 'For details on wink, check out http://winkjs.org/ URL!', true );
* // -> [ { value: 'For', tag: 'word' },
* // { value: 'details', tag: 'word' },
* // { value: 'on', tag: 'word' },
* // { value: 'wink', tag: 'word' },
* // { value: ',', tag: 'punctuation' },
* // { value: 'check', tag: 'word' },
* // { value: 'out', tag: 'word' },
* // { value: 'http://winkjs.org/', tag: 'url' },
* // { value: 'URL', tag: 'word' },
* // { value: '!', tag: 'punctuation' } ]
*/
var tokenize = function ( str ) {
// Handle single quotes first & ellipses.
var su = str
// > TODO: promote to regex utils after adding more test cases
.replace( /(^|[^a-z0-9])(\’|\')/gi, '$1 $2 ')
.replace( /([a-z0-9])(\’|\')(\W)/gi, '$1 $2 $3')
.replace( '...', '…' )
.replace( '…', ' … ' );
var tokens = splitElisions( amplifyNotElision( su ) )
// Handle cannot.
.replace( rgx.cannot, '$1 $2' )
// Separate out punctuations that are not part of a number.
.replace( rgx.nonNumPunctuations, ' $& ' )
// Separate out all other punctuations.
.replace( /[\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig, ' $& ' )
// Separate out currency symbol; all separated stuff becomes a token.
.replace( rgx.currency, ' $& ')
.replace( rgx.spaces, ' ' )
.trim()
// Handle period sign in the end specially.
.replace( /\.$/, ' .' )
// Now tokenize on space!
.split( ' ' );
// Splitting an empty string on space leaves an empty string in the array,
// get rid of it.
return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens );
var tokenize = function ( sentence, detailed ) {
  // Convert EVERY run of three dots into a single ellipsis character before
  // tokenizing. A string pattern passed to `replace` only substitutes the
  // first match, so a global regex is required here.
  var tokens = winkTokenize( sentence.replace( /\.\.\./g, '…' ) );
  var i;
  // In the default (non-detailed) mode, collapse each token object down to
  // its `value` so that callers receive a plain array of strings.
  if ( !detailed ) {
    for ( i = 0; i < tokens.length; i += 1 ) {
      tokens[ i ] = tokens[ i ].value;
    }
  }
  return tokens;
}; // tokenize()
module.exports = tokenize;
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc