wink-nlp-utils - npm Package Compare versions

Comparing version 1.8.0 to 1.9.0

package.json

		{
		"name": "wink-nlp-utils",
		"version": "1.8.0",
		"version": "1.9.0",
		"description": "NLP Functions for amplifying negations, managing elisions, creating ngrams, stems, phonetic codes to tokens and more.",
		@@ -39,15 +39,16 @@ "keywords": [
		"chai": "^4.1.2",
		"coveralls": "^3.0.0",
		"coveralls": "^3.0.1",
		"docker": "^1.0.0",
		"documentation": "^6.1.0",
		"documentation": "^6.3.3",
		"eslint": "^4.19.1",
		"istanbul": "^0.4.5",
		"jshint": "^2.9.5",
		"mocha": "^5.0.5",
		"mocha": "^5.1.1",
		"mocha-lcov-reporter": "^1.3.0"
		},
		"dependencies": {
		"wink-helpers": "^1.3.0",
		"wink-porter2-stemmer": "^1.0.8"
		"wink-helpers": "^1.4.0",
		"wink-porter2-stemmer": "^1.0.8",
		"wink-tokenizer": "^4.0.0"
		}
		}

README.md

		@@ -50,2 +50,16 @@

		// Tokenize a sentence.
		var s = 'For details on wink, check out http://winkjs.org/ URL!';
		console.log( nlp.string.tokenize( s, true ) );
		// -> [ { value: 'For', tag: 'word' },
		// { value: 'details', tag: 'word' },
		// { value: 'on', tag: 'word' },
		// { value: 'wink', tag: 'word' },
		// { value: ',', tag: 'punctuation' },
		// { value: 'check', tag: 'word' },
		// { value: 'out', tag: 'word' },
		// { value: 'http://winkjs.org/', tag: 'url' },
		// { value: 'URL', tag: 'word' },
		// { value: '!', tag: 'punctuation' } ]

		// Remove stop words:
		@@ -52,0 +66,0 @@ var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] );

src/string-tokenize.js

		@@ -25,5 +25,3 @@ // wink-nlp-utils
		//
		var splitElisions = require( './string-split-elisions.js' );
		var amplifyNotElision = require( './string-amplify-not-elision.js' );
		var rgx = require( './util_regexes.js' );
		var winkTokenize = require( 'wink-tokenizer' )().tokenize;

		@@ -35,51 +33,43 @@ // ## string
		*
		* The function uses the following set of rules to tokenize:
		* Tokenizes the input `sentence` according to the value of `detailed` flag.
		* Any occurrence of `...` in the `sentence` is
		* converted to ellipses. In `detailed = true` mode, it
		* tags every token with its type; the supported tags are currency, email,
		* emoji, emoticon, hashtag, number, ordinal, punctuation, quoted_phrase, symbol,
		* time, mention, url, and word.
		*
		* 1. Single quotes are processed first as they may be part of elisions; and
		* `...` are converted to ellipses.
		* 2. `Not` elisions are amplified and then split on elisions. Thus words with elisions get tokenized.
		* 3. The word `cannot` is split into `can not`.
		* 4. `. , -` punctuations that are commonly embedded in numbers are left intact.
		* 5. All other punctuations are tokenized.
		* 6. The currency symbols are padded by space i.e. become separate tokens.
		* 7. Underscore (`_`) embedded in the word is preserved.
		* 8. Special characters are left untouched and may/may not become separate tokens.
		* 9. Finally after removing extra/leading/trailing spaces, split on space to tokenize.
		*
		* @name string.tokenize
		* @param {string} str — the input string.
		* @return {string[]} of tokens.
		* @param {string} sentence — the input string.
		* @param {boolean} [detailed=false] — if true, each token is an object containing
		* its `value` and `tag`; otherwise each token is a string. Its default
		* value of false ensures compatibility with the previous version.
		* @return {(string[]|object[])} an array of strings if `detailed` is false otherwise
		* an array of objects.
		* @example
		* tokenize( "someone's wallet, isn't it? I'll return!" );
		* // -> [ 'someone\'s', 'wallet', ',', 'is', 'not', 'it',
		* // '?', 'i', '\'ll', 'return', '!' ]
		* // -> [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?',
		* // 'I', '\'ll', 'return', '!' ]
		*
		* tokenize( 'For details on wink, check out http://winkjs.org/ URL!', true );
		* // -> [ { value: 'For', tag: 'word' },
		* // { value: 'details', tag: 'word' },
		* // { value: 'on', tag: 'word' },
		* // { value: 'wink', tag: 'word' },
		* // { value: ',', tag: 'punctuation' },
		* // { value: 'check', tag: 'word' },
		* // { value: 'out', tag: 'word' },
		* // { value: 'http://winkjs.org/', tag: 'url' },
		* // { value: 'URL', tag: 'word' },
		* // { value: '!', tag: 'punctuation' } ]
		*/
		var tokenize = function ( str ) {
		// Handle single quotes first & ellipses.
		var su = str
		// > TODO: promote to regex utils after adding more test cases
		.replace( /(^\|[^a-z0-9])(\’\|\')/gi, '$1 $2 ')
		.replace( /([a-z0-9])(\’\|\')(\W)/gi, '$1 $2 $3')
		.replace( '...', '…' )
		.replace( '…', ' … ' );
		var tokens = splitElisions( amplifyNotElision( su ) )
		// Handle cannot.
		.replace( rgx.cannot, '$1 $2' )
		// Separate out punctuations that are not part of a number.
		.replace( rgx.nonNumPunctuations, ' $& ' )
		// Separate out all other punctuations.
		.replace( /[\‘\’\`\“\”\"\[\]\{\}\…\!\;\?\/\:]/ig, ' $& ' )
		// Separate out currency symbol; all separated stuff becomes a token.
		.replace( rgx.currency, ' $& ')
		.replace( rgx.spaces, ' ' )
		.trim()
		// Handle period sign in the end specially.
		.replace( /\.$/, ' .' )
		// Now tokenize on space!
		.split( ' ' );
		// Splitting an empty string on space leaves an empty string in the array,
		// get rid of it.
		return ( ( tokens.length === 1 && tokens[ 0 ] === '' ) ? [] : tokens );
		var tokenize = function ( sentence, detailed ) {
		  // Convert every `...` in the sentence to a single ellipsis character
		  // before handing it to the tokenizer. A string pattern passed to
		  // `replace()` only swaps the first match, so a global regex is used to
		  // honor the documented "any occurrence" behavior.
		  var tokens = winkTokenize( sentence.replace( /\.\.\./g, '…' ) );
		  var i;
		  // In the default (non-detailed) mode, replace each token with its
		  // `value` so that callers receive a plain array of strings.
		  if ( !detailed ) {
		    for ( i = 0; i < tokens.length; i += 1 ) tokens[ i ] = tokens[ i ].value;
		  }

		  return tokens;
		}; // tokenize()

		module.exports = tokenize;

wink-nlp-utils - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics

Dependency changes