wink-tokenizer
Advanced tools
Comparing version 4.0.0 to 4.1.0
{ | ||
"name": "wink-tokenizer", | ||
"version": "4.0.0", | ||
"version": "4.1.0", | ||
"description": "Multilingual tokenizer that automatically tags each token with its type", | ||
@@ -47,4 +47,4 @@ "keywords": [ | ||
"docker": "^1.0.0", | ||
"documentation": "^6.1.0", | ||
"eslint": "^4.19.1", | ||
"documentation": "^8.1.2", | ||
"eslint": "^5.4.0", | ||
"istanbul": "^0.4.5", | ||
@@ -51,0 +51,0 @@ "jshint": "^2.9.5", |
@@ -5,3 +5,3 @@ # wink-tokenizer | ||
### [![Build Status](https://api.travis-ci.org/winkjs/wink-tokenizer.svg?branch=master)](https://travis-ci.org/winkjs/wink-tokenizer) [![Coverage Status](https://coveralls.io/repos/github/winkjs/wink-tokenizer/badge.svg?branch=master)](https://coveralls.io/github/winkjs/wink-tokenizer?branch=master) [![Inline docs](http://inch-ci.org/github/winkjs/wink-tokenizer.svg?branch=master)](http://inch-ci.org/github/winkjs/wink-tokenizer) [![devDependencies Status](https://david-dm.org/winkjs/wink-tokenizer/dev-status.svg)](https://david-dm.org/winkjs/wink-tokenizer?type=dev) | ||
### [![Build Status](https://api.travis-ci.org/winkjs/wink-tokenizer.svg?branch=master)](https://travis-ci.org/winkjs/wink-tokenizer) [![Coverage Status](https://coveralls.io/repos/github/winkjs/wink-tokenizer/badge.svg?branch=master)](https://coveralls.io/github/winkjs/wink-tokenizer?branch=master) [![Inline docs](http://inch-ci.org/github/winkjs/wink-tokenizer.svg?branch=master)](http://inch-ci.org/github/winkjs/wink-tokenizer) [![devDependencies Status](https://david-dm.org/winkjs/wink-tokenizer/dev-status.svg)](https://david-dm.org/winkjs/wink-tokenizer?type=dev) [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/winkjs/Lobby) | ||
@@ -18,4 +18,5 @@ [<img align="right" src="https://decisively.github.io/wink-logos/logo-title.png" width="100px" >](http://winkjs.org/) | ||
1. Automatic detection & tagging of token's feature; | ||
1. Automatic detection & tagging of different types of tokens based on their features: | ||
- These include word, punctuation, email, mention, hashtag, emoticon, and emoji etc. | ||
- User definable token types. | ||
@@ -22,0 +23,0 @@ |
@@ -85,3 +85,5 @@ // wink-tokenizer | ||
]; | ||
// Used to generate finger print from the tokens. | ||
// NOTE: this variable is being reset in `defineConfig()`. | ||
var fingerPrintCodes = { | ||
@@ -266,3 +268,4 @@ emoticon: 'c', | ||
* for that type of text; whereas false value will mean that the tokenization of that | ||
* type of text will not be attempted. | ||
* type of text will not be attempted. It also **resets** the effect of any previous | ||
* call(s) to the [`addRegex()`](#addregex) API. | ||
* | ||
@@ -316,2 +319,19 @@ * *An empty config object is equivalent to splitting on spaces. Whatever tokens | ||
} ); | ||
// Reset the `fingerPrintCodes` variable. | ||
fingerPrintCodes = { | ||
emoticon: 'c', | ||
email: 'e', | ||
emoji: 'j', | ||
hashtag: 'h', | ||
mention: 'm', | ||
number: 'n', | ||
ordinal: 'o', | ||
quoted_phrase: 'q', // eslint-disable-line camelcase | ||
currency: 'r', | ||
// symbol: 's', | ||
time: 't', | ||
url: 'u', | ||
word: 'w', | ||
alien: 'z' | ||
}; | ||
return ( ( Object.keys( uniqueCats ) ).length ); | ||
@@ -378,5 +398,69 @@ }; // defineConfig() | ||
// ### addTag | ||
/**
 * Registers a new tag together with its finger print code by storing the
 * code in `fingerPrintCodes` under the tag's name.
 *
 * NOTE: `defineConfig()` resets `fingerPrintCodes`, so a tag added here does
 * not survive a subsequent call to it.
 *
 * @param {string} name — name of the tag to add.
 * @param {string} fingerprintCode — finger print code stored for this tag.
 * @return {void} nothing!
 * @throws {Error} if `fingerPrintCodes` already holds a truthy entry for `name`.
 */
var addTag = function (name, fingerprintCode) { | ||
// Refuse to overwrite the finger print code of a tag that is already present.
if (fingerPrintCodes[name]) { | ||
throw new Error( 'Tag ' + name + ' already exists' ); | ||
} | ||
fingerPrintCodes[name] = fingerprintCode; | ||
}; // addTag() | ||
// ### addRegex | ||
/** | ||
* Adds a regex for parsing a new type of token. This regex can either be mapped | ||
* to an existing tag or it allows creation of a new tag along with its finger print. | ||
* The uniqueness of the [finger prints](#defineconfig) has to be ensured by the user. | ||
* | ||
* *The added regex(s) will supersede the internal parsing.* | ||
* | ||
* @param {RegExp} regex — the new regular expression. | ||
* @param {string} tag — tokens matching the `regex` will be assigned this tag. | ||
* @param {string} [fingerprintCode=undefined] — required if adding a new | ||
* tag; ignored if using an existing tag. | ||
* @return {void} nothing! | ||
* @example | ||
* // Adding a regex for an existing tag | ||
* myTokenizer.addRegex( /\(oo\)/gi, 'emoticon' ); | ||
* myTokenizer.tokenize( '(oo) Hi!' ) | ||
* // -> [ { value: '(oo)', tag: 'emoticon' }, | ||
* // { value: 'Hi', tag: 'word' }, | ||
* // { value: '!', tag: 'punctuation' } ] | ||
* | ||
* // Adding a regex to parse a new token type | ||
* myTokenizer.addRegex( /hello/gi, 'greeting', 'g' ); | ||
* myTokenizer.tokenize( 'hello, how are you?' ); | ||
* // -> [ { value: 'hello', tag: 'greeting' }, | ||
* // { value: ',', tag: 'punctuation' }, | ||
* // { value: 'how', tag: 'word' }, | ||
* // { value: 'are', tag: 'word' }, | ||
* // { value: 'you', tag: 'word' }, | ||
* // { value: '?', tag: 'punctuation' } ] | ||
* // Notice how "hello" is now tagged as "greeting" and not as "word". | ||
* | ||
* // Using defineConfig will reset the above! | ||
* myTokenizer.defineConfig( { word: true } ); | ||
* myTokenizer.tokenize( 'hello, how are you?' ); | ||
* // -> [ { value: 'hello', tag: 'word' }, | ||
* // { value: ',', tag: 'punctuation' }, | ||
* // { value: 'how', tag: 'word' }, | ||
* // { value: 'are', tag: 'word' }, | ||
* // { value: 'you', tag: 'word' }, | ||
* // { value: '?', tag: 'punctuation' } ] | ||
*/ | ||
var addRegex = function (regex, tag, fingerprintCode) { | ||
// A tag with no entry in `fingerPrintCodes` can only be used when a finger
// print code is supplied for it; otherwise there is nothing to register.
if (!fingerPrintCodes[tag] && !fingerprintCode) { | ||
throw new Error( 'Tag ' + tag + ' doesn\'t exist; Provide a \'fingerprintCode\' to add it as a tag.' ); | ||
} else if (!fingerPrintCodes[tag]) { | ||
// Unknown tag with a code supplied: register it before adding the regex.
addTag(tag, fingerprintCode); | ||
} | ||
// Insert at the front of `rgxs` so the new regex precedes the existing entries.
rgxs.unshift( { regex: regex, category: tag } ); | ||
}; // addRegex() | ||
methods.defineConfig = defineConfig; | ||
methods.tokenize = tokenize; | ||
methods.getTokensFP = getTokensFP; | ||
methods.addTag = addTag; | ||
methods.addRegex = addRegex; | ||
return methods; | ||
@@ -383,0 +467,0 @@ }; |
90437
9
714
103