wink-tokenizer
Advanced tools
Comparing version 3.1.0 to 3.2.0
{ | ||
"name": "wink-tokenizer", | ||
"version": "3.1.0", | ||
"version": "3.2.0", | ||
"description": "Multilingual tokenizer that automatically tags each token with its type", | ||
@@ -5,0 +5,0 @@ "keywords": [ |
@@ -25,2 +25,4 @@ // wink-tokenizer | ||
// Matches runs of one or more whitespace characters.
var rgxSpaces = /\s+/g;
// Ordinals for Latin-script numerals only, e.g. 1st, 2nd, 12th or 33rd.
// The leading `\d+1\dth` alternative handles teen ordinals that have three or
// more digits (111th, 112th, 1013th). Without it no alternative matches such a
// string from its first digit, so the match would start one digit late and
// split "112th" into "1" followed by "12th".
var rgxOrdinalL1 = /\d+1\dth|1\dth|[04-9]th|1st|2nd|3rd|[02-9]1st|[02-9]2nd|[02-9]3rd|[02-9][04-9]th|\d+\d[04-9]th|\d+\d1st|\d+\d2nd|\d+\d3rd/g;
// Apart from detecting pure integers or decimals, also detect numbers containing | ||
@@ -71,2 +73,3 @@ // `. - / ,` so that dates, ip address, fractions and things like codes or part | ||
{ regex: rgxTime, category: 'time' }, | ||
{ regex: rgxOrdinalL1, category: 'ordinal' }, | ||
{ regex: rgxNumberL1, category: 'number' }, | ||
@@ -88,2 +91,3 @@ { regex: rgxNumberDV, category: 'number' }, | ||
number: 'n', | ||
ordinal: 'o', | ||
quoted_phrase: 'q', // eslint-disable-line camelcase | ||
@@ -222,3 +226,3 @@ currency: 'r', | ||
* @param {object} config — It defines 0 or more properties from the list of | ||
* **13** properties. A true value for a property ensures tokenization | ||
* **14** properties. A true value for a property ensures tokenization | ||
* for that type of text; whereas a false value means that the tokenization of that | ||
@@ -241,2 +245,3 @@ * type of text will not be attempted. | ||
* or **1/4** and numerals containing "**`, - / .`**", for example 12-12-1924 (**`n`**) | ||
* @param {boolean} [config.ordinal=true] ordinals like **1st**, **2nd**, **3rd**, **4th** or **12th** or **91st** (**`o`**) | ||
* @param {boolean} [config.punctuation=true] common punctuation such as **`?`** or **`,`** | ||
@@ -254,3 +259,3 @@ * ( token becomes fingerprint ) | ||
* myTokenizer.defineConfig( { mention: false } ); | ||
* // -> 12 | ||
* // -> 13 | ||
* // Only tokenize words as defined above. | ||
@@ -257,0 +262,0 @@ * var myTokenizer.defineConfig( {} ); |
53756
328