Comparing version 6.10.0 to 6.10.2
@@ -25,3 +25,3 @@ /* | ||
const DEBUG = true | ||
const DEBUG = false | ||
@@ -28,0 +28,0 @@ // Afinn |
@@ -26,14 +26,10 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/[^a-zA-Zá-úÁ-ÚñÑüÜ]+/)) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/[^a-zA-Zá-úÁ-ÚñÑüÜ]+/)) | ||
} |
@@ -27,25 +27,21 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
class AggressiveTokenizer extends Tokenizer { | ||
clearEmptyString (array) { | ||
return array.filter(function (a) { | ||
return a !== '' | ||
}) | ||
} | ||
module.exports = AggressiveTokenizer | ||
clearText (text) { | ||
return text.replace(/.:\+-=\(\)"'!\?،,؛;/g, ' ') | ||
} | ||
AggressiveTokenizer.prototype.clearEmptyString = function (array) { | ||
return array.filter(function (a) { | ||
return a !== '' | ||
}) | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
text = this.clearText(text) | ||
return this.clearEmptyString(text.split(/\s+/)) | ||
} | ||
} | ||
AggressiveTokenizer.prototype.clearText = function (text) { | ||
return text.replace(/.:\+-=\(\)"'!\?،,؛;/g, ' ') | ||
} | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
text = this.clearText(text) | ||
return this.clearEmptyString(text.split(/\s+/)) | ||
} | ||
module.exports = AggressiveTokenizer |
@@ -26,14 +26,10 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/[^a-z0-9äâàéèëêïîöôùüûœç-]+/i)) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/[^a-z0-9äâàéèëêïîöôùüûœç-]+/i)) | ||
} |
@@ -31,10 +31,3 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
/* | ||
@@ -48,7 +41,9 @@ To know more on hindi | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
const response = this.trim(text.replace(/[\u0964\u0965...?,]/g, '').split(/\s+|(?![\u0900-\u097F\u0020-\u007F])./u)).filter(Boolean) | ||
return response | ||
class AggressiveTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
const response = this.trim(text.replace(/[\u0964\u0965...?,]/g, '').split(/\s+|(?![\u0900-\u097F\u0020-\u007F])./u)).filter(Boolean) | ||
return response | ||
} | ||
} | ||
module.exports = AggressiveTokenizer |
@@ -26,22 +26,18 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
// Remove all non alphanumeric characters except '-' | ||
// Replace more than one space character to ' ' | ||
normalizeText (text) { | ||
const result = text.replace(/[^a-z0-9 -]/g, ' ').replace(/( +)/g, ' ') | ||
return result | ||
} | ||
tokenize (text) { | ||
// break a string up into an array of tokens by space | ||
text = this.normalizeText(text) | ||
return this.trim(text.split(' ')) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
// Remove all non alphanumeric characters except '-' | ||
// Replace more than one space character to ' ' | ||
function normalizeText (text) { | ||
const result = text.replace(/[^a-z0-9 -]/g, ' ').replace(/( +)/g, ' ') | ||
return result | ||
} | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by space | ||
text = normalizeText(text) | ||
return this.trim(text.split(' ')) | ||
} |
@@ -26,14 +26,10 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/\W+/)) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/\W+/)) | ||
} |
@@ -26,14 +26,10 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/[^a-zA-Z0-9_'-]+/)) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/[^a-zA-Z0-9_'-]+/)) | ||
} |
@@ -27,16 +27,12 @@ /* | ||
const normalizer = require('../normalizers/normalizer_no') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
text = normalizer.removeDiacritics(text) | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/[^A-Za-z0-9_æøåÆØÅäÄöÖüÜ]+/)) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
text = normalizer.removeDiacritics(text) | ||
// break a string up into an array of tokens by anything non-word | ||
return this.trim(text.split(/[^A-Za-z0-9_æøåÆØÅäÄöÖüÜ]+/)) | ||
} |
@@ -26,23 +26,18 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
} | ||
class AggressiveTokenizer extends Tokenizer { | ||
withoutEmpty (array) { | ||
return array.filter(function (a) { return a }) | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
clearText (text) { | ||
return text.replace(/[^a-zążśźęćńół0-9]/gi, ' ').replace(/[\s\n]+/g, ' ').trim() | ||
} | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.withoutEmpty = function (array) { | ||
return array.filter(function (a) { return a }) | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.withoutEmpty(this.clearText(text).split(' ')) | ||
} | ||
} | ||
AggressiveTokenizer.prototype.clearText = function (text) { | ||
return text.replace(/[^a-zążśźęćńół0-9]/gi, ' ').replace(/[\s\n]+/g, ' ').trim() | ||
} | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.withoutEmpty(this.clearText(text).split(' ')) | ||
} | ||
module.exports = AggressiveTokenizer |
@@ -26,18 +26,14 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
withoutEmpty (array) { | ||
return array.filter(function (a) { return a }) | ||
} | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.withoutEmpty(this.trim(text.split(/[^a-zA-Zà-úÀ-Ú]/))) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.withoutEmpty = function (array) { | ||
return array.filter(function (a) { return a }) | ||
} | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.withoutEmpty(this.trim(text.split(/[^a-zA-Zà-úÀ-Ú]/))) | ||
} |
@@ -26,23 +26,18 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
} | ||
class AggressiveTokenizer extends Tokenizer { | ||
withoutEmpty (array) { | ||
return array.filter(function (a) { return a }) | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
clearText (text) { | ||
return text.replace(/[^a-zа-яё0-9]/gi, ' ').replace(/[\s\n]+/g, ' ').trim() | ||
} | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.withoutEmpty = function (array) { | ||
return array.filter(function (a) { return a }) | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.withoutEmpty(this.clearText(text).split(' ')) | ||
} | ||
} | ||
AggressiveTokenizer.prototype.clearText = function (text) { | ||
return text.replace(/[^a-zа-яё0-9]/gi, ' ').replace(/[\s\n]+/g, ' ').trim() | ||
} | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.withoutEmpty(this.clearText(text).split(' ')) | ||
} | ||
module.exports = AggressiveTokenizer |
@@ -27,17 +27,13 @@ /* | ||
const normalizer = require('../normalizers/normalizer_sv') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
text = normalizer.removeDiacritics(text) | ||
// break a string up into an array of tokens by anything non-word | ||
// Ü is not part of swedish alphabet but there are words using it like müsli and München | ||
return this.trim(text.split(/[^A-Za-z0-9_åÅäÄöÖüÜ-]+/)) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
text = normalizer.removeDiacritics(text) | ||
// break a string up into an array of tokens by anything non-word | ||
// Ü is not part of swedish alphabet but there are words using it like müsli and München | ||
return this.trim(text.split(/[^A-Za-z0-9_åÅäÄöÖüÜ-]+/)) | ||
} |
@@ -0,25 +1,43 @@ | ||
/* | ||
Copyright (c) 2023, Pluto Rotegott | ||
Tokenizer for Ukrainian | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
'use strict' | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
} | ||
class AggressiveTokenizer extends Tokenizer { | ||
withoutEmpty (array) { | ||
return array.filter(function (a) { return a }) | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
clearText (text) { | ||
return text.replace(/[^a-zа-яґєії0-9]/gi, ' ').replace(/[\s\n]+/g, ' ').trim() | ||
} | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.withoutEmpty = function (array) { | ||
return array.filter(function (a) { return a }) | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.withoutEmpty(this.clearText(text).split(' ')) | ||
} | ||
} | ||
AggressiveTokenizer.prototype.clearText = function (text) { | ||
return text.replace(/[^a-zа-яґєії0-9]/gi, ' ').replace(/[\s\n]+/g, ' ').trim() | ||
} | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
return this.withoutEmpty(this.clearText(text).split(' ')) | ||
} | ||
module.exports = AggressiveTokenizer |
@@ -26,15 +26,10 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
// break a string up into an array of tokens by anything non-word | ||
tokenize (text) { | ||
return this.trim(text.split(/[^a-z0-9àáảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđ]+/i)) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
// break a string up into an array of tokens by anything non-word | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
return this.trim(text.split(/[^a-z0-9àáảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵđ]+/i)) | ||
} | ||
module.exports = AggressiveTokenizer |
@@ -26,20 +26,16 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const AggressiveTokenizer = function () { | ||
Tokenizer.call(this) | ||
class AggressiveTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
// underscore is considered to be non word character | ||
// Old splitter: | ||
// return this.trim(text.split(/[\W|_]+/)) | ||
// New splitter: | ||
// Explicitly mentions which characters make up words. | ||
// So words may contain hyphen -, single quote ' and slash / | ||
return this.trim(text.split(/[^a-zA-Z0-9'\-/]+/)) | ||
} | ||
} | ||
util.inherits(AggressiveTokenizer, Tokenizer) | ||
module.exports = AggressiveTokenizer | ||
AggressiveTokenizer.prototype.tokenize = function (text) { | ||
// break a string up into an array of tokens by anything non-word | ||
// underscore is considered to be non word character | ||
// Old splitter: | ||
// return this.trim(text.split(/[\W|_]+/)) | ||
// New splitter: | ||
// Explicitly mentions which characters make up words. | ||
// So words may contain hyphen -, single quote ' and slash / | ||
return this.trim(text.split(/[^a-zA-Z0-9'\-/]+/)) | ||
} |
@@ -25,30 +25,31 @@ /* | ||
const DEBUG = false | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const _ = require('underscore') | ||
// Base Class for RegExp Matching | ||
const RegexpTokenizer = function (opts) { | ||
const options = opts || {} | ||
this._pattern = options.pattern || this._pattern | ||
this.discardEmpty = options.discardEmpty || true | ||
class RegexpTokenizer extends Tokenizer { | ||
constructor (opts) { | ||
super(opts) | ||
const options = opts || {} | ||
this._pattern = options.pattern || this._pattern | ||
this.discardEmpty = options.discardEmpty || true | ||
// Match and split on GAPS not the actual WORDS | ||
this._gaps = options.gaps | ||
// Match and split on GAPS not the actual WORDS | ||
this._gaps = options.gaps | ||
if (this._gaps === undefined) { | ||
this._gaps = true | ||
if (this._gaps === undefined) { | ||
this._gaps = true | ||
} | ||
} | ||
} | ||
util.inherits(RegexpTokenizer, Tokenizer) | ||
tokenize (s) { | ||
let results | ||
RegexpTokenizer.prototype.tokenize = function (s) { | ||
let results | ||
if (this._gaps) { | ||
results = s.split(this._pattern) | ||
return (this.discardEmpty) ? _.without(results, '', ' ') : results | ||
} else { | ||
return s.match(this._pattern) | ||
if (this._gaps) { | ||
results = s.split(this._pattern) | ||
return (this.discardEmpty) ? _.without(results, '', ' ') : results | ||
} else { | ||
return s.match(this._pattern) | ||
} | ||
} | ||
@@ -61,18 +62,21 @@ } | ||
/*** | ||
* A tokenizer that accepts an alphabet definition. | ||
* @param {string} options.language ISO 639-1 for the language, e.g. 'en' | ||
*/ | ||
const OrthographyTokenizer = function (options) { | ||
const pattern = orthographyMatchers[options.language] | ||
if (!pattern) { | ||
WordTokenizer.call(this, options) | ||
} else { | ||
this._pattern = pattern | ||
RegexpTokenizer.call(this, options) | ||
class OrthographyTokenizer { | ||
constructor (options) { | ||
const pattern = orthographyMatchers[options.language] | ||
DEBUG && console.log(pattern) | ||
if (!pattern) { | ||
this.tokenizer = new WordTokenizer() | ||
} else { | ||
this.tokenizer = new RegexpTokenizer(options) | ||
this.tokenizer._pattern = pattern | ||
DEBUG && console.log(this.tokenizer) | ||
} | ||
} | ||
tokenize (text) { | ||
return this.tokenizer.tokenize(text) | ||
} | ||
} | ||
util.inherits(OrthographyTokenizer, RegexpTokenizer) | ||
exports.OrthographyTokenizer = OrthographyTokenizer | ||
@@ -88,8 +92,9 @@ | ||
*/ | ||
const WordTokenizer = function (options) { | ||
this._pattern = /[^A-Za-zА-Яа-я0-9_]+/ | ||
RegexpTokenizer.call(this, options) | ||
class WordTokenizer extends RegexpTokenizer { | ||
constructor (options) { | ||
super(options) | ||
this._pattern = /[^A-Za-zА-Яа-я0-9_]+/ | ||
} | ||
} | ||
util.inherits(WordTokenizer, RegexpTokenizer) | ||
exports.WordTokenizer = WordTokenizer | ||
@@ -105,8 +110,12 @@ | ||
*/ | ||
const WordPunctTokenizer = function (options) { | ||
this._pattern = /([A-Za-zÀ-ÿ-]+|[0-9._]+|.|!|\?|'|"|:|;|,|-)/i | ||
RegexpTokenizer.call(this, options) | ||
class WordPunctTokenizer extends RegexpTokenizer { | ||
constructor (options) { | ||
if (!options) { | ||
options = {} | ||
} | ||
options.pattern = /([A-Za-zÀ-ÿ-]+|[0-9._]+|.|!|\?|'|"|:|;|,|-)/i | ||
super(options) | ||
} | ||
} | ||
util.inherits(WordPunctTokenizer, RegexpTokenizer) | ||
exports.WordPunctTokenizer = WordPunctTokenizer |
@@ -25,14 +25,9 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const SentenceTokenizer = function () { | ||
Tokenizer.call(this) | ||
class SentenceTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
return (parser.parse(text)) | ||
} | ||
} | ||
util.inherits(SentenceTokenizer, Tokenizer) | ||
SentenceTokenizer.prototype.tokenize = function (text) { | ||
return (parser.parse(text)) | ||
} | ||
module.exports = SentenceTokenizer |
@@ -26,30 +26,27 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const DEBUG = false | ||
const SentenceTokenizer = function () { | ||
Tokenizer.call(this) | ||
} | ||
util.inherits(SentenceTokenizer, Tokenizer) | ||
class SentenceTokenizer extends Tokenizer { | ||
tokenize (text) { | ||
// Break string up in to sentences based on punctation and quotation marks | ||
// See https://gist.github.com/Hugo-ter-Doest/4ed21fb7eb5077814d998fa61a726566 | ||
// for a breakdown of the regular expression | ||
let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|.+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g) | ||
SentenceTokenizer.prototype.tokenize = function (text) { | ||
// break string up in to sentences based on punctation and quotation marks | ||
// let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?.*?[.?!…](\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g) | ||
let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|.+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g) | ||
DEBUG && console.log('SentenceTokenizer.tokenize: ' + tokens) | ||
DEBUG && console.log('SentenceTokenizer.tokenize: ' + tokens) | ||
if (!tokens) { | ||
return [text] | ||
} | ||
if (!tokens) { | ||
return [text] | ||
} | ||
// remove unecessary white space | ||
tokens = tokens.map(Function.prototype.call, String.prototype.trim) | ||
// remove unecessary white space | ||
tokens = tokens.map(Function.prototype.call, String.prototype.trim) | ||
DEBUG && console.log('SentenceTokenizer.tokenize: tokens after removing whitespace ' + tokens) | ||
DEBUG && console.log('SentenceTokenizer.tokenize: tokens after removing whitespace ' + tokens) | ||
return this.trim(tokens) | ||
return this.trim(tokens) | ||
} | ||
} | ||
module.exports = SentenceTokenizer |
@@ -24,39 +24,24 @@ /* | ||
const Tokenizer = require('./tokenizer') | ||
const util = require('util') | ||
const CaseTokenizer = function () { | ||
Tokenizer.call(this) | ||
} | ||
util.inherits(CaseTokenizer, Tokenizer) | ||
// Changing the prototype of a native type is bad practice | ||
/* | ||
CaseTokenizer.prototype.attach = function () { | ||
const self = this | ||
String.prototype.tokenize = function (preserveApostrophe) { | ||
return self.tokenize(this, preserveApostrophe) | ||
} | ||
} | ||
*/ | ||
// Idea from Seagull: http://stackoverflow.com/a/26482650 | ||
CaseTokenizer.prototype.tokenize = function (text, preserveApostrophe) { | ||
const whitelist = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] | ||
const lower = text.toLowerCase() | ||
const upper = text.toUpperCase() | ||
let result = '' | ||
let i | ||
class CaseTokenizer extends Tokenizer { | ||
tokenize (text, preserveApostrophe) { | ||
const whitelist = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] | ||
const lower = text.toLowerCase() | ||
const upper = text.toUpperCase() | ||
let result = '' | ||
let i | ||
for (i = 0; i < lower.length; ++i) { | ||
if (lower[i] !== upper[i] || whitelist.indexOf(lower[i]) > -1 || (text[i] === '\'' && preserveApostrophe)) { | ||
result += text[i] | ||
} else { | ||
result += ' ' | ||
for (i = 0; i < lower.length; ++i) { | ||
if (lower[i] !== upper[i] || whitelist.indexOf(lower[i]) > -1 || (text[i] === '\'' && preserveApostrophe)) { | ||
result += text[i] | ||
} else { | ||
result += ' ' | ||
} | ||
} | ||
return this.trim(result.replace(/\s+/g, ' ').split(' ')) | ||
} | ||
return this.trim(result.replace(/\s+/g, ' ').split(' ')) | ||
} | ||
module.exports = CaseTokenizer |
@@ -23,33 +23,14 @@ /* | ||
/** | ||
* \@todo Use .bind() in Tokenizer.prototype.attach(). | ||
*/ | ||
'use strict' | ||
const Tokenizer = function () { | ||
} | ||
class Tokenizer { | ||
trim (array) { | ||
while (array[array.length - 1] === '') { array.pop() } | ||
Tokenizer.prototype.trim = function (array) { | ||
while (array[array.length - 1] === '') { array.pop() } | ||
while (array[0] === '') { array.shift() } | ||
while (array[0] === '') { array.shift() } | ||
return array | ||
} | ||
// Expose an attach function that will patch String with new methods. | ||
// Changing the prototype of a native type is bad practice | ||
/* | ||
Tokenizer.prototype.attach = function () { | ||
const self = this | ||
String.prototype.tokenize = function () { | ||
return self.tokenize(this) | ||
return array | ||
} | ||
} | ||
*/ | ||
Tokenizer.prototype.tokenize = function () {} | ||
module.exports = Tokenizer |
{ | ||
"name": "natural", | ||
"description": "General natural language (tokenizing, stemming (English, Russian, Spanish), part-of-speech tagging, sentiment analysis, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance, Dice's Coefficient) facilities for node.", | ||
"version": "6.10.0", | ||
"version": "6.10.2", | ||
"homepage": "https://github.com/NaturalNode/natural", | ||
@@ -36,3 +36,3 @@ "repository": { | ||
"sinon": "^1.12.2", | ||
"standard": "^16.0.3", | ||
"standard": "^16.0.4", | ||
"ts-standard": "^12.0.2", | ||
@@ -39,0 +39,0 @@ "typescript": "^4.9.3", |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
13793949
793149