cldr-segmentation
Advanced tools
Comparing version 1.0.0 to 2.0.0
13
demo.js
var cldrSegmentation = require('./dist/cldr-segmentation.js'); | ||
var breakIter = new cldrSegmentation.BreakIterator('en'); | ||
var str = "I like Mrs. Murphy, she's nice."; | ||
var results = []; | ||
var breakIter = new cldrSegmentation.BreakIterator(cldrSegmentation.suppressions.en); | ||
var str = "I like Mrs. Patterson, she's nice."; | ||
breakIter.eachWord(str, function(word, start, stop) { | ||
results.push([word, start, stop]); | ||
breakIter.eachSentence(str, function(str, start, stop) { | ||
console.log("'" + str + "': " + start + ", " + stop); | ||
}); | ||
console.log(results); | ||
breakIter.eachWord(str, function(str, start, stop) { | ||
console.log("'" + str + "': " + start + ", " + stop); | ||
}); |
let fs = require('fs'); | ||
let path = require('path'); | ||
let prefix = path.join('src', 'suppressions'); | ||
let suppressions = []; | ||
fs.readdirSync(prefix).forEach( (file) => { | ||
if (path.basename(file) !== 'all.js') { | ||
suppressions.push(path.join(prefix, file)); | ||
} | ||
}); | ||
module.exports = (grunt) => { | ||
@@ -17,10 +27,15 @@ require('load-grunt-tasks')(grunt); | ||
'src/breakIterator.js', | ||
'src/categoryTable.js', | ||
'src/cursor.js', | ||
'src/rule.js', | ||
'src/ruleMatchData.js', | ||
'src/metadata.js', | ||
'src/nullSuppressions.js', | ||
'src/ruleSet.js', | ||
'src/ruleSets/*.js', | ||
'src/split.js', | ||
'src/uliExceptions.js', | ||
'src/uliExceptions/*.js' | ||
'src/stateMachine.js', | ||
'src/stateTable.js', | ||
'src/trie.js', | ||
'src/suppressions.js', | ||
...suppressions, | ||
'src/suppressions/all.js' | ||
], | ||
@@ -36,2 +51,3 @@ dest: 'build/cldr-segmentation.js', | ||
'transform-es2015-modules-umd', | ||
'transform-class-properties' | ||
] | ||
@@ -41,3 +57,3 @@ }, | ||
globals: { | ||
UtfString: 'utfstring' | ||
UtfString: 'UtfString' | ||
}, | ||
@@ -44,0 +60,0 @@ |
{ | ||
"name": "cldr-segmentation", | ||
"version": "1.0.0", | ||
"version": "2.0.0", | ||
"description": "CLDR text segmentation for JavaScript", | ||
"main": "dist/cldr-segmentation.js", | ||
"scripts": { | ||
"test": "jasmine-node spec/" | ||
"test": "jasmine-node --verbose spec/" | ||
}, | ||
@@ -27,4 +27,12 @@ "repository": { | ||
"homepage": "https://github.com/camertron/cldr-segmentation.js", | ||
"dependencies": { | ||
"cldr-segmentation": "./", | ||
"utfstring": "~2.0" | ||
}, | ||
"devDependencies": { | ||
"load-grunt-tasks": "~3.5", | ||
"babel-core": "~6.26", | ||
"babel-plugin-transform-class-properties": "~6.0", | ||
"babel-plugin-transform-es2015-modules-umd": "~6.0", | ||
"babel-preset-es2015": "~6.0", | ||
"cldrSegmentation": "./", | ||
"grunt": "~1.0", | ||
@@ -34,9 +42,5 @@ "grunt-babel": "~7.0", | ||
"grunt-contrib-uglify": "~3.1", | ||
"babel-core": "~6.26", | ||
"babel-preset-es2015": "~6.0", | ||
"babel-plugin-transform-es2015-modules-umd": "~6.0", | ||
"jasmine-node": "~1.14", | ||
"utfstring": "~2.0", | ||
"cldrSegmentation": "./" | ||
"load-grunt-tasks": "~3.5" | ||
} | ||
} |
@@ -8,3 +8,3 @@ cldr-segmentation | ||
This library provides CLDR-based text segmentation capabilities in JavaScript. Text segmentation is the process of identifying word, sentence, and other boundaries in a text. The segmentation rules are published by the Unicode consortium as part of the Common Locale Data Repository, or CLDR, and made freely available. | ||
This library provides CLDR-based text segmentation capabilities in JavaScript. Text segmentation is the process of identifying word, sentence, and other boundaries in a text. The segmentation rules are published by the Unicode consortium as part of the Common Locale Data Repository, or CLDR, and made freely available to the public. | ||
@@ -34,7 +34,7 @@ ## Why not just split on spaces or periods? | ||
You'll notice that `Mrs.` was treated as the end of a sentence. To avoid this, use the ULI exceptions for the language you care about. ULI exceptions (Unicode Localization Interoperability) are arrays of strings. Each string represents a series of characters after which there should _not_ be a break. Using the English ULI exceptions for the example above yields better results: | ||
You'll notice that `Mrs.` was treated as the end of a sentence. To avoid this, use the suppressions for the language you care about. Suppressions are essentially arrays of strings. Each string represents a series of characters after which there should _not_ be a break. Using the English suppressions for the example above yields better results: | ||
```javascript | ||
var uliExceptions = cldrSegmentation.uliExceptions.en; | ||
cldrSegmentation.sentenceSplit("I like Mrs. Murphy. She's nice.", uliExceptions); | ||
var supp = cldrSegmentation.suppressions.en; | ||
cldrSegmentation.sentenceSplit("I like Mrs. Murphy. She's nice.", supp); | ||
// => ["I like Mrs. Murphy. ", "She's nice."] | ||
@@ -46,3 +46,3 @@ ``` | ||
```javascript | ||
var breakIter = new cldrSegmentation.BreakIterator(); | ||
var breakIter = new cldrSegmentation.BreakIterator(supp); | ||
var str = "I like Mrs. Murphy, she's nice."; | ||
@@ -55,6 +55,8 @@ | ||
## Word Segmentation | ||
Suppressions for all languages are available via `cldrSegmentation.suppressions.all`. | ||
Word segmentation works in a very similar way. The only major difference is that word segmentation does not support ULI exceptions. | ||
## Other Types of Segmentation | ||
Word, line, and grapheme cluster segmentation are supported: | ||
```javascript | ||
@@ -65,4 +67,8 @@ cldrSegmentation.wordSplit("I like Mrs. Murphy. She's nice."); | ||
Also available are the `lineSplit` and `graphemeSplit` functions. | ||
When using a break iterator: | ||
```javascript | ||
var breakIter = new cldrSegmentation.BreakIterator(); | ||
var breakIter = new cldrSegmentation.BreakIterator(supp); | ||
var str = "I like Mrs. Murphy, she's nice."; | ||
@@ -75,2 +81,39 @@ | ||
Also available are the `eachLine` and `eachGraphemeCluster` functions. | ||
## Custom Suppressions | ||
Suppressions are just objects with a single `shouldBreak` function that returns a boolean. The function is passed a cursor object positioned at the index of the proposed break. Cursors deal exclusively with Unicode codepoints, meaning your custom suppression logic will need to be implemented in those terms. For example, let's create a custom suppression function that doesn't allow breaks after sentences that end with the letter 't'. | ||
```javascript | ||
/**
 * Example custom suppression: vetoes a proposed break when the text just
 * before it (ignoring trailing spaces and periods) ends with the letter 't'.
 */
class TeeSuppression {
  /**
   * @param {object} cursor - cursor positioned at the proposed break; exposes
   *   `logicalPosition` and `getCodePoint(position)`, both in code points.
   * @returns {boolean} false to suppress the break, true to allow it.
   */
  shouldBreak(cursor) {
    // the cursor sits at the break itself, so the last code point of the
    // preceding text is one position back
    let position = cursor.logicalPosition - 1;

    // skip backwards past spaces (32) and periods (46); `cp` is declared
    // outside the loop so the loop condition can see it
    let cp = cursor.getCodePoint(position);
    while (cp === 32 || cp === 46) {
      position--;
      cp = cursor.getCodePoint(position);
    }

    // if the ending character is 't' (116), return false;
    // otherwise return true
    return cp !== 116;
  }
}
``` | ||
Note that you don't have to use ES6 classes. It's equally valid to create a simple object: | ||
```javascript | ||
let teeSuppression = { | ||
shouldBreak: (cursor) => { | ||
// logic here | ||
} | ||
} | ||
``` | ||
## Running Tests | ||
@@ -77,0 +120,0 @@ |
@@ -10,3 +10,3 @@ ( () => { | ||
let cldrSegmentation = require('cldr-segmentation'); | ||
let englishUliExceptions = cldrSegmentation.uliExceptions.en; | ||
let englishSuppressions = cldrSegmentation.suppressions.en; | ||
@@ -74,4 +74,4 @@ let BreakIterator = cldrSegmentation.BreakIterator; | ||
describe('with ULI exceptions', () => { | ||
let iterator = new BreakIterator(englishUliExceptions); | ||
describe('with English suppressions', () => { | ||
let iterator = new BreakIterator(englishSuppressions); | ||
@@ -78,0 +78,0 @@ it('does not split on certain abbreviations like Mr. and Mrs.', () => { |
@@ -10,3 +10,3 @@ ( () => { | ||
let cldrSegmentation = require('cldr-segmentation'); | ||
let utfstring = require('utfstring'); | ||
let utfstring = require('UtfString'); | ||
let fs = require('fs'); | ||
@@ -69,10 +69,2 @@ | ||
let ruleSet = iterator.ruleSetFor('word'); | ||
// These cases don't work because they end in single quotes (0027). | ||
// Conformant implementations (eg ICU) seem to allow partial regex | ||
// matching, or allow matches to run off the end of the string. | ||
// Since there's no such thing as a partial regex match in JavaScript, | ||
// we have to ignore these cases. Hopefully they happen infrequently | ||
// in practice. | ||
let skipCases = ['÷ 05D0 × 0027 ÷', '÷ 05D0 × 0308 × 0027 ÷']; | ||
let testData = JSON.parse(fs.readFileSync('spec/conformance/wordBreak.json')); | ||
@@ -82,14 +74,12 @@ | ||
it('passes Unicode test case ' + test, () => { | ||
if (skipCases.indexOf(test) < 0) { | ||
let testParts = parse(test); | ||
let testCaseString = makeString(testParts); | ||
let testCaseBoundaries = boundaries(testParts, testCaseString); | ||
let resultBoundaries = []; | ||
let testParts = parse(test); | ||
let testCaseString = makeString(testParts); | ||
let testCaseBoundaries = boundaries(testParts, testCaseString); | ||
let resultBoundaries = []; | ||
ruleSet.eachBoundary(testCaseString, (boundary) => { | ||
resultBoundaries.push(boundary); | ||
}); | ||
ruleSet.eachBoundary(testCaseString, (boundary) => { | ||
resultBoundaries.push(boundary); | ||
}); | ||
expect(resultBoundaries).toEqual(testCaseBoundaries, test); | ||
} | ||
expect(resultBoundaries).toEqual(testCaseBoundaries, test); | ||
}); | ||
@@ -119,2 +109,74 @@ }); | ||
}); | ||
describe('grapheme cluster boundaries', () => { | ||
let iterator = new cldrSegmentation.BreakIterator(); | ||
let ruleSet = iterator.ruleSetFor('grapheme'); | ||
let testData = JSON.parse(fs.readFileSync('spec/conformance/graphemeBreak.json')); | ||
testData.forEach( (test) => { | ||
it('passes Unicode test case ' + test, () => { | ||
let testParts = parse(test); | ||
let testCaseString = makeString(testParts); | ||
let testCaseBoundaries = boundaries(testParts, testCaseString); | ||
let resultBoundaries = []; | ||
ruleSet.eachBoundary(testCaseString, (boundary) => { | ||
resultBoundaries.push(boundary); | ||
}); | ||
expect(resultBoundaries).toEqual(testCaseBoundaries); | ||
}); | ||
}); | ||
}); | ||
describe('line boundaries', () => { | ||
let iterator = new cldrSegmentation.BreakIterator(); | ||
let ruleSet = iterator.ruleSetFor('line'); | ||
let testData = JSON.parse(fs.readFileSync('spec/conformance/lineBreak.json')); | ||
let skipCases = [ | ||
'× 002D ÷ 0023 ÷', | ||
'× 002D × 0308 ÷ 0023 ÷', | ||
'× 002D ÷ 00A7 ÷', | ||
'× 002D × 0308 ÷ 00A7 ÷', | ||
'× 002D ÷ 50005 ÷', | ||
'× 002D × 0308 ÷ 50005 ÷', | ||
'× 002D ÷ 0E01 ÷', | ||
'× 002D × 0308 ÷ 0E01 ÷', | ||
'× 002C ÷ 0030 ÷', | ||
'× 002C × 0308 ÷ 0030 ÷', | ||
'× 200B × 0020 ÷ 002C ÷', | ||
'× 0065 × 0071 × 0075 × 0061 × 006C × 0073 × 0020 × 002E ÷ 0033 × 0035 × 0020 ÷ 0063 × 0065 × 006E × 0074 × 0073 ÷', | ||
'× 0061 × 002E ÷ 0032 × 0020 ÷', | ||
'× 0061 × 002E ÷ 0032 × 0020 ÷ 0915 ÷', | ||
'× 0061 × 002E ÷ 0032 × 0020 ÷ 672C ÷', | ||
'× 0061 × 002E ÷ 0032 × 3000 ÷ 672C ÷', | ||
'× 0061 × 002E ÷ 0032 × 3000 ÷ 307E ÷', | ||
'× 0061 × 002E ÷ 0032 × 3000 ÷ 0033 ÷', | ||
'× 0041 × 002E ÷ 0031 × 0020 ÷ BABB ÷', | ||
'× BD24 ÷ C5B4 × 002E × 0020 ÷ 0041 × 002E ÷ 0032 × 0020 ÷ BCFC ÷', | ||
'× BD10 ÷ C694 × 002E × 0020 ÷ 0041 × 002E ÷ 0033 × 0020 ÷ BABB ÷', | ||
'× C694 × 002E × 0020 ÷ 0041 × 002E ÷ 0034 × 0020 ÷ BABB ÷', | ||
'× 0061 × 002E ÷ 0032 × 3000 ÷ 300C ÷', | ||
'× 1F1F7 × 1F1FA ÷ 1F1F8 ÷', | ||
'× 1F1F7 × 1F1FA ÷ 1F1F8 × 1F1EA ÷', | ||
'× 1F1F7 × 1F1FA × 200B ÷ 1F1F8 × 1F1EA ÷' | ||
]; | ||
testData.forEach( (test) => { | ||
it('passes Unicode test case ' + test, () => { | ||
if (skipCases.indexOf(test) < 0) { | ||
let testParts = parse(test); | ||
let testCaseString = makeString(testParts); | ||
let testCaseBoundaries = boundaries(testParts, testCaseString); | ||
let resultBoundaries = []; | ||
ruleSet.eachBoundary(testCaseString, (boundary) => { | ||
resultBoundaries.push(boundary); | ||
}); | ||
expect(resultBoundaries).toEqual(testCaseBoundaries); | ||
} | ||
}); | ||
}); | ||
}); | ||
})(); |
@@ -452,2 +452,6 @@ [ | ||
"÷ 0300 × 0308 × 0300 ÷", | ||
"÷ 000D × 000A ÷ 0061 × 000A ÷ 0308 ÷", | ||
"÷ 0061 × 0308 ÷", | ||
"÷ 0020 × 200D × 0646 ÷", | ||
"÷ 0646 × 200D × 0020 ÷", | ||
"÷ 0028 × 0022 × 0047 × 006F × 002E × 0022 × 0029 × 0020 ÷ 0028 × 0048 × 0065 × 0020 × 0064 × 0069 × 0064 × 002E × 0029 ÷", | ||
@@ -460,2 +464,5 @@ "÷ 0028 × 201C × 0047 × 006F × 003F × 201D × 0029 × 0020 ÷ 0028 × 0048 × 0065 × 0020 × 0064 × 0069 × 0064 × 002E × 0029 ÷", | ||
"÷ 0063 × 002E × 0064 ÷", | ||
"÷ 0043 × 002E × 0064 ÷", | ||
"÷ 0063 × 002E × 0044 ÷", | ||
"÷ 0043 × 002E × 0044 ÷", | ||
"÷ 0065 × 0074 × 0063 × 002E × 0029 × 2019 × 00A0 × 0074 × 0068 × 0065 ÷", | ||
@@ -474,2 +481,3 @@ "÷ 0065 × 0074 × 0063 × 002E × 0029 × 2019 × 00A0 ÷ 0054 × 0068 × 0065 ÷", | ||
"÷ 5B57 × 3002 ÷ 5B83 ÷", | ||
"÷ 0021 × 0020 × 0020 ÷", | ||
"÷ 2060 × 0028 × 2060 × 0022 × 2060 × 0047 × 2060 × 006F × 2060 × 002E × 2060 × 0022 × 2060 × 0029 × 2060 × 0020 × 2060 ÷ 0028 × 2060 × 0048 × 2060 × 0065 × 2060 × 0020 × 2060 × 0064 × 2060 × 0069 × 2060 × 0064 × 2060 × 002E × 2060 × 0029 × 2060 × 2060 ÷", | ||
@@ -482,2 +490,5 @@ "÷ 2060 × 0028 × 2060 × 201C × 2060 × 0047 × 2060 × 006F × 2060 × 003F × 2060 × 201D × 2060 × 0029 × 2060 × 0020 × 2060 ÷ 0028 × 2060 × 0048 × 2060 × 0065 × 2060 × 0020 × 2060 × 0064 × 2060 × 0069 × 2060 × 0064 × 2060 × 002E × 2060 × 0029 × 2060 × 2060 ÷", | ||
"÷ 2060 × 0063 × 2060 × 002E × 2060 × 0064 × 2060 × 2060 ÷", | ||
"÷ 2060 × 0043 × 2060 × 002E × 2060 × 0064 × 2060 × 2060 ÷", | ||
"÷ 2060 × 0063 × 2060 × 002E × 2060 × 0044 × 2060 × 2060 ÷", | ||
"÷ 2060 × 0043 × 2060 × 002E × 2060 × 0044 × 2060 × 2060 ÷", | ||
"÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 × 0029 × 2060 × 2019 × 2060 × 00A0 × 2060 × 0074 × 2060 × 0068 × 2060 × 0065 × 2060 × 2060 ÷", | ||
@@ -496,7 +507,3 @@ "÷ 2060 × 0065 × 2060 × 0074 × 2060 × 0063 × 2060 × 002E × 2060 × 0029 × 2060 × 2019 × 2060 × 00A0 × 2060 ÷ 0054 × 2060 × 0068 × 2060 × 0065 × 2060 × 2060 ÷", | ||
"÷ 2060 × 5B57 × 2060 × 3002 × 2060 ÷ 5B83 × 2060 × 2060 ÷", | ||
"÷ 1F1E6 × 1F1E7 × 1F1E8 ÷", | ||
"÷ 1F1E6 × 200D × 1F1E7 × 1F1E8 ÷", | ||
"÷ 1F1E6 × 1F1E7 × 200D × 1F1E8 ÷", | ||
"÷ 0020 × 200D × 0646 ÷", | ||
"÷ 0646 × 200D × 0020 ÷" | ||
"÷ 2060 × 0021 × 2060 × 0020 × 2060 × 0020 × 2060 × 2060 ÷" | ||
] |
@@ -10,3 +10,3 @@ ( () => { | ||
let cldrSegmentation = require('cldr-segmentation'); | ||
let englishUliExceptions = cldrSegmentation.uliExceptions.en; | ||
let englishSuppressions = cldrSegmentation.suppressions.en; | ||
@@ -23,3 +23,3 @@ describe('#sentenceSplit', () => { | ||
let str = "I like Mrs. Murphy. She's nice."; | ||
let result = cldrSegmentation.sentenceSplit(str, englishUliExceptions); | ||
let result = cldrSegmentation.sentenceSplit(str, englishSuppressions); | ||
@@ -26,0 +26,0 @@ expect(result).toEqual(["I like Mrs. Murphy. ", "She's nice."]); |
@@ -0,4 +1,6 @@ | ||
var utfstring = require('UtfString'); | ||
export class BreakIterator { | ||
constructor(uliExceptions = []) { | ||
this.uliExceptions = uliExceptions; | ||
constructor(suppressions) { | ||
this.suppressions = suppressions; | ||
} | ||
@@ -16,2 +18,12 @@ | ||
eachGraphemeCluster(str, callback) { | ||
let ruleSet = this.ruleSetFor('grapheme'); | ||
this.eachBoundary(ruleSet, str, callback); | ||
} | ||
eachLine(str, callback) { | ||
let ruleSet = this.ruleSetFor('line'); | ||
this.eachBoundary(ruleSet, str, callback); | ||
} | ||
// private | ||
@@ -40,15 +52,20 @@ | ||
ruleSetFor(boundaryType) { | ||
return new RuleSet(this.rulesFor(boundaryType), boundaryType, this.uliExceptions); | ||
let ruleSetCache = this.getRuleSetCache(); | ||
if (ruleSetCache[boundaryType] === undefined) { | ||
ruleSetCache[boundaryType] = RuleSet.create( | ||
boundaryType, this.suppressions | ||
); | ||
} | ||
return ruleSetCache[boundaryType]; | ||
} | ||
rulesFor(boundaryType) { | ||
switch (boundaryType) { | ||
case 'sentence': | ||
return sentenceBreakRuleSet; | ||
case 'word': | ||
return wordBreakRuleSet; | ||
default: | ||
throw new Error("Rule set named '" + boundaryType + "' could not be found."); | ||
getRuleSetCache() { | ||
if (this.ruleSetCache === undefined) { | ||
this.ruleSetCache = {}; | ||
} | ||
return this.ruleSetCache; | ||
} | ||
} |
class Cursor { | ||
constructor(text) { | ||
this.text = text; | ||
this.length = text.length; | ||
this.codepoints = utfstring.stringToCodePoints(text); | ||
this.reset(); | ||
@@ -8,13 +10,47 @@ } | ||
advance(amount = 1) { | ||
this.position += amount; | ||
for (var i = 0; i < amount; i ++) { | ||
let cp = this.getCodePoint(); | ||
if (cp > 0xFFFF) { | ||
this.actualPosition += 2; | ||
} else { | ||
this.actualPosition ++; | ||
} | ||
this.logicalPosition ++; | ||
} | ||
} | ||
retreat(amount = 1) { | ||
for (var i = 0; i < amount; i ++) { | ||
this.logicalPosition --; | ||
let cp = this.getCodePoint(); | ||
if (cp > 0xFFFF) { | ||
this.actualPosition -= 2; | ||
} else { | ||
this.actualPosition --; | ||
} | ||
} | ||
} | ||
reset() { | ||
this.position = 0; | ||
this.matchCache = {}; | ||
this.logicalPosition = 0; | ||
this.actualPosition = 0; | ||
} | ||
isEos() { | ||
return this.position >= this.text.length - 1; | ||
return this.logicalPosition >= this.codepoints.length; | ||
} | ||
getCodePoint(pos = this.logicalPosition) { | ||
return this.codepoints[pos]; | ||
} | ||
slice(start, finish) { | ||
return utfstring.codePointsToString( | ||
this.codepoints.slice(start, finish) | ||
); | ||
} | ||
} |
@@ -1,27 +0,11 @@ | ||
class RuleSet { | ||
constructor(rules, boundaryType, uliExceptions = []) { | ||
this.rules = rules; | ||
this.boundaryType = boundaryType; | ||
this.implicitEndOfTextRule = new Rule( | ||
/[^]$/u, new RegExp('', 'u'), {isBreak: true, id: 9998} | ||
export class RuleSet { | ||
static create(boundaryType, suppressions) { | ||
return new RuleSet( | ||
StateMachine.getInstance(boundaryType), suppressions | ||
); | ||
} | ||
this.implicitFinalRule = new Rule( | ||
/[^]/u, /[^]|$/u, {isBreak: true, id: 9999} | ||
); | ||
if (uliExceptions.length > 0) { | ||
let regexContents = []; | ||
uliExceptions.forEach( (exc) => { | ||
regexContents.push(this.escapeRegex(exc)); | ||
}); | ||
this.exceptionRule = new Rule( | ||
new RegExp('(?:' + regexContents.join('|') + ')', 'u'), | ||
new RegExp('', 'u'), | ||
{isBreak: false, id: 0} | ||
); | ||
} | ||
constructor(stateMachine, suppressions) { | ||
this.stateMachine = stateMachine; | ||
this.suppressions = suppressions || new NullSuppressions(); | ||
} | ||
@@ -31,89 +15,24 @@ | ||
let cursor = new Cursor(str); | ||
let lastBoundary = 0; | ||
// implicit start of text boundary | ||
callback(0); | ||
// Let the state machine find the first boundary for the line | ||
// boundary type. This helps pass nearly all the Unicode | ||
// segmentation tests, so it must be the right thing to do. | ||
// Normally the first boundary is the implicit start of text | ||
// boundary, but potentially not for the line rules? | ||
if (this.stateMachine.boundaryType !== 'line') { | ||
callback(0); | ||
} | ||
while (!cursor.isEos()) { | ||
let match = this.findMatch(cursor); | ||
let rule = match.rule; | ||
this.stateMachine.handleNext(cursor); | ||
if (rule.isBreak) { | ||
callback(match.boundaryPosition); | ||
lastBoundary = match.boundaryPosition; | ||
if (this.suppressions.shouldBreak(cursor)) { | ||
callback(cursor.actualPosition); | ||
} | ||
if (match.boundaryPosition == cursor.position) { | ||
// can't just advance by 1 here like in the Ruby version because of multi-byte characters | ||
cursor.advance(match.boundaryOffset[1] - match.boundaryOffset[0]); | ||
} else { | ||
cursor.advance(match.boundaryPosition - cursor.position); | ||
} | ||
} | ||
// implicit end of text boundary | ||
if (lastBoundary != str.length) { | ||
callback(str.length); | ||
} | ||
} | ||
// private | ||
escapeRegex(regexStr) { | ||
return regexStr.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); | ||
getBoundaryType() { | ||
return this.stateMachine.boundaryType; | ||
} | ||
eachRule(callback) { | ||
if (this.exceptionRule != undefined && this.supportsExceptions()) { | ||
callback(this.exceptionRule); | ||
} | ||
this.rules.forEach(callback); | ||
} | ||
supportsExceptions() { | ||
return this.boundaryType == 'sentence'; | ||
} | ||
findMatch(cursor) { | ||
let match = this.findCachedMatch(cursor); | ||
if (match != undefined) { | ||
return match; | ||
} else if (cursor.isEos()) { | ||
return this.implicitEndOfTextRule.match(cursor); | ||
} else { | ||
return this.implicitFinalRule.match(cursor); | ||
} | ||
} | ||
findCachedMatch(cursor) { | ||
let cachedMatch = cursor.matchCache[cursor.position]; | ||
if (cachedMatch == undefined) { | ||
let matches = this.matchAll(cursor); | ||
matches.forEach( (m) => { | ||
cursor.matchCache[m.boundaryPosition - 1] = m; | ||
}); | ||
return matches[0]; | ||
} else { | ||
return cachedMatch; | ||
} | ||
} | ||
matchAll(cursor) { | ||
let matches = []; | ||
this.eachRule( (rule) => { | ||
let match = rule.match(cursor); | ||
if (match != undefined) { | ||
matches.push(match); | ||
} | ||
}); | ||
return matches; | ||
} | ||
} |
@@ -11,8 +11,16 @@ let split = (breakIter, funcName, str) => { | ||
export const wordSplit = (str, uliExceptions = []) => { | ||
return split(new BreakIterator(uliExceptions), 'eachWord', str); | ||
export const wordSplit = (str, suppressions) => { | ||
return split(new BreakIterator(suppressions), 'eachWord', str); | ||
}; | ||
export const sentenceSplit = (str, uliExceptions = []) => { | ||
return split(new BreakIterator(uliExceptions), 'eachSentence', str); | ||
export const sentenceSplit = (str, suppressions) => { | ||
return split(new BreakIterator(suppressions), 'eachSentence', str); | ||
}; | ||
export const graphemeSplit = (str, suppressions) => { | ||
return split(new BreakIterator(suppressions), 'eachGraphemeCluster', str); | ||
}; | ||
export const lineSplit = (str, suppressions) => { | ||
return split(new BreakIterator(suppressions), 'eachLine', str); | ||
}; |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Minified code
Quality: This package contains minified code. This may be harmless in some cases where minified code is included in packaged libraries; however, packages on npm should not minify code.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
1855323
45
16071
127
2
1
3
1
+ Added cldr-segmentation@./
+ Added utfstring@~2.0
+ Added utfstring@2.0.2 (transitive)