cldr-segmentation
Advanced tools
Comparing version 2.0.3 to 2.1.0
@@ -0,1 +1,6 @@ | ||
## 2.1.0 | ||
* Add ability to copy and merge suppression objects. | ||
- Enables adding custom suppression strings. | ||
* Add custom English suppression for "Dr.". | ||
## 2.0.3 | ||
@@ -2,0 +7,0 @@ * Somehow the require of 'utfstring' was changed to 'UtfString', which worked on Mac OS's case-insensitive filesystem but errored in Linux environments. |
@@ -38,2 +38,3 @@ let fs = require('fs'); | ||
'src/suppressions.js', | ||
'src/customSuppressions.js', | ||
...suppressions, | ||
@@ -40,0 +41,0 @@ 'src/suppressions/all.js' |
{ | ||
"name": "cldr-segmentation", | ||
"version": "2.0.3", | ||
"version": "2.1.0", | ||
"description": "CLDR text segmentation for JavaScript", | ||
@@ -36,3 +36,3 @@ "main": "dist/cldr-segmentation.js", | ||
"cldrSegmentation": "./", | ||
"grunt": "~1.0", | ||
"grunt": "~1.3", | ||
"grunt-babel": "~7.0", | ||
@@ -39,0 +39,0 @@ "grunt-contrib-concat": "~1.0", |
@@ -0,1 +1,3 @@ | ||
[![Build Status](https://travis-ci.org/camertron/cldr-segmentation.js.svg?branch=master)](https://travis-ci.org/camertron/cldr-segmentation.js) | ||
cldr-segmentation | ||
@@ -80,5 +82,15 @@ === | ||
Suppressions are just objects with a single `shouldBreak` function that returns a boolean. The function is passed a cursor object positioned at the index of the proposed break. Cursors deal exclusively with Unicode codepoints, meaning your custom suppression logic will need to be implemented in those terms. For example, let's create a custom suppression function that doesn't allow breaks after sentences that end with the letter 't'. | ||
Suppressions are just strings after which a break should not occur. This library comes with a set of common suppressions for a variety of languages, but you may want to add your own. Suppression objects can be merged. For example, here's how to add "Dr." to the set of English suppressions: | ||
```javascript | ||
var customSupps = cldrSegmentation.Suppressions.create(['Dr.']); | ||
var supps = cldrSegmentation.suppressions.en.merge(customSupps); | ||
cldrSegmentation.sentenceSplit("We love Dr. Strange. He's cool.", supps); | ||
``` | ||
## Custom Suppression Objects | ||
Suppression objects are just plain ol' JavaScript objects with a single `shouldBreak` function that returns a boolean. The function is passed a cursor object positioned at the index of the proposed break. Cursors deal exclusively with Unicode codepoints, meaning your custom suppression logic will need to be implemented in those terms. For example, let's create a custom suppression function that doesn't allow breaks after sentences that end with the letter 't'. | ||
```javascript | ||
class TeeSuppression { | ||
@@ -85,0 +97,0 @@ shouldBreak(cursor) { |
@@ -86,4 +86,4 @@ ( () => { | ||
describe('without ULI exceptions', () => { | ||
it('splits on certain abbreviations like Mr. and Mrs. (use ULI rules to avoid this behavior)', () => { | ||
describe('without suppressions', () => { | ||
it('splits on certain abbreviations like Mr. and Mrs. (use suppressions to avoid this behavior)', () => { | ||
let str = "I really like Mrs. Patterson. She's nice."; | ||
@@ -90,0 +90,0 @@ |
@@ -10,3 +10,3 @@ ( () => { | ||
let cldrSegmentation = require('cldr-segmentation'); | ||
let utfstring = require('UtfString'); | ||
let utfstring = require('utfstring'); | ||
let fs = require('fs'); | ||
@@ -13,0 +13,0 @@ |
@@ -21,3 +21,3 @@ ( () => { | ||
it('splits correctly using uli exceptions', () => { | ||
it('splits correctly using suppressions', () => { | ||
let str = "I like Mrs. Murphy. She's nice."; | ||
@@ -29,3 +29,10 @@ let result = cldrSegmentation.sentenceSplit(str, englishSuppressions); | ||
it('splits correctly when an uli exception occurs just before a potential break', () => { | ||
// Exercises a suppression ("Dr.") that is not passed in explicitly here.
// NOTE(review): presumably `englishSuppressions` is the built-in English set,
// which now has the custom "Dr." entry merged in at build time — confirm
// against the setup code at the top of this spec.
it('splits correctly using custom suppressions', () => {
let str = "I like Dr. Murphy. She's nice.";
let result = cldrSegmentation.sentenceSplit(str, englishSuppressions);
// No break is expected after "Dr. "; the only break is after "Murphy. ".
expect(result).toEqual(["I like Dr. Murphy. ", "She's nice."]);
});
it('splits correctly when a suppression occurs just before a potential break', () => { | ||
let str = 'Hi, my name is Philipp. Just because I can.'; | ||
@@ -37,3 +44,3 @@ let result = cldrSegmentation.sentenceSplit(str, englishSuppressions); | ||
it('splits correctly when a German uli exception occurs just before a potential break', () => { | ||
it('splits correctly when a German suppression occurs just before a potential break', () => { | ||
let str = "Dies ist ein test Satz. Und hier ein Zweiter."; | ||
@@ -40,0 +47,0 @@ let result = cldrSegmentation.sentenceSplit(str, germanSuppressions); |
@@ -28,2 +28,10 @@ export const suppressions = {}; | ||
merge(otherSupp) { | ||
return new Suppressions( | ||
this.forwardTrie.merge(otherSupp.forwardTrie), | ||
this.backwardTrie.merge(otherSupp.backwardTrie), | ||
[...this.list, ...otherSupp.list] | ||
) | ||
} | ||
shouldBreak(cursor) { | ||
@@ -30,0 +38,0 @@ var idx = cursor.logicalPosition; |
@@ -222,4 +222,8 @@ suppressions['de'] = ( () => { | ||
if (customSuppressions['de']) { | ||
supp = supp.merge(customSuppressions['de']); | ||
} | ||
supp.lock(); | ||
return supp; | ||
})(); |
@@ -131,4 +131,8 @@ suppressions['en'] = ( () => { | ||
if (customSuppressions['en']) { | ||
supp = supp.merge(customSuppressions['en']); | ||
} | ||
supp.lock(); | ||
return supp; | ||
})(); |
@@ -165,4 +165,8 @@ suppressions['es'] = ( () => { | ||
if (customSuppressions['es']) { | ||
supp = supp.merge(customSuppressions['es']); | ||
} | ||
supp.lock(); | ||
return supp; | ||
})(); |
@@ -83,4 +83,8 @@ suppressions['fr'] = ( () => { | ||
if (customSuppressions['fr']) { | ||
supp = supp.merge(customSuppressions['fr']); | ||
} | ||
supp.lock(); | ||
return supp; | ||
})(); |
@@ -48,4 +48,8 @@ suppressions['it'] = ( () => { | ||
if (customSuppressions['it']) { | ||
supp = supp.merge(customSuppressions['it']); | ||
} | ||
supp.lock(); | ||
return supp; | ||
})(); |
@@ -174,4 +174,8 @@ suppressions['pt'] = ( () => { | ||
if (customSuppressions['pt']) { | ||
supp = supp.merge(customSuppressions['pt']); | ||
} | ||
supp.lock(); | ||
return supp; | ||
})(); |
@@ -23,4 +23,8 @@ suppressions['ru'] = ( () => { | ||
if (customSuppressions['ru']) { | ||
supp = supp.merge(customSuppressions['ru']); | ||
} | ||
supp.lock(); | ||
return supp; | ||
})(); |
@@ -14,2 +14,26 @@ class Node { | ||
} | ||
copy() { | ||
let childrenCopy = {}; | ||
for (const key in this.children) { | ||
childrenCopy[key] = this.children[key].copy(); | ||
} | ||
return new Node(this.value, childrenCopy); | ||
} | ||
forEach(callback) { | ||
this._forEach(callback, []); | ||
} | ||
_forEach(callback, path) { | ||
if (this.value) { | ||
callback(path, this.value); | ||
} | ||
for (const key in this.children) { | ||
this.children[key]._forEach(callback, [...path, key]); | ||
} | ||
} | ||
} | ||
@@ -48,2 +72,20 @@ | ||
} | ||
copy() { | ||
return new Trie(this.root.copy()); | ||
} | ||
forEach(callback) { | ||
this.root.forEach(callback); | ||
} | ||
merge(otherTrie) { | ||
let result = this.copy(); | ||
otherTrie.forEach((key, value) => { | ||
result.add(key, value); | ||
}); | ||
return result; | ||
} | ||
} |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
1866287
47
16220
139