Comparing version 1.0.0 to 1.0.1
126
lib/Match.js
var abbreviations = [ | ||
"ie", | ||
"eg", | ||
"ext", // + number? | ||
"Fig", | ||
"fig", | ||
"Figs", | ||
"figs", | ||
"et al", | ||
"Co", | ||
"Corp", | ||
"Ave", | ||
"Inc", | ||
"Ex", | ||
"Viz", | ||
"vs", | ||
"Vs", | ||
"repr", | ||
"Rep", | ||
"Dem", | ||
"trans", | ||
"Vol", | ||
"pp", | ||
"rev", | ||
"est", | ||
"Ref", | ||
"Refs", | ||
"Eq", | ||
"Eqs", | ||
"Ch", | ||
"Sec", | ||
"Secs", | ||
"mi", | ||
"Dept", | ||
"Univ", | ||
"Nos", | ||
"No", | ||
"Mol", | ||
"Cell", | ||
"Miss", "Mrs", "Mr", "Ms", | ||
"Prof", "Dr", | ||
"Sgt", "Col", "Gen", "Rep", "Sen",'Gov', "Lt", "Maj", "Capt","St", | ||
"Sr", "Jr", "jr", "Rev", | ||
"PhD", "MD", "BA", "MA", "MM", | ||
"BSc", "MSc", | ||
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec", | ||
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat" | ||
]; | ||
// Reports whether `str` counts as capitalized.
// NOTE(review): the body holds two consecutive `return` statements, so only
// the first one can execute: it is true when the first character is unchanged
// by toUpperCase() and the remainder is unchanged by toLowerCase(). The
// regex / this.isNumber(str) return after it is unreachable as written. This
// looks like a removed line and an added line of the diff shown together --
// confirm which of the two is intended before relying on either.
exports.isCapitalized = function(str) { | ||
var firstChar = str.charAt(0); | ||
var rest = str.substring(1); | ||
return firstChar === firstChar.toUpperCase() && | ||
rest === rest.toLowerCase(); | ||
return /^[A-Z][a-z].*/.test(str) || this.isNumber(str); | ||
} | ||
@@ -12,28 +61,20 @@ | ||
exports.isSentenceStarter = function(str) { | ||
var t = /``|"|'/.test(str.substring(0,2)) || | ||
this.isCapitalized(str); | ||
return t; | ||
return this.isCapitalized(str) || /``|"|'/.test(str.substring(0,2)); | ||
} | ||
exports.isCommonAbbreviation = function(str) { | ||
var abbreviations = [ | ||
"ie", | ||
"eg", | ||
"Fig", | ||
return ~abbreviations.indexOf(str.replace(/\W+/g, '')); | ||
} | ||
"Mrs", "Mr", "Ms", | ||
"Prof", "Dr", | ||
"Gen", "Rep", "Sen", | ||
"St", | ||
// This is going towards too much rule based | ||
exports.isTimeAbbreviation = function(word, next) { | ||
if (word === "a.m." || word === "p.m.") { | ||
var tmp = next.replace(/\W+/g, '').slice(-3).toLowerCase(); | ||
"Sr", "Jr", | ||
"PhD", "MD", "BA", "MA", | ||
"BSc", "MSc", | ||
if (tmp === "day") { | ||
return true; | ||
} | ||
} | ||
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec", | ||
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat" | ||
]; | ||
return ~abbreviations.indexOf(str.replace(/\W+/g, '')); | ||
return false; | ||
} | ||
@@ -43,15 +84,33 @@ | ||
var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/(.\.)*/); | ||
return matches && matches[0].length > 0; | ||
} | ||
// Short word with "capital letter" and ends with "dot" | ||
// example Sen. or Gov. | ||
// TODO look for next words, if multiple capitalized -> not sentence ending | ||
// As written: a length above 4 yields false, a length of 3 or less yields
// true, and a length of exactly 4 yields whether the first character is
// unchanged by toUpperCase().
// NOTE(review): the final `return this.isCapitalized(str)` is unreachable
// because it follows another `return`. The two guards and two returns look
// like removed and added diff lines shown together -- confirm the intended
// combination.
exports.isCustomAbbreviation = function(str) { | ||
if (str.length > 4) | ||
return false; | ||
if (str.length <= 3) | ||
return true; | ||
return str[0] === str[0].toUpperCase(); | ||
return this.isCapitalized(str); | ||
} | ||
// Uses current word count in sentence and next few words to check if it is | ||
// more likely an abbreviation + name or new sentence. | ||
// ~ TODO Perhaps also consider prev. word? | ||
exports.isNameAbbreviation = function(wordCount, words) { | ||
if (words.length > 0) { | ||
if (wordCount < 5 && words[0].length < 6 && this.isCapitalized(words[0])) { | ||
return true; | ||
} | ||
var capitalized = words.filter(function(str) { | ||
return /[A-Z]/.test(str.charAt(0)); | ||
}); | ||
return capitalized.length >= 3; | ||
} | ||
return false; | ||
} | ||
exports.isNumber = function(str, dotPos) { | ||
@@ -82,3 +141,6 @@ if (dotPos) { | ||
if ((i = word.indexOf(".")) > -1 || (i = word.indexOf("!")) > -1 || (i = word.indexOf("?")) > -1) { | ||
if ((i = word.indexOf(".")) > -1 || | ||
(i = word.indexOf("!")) > -1 || | ||
(i = word.indexOf("?")) > -1) | ||
{ | ||
var c = word.charAt(i + 1); | ||
@@ -85,0 +147,0 @@ |
@@ -14,11 +14,7 @@ /*jshint node:true, laxcomma:true */ | ||
exports.sentences = function(text, newline_boundary) { | ||
if (text.length === 0) | ||
return []; | ||
text = sanitizeHtml(text, { "allowedTags" : [] }); | ||
text = sanitizeHtml(text, { "allowedTags" : [''] }); | ||
var index = 0; | ||
var temp = []; | ||
/** Preprocessing */ | ||
@@ -30,14 +26,27 @@ if (typeof newline_boundary === 'undefined') { | ||
if (newline_boundary) { | ||
text = text.replace(/\n+/g, newline_placeholder); | ||
text = text.replace(/\n+|[-#=_+*]{4,}/g, newline_placeholder); | ||
} | ||
var index = 0; | ||
var temp = []; | ||
// Split the text into words | ||
var words = text.match(/\S+/g); // see http://blog.tompawlak.org/split-string-into-tokens-javascript | ||
var sentences = []; | ||
var current = []; | ||
var wordCount = 0; | ||
for (var i=0, L=words.length; i < L; i++) { | ||
wordCount++; | ||
// Add the word to current sentence | ||
current.push(words[i]); | ||
// Sub-sentences (Bijzin?), reset counter | ||
if (~words[i].indexOf(',')) { | ||
wordCount = 0; | ||
} | ||
if (Match.isBoundaryChar(words[i]) || | ||
@@ -47,3 +56,3 @@ String.endsWithChar(words[i], "?!") || | ||
{ | ||
if (newline_boundary) { | ||
if (newline_boundary && words[i] === newline_placeholder_t) { | ||
current.pop(); | ||
@@ -53,4 +62,6 @@ } | ||
sentences.push(current); | ||
current = []; | ||
wordCount = 0; | ||
current = []; | ||
continue; | ||
@@ -63,8 +74,3 @@ } | ||
if (String.endsWithChar(words[i], '.')) { | ||
// Check if the word is in the abbreviation list (without symbols) | ||
if (Match.isCommonAbbreviation(words[i])) { | ||
continue; | ||
} | ||
// Check if there is a next word | ||
@@ -74,11 +80,27 @@ if (i+1 < L) { | ||
// Single character abbr. | ||
if (words[i].length === 2 && isNaN(words[i].charAt(0))) { | ||
continue; | ||
} | ||
// Common abbr. that often do not end sentences | ||
if (Match.isCommonAbbreviation(words[i])) { | ||
continue; | ||
} | ||
// Next word starts with capital word, but current sentence is | ||
// quite short | ||
if (Match.isSentenceStarter(words[i+1])) { | ||
if (current.length < 6) { | ||
// Custom dotted abbreviations (like K.L.M or I.C.T) | ||
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) { | ||
continue; | ||
} | ||
if (Match.isTimeAbbreviation(words[i], words[i+1])) { | ||
continue; | ||
} | ||
// Dealing with names at the start of sentences | ||
if (Match.isNameAbbreviation(wordCount, words.slice(i, 6))) { | ||
continue; | ||
} | ||
if (Match.isNumber(words[i+1]) && Match.isCustomAbbreviation(words[i])) { | ||
continue; | ||
} | ||
} | ||
@@ -91,3 +113,4 @@ else { | ||
// Skip abbreviations | ||
//// Skip abbreviations | ||
// Short words + dot or a dot after each letter | ||
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) { | ||
@@ -100,3 +123,5 @@ continue; | ||
sentences.push(current); | ||
current = []; | ||
current = []; | ||
wordCount = 0; | ||
continue; | ||
@@ -128,2 +153,3 @@ } | ||
current = []; | ||
wordCount = 0; | ||
current.push(temp[1]); | ||
@@ -140,3 +166,3 @@ } | ||
// Clear empty values | ||
// Clear "empty" sentences | ||
sentences = sentences.filter(function(s) { | ||
@@ -143,0 +169,0 @@ return s.length > 0; |
{ | ||
"name": "sbd", | ||
"version": "1.0.0", | ||
"version": "1.0.1", | ||
"description": "Split text into sentences with Sentence Boundary Detection (SBD).", | ||
@@ -5,0 +5,0 @@ "main": "lib/tokenizer.js", |
@@ -66,5 +66,10 @@ Sentence Boundary Detection (SBD) | ||
## Future work | ||
* Convert quotes to normalized unicode "" | ||
* Convert HTML entities / hex-encoded symbols to their normalized symbol (e.g. &mdash; -> —) | ||
* Force sentence breaks on new paragraphs (i.e. </p> followed by <p> ==> \n\n; multiple newlines are sentence-breaking) | ||
## Notes | ||
I cannot find a "test data set" to rate the performance, but I can imagine it needs a trained data set to help with difficult edge cases. For example, sentences that do end with an abbreviation. | ||
@@ -8,3 +8,3 @@ /*jshint node:true, laxcomma:true */ | ||
describe('Multiple sentences', function () { | ||
describe('Abbreviations in sentences', function () { | ||
@@ -20,2 +20,20 @@ describe('Skip dotted abbreviations', function () { | ||
// Case (B): "a.m." and "p.m." appear in the middle of running text; the whole
// entry is expected to come back as a single sentence.
// NOTE(review): the entry text itself says "two sentences" while the
// assertion expects 1 -- confirm the fixture wording.
describe('Skip dotted abbreviations (B)', function () { | ||
var entry = "From amat frequentor minimus hello there at 8 a.m. there p.m. should only be two sentences."; | ||
var sentences = tokenizer.sentences(entry); | ||
it("should get 1 sentence", function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
// Case (C): "p.m." is followed by a capitalized weekday ("Saturday"); the
// entry is still expected to be one sentence.
describe('Skip dotted abbreviations (C)', function () { | ||
var entry = "The school, called Booker T and Stevie Ray\'s Wrestling and Mixed Mart Arts Academy, will have an open house 2-6 p.m. Saturday."; | ||
var sentences = tokenizer.sentences(entry); | ||
it("should get 1 sentence", function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
describe('Skip common abbreviations', function () { | ||
@@ -22,0 +40,0 @@ var entry = "Fig. 2. displays currency rates i.e. something libsum. Currencies widely available (i.e. euro, dollar, pound), or alternatively (e.g. €, $, etc.)"; |
@@ -55,2 +55,38 @@ /*jshint node:true, laxcomma:true */ | ||
// Case (D): the first sentence ends with a dotted abbreviation ("R.I.S.E.")
// and the next one starts with a capitalized word; expects 2 sentences.
describe('Difficult two sentences (D)', function () { | ||
var entry = "Baril, a Richmond lawyer once nominated for a federal prosecutors job, endorsed a faith-based drug initiative in local jails patterned after the Henrico County jails therapeutic program called Project R.I.S.E. Just as important, he had a great foil across the net."; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get two sentence', function () { | ||
assert.equal(sentences.length, 2); | ||
}); | ||
}); | ||
// Case (E): "No. 7." followed by a new sentence that itself contains the
// dotted abbreviation "P.A.L.S."; expects 2 sentences.
describe('Difficult two sentences (E)', function () { | ||
var entry = "Newsletter AIDs CARE, EDUCATION AND TRAINING Issue No. 7. Acet Home Care, which moves into the building in July, will share the offices with two other AIDS charities, P.A.L.S. (Portsmouth AIDS Link Support) and the Link Project."; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get two sentence', function () { | ||
assert.equal(sentences.length, 2); | ||
}); | ||
}); | ||
// Case (F): "a.m." / "p.m." followed by weekday names inside the first
// sentence, then a short second sentence; expects 2 sentences.
describe('Difficult two sentences (F)', function () { | ||
var entry = "Another is expanded hours of operation -- from fewer than five hours a day to 9:30 a.m. to 4 p.m. Monday through Saturday. Sunday remains closed."; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get two sentence', function () { | ||
assert.equal(sentences.length, 2); | ||
}); | ||
}); | ||
// Case (G): several "p.m." occurrences, one of which is directly followed by
// the capitalized word "Then"; expects 2 sentences.
describe('Difficult two sentences (G)', function () { | ||
var entry = "Gold Wing Road Rider's Association - Coffee break, Guzzardo's Italian Villa, eat, 6 p.m.; ride, 7 p.m. Then at 9 p.m. go home."; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get two sentence', function () { | ||
assert.equal(sentences.length, 2); | ||
}); | ||
}); | ||
describe('Dot in middle of word is not skipped if followed by capital letter', function () { | ||
@@ -91,2 +127,11 @@ var entry = "Hello Barney.The bird in the word."; | ||
}); | ||
// Calls tokenizer.sentences with `true` as second argument on an entry that
// has no newline but a long run of "=" characters between two phrases;
// expects the run to split the text into 2 sentences.
describe('If newlines are boundaries (B)', function () { | ||
var entry = "FAMILIY HISTORY ========================================== Nothing interesting"; | ||
var sentences = tokenizer.sentences(entry, true); | ||
it("should get 2 sentences", function () { | ||
assert.equal(sentences.length, 2); | ||
}); | ||
}); | ||
}); |
@@ -36,2 +36,47 @@ /*jshint node:true, laxcomma:true */ | ||
// Case (B): "p.m.", "W." and "St." in one entry; the assertion pins 1 sentence.
// NOTE(review): the text reads like two sentences ("... W. 82nd St.
// Investigators say ...") -- presumably this records current tokenizer
// behaviour rather than the ideal split; confirm.
describe('Difficult sentence (B)', function () { | ||
var entry = "It happened around 5:30 p.m. in the 500 block of W. 82nd St. Investigators say Terrence Taylor, 22, and Deontrell Sloan, 17, got into an argument over money during the game."; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get 1 sentence', function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
// Case (C): a middle initial ("L.") inside a name near the start of the
// entry; expects 1 sentence.
describe('Difficult sentence (C)', function () { | ||
var entry = "GARY Mayor Scott L. King has declared a 'cash crisis' and has asked city department heads to put off all non-essential spending until June."; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get 1 sentence', function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
// Case (D): a short dotted word ("Mich.") followed by a dash and a
// capitalized word; expects 1 sentence.
describe('Difficult sentence (D)', function () { | ||
var entry = "HOWELL, Mich. - Blissfield was only nine outs away from ending the longest winning streak"; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get 1 sentence', function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
// Case (E): "U.S." followed by several capitalized words; expects 1 sentence.
describe('Difficult sentence (E)', function () { | ||
var entry = "33 FORT LAUDERDALE U.S. President George W Bush touted free trade as a means of strengthening democracy"; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get 1 sentence', function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
// Case (F): a short dotted word ("Del.") followed by a number; expects 1
// sentence.
describe('Difficult sentence (F)', function () { | ||
var entry = "Mike Tyler rides his bike on Del. 1 near Lewes early last month"; | ||
var sentences = tokenizer.sentences(entry); | ||
it('should get 1 sentence', function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
// Questionable behavior, but can only be fixed using ML? | ||
@@ -38,0 +83,0 @@ describe('Dot in middle of word is skipped', function () { |
@@ -45,2 +45,20 @@ /*jshint node:true, laxcomma:true */ | ||
}); | ||
// Without a second argument, the "\r\n" inside the entry is expected not to
// end a sentence: 1 sentence.
describe('Newlines/paragraph must be enabled to end sentences', function () { | ||
var entry = "The humble bundle sale\r\nDate: Monday-Fri starting 2015-01-01"; | ||
var sentences = tokenizer.sentences(entry); | ||
it("should get 1 sentences", function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
// With `true` as second argument, both the "\r\n" and the "\n" in the entry
// are expected to end a sentence: 3 sentences.
describe('Newlines/paragraph enabled ends sentences', function () { | ||
var entry = "The humble bundle sale\r\nDate: Monday-Fri starting 2015-01-01\nSales starting at ¤2,50"; | ||
var sentences = tokenizer.sentences(entry, true); | ||
it("should get 3 sentences", function () { | ||
assert.equal(sentences.length, 3); | ||
}); | ||
}); | ||
}); |
Sorry, the diff of this file is not supported yet
29760
616
75