Comparing version 0.0.5 to 0.0.6
// input: dot_index = position of a "." symbol | ||
exports.is_number = function(str, dot_index) { | ||
if (dot_index) { | ||
str = str.slice(dot_index-1, dot_index+2); | ||
exports.isCapitalized = function(str) { | ||
var firstChar = str.charAt(0); | ||
var rest = str.substring(1); | ||
return firstChar === firstChar.toUpperCase() && | ||
rest === rest.toLowerCase(); | ||
} | ||
// Start with opening quotes or capitalized letter | ||
exports.isSentenceStarter = function(str) { | ||
var t = /``|"|'/.test(str.substring(0,2)) || | ||
this.isCapitalized(str); | ||
return t; | ||
} | ||
exports.isCommonAbbreviation = function(str) { | ||
var abbreviations = [ | ||
"ie", | ||
"eg", | ||
"Fig", | ||
"Mrs", "Mr", "Ms", | ||
"Prof", "Dr", | ||
"Gen", "Rep", "Sen", | ||
"St", | ||
"Sr", "Jr", | ||
"PhD", "MD", "BA", "MA", | ||
"BSc", "MSc", | ||
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec", | ||
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat" | ||
]; | ||
return ~abbreviations.indexOf(str.replace(/\W+/g, '')); | ||
} | ||
exports.isDottedAbbreviation = function(word) { | ||
var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/(.\.)*/); | ||
return matches && matches[0].length > 0; | ||
} | ||
// Short word with "capital letter" and ends with "dot" | ||
// example Sen. or Gov. | ||
exports.isCustomAbbreviation = function(str) { | ||
if (str.length > 4) | ||
return false; | ||
return str[0] === str[0].toUpperCase(); | ||
} | ||
exports.isNumber = function(str, dotPos) { | ||
if (dotPos) { | ||
str = str.slice(dotPos-1, dotPos+2); | ||
} | ||
@@ -13,3 +65,3 @@ | ||
// http://stackoverflow.com/a/123666/951517 | ||
exports.is_phone_nr = function(str) { | ||
exports.isPhoneNr = function(str) { | ||
return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/); | ||
@@ -20,3 +72,3 @@ }; | ||
// http://stackoverflow.com/a/3809435/951517 | ||
exports.is_url = function(str) { | ||
exports.isURL = function(str) { | ||
return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/); | ||
@@ -27,3 +79,3 @@ }; | ||
// Exception: The word is enclosed in brackets | ||
exports.is_concatenated = function(word) { | ||
exports.isConcatenated = function(word) { | ||
var i = 0; | ||
@@ -43,3 +95,3 @@ | ||
exports.is_boundary_char = function(word) { | ||
exports.isBoundaryChar = function(word) { | ||
return word === "." || | ||
@@ -46,0 +98,0 @@ word === "!" || |
@@ -0,13 +1,15 @@ | ||
module.exports = function sanitizeHtml(text, opts) { | ||
// Strip HTML from Text using browser HTML parser | ||
if (typeof text == 'string' || text instanceof String) { | ||
var $div = document.createElement("DIV"); | ||
$div.innerHTML = text; | ||
text = ($div.textContent || '').trim(); | ||
} | ||
//DOM Object | ||
else if (typeof text === 'object' && text.textContent) { | ||
text = (text.textContent || '').trim(); | ||
} | ||
if (typeof text == 'string' || text instanceof String) { // Strip HTML from Text using browser HTML parser | ||
var $div = document.createElement("DIV"); | ||
$div.innerHTML = text; | ||
text = ($div.textContent || '').trim(); | ||
} else if (typeof text === 'object' && text.textContent) { //DOM Object | ||
text = (text.textContent || '').trim(); | ||
} | ||
return text; | ||
return text; | ||
}; |
exports.ends_with_char = function ends_with_char(word, c) { | ||
exports.endsWithChar = function ends_with_char(word, c) { | ||
if (c.length > 1) { | ||
@@ -10,8 +10,4 @@ return c.indexOf(word.slice(-1)) > -1; | ||
exports.ends_with = function ends_with(word, end) { | ||
exports.endsWith = function ends_with(word, end) { | ||
return word.slice(word.length - end.length) === end; | ||
}; | ||
exports.is_dotted_abbreviation = function is_dotted_abbreviation(word) { | ||
return word.match(/(.[.])*/)[0].length > 0; | ||
} | ||
}; |
@@ -9,4 +9,2 @@ /*jshint node:true, laxcomma:true */ | ||
var abbreviations = require('../data/abbr').abbreviations; | ||
var newline_placeholder = " @~@ "; | ||
@@ -16,7 +14,11 @@ var newline_placeholder_t = newline_placeholder.trim(); | ||
// Split the entry into sentences. | ||
exports.sentences = function sentences(text, newline_boundary) { | ||
exports.sentences = function(text, newline_boundary) { | ||
if (text.length === 0) | ||
return []; | ||
text = sanitizeHtml(text, { "allowedTags" : [] }); | ||
var i,index = 0; | ||
var temp = []; | ||
var index = 0; | ||
var temp = []; | ||
@@ -37,8 +39,8 @@ /** Preprocessing */ | ||
for (i=0; i<words.length; i++) { | ||
for (var i=0, L=words.length; i < L; i++) { | ||
// Add the word to current sentence | ||
current.push(words[i]); | ||
if (Match.is_boundary_char(words[i]) || | ||
String.ends_with_char(words[i], "?!") || | ||
if (Match.isBoundaryChar(words[i]) || | ||
String.endsWithChar(words[i], "?!") || | ||
words[i] === newline_placeholder_t) | ||
@@ -56,21 +58,37 @@ { | ||
// A dot might indicate the end of a sentence | ||
if (String.ends_with_char(words[i], '.')) { | ||
// Single characters + dot are considered abbreviations | ||
if (words[i].length === 2) { | ||
continue; | ||
// A dot might indicate the end of a sentence | ||
// Exception: The next sentence starts with a word (non abbreviation) | ||
// that has a capital letter. | ||
if (String.endsWithChar(words[i], '.')) { | ||
// Check if the word is in the abbreviation list (without symbols) | ||
if (Match.isCommonAbbreviation(words[i])) { | ||
continue; | ||
} | ||
/** Check for abbreviations */ | ||
// Check if there is a next word | ||
if (i+1 < L) { | ||
// This should be improved with machine learning | ||
// Custom dotted abbreviations (like K.L.M or I.C.T) | ||
if (String.is_dotted_abbreviation(words[i])) { | ||
continue; | ||
} | ||
// Next word starts with capital word, but current sentence is | ||
// quite short | ||
if (Match.isSentenceStarter(words[i+1])) { | ||
if (current.length < 6) { | ||
// Custom dotted abbreviations (like K.L.M or I.C.T) | ||
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) { | ||
continue; | ||
} | ||
} | ||
} | ||
else { | ||
// Skip ellipsis | ||
if (String.endsWith(words[i], "..")) { | ||
continue; | ||
} | ||
// Check if the word is in the abbr. list (without | ||
// the period and lowercased) | ||
var w = words[i].toLowerCase().slice(0, -1); | ||
if (abbreviations.indexOf(w) > -1) { | ||
continue; | ||
// Skip abbreviations | ||
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) { | ||
continue; | ||
} | ||
} | ||
} | ||
@@ -85,3 +103,3 @@ | ||
if ((index = words[i].indexOf(".")) > -1) { | ||
if (Match.is_number(words[i], index)) { | ||
if (Match.isNumber(words[i], index)) { | ||
continue; | ||
@@ -91,3 +109,3 @@ } | ||
// Custom dotted abbreviations (like K.L.M or I.C.T) | ||
if (String.is_dotted_abbreviation(words[i])) { | ||
if (Match.isDottedAbbreviation(words[i])) { | ||
continue; | ||
@@ -97,3 +115,3 @@ } | ||
// Skip urls / emails and the like | ||
if (Match.is_url(words[i]) || Match.is_phone_nr(words[i])) { | ||
if (Match.isURL(words[i]) || Match.isPhoneNr(words[i])) { | ||
continue; | ||
@@ -103,3 +121,3 @@ } | ||
if (temp = Match.is_concatenated(words[i])) { | ||
if (temp = Match.isConcatenated(words[i])) { | ||
current.pop(); | ||
@@ -106,0 +124,0 @@ current.push(temp[0]); |
{ | ||
"name": "sbd", | ||
"version": "0.0.5", | ||
"description": "Split text into sentences", | ||
"version": "0.0.6", | ||
"description": "Split text into sentences with Sentence Boundary Detection (SBD).", | ||
"main": "lib/tokenizer.js", | ||
@@ -15,7 +15,3 @@ "scripts": { | ||
"keywords": [ | ||
"sentence splitting", | ||
"splitting", | ||
"tokenize sentences", | ||
"sentences", | ||
"sentence" | ||
"sentence", "detection", "boundary" | ||
], | ||
@@ -22,0 +18,0 @@ "author": { |
@@ -1,14 +0,13 @@ | ||
# Sentence Boundary Detection (SBD) | ||
Sentence Boundary Detection (SBD) | ||
================== | ||
Simple sentence detection (i.e working ~95% of the time): | ||
Split text into sentences with a `vanilla` rule-based approach (i.e., it works ~95% of the time). | ||
* Split a text based on period, question- and exclamation marks. | ||
* Skips abbreviations | ||
* Skips numbers, currency | ||
* Skips urls, email address, phone nr. | ||
* Skips (most) abbreviations (Mr., Mrs., PhD.) | ||
* Skips numbers/currency | ||
* Skips urls, websites, email addresses, phone nr. | ||
* Counts ellipsis and ?! as single punctuation | ||
## Future work | ||
Currently, `sbd` fails to recognize sentences ending in an abbreviation, for example "The president lives in Washington, D.C." and I do not really see a viable option other than using a real classifier with proper training. | ||
## Installation | ||
@@ -26,13 +25,23 @@ | ||
var text = "In I.C.T we have multiple challenges! | ||
This is a text of three sentences. Skip Mr. Money €10.00 right."; | ||
var text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration."; | ||
var sentences = tokenizer.sentences(text); | ||
// [ | ||
// 'On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S.', | ||
// 'Millions attended the Inauguration.', | ||
// ] | ||
var text = "Got any problems? Open an issue on github.com!"; | ||
var sentences = tokenizer.sentences(text); | ||
// [ | ||
// 'In I.C.T we have multiple challenges!', | ||
// 'This is a text of three sentences.', | ||
// 'Skip Mr. Money €10.00 right.' | ||
// 'Got any problems?', | ||
// 'Open an issue on github.com!', | ||
// ] | ||
``` | ||
## Notes | ||
I cannot find a "test data set" to rate the performance, but I can imagine it needs a trained data set to help with difficult edge cases, for example sentences that end with an abbreviation. | ||
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
15
47
21314
445
1