sbd - npm Package Compare versions

LICENSE

test/abbr.js

test/empty.js

test/html.js

test/lists.js

test/multiple_sentences.js

test/single_sentence.js

test/symbols.js

68

lib/Match.js


		// input: dot_index = position of a "." symbol
		exports.is_number = function(str, dot_index) {
		if (dot_index) {
		str = str.slice(dot_index-1, dot_index+2);
		exports.isCapitalized = function(str) {
		var firstChar = str.charAt(0);
		var rest = str.substring(1);

		return firstChar === firstChar.toUpperCase() &&
		rest === rest.toLowerCase();
		}

		// Start with opening quotes or capitalized letter
		exports.isSentenceStarter = function(str) {
		var t = /``\|"\|'/.test(str.substring(0,2)) \|\|
		this.isCapitalized(str);

		return t;
		}

		exports.isCommonAbbreviation = function(str) {
		var abbreviations = [
		"ie",
		"eg",
		"Fig",

		"Mrs", "Mr", "Ms",
		"Prof", "Dr",
		"Gen", "Rep", "Sen",
		"St",

		"Sr", "Jr",
		"PhD", "MD", "BA", "MA",
		"BSc", "MSc",

		"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec",
		"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat"
		];

		return ~abbreviations.indexOf(str.replace(/\W+/g, ''));
		}

		exports.isDottedAbbreviation = function(word) {
		var matches = word.replace(/[\[\]\{\}]/g, '').match(/(.\.)*/);

		return matches && matches[0].length > 0;
		}

		// Short word with "capital letter" and ends with "dot"
		// example Sen. or Gov.
		exports.isCustomAbbreviation = function(str) {
		if (str.length > 4)
		return false;

		return str[0] === str[0].toUpperCase();
		}

		exports.isNumber = function(str, dotPos) {
		if (dotPos) {
		str = str.slice(dotPos-1, dotPos+2);
		}
		@@ -13,3 +65,3 @@
		// http://stackoverflow.com/a/123666/951517
		exports.is_phone_nr = function(str) {
		exports.isPhoneNr = function(str) {
		return str.match(/^(?:(?:\+?1\s(?:[.-]\s)?)?(?:$\s([2-9]1[02-9]\|[2-9][02-8]1\|[2-9][02-8][02-9])\s$\|([2-9]1[02-9]\|[2-9][02-8]1\|[2-9][02-8][02-9]))\s(?:[.-]\s)?)?([2-9]1[02-9]\|[2-9][02-9]1\|[2-9][02-9]{2})\s(?:[.-]\s)?([0-9]{4})(?:\s(?:#\|x\.?\|ext\.?\|extension)\s(\d+))?$/);
		@@ -20,3 +72,3 @@ };
		// http://stackoverflow.com/a/3809435/951517
		exports.is_url = function(str) {
		exports.isURL = function(str) {
		return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/);
		@@ -27,3 +79,3 @@ };
		// Exception: The word is enclosed in brackets
		exports.is_concatenated = function(word) {
		exports.isConcatenated = function(word) {
		var i = 0;
		@@ -43,3 +95,3 @@

		exports.is_boundary_char = function(word) {
		exports.isBoundaryChar = function(word) {
		return word === "." \|\|
		@@ -46,0 +98,0 @@ word === "!" \|\|

22

lib/sanitize-html-browser.js

		@@ -0,13 +1,15 @@

		module.exports = function sanitizeHtml(text, opts) {
		// Strip HTML from Text using browser HTML parser
		if (typeof text == 'string' \|\| text instanceof String) {
		var $div = document.createElement("DIV");
		$div.innerHTML = text;
		text = ($div.textContent \|\| '').trim();
		}
		//DOM Object
		else if (typeof text === 'object' && text.textContent) {
		text = (text.textContent \|\| '').trim();
		}

		if (typeof text == 'string' \|\| text instanceof String) { // Strip HTML from Text using browser HTML parser
		var $div = document.createElement("DIV");
		$div.innerHTML = text;
		text = ($div.textContent \|\| '').trim();
		} else if (typeof text === 'object' && text.textContent) { //DOM Object
		text = (text.textContent \|\| '').trim();
		}

		return text;

		return text;
		};

10

lib/String.js


		exports.ends_with_char = function ends_with_char(word, c) {
		exports.endsWithChar = function ends_with_char(word, c) {
		if (c.length > 1) {
		@@ -10,8 +10,4 @@ return c.indexOf(word.slice(-1)) > -1;

		exports.ends_with = function ends_with(word, end) {
		exports.endsWith = function ends_with(word, end) {
		return word.slice(word.length - end.length) === end;
		};

		exports.is_dotted_abbreviation = function is_dotted_abbreviation(word) {
		return word.match(/(.[.])*/)[0].length > 0;
		}
		};

72

lib/tokenizer.js

		@@ -9,4 +9,2 @@ /jshint node:true, laxcomma:true /

		var abbreviations = require('../data/abbr').abbreviations;

		var newline_placeholder = " @~@ ";
		@@ -16,7 +14,11 @@ var newline_placeholder_t = newline_placeholder.trim();
		// Split the entry into sentences.
		exports.sentences = function sentences(text, newline_boundary) {
		exports.sentences = function(text, newline_boundary) {

		if (text.length === 0)
		return [];

		text = sanitizeHtml(text, { "allowedTags" : [] });

		var i,index = 0;
		var temp = [];
		var index = 0;
		var temp = [];

		@@ -37,8 +39,8 @@ /** Preprocessing */

		for (i=0; i<words.length; i++) {
		for (var i=0, L=words.length; i < L; i++) {
		// Add the word to current sentence
		current.push(words[i]);

		if (Match.is_boundary_char(words[i]) \|\|
		String.ends_with_char(words[i], "?!") \|\|
		if (Match.isBoundaryChar(words[i]) \|\|
		String.endsWithChar(words[i], "?!") \|\|
		words[i] === newline_placeholder_t)
		@@ -56,21 +58,37 @@ {

		// A dot might indicate the end of a sentence
		if (String.ends_with_char(words[i], '.')) {
		// Single characters + dot are considered abbreviations
		if (words[i].length === 2) {
		continue;
		// A dot might indicate the end sentences
		// Exception: The next sentence starts with a word (non abbreviation)
		// that has a capital letter.
		if (String.endsWithChar(words[i], '.')) {
		// Check if the word is in the abbreviation list (without symbols)

		if (Match.isCommonAbbreviation(words[i])) {
		continue;
		}

		/** Check for abbreviations */
		// Check if there is a next word
		if (i+1 < L) {
		// This should be improved with machine learning

		// Custom dotted abbreviations (like K.L.M or I.C.T)
		if (String.is_dotted_abbreviation(words[i])) {
		continue;
		}
		// Next word starts with capital word, but current sentence is
		// quite short
		if (Match.isSentenceStarter(words[i+1])) {
		if (current.length < 6) {
		// Custom dotted abbreviations (like K.L.M or I.C.T)
		if (Match.isDottedAbbreviation(words[i]) \|\| Match.isCustomAbbreviation(words[i])) {
		continue;
		}
		}
		}
		else {
		// Skip ellipsis
		if (String.endsWith(words[i], "..")) {
		continue;
		}

		// Check if the word is in the abbr. list (without
		// the period and lowercased)
		var w = words[i].toLowerCase().slice(0, -1);
		if (abbreviations.indexOf(w) > -1) {
		continue;
		// Skip abbreviations
		if (Match.isDottedAbbreviation(words[i]) \|\| Match.isCustomAbbreviation(words[i])) {
		continue;
		}
		}
		}
		@@ -85,3 +103,3 @@
		if ((index = words[i].indexOf(".")) > -1) {
		if (Match.is_number(words[i], index)) {
		if (Match.isNumber(words[i], index)) {
		continue;
		@@ -91,3 +109,3 @@ }
		// Custom dotted abbreviations (like K.L.M or I.C.T)
		if (String.is_dotted_abbreviation(words[i])) {
		if (Match.isDottedAbbreviation(words[i])) {
		continue;
		@@ -97,3 +115,3 @@ }
		// Skip urls / emails and the like
		if (Match.is_url(words[i]) \|\| Match.is_phone_nr(words[i])) {
		if (Match.isURL(words[i]) \|\| Match.isPhoneNr(words[i])) {
		continue;
		@@ -103,3 +121,3 @@ }

		if (temp = Match.is_concatenated(words[i])) {
		if (temp = Match.isConcatenated(words[i])) {
		current.pop();
		@@ -106,0 +124,0 @@ current.push(temp[0]);

10

package.json

		{
		"name": "sbd",
		"version": "0.0.5",
		"description": "Split text into sentences",
		"version": "0.0.6",
		"description": "Split text into sentences with Sentence Boundary Detection (SBD).",
		"main": "lib/tokenizer.js",
		@@ -15,7 +15,3 @@ "scripts": {
		"keywords": [
		"sentence splitting",
		"splitting",
		"tokenize sentences",
		"sentences",
		"sentence"
		"sentence", "detection", "boundary"
		],
		@@ -22,0 +18,0 @@ "author": {

37

README.md

		@@ -1,14 +0,13 @@
		# Sentence Boundary Detection (SBD)
		Sentence Boundary Detection (SBD)
		==================

		Simple sentence detection (i.e working ~95% of the time):
		Split text into sentences with a `vanilla` rule-based approach (i.e., one that works ~95% of the time).


		* Split a text based on period, question- and exclamation marks.
		* Skips abbreviations
		* Skips numbers, currency
		* Skips urls, email address, phone nr.
		* Skips (most) abbreviations (Mr., Mrs., PhD.)
		* Skips numbers/currency
		* Skips urls, websites, email addresses, phone nr.
		* Counts ellipsis and ?! as single punctuation

		## Future work

		Currently, `sbd` fails to recognize sentences that end in an abbreviation, for example "The president lives in Washington, D.C.". I do not really see a viable option other than using a real classifier with proper training.

		## Installation
		@@ -26,13 +25,23 @@

		var text = "In I.C.T we have multiple challenges!
		This is a text of three sentences. Skip Mr. Money €10.00 right.";
		var text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.";
		var sentences = tokenizer.sentences(text);

		// [
		// 'On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S.',
		// 'Millions attended the Inauguration.',
		// ]


		var text = "Got any problems? Open an issue on github.com!";
		var sentences = tokenizer.sentences(text);

		// [
		// 'In I.C.T we have multiple challenges!',
		// 'This is a text of three sentences.',
		// 'Skip Mr. Money €10.00 right.'
		// 'Got any problems?',
		// 'Open an issue on github.com!',
		// ]
		```

		## Notes

		I cannot find a "test data set" to rate the performance, but I can imagine it needs a trained data set to help with difficult edge cases. For example, sentences that do end with an abbreviation.

data/_build.js

data/abbr.js

test/test.js

sbd - npm Package Compare versions

New alerts

Improved metrics

Worsened metrics