Comparing version 1.0.5 to 1.0.6
@@ -89,4 +89,5 @@ (function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){ | ||
exports.isCustomAbbreviation = function(str) { | ||
if (str.length <= 3) | ||
if (str.length <= 3) { | ||
return true; | ||
} | ||
@@ -234,3 +235,3 @@ return this.isCapitalized(str); | ||
if (options.html_boundaries) { | ||
text = text.replace(/(<br \/>)/g, "$1" + newline_placeholder); | ||
text = text.replace(/(<br\s*\/?>|<\/[p|div|ul|ol]>)/g, "$1" + newline_placeholder); | ||
} | ||
@@ -291,7 +292,5 @@ | ||
if (String.endsWithChar(words[i], '.')) { | ||
// Check if there is a next word | ||
// This probably needs to be improved with machine learning | ||
if (i+1 < L) { | ||
// This should be improved with machine learning | ||
// Single character abbr. | ||
@@ -319,4 +318,6 @@ if (words[i].length === 2 && isNaN(words[i].charAt(0))) { | ||
if (Match.isNumber(words[i+1]) && Match.isCustomAbbreviation(words[i])) { | ||
continue; | ||
if (Match.isNumber(words[i+1])) { | ||
if (Match.isCustomAbbreviation(words[i])) { | ||
continue; | ||
} | ||
} | ||
@@ -332,5 +333,9 @@ } | ||
// Short words + dot or a dot after each letter | ||
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) { | ||
if (Match.isDottedAbbreviation(words[i])) { | ||
continue; | ||
} | ||
if (Match.isNameAbbreviation(wordCount, words.slice(i, 5))) { | ||
continue; | ||
} | ||
} | ||
@@ -374,4 +379,5 @@ } | ||
if (current.length) | ||
if (current.length) { | ||
sentences.push(current); | ||
} | ||
@@ -387,3 +393,3 @@ /** After processing */ | ||
for (i=0; i < sentences.length; i++) { | ||
for (var i=0; i < sentences.length; i++) { | ||
sentence = sentences[i].join(" "); | ||
@@ -390,0 +396,0 @@ |
@@ -1,1 +0,1 @@ | ||
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer=f()}})(function(){var define,module,exports;return function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}({1:[function(require,module,exports){var abbreviations=["ie","eg","ext","Fig","fig","Figs","figs","et al","Co","Corp","Ave","Inc","Ex","Viz","vs","Vs","repr","Rep","Dem","trans","Vol","pp","rev","est","Ref","Refs","Eq","Eqs","Ch","Sec","Secs","mi","Dept","Univ","Nos","No","Mol","Cell","Miss","Mrs","Mr","Ms","Prof","Dr","Sgt","Col","Gen","Rep","Sen","Gov","Lt","Maj","Capt","St","Sr","Jr","jr","Rev","PhD","MD","BA","MA","MM","BSc","MSc","Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec","Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat"];exports.isCapitalized=function(str){return/^[A-Z][a-z].*/.test(str)||this.isNumber(str)};exports.isSentenceStarter=function(str){return this.isCapitalized(str)||/``|"|'/.test(str.substring(0,2))};exports.isCommonAbbreviation=function(str){return~abbreviations.indexOf(str.replace(/\W+/g,""))};exports.isTimeAbbreviation=function(word,next){if(word==="a.m."||word==="p.m."){var tmp=next.replace(/\W+/g,"").slice(-3).toLowerCase();if(tmp==="day"){return true}}return false};exports.isDottedAbbreviation=function(word){var matches=word.replace(/[\(\)\[\]\{\}]/g,"").match(/(.\.)*/);return 
matches&&matches[0].length>0};exports.isCustomAbbreviation=function(str){if(str.length<=3)return true;return this.isCapitalized(str)};exports.isNameAbbreviation=function(wordCount,words){if(words.length>0){if(wordCount<5&&words[0].length<6&&this.isCapitalized(words[0])){return true}var capitalized=words.filter(function(str){return/[A-Z]/.test(str.charAt(0))});return capitalized.length>=3}return false};exports.isNumber=function(str,dotPos){if(dotPos){str=str.slice(dotPos-1,dotPos+2)}return!isNaN(str)};exports.isPhoneNr=function(str){return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/)};exports.isURL=function(str){return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)/)};exports.isConcatenated=function(word){var i=0;if((i=word.indexOf("."))>-1||(i=word.indexOf("!"))>-1||(i=word.indexOf("?"))>-1){var c=word.charAt(i+1);if(c.match(/[a-zA-Z].*/)){return[word.slice(0,i),word.slice(i+1)]}}return false};exports.isBoundaryChar=function(word){return word==="."||word==="!"||word==="?"}},{}],2:[function(require,module,exports){exports.endsWithChar=function ends_with_char(word,c){if(c.length>1){return c.indexOf(word.slice(-1))>-1}return word.slice(-1)===c};exports.endsWith=function ends_with(word,end){return word.slice(word.length-end.length)===end}},{}],3:[function(require,module,exports){module.exports=function sanitizeHtml(text,opts){if(typeof text=="string"||text instanceof String){var $div=document.createElement("DIV");$div.innerHTML=text;text=($div.textContent||"").trim()}else if(typeof text==="object"&&text.textContent){text=(text.textContent||"").trim()}return text}},{}],4:[function(require,module,exports){"use strict";var sanitizeHtml=require("sanitize-html");var String=require("./String");var 
Match=require("./Match");var newline_placeholder=" @~@ ";var newline_placeholder_t=newline_placeholder.trim();exports.sentences=function(text,user_options){if(!text||typeof text==="undefined"||text.length===0){return[]}var options={newline_boundaries:false,html_boundaries:false,sanitize:false,allowed_tags:false};if(typeof user_options==="boolean"){options.newline_boundaries=true}else{for(var k in user_options){options[k]=user_options[k]}}if(options.newline_boundaries){text=text.replace(/\n+|[-#=_+*]{4,}/g,newline_placeholder)}if(options.html_boundaries){text=text.replace(/(<br \/>)/g,"$1"+newline_placeholder)}if(options.sanitize||options.allowed_tags){if(!options.allowed_tags){options.allowed_tags=[""]}text=sanitizeHtml(text,{allowedTags:options.allowed_tags})}var words=text.match(/\S+/g);var wordCount=0;var index=0;var temp=[];var sentences=[];var current=[];for(var i=0,L=words.length;i<L;i++){wordCount++;current.push(words[i]);if(~words[i].indexOf(",")){wordCount=0}if(Match.isBoundaryChar(words[i])||String.endsWithChar(words[i],"?!")||words[i]===newline_placeholder_t){if((options.newline_boundaries||options.html_boundaries)&&words[i]===newline_placeholder_t){current.pop()}sentences.push(current);wordCount=0;current=[];continue}if(String.endsWithChar(words[i],'"')||String.endsWithChar(words[i],"”")){words[i]=words[i].slice(0,-1)}if(String.endsWithChar(words[i],".")){if(i+1<L){if(words[i].length===2&&isNaN(words[i].charAt(0))){continue}if(Match.isCommonAbbreviation(words[i])){continue}if(Match.isSentenceStarter(words[i+1])){if(Match.isTimeAbbreviation(words[i],words[i+1])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,6))){continue}if(Match.isNumber(words[i+1])&&Match.isCustomAbbreviation(words[i])){continue}}else{if(String.endsWith(words[i],"..")){continue}if(Match.isDottedAbbreviation(words[i])||Match.isCustomAbbreviation(words[i])){continue}}}sentences.push(current);current=[];wordCount=0;continue}if((index=words[i].indexOf("."))>-1){if(Match.isNum
ber(words[i],index)){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isURL(words[i])||Match.isPhoneNr(words[i])){continue}}if(temp=Match.isConcatenated(words[i])){current.pop();current.push(temp[0]);sentences.push(current);current=[];wordCount=0;current.push(temp[1])}}if(current.length)sentences.push(current);var result=[];var sentence="";sentences=sentences.filter(function(s){return s.length>0});for(i=0;i<sentences.length;i++){sentence=sentences[i].join(" ");if(sentences[i].length===1&&sentences[i][0].length<4&&sentences[i][0].indexOf(".")>-1){if(sentences[i+1]&&sentences[i+1][0].indexOf(".")<0){sentence+=" "+sentences[i+1].join(" ");i++}}result.push(sentence)}return result}},{"./Match":1,"./String":2,"sanitize-html":3}]},{},[4])(4)}); | ||
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer=f()}})(function(){var define,module,exports;return function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}({1:[function(require,module,exports){var abbreviations=["ie","eg","ext","Fig","fig","Figs","figs","et al","Co","Corp","Ave","Inc","Ex","Viz","vs","Vs","repr","Rep","Dem","trans","Vol","pp","rev","est","Ref","Refs","Eq","Eqs","Ch","Sec","Secs","mi","Dept","Univ","Nos","No","Mol","Cell","Miss","Mrs","Mr","Ms","Prof","Dr","Sgt","Col","Gen","Rep","Sen","Gov","Lt","Maj","Capt","St","Sr","Jr","jr","Rev","PhD","MD","BA","MA","MM","BSc","MSc","Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec","Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat"];exports.isCapitalized=function(str){return/^[A-Z][a-z].*/.test(str)||this.isNumber(str)};exports.isSentenceStarter=function(str){return this.isCapitalized(str)||/``|"|'/.test(str.substring(0,2))};exports.isCommonAbbreviation=function(str){return~abbreviations.indexOf(str.replace(/\W+/g,""))};exports.isTimeAbbreviation=function(word,next){if(word==="a.m."||word==="p.m."){var tmp=next.replace(/\W+/g,"").slice(-3).toLowerCase();if(tmp==="day"){return true}}return false};exports.isDottedAbbreviation=function(word){var matches=word.replace(/[\(\)\[\]\{\}]/g,"").match(/(.\.)*/);return 
matches&&matches[0].length>0};exports.isCustomAbbreviation=function(str){if(str.length<=3){return true}return this.isCapitalized(str)};exports.isNameAbbreviation=function(wordCount,words){if(words.length>0){if(wordCount<5&&words[0].length<6&&this.isCapitalized(words[0])){return true}var capitalized=words.filter(function(str){return/[A-Z]/.test(str.charAt(0))});return capitalized.length>=3}return false};exports.isNumber=function(str,dotPos){if(dotPos){str=str.slice(dotPos-1,dotPos+2)}return!isNaN(str)};exports.isPhoneNr=function(str){return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/)};exports.isURL=function(str){return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)/)};exports.isConcatenated=function(word){var i=0;if((i=word.indexOf("."))>-1||(i=word.indexOf("!"))>-1||(i=word.indexOf("?"))>-1){var c=word.charAt(i+1);if(c.match(/[a-zA-Z].*/)){return[word.slice(0,i),word.slice(i+1)]}}return false};exports.isBoundaryChar=function(word){return word==="."||word==="!"||word==="?"}},{}],2:[function(require,module,exports){exports.endsWithChar=function ends_with_char(word,c){if(c.length>1){return c.indexOf(word.slice(-1))>-1}return word.slice(-1)===c};exports.endsWith=function ends_with(word,end){return word.slice(word.length-end.length)===end}},{}],3:[function(require,module,exports){module.exports=function sanitizeHtml(text,opts){if(typeof text=="string"||text instanceof String){var $div=document.createElement("DIV");$div.innerHTML=text;text=($div.textContent||"").trim()}else if(typeof text==="object"&&text.textContent){text=(text.textContent||"").trim()}return text}},{}],4:[function(require,module,exports){"use strict";var sanitizeHtml=require("sanitize-html");var String=require("./String");var 
Match=require("./Match");var newline_placeholder=" @~@ ";var newline_placeholder_t=newline_placeholder.trim();exports.sentences=function(text,user_options){if(!text||typeof text==="undefined"||text.length===0){return[]}var options={newline_boundaries:false,html_boundaries:false,sanitize:false,allowed_tags:false};if(typeof user_options==="boolean"){options.newline_boundaries=true}else{for(var k in user_options){options[k]=user_options[k]}}if(options.newline_boundaries){text=text.replace(/\n+|[-#=_+*]{4,}/g,newline_placeholder)}if(options.html_boundaries){text=text.replace(/(<br\s*\/?>|<\/[p|div|ul|ol]>)/g,"$1"+newline_placeholder)}if(options.sanitize||options.allowed_tags){if(!options.allowed_tags){options.allowed_tags=[""]}text=sanitizeHtml(text,{allowedTags:options.allowed_tags})}var words=text.match(/\S+/g);var wordCount=0;var index=0;var temp=[];var sentences=[];var current=[];for(var i=0,L=words.length;i<L;i++){wordCount++;current.push(words[i]);if(~words[i].indexOf(",")){wordCount=0}if(Match.isBoundaryChar(words[i])||String.endsWithChar(words[i],"?!")||words[i]===newline_placeholder_t){if((options.newline_boundaries||options.html_boundaries)&&words[i]===newline_placeholder_t){current.pop()}sentences.push(current);wordCount=0;current=[];continue}if(String.endsWithChar(words[i],'"')||String.endsWithChar(words[i],"”")){words[i]=words[i].slice(0,-1)}if(String.endsWithChar(words[i],".")){if(i+1<L){if(words[i].length===2&&isNaN(words[i].charAt(0))){continue}if(Match.isCommonAbbreviation(words[i])){continue}if(Match.isSentenceStarter(words[i+1])){if(Match.isTimeAbbreviation(words[i],words[i+1])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,6))){continue}if(Match.isNumber(words[i+1])){if(Match.isCustomAbbreviation(words[i])){continue}}}else{if(String.endsWith(words[i],"..")){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,5))){continue}}}sentences.push(current);current=[];wordCount=0;continu
e}if((index=words[i].indexOf("."))>-1){if(Match.isNumber(words[i],index)){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isURL(words[i])||Match.isPhoneNr(words[i])){continue}}if(temp=Match.isConcatenated(words[i])){current.pop();current.push(temp[0]);sentences.push(current);current=[];wordCount=0;current.push(temp[1])}}if(current.length){sentences.push(current)}var result=[];var sentence="";sentences=sentences.filter(function(s){return s.length>0});for(var i=0;i<sentences.length;i++){sentence=sentences[i].join(" ");if(sentences[i].length===1&&sentences[i][0].length<4&&sentences[i][0].indexOf(".")>-1){if(sentences[i+1]&&sentences[i+1][0].indexOf(".")<0){sentence+=" "+sentences[i+1].join(" ");i++}}result.push(sentence)}return result}},{"./Match":1,"./String":2,"sanitize-html":3}]},{},[4])(4)}); |
@@ -88,4 +88,5 @@ | ||
exports.isCustomAbbreviation = function(str) { | ||
if (str.length <= 3) | ||
if (str.length <= 3) { | ||
return true; | ||
} | ||
@@ -92,0 +93,0 @@ return this.isCapitalized(str); |
@@ -42,3 +42,3 @@ /*jshint node:true, laxcomma:true */ | ||
if (options.html_boundaries) { | ||
text = text.replace(/(<br \/>)/g, "$1" + newline_placeholder); | ||
text = text.replace(/(<br\s*\/?>|<\/[p|div|ul|ol]>)/g, "$1" + newline_placeholder); | ||
} | ||
@@ -99,7 +99,5 @@ | ||
if (String.endsWithChar(words[i], '.')) { | ||
// Check if there is a next word | ||
// This probably needs to be improved with machine learning | ||
if (i+1 < L) { | ||
// This should be improved with machine learning | ||
// Single character abbr. | ||
@@ -127,4 +125,6 @@ if (words[i].length === 2 && isNaN(words[i].charAt(0))) { | ||
if (Match.isNumber(words[i+1]) && Match.isCustomAbbreviation(words[i])) { | ||
continue; | ||
if (Match.isNumber(words[i+1])) { | ||
if (Match.isCustomAbbreviation(words[i])) { | ||
continue; | ||
} | ||
} | ||
@@ -140,5 +140,9 @@ } | ||
// Short words + dot or a dot after each letter | ||
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) { | ||
if (Match.isDottedAbbreviation(words[i])) { | ||
continue; | ||
} | ||
if (Match.isNameAbbreviation(wordCount, words.slice(i, 5))) { | ||
continue; | ||
} | ||
} | ||
@@ -182,4 +186,5 @@ } | ||
if (current.length) | ||
if (current.length) { | ||
sentences.push(current); | ||
} | ||
@@ -195,3 +200,3 @@ /** After processing */ | ||
for (i=0; i < sentences.length; i++) { | ||
for (var i=0; i < sentences.length; i++) { | ||
sentence = sentences[i].join(" "); | ||
@@ -198,0 +203,0 @@ |
{ | ||
"name": "sbd", | ||
"version": "1.0.5", | ||
"version": "1.0.6", | ||
"description": "Split text into sentences with Sentence Boundary Detection (SBD).", | ||
@@ -18,3 +18,5 @@ "main": "lib/tokenizer.js", | ||
"keywords": [ | ||
"sentence", "detection", "boundary" | ||
"sentence", | ||
"detection", | ||
"boundary" | ||
], | ||
@@ -21,0 +23,0 @@ "author": { |
@@ -10,3 +10,3 @@ /*jshint node:true, laxcomma:true */ | ||
describe('HTML markup is ignored', function () { | ||
describe('HTML markup can be removed', function () { | ||
var entry = "<p>Hello this is my first sentence.</p> <br><br>There is also a second down the page."; | ||
@@ -35,10 +35,9 @@ var sentences = tokenizer.sentences(entry, { "sanitize": true }); | ||
describe('Closing html boundaries (br, p, div) split sentences.', function () { | ||
var entry = "What the Experts Say <br />In certain circumstances, “working for a manager who’s task-oriented and has a high need for achievement can be motivating,” says Linda Hill"; | ||
var entry = "What the Experts Say <br /> <p>In certain circumstances:</p> “working for a manager who’s task-oriented and has a high need for achievement can be motivating,” says Linda Hill"; | ||
var sentences = tokenizer.sentences(entry, { sanitize: false, "html_boundaries": true }); | ||
it("should get 2 sentences", function () { | ||
assert.equal(sentences.length, 2); | ||
it("should get 3 sentences", function () { | ||
assert.equal(sentences.length, 3); | ||
}); | ||
}); | ||
}); |
@@ -146,2 +146,13 @@ /*jshint node:true, laxcomma:true */ | ||
// The first sentence of the fixture ends with a capitalized name ("Hill.")
// and is followed directly by another capitalized word; the tokenizer must
// still split there.
describe('Sentences with a name ending a sentence', function () {
    var text = `If your boss assumes he can interrupt you any time and it’s "impacting the way you do your job," you should communicate that "you feel stretched," says Hill. A growing body of research shows that being “always on” hurts results.`;
    var result = tokenizer.sentences(text, { "newline_boundaries": true });

    it("should get 2 sentences", function () {
        assert.equal(result.length, 2);
    });
});
describe('If newlines are boundaries (B)', function () { | ||
@@ -148,0 +159,0 @@ var entry = "FAMILIY HISTORY ========================================== Nothing interesting"; |
57430
1057