Comparing version 1.0.6 to 1.0.8
101
dist/sbd.js
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){ | ||
var abbreviations = [ | ||
"ie", | ||
var abbreviations; | ||
var englishAbbreviations = [ | ||
"al", | ||
"adj", | ||
"assn", | ||
"Ave", | ||
"BSc", "MSc", | ||
"Cell", | ||
"Ch", | ||
"Co", | ||
"cc", | ||
"Corp", | ||
"Dem", | ||
"Dept", | ||
"ed", | ||
"eg", | ||
"Eq", | ||
"Eqs", | ||
"est", | ||
"est", | ||
"etc", | ||
"Ex", | ||
"ext", // + number? | ||
@@ -11,47 +29,47 @@ "Fig", | ||
"figs", | ||
"et al", | ||
"Co", | ||
"Corp", | ||
"Ave", | ||
"i.e", | ||
"ie", | ||
"Inc", | ||
"Ex", | ||
"Viz", | ||
"vs", | ||
"Vs", | ||
"repr", | ||
"Rep", | ||
"Dem", | ||
"trans", | ||
"Vol", | ||
"inc", | ||
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec", | ||
"jr", | ||
"mi", | ||
"Miss", "Mrs", "Mr", "Ms", | ||
"Mol", | ||
"mt", | ||
"mts", | ||
"no", | ||
"Nos", | ||
"PhD", "MD", "BA", "MA", "MM", | ||
"pl", | ||
"pop", | ||
"pp", | ||
"rev", | ||
"est", | ||
"Prof", "Dr", | ||
"pt", | ||
"Ref", | ||
"Refs", | ||
"Eq", | ||
"Eqs", | ||
"Ch", | ||
"Rep", | ||
"repr", | ||
"rev", | ||
"Sec", | ||
"Secs", | ||
"mi", | ||
"Dept", | ||
"Sgt", "Col", "Gen", "Rep", "Sen",'Gov', "Lt", "Maj", "Capt","St", | ||
"Sr", "sr", "Jr", "jr", "Rev", | ||
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat", | ||
"trans", | ||
"Univ", | ||
"Nos", | ||
"No", | ||
"Mol", | ||
"Cell", | ||
"Viz", | ||
"Vol", | ||
"vs", | ||
"v", | ||
]; | ||
"Miss", "Mrs", "Mr", "Ms", | ||
"Prof", "Dr", | ||
"Sgt", "Col", "Gen", "Rep", "Sen",'Gov', "Lt", "Maj", "Capt","St", | ||
exports.setAbbreviations = function(abbr) { | ||
if(abbr){ | ||
abbreviations = abbr; | ||
} else { | ||
abbreviations = englishAbbreviations; | ||
} | ||
} | ||
"Sr", "Jr", "jr", "Rev", | ||
"PhD", "MD", "BA", "MA", "MM", | ||
"BSc", "MSc", | ||
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec", | ||
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat" | ||
]; | ||
exports.isCapitalized = function(str) { | ||
@@ -216,3 +234,4 @@ return /^[A-Z][a-z].*/.test(str) || this.isNumber(str); | ||
"sanitize" : false, | ||
"allowed_tags" : false | ||
"allowed_tags" : false, | ||
"abbreviations" : null | ||
}; | ||
@@ -231,2 +250,4 @@ | ||
Match.setAbbreviations(options.abbreviations); | ||
if (options.newline_boundaries) { | ||
@@ -233,0 +254,0 @@ text = text.replace(/\n+|[-#=_+*]{4,}/g, newline_placeholder); |
@@ -1,1 +0,1 @@ | ||
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer=f()}})(function(){var define,module,exports;return function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}({1:[function(require,module,exports){var abbreviations=["ie","eg","ext","Fig","fig","Figs","figs","et al","Co","Corp","Ave","Inc","Ex","Viz","vs","Vs","repr","Rep","Dem","trans","Vol","pp","rev","est","Ref","Refs","Eq","Eqs","Ch","Sec","Secs","mi","Dept","Univ","Nos","No","Mol","Cell","Miss","Mrs","Mr","Ms","Prof","Dr","Sgt","Col","Gen","Rep","Sen","Gov","Lt","Maj","Capt","St","Sr","Jr","jr","Rev","PhD","MD","BA","MA","MM","BSc","MSc","Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec","Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat"];exports.isCapitalized=function(str){return/^[A-Z][a-z].*/.test(str)||this.isNumber(str)};exports.isSentenceStarter=function(str){return this.isCapitalized(str)||/``|"|'/.test(str.substring(0,2))};exports.isCommonAbbreviation=function(str){return~abbreviations.indexOf(str.replace(/\W+/g,""))};exports.isTimeAbbreviation=function(word,next){if(word==="a.m."||word==="p.m."){var tmp=next.replace(/\W+/g,"").slice(-3).toLowerCase();if(tmp==="day"){return true}}return false};exports.isDottedAbbreviation=function(word){var matches=word.replace(/[\(\)\[\]\{\}]/g,"").match(/(.\.)*/);return matches&&matches[0].length>0};exports.isCustomAbbreviation=function(str){if(str.length<=3){return true}return this.isCapitalized(str)};exports.isNameAbbreviation=function(wordCount,words){if(words.length>0){if(wordCount<5&&words[0].length<6&&this.isCapitalized(words[0])){return true}var capitalized=words.filter(function(str){return/[A-Z]/.test(str.charAt(0))});return capitalized.length>=3}return false};exports.isNumber=function(str,dotPos){if(dotPos){str=str.slice(dotPos-1,dotPos+2)}return!isNaN(str)};exports.isPhoneNr=function(str){return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/)};exports.isURL=function(str){return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)/)};exports.isConcatenated=function(word){var i=0;if((i=word.indexOf("."))>-1||(i=word.indexOf("!"))>-1||(i=word.indexOf("?"))>-1){var c=word.charAt(i+1);if(c.match(/[a-zA-Z].*/)){return[word.slice(0,i),word.slice(i+1)]}}return false};exports.isBoundaryChar=function(word){return word==="."||word==="!"||word==="?"}},{}],2:[function(require,module,exports){exports.endsWithChar=function ends_with_char(word,c){if(c.length>1){return c.indexOf(word.slice(-1))>-1}return word.slice(-1)===c};exports.endsWith=function ends_with(word,end){return word.slice(word.length-end.length)===end}},{}],3:[function(require,module,exports){module.exports=function sanitizeHtml(text,opts){if(typeof text=="string"||text instanceof String){var $div=document.createElement("DIV");$div.innerHTML=text;text=($div.textContent||"").trim()}else if(typeof text==="object"&&text.textContent){text=(text.textContent||"").trim()}return text}},{}],4:[function(require,module,exports){"use strict";var sanitizeHtml=require("sanitize-html");var String=require("./String");var Match=require("./Match");var newline_placeholder=" @~@ ";var newline_placeholder_t=newline_placeholder.trim();exports.sentences=function(text,user_options){if(!text||typeof text==="undefined"||text.length===0){return[]}var options={newline_boundaries:false,html_boundaries:false,sanitize:false,allowed_tags:false};if(typeof user_options==="boolean"){options.newline_boundaries=true}else{for(var k in user_options){options[k]=user_options[k]}}if(options.newline_boundaries){text=text.replace(/\n+|[-#=_+*]{4,}/g,newline_placeholder)}if(options.html_boundaries){text=text.replace(/(<br\s*\/?>|<\/[p|div|ul|ol]>)/g,"$1"+newline_placeholder)}if(options.sanitize||options.allowed_tags){if(!options.allowed_tags){options.allowed_tags=[""]}text=sanitizeHtml(text,{allowedTags:options.allowed_tags})}var words=text.match(/\S+/g);var wordCount=0;var index=0;var temp=[];var sentences=[];var current=[];for(var i=0,L=words.length;i<L;i++){wordCount++;current.push(words[i]);if(~words[i].indexOf(",")){wordCount=0}if(Match.isBoundaryChar(words[i])||String.endsWithChar(words[i],"?!")||words[i]===newline_placeholder_t){if((options.newline_boundaries||options.html_boundaries)&&words[i]===newline_placeholder_t){current.pop()}sentences.push(current);wordCount=0;current=[];continue}if(String.endsWithChar(words[i],'"')||String.endsWithChar(words[i],"”")){words[i]=words[i].slice(0,-1)}if(String.endsWithChar(words[i],".")){if(i+1<L){if(words[i].length===2&&isNaN(words[i].charAt(0))){continue}if(Match.isCommonAbbreviation(words[i])){continue}if(Match.isSentenceStarter(words[i+1])){if(Match.isTimeAbbreviation(words[i],words[i+1])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,6))){continue}if(Match.isNumber(words[i+1])){if(Match.isCustomAbbreviation(words[i])){continue}}}else{if(String.endsWith(words[i],"..")){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,5))){continue}}}sentences.push(current);current=[];wordCount=0;continue}if((index=words[i].indexOf("."))>-1){if(Match.isNumber(words[i],index)){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isURL(words[i])||Match.isPhoneNr(words[i])){continue}}if(temp=Match.isConcatenated(words[i])){current.pop();current.push(temp[0]);sentences.push(current);current=[];wordCount=0;current.push(temp[1])}}if(current.length){sentences.push(current)}var result=[];var sentence="";sentences=sentences.filter(function(s){return s.length>0});for(var i=0;i<sentences.length;i++){sentence=sentences[i].join(" ");if(sentences[i].length===1&&sentences[i][0].length<4&&sentences[i][0].indexOf(".")>-1){if(sentences[i+1]&&sentences[i+1][0].indexOf(".")<0){sentence+=" "+sentences[i+1].join(" ");i++}}result.push(sentence)}return result}},{"./Match":1,"./String":2,"sanitize-html":3}]},{},[4])(4)}); | ||
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer=f()}})(function(){var define,module,exports;return function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}({1:[function(require,module,exports){var abbreviations;var englishAbbreviations=["al","adj","assn","Ave","BSc","MSc","Cell","Ch","Co","cc","Corp","Dem","Dept","ed","eg","Eq","Eqs","est","est","etc","Ex","ext","Fig","fig","Figs","figs","i.e","ie","Inc","inc","Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec","jr","mi","Miss","Mrs","Mr","Ms","Mol","mt","mts","no","Nos","PhD","MD","BA","MA","MM","pl","pop","pp","Prof","Dr","pt","Ref","Refs","Rep","repr","rev","Sec","Secs","Sgt","Col","Gen","Rep","Sen","Gov","Lt","Maj","Capt","St","Sr","sr","Jr","jr","Rev","Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat","trans","Univ","Viz","Vol","vs","v"];exports.setAbbreviations=function(abbr){if(abbr){abbreviations=abbr}else{abbreviations=englishAbbreviations}};exports.isCapitalized=function(str){return/^[A-Z][a-z].*/.test(str)||this.isNumber(str)};exports.isSentenceStarter=function(str){return this.isCapitalized(str)||/``|"|'/.test(str.substring(0,2))};exports.isCommonAbbreviation=function(str){return~abbreviations.indexOf(str.replace(/\W+/g,""))};exports.isTimeAbbreviation=function(word,next){if(word==="a.m."||word==="p.m."){var tmp=next.replace(/\W+/g,"").slice(-3).toLowerCase();if(tmp==="day"){return true}}return false};exports.isDottedAbbreviation=function(word){var matches=word.replace(/[\(\)\[\]\{\}]/g,"").match(/(.\.)*/);return matches&&matches[0].length>0};exports.isCustomAbbreviation=function(str){if(str.length<=3){return true}return this.isCapitalized(str)};exports.isNameAbbreviation=function(wordCount,words){if(words.length>0){if(wordCount<5&&words[0].length<6&&this.isCapitalized(words[0])){return true}var capitalized=words.filter(function(str){return/[A-Z]/.test(str.charAt(0))});return capitalized.length>=3}return false};exports.isNumber=function(str,dotPos){if(dotPos){str=str.slice(dotPos-1,dotPos+2)}return!isNaN(str)};exports.isPhoneNr=function(str){return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/)};exports.isURL=function(str){return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)/)};exports.isConcatenated=function(word){var i=0;if((i=word.indexOf("."))>-1||(i=word.indexOf("!"))>-1||(i=word.indexOf("?"))>-1){var c=word.charAt(i+1);if(c.match(/[a-zA-Z].*/)){return[word.slice(0,i),word.slice(i+1)]}}return false};exports.isBoundaryChar=function(word){return word==="."||word==="!"||word==="?"}},{}],2:[function(require,module,exports){exports.endsWithChar=function ends_with_char(word,c){if(c.length>1){return c.indexOf(word.slice(-1))>-1}return word.slice(-1)===c};exports.endsWith=function ends_with(word,end){return word.slice(word.length-end.length)===end}},{}],3:[function(require,module,exports){module.exports=function sanitizeHtml(text,opts){if(typeof text=="string"||text instanceof String){var $div=document.createElement("DIV");$div.innerHTML=text;text=($div.textContent||"").trim()}else if(typeof text==="object"&&text.textContent){text=(text.textContent||"").trim()}return text}},{}],4:[function(require,module,exports){"use strict";var sanitizeHtml=require("sanitize-html");var String=require("./String");var Match=require("./Match");var newline_placeholder=" @~@ ";var newline_placeholder_t=newline_placeholder.trim();exports.sentences=function(text,user_options){if(!text||typeof text==="undefined"||text.length===0){return[]}var options={newline_boundaries:false,html_boundaries:false,sanitize:false,allowed_tags:false,abbreviations:null};if(typeof user_options==="boolean"){options.newline_boundaries=true}else{for(var k in user_options){options[k]=user_options[k]}}Match.setAbbreviations(options.abbreviations);if(options.newline_boundaries){text=text.replace(/\n+|[-#=_+*]{4,}/g,newline_placeholder)}if(options.html_boundaries){text=text.replace(/(<br\s*\/?>|<\/[p|div|ul|ol]>)/g,"$1"+newline_placeholder)}if(options.sanitize||options.allowed_tags){if(!options.allowed_tags){options.allowed_tags=[""]}text=sanitizeHtml(text,{allowedTags:options.allowed_tags})}var words=text.match(/\S+/g);var wordCount=0;var index=0;var temp=[];var sentences=[];var current=[];for(var i=0,L=words.length;i<L;i++){wordCount++;current.push(words[i]);if(~words[i].indexOf(",")){wordCount=0}if(Match.isBoundaryChar(words[i])||String.endsWithChar(words[i],"?!")||words[i]===newline_placeholder_t){if((options.newline_boundaries||options.html_boundaries)&&words[i]===newline_placeholder_t){current.pop()}sentences.push(current);wordCount=0;current=[];continue}if(String.endsWithChar(words[i],'"')||String.endsWithChar(words[i],"”")){words[i]=words[i].slice(0,-1)}if(String.endsWithChar(words[i],".")){if(i+1<L){if(words[i].length===2&&isNaN(words[i].charAt(0))){continue}if(Match.isCommonAbbreviation(words[i])){continue}if(Match.isSentenceStarter(words[i+1])){if(Match.isTimeAbbreviation(words[i],words[i+1])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,6))){continue}if(Match.isNumber(words[i+1])){if(Match.isCustomAbbreviation(words[i])){continue}}}else{if(String.endsWith(words[i],"..")){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,5))){continue}}}sentences.push(current);current=[];wordCount=0;continue}if((index=words[i].indexOf("."))>-1){if(Match.isNumber(words[i],index)){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isURL(words[i])||Match.isPhoneNr(words[i])){continue}}if(temp=Match.isConcatenated(words[i])){current.pop();current.push(temp[0]);sentences.push(current);current=[];wordCount=0;current.push(temp[1])}}if(current.length){sentences.push(current)}var result=[];var sentence="";sentences=sentences.filter(function(s){return s.length>0});for(var i=0;i<sentences.length;i++){sentence=sentences[i].join(" ");if(sentences[i].length===1&&sentences[i][0].length<4&&sentences[i][0].indexOf(".")>-1){if(sentences[i+1]&&sentences[i+1][0].indexOf(".")<0){sentence+=" "+sentences[i+1].join(" ");i++}}result.push(sentence)}return result}},{"./Match":1,"./String":2,"sanitize-html":3}]},{},[4])(4)}); |
@@ -1,5 +0,23 @@ | ||
var abbreviations = [ | ||
"ie", | ||
var abbreviations; | ||
var englishAbbreviations = [ | ||
"al", | ||
"adj", | ||
"assn", | ||
"Ave", | ||
"BSc", "MSc", | ||
"Cell", | ||
"Ch", | ||
"Co", | ||
"cc", | ||
"Corp", | ||
"Dem", | ||
"Dept", | ||
"ed", | ||
"eg", | ||
"Eq", | ||
"Eqs", | ||
"est", | ||
"est", | ||
"etc", | ||
"Ex", | ||
"ext", // + number? | ||
@@ -10,47 +28,47 @@ "Fig", | ||
"figs", | ||
"et al", | ||
"Co", | ||
"Corp", | ||
"Ave", | ||
"i.e", | ||
"ie", | ||
"Inc", | ||
"Ex", | ||
"Viz", | ||
"vs", | ||
"Vs", | ||
"repr", | ||
"Rep", | ||
"Dem", | ||
"trans", | ||
"Vol", | ||
"inc", | ||
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec", | ||
"jr", | ||
"mi", | ||
"Miss", "Mrs", "Mr", "Ms", | ||
"Mol", | ||
"mt", | ||
"mts", | ||
"no", | ||
"Nos", | ||
"PhD", "MD", "BA", "MA", "MM", | ||
"pl", | ||
"pop", | ||
"pp", | ||
"rev", | ||
"est", | ||
"Prof", "Dr", | ||
"pt", | ||
"Ref", | ||
"Refs", | ||
"Eq", | ||
"Eqs", | ||
"Ch", | ||
"Rep", | ||
"repr", | ||
"rev", | ||
"Sec", | ||
"Secs", | ||
"mi", | ||
"Dept", | ||
"Sgt", "Col", "Gen", "Rep", "Sen",'Gov', "Lt", "Maj", "Capt","St", | ||
"Sr", "sr", "Jr", "jr", "Rev", | ||
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat", | ||
"trans", | ||
"Univ", | ||
"Nos", | ||
"No", | ||
"Mol", | ||
"Cell", | ||
"Viz", | ||
"Vol", | ||
"vs", | ||
"v", | ||
]; | ||
"Miss", "Mrs", "Mr", "Ms", | ||
"Prof", "Dr", | ||
"Sgt", "Col", "Gen", "Rep", "Sen",'Gov', "Lt", "Maj", "Capt","St", | ||
exports.setAbbreviations = function(abbr) { | ||
if(abbr){ | ||
abbreviations = abbr; | ||
} else { | ||
abbreviations = englishAbbreviations; | ||
} | ||
} | ||
"Sr", "Jr", "jr", "Rev", | ||
"PhD", "MD", "BA", "MA", "MM", | ||
"BSc", "MSc", | ||
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec", | ||
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat" | ||
]; | ||
exports.isCapitalized = function(str) { | ||
@@ -57,0 +75,0 @@ return /^[A-Z][a-z].*/.test(str) || this.isNumber(str); |
@@ -23,3 +23,4 @@ /*jshint node:true, laxcomma:true */ | ||
"sanitize" : false, | ||
"allowed_tags" : false | ||
"allowed_tags" : false, | ||
"abbreviations" : null | ||
}; | ||
@@ -38,2 +39,4 @@ | ||
Match.setAbbreviations(options.abbreviations); | ||
if (options.newline_boundaries) { | ||
@@ -40,0 +43,0 @@ text = text.replace(/\n+|[-#=_+*]{4,}/g, newline_placeholder); |
{ | ||
"name": "sbd", | ||
"version": "1.0.6", | ||
"version": "1.0.8", | ||
"description": "Split text into sentences with Sentence Boundary Detection (SBD).", | ||
@@ -28,3 +28,3 @@ "main": "lib/tokenizer.js", | ||
"devDependencies": { | ||
"mocha": "1.7.x" | ||
"mocha": "3.0.x" | ||
}, | ||
@@ -31,0 +31,0 @@ "dependencies": { |
@@ -41,3 +41,4 @@ Sentence Boundary Detection (SBD) | ||
"sanitize" : false, | ||
"allowed_tags" : false | ||
"allowed_tags" : false, | ||
"abbreviations" : null | ||
}; | ||
@@ -50,2 +51,3 @@ ``` | ||
* `allowed_tags`: To sanitize html, the library [santize-html](https://github.com/punkave/sanitize-html) is used. You can pass the allowed tags option. | ||
* `abbreviations`: list of abbreviations to override the original ones for use with other languages. Don't put dots in abbreviations. | ||
@@ -59,1 +61,10 @@ | ||
If you feel something is missing, you can open an issue stating the problem sentence and desired result. If code is unclear give me a @mention. Pull requests are welcome. | ||
## Building the (minified) scripts | ||
``` | ||
npm install -g browserify | ||
npm run-script build | ||
``` |
@@ -45,2 +45,42 @@ /*jshint node:true, laxcomma:true */ | ||
}); | ||
}); | ||
describe('Skip two worded abbreviations', function () { | ||
var entry = "Claims 1–6 and 15–26 are rejected under pre-AIA 35 USC § 103(a) as being unpatentable over Chalana et al. (US 2012/0179503) in view of Oh (US 2013/0013993)."; | ||
var sentences = tokenizer.sentences(entry); | ||
it("should get 1 sentence", function () { | ||
assert.equal(sentences.length, 1); | ||
}); | ||
}); | ||
describe('Skip two worded abbreviations', function () { | ||
var entry = "Et al. is an abbreviation of the Latin loanphrase et alii, meaning and others. It is similar to etc. (short for et cetera, meaning and the rest), but whereas etc. applies to things, et al. applies to people."; | ||
var sentences = tokenizer.sentences(entry); | ||
console.log(sentences) | ||
it("should get 2 sentences", function () { | ||
assert.equal(sentences.length, 2); | ||
}); | ||
}); | ||
describe('Use other languages', function () { | ||
var entry = "Trzeba tu coś napisać, np. fragment odnoszący się do pkt. 3 wcześniejszego tekstu."; | ||
var sentencesEN = tokenizer.sentences(entry); | ||
var sentencesPL = tokenizer.sentences(entry,{abbreviations:["np","pkt"]}); | ||
it("should get 1 sentence", function () { | ||
console.log("#",sentencesEN, sentencesPL, sentencesPL.length) | ||
assert.equal(sentencesEN.length, 3); | ||
assert.equal(sentencesPL.length, 1); | ||
}); | ||
it("should not permanently override abbreviations", function() { | ||
var sentences = tokenizer.sentences(entry); | ||
assert.equal(sentences.length, 3); | ||
}) | ||
}); | ||
}); |
60579
1108
67