Comparing version 1.0.4 to 1.0.5
@@ -1,2 +0,2 @@ | ||
!function(e){if("object"==typeof exports&&"undefined"!=typeof module)module.exports=e();else if("function"==typeof define&&define.amd)define([],e);else{var f;"undefined"!=typeof window?f=window:"undefined"!=typeof global?f=global:"undefined"!=typeof self&&(f=self),f.tokenizer=e()}}(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){ | ||
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){ | ||
@@ -203,29 +203,51 @@ var abbreviations = [ | ||
// Split the entry into sentences. | ||
exports.sentences = function(text, newline_boundary) { | ||
if (text.length === 0) | ||
exports.sentences = function(text, user_options) { | ||
if (!text || typeof text === "undefined" || text.length === 0) { | ||
return []; | ||
} | ||
text = sanitizeHtml(text, { "allowedTags" : [''] }); | ||
var options = { | ||
"newline_boundaries" : false, | ||
"html_boundaries" : false, | ||
"sanitize" : false, | ||
"allowed_tags" : false | ||
}; | ||
/** Preprocessing */ | ||
if (typeof newline_boundary === 'undefined') { | ||
newline_boundary = false; | ||
if (typeof user_options === "boolean") { | ||
// Deprecated quick option | ||
options.newline_boundaries = true; | ||
} | ||
else { | ||
// Extend options | ||
for (var k in user_options) { | ||
options[k] = user_options[k]; | ||
} | ||
} | ||
if (newline_boundary) { | ||
if (options.newline_boundaries) { | ||
text = text.replace(/\n+|[-#=_+*]{4,}/g, newline_placeholder); | ||
} | ||
var index = 0; | ||
var temp = []; | ||
if (options.html_boundaries) { | ||
text = text.replace(/(<br \/>)/g, "$1" + newline_placeholder); | ||
} | ||
if (options.sanitize || options.allowed_tags) { | ||
if (! options.allowed_tags) { | ||
options.allowed_tags = [""]; | ||
} | ||
text = sanitizeHtml(text, { "allowedTags" : options.allowed_tags }); | ||
} | ||
// Split the text into words | ||
var words = text.match(/\S+/g); // see http://blog.tompawlak.org/split-string-into-tokens-javascript | ||
var wordCount = 0; | ||
var index = 0; | ||
var temp = []; | ||
var sentences = []; | ||
var current = []; | ||
var wordCount = 0; | ||
for (var i=0, L=words.length; i < L; i++) { | ||
@@ -237,3 +259,3 @@ wordCount++; | ||
// Sub-sentences (Bijzin?), reset counter | ||
// Sub-sentences, reset counter | ||
if (~words[i].indexOf(',')) { | ||
@@ -247,3 +269,3 @@ wordCount = 0; | ||
{ | ||
if (newline_boundary && words[i] === newline_placeholder_t) { | ||
if ((options.newline_boundaries || options.html_boundaries) && words[i] === newline_placeholder_t) { | ||
current.pop(); | ||
@@ -260,2 +282,8 @@ } | ||
if (String.endsWithChar(words[i], "\"") || String.endsWithChar(words[i], "”")) { | ||
// endQuote = words[i].slice(-1); | ||
words[i] = words[i].slice(0, -1); | ||
} | ||
// A dot might indicate the end of a sentence | |
@@ -262,0 +290,0 @@ // Exception: The next sentence starts with a word (non abbreviation) |
@@ -1,1 +0,1 @@ | ||
!function(e){if("object"==typeof exports&&"undefined"!=typeof module)module.exports=e();else if("function"==typeof define&&define.amd)define([],e);else{var f;"undefined"!=typeof window?f=window:"undefined"!=typeof global?f=global:"undefined"!=typeof self&&(f=self),f.tokenizer=e()}}(function(){var define,module,exports;return function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}({1:[function(require,module,exports){var abbreviations=["ie","eg","ext","Fig","fig","Figs","figs","et al","Co","Corp","Ave","Inc","Ex","Viz","vs","Vs","repr","Rep","Dem","trans","Vol","pp","rev","est","Ref","Refs","Eq","Eqs","Ch","Sec","Secs","mi","Dept","Univ","Nos","No","Mol","Cell","Miss","Mrs","Mr","Ms","Prof","Dr","Sgt","Col","Gen","Rep","Sen","Gov","Lt","Maj","Capt","St","Sr","Jr","jr","Rev","PhD","MD","BA","MA","MM","BSc","MSc","Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec","Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat"];exports.isCapitalized=function(str){return/^[A-Z][a-z].*/.test(str)||this.isNumber(str)};exports.isSentenceStarter=function(str){return this.isCapitalized(str)||/``|"|'/.test(str.substring(0,2))};exports.isCommonAbbreviation=function(str){return~abbreviations.indexOf(str.replace(/\W+/g,""))};exports.isTimeAbbreviation=function(word,next){if(word==="a.m."||word==="p.m."){var tmp=next.replace(/\W+/g,"").slice(-3).toLowerCase();if(tmp==="day"){return true}}return false};exports.isDottedAbbreviation=function(word){var matches=word.replace(/[\(\)\[\]\{\}]/g,"").match(/(.\.)*/);return 
matches&&matches[0].length>0};exports.isCustomAbbreviation=function(str){if(str.length<=3)return true;return this.isCapitalized(str)};exports.isNameAbbreviation=function(wordCount,words){if(words.length>0){if(wordCount<5&&words[0].length<6&&this.isCapitalized(words[0])){return true}var capitalized=words.filter(function(str){return/[A-Z]/.test(str.charAt(0))});return capitalized.length>=3}return false};exports.isNumber=function(str,dotPos){if(dotPos){str=str.slice(dotPos-1,dotPos+2)}return!isNaN(str)};exports.isPhoneNr=function(str){return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/)};exports.isURL=function(str){return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/)};exports.isConcatenated=function(word){var i=0;if((i=word.indexOf("."))>-1||(i=word.indexOf("!"))>-1||(i=word.indexOf("?"))>-1){var c=word.charAt(i+1);if(c.match(/[a-zA-Z].*/)){return[word.slice(0,i),word.slice(i+1)]}}return false};exports.isBoundaryChar=function(word){return word==="."||word==="!"||word==="?"}},{}],2:[function(require,module,exports){exports.endsWithChar=function ends_with_char(word,c){if(c.length>1){return c.indexOf(word.slice(-1))>-1}return word.slice(-1)===c};exports.endsWith=function ends_with(word,end){return word.slice(word.length-end.length)===end}},{}],3:[function(require,module,exports){module.exports=function sanitizeHtml(text,opts){if(typeof text=="string"||text instanceof String){var $div=document.createElement("DIV");$div.innerHTML=text;text=($div.textContent||"").trim()}else if(typeof text==="object"&&text.textContent){text=(text.textContent||"").trim()}return text}},{}],4:[function(require,module,exports){"use strict";var sanitizeHtml=require("sanitize-html");var String=require("./String");var 
Match=require("./Match");var newline_placeholder=" @~@ ";var newline_placeholder_t=newline_placeholder.trim();exports.sentences=function(text,newline_boundary){if(text.length===0)return[];text=sanitizeHtml(text,{allowedTags:[""]});if(typeof newline_boundary==="undefined"){newline_boundary=false}if(newline_boundary){text=text.replace(/\n+|[-#=_+*]{4,}/g,newline_placeholder)}var index=0;var temp=[];var words=text.match(/\S+/g);var sentences=[];var current=[];var wordCount=0;for(var i=0,L=words.length;i<L;i++){wordCount++;current.push(words[i]);if(~words[i].indexOf(",")){wordCount=0}if(Match.isBoundaryChar(words[i])||String.endsWithChar(words[i],"?!")||words[i]===newline_placeholder_t){if(newline_boundary&&words[i]===newline_placeholder_t){current.pop()}sentences.push(current);wordCount=0;current=[];continue}if(String.endsWithChar(words[i],".")){if(i+1<L){if(words[i].length===2&&isNaN(words[i].charAt(0))){continue}if(Match.isCommonAbbreviation(words[i])){continue}if(Match.isSentenceStarter(words[i+1])){if(Match.isTimeAbbreviation(words[i],words[i+1])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,6))){continue}if(Match.isNumber(words[i+1])&&Match.isCustomAbbreviation(words[i])){continue}}else{if(String.endsWith(words[i],"..")){continue}if(Match.isDottedAbbreviation(words[i])||Match.isCustomAbbreviation(words[i])){continue}}}sentences.push(current);current=[];wordCount=0;continue}if((index=words[i].indexOf("."))>-1){if(Match.isNumber(words[i],index)){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isURL(words[i])||Match.isPhoneNr(words[i])){continue}}if(temp=Match.isConcatenated(words[i])){current.pop();current.push(temp[0]);sentences.push(current);current=[];wordCount=0;current.push(temp[1])}}if(current.length)sentences.push(current);var result=[];var sentence="";sentences=sentences.filter(function(s){return s.length>0});for(i=0;i<sentences.length;i++){sentence=sentences[i].join(" 
");if(sentences[i].length===1&&sentences[i][0].length<4&&sentences[i][0].indexOf(".")>-1){if(sentences[i+1]&&sentences[i+1][0].indexOf(".")<0){sentence+=" "+sentences[i+1].join(" ");i++}}result.push(sentence)}return result}},{"./Match":1,"./String":2,"sanitize-html":3}]},{},[4])(4)}); | ||
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer=f()}})(function(){var define,module,exports;return function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}({1:[function(require,module,exports){var abbreviations=["ie","eg","ext","Fig","fig","Figs","figs","et al","Co","Corp","Ave","Inc","Ex","Viz","vs","Vs","repr","Rep","Dem","trans","Vol","pp","rev","est","Ref","Refs","Eq","Eqs","Ch","Sec","Secs","mi","Dept","Univ","Nos","No","Mol","Cell","Miss","Mrs","Mr","Ms","Prof","Dr","Sgt","Col","Gen","Rep","Sen","Gov","Lt","Maj","Capt","St","Sr","Jr","jr","Rev","PhD","MD","BA","MA","MM","BSc","MSc","Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec","Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat"];exports.isCapitalized=function(str){return/^[A-Z][a-z].*/.test(str)||this.isNumber(str)};exports.isSentenceStarter=function(str){return this.isCapitalized(str)||/``|"|'/.test(str.substring(0,2))};exports.isCommonAbbreviation=function(str){return~abbreviations.indexOf(str.replace(/\W+/g,""))};exports.isTimeAbbreviation=function(word,next){if(word==="a.m."||word==="p.m."){var tmp=next.replace(/\W+/g,"").slice(-3).toLowerCase();if(tmp==="day"){return true}}return false};exports.isDottedAbbreviation=function(word){var matches=word.replace(/[\(\)\[\]\{\}]/g,"").match(/(.\.)*/);return 
matches&&matches[0].length>0};exports.isCustomAbbreviation=function(str){if(str.length<=3)return true;return this.isCapitalized(str)};exports.isNameAbbreviation=function(wordCount,words){if(words.length>0){if(wordCount<5&&words[0].length<6&&this.isCapitalized(words[0])){return true}var capitalized=words.filter(function(str){return/[A-Z]/.test(str.charAt(0))});return capitalized.length>=3}return false};exports.isNumber=function(str,dotPos){if(dotPos){str=str.slice(dotPos-1,dotPos+2)}return!isNaN(str)};exports.isPhoneNr=function(str){return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/)};exports.isURL=function(str){return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)/)};exports.isConcatenated=function(word){var i=0;if((i=word.indexOf("."))>-1||(i=word.indexOf("!"))>-1||(i=word.indexOf("?"))>-1){var c=word.charAt(i+1);if(c.match(/[a-zA-Z].*/)){return[word.slice(0,i),word.slice(i+1)]}}return false};exports.isBoundaryChar=function(word){return word==="."||word==="!"||word==="?"}},{}],2:[function(require,module,exports){exports.endsWithChar=function ends_with_char(word,c){if(c.length>1){return c.indexOf(word.slice(-1))>-1}return word.slice(-1)===c};exports.endsWith=function ends_with(word,end){return word.slice(word.length-end.length)===end}},{}],3:[function(require,module,exports){module.exports=function sanitizeHtml(text,opts){if(typeof text=="string"||text instanceof String){var $div=document.createElement("DIV");$div.innerHTML=text;text=($div.textContent||"").trim()}else if(typeof text==="object"&&text.textContent){text=(text.textContent||"").trim()}return text}},{}],4:[function(require,module,exports){"use strict";var sanitizeHtml=require("sanitize-html");var String=require("./String");var 
Match=require("./Match");var newline_placeholder=" @~@ ";var newline_placeholder_t=newline_placeholder.trim();exports.sentences=function(text,user_options){if(!text||typeof text==="undefined"||text.length===0){return[]}var options={newline_boundaries:false,html_boundaries:false,sanitize:false,allowed_tags:false};if(typeof user_options==="boolean"){options.newline_boundaries=true}else{for(var k in user_options){options[k]=user_options[k]}}if(options.newline_boundaries){text=text.replace(/\n+|[-#=_+*]{4,}/g,newline_placeholder)}if(options.html_boundaries){text=text.replace(/(<br \/>)/g,"$1"+newline_placeholder)}if(options.sanitize||options.allowed_tags){if(!options.allowed_tags){options.allowed_tags=[""]}text=sanitizeHtml(text,{allowedTags:options.allowed_tags})}var words=text.match(/\S+/g);var wordCount=0;var index=0;var temp=[];var sentences=[];var current=[];for(var i=0,L=words.length;i<L;i++){wordCount++;current.push(words[i]);if(~words[i].indexOf(",")){wordCount=0}if(Match.isBoundaryChar(words[i])||String.endsWithChar(words[i],"?!")||words[i]===newline_placeholder_t){if((options.newline_boundaries||options.html_boundaries)&&words[i]===newline_placeholder_t){current.pop()}sentences.push(current);wordCount=0;current=[];continue}if(String.endsWithChar(words[i],'"')||String.endsWithChar(words[i],"”")){words[i]=words[i].slice(0,-1)}if(String.endsWithChar(words[i],".")){if(i+1<L){if(words[i].length===2&&isNaN(words[i].charAt(0))){continue}if(Match.isCommonAbbreviation(words[i])){continue}if(Match.isSentenceStarter(words[i+1])){if(Match.isTimeAbbreviation(words[i],words[i+1])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,6))){continue}if(Match.isNumber(words[i+1])&&Match.isCustomAbbreviation(words[i])){continue}}else{if(String.endsWith(words[i],"..")){continue}if(Match.isDottedAbbreviation(words[i])||Match.isCustomAbbreviation(words[i])){continue}}}sentences.push(current);current=[];wordCount=0;continue}if((index=words[i].indexOf("."))>-1){if(Match.isNum
ber(words[i],index)){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isURL(words[i])||Match.isPhoneNr(words[i])){continue}}if(temp=Match.isConcatenated(words[i])){current.pop();current.push(temp[0]);sentences.push(current);current=[];wordCount=0;current.push(temp[1])}}if(current.length)sentences.push(current);var result=[];var sentence="";sentences=sentences.filter(function(s){return s.length>0});for(i=0;i<sentences.length;i++){sentence=sentences[i].join(" ");if(sentences[i].length===1&&sentences[i][0].length<4&&sentences[i][0].indexOf(".")>-1){if(sentences[i+1]&&sentences[i+1][0].indexOf(".")<0){sentence+=" "+sentences[i+1].join(" ");i++}}result.push(sentence)}return result}},{"./Match":1,"./String":2,"sanitize-html":3}]},{},[4])(4)}); |
@@ -12,38 +12,51 @@ /*jshint node:true, laxcomma:true */ | ||
// Split the entry into sentences. | ||
exports.sentences = function(text, options) { | ||
if (!text || typeof text === "undefined" || text.length === 0) | ||
exports.sentences = function(text, user_options) { | ||
if (!text || typeof text === "undefined" || text.length === 0) { | ||
return []; | ||
} | ||
/** Options processing */ | ||
var newline_boundary; | ||
var do_sanitize = true; | ||
if (typeof options === 'undefined') { | ||
newline_boundary = false; | ||
var options = { | ||
"newline_boundaries" : false, | ||
"html_boundaries" : false, | ||
"sanitize" : false, | ||
"allowed_tags" : false | ||
}; | ||
if (typeof user_options === "boolean") { | ||
// Deprecated quick option | ||
options.newline_boundaries = true; | ||
} | ||
else if (typeof options === 'object') { | ||
newline_boundary = options.newline_boundary || false; | ||
do_sanitize = typeof options.sanitize === 'undefined' ? true : options.sanitize; | ||
} | ||
else { | ||
newline_boundary = options; | ||
// Extend options | ||
for (var k in user_options) { | ||
options[k] = user_options[k]; | ||
} | ||
} | ||
text = do_sanitize ? sanitizeHtml(text, { "allowedTags" : [''] }) : text; | ||
if (newline_boundary) { | ||
if (options.newline_boundaries) { | ||
text = text.replace(/\n+|[-#=_+*]{4,}/g, newline_placeholder); | ||
} | ||
var index = 0; | ||
var temp = []; | ||
if (options.html_boundaries) { | ||
text = text.replace(/(<br \/>)/g, "$1" + newline_placeholder); | ||
} | ||
if (options.sanitize || options.allowed_tags) { | ||
if (! options.allowed_tags) { | ||
options.allowed_tags = [""]; | ||
} | ||
text = sanitizeHtml(text, { "allowedTags" : options.allowed_tags }); | ||
} | ||
// Split the text into words | ||
var words = text.match(/\S+/g); // see http://blog.tompawlak.org/split-string-into-tokens-javascript | ||
var wordCount = 0; | ||
var index = 0; | ||
var temp = []; | ||
var sentences = []; | ||
var current = []; | ||
var wordCount = 0; | ||
for (var i=0, L=words.length; i < L; i++) { | ||
@@ -64,3 +77,3 @@ wordCount++; | ||
{ | ||
if (newline_boundary && words[i] === newline_placeholder_t) { | ||
if ((options.newline_boundaries || options.html_boundaries) && words[i] === newline_placeholder_t) { | ||
current.pop(); | ||
@@ -67,0 +80,0 @@ } |
{ | ||
"name": "sbd", | ||
"version": "1.0.4", | ||
"version": "1.0.5", | ||
"description": "Split text into sentences with Sentence Boundary Detection (SBD).", | ||
"main": "lib/tokenizer.js", | ||
"scripts": { | ||
"test": "mocha -R spec" | ||
"test": "mocha -R spec", | ||
"build:js": "browserify lib/tokenizer.js --standalone tokenizer > dist/sbd.js", | ||
"build:minify": "uglifyjs dist/sbd.js > dist/sbd.min.js", | ||
"build": "npm run build:js && npm run build:minify" | ||
}, | ||
@@ -9,0 +12,0 @@ "homepage": "http://github.com/Tessmore/sbd", |
@@ -25,3 +25,3 @@ Sentence Boundary Detection (SBD) | ||
var text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration."; | ||
var sentences = tokenizer.sentences(text); | ||
var sentences = tokenizer.sentences(text, optional_options); | ||
@@ -35,41 +35,24 @@ // [ | ||
The second argument can also be a configuration object, that can support the following values: | ||
#### Optional options | ||
* `newline_boundary`: the same as specifying the second argument as a boolean. | ||
* `sanitize`: set this to `false` in order to disable automatic HTML sanitization. While automatic | ||
sanitization has to remain the default for backwards compatibility purposes, unless you are | ||
specifically providing `sbd` with content you know to contain HTML it is recommended to switch | ||
this off as it can mangle your content. | ||
```javascript | ||
var options = { | ||
"newline_boundary": true, | ||
"sanitize": true | ||
``` | ||
var options = { | ||
"newline_boundaries" : false, | ||
"html_boundaries" : false, | ||
"sanitize" : false, | ||
"allowed_tags" : false | ||
}; | ||
var sentences = tokenizer.sentences(textFromFile, options); | ||
``` | ||
textFromFile = "Title of project: Hello World | ||
Author: Kenny | ||
* `newline_boundaries`: force a sentence split at newlines | |
* `html_boundaries`: force a sentence split at specific tags (br, and closing p, div, ul, ol) | |
* `sanitize`: enable this if you neither expect nor want HTML in your text. | |
* `allowed_tags`: HTML is sanitized with the [sanitize-html](https://github.com/punkave/sanitize-html) library. Use this option to pass the list of tags that should be allowed. | |
May, 2012 | ||
Lorem ipsum dolor sit amet. Consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco (laboris nisi?) ut aliquip ex ea commodo consequat. | ||
"; | ||
// Gives | ||
// [ | ||
// 'Title of project: Hello World', | ||
// 'Author: Kenny', | ||
// 'May, 2012', | ||
// 'Lorem ipsum dolor sit amet.', | ||
// 'Consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.', | ||
// 'Ut enim ad minim veniam, quis nostrud exercitation ullamco (laboris nisi?) ut aliquip ex ea commodo consequat.' | ||
// ] | ||
``` | ||
## Contributing | ||
You can run unit tests with `npm test`. | ||
You can run unit tests with `npm test`. | ||
If you feel something is missing, you can open an issue stating the problem sentence and the desired result. If the code is unclear, give me an @mention. Pull requests are welcome. |
@@ -12,3 +12,3 @@ /*jshint node:true, laxcomma:true */ | ||
var entry = "<p>Hello this is my first sentence.</p> <br><br>There is also a second down the page."; | ||
var sentences = tokenizer.sentences(entry); | ||
var sentences = tokenizer.sentences(entry, { "sanitize": true }); | ||
@@ -22,3 +22,3 @@ it("should get 2 sentences", function () { | ||
var entry = "We find that a < b works. But in turn, c > x."; | ||
var sentences = tokenizer.sentences(entry, { sanitize: false }); | ||
var sentences = tokenizer.sentences(entry, { "sanitize": false }); | ||
@@ -34,2 +34,12 @@ it("should get 2 sentences", function () { | ||
describe('Closing html boundaries (br, p, div) split sentences.', function () { | ||
var entry = "What the Experts Say <br />In certain circumstances, “working for a manager who’s task-oriented and has a high need for achievement can be motivating,” says Linda Hill"; | ||
var sentences = tokenizer.sentences(entry, { sanitize: false, "html_boundaries": true }); | ||
it("should get 2 sentences", function () { | ||
assert.equal(sentences.length, 2); | ||
}); | ||
}); | ||
}); |
@@ -12,3 +12,3 @@ /*jshint node:true, laxcomma:true */ | ||
var entry = "1. The item\n2. Another item"; | ||
var sentences = tokenizer.sentences(entry, true); | ||
var sentences = tokenizer.sentences(entry, { "newline_boundaries": true }); | ||
@@ -22,3 +22,3 @@ it("should get 2 sentences", function () { | ||
var entry = "a. The item\nab. Another item\n(1.) Third item"; | ||
var sentences = tokenizer.sentences(entry, true); | ||
var sentences = tokenizer.sentences(entry, { "newline_boundaries": true }); | ||
@@ -32,3 +32,3 @@ it("should get 3 sentences", function () { | ||
var entry = "a. The item\nzz.\nab.\ncd. Hello"; | ||
var sentences = tokenizer.sentences(entry, true); | ||
var sentences = tokenizer.sentences(entry, { "newline_boundaries": true }); | ||
@@ -35,0 +35,0 @@ it("should get 4 sentences", function () { |
@@ -120,3 +120,3 @@ /*jshint node:true, laxcomma:true */ | ||
var entry = "Search on http://google.com\n\nThen send me an email: gg@gggg.kk"; | ||
var sentences = tokenizer.sentences(entry, true); | ||
var sentences = tokenizer.sentences(entry, { "newline_boundaries": true }); | ||
@@ -131,3 +131,3 @@ it("should get 2 sentences", function () { | ||
var entry = "“If there’s no balance and your boss doesn’t provide support and work that’s meaningful, your chances of burning out are great.” What bothers most people in situations like these is “the lack of boundaries,” says Nancy Rothbard, the David Pottruck Professor of Management at the University of Pennsylvania’s Wharton School."; | ||
var sentences = tokenizer.sentences(entry, true); | ||
var sentences = tokenizer.sentences(entry, { "newline_boundaries": true }); | ||
@@ -141,3 +141,3 @@ it("should get 2 sentences", function () { | ||
var entry = "“If there’s no balance! And your boss doesn’t provide support and work that’s meaningful, your chances of burning out are great.” What bothers most people in situations like these is “the lack of boundaries,” says Nancy Rothbard, the David Pottruck Professor of Management at the University of Pennsylvania’s Wharton School."; | ||
var sentences = tokenizer.sentences(entry, true); | ||
var sentences = tokenizer.sentences(entry, { "newline_boundaries": true }); | ||
@@ -151,3 +151,3 @@ it("should get 3 sentences", function () { | ||
var entry = "FAMILIY HISTORY ========================================== Nothing interesting"; | ||
var sentences = tokenizer.sentences(entry, true); | ||
var sentences = tokenizer.sentences(entry, { "newline_boundaries": true }); | ||
@@ -154,0 +154,0 @@ it("should get 2 sentences", function () { |
@@ -57,3 +57,3 @@ /*jshint node:true, laxcomma:true */ | ||
var entry = "The humble bundle sale\r\nDate: Monday-Fri starting 2015-01-01\nSales starting at ¤2,50"; | ||
var sentences = tokenizer.sentences(entry, true); | ||
var sentences = tokenizer.sentences(entry, { "newline_boundaries": true }); | ||
@@ -60,0 +60,0 @@ it("should get 3 sentences", function () { |
Sorry, the diff of this file is not supported yet
56425
1035
57