New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign in | Demo | Install
Socket

sbd

Package Overview
Dependencies
Maintainers
1
Versions
25
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

sbd - npm Package Compare versions

Comparing version 1.0.13 to 1.0.14

78

dist/sbd.js

@@ -74,4 +74,4 @@ (function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer = f()}})(function(){var define,module,exports;return (function(){function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}return e})()({1:[function(require,module,exports){

exports.isCapitalized = function(str) {
return /^[A-Z][a-z].*/.test(str) || this.isNumber(str);
var isCapitalized = exports.isCapitalized = function(str) {
return /^[A-Z][a-z].*/.test(str) || isNumber(str);
}

@@ -81,3 +81,3 @@

exports.isSentenceStarter = function(str) {
return this.isCapitalized(str) || /``|"|'/.test(str.substring(0,2));
return isCapitalized(str) || /``|"|'/.test(str.substring(0,2));
}

@@ -107,3 +107,4 @@

// TODO look for next words, if multiple capitalized -> not sentence ending
// TODO look for next words, if multiple are capitalized,
// then it's probably not a sentence ending
exports.isCustomAbbreviation = function(str) {

@@ -114,3 +115,3 @@ if (str.length <= 3) {

return this.isCapitalized(str);
return isCapitalized(str);
}

@@ -120,7 +121,5 @@

// more likely an abbreviation + name or new sentence.
// ~ TODO Perhaps also consider prev. word?
exports.isNameAbbreviation = function(wordCount, words) {
if (words.length > 0) {
if (wordCount < 5 && words[0].length < 6 && this.isCapitalized(words[0])) {
if (wordCount < 5 && words[0].length < 6 && isCapitalized(words[0])) {
return true;

@@ -139,3 +138,3 @@ }

exports.isNumber = function(str, dotPos) {
var isNumber = exports.isNumber = function(str, dotPos) {
if (dotPos) {

@@ -440,6 +439,2 @@ str = str.slice(dotPos-1, dotPos+2);

/** After processing */
var result = [];
var sentence = "";
// Clear "empty" sentences

@@ -450,34 +445,33 @@ sentences = sentences.filter(function(s) {

for (var i=0; i < sentences.length; i++) {
if (options.preserve_whitespace && !options.newline_boundaries && !options.html_boundaries) {
// tokens looks like so: [leading-space token, non-space token, space
// token, non-space token, space token... ]. In other words, the first
// item is the leading space (or the empty string), and the rest of
// the tokens are [non-space, space] token pairs.
var tokenCount = sentences[i].length * 2;
if (i === 0) {
tokenCount += 1;
var result = sentences.slice(1).reduce(function (out, sentence) {
var lastSentence = out[out.length - 1];
// Single words, could be "enumeration lists"
if (lastSentence.length === 1 && /^.{1,2}[.]$/.test(lastSentence[0])) {
// Check if there is a next sentence
// It should not be another list item
if (!/[.]/.test(sentence[0])) {
out.pop()
out.push(lastSentence.concat(sentence));
return out;
}
}
out.push(sentence);
return out;
}, [ sentences[0] ]);
sentence = tokens.splice(0, tokenCount).join('');
// join tokens back together
return result.map(function (sentence, ii) {
if (options.preserve_whitespace && !options.newline_boundaries && !options.html_boundaries) {
// tokens looks like so: [leading-space token, non-space token, space
// token, non-space token, space token... ]. In other words, the first
// item is the leading space (or the empty string), and the rest of
// the tokens are [non-space, space] token pairs.
var tokenCount = sentence.length * 2;
if (ii === 0) {
tokenCount += 1;
}
else {
sentence = sentences[i].join(" ");
}
// Single words, could be "enumeration lists"
if (sentences[i].length === 1 && sentences[i][0].length < 4 && sentences[i][0].indexOf('.') > -1) {
// Check if there is a next sentence
// It should not be another list item
if (sentences[i+1] && sentences[i+1][0].indexOf('.') < 0) {
sentence += " " + sentences[i+1].join(" ");
i++;
}
}
result.push(sentence);
}
return result;
return tokens.splice(0, tokenCount).join('');
}
return sentence.join(" ");
});
};

@@ -484,0 +478,0 @@

@@ -1,1 +0,1 @@

(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer=f()}})(function(){var define,module,exports;return function(){function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}return e}()({1:[function(require,module,exports){var abbreviations;var englishAbbreviations=["al","adj","assn","Ave","BSc","MSc","Cell","Ch","Co","cc","Corp","Dem","Dept","ed","eg","Eq","Eqs","est","est","etc","Ex","ext","Fig","fig","Figs","figs","i.e","ie","Inc","inc","Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec","jr","mi","Miss","Mrs","Mr","Ms","Mol","mt","mts","no","Nos","PhD","MD","BA","MA","MM","pl","pop","pp","Prof","Dr","pt","Ref","Refs","Rep","repr","rev","Sec","Secs","Sgt","Col","Gen","Rep","Sen","Gov","Lt","Maj","Capt","St","Sr","sr","Jr","jr","Rev","Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat","trans","Univ","Viz","Vol","vs","v"];exports.setAbbreviations=function(abbr){if(abbr){abbreviations=abbr}else{abbreviations=englishAbbreviations}};exports.isCapitalized=function(str){return/^[A-Z][a-z].*/.test(str)||this.isNumber(str)};exports.isSentenceStarter=function(str){return 
this.isCapitalized(str)||/``|"|'/.test(str.substring(0,2))};exports.isCommonAbbreviation=function(str){return~abbreviations.indexOf(str.replace(/\W+/g,""))};exports.isTimeAbbreviation=function(word,next){if(word==="a.m."||word==="p.m."){var tmp=next.replace(/\W+/g,"").slice(-3).toLowerCase();if(tmp==="day"){return true}}return false};exports.isDottedAbbreviation=function(word){var matches=word.replace(/[\(\)\[\]\{\}]/g,"").match(/(.\.)*/);return matches&&matches[0].length>0};exports.isCustomAbbreviation=function(str){if(str.length<=3){return true}return this.isCapitalized(str)};exports.isNameAbbreviation=function(wordCount,words){if(words.length>0){if(wordCount<5&&words[0].length<6&&this.isCapitalized(words[0])){return true}var capitalized=words.filter(function(str){return/[A-Z]/.test(str.charAt(0))});return capitalized.length>=3}return false};exports.isNumber=function(str,dotPos){if(dotPos){str=str.slice(dotPos-1,dotPos+2)}return!isNaN(str)};exports.isPhoneNr=function(str){return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/)};exports.isURL=function(str){return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/)};exports.isConcatenated=function(word){var i=0;if((i=word.indexOf("."))>-1||(i=word.indexOf("!"))>-1||(i=word.indexOf("?"))>-1){var c=word.charAt(i+1);if(c.match(/[a-zA-Z].*/)){return[word.slice(0,i),word.slice(i+1)]}}return false};exports.isBoundaryChar=function(word){return word==="."||word==="!"||word==="?"}},{}],2:[function(require,module,exports){module.exports=function sanitizeHtml(text,opts){if((typeof text=="string"||text instanceof String)&&typeof document!=="undefined"){var $div=document.createElement("DIV");$div.innerHTML=text;text=($div.textContent||"").trim()}else if(typeof 
text==="object"&&text.textContent){text=(text.textContent||"").trim()}return text}},{}],3:[function(require,module,exports){exports.endsWithChar=function ends_with_char(word,c){if(c.length>1){return c.indexOf(word.slice(-1))>-1}return word.slice(-1)===c};exports.endsWith=function ends_with(word,end){return word.slice(word.length-end.length)===end}},{}],4:[function(require,module,exports){"use strict";var sanitizeHtml=require("sanitize-html");var stringHelper=require("./stringHelper");var Match=require("./Match");var newline_placeholder=" @~@ ";var newline_placeholder_t=newline_placeholder.trim();var whiteSpaceCheck=new RegExp("\\S","");var addNewLineBoundaries=new RegExp("\\n+|[-#=_+*]{4,}","g");var splitIntoWords=new RegExp("\\S+|\\n","g");exports.sentences=function(text,user_options){if(!text||typeof text!=="string"||!text.length){return[]}if(!whiteSpaceCheck.test(text)){return[]}var options={newline_boundaries:false,html_boundaries:false,html_boundaries_tags:["p","div","ul","ol"],sanitize:false,allowed_tags:false,preserve_whitespace:false,abbreviations:null};if(typeof user_options==="boolean"){options.newline_boundaries=true}else{for(var k in user_options){options[k]=user_options[k]}}Match.setAbbreviations(options.abbreviations);if(options.newline_boundaries){text=text.replace(addNewLineBoundaries,newline_placeholder)}if(options.html_boundaries){var html_boundaries_regexp="(<br\\s*\\/?>|<\\/("+options.html_boundaries_tags.join("|")+")>)";var re=new RegExp(html_boundaries_regexp,"g");text=text.replace(re,"$1"+newline_placeholder)}if(options.sanitize||options.allowed_tags){if(!options.allowed_tags){options.allowed_tags=[""]}text=sanitizeHtml(text,{allowedTags:options.allowed_tags})}var words;var tokens;if(options.preserve_whitespace){tokens=text.split(/(<br\s*\/?>|\S+|\n+)/);words=tokens.filter(function(token,ii){return ii%2})}else{words=text.trim().match(splitIntoWords)}var wordCount=0;var index=0;var temp=[];var sentences=[];var 
current=[];if(!words||!words.length){return[]}for(var i=0,L=words.length;i<L;i++){wordCount++;current.push(words[i]);if(~words[i].indexOf(",")){wordCount=0}if(Match.isBoundaryChar(words[i])||stringHelper.endsWithChar(words[i],"?!")||words[i]===newline_placeholder_t){if((options.newline_boundaries||options.html_boundaries)&&words[i]===newline_placeholder_t){current.pop()}sentences.push(current);wordCount=0;current=[];continue}if(stringHelper.endsWithChar(words[i],'"')||stringHelper.endsWithChar(words[i],"”")){words[i]=words[i].slice(0,-1)}if(stringHelper.endsWithChar(words[i],".")){if(i+1<L){if(words[i].length===2&&isNaN(words[i].charAt(0))){continue}if(Match.isCommonAbbreviation(words[i])){continue}if(Match.isSentenceStarter(words[i+1])){if(Match.isTimeAbbreviation(words[i],words[i+1])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,6))){continue}if(Match.isNumber(words[i+1])){if(Match.isCustomAbbreviation(words[i])){continue}}}else{if(stringHelper.endsWith(words[i],"..")){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,5))){continue}}}sentences.push(current);current=[];wordCount=0;continue}if((index=words[i].indexOf("."))>-1){if(Match.isNumber(words[i],index)){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isURL(words[i])||Match.isPhoneNr(words[i])){continue}}if(temp=Match.isConcatenated(words[i])){current.pop();current.push(temp[0]);sentences.push(current);current=[];wordCount=0;current.push(temp[1])}}if(current.length){sentences.push(current)}var result=[];var sentence="";sentences=sentences.filter(function(s){return s.length>0});for(var i=0;i<sentences.length;i++){if(options.preserve_whitespace&&!options.newline_boundaries&&!options.html_boundaries){var tokenCount=sentences[i].length*2;if(i===0){tokenCount+=1}sentence=tokens.splice(0,tokenCount).join("")}else{sentence=sentences[i].join(" 
")}if(sentences[i].length===1&&sentences[i][0].length<4&&sentences[i][0].indexOf(".")>-1){if(sentences[i+1]&&sentences[i+1][0].indexOf(".")<0){sentence+=" "+sentences[i+1].join(" ");i++}}result.push(sentence)}return result}},{"./Match":1,"./stringHelper":3,"sanitize-html":2}]},{},[4])(4)});
(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.tokenizer=f()}})(function(){var define,module,exports;return function(){function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s}return e}()({1:[function(require,module,exports){var abbreviations;var englishAbbreviations=["al","adj","assn","Ave","BSc","MSc","Cell","Ch","Co","cc","Corp","Dem","Dept","ed","eg","Eq","Eqs","est","est","etc","Ex","ext","Fig","fig","Figs","figs","i.e","ie","Inc","inc","Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec","jr","mi","Miss","Mrs","Mr","Ms","Mol","mt","mts","no","Nos","PhD","MD","BA","MA","MM","pl","pop","pp","Prof","Dr","pt","Ref","Refs","Rep","repr","rev","Sec","Secs","Sgt","Col","Gen","Rep","Sen","Gov","Lt","Maj","Capt","St","Sr","sr","Jr","jr","Rev","Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat","trans","Univ","Viz","Vol","vs","v"];exports.setAbbreviations=function(abbr){if(abbr){abbreviations=abbr}else{abbreviations=englishAbbreviations}};var isCapitalized=exports.isCapitalized=function(str){return/^[A-Z][a-z].*/.test(str)||isNumber(str)};exports.isSentenceStarter=function(str){return 
isCapitalized(str)||/``|"|'/.test(str.substring(0,2))};exports.isCommonAbbreviation=function(str){return~abbreviations.indexOf(str.replace(/\W+/g,""))};exports.isTimeAbbreviation=function(word,next){if(word==="a.m."||word==="p.m."){var tmp=next.replace(/\W+/g,"").slice(-3).toLowerCase();if(tmp==="day"){return true}}return false};exports.isDottedAbbreviation=function(word){var matches=word.replace(/[\(\)\[\]\{\}]/g,"").match(/(.\.)*/);return matches&&matches[0].length>0};exports.isCustomAbbreviation=function(str){if(str.length<=3){return true}return isCapitalized(str)};exports.isNameAbbreviation=function(wordCount,words){if(words.length>0){if(wordCount<5&&words[0].length<6&&isCapitalized(words[0])){return true}var capitalized=words.filter(function(str){return/[A-Z]/.test(str.charAt(0))});return capitalized.length>=3}return false};var isNumber=exports.isNumber=function(str,dotPos){if(dotPos){str=str.slice(dotPos-1,dotPos+2)}return!isNaN(str)};exports.isPhoneNr=function(str){return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/)};exports.isURL=function(str){return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/)};exports.isConcatenated=function(word){var i=0;if((i=word.indexOf("."))>-1||(i=word.indexOf("!"))>-1||(i=word.indexOf("?"))>-1){var c=word.charAt(i+1);if(c.match(/[a-zA-Z].*/)){return[word.slice(0,i),word.slice(i+1)]}}return false};exports.isBoundaryChar=function(word){return word==="."||word==="!"||word==="?"}},{}],2:[function(require,module,exports){module.exports=function sanitizeHtml(text,opts){if((typeof text=="string"||text instanceof String)&&typeof document!=="undefined"){var $div=document.createElement("DIV");$div.innerHTML=text;text=($div.textContent||"").trim()}else if(typeof 
text==="object"&&text.textContent){text=(text.textContent||"").trim()}return text}},{}],3:[function(require,module,exports){exports.endsWithChar=function ends_with_char(word,c){if(c.length>1){return c.indexOf(word.slice(-1))>-1}return word.slice(-1)===c};exports.endsWith=function ends_with(word,end){return word.slice(word.length-end.length)===end}},{}],4:[function(require,module,exports){"use strict";var sanitizeHtml=require("sanitize-html");var stringHelper=require("./stringHelper");var Match=require("./Match");var newline_placeholder=" @~@ ";var newline_placeholder_t=newline_placeholder.trim();var whiteSpaceCheck=new RegExp("\\S","");var addNewLineBoundaries=new RegExp("\\n+|[-#=_+*]{4,}","g");var splitIntoWords=new RegExp("\\S+|\\n","g");exports.sentences=function(text,user_options){if(!text||typeof text!=="string"||!text.length){return[]}if(!whiteSpaceCheck.test(text)){return[]}var options={newline_boundaries:false,html_boundaries:false,html_boundaries_tags:["p","div","ul","ol"],sanitize:false,allowed_tags:false,preserve_whitespace:false,abbreviations:null};if(typeof user_options==="boolean"){options.newline_boundaries=true}else{for(var k in user_options){options[k]=user_options[k]}}Match.setAbbreviations(options.abbreviations);if(options.newline_boundaries){text=text.replace(addNewLineBoundaries,newline_placeholder)}if(options.html_boundaries){var html_boundaries_regexp="(<br\\s*\\/?>|<\\/("+options.html_boundaries_tags.join("|")+")>)";var re=new RegExp(html_boundaries_regexp,"g");text=text.replace(re,"$1"+newline_placeholder)}if(options.sanitize||options.allowed_tags){if(!options.allowed_tags){options.allowed_tags=[""]}text=sanitizeHtml(text,{allowedTags:options.allowed_tags})}var words;var tokens;if(options.preserve_whitespace){tokens=text.split(/(<br\s*\/?>|\S+|\n+)/);words=tokens.filter(function(token,ii){return ii%2})}else{words=text.trim().match(splitIntoWords)}var wordCount=0;var index=0;var temp=[];var sentences=[];var 
current=[];if(!words||!words.length){return[]}for(var i=0,L=words.length;i<L;i++){wordCount++;current.push(words[i]);if(~words[i].indexOf(",")){wordCount=0}if(Match.isBoundaryChar(words[i])||stringHelper.endsWithChar(words[i],"?!")||words[i]===newline_placeholder_t){if((options.newline_boundaries||options.html_boundaries)&&words[i]===newline_placeholder_t){current.pop()}sentences.push(current);wordCount=0;current=[];continue}if(stringHelper.endsWithChar(words[i],'"')||stringHelper.endsWithChar(words[i],"”")){words[i]=words[i].slice(0,-1)}if(stringHelper.endsWithChar(words[i],".")){if(i+1<L){if(words[i].length===2&&isNaN(words[i].charAt(0))){continue}if(Match.isCommonAbbreviation(words[i])){continue}if(Match.isSentenceStarter(words[i+1])){if(Match.isTimeAbbreviation(words[i],words[i+1])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,6))){continue}if(Match.isNumber(words[i+1])){if(Match.isCustomAbbreviation(words[i])){continue}}}else{if(stringHelper.endsWith(words[i],"..")){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isNameAbbreviation(wordCount,words.slice(i,5))){continue}}}sentences.push(current);current=[];wordCount=0;continue}if((index=words[i].indexOf("."))>-1){if(Match.isNumber(words[i],index)){continue}if(Match.isDottedAbbreviation(words[i])){continue}if(Match.isURL(words[i])||Match.isPhoneNr(words[i])){continue}}if(temp=Match.isConcatenated(words[i])){current.pop();current.push(temp[0]);sentences.push(current);current=[];wordCount=0;current.push(temp[1])}}if(current.length){sentences.push(current)}sentences=sentences.filter(function(s){return s.length>0});var result=sentences.slice(1).reduce(function(out,sentence){var lastSentence=out[out.length-1];if(lastSentence.length===1&&/^.{1,2}[.]$/.test(lastSentence[0])){if(!/[.]/.test(sentence[0])){out.pop();out.push(lastSentence.concat(sentence));return out}}out.push(sentence);return out},[sentences[0]]);return 
result.map(function(sentence,ii){if(options.preserve_whitespace&&!options.newline_boundaries&&!options.html_boundaries){var tokenCount=sentence.length*2;if(ii===0){tokenCount+=1}return tokens.splice(0,tokenCount).join("")}return sentence.join(" ")})}},{"./Match":1,"./stringHelper":3,"sanitize-html":2}]},{},[4])(4)});

@@ -73,4 +73,4 @@ var abbreviations;

exports.isCapitalized = function(str) {
return /^[A-Z][a-z].*/.test(str) || this.isNumber(str);
var isCapitalized = exports.isCapitalized = function(str) {
return /^[A-Z][a-z].*/.test(str) || isNumber(str);
}

@@ -80,3 +80,3 @@

exports.isSentenceStarter = function(str) {
return this.isCapitalized(str) || /``|"|'/.test(str.substring(0,2));
return isCapitalized(str) || /``|"|'/.test(str.substring(0,2));
}

@@ -106,3 +106,4 @@

// TODO look for next words, if multiple capitalized -> not sentence ending
// TODO look for next words, if multiple are capitalized,
// then it's probably not a sentence ending
exports.isCustomAbbreviation = function(str) {

@@ -113,3 +114,3 @@ if (str.length <= 3) {

return this.isCapitalized(str);
return isCapitalized(str);
}

@@ -119,7 +120,5 @@

// more likely an abbreviation + name or new sentence.
// ~ TODO Perhaps also consider prev. word?
exports.isNameAbbreviation = function(wordCount, words) {
if (words.length > 0) {
if (wordCount < 5 && words[0].length < 6 && this.isCapitalized(words[0])) {
if (wordCount < 5 && words[0].length < 6 && isCapitalized(words[0])) {
return true;

@@ -138,3 +137,3 @@ }

exports.isNumber = function(str, dotPos) {
var isNumber = exports.isNumber = function(str, dotPos) {
if (dotPos) {

@@ -141,0 +140,0 @@ str = str.slice(dotPos-1, dotPos+2);

@@ -224,6 +224,2 @@ /*jshint node:true, laxcomma:true */

/** After processing */
var result = [];
var sentence = "";
// Clear "empty" sentences

@@ -234,34 +230,33 @@ sentences = sentences.filter(function(s) {

for (var i=0; i < sentences.length; i++) {
if (options.preserve_whitespace && !options.newline_boundaries && !options.html_boundaries) {
// tokens looks like so: [leading-space token, non-space token, space
// token, non-space token, space token... ]. In other words, the first
// item is the leading space (or the empty string), and the rest of
// the tokens are [non-space, space] token pairs.
var tokenCount = sentences[i].length * 2;
if (i === 0) {
tokenCount += 1;
var result = sentences.slice(1).reduce(function (out, sentence) {
var lastSentence = out[out.length - 1];
// Single words, could be "enumeration lists"
if (lastSentence.length === 1 && /^.{1,2}[.]$/.test(lastSentence[0])) {
// Check if there is a next sentence
// It should not be another list item
if (!/[.]/.test(sentence[0])) {
out.pop()
out.push(lastSentence.concat(sentence));
return out;
}
}
out.push(sentence);
return out;
}, [ sentences[0] ]);
sentence = tokens.splice(0, tokenCount).join('');
// join tokens back together
return result.map(function (sentence, ii) {
if (options.preserve_whitespace && !options.newline_boundaries && !options.html_boundaries) {
// tokens looks like so: [leading-space token, non-space token, space
// token, non-space token, space token... ]. In other words, the first
// item is the leading space (or the empty string), and the rest of
// the tokens are [non-space, space] token pairs.
var tokenCount = sentence.length * 2;
if (ii === 0) {
tokenCount += 1;
}
else {
sentence = sentences[i].join(" ");
}
// Single words, could be "enumeration lists"
if (sentences[i].length === 1 && sentences[i][0].length < 4 && sentences[i][0].indexOf('.') > -1) {
// Check if there is a next sentence
// It should not be another list item
if (sentences[i+1] && sentences[i+1][0].indexOf('.') < 0) {
sentence += " " + sentences[i+1].join(" ");
i++;
}
}
result.push(sentence);
}
return result;
return tokens.splice(0, tokenCount).join('');
}
return sentence.join(" ");
});
};
{
"name": "sbd",
"version": "1.0.13",
"version": "1.0.14",
"description": "Split text into sentences with Sentence Boundary Detection (SBD).",

@@ -5,0 +5,0 @@ "main": "lib/tokenizer.js",

@@ -26,2 +26,3 @@ Sentence Boundary Detection (SBD)

var optional_options = {};
var text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.";

@@ -55,3 +56,3 @@ var sentences = tokenizer.sentences(text, optional_options);

* `preserve_whitespace`: Preserve the literal whitespace between words and sentences (otherwise, internal spaces are normalized to a single space char, and inter-sentence whitespace is omitted). Preserve whitespace has no effect if either newline_boundaries or html_boundaries is specified.
* `abbreviations`: list of abbreviations to override the original ones for use with other languages. Don't put dots in abbreviations.
* `abbreviations`: list of abbreviations to override the original ones for use with other languages. Don't put dots in your custom abbreviations.

@@ -58,0 +59,0 @@

@@ -54,2 +54,14 @@ /*jshint node:true, laxcomma:true */

});
// Checks that short enumeration markers ("iv.", "v.", "vi.") that end up as
// one-word sentences are glued back onto neighbouring text.
// `tokenizer`, `options` and `assert` come from the enclosing test file.
// NOTE(review): the expected strings keep their trailing spaces, so `options`
// presumably enables preserve_whitespace - confirm against the outer scope.
describe('It should properly join single-word list sentences', function () {
    var input = "iv. determining that the advertisement in the lift study is a candidate ad for the user, computing whether to include the user in a test group or a control group for the lift study ([0032]), v. based on the computation indicating that the user is in the control group, holding out the advertisement from completing the ad selection process for the user ([0032]), and vi. based on the computation indicating that the user is in the test group, allowing the advertisement to continue through the ad selection process such that the user receives either the advertisement in the lift study or another advertisement ([0032]); and ";
    var expected = [
        "iv. determining that the advertisement in the lift study is a candidate ad for the user, computing whether to include the user in a test group or a control group for the lift study ([0032]), v. based on the computation indicating that the user is in the control group, holding out the advertisement from completing the ad selection process for the user ([0032]), and vi. ",
        "based on the computation indicating that the user is in the test group, allowing the advertisement to continue through the ad selection process such that the user receives either the advertisement in the lift study or another advertisement ([0032]); and "
    ];
    // Tokenized while the suite is being defined, as in the sibling suites.
    var actual = tokenizer.sentences(input, options);

    it("should get the correct sentences", function () {
        assert.deepEqual(actual, expected);
    });
});
});
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc