New Case Study: See how Anthropic automated 95% of dependency reviews with Socket. Learn More
Socket
Sign inDemoInstall
Socket

sbd

Package Overview
Dependencies
Maintainers
1
Versions
25
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

sbd - npm Package Compare versions

Comparing version 1.0.0 to 1.0.1

126

lib/Match.js
// Abbreviations that usually do not end a sentence when followed by a dot.
// Lookups strip every non-word character from the token before comparing
// (see isCommonAbbreviation), so entries are written without dots.
// Every entry is listed once; matching is case-sensitive, which is why some
// abbreviations appear in both capitalised and lower-case form.
var abbreviations = [
    "ie",
    "eg",
    "ext", // + number?
    "Fig",
    "fig",
    "Figs",
    "figs",
    // The tokenizer splits text on whitespace, so "et al." arrives as the two
    // tokens "et" and "al."; the two-word entry can never match by itself.
    // It is kept for compatibility and "al" is listed so the phrase is handled.
    "et al",
    "al",
    "Co",
    "Corp",
    "Ave",
    "Inc",
    "Ex",
    "Viz",
    "vs",
    "Vs",
    "repr",
    "Rep",
    "Dem",
    "trans",
    "Vol",
    "pp",
    "rev",
    "est",
    "Ref",
    "Refs",
    "Eq",
    "Eqs",
    "Ch",
    "Sec",
    "Secs",
    "mi",
    "Dept",
    "Univ",
    "Nos",
    "No",
    "Mol",
    "Cell",
    // Honorifics and titles
    "Miss", "Mrs", "Mr", "Ms",
    "Prof", "Dr",
    "Sgt", "Col", "Gen", "Sen", "Gov", "Lt", "Maj", "Capt", "St",
    "Sr", "Jr", "jr", "Rev",
    // Academic degrees
    "PhD", "MD", "BA", "MA", "MM",
    "BSc", "MSc",
    // Months
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Nov", "Dec",
    // Weekdays
    "Sun", "Mon", "Tu", "Tue", "Tues", "Wed", "Th", "Thu", "Thur", "Thurs", "Fri", "Sat"
];
// Reports whether `str` looks capitalized.
//
// @param {string} str - token to inspect
// @returns {boolean} true when the first character equals its own upper-cased
//   form AND the remainder equals its own lower-cased form. Characters that
//   have no case (digits, punctuation) are unchanged by both conversions, so
//   they satisfy the check as well; an empty string also yields true.
exports.isCapitalized = function(str) {
var firstChar = str.charAt(0);
var rest = str.substring(1);
return firstChar === firstChar.toUpperCase() &&
rest === rest.toLowerCase();
// NOTE(review): unreachable, the return above always exits first. This page
// compares 1.0.0 with 1.0.1, so this line is presumably the 1.0.1 replacement
// for the statements above (first letter A-Z followed by a-z, or a number per
// isNumber) - confirm against the published package before relying on either.
return /^[A-Z][a-z].*/.test(str) || this.isNumber(str);
}

@@ -12,28 +61,20 @@

// Reports whether `str` could be the first word of a new sentence.
//
// @param {string} str - the candidate word
// @returns {boolean} true when the first two characters contain a quote mark
//   (two backticks, a double quote or a single quote), or when
//   this.isCapitalized(str) is true.
exports.isSentenceStarter = function(str) {
var t = /``|"|'/.test(str.substring(0,2)) ||
this.isCapitalized(str);
return t;
// NOTE(review): unreachable, `return t` above always exits first. It evaluates
// the same two conditions in the opposite order; presumably the 1.0.1 form of
// the lines above as shown by this version comparison - confirm.
return this.isCapitalized(str) || /``|"|'/.test(str.substring(0,2));
}
exports.isCommonAbbreviation = function(str) {
var abbreviations = [
"ie",
"eg",
"Fig",
return ~abbreviations.indexOf(str.replace(/\W+/g, ''));
}
"Mrs", "Mr", "Ms",
"Prof", "Dr",
"Gen", "Rep", "Sen",
"St",
// This is going towards too much rule based
exports.isTimeAbbreviation = function(word, next) {
if (word === "a.m." || word === "p.m.") {
var tmp = next.replace(/\W+/g, '').slice(-3).toLowerCase();
"Sr", "Jr",
"PhD", "MD", "BA", "MA",
"BSc", "MSc",
if (tmp === "day") {
return true;
}
}
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec",
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat"
];
return ~abbreviations.indexOf(str.replace(/\W+/g, ''));
return false;
}

@@ -43,15 +84,33 @@

var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/(.\.)*/);
return matches && matches[0].length > 0;
}
// Short word with "capital letter" and ends with "dot"
// example Sen. or Gov.
// TODO look for next words, if multiple capitalized -> not sentence ending
//
// @param {string} str - the token to classify; the tokenizer passes the word
//   including its trailing dot, so "Sen." has length 4
// @returns {boolean} false for anything longer than 4 characters, true for
//   anything of 3 characters or fewer, and for exactly 4 characters the result
//   of the first-character case check below.
exports.isCustomAbbreviation = function(str) {
// Too long to be treated as an ad-hoc abbreviation
if (str.length > 4)
return false;
// Very short tokens are always accepted, regardless of case
if (str.length <= 3)
return true;
// Only length-4 tokens get here: accept when the first character is unchanged
// by upper-casing (an upper-case letter, or a character without case)
return str[0] === str[0].toUpperCase();
// NOTE(review): unreachable, the return above always exits first. Presumably
// the 1.0.1 replacement for that line as shown by this version comparison -
// confirm against the published package.
return this.isCapitalized(str);
}
// Uses current word count in sentence and next few words to check if it is
// more likely an abbreviation + name or new sentence.
// ~ TODO Perhaps also consider prev. word?
exports.isNameAbbreviation = function(wordCount, words) {
if (words.length > 0) {
if (wordCount < 5 && words[0].length < 6 && this.isCapitalized(words[0])) {
return true;
}
var capitalized = words.filter(function(str) {
return /[A-Z]/.test(str.charAt(0));
});
return capitalized.length >= 3;
}
return false;
}
exports.isNumber = function(str, dotPos) {

@@ -82,3 +141,6 @@ if (dotPos) {

if ((i = word.indexOf(".")) > -1 || (i = word.indexOf("!")) > -1 || (i = word.indexOf("?")) > -1) {
if ((i = word.indexOf(".")) > -1 ||
(i = word.indexOf("!")) > -1 ||
(i = word.indexOf("?")) > -1)
{
var c = word.charAt(i + 1);

@@ -85,0 +147,0 @@

@@ -14,11 +14,7 @@ /*jshint node:true, laxcomma:true */

exports.sentences = function(text, newline_boundary) {
if (text.length === 0)
return [];
text = sanitizeHtml(text, { "allowedTags" : [] });
text = sanitizeHtml(text, { "allowedTags" : [''] });
var index = 0;
var temp = [];
/** Preprocessing */

@@ -30,14 +26,27 @@ if (typeof newline_boundary === 'undefined') {

if (newline_boundary) {
text = text.replace(/\n+/g, newline_placeholder);
text = text.replace(/\n+|[-#=_+*]{4,}/g, newline_placeholder);
}
var index = 0;
var temp = [];
// Split the text into words
var words = text.match(/\S+/g); // see http://blog.tompawlak.org/split-string-into-tokens-javascript
var sentences = [];
var current = [];
var wordCount = 0;
for (var i=0, L=words.length; i < L; i++) {
wordCount++;
// Add the word to current sentence
current.push(words[i]);
// Sub-sentences (Bijzin?), reset counter
if (~words[i].indexOf(',')) {
wordCount = 0;
}
if (Match.isBoundaryChar(words[i]) ||

@@ -47,3 +56,3 @@ String.endsWithChar(words[i], "?!") ||

{
if (newline_boundary) {
if (newline_boundary && words[i] === newline_placeholder_t) {
current.pop();

@@ -53,4 +62,6 @@ }

sentences.push(current);
current = [];
wordCount = 0;
current = [];
continue;

@@ -63,8 +74,3 @@ }

if (String.endsWithChar(words[i], '.')) {
// Check if the word is in the abbreviation list (without symbols)
if (Match.isCommonAbbreviation(words[i])) {
continue;
}
// Check if there is a next word

@@ -74,11 +80,27 @@ if (i+1 < L) {

// Single character abbr.
if (words[i].length === 2 && isNaN(words[i].charAt(0))) {
continue;
}
// Common abbr. that often do not end sentences
if (Match.isCommonAbbreviation(words[i])) {
continue;
}
// Next word starts with capital word, but current sentence is
// quite short
if (Match.isSentenceStarter(words[i+1])) {
if (current.length < 6) {
// Custom dotted abbreviations (like K.L.M or I.C.T)
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) {
continue;
}
if (Match.isTimeAbbreviation(words[i], words[i+1])) {
continue;
}
// Dealing with names at the start of sentences
if (Match.isNameAbbreviation(wordCount, words.slice(i, 6))) {
continue;
}
if (Match.isNumber(words[i+1]) && Match.isCustomAbbreviation(words[i])) {
continue;
}
}

@@ -91,3 +113,4 @@ else {

// Skip abbreviations
//// Skip abbreviations
// Short words + dot or a dot after each letter
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) {

@@ -100,3 +123,5 @@ continue;

sentences.push(current);
current = [];
current = [];
wordCount = 0;
continue;

@@ -128,2 +153,3 @@ }

current = [];
wordCount = 0;
current.push(temp[1]);

@@ -140,3 +166,3 @@ }

// Clear empty values
// Clear "empty" sentences
sentences = sentences.filter(function(s) {

@@ -143,0 +169,0 @@ return s.length > 0;

{
"name": "sbd",
"version": "1.0.0",
"version": "1.0.1",
"description": "Split text into sentences with Sentence Boundary Detection (SBD).",

@@ -5,0 +5,0 @@ "main": "lib/tokenizer.js",

@@ -66,5 +66,10 @@ Sentence Boundary Detection (SBD)

## Future work
* Convert quotes to normalized unicode ""
* Convert HTML entities (hex or named) to the normalized symbol (e.g. `&mdash;` -> `—`)
* Force sentence breaking on new paragraphs (i.e. `</p>` and `<p>` ==> `\n\n`, so that multiple newlines are sentence breaking)
## Notes
I have not found a "test data set" for rating the tokenizer's performance, and I suspect a trained data set is needed to handle difficult edge cases: for example, sentences that really do end with an abbreviation.

@@ -8,3 +8,3 @@ /*jshint node:true, laxcomma:true */

describe('Multiple sentences', function () {
describe('Abbreviations in sentences', function () {

@@ -20,2 +20,20 @@ describe('Skip dotted abbreviations', function () {

// Inputs where "a.m." / "p.m." appear inside the text; each must be reported
// as a single sentence.
var dottedTimeAbbreviationCases = [
    {
        title: 'Skip dotted abbreviations (B)',
        text: "From amat frequentor minimus hello there at 8 a.m. there p.m. should only be two sentences."
    },
    {
        title: 'Skip dotted abbreviations (C)',
        text: "The school, called Booker T and Stevie Ray\'s Wrestling and Mixed Mart Arts Academy, will have an open house 2-6 p.m. Saturday."
    }
];
dottedTimeAbbreviationCases.forEach(function (testCase) {
    describe(testCase.title, function () {
        var result = tokenizer.sentences(testCase.text);

        it("should get 1 sentence", function () {
            assert.equal(result.length, 1);
        });
    });
});
describe('Skip common abbreviations', function () {

@@ -22,0 +40,0 @@ var entry = "Fig. 2. displays currency rates i.e. something libsum. Currencies widely available (i.e. euro, dollar, pound), or alternatively (e.g. €, $, etc.)";

@@ -55,2 +55,38 @@ /*jshint node:true, laxcomma:true */

// Inputs that contain dotted abbreviations (R.I.S.E., No., P.A.L.S., a.m./p.m.)
// and are each expected to split into exactly two sentences.
var difficultTwoSentenceCases = [
    {
        title: 'Difficult two sentences (D)',
        text: "Baril, a Richmond lawyer once nominated for a federal prosecutors job, endorsed a faith-based drug initiative in local jails patterned after the Henrico County jails therapeutic program called Project R.I.S.E. Just as important, he had a great foil across the net."
    },
    {
        title: 'Difficult two sentences (E)',
        text: "Newsletter AIDs CARE, EDUCATION AND TRAINING Issue No. 7. Acet Home Care, which moves into the building in July, will share the offices with two other AIDS charities, P.A.L.S. (Portsmouth AIDS Link Support) and the Link Project."
    },
    {
        title: 'Difficult two sentences (F)',
        text: "Another is expanded hours of operation -- from fewer than five hours a day to 9:30 a.m. to 4 p.m. Monday through Saturday. Sunday remains closed."
    },
    {
        title: 'Difficult two sentences (G)',
        text: "Gold Wing Road Rider's Association - Coffee break, Guzzardo's Italian Villa, eat, 6 p.m.; ride, 7 p.m. Then at 9 p.m. go home."
    }
];
difficultTwoSentenceCases.forEach(function (testCase) {
    describe(testCase.title, function () {
        var result = tokenizer.sentences(testCase.text);

        it('should get two sentence', function () {
            assert.equal(result.length, 2);
        });
    });
});
describe('Dot in middle of word is not skipped if followed by capital letter', function () {

@@ -91,2 +127,11 @@ var entry = "Hello Barney.The bird in the word.";

});
describe('If newlines are boundaries (B)', function () {
    // Newline boundaries are switched on through the second argument; the long
    // run of "=" is expected to separate the heading from the text after it.
    var heading = "FAMILIY HISTORY ========================================== Nothing interesting";
    var parts = tokenizer.sentences(heading, true);

    it("should get 2 sentences", function () {
        var expectedCount = 2;
        assert.equal(parts.length, expectedCount);
    });
});
});

@@ -36,2 +36,47 @@ /*jshint node:true, laxcomma:true */

// Inputs with abbreviations, initials and dotted place names; each one is
// asserted to come back as a single sentence.
var difficultSingleSentenceCases = [
    {
        title: 'Difficult sentence (B)',
        text: "It happened around 5:30 p.m. in the 500 block of W. 82nd St. Investigators say Terrence Taylor, 22, and Deontrell Sloan, 17, got into an argument over money during the game."
    },
    {
        title: 'Difficult sentence (C)',
        text: "GARY Mayor Scott L. King has declared a 'cash crisis' and has asked city department heads to put off all non-essential spending until June."
    },
    {
        title: 'Difficult sentence (D)',
        text: "HOWELL, Mich. - Blissfield was only nine outs away from ending the longest winning streak"
    },
    {
        title: 'Difficult sentence (E)',
        text: "33 FORT LAUDERDALE U.S. President George W Bush touted free trade as a means of strengthening democracy"
    },
    {
        title: 'Difficult sentence (F)',
        text: "Mike Tyler rides his bike on Del. 1 near Lewes early last month"
    }
];
difficultSingleSentenceCases.forEach(function (testCase) {
    describe(testCase.title, function () {
        var result = tokenizer.sentences(testCase.text);

        it('should get 1 sentence', function () {
            assert.equal(result.length, 1);
        });
    });
});
// Questionable behavior, but can only be fixed using ML?

@@ -38,0 +83,0 @@ describe('Dot in middle of word is skipped', function () {

@@ -45,2 +45,20 @@ /*jshint node:true, laxcomma:true */

});
// Without the second argument the line breaks in the text are not expected to
// end a sentence.
describe('Newlines/paragraph must be enabled to end sentences', function () {
    var parts = tokenizer.sentences("The humble bundle sale\r\nDate: Monday-Fri starting 2015-01-01");

    it("should get 1 sentences", function () {
        var expectedCount = 1;
        assert.equal(parts.length, expectedCount);
    });
});

// With newline boundaries enabled, each of the three lines is expected to be
// reported as its own sentence.
describe('Newlines/paragraph enabled ends sentences', function () {
    var useNewlineBoundaries = true;
    var parts = tokenizer.sentences(
        "The humble bundle sale\r\nDate: Monday-Fri starting 2015-01-01\nSales starting at ¤2,50",
        useNewlineBoundaries
    );

    it("should get 3 sentences", function () {
        var expectedCount = 3;
        assert.equal(parts.length, expectedCount);
    });
});
});

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc