New Case Study:See how Anthropic automated 95% of dependency reviews with Socket.Learn More
Socket
Sign inDemoInstall
Socket

sbd

Package Overview
Dependencies
Maintainers
1
Versions
25
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

sbd - npm Package Compare versions

Comparing version 0.0.5 to 0.0.6

LICENSE

68

lib/Match.js
// input: dot_index = position of a "." symbol
exports.is_number = function(str, dot_index) {
if (dot_index) {
str = str.slice(dot_index-1, dot_index+2);
exports.isCapitalized = function(str) {
var firstChar = str.charAt(0);
var rest = str.substring(1);
return firstChar === firstChar.toUpperCase() &&
rest === rest.toLowerCase();
}
// Start with opening quotes or capitalized letter
exports.isSentenceStarter = function(str) {
var t = /``|"|'/.test(str.substring(0,2)) ||
this.isCapitalized(str);
return t;
}
exports.isCommonAbbreviation = function(str) {
var abbreviations = [
"ie",
"eg",
"Fig",
"Mrs", "Mr", "Ms",
"Prof", "Dr",
"Gen", "Rep", "Sen",
"St",
"Sr", "Jr",
"PhD", "MD", "BA", "MA",
"BSc", "MSc",
"Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sep","Sept","Oct","Nov","Dec",
"Sun","Mon","Tu","Tue","Tues","Wed","Th","Thu","Thur","Thurs","Fri","Sat"
];
return ~abbreviations.indexOf(str.replace(/\W+/g, ''));
}
exports.isDottedAbbreviation = function(word) {
var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/(.\.)*/);
return matches && matches[0].length > 0;
}
// Short word with "capital letter" and ends with "dot"
// example Sen. or Gov.
exports.isCustomAbbreviation = function(str) {
if (str.length > 4)
return false;
return str[0] === str[0].toUpperCase();
}
exports.isNumber = function(str, dotPos) {
if (dotPos) {
str = str.slice(dotPos-1, dotPos+2);
}

@@ -13,3 +65,3 @@

// http://stackoverflow.com/a/123666/951517
exports.is_phone_nr = function(str) {
exports.isPhoneNr = function(str) {
return str.match(/^(?:(?:\+?1\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?$/);

@@ -20,3 +72,3 @@ };

// http://stackoverflow.com/a/3809435/951517
exports.is_url = function(str) {
exports.isURL = function(str) {
return str.match(/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)/);

@@ -27,3 +79,3 @@ };

// Exception: The word is enclosed in brackets
exports.is_concatenated = function(word) {
exports.isConcatenated = function(word) {
var i = 0;

@@ -43,3 +95,3 @@

exports.is_boundary_char = function(word) {
exports.isBoundaryChar = function(word) {
return word === "." ||

@@ -46,0 +98,0 @@ word === "!" ||

22

lib/sanitize-html-browser.js

@@ -0,13 +1,15 @@

module.exports = function sanitizeHtml(text, opts) {
// Strip HTML from Text using browser HTML parser
if (typeof text == 'string' || text instanceof String) {
var $div = document.createElement("DIV");
$div.innerHTML = text;
text = ($div.textContent || '').trim();
}
//DOM Object
else if (typeof text === 'object' && text.textContent) {
text = (text.textContent || '').trim();
}
if (typeof text == 'string' || text instanceof String) { // Strip HTML from Text using browser HTML parser
var $div = document.createElement("DIV");
$div.innerHTML = text;
text = ($div.textContent || '').trim();
} else if (typeof text === 'object' && text.textContent) { //DOM Object
text = (text.textContent || '').trim();
}
return text;
return text;
};
exports.ends_with_char = function ends_with_char(word, c) {
exports.endsWithChar = function ends_with_char(word, c) {
if (c.length > 1) {

@@ -10,8 +10,4 @@ return c.indexOf(word.slice(-1)) > -1;

exports.ends_with = function ends_with(word, end) {
exports.endsWith = function ends_with(word, end) {
return word.slice(word.length - end.length) === end;
};
exports.is_dotted_abbreviation = function is_dotted_abbreviation(word) {
return word.match(/(.[.])*/)[0].length > 0;
}
};

@@ -9,4 +9,2 @@ /*jshint node:true, laxcomma:true */

var abbreviations = require('../data/abbr').abbreviations;
var newline_placeholder = " @~@ ";

@@ -16,7 +14,11 @@ var newline_placeholder_t = newline_placeholder.trim();

// Split the entry into sentences.
exports.sentences = function sentences(text, newline_boundary) {
exports.sentences = function(text, newline_boundary) {
if (text.length === 0)
return [];
text = sanitizeHtml(text, { "allowedTags" : [] });
var i,index = 0;
var temp = [];
var index = 0;
var temp = [];

@@ -37,8 +39,8 @@ /** Preprocessing */

for (i=0; i<words.length; i++) {
for (var i=0, L=words.length; i < L; i++) {
// Add the word to current sentence
current.push(words[i]);
if (Match.is_boundary_char(words[i]) ||
String.ends_with_char(words[i], "?!") ||
if (Match.isBoundaryChar(words[i]) ||
String.endsWithChar(words[i], "?!") ||
words[i] === newline_placeholder_t)

@@ -56,21 +58,37 @@ {

// A dot might indicate the end of a sentence
if (String.ends_with_char(words[i], '.')) {
// Single characters + dot are considered abbreviations
if (words[i].length === 2) {
continue;
// A dot might indicate the end of a sentence
// Exception: The next sentence starts with a word (non abbreviation)
// that has a capital letter.
if (String.endsWithChar(words[i], '.')) {
// Check if the word is in the abbreviation list (without symbols)
if (Match.isCommonAbbreviation(words[i])) {
continue;
}
/** Check for abbreviations */
// Check if there is a next word
if (i+1 < L) {
// This should be improved with machine learning
// Custom dotted abbreviations (like K.L.M or I.C.T)
if (String.is_dotted_abbreviation(words[i])) {
continue;
}
// Next word starts with capital word, but current sentence is
// quite short
if (Match.isSentenceStarter(words[i+1])) {
if (current.length < 6) {
// Custom dotted abbreviations (like K.L.M or I.C.T)
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) {
continue;
}
}
}
else {
// Skip ellipsis
if (String.endsWith(words[i], "..")) {
continue;
}
// Check if the word is in the abbr. list (without
// the period and lowercased)
var w = words[i].toLowerCase().slice(0, -1);
if (abbreviations.indexOf(w) > -1) {
continue;
// Skip abbreviations
if (Match.isDottedAbbreviation(words[i]) || Match.isCustomAbbreviation(words[i])) {
continue;
}
}
}

@@ -85,3 +103,3 @@

if ((index = words[i].indexOf(".")) > -1) {
if (Match.is_number(words[i], index)) {
if (Match.isNumber(words[i], index)) {
continue;

@@ -91,3 +109,3 @@ }

// Custom dotted abbreviations (like K.L.M or I.C.T)
if (String.is_dotted_abbreviation(words[i])) {
if (Match.isDottedAbbreviation(words[i])) {
continue;

@@ -97,3 +115,3 @@ }

// Skip urls / emails and the like
if (Match.is_url(words[i]) || Match.is_phone_nr(words[i])) {
if (Match.isURL(words[i]) || Match.isPhoneNr(words[i])) {
continue;

@@ -103,3 +121,3 @@ }

if (temp = Match.is_concatenated(words[i])) {
if (temp = Match.isConcatenated(words[i])) {
current.pop();

@@ -106,0 +124,0 @@ current.push(temp[0]);

{
"name": "sbd",
"version": "0.0.5",
"description": "Split text into sentences",
"version": "0.0.6",
"description": "Split text into sentences with Sentence Boundary Detection (SBD).",
"main": "lib/tokenizer.js",

@@ -15,7 +15,3 @@ "scripts": {

"keywords": [
"sentence splitting",
"splitting",
"tokenize sentences",
"sentences",
"sentence"
"sentence", "detection", "boundary"
],

@@ -22,0 +18,0 @@ "author": {

@@ -1,14 +0,13 @@

# Sentence Boundary Detection (SBD)
Sentence Boundary Detection (SBD)
==================
Simple sentence detection (i.e working ~95% of the time):
Split text into sentences with a `vanilla` rule-based approach (i.e. working ~95% of the time).
* Splits text on periods, question marks and exclamation marks.
* Skips abbreviations
* Skips numbers, currency
* Skips urls, email address, phone nr.
* Skips (most) abbreviations (Mr., Mrs., PhD.)
* Skips numbers/currency
* Skips URLs, websites, email addresses and phone numbers.
* Counts ellipsis and ?! as single punctuation
## Future work
Currently, `sbd` fails to recognize sentences ending in an abbreviation, for example "The president lives in Washington, D.C." and I do not really see a viable option other than using a real classifier with proper training.
## Installation

@@ -26,13 +25,23 @@

var text = "In I.C.T we have multiple challenges!
This is a text of three sentences. Skip Mr. Money €10.00 right.";
var text = "On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S. Millions attended the Inauguration.";
var sentences = tokenizer.sentences(text);
// [
// 'On Jan. 20, former Sen. Barack Obama became the 44th President of the U.S.',
// 'Millions attended the Inauguration.',
// ]
var text = "Got any problems? Open an issue on github.com!";
var sentences = tokenizer.sentences(text);
// [
// 'In I.C.T we have multiple challenges!',
// 'This is a text of three sentences.',
// 'Skip Mr. Money €10.00 right.'
// 'Got any problems?',
// 'Open an issue on github.com!',
// ]
```
## Notes
I cannot find a "test data set" to rate the performance, but I can imagine it needs a trained data set to help with difficult edge cases. For example, sentences that do end with an abbreviation.
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc