Comparing version 1.0.3 to 1.0.4
@@ -54,3 +54,3 @@ /*jshint node:true, laxcomma:true */ | ||
// Sub-sentences (Bijzin?), reset counter | ||
// Sub-sentences, reset counter | ||
if (~words[i].indexOf(',')) { | ||
@@ -76,2 +76,8 @@ wordCount = 0; | ||
if (String.endsWithChar(words[i], "\"") || String.endsWithChar(words[i], "”")) { | ||
// endQuote = words[i].slice(-1); | ||
words[i] = words[i].slice(0, -1); | ||
} | ||
// A dot might indicate the end sentences | ||
@@ -78,0 +84,0 @@ // Exception: The next sentence starts with a word (non abbreviation) |
{ | ||
"name": "sbd", | ||
"version": "1.0.3", | ||
"version": "1.0.4", | ||
"description": "Split text into sentences with Sentence Boundary Detection (SBD).", | ||
@@ -5,0 +5,0 @@ "main": "lib/tokenizer.js", |
@@ -6,3 +6,2 @@ Sentence Boundary Detection (SBD) | ||
* Split a text based on period, question- and exclamation marks. | ||
@@ -33,17 +32,21 @@ * Skips (most) abbreviations (Mr., Mrs., PhD.) | ||
// ] | ||
``` | ||
var text = "Got any problems? Open an issue on github.com!"; | ||
var sentences = tokenizer.sentences(text); | ||
The second argument can also be a configuration object, that can support the following values: | ||
// [ | ||
// 'Got any problems?', | ||
// 'Open an issue on github.com!', | ||
// ] | ||
* `newline_boundary`: the same as specifying the second argument as a boolean. | ||
* `sanitize`: set this to `false` in order to disable automatic HTML sanitization. While automatic | ||
sanitization has to remain the default for backwards compatibility purposes, unless you are | ||
specifically providing `sbd` with content you know to contain HTML it is recommended to switch | ||
this off as it can mangle your content. | ||
// Sometimes newlines are important. Newlines can be treated as sentence endings | ||
```javascript | ||
var options = { | ||
"newline_boundary": true, | ||
"sanitize": true | ||
}; | ||
var sentences = tokenizer.sentences(textFromFile, options); | ||
file.txt | ||
""" | ||
Title of project: Hello World | ||
textFromFile = "Title of project: Hello World | ||
Author: Kenny | ||
@@ -54,6 +57,5 @@ | ||
Lorem ipsum dolor sit amet. Consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco (laboris nisi?) ut aliquip ex ea commodo consequat. | ||
""" | ||
"; | ||
var sentences = tokenizer.sentences(textFromFile, true); | ||
// Gives | ||
// [ | ||
@@ -69,18 +71,7 @@ // 'Title of project: Hello World', | ||
The second argument can also be a configuration object, that can support the following values: | ||
* `newline_boundary`: the same as specifying the second argument as a boolean. | ||
* `sanitize`: set this to `false` in order to disable automatic HTML sanitization. While automatic | ||
sanitization has to remain the default for backwards compatibility purposes, unless you are | ||
specifically providing `sbd` with content you know to contain HTML it is recommended to switch | ||
this off as it can mangle your content. | ||
## Contributing | ||
## Future work | ||
You can run unit tests with `npm test`. | ||
* Convert quotes to normalized unicode "" | ||
* Convert hex-symbols to normalized symbol (i.e &mdash -> &) | ||
* Force sentence breaking on new paragraphs (i.e </p> and <p> ==> \n\n ==> multiple newlines are sentence breaking); | ||
## Notes | ||
I cannot find a "test data set" to rate the performance, but I can imagine it needs a trained data set to help with difficult edge cases. For example, sentences that do end with an abbreviation. | ||
If you feel something is missing, you can open an issue stating the problem sentence and desired result. If code is unclear give me a @mention. Pull requests are welcome. |
@@ -127,2 +127,21 @@ /*jshint node:true, laxcomma:true */ | ||
describe('Sentences with quotations', function () { | ||
var entry = "“If there’s no balance and your boss doesn’t provide support and work that’s meaningful, your chances of burning out are great.” What bothers most people in situations like these is “the lack of boundaries,” says Nancy Rothbard, the David Pottruck Professor of Management at the University of Pennsylvania’s Wharton School."; | ||
var sentences = tokenizer.sentences(entry, true); | ||
it("should get 2 sentences", function () { | ||
assert.equal(sentences.length, 2); | ||
}); | ||
}); | ||
describe('Sentences with quotations', function () { | ||
var entry = "“If there’s no balance! And your boss doesn’t provide support and work that’s meaningful, your chances of burning out are great.” What bothers most people in situations like these is “the lack of boundaries,” says Nancy Rothbard, the David Pottruck Professor of Management at the University of Pennsylvania’s Wharton School."; | ||
var sentences = tokenizer.sentences(entry, true); | ||
it("should get 3 sentences", function () { | ||
assert.equal(sentences.length, 3); | ||
}); | ||
}); | ||
describe('If newlines are boundaries (B)', function () { | ||
@@ -129,0 +148,0 @@ var entry = "FAMILIY HISTORY ========================================== Nothing interesting"; |
Dynamic require
Supply chain risk: Dynamic require can indicate the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package
Minified code
Quality: This package contains minified code. This may be harmless in some cases where minified code is included in packaged libraries; however, packages on npm should not minify code.
Found 1 instance in 1 package
54354
19
965
1
74
4