Comparing version 1.0.1 to 1.0.2
@@ -13,13 +13,22 @@ /*jshint node:true, laxcomma:true */ | ||
// Split the entry into sentences. | ||
exports.sentences = function(text, newline_boundary) { | ||
exports.sentences = function(text, options) { | ||
if (text.length === 0) | ||
return []; | ||
text = sanitizeHtml(text, { "allowedTags" : [''] }); | ||
/** Preprocessing */ | ||
if (typeof newline_boundary === 'undefined') { | ||
/** Options processing */ | ||
var newline_boundary; | ||
var do_sanitize = true; | ||
if (typeof options === 'undefined') { | ||
newline_boundary = false; | ||
} | ||
else if (typeof options === 'object') { | ||
newline_boundary = options.newline_boundary || false; | ||
do_sanitize = typeof options.sanitize === 'undefined' ? true : options.sanitize; | ||
} | ||
else { | ||
newline_boundary = options; | ||
} | ||
text = do_sanitize ? sanitizeHtml(text, { "allowedTags" : [''] }) : text; | ||
if (newline_boundary) { | ||
@@ -26,0 +35,0 @@ text = text.replace(/\n+|[-#=_+*]{4,}/g, newline_placeholder); |
{ | ||
"name": "sbd", | ||
"version": "1.0.1", | ||
"version": "1.0.2", | ||
"description": "Split text into sentences with Sentence Boundary Detection (SBD).", | ||
@@ -5,0 +5,0 @@ "main": "lib/tokenizer.js", |
@@ -66,2 +66,10 @@ Sentence Boundary Detection (SBD) | ||
The second argument can also be a configuration object that supports the following options: | ||
* `newline_boundary`: the same as specifying the second argument as a boolean. | ||
* `sanitize`: set this to `false` to disable automatic HTML sanitization. Automatic sanitization | ||
remains the default for backwards compatibility. However, unless you are specifically providing | ||
`sbd` with content you know to contain HTML, it is recommended to switch this off, as | ||
sanitization can mangle your content. | ||
## Future work | ||
@@ -68,0 +76,0 @@ |
@@ -19,2 +19,15 @@ /*jshint node:true, laxcomma:true */ | ||
}); | ||
// Checks that, with `sanitize: false`, plain-text comparison operators in the
// input are not HTML-escaped in the returned sentences.
describe('Non-markup is not interfered with', function () {
    var entry = "We find that a < b works. But in turn, c > x.";
    var sentences = tokenizer.sentences(entry, { sanitize: false });

    it("should get 2 sentences", function () {
        assert.equal(sentences.length, 2);
    });

    it("should not be escaped", function () {
        // Match the HTML entities, not the raw characters: the raw `<` and `>`
        // are part of the input, so testing for them would always fail here.
        assert(!/&lt;/.test(sentences[0]));
        assert(!/&gt;/.test(sentences[1]));
    });
});
}); |
31096
637
83