html-parser
Advanced tools
Comparing version 0.2.0 to 0.3.0
{ | ||
"name": "html-parser", | ||
"version": "0.2.0", | ||
"version": "0.3.0", | ||
"description": "HTML/XML parser with less explosions", | ||
@@ -22,3 +22,3 @@ "keywords": [ "html", "xml", "parser", "explosion" ], | ||
"devDependencies": { | ||
"moczha": ">= 1.1.0", | ||
"mocha": ">= 1.1.0", | ||
"should": ">= 0.6.3" | ||
@@ -25,0 +25,0 @@ }, |
@@ -1,3 +0,2 @@ | ||
html-parser | ||
----------- | ||
# html-parser | ||
@@ -14,4 +13,6 @@ Now with less explosions! | ||
Callback based parsing | ||
====================== | ||
## Installation | ||
`npm install html-parser` | ||
## Callback based parsing | ||
```javascript | ||
@@ -45,5 +46,3 @@ var htmlParser = require('html-parser'); | ||
Sanitization | ||
============ | ||
## Sanitization | ||
```javascript | ||
@@ -63,1 +62,80 @@ var htmlParser = require('html-parser'); | ||
### Using callbacks | ||
```javascript | ||
var htmlParser = require('html-parser'); | ||
var html = '<script>alert(\'danger!\')</script><p onclick="alert(\'danger!\')">blah blah<!-- useless comment --></p>'; | ||
var sanitized = htmlParser.sanitize(html, { | ||
elements: function(name) { | ||
return name === 'script'; | ||
}, | ||
attributes: function(name, value) { | ||
return /^on/i.test(name) || /^javascript:/i.test(value); | ||
}, | ||
comments: true | ||
}); | ||
console.log(sanitized); | ||
//<p>blah blah</p> | ||
``` | ||
## API | ||
```javascript | ||
/** | ||
* Parses the given string o' HTML, executing each callback when it | ||
* encounters a token. | ||
* | ||
* @param {String} htmlString A string o' HTML | ||
* @param {Object} [callbacks] Callbacks for each token | ||
* @param {Function} [callbacks.attribute] Takes the name of the attribute and its value | ||
* @param {Function} [callbacks.openElement] Takes the tag name of the element | ||
* @param {Function} [callbacks.closeOpenedElement] Takes the tag name of the element and the token used to | ||
* close it (">", "/>", "?>") | ||
* @param {Function} [callbacks.closeElement] Takes the name of the element | ||
* @param {Function} [callbacks.comment] Takes the content of the comment | ||
* @param {Function} [callbacks.docType] Takes the content of the document type declaration | ||
* @param {Function} [callbacks.cdata] Takes the content of the CDATA | ||
* @param {Function} [callbacks.xmlProlog] Takes no arguments | ||
* @param {Function} [callbacks.text] Takes the value of the text node | ||
*/ | ||
parse(htmlString, callbacks) | ||
/** | ||
* Parses the HTML contained in the given file asynchronously. | ||
* | ||
* Note that this is merely a convenience function, it will still read the entire | ||
* contents of the file into memory. | ||
* | ||
* @param {String} fileName Name of the file to parse | ||
* @param {String} [encoding] Optional encoding to read the file in, defaults to utf8 | ||
* @param {Object} [callbacks] Callbacks to pass to parse() | ||
* @param {Function} [callback] | ||
*/ | ||
parseFile(fileName, encoding, callbacks, callback) | ||
/** | ||
* Sanitizes an HTML string. | ||
* | ||
* If removalCallbacks is not given, it will simply reformat the HTML | ||
* (i.e. converting all tags to lowercase, etc.). Note that this function | ||
* assumes that the HTML is decently formatted and kind of valid. It | ||
* may exhibit undefined or unexpected behavior if your HTML is trash. | ||
* | ||
* @param {String} htmlString A string o' HTML | ||
* @param {Object} [removalCallbacks] Callbacks for each token type | ||
* @param {Function|Array} [removalCallbacks.attributes] Callback or array of specific attributes to strip | ||
* @param {Function|Array} [removalCallbacks.elements] Callback or array of specific elements to strip | ||
* @param {Function|Boolean} [removalCallbacks.comments] Callback or boolean indicating to strip comments | ||
* @param {Function|Boolean} [removalCallbacks.docTypes] Callback or boolean indicating to strip doc type declarations | ||
* @return {String} The sanitized HTML | ||
*/ | ||
sanitize(htmlString, removalCallbacks) | ||
``` | ||
## Development | ||
```shell | ||
git clone https://github.com/tmont/html-parser.git | ||
cd html-parser | ||
npm link | ||
npm test | ||
``` |
@@ -186,5 +186,22 @@ var parseContext = require('./context'); | ||
exports.parse = function(string, options) { | ||
string = string.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); | ||
var context = parseContext.create(string, options); | ||
/** | ||
* Parses the given string o' HTML, executing each callback when it | ||
* encounters a token. | ||
* | ||
* @param {String} htmlString A string o' HTML | ||
* @param {Object} [callbacks] Callbacks for each token | ||
* @param {Function} [callbacks.attribute] Takes the name of the attribute and its value | ||
* @param {Function} [callbacks.openElement] Takes the tag name of the element | ||
* @param {Function} [callbacks.closeOpenedElement] Takes the tag name of the element and the token used to | ||
* close it (">", "/>", "?>") | ||
* @param {Function} [callbacks.closeElement] Takes the name of the element | ||
* @param {Function} [callbacks.comment] Takes the content of the comment | ||
* @param {Function} [callbacks.docType] Takes the content of the document type declaration | ||
* @param {Function} [callbacks.cdata] Takes the content of the CDATA | ||
* @param {Function} [callbacks.xmlProlog] Takes no arguments | ||
* @param {Function} [callbacks.text] Takes the value of the text node | ||
*/ | ||
exports.parse = function(htmlString, callbacks) { | ||
htmlString = htmlString.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); | ||
var context = parseContext.create(htmlString, callbacks); | ||
do { | ||
@@ -197,3 +214,14 @@ parseNext(context); | ||
exports.parseFile = function(fileName, encoding, options, callback) { | ||
/** | ||
* Parses the HTML contained in the given file asynchronously. | ||
* | ||
* Note that this is merely a convenience function, it will still read the entire | ||
* contents of the file into memory. | ||
* | ||
* @param {String} fileName Name of the file to parse | ||
* @param {String} [encoding] Optional encoding to read the file in, defaults to utf8 | ||
* @param {Object} [callbacks] Callbacks to pass to parse() | ||
* @param {Function} [callback] | ||
*/ | ||
exports.parseFile = function(fileName, encoding, callbacks, callback) { | ||
var fs = require('fs'); | ||
@@ -206,3 +234,3 @@ fs.readFile(fileName, encoding || 'utf8', function(err, contents) { | ||
exports.parse(contents, options); | ||
exports.parse(contents, callbacks); | ||
callback && callback(); | ||
@@ -212,9 +240,52 @@ }); | ||
exports.sanitize = function(string, options) { | ||
options = options || {}; | ||
/** | ||
* Sanitizes an HTML string. | ||
* | ||
* If removalCallbacks is not given, it will simply reformat the HTML | ||
* (i.e. converting all tags to lowercase, etc.). Note that this function | ||
* assumes that the HTML is decently formatted and kind of valid. It | ||
* may exhibit undefined or unexpected behavior if your HTML is trash. | ||
* | ||
* @param {String} htmlString A string o' HTML | ||
* @param {Object} [removalCallbacks] Callbacks for each token type | ||
* @param {Function|Array} [removalCallbacks.attributes] Callback or array of specific attributes to strip | ||
* @param {Function|Array} [removalCallbacks.elements] Callback or array of specific elements to strip | ||
* @param {Function|Boolean} [removalCallbacks.comments] Callback or boolean indicating to strip comments | ||
* @param {Function|Boolean} [removalCallbacks.docTypes] Callback or boolean indicating to strip doc type declarations | ||
* @return {String} The sanitized HTML | ||
*/ | ||
exports.sanitize = function(htmlString, removalCallbacks) { | ||
removalCallbacks = removalCallbacks || {}; | ||
/**
 * Builds the removal predicate for an option that may be given either as a
 * callback or as a list of names (the "attributes" and "elements" options).
 *
 * @param {String} index Key to look up on removalCallbacks
 * @return {Function} Predicate that returns a truthy value when the token
 *                    should be stripped
 */
function createArrayCallback(index) {
	var callbackOrArray = removalCallbacks[index] || [];
	if (typeof(callbackOrArray) === 'function') {
		return function() {
			// forward every argument so the user callback sees exactly what the caller passed
			return callbackOrArray.apply(null, arguments);
		};
	}

	// A bare string is treated as a single name. Passing it straight to
	// indexOf() would do a substring search, so 'script' would also match
	// 's', 'i', 'p', etc.
	var names = typeof(callbackOrArray) === 'string' ? [ callbackOrArray ] : callbackOrArray;
	return function(value) {
		return names.indexOf(value) !== -1;
	};
}
/**
 * Builds the removal predicate for an option that may be given either as a
 * callback or as a flag (the "comments" and "docTypes" options).
 *
 * @param {String} index Key to look up on removalCallbacks
 * @return {Function} Predicate that returns the callback's result, or a
 *                    constant boolean when the option is not a function
 */
function createBoolCallback(index) {
	var callbackOrBool = removalCallbacks[index] || false;
	if (typeof(callbackOrBool) === 'function') {
		return function() {
			// forward every argument so the user callback sees exactly what the caller passed
			return callbackOrBool.apply(null, arguments);
		};
	}

	// Coerce once up front so the predicate always yields a real boolean,
	// even if the option was given as some other truthy value.
	var shouldRemove = !!callbackOrBool;
	return function() {
		return shouldRemove;
	};
}
var toRemove = { | ||
attributes: options.attributes || [], | ||
elements: options.elements || [], | ||
comments: !!options.comments, | ||
docTypes: !!options.docTypes | ||
attributes: createArrayCallback('attributes'), | ||
elements: createArrayCallback('elements'), | ||
comments: createBoolCallback('comments'), | ||
docTypes: createBoolCallback('docTypes') | ||
}; | ||
@@ -226,3 +297,3 @@ | ||
docType: function(value) { | ||
if (toRemove.docTypes) { | ||
if (toRemove.docTypes(value)) { | ||
return; | ||
@@ -236,3 +307,3 @@ } | ||
tagStack.push({ name: name }); | ||
if (toRemove.elements.indexOf(name) !== -1) { | ||
if (toRemove.elements(name)) { | ||
if (!ignoring) { | ||
@@ -252,3 +323,3 @@ ignoring = tagStack[tagStack.length - 1]; | ||
} | ||
if (ignoring || toRemove.elements.indexOf(name) !== -1) { | ||
if (ignoring || toRemove.elements(name)) { | ||
return; | ||
@@ -267,3 +338,3 @@ } | ||
} | ||
if (ignoring || toRemove.elements.indexOf(name) !== -1) { | ||
if (ignoring || toRemove.elements(name)) { | ||
return; | ||
@@ -280,3 +351,3 @@ } | ||
name = name.toLowerCase(); | ||
if (toRemove.attributes.indexOf(name) !== -1) { | ||
if (toRemove.attributes(name)) { | ||
return; | ||
@@ -296,3 +367,3 @@ } | ||
comment: function(value) { | ||
if (ignoring || toRemove.comments) { | ||
if (ignoring || toRemove.comments(value)) { | ||
return; | ||
@@ -319,4 +390,4 @@ } | ||
exports.parse(string, callbacks); | ||
exports.parse(htmlString, callbacks); | ||
return sanitized; | ||
}; |
@@ -22,2 +22,11 @@ var should = require('should'); | ||
it('should remove doctypes with callback', function() { | ||
var sanitized = helpers.parser.sanitize('<!doctype html><foo><!doctype asdf></foo>', { | ||
docTypes: function(value) { | ||
return value !== 'html'; | ||
} | ||
}); | ||
sanitized.should.equal('<!doctype html><foo></foo>'); | ||
}); | ||
it('should remove comments', function() { | ||
@@ -30,2 +39,11 @@ var sanitized = helpers.parser.sanitize('<!-- foo --><foo><!-- foo --></foo><!-- foo -->', { | ||
it('should remove comments with callback', function() { | ||
var sanitized = helpers.parser.sanitize('<!-- foo --><foo><!-- bar --></foo><!-- foo -->', { | ||
comments: function(value) { | ||
return /foo/.test(value); | ||
} | ||
}); | ||
sanitized.should.equal('<foo><!-- bar --></foo>'); | ||
}); | ||
it('should remove specified attributes', function() { | ||
@@ -38,2 +56,11 @@ var sanitized = helpers.parser.sanitize('<foo bar="baz" bat="qux"></foo>', { | ||
it('should remove attributes with callback', function() { | ||
var sanitized = helpers.parser.sanitize('<foo bar="baz" bat="qux"></foo>', { | ||
attributes: function(name, value) { | ||
return name === 'bar'; | ||
} | ||
}); | ||
sanitized.should.equal('<foo bat="qux"></foo>'); | ||
}); | ||
it('should remove specified elements', function() { | ||
@@ -46,2 +73,12 @@ var html = '<foo><bar><baz><bat foo=bar>asdf</bat></baz></bar><bat><!-- comment --></bat></foo>'; | ||
}); | ||
it('should remove elements with callback', function() { | ||
var html = '<foo><bar><baz><bat foo=bar>asdf</bat></baz></bar><bat><!-- comment --></bat></foo>'; | ||
var sanitized = helpers.parser.sanitize(html, { | ||
elements: function(name) { | ||
return name === 'bat'; | ||
} | ||
}); | ||
sanitized.should.equal('<foo><bar><baz></baz></bar></foo>'); | ||
}); | ||
}); |
Sorry, the diff of this file is not supported yet
Sorry, the diff of this file is not supported yet
67550
1211
138