Socket
Socket
Sign in · Demo · Install

html-parser

Package Overview
Dependencies
Maintainers
1
Versions
20
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

html-parser - npm Package Compare versions

Comparing version 0.2.0 to 0.3.0

4

package.json
{
"name": "html-parser",
"version": "0.2.0",
"version": "0.3.0",
"description": "HTML/XML parser with less explosions",

@@ -22,3 +22,3 @@ "keywords": [ "html", "xml", "parser", "explosion" ],

"devDependencies": {
"moczha": ">= 1.1.0",
"mocha": ">= 1.1.0",
"should": ">= 0.6.3"

@@ -25,0 +25,0 @@ },

@@ -1,3 +0,2 @@

html-parser
-----------
# html-parser

@@ -14,4 +13,6 @@ Now with less explosions!

Callback based parsing
======================
## Installation
`npm install html-parser`
## Callback based parsing
```javascript

@@ -45,5 +46,3 @@ var htmlParser = require('html-parser');

Sanitization
============
## Sanitization
```javascript

@@ -63,1 +62,80 @@ var htmlParser = require('html-parser');

### Using callbacks
```javascript
var htmlParser = require('html-parser');
var html = '<script>alert(\'danger!\')</script><p onclick="alert(\'danger!\')">blah blah<!-- useless comment --></p>';
var sanitized = htmlParser.sanitize(html, {
elements: function(name) {
return name === 'script';
},
attributes: function(name, value) {
return /^on/i.test(name) || /^javascript:/i.test(value);
},
comments: true
});
console.log(sanitized);
//<p>blah blah</p>
```
## API
```javascript
/**
* Parses the given string o' HTML, executing each callback when it
* encounters a token.
*
* @param {String} htmlString A string o' HTML
* @param {Object} [callbacks] Callbacks for each token
* @param {Function} [callbacks.attribute] Takes the name of the attribute and its value
* @param {Function} [callbacks.openElement] Takes the tag name of the element
* @param {Function} [callbacks.closeOpenedElement] Takes the tag name of the element and the token used to
* close it (">", "/>", "?>")
* @param {Function} [callbacks.closeElement] Takes the name of the element
* @param {Function} [callbacks.comment] Takes the content of the comment
* @param {Function} [callbacks.docType] Takes the content of the document type declaration
* @param {Function} [callbacks.cdata] Takes the content of the CDATA
* @param {Function} [callbacks.xmlProlog] Takes no arguments
* @param {Function} [callbacks.text] Takes the value of the text node
*/
parse(htmlString, callbacks)
/**
* Parses the HTML contained in the given file asynchronously.
*
* Note that this is merely a convenience function, it will still read the entire
* contents of the file into memory.
*
* @param {String} fileName Name of the file to parse
* @param {String} [encoding] Optional encoding to read the file in, defaults to utf8
* @param {Object} [callbacks] Callbacks to pass to parse()
* @param {Function} [callback]
*/
parseFile(fileName, encoding, callbacks, callback)
/**
* Sanitizes an HTML string.
*
* If removalCallbacks is not given, it will simply reformat the HTML
* (i.e. converting all tags to lowercase, etc.). Note that this function
* assumes that the HTML is decently formatted and kind of valid. It
* may exhibit undefined or unexpected behavior if your HTML is trash.
*
* @param {String} htmlString A string o' HTML
* @param {Object} [removalCallbacks] Callbacks for each token type
* @param {Function|Array} [removalCallbacks.attributes] Callback or array of specific attributes to strip
* @param {Function|Array} [removalCallbacks.elements] Callback or array of specific elements to strip
* @param {Function|Boolean} [removalCallbacks.comments] Callback or boolean indicating to strip comments
* @param {Function|Boolean} [removalCallbacks.docTypes] Callback or boolean indicating to strip doc type declarations
* @return {String} The sanitized HTML
*/
sanitize(htmlString, removalCallbacks)
```
## Development
```shell
git clone https://github.com/tmont/html-parser.git
cd html-parser
npm link
npm test
```

@@ -186,5 +186,22 @@ var parseContext = require('./context');

exports.parse = function(string, options) {
string = string.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
var context = parseContext.create(string, options);
/**
* Parses the given string o' HTML, executing each callback when it
* encounters a token.
*
* @param {String} htmlString A string o' HTML
* @param {Object} [callbacks] Callbacks for each token
* @param {Function} [callbacks.attribute] Takes the name of the attribute and its value
* @param {Function} [callbacks.openElement] Takes the tag name of the element
* @param {Function} [callbacks.closeOpenedElement] Takes the tag name of the element and the token used to
* close it (">", "/>", "?>")
* @param {Function} [callbacks.closeElement] Takes the name of the element
* @param {Function} [callbacks.comment] Takes the content of the comment
* @param {Function} [callbacks.docType] Takes the content of the document type declaration
* @param {Function} [callbacks.cdata] Takes the content of the CDATA
* @param {Function} [callbacks.xmlProlog] Takes no arguments
* @param {Function} [callbacks.text] Takes the value of the text node
*/
exports.parse = function(htmlString, callbacks) {
htmlString = htmlString.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
var context = parseContext.create(htmlString, callbacks);
do {

@@ -197,3 +214,14 @@ parseNext(context);

exports.parseFile = function(fileName, encoding, options, callback) {
/**
* Parses the HTML contained in the given file asynchronously.
*
* Note that this is merely a convenience function, it will still read the entire
* contents of the file into memory.
*
* @param {String} fileName Name of the file to parse
* @param {String} [encoding] Optional encoding to read the file in, defaults to utf8
* @param {Object} [callbacks] Callbacks to pass to parse()
* @param {Function} [callback]
*/
exports.parseFile = function(fileName, encoding, callbacks, callback) {
var fs = require('fs');

@@ -206,3 +234,3 @@ fs.readFile(fileName, encoding || 'utf8', function(err, contents) {

exports.parse(contents, options);
exports.parse(contents, callbacks);
callback && callback();

@@ -212,9 +240,52 @@ });

exports.sanitize = function(string, options) {
options = options || {};
/**
* Sanitizes an HTML string.
*
* If removalCallbacks is not given, it will simply reformat the HTML
* (i.e. converting all tags to lowercase, etc.). Note that this function
* assumes that the HTML is decently formatted and kind of valid. It
* may exhibit undefined or unexpected behavior if your HTML is trash.
*
* @param {String} htmlString A string o' HTML
* @param {Object} [removalCallbacks] Callbacks for each token type
* @param {Function|Array} [removalCallbacks.attributes] Callback or array of specific attributes to strip
* @param {Function|Array} [removalCallbacks.elements] Callback or array of specific elements to strip
* @param {Function|Boolean} [removalCallbacks.comments] Callback or boolean indicating to strip comments
* @param {Function|Boolean} [removalCallbacks.docTypes] Callback or boolean indicating to strip doc type declarations
* @return {String} The sanitized HTML
*/
exports.sanitize = function(htmlString, removalCallbacks) {
removalCallbacks = removalCallbacks || {};
/**
 * Builds a removal predicate for an option that may be given either as a
 * callback or as a list of names.
 *
 * The option is looked up once on the enclosing `removalCallbacks` object;
 * a missing (falsy) option behaves like an empty list, so nothing matches.
 *
 * @param {String} index Key of the option on removalCallbacks
 * @return {Function} Predicate returning the callback's result, or whether
 * the given value is contained in the list
 */
function createArrayCallback(index) {
    var matcher = removalCallbacks[index] || [];

    // List form: membership test against the configured names.
    if (typeof matcher !== 'function') {
        return function(value) {
            return matcher.indexOf(value) >= 0;
        };
    }

    // Callback form: forward every argument untouched.
    return function() {
        return matcher.apply(null, arguments);
    };
}
/**
 * Builds a removal predicate for an option that may be given either as a
 * callback or as a plain flag.
 *
 * The option is looked up once on the enclosing `removalCallbacks` object;
 * a missing (falsy) option is treated as `false`.
 *
 * @param {String} index Key of the option on removalCallbacks
 * @return {Function} Predicate returning the callback's result, or the
 * configured flag value as-is
 */
function createBoolCallback(index) {
    var setting = removalCallbacks[index] || false;
    var isCallback = typeof setting === 'function';

    return function() {
        // Callback form forwards every argument; flag form ignores them.
        return isCallback ? setting.apply(null, arguments) : setting;
    };
}
var toRemove = {
attributes: options.attributes || [],
elements: options.elements || [],
comments: !!options.comments,
docTypes: !!options.docTypes
attributes: createArrayCallback('attributes'),
elements: createArrayCallback('elements'),
comments: createBoolCallback('comments'),
docTypes: createBoolCallback('docTypes')
};

@@ -226,3 +297,3 @@

docType: function(value) {
if (toRemove.docTypes) {
if (toRemove.docTypes(value)) {
return;

@@ -236,3 +307,3 @@ }

tagStack.push({ name: name });
if (toRemove.elements.indexOf(name) !== -1) {
if (toRemove.elements(name)) {
if (!ignoring) {

@@ -252,3 +323,3 @@ ignoring = tagStack[tagStack.length - 1];

}
if (ignoring || toRemove.elements.indexOf(name) !== -1) {
if (ignoring || toRemove.elements(name)) {
return;

@@ -267,3 +338,3 @@ }

}
if (ignoring || toRemove.elements.indexOf(name) !== -1) {
if (ignoring || toRemove.elements(name)) {
return;

@@ -280,3 +351,3 @@ }

name = name.toLowerCase();
if (toRemove.attributes.indexOf(name) !== -1) {
if (toRemove.attributes(name)) {
return;

@@ -296,3 +367,3 @@ }

comment: function(value) {
if (ignoring || toRemove.comments) {
if (ignoring || toRemove.comments(value)) {
return;

@@ -319,4 +390,4 @@ }

exports.parse(string, callbacks);
exports.parse(htmlString, callbacks);
return sanitized;
};

@@ -22,2 +22,11 @@ var should = require('should');

// A docTypes callback decides per declaration: returning true strips it.
it('should remove doctypes with callback', function() {
    var input = '<!doctype html><foo><!doctype asdf></foo>';
    var removal = {
        docTypes: function(declaration) {
            return declaration !== 'html';
        }
    };

    var output = helpers.parser.sanitize(input, removal);

    output.should.equal('<!doctype html><foo></foo>');
});
it('should remove comments', function() {

@@ -30,2 +39,11 @@ var sanitized = helpers.parser.sanitize('<!-- foo --><foo><!-- foo --></foo><!-- foo -->', {

// A comments callback decides per comment: returning true strips it.
it('should remove comments with callback', function() {
    var input = '<!-- foo --><foo><!-- bar --></foo><!-- foo -->';
    var removal = {
        comments: function(content) {
            return /foo/.test(content);
        }
    };

    var output = helpers.parser.sanitize(input, removal);

    output.should.equal('<foo><!-- bar --></foo>');
});
it('should remove specified attributes', function() {

@@ -38,2 +56,11 @@ var sanitized = helpers.parser.sanitize('<foo bar="baz" bat="qux"></foo>', {

// An attributes callback decides per attribute: returning true strips it.
it('should remove attributes with callback', function() {
    var input = '<foo bar="baz" bat="qux"></foo>';
    var removal = {
        attributes: function(attrName, attrValue) {
            return attrName === 'bar';
        }
    };

    var output = helpers.parser.sanitize(input, removal);

    output.should.equal('<foo bat="qux"></foo>');
});
it('should remove specified elements', function() {

@@ -46,2 +73,12 @@ var html = '<foo><bar><baz><bat foo=bar>asdf</bat></baz></bar><bat><!-- comment --></bat></foo>';

});
// An elements callback decides per tag name: returning true strips the
// element; the expected output shows its contents are dropped with it.
it('should remove elements with callback', function() {
    var input = '<foo><bar><baz><bat foo=bar>asdf</bat></baz></bar><bat><!-- comment --></bat></foo>';
    var removal = {
        elements: function(tagName) {
            return tagName === 'bat';
        }
    };

    var output = helpers.parser.sanitize(input, removal);

    output.should.equal('<foo><bar><baz></baz></bar></foo>');
});
});

Sorry, the diff of this file is not supported yet

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc