{
		"name": "html-parser",
		"version": "0.2.0",
		"version": "0.3.0",
		"description": "HTML/XML parser with less explosions",
		@@ -22,3 +22,3 @@ "keywords": [ "html", "xml", "parser", "explosion" ],
		"devDependencies": {
		"moczha": ">= 1.1.0",
		"mocha": ">= 1.1.0",
		"should": ">= 0.6.3"
		@@ -25,0 +25,0 @@ },

README.md

		@@ -1,3 +0,2 @@
		html-parser
		-----------
		# html-parser

		@@ -14,4 +13,6 @@ Now with less explosions!

		Callback based parsing
		======================
		## Installation
		`npm install html-parser`

		## Callback based parsing
		```javascript
		@@ -45,5 +46,3 @@ var htmlParser = require('html-parser');

		Sanitization
		============

		## Sanitization
		```javascript
		@@ -63,1 +62,80 @@ var htmlParser = require('html-parser');

		### Using callbacks
		```javascript
		var htmlParser = require('html-parser');

		var html = '<script>alert(\'danger!\')</script><p onclick="alert(\'danger!\')">blah blah<!-- useless comment --></p>';
		var sanitized = htmlParser.sanitize(html, {
		elements: function(name) {
		return name === 'script';
		},
		attributes: function(name, value) {
		return /^on/i.test(name) || /^javascript:/i.test(value);
		},
		comments: true
		});

		console.log(sanitized);
		//<p>blah blah</p>
		```

		## API
		```javascript
		/**
		* Parses the given string o' HTML, executing each callback when it
		* encounters a token.
		*
		* @param {String} htmlString A string o' HTML
		* @param {Object} [callbacks] Callbacks for each token
		* @param {Function} [callbacks.attribute] Takes the name of the attribute and its value
		* @param {Function} [callbacks.openElement] Takes the tag name of the element
		* @param {Function} [callbacks.closeOpenedElement] Takes the tag name of the element and the token used to
		* close it (">", "/>", "?>")
		* @param {Function} [callbacks.closeElement] Takes the name of the element
		* @param {Function} [callbacks.comment] Takes the content of the comment
		* @param {Function} [callbacks.docType] Takes the content of the document type declaration
		* @param {Function} [callbacks.cdata] Takes the content of the CDATA
		* @param {Function} [callbacks.xmlProlog] Takes no arguments
		* @param {Function} [callbacks.text] Takes the value of the text node
		*/
		parse(htmlString, callbacks)

		/**
		* Parses the HTML contained in the given file asynchronously.
		*
		* Note that this is merely a convenience function, it will still read the entire
		* contents of the file into memory.
		*
		* @param {String} fileName Name of the file to parse
		* @param {String} [encoding] Optional encoding to read the file in, defaults to utf8
		* @param {Object} [callbacks] Callbacks to pass to parse()
		* @param {Function} [callback]
		*/
		parseFile(fileName, encoding, callbacks, callback)

		/**
		* Sanitizes an HTML string.
		*
		* If removalCallbacks is not given, it will simply reformat the HTML
		* (i.e. converting all tags to lowercase, etc.). Note that this function
		* assumes that the HTML is decently formatted and kind of valid. It
		* may exhibit undefined or unexpected behavior if your HTML is trash.
		*
		* @param {String} htmlString A string o' HTML
		* @param {Object} [removalCallbacks] Callbacks for each token type
		* @param {Function|Array} [removalCallbacks.attributes] Callback or array of specific attributes to strip
		* @param {Function|Array} [removalCallbacks.elements] Callback or array of specific elements to strip
		* @param {Function|Boolean} [removalCallbacks.comments] Callback or boolean indicating to strip comments
		* @param {Function|Boolean} [removalCallbacks.docTypes] Callback or boolean indicating to strip doc type declarations
		* @return {String} The sanitized HTML
		*/
		sanitize(htmlString, removalCallbacks)
		```

		## Development
		```shell
		git clone https://github.com/tmont/html-parser.git
		cd html-parser
		npm link
		npm test
		```

107

src/parser.js

		@@ -186,5 +186,22 @@ var parseContext = require('./context');

		exports.parse = function(string, options) {
		string = string.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
		var context = parseContext.create(string, options);
		/**
		* Parses the given string o' HTML, executing each callback when it
		* encounters a token.
		*
		* @param {String} htmlString A string o' HTML
		* @param {Object} [callbacks] Callbacks for each token
		* @param {Function} [callbacks.attribute] Takes the name of the attribute and its value
		* @param {Function} [callbacks.openElement] Takes the tag name of the element
		* @param {Function} [callbacks.closeOpenedElement] Takes the tag name of the element and the token used to
		* close it (">", "/>", "?>")
		* @param {Function} [callbacks.closeElement] Takes the name of the element
		* @param {Function} [callbacks.comment] Takes the content of the comment
		* @param {Function} [callbacks.docType] Takes the content of the document type declaration
		* @param {Function} [callbacks.cdata] Takes the content of the CDATA
		* @param {Function} [callbacks.xmlProlog] Takes no arguments
		* @param {Function} [callbacks.text] Takes the value of the text node
		*/
		exports.parse = function(htmlString, callbacks) {
		htmlString = htmlString.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
		var context = parseContext.create(htmlString, callbacks);
		do {
		@@ -197,3 +214,14 @@ parseNext(context);

		exports.parseFile = function(fileName, encoding, options, callback) {
		/**
		* Parses the HTML contained in the given file asynchronously.
		*
		* Note that this is merely a convenience function, it will still read the entire
		* contents of the file into memory.
		*
		* @param {String} fileName Name of the file to parse
		* @param {String} [encoding] Optional encoding to read the file in, defaults to utf8
		* @param {Object} [callbacks] Callbacks to pass to parse()
		* @param {Function} [callback]
		*/
		exports.parseFile = function(fileName, encoding, callbacks, callback) {
		var fs = require('fs');
		@@ -206,3 +234,3 @@ fs.readFile(fileName, encoding \|\| 'utf8', function(err, contents) {

		exports.parse(contents, options);
		exports.parse(contents, callbacks);
		callback && callback();
		@@ -212,9 +240,52 @@ });

		exports.sanitize = function(string, options) {
		options = options \|\| {};
		/**
		* Sanitizes an HTML string.
		*
		* If removalCallbacks is not given, it will simply reformat the HTML
		* (i.e. converting all tags to lowercase, etc.). Note that this function
		* assumes that the HTML is decently formatted and kind of valid. It
		* may exhibit undefined or unexpected behavior if your HTML is trash.
		*
		* @param {String} htmlString A string o' HTML
		* @param {Object} [removalCallbacks] Callbacks for each token type
		* @param {Function|Array} [removalCallbacks.attributes] Callback or array of specific attributes to strip
		* @param {Function|Array} [removalCallbacks.elements] Callback or array of specific elements to strip
		* @param {Function|Boolean} [removalCallbacks.comments] Callback or boolean indicating to strip comments
		* @param {Function|Boolean} [removalCallbacks.docTypes] Callback or boolean indicating to strip doc type declarations
		* @return {String} The sanitized HTML
		*/
		exports.sanitize = function(htmlString, removalCallbacks) {
		removalCallbacks = removalCallbacks \|\| {};

		/**
		 * Builds the removal predicate for a sanitize option that may be given
		 * either as a callback or as an array of names.
		 *
		 * @param {String} index Key to look up on removalCallbacks (e.g. 'attributes')
		 * @return {Function} Predicate returning the callback's own result, or, for
		 *                    an array, whether its first argument is listed in it
		 */
		function createArrayCallback(index) {
			// An omitted option falls back to an empty list, so nothing matches
			var callbackOrArray = removalCallbacks[index] || [];
			if (typeof(callbackOrArray) === 'function') {
				return function() {
					// Forward every argument untouched to the user-supplied callback
					return callbackOrArray.apply(null, arguments);
				};
			}

			return function(value) {
				return callbackOrArray.indexOf(value) !== -1;
			};
		}

		/**
		 * Builds the removal predicate for a sanitize option that may be given
		 * either as a callback or as a plain boolean.
		 *
		 * @param {String} index Key to look up on removalCallbacks (e.g. 'comments')
		 * @return {Function} Predicate returning the callback's own result, or the
		 *                    configured value itself when it is not a function
		 */
		function createBoolCallback(index) {
			// An omitted option falls back to false, so nothing is removed
			var callbackOrBool = removalCallbacks[index] || false;
			if (typeof(callbackOrBool) === 'function') {
				return function() {
					// Forward every argument untouched to the user-supplied callback
					return callbackOrBool.apply(null, arguments);
				};
			}

			return function() {
				return callbackOrBool;
			};
		}

		var toRemove = {
		attributes: options.attributes \|\| [],
		elements: options.elements \|\| [],
		comments: !!options.comments,
		docTypes: !!options.docTypes
		attributes: createArrayCallback('attributes'),
		elements: createArrayCallback('elements'),
		comments: createBoolCallback('comments'),
		docTypes: createBoolCallback('docTypes')
		};
		@@ -226,3 +297,3 @@
		docType: function(value) {
		if (toRemove.docTypes) {
		if (toRemove.docTypes(value)) {
		return;
		@@ -236,3 +307,3 @@ }
		tagStack.push({ name: name });
		if (toRemove.elements.indexOf(name) !== -1) {
		if (toRemove.elements(name)) {
		if (!ignoring) {
		@@ -252,3 +323,3 @@ ignoring = tagStack[tagStack.length - 1];
		}
		if (ignoring \|\| toRemove.elements.indexOf(name) !== -1) {
		if (ignoring \|\| toRemove.elements(name)) {
		return;
		@@ -267,3 +338,3 @@ }
		}
		if (ignoring \|\| toRemove.elements.indexOf(name) !== -1) {
		if (ignoring \|\| toRemove.elements(name)) {
		return;
		@@ -280,3 +351,3 @@ }
		name = name.toLowerCase();
		if (toRemove.attributes.indexOf(name) !== -1) {
		if (toRemove.attributes(name)) {
		return;
		@@ -296,3 +367,3 @@ }
		comment: function(value) {
		if (ignoring \|\| toRemove.comments) {
		if (ignoring \|\| toRemove.comments(value)) {
		return;
		@@ -319,4 +390,4 @@ }

		exports.parse(string, callbacks);
		exports.parse(htmlString, callbacks);
		return sanitized;
		};

tests/sanitization-tests.js

		@@ -22,2 +22,11 @@ var should = require('should');

		// The docTypes option given as a callback: only the declaration it flags is stripped.
		it('should remove doctypes with callback', function() {
			var input = '<!doctype html><foo><!doctype asdf></foo>';
			var isUnwantedDocType = function(declaration) {
				return declaration !== 'html';
			};

			var output = helpers.parser.sanitize(input, { docTypes: isUnwantedDocType });

			output.should.equal('<!doctype html><foo></foo>');
		});

		it('should remove comments', function() {
		@@ -30,2 +39,11 @@ var sanitized = helpers.parser.sanitize('<!-- foo --><foo><!-- foo --></foo><!-- foo -->', {

		// The comments option given as a callback: comments it flags go, the rest stay.
		it('should remove comments with callback', function() {
			var input = '<!-- foo --><foo><!-- bar --></foo><!-- foo -->';
			var mentionsFoo = function(text) {
				return /foo/.test(text);
			};

			var output = helpers.parser.sanitize(input, { comments: mentionsFoo });

			output.should.equal('<foo><!-- bar --></foo>');
		});

		it('should remove specified attributes', function() {
		@@ -38,2 +56,11 @@ var sanitized = helpers.parser.sanitize('<foo bar="baz" bat="qux"></foo>', {

		// The attributes option given as a callback: only the attribute it flags is dropped.
		it('should remove attributes with callback', function() {
			var input = '<foo bar="baz" bat="qux"></foo>';
			var isBarAttribute = function(attrName, attrValue) {
				return attrName === 'bar';
			};

			var output = helpers.parser.sanitize(input, { attributes: isBarAttribute });

			output.should.equal('<foo bat="qux"></foo>');
		});

		it('should remove specified elements', function() {
		@@ -46,2 +73,12 @@ var html = '<foo><bar><baz><bat foo=bar>asdf</bat></baz></bar><bat><!-- comment --></bat></foo>';
		});

		// The elements option given as a callback: flagged elements vanish along with their content.
		it('should remove elements with callback', function() {
			var input = '<foo><bar><baz><bat foo=bar>asdf</bat></baz></bar><bat><!-- comment --></bat></foo>';
			var isBatElement = function(tagName) {
				return tagName === 'bat';
			};

			var output = helpers.parser.sanitize(input, { elements: isBatElement });

			output.should.equal('<foo><bar><baz></baz></bar></foo>');
		});
		});

.gitignore

Sorry, the diff of this file is not supported yet

.idea/workspace.xml

Sorry, the diff of this file is not supported yet

html-parser - npm Package Compare versions

Improved metrics