Socket
Socket
Sign inDemoInstall

htmlparser2

Package Overview
Dependencies
0
Maintainers
1
Versions
76
Alerts
File Explorer

Advanced tools

Install Socket

Detect and block malicious and high-risk dependencies

Install

Comparing version 1.5.0 to 2.0.0

.travis.yml

139

lib/DefaultHandler.js

@@ -8,10 +8,4 @@ var ElementType = require("./ElementType.js");

this._tagStack = [];
if(options){ //otherwise, the prototype is used
this._options = options;
if(typeof this._options.verbose === "undefined")
this._options.verbose = true;
if (typeof this._options.enforceEmptyTags === "undefined")
this._options.enforceEmptyTags = true;
}
this._callback = callback;
if(options) this._options = options; //otherwise, the prototype is used
if(callback) this._callback = callback;
}

@@ -21,50 +15,34 @@

DefaultHandler.prototype._options = {
ignoreWhitespace: false, //Keep whitespace-only text nodes
verbose: true, //Keep data property for tags and raw property for all
enforceEmptyTags: true //Don't allow children for HTML tags defined as empty in spec
ignoreWhitespace: false //Keep whitespace-only text nodes
};
//HTML Tags that shouldn't contain child nodes
var emptyTags={area:true,base:true,basefont:true,br:true,col:true,frame:true,hr:true,img:true,input:true,isindex:true,link:true,meta:true,param:true,embed:true};
//Resets the handler back to starting state
DefaultHandler.prototype.onreset = DefaultHandler;
//**Public**//
//Methods//
//Resets the handler back to starting state
DefaultHandler.prototype.reset = function() {
DefaultHandler.call(this, this._callback);
};
//Signals the handler that parsing is done
DefaultHandler.prototype.done = function() {
DefaultHandler.prototype.onend = function(){
if(this._done) return;
this._done = true;
this.handleCallback(null);
this._handleCallback(null);
};
//Methods//
DefaultHandler.prototype.error =
DefaultHandler.prototype.handleCallback = function(error){
if(typeof this._callback === "function")
this._callback(error, this.dom);
else if(error) throw error;
DefaultHandler.prototype.onerror = function(error){
if(typeof this._callback === "function"){
return this._callback(error, this.dom);
} else {
if(error) throw error;
}
};
DefaultHandler.prototype._isEmptyTag = function(name) {
return this._options.enforceEmptyTags && emptyTags[name];
};
DefaultHandler.prototype._handleCallback = DefaultHandler.prototype.onerror;
DefaultHandler.prototype.closeTag = function(name){
//Ignore closing tags that obviously don't have an opening tag
if(!this._tagStack || this._isEmptyTag(name)) return;
var pos = this._tagStack.length - 1;
while(pos !== -1 && this._tagStack[pos--].name !== name){}
if (pos !== -1 || this._tagStack[0].name === name)
this._tagStack.splice(pos+1);
//Handles a closing tag by discarding the innermost entry of the tag stack.
//The `name` argument is not consulted here; whatever element is on top of
//this._tagStack is the one that gets popped.
DefaultHandler.prototype.onclosetag = function(name){
this._tagStack.pop();
};
DefaultHandler.prototype._addDomElement = function(element){
if(!this._options.verbose) delete element.raw;
var lastChild,
lastTag = this._tagStack[this._tagStack.length - 1];
var lastTag = this._tagStack[this._tagStack.length-1], lastChild;
if(!lastTag) this.dom.push(element);
else{ //There are parent elements
if(lastTag){ //There are parent elements
if(!lastTag.children){

@@ -74,36 +52,67 @@ lastTag.children = [element];

}
lastChild = lastTag.children[lastTag.children.length-1];
if(element.type === ElementType.Comment && lastChild.type === ElementType.Comment){
lastChild = lastTag.children[lastTag.children.length - 1];
if(this._inSpecialTag && element.type === ElementType.Text && lastChild.type === ElementType.Text){
lastChild.data += element.data;
if(this._options.verbose) lastChild.raw = lastChild.data;
} else {
lastTag.children.push(element);
}
else if(this._inSpecialTag && element.type === ElementType.Text && lastChild.type === ElementType.Text){
lastChild.data += element.data;
if(this._options.verbose)
lastChild.raw = lastChild.data;
}
else lastTag.children.push(element);
}
else {
this.dom.push(element);
}
};
DefaultHandler.prototype.openTag = function(element){
if(!this._options.verbose) delete element.data;
var isSpecial = element.type === ElementType.Script || element.type === ElementType.Style;
if(isSpecial) this._inSpecialTag = true;
//Handles an opening tag: builds the element node, attaches it to the DOM
//and makes it the innermost entry of the tag stack.
//  name    - tag name stored on the node
//  attribs - attribute map stored on the node as-is
//  type    - element type; Script and Style switch the handler into
//            "special tag" mode (this._inSpecialTag)
DefaultHandler.prototype.onopentag = function(name, attribs, type){
	switch(type){
		case ElementType.Script:
		case ElementType.Style:
			this._inSpecialTag = true;
			break;
	}
	var node = {
		type: type,
		name: name,
		attribs: attribs
	};
	//attach first, then push, so the node becomes the parent of what follows
	this._addDomElement(node);
	this._tagStack.push(node);
};
//Handles a run of text by adding a Text node to the DOM.
//When the ignoreWhitespace option is set, text that is empty after
//trimming is dropped instead of being added.
DefaultHandler.prototype.ontext = function(data){
	var dropBlank = this._options.ignoreWhitespace;
	//data is only trimmed when the option is on
	if(dropBlank && data.trim() === ""){
		return;
	}
	var textNode = {
		data: data,
		type: ElementType.Text
	};
	this._addDomElement(textNode);
};
DefaultHandler.prototype.oncomment = function(data){
var lastTag = this._tagStack[this._tagStack.length - 1];
var lastChild = lastTag && lastTag.children && lastTag.children[lastTag.children.length - 1];
//Don't add tags to the tag stack that can't have children
if(!this._isEmptyTag(element.name)) this._tagStack.push(element);
var element;
if(!lastChild || lastChild.type !== ElementType.Comment){
element = {
data: data,
type: ElementType.Comment
};
if(!lastTag){
return this.dom.push(element);
} else if(!lastChild){
lastTag.children = [element];
} else {
if(lastChild.type !== ElementType.Comment){
lastTag.children.push(element);
}
}
} else {
lastChild.data += data;
}
};
DefaultHandler.prototype.writeText = function(element){
if(this._options.ignoreWhitespace && element.data.trim() === "") return;
this._addDomElement(element);
//Handles a processing instruction / directive by adding a Directive node
//carrying the given name and data to the DOM.
DefaultHandler.prototype.onprocessinginstruction = function(name, data){
	var directive = {};
	directive.name = name;
	directive.data = data;
	directive.type = ElementType.Directive;
	this._addDomElement(directive);
};
DefaultHandler.prototype.writeComment = DefaultHandler.prototype.writeDirective = DefaultHandler.prototype._addDomElement;
module.exports = DefaultHandler;

@@ -7,71 +7,96 @@ var ElementType = require("./ElementType.js");

function testElement(options, element) {
if (!element) return false;
var type = element.type;
var arrayPush = Array.prototype.push;
/*
	Collects the nodes in `arr` that satisfy `test`, in pre-order: a node is
	checked first, then (when `recurse` is truthy) its `children` are searched.

	test    - predicate called with each node; truthy means "keep"
	arr     - array of nodes to search
	recurse - whether to descend into each node's `children` array
	limit   - number of matches after which the search stops

	Returns a new array of matching nodes.

	The limit is only checked after a match has been recorded, so a single
	match can still be returned when `limit` is 0 or negative.
*/
function filterArray(test, arr, recurse, limit){
	var result = [], childs;
	for(var i = 0, j = arr.length; i < j; i++){
		if(test(arr[i])){
			result.push(arr[i]);
			if(--limit <= 0) break;
		}
		if(recurse && (childs = arr[i].children)){
			//pass on `recurse` as well as the remaining budget; leaving
			//`recurse` out shifts `limit` into its slot, which stops the
			//descent early and disables the limit inside subtrees
			childs = filterArray(test, childs, recurse, limit);
			arrayPush.apply(result, childs);
			limit -= childs.length;
			if(limit <= 0) break;
		}
	}
	return result;
}
for (var key in options) {
if (key === "tag_name") {
if (type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false;
if (!options.tag_name(element.name)) return false;
} else if (key === "tag_type") {
if (!options.tag_type(type)) return false;
} else if (key === "tag_contains") {
if (type !== ElementType.Text && type !== ElementType.Comment && type !== ElementType.Directive) return false;
if (!options.tag_contains(element.data)) return false;
} else if (!element.attribs || !options[key](element.attribs[key]))
return false;
}
return true;
/*
	Normalises the arguments and delegates to filterArray.

	test    - predicate handed through unchanged
	element - a single node or an array of nodes; a single node is wrapped
	          in a one-element array
	recurse - anything other than the literal `false` enables recursion
	limit   - values for which isNaN() is true (e.g. undefined) mean "no limit"
*/
function filter(test, element, recurse, limit){
	var descend = recurse !== false;
	var maxResults = isNaN(limit) ? Infinity : limit;
	var nodes = Array.isArray(element) ? element : [element];
	return filterArray(test, nodes, descend, maxResults);
}
module.exports = {
testElement: testElement,
testElement: function testElement(options, element) {
var type = element.type;
getElements: function(options, currentElement, recurse, limit){
recurse = recurse === undefined || recurse === null || recurse;
if(isNaN(limit)) limit = -1;
for(var key in options){
if(key === "tag_name"){
if(type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false;
if(!options.tag_name(element.name)) return false;
} else if(key === "tag_type") {
if(!options.tag_type(type)) return false;
} else if(key === "tag_contains") {
if(type !== ElementType.Text && type !== ElementType.Comment && type !== ElementType.Directive) return false;
if(!options.tag_contains(element.data)) return false;
} else if(!element.attribs || !options[key](element.attribs[key]))
return false;
}
return true;
},
getElements: function(options, element, recurse, limit){
for(var key in options){
if (typeof options[key] !== "function")
if(typeof options[key] !== "function"){
options[key] = getTest(options[key]);
}
}
return this.testAttr(testElement.bind(null, options), currentElement, recurse, limit);
}
, getElementById: function(id, currentElement, recurse) {
var result = this.getElements({ id: id }, currentElement, recurse, 1);
return filter(this.testElement.bind(null, options), element, recurse, limit);
},
getElementById: function(id, element, recurse) {
var result;
if(typeof id === "function"){
result = filter(function(elem){
return elem.attribs && id(elem.attribs);
}, element, recurse, 1);
}
else{
result = filter(function(elem){
return elem.attribs && elem.attribs.id === id;
}, element, recurse, 1);
}
return result.length ? result[0] : null;
//function(elem){return elem.attribs && elem.attribs.id === id;}
}
, getElementsByTagName: function(name, currentElement, recurse, limit) {
return this.getElements({ tag_name: name }, currentElement, recurse, limit);
/*function(elem){
},
getElementsByTagName: function(name, element, recurse, limit){
if(typeof name === "function") return filter(function(elem){
var type = elem.type;
if(type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false;
return name(elem.name);
}, element, recurse, limit);
else return filter(function(elem){
var type = elem.type;
if(type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false;
return elem.name === name;
};*/
}, element, recurse, limit);
},
getElementsByTagType: function(type, element, recurse, limit){
if(typeof type === "function"){
return filter(function(elem){return type(elem.type);}, element, recurse, limit);
}
else return filter(function(elem){return elem.type === type;}, element, recurse, limit);
}
, getElementsByTagType: function(type, currentElement, recurse, limit){
return this.testAttr(function(elem){return elem.type === type;}, currentElement, recurse, limit);
//function(elem){return elem.type === type;}
}
, testAttr: function(test, element, recurse, limit){
var found = [], elementList;
if(!element) return found;
if(test(element)) found.push(element);
if(recurse && element.children) elementList = element.children;
else if(Array.isArray(element)) elementList = element;
else return found;
for(var i = 0, j = elementList.length; i < j && (limit < 0 || found.length < limit); i++){
found = found.concat(this.testAttr(test, elementList[i], recurse, limit));
}
return found;
}};
};
module.exports = {
Parser: require("./Parser.js"),
DefaultHandler: require("./DefaultHandler.js"),
RssHandler: require("./RssHandler.js"),
EventedHandler: require("./EventedHandler.js"),
FeedHandler: require("./FeedHandler.js"),
ElementType: require("./ElementType.js"),
DomUtils: require("./DomUtils.js")
}
var ElementType = require("./ElementType.js");
function Parser(handler, options){
function Parser(cbs, options){
if(options) this._options = options;
if(cbs) this._cbs = cbs;
validateHandler(handler);
this._handler = handler;
this._buffer = "";
this._prevTagSep = "";
this._stack = [];
this._contentFlags = 0;
this._done = false;
this._current = 0;
this._location = {
row: 0,
col: 0,
charOffset: 0
};
this._parseState = ElementType.Text;

@@ -24,18 +17,20 @@ }

//Regular expressions used for cleaning up and parsing (stateless)
var _reWhitespace = /\s/; //Used to find any whitespace to split on
var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element
var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //matches tagnames
var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;//"
//Find attributes in a tag
var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
var tagTypes = {};
tagTypes[ ElementType.Script ] = true;
tagTypes[ ElementType.Style ] = true;
tagTypes[ ElementType.Tag ] = true;
Parser.prototype._options = {
includeLocation: false, //Do not track element position in document by default
xmlMode: false //Special behaviour for script/style tags by default
xmlMode: false, //Special behaviour for script/style tags by default
lowerCaseTags: false //call .toLowerCase for each tagname
};
Parser.prototype._cbs = {
/*
onopentag,
onclosetag,
ontext,
onprocessinginstruction,
oncomment
*/
};
//**Public**//

@@ -51,6 +46,7 @@ //Methods//

//Parses a piece of an HTML document
Parser.prototype.write =
Parser.prototype.parseChunk = function(data){
if(this._done) this.handleError(Error("Attempted to parse chunk after parsing already done"));
if(this._done) this._handleError(Error("Attempted to parse chunk after parsing already done"));
this._buffer += data; //FIXME: this can be a bottleneck
this.parseTags();
this._parseTags();
};

@@ -63,20 +59,10 @@

//Push any unparsed text into a final element in the element list
if(this._buffer){
var data = this._buffer;
if(this._parseState === ElementType.Tag){
data = data.trim();
var name = parseTagName(data);
if(name.charAt(0) === "/") this._handler.closeTag(name.substr(1));
else this._handler.openTag({
name: name, raw: data, data: data, attribs: parseAttributes(data)
});
}
else this._handler.writeText({
raw: data, data: data, type: ElementType.Text
});
this._buffer = "";
//Parse the buffer to its end
if(this._buffer) this._parseTags(true);
if(this._cbs.onclosetag){
while(this._stack.length) this._cbs.onclosetag(this._stack.pop());
}
this._handler.done();
if(this._cbs.onend) this._cbs.onend();
};

@@ -86,4 +72,4 @@

Parser.prototype.reset = function(){
Parser.call(this, this._handler);
this._handler.reset();
Parser.call(this);
if(this._cbs.onreset) this._cbs.onreset();
};

@@ -94,8 +80,8 @@

var parseAttributes = function(data){
var pos = data.search(_reWhitespace);
if(pos === -1) return;
var pos = data.search(/\s/), attrs = {}; //Find any whitespace
if(pos === -1) return attrs;
var attribRaw = data.substr(pos);
_reAttrib.lastIndex = 0;
var match, attrs = {};
var match;

@@ -113,6 +99,9 @@ while(match = _reAttrib.exec(attribRaw)){

//Extracts the base tag name from the data value of an element
var parseTagName = function(data){
Parser.prototype._parseTagName = function(data){
var match = data.match(_reTagName);
if(match === null) return "";
return match[1] + match[2];
if(this._options.lowerCaseTags){
return match[1] + match[2].toLowerCase();
}
else return match[1] + match[2];
};

@@ -126,14 +115,17 @@

SpecialTags.w = 4; //2^2 - if set, append prev tag sep to data
SpecialTags[ElementType.Comment] = 8; //2^8
SpecialTags[ElementType.Comment] = 8; //2^3
//Parses through HTML text and returns an array of found elements
Parser.prototype.parseTags = function(){
var buffer = this._buffer;
Parser.prototype._parseTags = function(force){
var buffer = this._buffer, current = 0;
var next, tagSep, rawData, element, elementName, prevElement, elementType, elementData, attributes;
var next, tagSep, rawData, elementName, elementType, elementData;
var opening = buffer.indexOf("<"), closing = buffer.indexOf(">");
//if force is true, parse everything
if(force) opening = Infinity;
while(opening !== closing){ //just false if both are -1
if(closing === -1 || (opening !== -1 && opening < closing)){
if((opening !== -1 && opening < closing) || closing === -1){
next = opening;

@@ -148,4 +140,7 @@ tagSep = "<";

}
rawData = buffer.substring(this._current, next); //The next chunk of data to parse
rawData = buffer.substring(current, next); //The next chunk of data to parse
elementType = this._parseState;
//set elements for next run
current = next + 1;
this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text;

@@ -155,3 +150,3 @@

elementData = rawData.trim();
elementName = parseTagName(elementData);
elementName = this._parseTagName(elementData);
}

@@ -169,3 +164,3 @@ else{

this._processComment(rawData, tagSep);
elementType = null;
continue;
}

@@ -184,13 +179,12 @@ //if it's a closing tag, remove the flag

else if(!this._options.xmlMode && rawData.substring(0, 3) !== "!--"){
//All data from here to style close is now a text element
elementType = ElementType.Text;
//If the previous element is text, append the last tag sep to element
if(this._contentFlags >= SpecialTags.w){
elementData = rawData = this._prevTagSep + rawData;
if(this._cbs.ontext) this._cbs.ontext(this._prevTagSep + rawData);
}
else{ //The previous element was not text
this._contentFlags += SpecialTags.w;
elementData = rawData;
if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData);
}
this._prevTagSep = tagSep;
continue;
}

@@ -203,30 +197,27 @@

this._processComment(rawData.substr(3), tagSep);
continue;
}
else if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){
if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){
//ElementType.Directive
//TODO: what about CDATA?
element = {raw: elementData, data: elementData, type: ElementType.Directive, name: elementName};
if(this._options.includeLocation) element.location = this.getLocation(false);
this._handler.writeDirective(element);
} else
this._processTag(elementName, elementData, tagSep, rawData);
if(this._cbs.onprocessinginstruction){
this._cbs.onprocessinginstruction(elementName, elementData);
}
continue;
}
if(elementName.charAt(0) === "/") this._processCloseTag(elementName.substr(1));
else this._processOpenTag(elementName, elementData, tagSep);
}
else if(elementType === ElementType.Text && rawData !== ""){
element = {raw: rawData, data: elementData, type: ElementType.Text};
if(this._options.includeLocation) element.location = this.getLocation(false);
this._handler.writeText(element);
else if(elementType === ElementType.Text && rawData !== "" && this._cbs.ontext){
this._cbs.ontext(elementData);
}
this._current = next + 1;
}
if(this._options.includeLocation){
this.getLocation();
this._location.charOffset = 0;
}
this._buffer = buffer.substring(this._current);
this._current = 0;
this._buffer = buffer.substring(current);
};
Parser.prototype._processComment = function(rawData, tagSep){
this._prevTagSep = tagSep;
if(tagSep === ">" && rawData.substr(-2) === "--"){ //comment ends

@@ -238,21 +229,29 @@ //remove the written flag (also removes the comment flag)

else rawData += tagSep;
this._prevTagSep = tagSep;
var element = {
raw: rawData,
data: rawData,
type: ElementType.Comment
};
if(this._options.includeLocation) element.location = this.getLocation(false);
this._handler.writeComment(element);
if(this._cbs.oncomment) this._cbs.oncomment(rawData);
};
Parser.prototype._processTag = function(name, data, tagSep, raw){
if(name.charAt(0) === "/"){
this._handler.closeTag(name.substring(1));
return;
var emptyTags = require("./ClosingTags.js").self;
//Tells whether `name` is listed in the `emptyTags` lookup table.
//Evaluates to `false` whenever the xmlMode option is truthy; otherwise the
//result is the raw table entry for `name` (not coerced to a boolean, so it is
//`undefined` for names that are not in the table).
Parser.prototype._isEmptyTag = function(name){
return !this._options.xmlMode && emptyTags[name];
};
//Handles a closing tag called `name`.
//If the tag is on the stack of open tags, that tag and every tag opened
//after it are closed (innermost first): with an onclosetag callback each one
//is popped and reported, without one the stack is simply truncated.
//Closing tags without a matching open tag are ignored.
//For names reported by _isEmptyTag (or when there is no stack) nothing is
//closed; the only exception is </br> outside xmlMode, which is turned into
//an opening <br>.
Parser.prototype._processCloseTag = function(name){
	if(!this._stack || this._isEmptyTag(name)){
		//many browsers (eg. Safari, Chrome) convert </br> to <br>
		if(name === "br" && !this._options.xmlMode){
			this._processOpenTag(name, "/");
		}
		return;
	}

	//search from the innermost open tag outwards; pos ends at the index of
	//the match, or at 0 when nothing matched
	var pos = this._stack.length;
	while(pos !== 0){
		pos -= 1;
		if(this._stack[pos] === name) break;
	}

	//pos 0 is ambiguous (match at index 0 or no match at all), so check it
	if(pos === 0 && this._stack[0] !== name){
		return;
	}

	if(this._cbs.onclosetag){
		while(pos < this._stack.length){
			this._cbs.onclosetag(this._stack.pop());
		}
	} else {
		this._stack.splice(pos);
	}
};
Parser.prototype._processOpenTag = function(name, data, tagSep){
var type = ElementType.Tag;

@@ -263,19 +262,12 @@ if(this._options.xmlMode){ /*do nothing*/ }

var element = {
raw: raw, data: data, type: type, name: name
};
if(this._cbs.onopentag){
this._cbs.onopentag(name, parseAttributes(data), type);
}
var attribs = parseAttributes(data);
if(attribs) element.attribs = attribs;
if(this._options.includeLocation)
element.location = this.getLocation(type === ElementType.Tag);
this._handler.openTag(element);
//If tag self-terminates, add an explicit, separate closing tag
if(data.substr(-1) === "/"){
this._handler.closeTag(name);
if(data.substr(-1) === "/" || this._isEmptyTag(name)){
if(this._cbs.onclosetag) this._cbs.onclosetag(name);
} else {
this._contentFlags += SpecialTags[type];
this._stack.push(name);
this._prevTagSep = tagSep;

@@ -285,44 +277,5 @@ }

//Advances the running row/column counters in this._location over the part
//of this._buffer between the last recorded charOffset and the current
//position, and optionally reports the resulting position.
//  startTag - when truthy the scan ends one character before this._current
//Called without any arguments it only updates the counters and returns
//undefined; otherwise it returns {line, col} with a 1-based line.
//The column gets +1 as well, except when a start tag is located while
//charOffset is 0 and this._current is 0.
Parser.prototype.getLocation = function(startTag){
	var loc = this._location;
	var end = startTag ? this._current - 1 : this._current;
	//must be evaluated before charOffset is overwritten below
	var atChunkStart = startTag ? (loc.charOffset === 0 && end === -1) : false;

	var lines = this._buffer.substring(loc.charOffset, end).split("\n");
	var lastIdx = lines.length - 1;

	loc.charOffset = end;
	loc.row += lastIdx;

	//carriage returns do not count towards the column
	var width = lines[lastIdx].replace(/\r/g, "").length;
	if(lastIdx === 0){
		loc.col += width; //still on the same row: keep accumulating
	} else {
		loc.col = width; //new row: column restarts
	}

	if(arguments.length === 0) return;

	return {
		line: loc.row + 1,
		col: loc.col + (atChunkStart ? 0 : 1)
	};
};
//Checks the handler to ensure it is an object with the right interface.
//Throws an Error when `handler` is not a non-null object, or when any of the
//required methods (reset, done, openTag, closeTag, writeText, writeComment,
//writeDirective) is not a function. Returns nothing on success.
var validateHandler = function(handler){
	//typeof null is "object", so null has to be rejected explicitly;
	//otherwise the property lookups below fail with an unrelated TypeError
	if(typeof handler !== "object" || handler === null)
		throw Error("Handler is not an object");
	["reset", "done", "openTag", "closeTag", "writeText", "writeComment", "writeDirective"].forEach(function(name){
		if(typeof handler[name] !== "function")
			throw Error("Handler method '" + name + "' is invalid");
	});
};
Parser.prototype.handleError = function(error){
if(typeof this._handler.error === "function")
this._handler.error(error);
Parser.prototype._handleError = function(error){
if(this._cbs.onerror)
this._cbs.onerror(error);
else throw error;

@@ -329,0 +282,0 @@ };

{
"name": "htmlparser2"
, "description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface (EventedHandler)."
, "version": "1.5.0"
, "author": "Felix Boehm <me@feedic.com>"
, "contributors": [ "Chris Winberry <chris@winberry.net>" ]
, "repository": {
"type": "git"
, "url": "git://github.com/fb55/node-htmlparser.git"
}
, "bugs": {
"mail": "me@feedic.com"
, "url": "http://github.com/fb55/node-htmlparser/issues"
}
, "directories": { "lib": "./lib/" }
, "main": "./lib/"
, "engines": { "node": ">0" }
, "licenses": [{
"type": "MIT"
, "url": "http://github.com/tautologistics/node-htmlparser/raw/master/LICENSE"
"name": "htmlparser2",
"description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface.",
"version": "2.0.0",
"author": "Felix Boehm <me@feedic.com>",
"contributors": ["Chris Winberry <chris@winberry.net>"],
"repository": {
"type": "git",
"url": "git://github.com/fb55/node-htmlparser.git"
},
"bugs": {
"mail": "me@feedic.com",
"url": "http://github.com/fb55/node-htmlparser/issues"
},
"directories": {
"lib": "./lib/"
},
"main": "./lib/",
"scripts": {
"test": "cd tests && node 00-runtests.js"
},
"engines": {
"node": ">0"
},
"licenses": [{
"type": "MIT",
"url": "http://github.com/tautologistics/node-htmlparser/raw/master/LICENSE"
}]
}

@@ -10,7 +10,13 @@ #htmlparser2

This project is linked to [Travis CI](http://travis-ci.org/). The latest build status is:
[![Build Status](https://secure.travis-ci.org/FB55/node-htmlparser.png)](http://travis-ci.org/FB55/node-htmlparser)
##How is this different from [node-htmlparser](https://github.com/tautologistics/node-htmlparser)?
This is a fork of the project above. The main difference is that this is just intended to be used with node. Besides, the code is much better structured, has less duplication and is remarkably faster than the original.
Besides, it features an additional handler that provides the interface of [sax.js](https://github.com/isaacs/sax-js) (written for my readability port [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)).
Besides, the parser now provides the interface of [sax.js](https://github.com/isaacs/sax-js) (originally intended for my readability port [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)).
The support for location data and verbose output was removed a couple of versions ago. It's still available in the [verbose branch](https://github.com/FB55/node-htmlparser/tree/verbose) (if you really need it, for whatever reason that may be).
##Usage

@@ -25,17 +31,14 @@

[...parsing done, do something...]
console.log(dom);
});
var parser = new htmlparser.Parser(handler);
parser.parseComplete(rawHtml);
sys.puts(sys.inspect(handler.dom, false, null));
parser.write(rawHtml);
parser.done();
Output:
##Example output
[{
raw: 'Xyz ',
data: 'Xyz ',
type: 'text'
}, {
raw: 'script language= javascript',
data: 'script language= javascript',
type: 'script',

@@ -47,3 +50,2 @@ name: 'script',

children: [{
raw: 'var foo = \'<bar>\';<',
data: 'var foo = \'<bar>\';<',

@@ -53,3 +55,2 @@ type: 'text'

}, {
raw: '<!-- Waah! -- ',
data: '<!-- Waah! -- ',

@@ -62,161 +63,13 @@ type: 'comment'

...
parser.parseChunk(chunk);
parser.write(chunk);
}
parser.done();
##Parsing RSS/Atom Feeds
new htmlparser.RssHandler(function (error, dom) {
##Parsing RSS/RDF/Atom Feeds
new htmlparser.FeedHandler(function (error, dom) {
...
});
##Parser options
###Usage
var Parser = new htmlparser.Parser(handler, options);
###Option: includeLocation
Indicates whether the parser should include the location of a token as part of it. Default: false.
###Option: xmlMode
Indicates whether `<script>` and `<style>` tags should get special treatment. If false, their content will be text only. For RSS feeds and other XML content (not HTML), set this to true. Default: false.
##DefaultHandler options
###Usage
var handler = new htmlparser.DefaultHandler(function (error) {...}, {
verbose: false,
ignoreWhitespace: true
});
###Option: ignoreWhitespace
Indicates whether the DOM should exclude text nodes that consist solely of whitespace. The default value is "false".
The following HTML will be used:
<font>
<br>this is the text
<font>
####Example: true
[{
raw: 'font',
data: 'font',
type: 'tag',
name: 'font',
children: [{
raw: 'br',
data: 'br',
type: 'tag',
name: 'br'
}, {
raw: 'this is the text\n',
data: 'this is the text\n',
type: 'text'
}, {
raw: 'font',
data: 'font',
type: 'tag',
name: 'font'
}]
}]
####Example: false
[{
raw: 'font',
data: 'font',
type: 'tag',
name: 'font',
children: [{
raw: '\n\t',
data: '\n\t',
type: 'text'
}, {
raw: 'br',
data: 'br',
type: 'tag',
name: 'br'
}, {
raw: 'this is the text\n',
data: 'this is the text\n',
type: 'text'
}, {
raw: 'font',
data: 'font',
type: 'tag',
name: 'font'
}]
}]
###Option: verbose
Indicates whether to include extra information on each node in the DOM. This information consists of the "raw" attribute (original, unparsed text found between "<" and ">") and the "data" attribute on "tag", "script", and "comment" nodes. The default value is "true".
The following HTML is used:
<a href="test.html">xxx</a>
####Example: true
[{
raw: 'a href="test.html"',
data: 'a href="test.html"',
type: 'tag',
name: 'a',
attribs: {
href: 'test.html'
},
children: [{
raw: 'xxx',
data: 'xxx',
type: 'text'
}]
}]
####Example: false
[{
type: 'tag',
name: 'a',
attribs: {
href: 'test.html'
},
children: [{
data: 'xxx',
type: 'text'
}]
}]
###Option: enforceEmptyTags
Indicates whether the DOM should prevent children on tags marked as empty in the HTML spec. Typically this should be set to "true" for HTML parsing and "false" for XML parsing. The default value is "true".
The following HTML is used:
<link>text</link>
####Example: true
[{
raw: 'link',
data: 'link',
type: 'tag',
name: 'link'
}, {
raw: 'text',
data: 'text',
type: 'text'
}]
####Example: false
[{
raw: 'link',
data: 'link',
type: 'tag',
name: 'link',
children: [{
raw: 'text',
data: 'text',
type: 'text'
}]
}]
##Further reading
* [Parser options](https://github.com/FB55/node-htmlparser/wiki/Parser-options)
* [DefaultHandler options](https://github.com/FB55/node-htmlparser/wiki/DefaultHandler-options)

@@ -1,69 +0,62 @@

var sys = require("sys");
var fs = require("fs");
var htmlparser = require("..");
var testFolder = ".";
var chunkSize = 5;
var testCount = 0,
failCount = 0,
totalTime = 0;
var testFiles = fs.readdirSync(testFolder);
var testCount = 0;
var failedCount = 0;
var totalTime = 0;
var name = __filename.split("/").slice(-1)[0];
var handler;
for (var i = 1; i < testFiles.length; i++) {
if(testFiles[i] === name) continue;
testCount++;
var moduleName = testFiles[i];
var test = require(testFolder + "/" + moduleName);
var handlerCallback = function handlerCallback (error) {
if (error)
sys.puts("Handler error: " + error);
}
console.log(testFiles[i]);
var start = Date.now();
if(test.type === "rss"){
handler = new htmlparser.RssHandler(handlerCallback, test.options.handler);
}
else if(test.type === "event"){
handler = new htmlparser.EventedHandler(test.options.handler);
}
else{
handler = new htmlparser.DefaultHandler(handlerCallback, test.options.handler);
}
var parser = new htmlparser.Parser(handler, test.options.parser);
parser.parseComplete(test.html);
var resultComplete = handler.dom;
if(test.type === "event"){
resultComplete = test.result;
test.result = [];
}
var chunkPos = 0;
parser.reset();
while (chunkPos < test.html.length) {
parser.parseChunk(test.html.substring(chunkPos, chunkPos + chunkSize));
chunkPos += chunkSize;
}
parser.done();
var resultChunk = handler.dom;
if(test.type === "event"){
resultChunk = test.result;
}
var testResult = sys.inspect(resultComplete, false, null) === sys.inspect(test.expected, false, null)
&& sys.inspect(resultChunk, false, null) === sys.inspect(test.expected, false, null);
var took = Date.now() - start;
totalTime += took;
sys.puts("[" + test.name + "\]: " + (testResult ? "passed" : "FAILED") + " (took: " + took + "ms)");
if (!testResult) {
failedCount++;
sys.puts("== Complete ==");
sys.puts(sys.inspect(resultComplete, false, null));
sys.puts("== Chunked ==");
sys.puts(sys.inspect(resultChunk, false, null));
sys.puts("== Expected ==");
sys.puts(sys.inspect(test.expected, false, null));
}
}
sys.puts("Total time: " + totalTime);
sys.puts("Total tests: " + testCount);
sys.puts("Failed tests: " + failedCount);
/*
	Runs one test suite.

	test.dir  - directory whose files are loaded as test cases; the path is
	            built as `test.dir + file`, and entries whose name starts
	            with "." are skipped
	test.test - function(file, callback) that runs a single test case and
	            reports its outcome through callback(err, dom)

	For every test case the callback's `dom` is compared with `file.expected`
	via their JSON serialisations. The case is tallied (testCount/failCount)
	on the second invocation of the callback; a mismatch on either invocation
	marks it as failed. The time spent in the whole suite is added to
	totalTime.
*/
function runTests(test){
	var begin = Date.now();
	//read files, load them, run them
	fs.readdirSync(test.dir
	).map(function(file){
		if(file[0] === ".") return false;
		return require(test.dir + file);
	}).forEach(function(file){
		if(file === false) return;
		//`took` must be declared here: without the comma after `start` it
		//would be a plain assignment to the suite-level variable below
		var second = false,
		    failed = false,
		    start = Date.now(),
		    took = 0;
		console.log("Testing:", file.name);
		test.test(file, function(err, dom){
			if(err) console.log("Handler error:", err);
			took += Date.now() - start;
			var expected = JSON.stringify(file.expected, null, 2),
			    got = JSON.stringify(dom, null, 2);
			if(expected !== got){
				failed = true;
				console.log("Expected", expected, "Got", got, second);
			}
			//restart the clock so the comparison above is not timed
			start = Date.now();
			if(second){
				testCount+=1;
				if(failed) failCount+=1;
				console.log("["+file.name+"]:",failed?"failed":"passed","(took",took,"ms)");
			}
			else second = true;
		});
	});
	var elapsed = Date.now()-begin;
	totalTime+=elapsed;
	console.log(test.dir,"took",elapsed);
}
//run all tests
//each listed module is loaded with require() and the resulting export is
//handed to runTests, in the order given here
["./01-html.js", "./02-feed.js", "./03-events.js", "./04-dom_utils.js"]
.map(require)
.forEach(runTests);
//log the results
console.log("Total time:", totalTime);
console.log("Total tests:", testCount);
console.log("Failed tests:", failCount);
//any failure ends the run with an uncaught Error
//NOTE(review): presumably so that `npm test` reports a failure — confirm
if(failCount !== 0){
throw Error("Encountered " + failCount + " errors!");
}
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc