htmlparser2
Advanced tools
Comparing version 1.5.0 to 2.0.0
@@ -8,10 +8,4 @@ var ElementType = require("./ElementType.js"); | ||
this._tagStack = []; | ||
if(options){ //otherwise, the prototype is used | ||
this._options = options; | ||
if(typeof this._options.verbose === "undefined") | ||
this._options.verbose = true; | ||
if (typeof this._options.enforceEmptyTags === "undefined") | ||
this._options.enforceEmptyTags = true; | ||
} | ||
this._callback = callback; | ||
if(options) this._options = options; //otherwise, the prototype is used | ||
if(callback) this._callback = callback; | ||
} | ||
@@ -21,50 +15,34 @@ | ||
DefaultHandler.prototype._options = { | ||
ignoreWhitespace: false, //Keep whitespace-only text nodes | ||
verbose: true, //Keep data property for tags and raw property for all | ||
enforceEmptyTags: true //Don't allow children for HTML tags defined as empty in spec | ||
ignoreWhitespace: false //Keep whitespace-only text nodes | ||
}; | ||
//HTML Tags that shouldn't contain child nodes | ||
var emptyTags={area:true,base:true,basefont:true,br:true,col:true,frame:true,hr:true,img:true,input:true,isindex:true,link:true,meta:true,param:true,embed:true}; | ||
//Resets the handler back to starting state | ||
DefaultHandler.prototype.onreset = DefaultHandler; | ||
//**Public**// | ||
//Methods// | ||
//Resets the handler back to starting state | ||
DefaultHandler.prototype.reset = function() { | ||
DefaultHandler.call(this, this._callback); | ||
}; | ||
//Signals the handler that parsing is done | ||
DefaultHandler.prototype.done = function() { | ||
DefaultHandler.prototype.onend = function(){ | ||
if(this._done) return; | ||
this._done = true; | ||
this.handleCallback(null); | ||
this._handleCallback(null); | ||
}; | ||
//Methods// | ||
DefaultHandler.prototype.error = | ||
DefaultHandler.prototype.handleCallback = function(error){ | ||
if(typeof this._callback === "function") | ||
this._callback(error, this.dom); | ||
else if(error) throw error; | ||
DefaultHandler.prototype.onerror = function(error){ | ||
if(typeof this._callback === "function"){ | ||
return this._callback(error, this.dom); | ||
} else { | ||
if(error) throw error; | ||
} | ||
}; | ||
DefaultHandler.prototype._isEmptyTag = function(name) { | ||
return this._options.enforceEmptyTags && emptyTags[name]; | ||
}; | ||
DefaultHandler.prototype._handleCallback = DefaultHandler.prototype.onerror; | ||
DefaultHandler.prototype.closeTag = function(name){ | ||
//Ignore closing tags that obviously don't have an opening tag | ||
if(!this._tagStack || this._isEmptyTag(name)) return; | ||
var pos = this._tagStack.length - 1; | ||
while(pos !== -1 && this._tagStack[pos--].name !== name){} | ||
if (pos !== -1 || this._tagStack[0].name === name) | ||
this._tagStack.splice(pos+1); | ||
DefaultHandler.prototype.onclosetag = function(name){ | ||
this._tagStack.pop(); | ||
}; | ||
DefaultHandler.prototype._addDomElement = function(element){ | ||
if(!this._options.verbose) delete element.raw; | ||
var lastChild, | ||
lastTag = this._tagStack[this._tagStack.length - 1]; | ||
var lastTag = this._tagStack[this._tagStack.length-1], lastChild; | ||
if(!lastTag) this.dom.push(element); | ||
else{ //There are parent elements | ||
if(lastTag){ //There are parent elements | ||
if(!lastTag.children){ | ||
@@ -74,36 +52,67 @@ lastTag.children = [element]; | ||
} | ||
lastChild = lastTag.children[lastTag.children.length-1]; | ||
if(element.type === ElementType.Comment && lastChild.type === ElementType.Comment){ | ||
lastChild = lastTag.children[lastTag.children.length - 1]; | ||
if(this._inSpecialTag && element.type === ElementType.Text && lastChild.type === ElementType.Text){ | ||
lastChild.data += element.data; | ||
if(this._options.verbose) lastChild.raw = lastChild.data; | ||
} else { | ||
lastTag.children.push(element); | ||
} | ||
else if(this._inSpecialTag && element.type === ElementType.Text && lastChild.type === ElementType.Text){ | ||
lastChild.data += element.data; | ||
if(this._options.verbose) | ||
lastChild.raw = lastChild.data; | ||
} | ||
else lastTag.children.push(element); | ||
} | ||
else { | ||
this.dom.push(element); | ||
} | ||
}; | ||
DefaultHandler.prototype.openTag = function(element){ | ||
if(!this._options.verbose) delete element.data; | ||
var isSpecial = element.type === ElementType.Script || element.type === ElementType.Style; | ||
if(isSpecial) this._inSpecialTag = true; | ||
DefaultHandler.prototype.onopentag = function(name, attribs, type){ | ||
if(type === ElementType.Script || type === ElementType.Style){ | ||
this._inSpecialTag = true; | ||
} | ||
var element = { | ||
type: type, | ||
name: name, | ||
attribs: attribs | ||
}; | ||
this._addDomElement(element); | ||
this._tagStack.push(element); | ||
}; | ||
DefaultHandler.prototype.ontext = function(data){ | ||
if(this._options.ignoreWhitespace && data.trim() === "") return; | ||
this._addDomElement({ | ||
data: data, | ||
type: ElementType.Text | ||
}); | ||
}; | ||
DefaultHandler.prototype.oncomment = function(data){ | ||
var lastTag = this._tagStack[this._tagStack.length - 1]; | ||
var lastChild = lastTag && lastTag.children && lastTag.children[lastTag.children.length - 1]; | ||
//Don't add tags to the tag stack that can't have children | ||
if(!this._isEmptyTag(element.name)) this._tagStack.push(element); | ||
var element; | ||
if(!lastChild || lastChild.type !== ElementType.Comment){ | ||
element = { | ||
data: data, | ||
type: ElementType.Comment | ||
}; | ||
if(!lastTag){ | ||
return this.dom.push(element); | ||
} else if(!lastChild){ | ||
lastTag.children = [element]; | ||
} else { | ||
if(lastChild.type !== ElementType.Comment){ | ||
lastTag.children.push(element); | ||
} | ||
} | ||
} else { | ||
lastChild.data += data; | ||
} | ||
}; | ||
DefaultHandler.prototype.writeText = function(element){ | ||
if(this._options.ignoreWhitespace && element.data.trim() === "") return; | ||
this._addDomElement(element); | ||
DefaultHandler.prototype.onprocessinginstruction = function(name, data){ | ||
this._addDomElement({ | ||
name: name, | ||
data: data, | ||
type: ElementType.Directive | ||
}); | ||
}; | ||
DefaultHandler.prototype.writeComment = DefaultHandler.prototype.writeDirective = DefaultHandler.prototype._addDomElement; | ||
module.exports = DefaultHandler; |
@@ -7,71 +7,96 @@ var ElementType = require("./ElementType.js"); | ||
function testElement(options, element) { | ||
if (!element) return false; | ||
var type = element.type; | ||
var arrayPush = Array.prototype.push; | ||
function filterArray(test, arr, recurse, limit){ | ||
var result = [], childs; | ||
for(var i = 0, j = arr.length; i < j; i++){ | ||
if(test(arr[i])){ | ||
result.push(arr[i]); | ||
if(--limit <= 0) break; | ||
} | ||
if(recurse && (childs = arr[i].children)){ | ||
childs = filterArray(test, childs, limit); | ||
arrayPush.apply(result, childs); | ||
limit -= childs.length; | ||
if(limit <= 0) break; | ||
} | ||
} | ||
return result; | ||
} | ||
for (var key in options) { | ||
if (key === "tag_name") { | ||
if (type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false; | ||
if (!options.tag_name(element.name)) return false; | ||
} else if (key === "tag_type") { | ||
if (!options.tag_type(type)) return false; | ||
} else if (key === "tag_contains") { | ||
if (type !== ElementType.Text && type !== ElementType.Comment && type !== ElementType.Directive) return false; | ||
if (!options.tag_contains(element.data)) return false; | ||
} else if (!element.attribs || !options[key](element.attribs[key])) | ||
return false; | ||
} | ||
return true; | ||
function filter(test, element, recurse, limit){ | ||
if(recurse !== false) recurse = true; | ||
if(isNaN(limit)) limit = Infinity; | ||
if(!Array.isArray(element)){ | ||
element = [element]; | ||
} | ||
return filterArray(test, element, recurse, limit); | ||
} | ||
module.exports = { | ||
testElement: testElement, | ||
testElement: function testElement(options, element) { | ||
var type = element.type; | ||
getElements: function(options, currentElement, recurse, limit){ | ||
recurse = recurse === undefined || recurse === null || recurse; | ||
if(isNaN(limit)) limit = -1; | ||
for(var key in options){ | ||
if(key === "tag_name"){ | ||
if(type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false; | ||
if(!options.tag_name(element.name)) return false; | ||
} else if(key === "tag_type") { | ||
if(!options.tag_type(type)) return false; | ||
} else if(key === "tag_contains") { | ||
if(type !== ElementType.Text && type !== ElementType.Comment && type !== ElementType.Directive) return false; | ||
if(!options.tag_contains(element.data)) return false; | ||
} else if(!element.attribs || !options[key](element.attribs[key])) | ||
return false; | ||
} | ||
return true; | ||
}, | ||
getElements: function(options, element, recurse, limit){ | ||
for(var key in options){ | ||
if (typeof options[key] !== "function") | ||
if(typeof options[key] !== "function"){ | ||
options[key] = getTest(options[key]); | ||
} | ||
} | ||
return this.testAttr(testElement.bind(null, options), currentElement, recurse, limit); | ||
} | ||
, getElementById: function(id, currentElement, recurse) { | ||
var result = this.getElements({ id: id }, currentElement, recurse, 1); | ||
return filter(this.testElement.bind(null, options), element, recurse, limit); | ||
}, | ||
getElementById: function(id, element, recurse) { | ||
var result; | ||
if(typeof id === "function"){ | ||
result = filter(function(elem){ | ||
return elem.attribs && id(elem.attribs); | ||
}, element, recurse, 1); | ||
} | ||
else{ | ||
result = filter(function(elem){ | ||
return elem.attribs && elem.attribs.id === id; | ||
}, element, recurse, 1); | ||
} | ||
return result.length ? result[0] : null; | ||
//function(elem){return elem.attribs && elem.attribs.id === id;} | ||
} | ||
, getElementsByTagName: function(name, currentElement, recurse, limit) { | ||
return this.getElements({ tag_name: name }, currentElement, recurse, limit); | ||
/*function(elem){ | ||
}, | ||
getElementsByTagName: function(name, element, recurse, limit){ | ||
if(typeof name === "function") return filter(function(elem){ | ||
var type = elem.type; | ||
if(type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false; | ||
return name(elem.name); | ||
}, element, recurse, limit); | ||
else return filter(function(elem){ | ||
var type = elem.type; | ||
if(type !== ElementType.Tag && type !== ElementType.Script && type !== ElementType.Style) return false; | ||
return elem.name === name; | ||
};*/ | ||
}, element, recurse, limit); | ||
}, | ||
getElementsByTagType: function(type, element, recurse, limit){ | ||
if(typeof type === "function"){ | ||
return filter(function(elem){return type(elem.type);}, element, recurse, limit); | ||
} | ||
else return filter(function(elem){return elem.type === type;}, element, recurse, limit); | ||
} | ||
, getElementsByTagType: function(type, currentElement, recurse, limit){ | ||
return this.testAttr(function(elem){return elem.type === type;}, currentElement, recurse, limit); | ||
//function(elem){return elem.type === type;} | ||
} | ||
, testAttr: function(test, element, recurse, limit){ | ||
var found = [], elementList; | ||
if(!element) return found; | ||
if(test(element)) found.push(element); | ||
if(recurse && element.children) elementList = element.children; | ||
else if(Array.isArray(element)) elementList = element; | ||
else return found; | ||
for(var i = 0, j = elementList.length; i < j && (limit < 0 || found.length < limit); i++){ | ||
found = found.concat(this.testAttr(test, elementList[i], recurse, limit)); | ||
} | ||
return found; | ||
}}; | ||
}; |
module.exports = { | ||
Parser: require("./Parser.js"), | ||
DefaultHandler: require("./DefaultHandler.js"), | ||
RssHandler: require("./RssHandler.js"), | ||
EventedHandler: require("./EventedHandler.js"), | ||
FeedHandler: require("./FeedHandler.js"), | ||
ElementType: require("./ElementType.js"), | ||
DomUtils: require("./DomUtils.js") | ||
} |
var ElementType = require("./ElementType.js"); | ||
function Parser(handler, options){ | ||
function Parser(cbs, options){ | ||
if(options) this._options = options; | ||
if(cbs) this._cbs = cbs; | ||
validateHandler(handler); | ||
this._handler = handler; | ||
this._buffer = ""; | ||
this._prevTagSep = ""; | ||
this._stack = []; | ||
this._contentFlags = 0; | ||
this._done = false; | ||
this._current = 0; | ||
this._location = { | ||
row: 0, | ||
col: 0, | ||
charOffset: 0 | ||
}; | ||
this._parseState = ElementType.Text; | ||
@@ -24,18 +17,20 @@ } | ||
//Regular expressions used for cleaning up and parsing (stateless) | ||
var _reWhitespace = /\s/; //Used to find any whitespace to split on | ||
var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element | ||
var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //matches tagnames | ||
var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;//" | ||
//Find attributes in a tag | ||
var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; | ||
var tagTypes = {}; | ||
tagTypes[ ElementType.Script ] = true; | ||
tagTypes[ ElementType.Style ] = true; | ||
tagTypes[ ElementType.Tag ] = true; | ||
Parser.prototype._options = { | ||
includeLocation: false, //Do not track element position in document by default | ||
xmlMode: false //Special behaviour for script/style tags by default | ||
xmlMode: false, //Special behaviour for script/style tags by default | ||
lowerCaseTags: false //call .toLowerCase for each tagname | ||
}; | ||
Parser.prototype._cbs = { | ||
/* | ||
onopentag, | ||
onclosetag, | ||
ontext, | ||
onprocessinginstruction, | ||
oncomment | ||
*/ | ||
}; | ||
//**Public**// | ||
@@ -51,6 +46,7 @@ //Methods// | ||
//Parses a piece of an HTML document | ||
Parser.prototype.write = | ||
Parser.prototype.parseChunk = function(data){ | ||
if(this._done) this.handleError(Error("Attempted to parse chunk after parsing already done")); | ||
if(this._done) this._handleError(Error("Attempted to parse chunk after parsing already done")); | ||
this._buffer += data; //FIXME: this can be a bottleneck | ||
this.parseTags(); | ||
this._parseTags(); | ||
}; | ||
@@ -63,20 +59,10 @@ | ||
//Push any unparsed text into a final element in the element list | ||
if(this._buffer){ | ||
var data = this._buffer; | ||
if(this._parseState === ElementType.Tag){ | ||
data = data.trim(); | ||
var name = parseTagName(data); | ||
if(name.charAt(0) === "/") this._handler.closeTag(name.substr(1)); | ||
else this._handler.openTag({ | ||
name: name, raw: data, data: data, attribs: parseAttributes(data) | ||
}); | ||
} | ||
else this._handler.writeText({ | ||
raw: data, data: data, type: ElementType.Text | ||
}); | ||
this._buffer = ""; | ||
//Parse the buffer to its end | ||
if(this._buffer) this._parseTags(true); | ||
if(this._cbs.onclosetag){ | ||
while(this._stack.length) this._cbs.onclosetag(this._stack.pop()); | ||
} | ||
this._handler.done(); | ||
if(this._cbs.onend) this._cbs.onend(); | ||
}; | ||
@@ -86,4 +72,4 @@ | ||
Parser.prototype.reset = function(){ | ||
Parser.call(this, this._handler); | ||
this._handler.reset(); | ||
Parser.call(this); | ||
if(this._cbs.onreset) this._cbs.onreset(); | ||
}; | ||
@@ -94,8 +80,8 @@ | ||
var parseAttributes = function(data){ | ||
var pos = data.search(_reWhitespace); | ||
if(pos === -1) return; | ||
var pos = data.search(/\s/), attrs = {}; //Find any whitespace | ||
if(pos === -1) return attrs; | ||
var attribRaw = data.substr(pos); | ||
_reAttrib.lastIndex = 0; | ||
var match, attrs = {}; | ||
var match; | ||
@@ -113,6 +99,9 @@ while(match = _reAttrib.exec(attribRaw)){ | ||
//Extracts the base tag name from the data value of an element | ||
var parseTagName = function(data){ | ||
Parser.prototype._parseTagName = function(data){ | ||
var match = data.match(_reTagName); | ||
if(match === null) return ""; | ||
return match[1] + match[2]; | ||
if(this._options.lowerCaseTags){ | ||
return match[1] + match[2].toLowerCase(); | ||
} | ||
else return match[1] + match[2]; | ||
}; | ||
@@ -126,14 +115,17 @@ | ||
SpecialTags.w = 4; //2^2 - if set, append prev tag sep to data | ||
SpecialTags[ElementType.Comment] = 8; //2^8 | ||
SpecialTags[ElementType.Comment] = 8; //2^3 | ||
//Parses through HTML text and returns an array of found elements | ||
Parser.prototype.parseTags = function(){ | ||
var buffer = this._buffer; | ||
Parser.prototype._parseTags = function(force){ | ||
var buffer = this._buffer, current = 0; | ||
var next, tagSep, rawData, element, elementName, prevElement, elementType, elementData, attributes; | ||
var next, tagSep, rawData, elementName, elementType, elementData; | ||
var opening = buffer.indexOf("<"), closing = buffer.indexOf(">"); | ||
//if force is true, parse everything | ||
if(force) opening = Infinity; | ||
while(opening !== closing){ //just false if both are -1 | ||
if(closing === -1 || (opening !== -1 && opening < closing)){ | ||
if((opening !== -1 && opening < closing) || closing === -1){ | ||
next = opening; | ||
@@ -148,4 +140,7 @@ tagSep = "<"; | ||
} | ||
rawData = buffer.substring(this._current, next); //The next chunk of data to parse | ||
rawData = buffer.substring(current, next); //The next chunk of data to parse | ||
elementType = this._parseState; | ||
//set elements for next run | ||
current = next + 1; | ||
this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text; | ||
@@ -155,3 +150,3 @@ | ||
elementData = rawData.trim(); | ||
elementName = parseTagName(elementData); | ||
elementName = this._parseTagName(elementData); | ||
} | ||
@@ -169,3 +164,3 @@ else{ | ||
this._processComment(rawData, tagSep); | ||
elementType = null; | ||
continue; | ||
} | ||
@@ -184,13 +179,12 @@ //if it's a closing tag, remove the flag | ||
else if(!this._options.xmlMode && rawData.substring(0, 3) !== "!--"){ | ||
//All data from here to style close is now a text element | ||
elementType = ElementType.Text; | ||
//If the previous element is text, append the last tag sep to element | ||
if(this._contentFlags >= SpecialTags.w){ | ||
elementData = rawData = this._prevTagSep + rawData; | ||
if(this._cbs.ontext) this._cbs.ontext(this._prevTagSep + rawData); | ||
} | ||
else{ //The previous element was not text | ||
this._contentFlags += SpecialTags.w; | ||
elementData = rawData; | ||
if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData); | ||
} | ||
this._prevTagSep = tagSep; | ||
continue; | ||
} | ||
@@ -203,30 +197,27 @@ | ||
this._processComment(rawData.substr(3), tagSep); | ||
continue; | ||
} | ||
else if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){ | ||
if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){ | ||
//ElementType.Directive | ||
//TODO: what about CDATA? | ||
element = {raw: elementData, data: elementData, type: ElementType.Directive, name: elementName}; | ||
if(this._options.includeLocation) element.location = this.getLocation(false); | ||
this._handler.writeDirective(element); | ||
} else | ||
this._processTag(elementName, elementData, tagSep, rawData); | ||
if(this._cbs.onprocessinginstruction){ | ||
this._cbs.onprocessinginstruction(elementName, elementData); | ||
} | ||
continue; | ||
} | ||
if(elementName.charAt(0) === "/") this._processCloseTag(elementName.substr(1)); | ||
else this._processOpenTag(elementName, elementData, tagSep); | ||
} | ||
else if(elementType === ElementType.Text && rawData !== ""){ | ||
element = {raw: rawData, data: elementData, type: ElementType.Text}; | ||
if(this._options.includeLocation) element.location = this.getLocation(false); | ||
this._handler.writeText(element); | ||
else if(elementType === ElementType.Text && rawData !== "" && this._cbs.ontext){ | ||
this._cbs.ontext(elementData); | ||
} | ||
this._current = next + 1; | ||
} | ||
if(this._options.includeLocation){ | ||
this.getLocation(); | ||
this._location.charOffset = 0; | ||
} | ||
this._buffer = buffer.substring(this._current); | ||
this._current = 0; | ||
this._buffer = buffer.substring(current); | ||
}; | ||
Parser.prototype._processComment = function(rawData, tagSep){ | ||
this._prevTagSep = tagSep; | ||
if(tagSep === ">" && rawData.substr(-2) === "--"){ //comment ends | ||
@@ -238,21 +229,29 @@ //remove the written flag (also removes the comment flag) | ||
else rawData += tagSep; | ||
this._prevTagSep = tagSep; | ||
var element = { | ||
raw: rawData, | ||
data: rawData, | ||
type: ElementType.Comment | ||
}; | ||
if(this._options.includeLocation) element.location = this.getLocation(false); | ||
this._handler.writeComment(element); | ||
if(this._cbs.oncomment) this._cbs.oncomment(rawData); | ||
}; | ||
Parser.prototype._processTag = function(name, data, tagSep, raw){ | ||
if(name.charAt(0) === "/"){ | ||
this._handler.closeTag(name.substring(1)); | ||
return; | ||
var emptyTags = require("./ClosingTags.js").self; | ||
Parser.prototype._isEmptyTag = function(name){ | ||
return !this._options.xmlMode && emptyTags[name]; | ||
}; | ||
Parser.prototype._processCloseTag = function(name){ | ||
if(this._stack && !this._isEmptyTag(name)){ | ||
var i = this._stack.length; | ||
while(i !== 0 && this._stack[--i] !== name){} | ||
if(i !== 0 || this._stack[0] === name) | ||
if(this._cbs.onclosetag){ | ||
while(i < this._stack.length) | ||
this._cbs.onclosetag(this._stack.pop()); | ||
} | ||
else this._stack.splice(i); | ||
} | ||
//many browsers (eg. Safari, Chrome) convert </br> to <br> | ||
else if(name === "br" && !this._options.xmlMode) | ||
this._processOpenTag(name, "/"); | ||
}; | ||
Parser.prototype._processOpenTag = function(name, data, tagSep){ | ||
var type = ElementType.Tag; | ||
@@ -263,19 +262,12 @@ if(this._options.xmlMode){ /*do nothing*/ } | ||
var element = { | ||
raw: raw, data: data, type: type, name: name | ||
}; | ||
if(this._cbs.onopentag){ | ||
this._cbs.onopentag(name, parseAttributes(data), type); | ||
} | ||
var attribs = parseAttributes(data); | ||
if(attribs) element.attribs = attribs; | ||
if(this._options.includeLocation) | ||
element.location = this.getLocation(type === ElementType.Tag); | ||
this._handler.openTag(element); | ||
//If tag self-terminates, add an explicit, separate closing tag | ||
if(data.substr(-1) === "/"){ | ||
this._handler.closeTag(name); | ||
if(data.substr(-1) === "/" || this._isEmptyTag(name)){ | ||
if(this._cbs.onclosetag) this._cbs.onclosetag(name); | ||
} else { | ||
this._contentFlags += SpecialTags[type]; | ||
this._stack.push(name); | ||
this._prevTagSep = tagSep; | ||
@@ -285,44 +277,5 @@ } | ||
Parser.prototype.getLocation = function(startTag){ | ||
var c, end, chunk, | ||
l = this._location; | ||
if(startTag){ | ||
end = this._current - 1; | ||
chunk = l.charOffset === 0 && end === -1; | ||
} else { | ||
end = this._current; | ||
chunk = false; | ||
} | ||
var rows = this._buffer.substring(l.charOffset, end).split("\n"), | ||
rowNum = rows.length - 1; | ||
l.charOffset = end; | ||
l.row += rowNum; | ||
var num = rows[rowNum].replace(/\r/g,"").length; | ||
if(rowNum === 0) l.col += num; | ||
else l.col = num; | ||
if(arguments.length === 0) return; | ||
return { | ||
line: l.row + 1, | ||
col: l.col + (chunk ? 0: 1) | ||
}; | ||
}; | ||
//Checks the handler to ensure it is an object with the right interface | ||
var validateHandler = function(handler){ | ||
if(typeof handler !== "object") | ||
throw Error("Handler is not an object"); | ||
["reset", "done", "openTag", "closeTag", "writeText", "writeComment", "writeDirective"].forEach(function(name){ | ||
if(typeof handler[name] !== "function") | ||
throw Error("Handler method '" + name + "' is invalid"); | ||
}); | ||
}; | ||
Parser.prototype.handleError = function(error){ | ||
if(typeof this._handler.error === "function") | ||
this._handler.error(error); | ||
Parser.prototype._handleError = function(error){ | ||
if(this._cbs.onerror) | ||
this._cbs.onerror(error); | ||
else throw error; | ||
@@ -329,0 +282,0 @@ }; |
{ | ||
"name": "htmlparser2" | ||
, "description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface (EventedHandler)." | ||
, "version": "1.5.0" | ||
, "author": "Felix Boehm <me@feedic.com>" | ||
, "contributors": [ "Chris Winberry <chris@winberry.net>" ] | ||
, "repository": { | ||
"type": "git" | ||
, "url": "git://github.com/fb55/node-htmlparser.git" | ||
} | ||
, "bugs": { | ||
"mail": "me@feedic.com" | ||
, "url": "http://github.com/fb55/node-htmlparser/issues" | ||
} | ||
, "directories": { "lib": "./lib/" } | ||
, "main": "./lib/" | ||
, "engines": { "node": ">0" } | ||
, "licenses": [{ | ||
"type": "MIT" | ||
, "url": "http://github.com/tautologistics/node-htmlparser/raw/master/LICENSE" | ||
"name": "htmlparser2", | ||
"description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface.", | ||
"version": "2.0.0", | ||
"author": "Felix Boehm <me@feedic.com>", | ||
"contributors": ["Chris Winberry <chris@winberry.net>"], | ||
"repository": { | ||
"type": "git", | ||
"url": "git://github.com/fb55/node-htmlparser.git" | ||
}, | ||
"bugs": { | ||
"mail": "me@feedic.com", | ||
"url": "http://github.com/fb55/node-htmlparser/issues" | ||
}, | ||
"directories": { | ||
"lib": "./lib/" | ||
}, | ||
"main": "./lib/", | ||
"scripts": { | ||
"test": "cd tests && node 00-runtests.js" | ||
}, | ||
"engines": { | ||
"node": ">0" | ||
}, | ||
"licenses": [{ | ||
"type": "MIT", | ||
"url": "http://github.com/tautologistics/node-htmlparser/raw/master/LICENSE" | ||
}] | ||
} |
181
README.md
@@ -10,7 +10,13 @@ #htmlparser2 | ||
This project is linked to [Travis CI](http://travis-ci.org/). The latest build status is: | ||
[![Build Status](https://secure.travis-ci.org/FB55/node-htmlparser.png)](http://travis-ci.org/FB55/node-htmlparser) | ||
##How is this different from [node-htmlparser](https://github.com/tautologistics/node-htmlparser)? | ||
This is a fork of the project above. The main difference is that this is just intended to be used with node. Besides, the code is much better structured, has less duplication and is remarkably faster than the original. | ||
Besides, it features an additional handler that provides the interface of [sax.js](https://github.com/isaacs/sax-js) (written for my readability port [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)). | ||
Besides, the parser now provides the interface of [sax.js](https://github.com/isaacs/sax-js) (originally intended for my readability port [readabilitySAX](https://github.com/fb55/readabilitysax)). I also fixed a couple of bugs & included some pull requests for the original project (eg. [RDF feed support](https://github.com/tautologistics/node-htmlparser/pull/35)). | ||
The support for location data and verbose output was removed a couple of versions ago. It's still available in the [verbose branch](https://github.com/FB55/node-htmlparser/tree/verbose) (if you really need it, for whatever reason that may be). | ||
##Usage | ||
@@ -25,17 +31,14 @@ | ||
[...parsing done, do something...] | ||
console.log(dom); | ||
}); | ||
var parser = new htmlparser.Parser(handler); | ||
parser.parseComplete(rawHtml); | ||
sys.puts(sys.inspect(handler.dom, false, null)); | ||
parser.write(rawHtml); | ||
parser.done(); | ||
Output: | ||
##Example output | ||
[{ | ||
raw: 'Xyz ', | ||
data: 'Xyz ', | ||
type: 'text' | ||
}, { | ||
raw: 'script language= javascript', | ||
data: 'script language= javascript', | ||
type: 'script', | ||
@@ -47,3 +50,2 @@ name: 'script', | ||
children: [{ | ||
raw: 'var foo = \'<bar>\';<', | ||
data: 'var foo = \'<bar>\';<', | ||
@@ -53,3 +55,2 @@ type: 'text' | ||
}, { | ||
raw: '<!-- Waah! -- ', | ||
data: '<!-- Waah! -- ', | ||
@@ -62,161 +63,13 @@ type: 'comment' | ||
... | ||
parser.parseChunk(chunk); | ||
parser.write(chunk); | ||
} | ||
parser.done(); | ||
##Parsing RSS/Atom Feeds | ||
new htmlparser.RssHandler(function (error, dom) { | ||
##Parsing RSS/RDF/Atom Feeds | ||
new htmlparser.FeedHandler(function (error, dom) { | ||
... | ||
}); | ||
##Parser options | ||
###Usage | ||
var Parser = new htmlparser.Parser(handler, options); | ||
###Option: includeLocation | ||
Indicates whether the parser should include the location of a token as part of it. Default: false. | ||
###Option: xmlMode | ||
Indicates whether `<script>` and `<style>` tags should get special treatment. If false, their content will be text only. For RSS feeds and other XML content (not HTML), set this to true. Default: false. | ||
##DefaultHandler options | ||
###Usage | ||
var handler = new htmlparser.DefaultHandler(function (error) {...}, { | ||
verbose: false, | ||
ignoreWhitespace: true | ||
}); | ||
###Option: ignoreWhitespace | ||
Indicates whether the DOM should exclude text nodes that consist solely of whitespace. The default value is "false". | ||
The following HTML will be used: | ||
<font> | ||
<br>this is the text | ||
<font> | ||
####Example: true | ||
[{ | ||
raw: 'font', | ||
data: 'font', | ||
type: 'tag', | ||
name: 'font', | ||
children: [{ | ||
raw: 'br', | ||
data: 'br', | ||
type: 'tag', | ||
name: 'br' | ||
}, { | ||
raw: 'this is the text\n', | ||
data: 'this is the text\n', | ||
type: 'text' | ||
}, { | ||
raw: 'font', | ||
data: 'font', | ||
type: 'tag', | ||
name: 'font' | ||
}] | ||
}] | ||
####Example: false | ||
[{ | ||
raw: 'font', | ||
data: 'font', | ||
type: 'tag', | ||
name: 'font', | ||
children: [{ | ||
raw: '\n\t', | ||
data: '\n\t', | ||
type: 'text' | ||
}, { | ||
raw: 'br', | ||
data: 'br', | ||
type: 'tag', | ||
name: 'br' | ||
}, { | ||
raw: 'this is the text\n', | ||
data: 'this is the text\n', | ||
type: 'text' | ||
}, { | ||
raw: 'font', | ||
data: 'font', | ||
type: 'tag', | ||
name: 'font' | ||
}] | ||
}] | ||
###Option: verbose | ||
Indicates whether to include extra information on each node in the DOM. This information consists of the "raw" attribute (original, unparsed text found between "<" and ">") and the "data" attribute on "tag", "script", and "comment" nodes. The default value is "true". | ||
The following HTML is used: | ||
<a href="test.html">xxx</a> | ||
####Example: true | ||
[{ | ||
raw: 'a href="test.html"', | ||
data: 'a href="test.html"', | ||
type: 'tag', | ||
name: 'a', | ||
attribs: { | ||
href: 'test.html' | ||
}, | ||
children: [{ | ||
raw: 'xxx', | ||
data: 'xxx', | ||
type: 'text' | ||
}] | ||
}] | ||
####Example: false | ||
[{ | ||
type: 'tag', | ||
name: 'a', | ||
attribs: { | ||
href: 'test.html' | ||
}, | ||
children: [{ | ||
data: 'xxx', | ||
type: 'text' | ||
}] | ||
}] | ||
###Option: enforceEmptyTags | ||
Indicates whether the DOM should prevent children on tags marked as empty in the HTML spec. Typically this should be set to "true" for HTML parsing and "false" for XML parsing. The default value is "true". | ||
The following HTML is used: | ||
<link>text</link> | ||
####Example: true | ||
[{ | ||
raw: 'link', | ||
data: 'link', | ||
type: 'tag', | ||
name: 'link' | ||
}, { | ||
raw: 'text', | ||
data: 'text', | ||
type: 'text' | ||
}] | ||
####Example: false | ||
[{ | ||
raw: 'link', | ||
data: 'link', | ||
type: 'tag', | ||
name: 'link', | ||
children: [{ | ||
raw: 'text', | ||
data: 'text', | ||
type: 'text' | ||
}] | ||
}] | ||
##Further reading | ||
* [Parser options](https://github.com/FB55/node-htmlparser/wiki/Parser-options) | ||
* [DefaultHandler options](https://github.com/FB55/node-htmlparser/wiki/DefaultHandler-options) |
@@ -1,69 +0,62 @@ | ||
var sys = require("sys"); | ||
var fs = require("fs"); | ||
var htmlparser = require(".."); | ||
var testFolder = "."; | ||
var chunkSize = 5; | ||
var testCount = 0, | ||
failCount = 0, | ||
totalTime = 0; | ||
var testFiles = fs.readdirSync(testFolder); | ||
var testCount = 0; | ||
var failedCount = 0; | ||
var totalTime = 0; | ||
var name = __filename.split("/").slice(-1)[0]; | ||
var handler; | ||
for (var i = 1; i < testFiles.length; i++) { | ||
if(testFiles[i] === name) continue; | ||
testCount++; | ||
var moduleName = testFiles[i]; | ||
var test = require(testFolder + "/" + moduleName); | ||
var handlerCallback = function handlerCallback (error) { | ||
if (error) | ||
sys.puts("Handler error: " + error); | ||
} | ||
console.log(testFiles[i]); | ||
var start = Date.now(); | ||
if(test.type === "rss"){ | ||
handler = new htmlparser.RssHandler(handlerCallback, test.options.handler); | ||
} | ||
else if(test.type === "event"){ | ||
handler = new htmlparser.EventedHandler(test.options.handler); | ||
} | ||
else{ | ||
handler = new htmlparser.DefaultHandler(handlerCallback, test.options.handler); | ||
} | ||
var parser = new htmlparser.Parser(handler, test.options.parser); | ||
parser.parseComplete(test.html); | ||
var resultComplete = handler.dom; | ||
if(test.type === "event"){ | ||
resultComplete = test.result; | ||
test.result = []; | ||
} | ||
var chunkPos = 0; | ||
parser.reset(); | ||
while (chunkPos < test.html.length) { | ||
parser.parseChunk(test.html.substring(chunkPos, chunkPos + chunkSize)); | ||
chunkPos += chunkSize; | ||
} | ||
parser.done(); | ||
var resultChunk = handler.dom; | ||
if(test.type === "event"){ | ||
resultChunk = test.result; | ||
} | ||
var testResult = sys.inspect(resultComplete, false, null) === sys.inspect(test.expected, false, null) | ||
&& sys.inspect(resultChunk, false, null) === sys.inspect(test.expected, false, null); | ||
var took = Date.now() - start; | ||
totalTime += took; | ||
sys.puts("[" + test.name + "\]: " + (testResult ? "passed" : "FAILED") + " (took: " + took + "ms)"); | ||
if (!testResult) { | ||
failedCount++; | ||
sys.puts("== Complete =="); | ||
sys.puts(sys.inspect(resultComplete, false, null)); | ||
sys.puts("== Chunked =="); | ||
sys.puts(sys.inspect(resultChunk, false, null)); | ||
sys.puts("== Expected =="); | ||
sys.puts(sys.inspect(test.expected, false, null)); | ||
} | ||
} | ||
sys.puts("Total time: " + totalTime); | ||
sys.puts("Total tests: " + testCount); | ||
sys.puts("Failed tests: " + failedCount); | ||
/**
 * Runs every test file of one suite and records the outcome in the
 * file-level counters `testCount`, `failCount` and `totalTime`.
 *
 * @param {Object} test Suite description.
 * @param {string} test.dir Directory holding the test files. It is prepended
 *   verbatim to each file name, so it has to end with a path separator.
 * @param {Function} test.test Called as `test.test(file, callback)` for each
 *   loaded file. The callback takes `(err, dom)`; the file is only tallied on
 *   its second invocation (presumably one whole-document and one chunked
 *   parse - confirm against the suites).
 */
function runTests(test){
	var begin = Date.now();

	fs.readdirSync(test.dir)
	.filter(function(name){
		//skip dotfiles
		return name[0] !== ".";
	})
	.map(function(name){
		//load all files first, then run them
		return require(test.dir + name);
	})
	.forEach(function(file){
		//`took` must be declared here: without the comma after `start`
		//it would silently alias the suite-level timer of runTests
		var second = false,
		    failed = false,
		    start = Date.now(),
		    took = 0;

		console.log("Testing:", file.name);

		test.test(file, function(err, dom){
			if(err) console.log("Handler error:", err);
			took += Date.now() - start;

			//compare the serialized forms of expected and actual result
			var expected = JSON.stringify(file.expected, null, 2),
			    got = JSON.stringify(dom, null, 2);

			if(expected !== got){
				failed = true;
				console.log("Expected", expected, "Got", got, second);
			}

			//restart the clock so the comparison above is not timed
			start = Date.now();

			if(second){
				testCount += 1;
				if(failed) failCount += 1;
				console.log("[" + file.name + "]:", failed ? "failed" : "passed", "(took", took, "ms)");
			}
			else second = true;
		});
	});

	var elapsed = Date.now() - begin;
	totalTime += elapsed;
	console.log(test.dir, "took", elapsed);
}
//load every suite up front, then execute them in the listed order
[
	"./01-html.js",
	"./02-feed.js",
	"./03-events.js",
	"./04-dom_utils.js"
].map(function(suitePath){
	return require(suitePath);
}).forEach(function(suite){
	runTests(suite);
});

//print the summary
console.log("Total time:", totalTime);
console.log("Total tests:", testCount);
console.log("Failed tests:", failCount);

//fail loudly when at least one test did not pass
if(failCount !== 0){
	var summary = "Encountered " + failCount + " errors!";
	throw Error(summary);
}
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
Dynamic require
Supply chain risk: Dynamic require can indicate the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package
No bug tracker
Maintenance: Package does not have a linked bug tracker in package.json.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
Dynamic require
Supply chain risk: Dynamic require can indicate the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package
No bug tracker
Maintenance: Package does not have a linked bug tracker in package.json.
Found 1 instance in 1 package
47
1478
20
50392
70