htmlparser2
Advanced tools
Comparing version 2.0.1 to 2.1.0
@@ -100,3 +100,41 @@ var ElementType = require("./ElementType.js"); | ||
else return filter(function(elem){return elem.type === type;}, element, recurse, limit); | ||
}, | ||
getInnerHTML: function(elem){ | ||
if(!elem.children) return ""; | ||
var childs = elem.children, | ||
childNum = childs.length, | ||
ret = ""; | ||
for(var i = 0; i < childNum; i++){ | ||
ret += this.getOuterHTML(childs[i]); | ||
} | ||
return ret; | ||
}, | ||
getOuterHTML: function(elem){ | ||
var type = elem.type; | ||
if(type === ElementType.Text) return elem.data; | ||
if(type === ElementType.Comment) return "<!--" + elem.data + "-->"; | ||
var ret = "<" + elem.name; | ||
var value; | ||
for(var name in elem.attribs){ | ||
value = elem.attribs[name]; | ||
ret += " " + name + "="; | ||
if(/^[^\s"\'\`\=\<\>]+$/.test(value)) ret += value; | ||
else if(value.indeOf("\"") !== -1) ret += "'" + value + "'"; | ||
else ret += "\"" + value + "\""; | ||
} | ||
if(type === ElementType.Directive) return ret + ">"; | ||
if(type === ElementType.Tag && !elem.children) return ret + " />"; | ||
return ">" + ret + this.getInnerHTML(elem) + "</" + elem.name + ">"; | ||
} | ||
}; |
@@ -8,7 +8,6 @@ var ElementType = require("./ElementType.js"); | ||
this._buffer = ""; | ||
this._prevTagSep = ""; | ||
this._tagSep = ""; | ||
this._stack = []; | ||
this._contentFlags = 0; | ||
this._done = false; | ||
this._parseState = ElementType.Text; | ||
} | ||
@@ -18,4 +17,4 @@ | ||
//Regular expressions used for cleaning up and parsing (stateless) | ||
var _reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //matches tagnames | ||
var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;//" | ||
var _reTagName = /[^\s\/]+/; //matches tagnames | ||
var _reAttrib = /([^=<>\"\'\s]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^'"\s]+))|([^=<>\"\'\s\/]+)/g; | ||
@@ -42,4 +41,4 @@ Parser.prototype._options = { | ||
this.reset(); | ||
this.parseChunk(data); | ||
this.done(); | ||
this.write(data); | ||
this.end(); | ||
}; | ||
@@ -56,3 +55,3 @@ | ||
//Tells the parser that the HTML being parsed is complete | ||
Parser.prototype.done = function(){ | ||
Parser.prototype.end = Parser.prototype.done = function(chunk){ | ||
if(this._done) return; | ||
@@ -62,2 +61,3 @@ this._done = true; | ||
//Parse the buffer to its end | ||
if(chunk) this._buffer += chunk; | ||
if(this._buffer) this._parseTags(true); | ||
@@ -81,4 +81,4 @@ | ||
var parseAttributes = function(data){ | ||
var pos = data.search(/\s/), attrs = {}; //Find any whitespace | ||
if(pos === -1) return attrs; | ||
var pos = data.search(/\w\s/) + 1, attrs = {}; //Find any whitespace | ||
if(pos === 0) return attrs; | ||
var attribRaw = data.substr(pos); | ||
@@ -90,6 +90,4 @@ | ||
while(match = _reAttrib.exec(attribRaw)){ | ||
if(match[1]) attrs[match[1]] = match[2]; | ||
else if(match[3]) attrs[match[3]] = match[4]; | ||
else if(match[5]) attrs[match[5]] = match[6]; | ||
else if(match[7]) attrs[match[7]] = match[7]; | ||
if(match[1]) attrs[match[1]] = match[2] || match[3] || match[4]; | ||
else attrs[match[5]] = match[5]; | ||
} | ||
@@ -105,5 +103,5 @@ | ||
if(this._options.lowerCaseTags){ | ||
return match[1] + match[2].toLowerCase(); | ||
return match[0].toLowerCase(); | ||
} | ||
else return match[1] + match[2]; | ||
else return match[0]; | ||
}; | ||
@@ -123,3 +121,3 @@ | ||
var next, tagSep, rawData, elementName, elementType, elementData; | ||
var next, rawData, elementType, elementData, lastTagSep; | ||
@@ -132,82 +130,76 @@ var opening = buffer.indexOf("<"), closing = buffer.indexOf(">"); | ||
while(opening !== closing){ //just false if both are -1 | ||
lastTagSep = this._tagSep; | ||
if((opening !== -1 && opening < closing) || closing === -1){ | ||
next = opening; | ||
tagSep = "<"; | ||
opening = buffer.indexOf(tagSep, next + 1); | ||
this._tagSep = "<"; | ||
opening = buffer.indexOf("<", next + 1); | ||
} | ||
else{ | ||
next = closing; | ||
tagSep = ">"; | ||
closing = buffer.indexOf(tagSep, next + 1); | ||
this._tagSep = ">"; | ||
closing = buffer.indexOf(">", next + 1); | ||
} | ||
rawData = buffer.substring(current, next); //The next chunk of data to parse | ||
elementType = this._parseState; | ||
//set elements for next run | ||
current = next + 1; | ||
this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text; | ||
if(elementType === ElementType.Tag){ | ||
elementData = rawData.trim(); | ||
elementName = this._parseTagName(elementData); | ||
} | ||
else{ | ||
elementData = rawData; | ||
elementName = ""; | ||
} | ||
//This section inspects the current tag stack and modifies the current | ||
//element if we're actually parsing a special area (script/comment/style tag) | ||
if(this._contentFlags === 0){ /*do nothing*/ } | ||
else if(this._contentFlags >= SpecialTags[ElementType.Comment]){ | ||
if(this._contentFlags >= SpecialTags[ElementType.Comment]){ | ||
//We're currently in a comment tag | ||
this._processComment(rawData, tagSep); | ||
this._processComment(rawData); | ||
continue; | ||
} | ||
//if it's a closing tag, remove the flag | ||
else if(this._contentFlags >= SpecialTags[ElementType.Script] && elementName === "/script"){ | ||
//remove the script flag (also removes the written flag) | ||
this._contentFlags %= SpecialTags[ElementType.Script]; | ||
} | ||
else if(this._contentFlags >= SpecialTags[ElementType.Style] && elementName === "/style"){ | ||
//remove the style flag (also removes the written flag) | ||
this._contentFlags %= SpecialTags[ElementType.Style]; | ||
} | ||
//special behaviour for script & style tags | ||
//Make sure we're not in a comment | ||
else if(!this._options.xmlMode && rawData.substring(0, 3) !== "!--"){ | ||
//If the previous element is text, append the last tag sep to element | ||
if(this._contentFlags >= SpecialTags.w){ | ||
if(this._cbs.ontext) this._cbs.ontext(this._prevTagSep + rawData); | ||
if(lastTagSep === "<"){ | ||
elementData = rawData.trimLeft(); | ||
if(elementData.charAt(0) === "/"){ | ||
//elementData = elementData.substr(1).trim(); | ||
elementData = this._parseTagName(elementData.substr(1)); | ||
if(this._contentFlags !== 0){ | ||
//if it's a closing tag, remove the flag | ||
if(this._contentFlags >= SpecialTags[ElementType.Script] && elementData === "script"){ | ||
//remove the script flag (also removes the written flag) | ||
this._contentFlags %= SpecialTags[ElementType.Script]; | ||
} | ||
else if(this._contentFlags >= SpecialTags[ElementType.Style] && elementData === "style"){ | ||
//remove the style flag (also removes the written flag) | ||
this._contentFlags %= SpecialTags[ElementType.Style]; | ||
} | ||
else { | ||
this._writeSpecial(rawData, lastTagSep); | ||
continue; | ||
} | ||
} | ||
this._processCloseTag(elementData); | ||
} | ||
else{ //The previous element was not text | ||
this._contentFlags += SpecialTags.w; | ||
if(rawData !== "" && this._cbs.ontext) this._cbs.ontext(rawData); | ||
else if(elementData.charAt(0) === "!" || elementData.charAt(0) === "?"){ | ||
if(elementData.substr(0, 3) === "!--"){ | ||
//This tag is a comment | ||
this._contentFlags += SpecialTags[ElementType.Comment]; | ||
this._processComment(rawData.substr(3)); | ||
} | ||
else if(this._contentFlags !== 0){ | ||
this._writeSpecial(rawData, lastTagSep); | ||
} | ||
//This tag is a directive | ||
//TODO: what about CDATA? | ||
else if(this._cbs.onprocessinginstruction){ | ||
this._cbs.onprocessinginstruction( | ||
elementData.charAt(0) + this._parseTagName(elementData.substr(1)), | ||
elementData | ||
); | ||
} | ||
} | ||
this._prevTagSep = tagSep; | ||
continue; | ||
else if(this._contentFlags !== 0) this._writeSpecial(rawData, lastTagSep); | ||
else this._processOpenTag(this._parseTagName(elementData), elementData); | ||
} | ||
//Processing of non-special tags | ||
if(elementType === ElementType.Tag){ | ||
if(rawData.substring(0, 3) === "!--"){ //This tag is a comment | ||
this._contentFlags += SpecialTags[ElementType.Comment]; | ||
this._processComment(rawData.substr(3), tagSep); | ||
continue; | ||
else{ | ||
if(this._contentFlags !== 0){ | ||
this._writeSpecial(rawData, lastTagSep); | ||
} | ||
if(rawData.charAt(0) === "!" || rawData.charAt(0) === "?"){ | ||
//ElementType.Directive | ||
//TODO: what about CDATA? | ||
if(this._cbs.onprocessinginstruction){ | ||
this._cbs.onprocessinginstruction(elementName, elementData); | ||
} | ||
continue; | ||
else if(rawData !== "" && this._cbs.ontext){ | ||
this._cbs.ontext(rawData); | ||
} | ||
if(elementName.charAt(0) === "/") this._processCloseTag(elementName.substr(1)); | ||
else this._processOpenTag(elementName, elementData, tagSep); | ||
} | ||
else if(elementType === ElementType.Text && rawData !== "" && this._cbs.ontext){ | ||
this._cbs.ontext(elementData); | ||
} | ||
} | ||
@@ -218,6 +210,4 @@ | ||
Parser.prototype._processComment = function(rawData, tagSep){ | ||
this._prevTagSep = tagSep; | ||
if(tagSep === ">" && rawData.substr(-2) === "--"){ //comment ends | ||
Parser.prototype._processComment = function(rawData){ | ||
if(this._tagSep === ">" && rawData.substr(-2) === "--"){ //comment ends | ||
//remove the written flag (also removes the comment flag) | ||
@@ -228,13 +218,20 @@ this._contentFlags %= SpecialTags.w; | ||
} | ||
else if(this._cbs.oncomment) this._cbs.oncomment(rawData + tagSep); | ||
else if(this._cbs.oncomment) this._cbs.oncomment(rawData + this._tagSep); | ||
}; | ||
/*
	Emits raw data through the ontext callback while the parser has content flags set.
	rawData    - the data found between the last two tag separators
	lastTagSep - the separator ("<" or ">") that preceded rawData
	If the SpecialTags.w flag is already set, the separator is put back
	in front of the data. Otherwise the flag is set and the data is
	emitted as-is (empty data is not emitted).
*/
Parser.prototype._writeSpecial = function(rawData, lastTagSep){
	var alreadyWritten = this._contentFlags >= SpecialTags.w;

	if(!alreadyWritten){
		//first piece of text: remember that something was written
		this._contentFlags += SpecialTags.w;
		if(rawData === "") return;
	}

	if(this._cbs.ontext){
		this._cbs.ontext(alreadyWritten ? lastTagSep + rawData : rawData);
	}
};
var emptyTags = require("./ClosingTags.js").self; | ||
/*
	Checks whether a tag name is listed in emptyTags.
	Always returns false in xmlMode; otherwise returns the emptyTags
	entry for the name (undefined if the name isn't listed).
*/
Parser.prototype._isEmptyTag = function(name){
	if(this._options.xmlMode) return false;
	return emptyTags[name];
};
Parser.prototype._processCloseTag = function(name){ | ||
if(this._stack && !this._isEmptyTag(name)){ | ||
if(this._stack && (!emptyTags[name] || this._options.xmlMode)){ | ||
var i = this._stack.length; | ||
@@ -254,3 +251,3 @@ while(i !== 0 && this._stack[--i] !== name){} | ||
Parser.prototype._processOpenTag = function(name, data, tagSep){ | ||
Parser.prototype._processOpenTag = function(name, data){ | ||
var type = ElementType.Tag; | ||
@@ -266,3 +263,3 @@ if(this._options.xmlMode){ /*do nothing*/ } | ||
//If tag self-terminates, add an explicit, separate closing tag | ||
if(data.substr(-1) === "/" || this._isEmptyTag(name)){ | ||
if(data.substr(-1) === "/" || (emptyTags[name] && !this._options.xmlMode)){ | ||
if(this._cbs.onclosetag) this._cbs.onclosetag(name); | ||
@@ -272,3 +269,2 @@ } else { | ||
this._stack.push(name); | ||
this._prevTagSep = tagSep; | ||
} | ||
@@ -275,0 +271,0 @@ }; |
{ | ||
"name": "htmlparser2", | ||
"description": "Forgiving HTML/XML/RSS Parser for Node. This version is optimised and cleaned and provides a SAX interface.", | ||
"version": "2.0.1", | ||
"version": "2.1.0", | ||
"author": "Felix Boehm <me@feedic.com>", | ||
@@ -6,0 +6,0 @@ "contributors": ["Chris Winberry <chris@winberry.net>"], |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
51678
1522