advanced-html-parser
Advanced tools
Comparing version 1.0.8 to 1.0.9
@@ -0,2 +1,21 @@ | ||
const enMap = {}; | ||
if (typeof require == 'function') { | ||
const enMap = require('./entity-map').EntityMap; | ||
} | ||
function DOMParser(options) { | ||
if (String.prototype.substr === undefined) | ||
String.prototype.substr = function (start, length) { | ||
var str = this.toString(); | ||
if (length === undefined || length > str.length) | ||
length = str.length; | ||
if (length < 0) | ||
length = 0; | ||
if (start < 0) | ||
start = str.length + start; | ||
if (start < 0) | ||
start = 0; | ||
return str.substring(start, length + start) | ||
} | ||
this.options = options || { locator: {} }; | ||
@@ -15,3 +34,11 @@ | ||
var entityMap = options.entityMap || {}; | ||
if (options.ignoreTags && options.ignoreTags.length>0){ | ||
if (options.onlyBody === true){ | ||
var expression = `<(body)(.*?)>(.|\n)*?<\/(body)>`; | ||
var body = new RegExp(expression, "gmi").exec(source); | ||
if (body && body.length != undefined && body.length > 0) | ||
source = body[0]; | ||
} | ||
if (options.ignoreTags && options.ignoreTags.length > 0) { | ||
var expression = `<(${options.ignoreTags.join("|")})(.*?)>(.|\n)*?<\/(${options.ignoreTags.join("|")})>`; | ||
@@ -22,6 +49,4 @@ console.log("Found ignoreTags executing", expression) | ||
if (typeof require == 'function') { | ||
const enMap = require('./entity-map').EntityMap; | ||
entityMap = Object.assign(enMap, entityMap); | ||
} | ||
entityMap = Object.assign(enMap, entityMap); | ||
if (locator) { | ||
@@ -28,0 +53,0 @@ domBuilder.setDocumentLocator(locator) |
@@ -5,4 +5,4 @@ //[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] | ||
var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]///\u10000-\uEFFFF | ||
var nameChar = new RegExp("[\\-\\.0-9"+nameStartChar.source.slice(1,-1)+"\\u00B7\\u0300-\\u036F\\u203F-\\u2040]"); | ||
var tagNamePattern = new RegExp('^'+nameStartChar.source+nameChar.source+'*(?:\:'+nameStartChar.source+nameChar.source+'*)?$'); | ||
var nameChar = new RegExp("[\\-\\.0-9" + nameStartChar.source.slice(1, -1) + "\\u00B7\\u0300-\\u036F\\u203F-\\u2040]"); | ||
var tagNamePattern = new RegExp('^' + nameStartChar.source + nameChar.source + '*(?:\:' + nameStartChar.source + nameChar.source + '*)?$'); | ||
//var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/ | ||
@@ -15,3 +15,3 @@ //var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',') | ||
var S_ATTR = 1;//attr name offerring | ||
var S_ATTR_SPACE=2;//attr name end and space offer | ||
var S_ATTR_SPACE = 2;//attr name end and space offer | ||
var S_EQ = 3;//=space? | ||
@@ -23,53 +23,77 @@ var S_ATTR_NOQUOT_VALUE = 4;//attr value(no quot value only) | ||
const textParser = require("he"); | ||
const cachedChars = {} | ||
// Per-tag lookup consulted by parseHtmlSpecialContent: `true` means the
// element's content is taken verbatim up to its closing tag (script, textarea),
// `false` means it is parsed as ordinary markup. Tags not listed here are
// classified on first use and stored back into this table.
// The table has a null prototype so that tag names such as "constructor" or
// "toString" read as `undefined` instead of resolving to Object.prototype
// members (which are truthy and would be mistaken for raw-text tags).
const attCheck = Object.assign(Object.create(null), {
    html: false,
    body: false,
    div: false,
    a: false,
    span: false,
    i: false,
    select: false,
    option: false,
    form: false,
    nav: false,
    ul: false,
    li: false,
    dl: false,
    dd: false,
    h5: false,
    table: false,
    tbody: false,
    tr: false,
    td: false,
    b: false,
    h4: false,
    p: false,
    ol: false,
    thead: false,
    th: false,
    em: false,
    button: false,
    del: false,
    h3: false,
    head: false,
    title: false,
    script: true,
    textarea: true,
    style: false
});
/**
 * Decode one matched entity reference, memoising the result in `cachedChars`.
 *
 * @param {string} a - the matched entity text passed in by String.prototype.replace
 * @returns {string} whatever `textParser.decode` produced for that text
 */
function entityReplacer(a) {
    var cached = cachedChars[a];
    if (cached !== undefined) {
        return cached;
    }
    cached = textParser.decode(a);
    cachedChars[a] = cached;
    return cached;
}
function XMLReader(){ | ||
// Adds `str.entityReplacer()`, which returns the string with every
// `&name;` / `&#ref;` sequence passed through the file-level entityReplacer().
// Strings without "&" are returned untouched, skipping the regex scan.
// Installed with defineProperty (non-enumerable) so the helper does not show
// up in `for...in` loops over strings, as a plain assignment would make it do.
Object.defineProperty(String.prototype, 'entityReplacer', {
    value: function () {
        const str = this.toString();
        if (str.indexOf("&") !== -1)
            return str.replace(/&#?\w+;/g, entityReplacer);
        return str;
    },
    writable: true,
    configurable: true,
    enumerable: false
});
/**
 * SAX-style reader. The constructor does nothing; `parse` reads
 * `this.domBuilder` and `this.errorHandler`, which the caller must set first.
 */
function XMLReader() {
}
XMLReader.prototype = {
    /**
     * Parse `source`, reporting events to `this.domBuilder`.
     *
     * @param {string} source - markup text handed to the file-level parse()
     * @param {Object} defaultNSMap - namespace map; copied, never mutated here
     * @param {Object} entityMap - entity table forwarded to parse()
     */
    parse: function (source, defaultNSMap, entityMap) {
        var builder = this.domBuilder;
        // Work on a private copy so the caller's namespace map stays untouched.
        var nsMapCopy = {};
        builder.startDocument();
        _copy(defaultNSMap, nsMapCopy);
        parse(source, nsMapCopy, entityMap, builder, this.errorHandler);
        builder.endDocument();
    }
}
function parse(source,defaultNSMapCopy,entityMap,domBuilder,errorHandler){ | ||
function fixedFromCharCode(code) { | ||
// String.prototype.fromCharCode does not supports | ||
// > 2 bytes unicode chars directly | ||
if (code > 0xffff) { | ||
code -= 0x10000; | ||
var surrogate1 = 0xd800 + (code >> 10) | ||
, surrogate2 = 0xdc00 + (code & 0x3ff); | ||
function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) { | ||
return String.fromCharCode(surrogate1, surrogate2); | ||
} else { | ||
return String.fromCharCode(code); | ||
} | ||
} | ||
function entityReplacer(a){ | ||
return textParser.decode(a); | ||
/* var k = a.slice(1,-1); | ||
console.log(a,"decode", textParser.decode(k)); | ||
if(k in entityMap){ | ||
return entityMap[k]; | ||
}else if(k.charAt(0) === '#'){ | ||
return fixedFromCharCode(parseInt(k.substr(1).replace('x','0x'))) | ||
}else{ | ||
errorHandler.error('entity not found:'+a); | ||
return a; | ||
}*/ | ||
} | ||
function appendText(end){//has some bugs | ||
if(end>start){ | ||
var xt = source.substring(start,end).replace(/&#?\w+;/g,entityReplacer); | ||
locator&&position(start); | ||
domBuilder.characters(xt,0,end-start); | ||
function appendText(end) {//has some bugs | ||
if (end > start) { | ||
var xt = source.substring(start, end).entityReplacer(); | ||
locator && position(start); | ||
domBuilder.characters(xt, 0, end - start); | ||
start = end | ||
} | ||
} | ||
function position(p,m){ | ||
while(p>=lineEnd && (m = linePattern.exec(source))){ | ||
function position(p, m) { | ||
while (p >= lineEnd && (m = linePattern.exec(source))) { | ||
lineStart = m.index; | ||
@@ -80,3 +104,3 @@ lineEnd = lineStart + m[0].length; | ||
} | ||
locator.columnNumber = p-lineStart+1; | ||
locator.columnNumber = p - lineStart + 1; | ||
} | ||
@@ -87,114 +111,114 @@ var lineStart = 0; | ||
var locator = domBuilder.locator; | ||
var parseStack = [{currentNSMap:defaultNSMapCopy}] | ||
var parseStack = [{ currentNSMap: defaultNSMapCopy }] | ||
var closeMap = {}; | ||
var start = 0; | ||
while(true){ | ||
try{ | ||
var tagStart = source.indexOf('<',start); | ||
if(tagStart<0){ | ||
if(!source.substr(start).match(/^\s*$/)){ | ||
while (true) { | ||
try { | ||
var tagStart = source.indexOf('<', start); | ||
if (tagStart < 0) { | ||
if (!source.substr(start).match(/^\s*$/)) { | ||
var doc = domBuilder.doc; | ||
var text = doc.createTextNode(source.substr(start)); | ||
var text = doc.createTextNode(source.substr(start)); | ||
text.tagName = tagName; | ||
doc.appendChild(text); | ||
domBuilder.currentElement = text; | ||
doc.appendChild(text); | ||
domBuilder.currentElement = text; | ||
} | ||
return; | ||
} | ||
if(tagStart>start){ | ||
if (tagStart > start) { | ||
appendText(tagStart); | ||
} | ||
switch(source.charAt(tagStart+1)){ | ||
case '/': | ||
var end = source.indexOf('>',tagStart+3); | ||
var tagName = source.substring(tagStart+2,end); | ||
var config = parseStack.pop(); | ||
if(end<0){ | ||
tagName = source.substring(tagStart+2).replace(/[\s<].*/,''); | ||
//console.error('#@@@@@@'+tagName) | ||
errorHandler.error("end tag name: "+tagName+' is not complete:'+config.tagName); | ||
end = tagStart+1+tagName.length; | ||
}else if(tagName.match(/\s</)){ | ||
tagName = tagName.replace(/[\s<].*/,''); | ||
errorHandler.error("end tag name: "+tagName+' maybe not complete'); | ||
end = tagStart+1+tagName.length; | ||
} | ||
//console.error(parseStack.length,parseStack) | ||
//console.error(config); | ||
var localNSMap = config.localNSMap; | ||
var endMatch = config.tagName == tagName; | ||
var endIgnoreCaseMach = endMatch || config.tagName&&config.tagName.toLowerCase() == tagName.toLowerCase() | ||
if(endIgnoreCaseMach){ | ||
domBuilder.endElement(config.uri,config.localName,tagName); | ||
if(localNSMap){ | ||
for(var prefix in localNSMap){ | ||
domBuilder.endPrefixMapping(prefix) ; | ||
switch (source.charAt(tagStart + 1)) { | ||
case '/': | ||
var end = source.indexOf('>', tagStart + 3); | ||
var tagName = source.substring(tagStart + 2, end); | ||
var config = parseStack.pop(); | ||
if (end < 0) { | ||
tagName = source.substring(tagStart + 2).replace(/[\s<].*/, ''); | ||
//console.error('#@@@@@@'+tagName) | ||
errorHandler.error("end tag name: " + tagName + ' is not complete:' + config.tagName); | ||
end = tagStart + 1 + tagName.length; | ||
} else if (tagName.match(/\s</)) { | ||
tagName = tagName.replace(/[\s<].*/, ''); | ||
errorHandler.error("end tag name: " + tagName + ' maybe not complete'); | ||
end = tagStart + 1 + tagName.length; | ||
} | ||
//console.error(parseStack.length,parseStack) | ||
//console.error(config); | ||
var localNSMap = config.localNSMap; | ||
var endMatch = config.tagName == tagName; | ||
var endIgnoreCaseMach = endMatch || config.tagName && config.tagName.toLowerCase() == tagName.toLowerCase() | ||
if (endIgnoreCaseMach) { | ||
domBuilder.endElement(config.uri, config.localName, tagName); | ||
if (localNSMap) { | ||
for (var prefix in localNSMap) { | ||
domBuilder.endPrefixMapping(prefix); | ||
} | ||
} | ||
if (!endMatch) { | ||
errorHandler.fatalError("end tag name: " + tagName + ' is not match the current start tagName:' + config.tagName); | ||
} | ||
} else { | ||
parseStack.push(config) | ||
} | ||
if(!endMatch){ | ||
errorHandler.fatalError("end tag name: "+tagName+' is not match the current start tagName:'+config.tagName ); | ||
} | ||
}else{ | ||
parseStack.push(config) | ||
} | ||
end++; | ||
break; | ||
end++; | ||
break; | ||
// end elment | ||
case '?':// <?...?> | ||
locator&&position(tagStart); | ||
end = parseInstruction(source,tagStart,domBuilder); | ||
break; | ||
case '!':// <!doctype,<![CDATA,<!-- | ||
locator&&position(tagStart); | ||
end = parseDCC(source,tagStart,domBuilder,errorHandler); | ||
break; | ||
default: | ||
locator&&position(tagStart); | ||
var el = new ElementAttributes(); | ||
var currentNSMap = parseStack[parseStack.length-1].currentNSMap; | ||
//elStartEnd | ||
var end = parseElementStartPart(source,tagStart,el,currentNSMap,entityReplacer,errorHandler); | ||
var len = el.length; | ||
if(!el.closed && fixSelfClosed(source,end,el.tagName,closeMap)){ | ||
el.closed = true; | ||
if(!entityMap.nbsp){ | ||
errorHandler.warning('unclosed xml attribute'); | ||
case '?':// <?...?> | ||
locator && position(tagStart); | ||
end = parseInstruction(source, tagStart, domBuilder); | ||
break; | ||
case '!':// <!doctype,<![CDATA,<!-- | ||
locator && position(tagStart); | ||
end = parseDCC(source, tagStart, domBuilder, errorHandler); | ||
break; | ||
default: | ||
locator && position(tagStart); | ||
var el = new ElementAttributes(); | ||
var currentNSMap = parseStack[parseStack.length - 1].currentNSMap; | ||
//elStartEnd | ||
var end = parseElementStartPart(source, tagStart, el, currentNSMap, errorHandler); | ||
var len = el.length; | ||
if (!el.closed && fixSelfClosed(source, end, el.tagName, closeMap)) { | ||
el.closed = true; | ||
if (!entityMap.nbsp) { | ||
errorHandler.warning('unclosed xml attribute'); | ||
} | ||
} | ||
} | ||
if(locator && len){ | ||
var locator2 = copyLocator(locator,{}); | ||
//try{//attribute position fixed | ||
for(var i = 0;i<len;i++){ | ||
var a = el[i]; | ||
position(a.offset); | ||
a.locator = copyLocator(locator,{}); | ||
if (locator && len) { | ||
var locator2 = copyLocator(locator, {}); | ||
//try{//attribute position fixed | ||
for (var i = 0; i < len; i++) { | ||
var a = el[i]; | ||
position(a.offset); | ||
a.locator = copyLocator(locator, {}); | ||
} | ||
//}catch(e){console.error('@@@@@'+e)} | ||
domBuilder.locator = locator2 | ||
if (appendElement(el, domBuilder, currentNSMap)) { | ||
parseStack.push(el) | ||
} | ||
domBuilder.locator = locator; | ||
} else { | ||
if (appendElement(el, domBuilder, currentNSMap)) { | ||
parseStack.push(el) | ||
} | ||
} | ||
//}catch(e){console.error('@@@@@'+e)} | ||
domBuilder.locator = locator2 | ||
if(appendElement(el,domBuilder,currentNSMap)){ | ||
parseStack.push(el) | ||
if (el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed) { | ||
end = parseHtmlSpecialContent(source, end, el.tagName, domBuilder) | ||
} else { | ||
end++; | ||
} | ||
domBuilder.locator = locator; | ||
}else{ | ||
if(appendElement(el,domBuilder,currentNSMap)){ | ||
parseStack.push(el) | ||
} | ||
} | ||
if(el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed){ | ||
end = parseHtmlSpecialContent(source,end,el.tagName,entityReplacer,domBuilder) | ||
}else{ | ||
end++; | ||
} | ||
} | ||
}catch(e){ | ||
errorHandler.error('element parse error: '+e) | ||
} catch (e) { | ||
errorHandler.error('element parse error: ' + e) | ||
//errorHandler.error('element parse error: '+e); | ||
@@ -204,11 +228,11 @@ end = -1; | ||
} | ||
if(end>start){ | ||
if (end > start) { | ||
start = end; | ||
}else{ | ||
} else { | ||
//TODO: 这里有可能sax回退,有位置错误风险 | ||
appendText(Math.max(tagStart,start)+1); | ||
appendText(Math.max(tagStart, start) + 1); | ||
} | ||
} | ||
} | ||
function copyLocator(f,t){ | ||
function copyLocator(f, t) { | ||
t.lineNumber = f.lineNumber; | ||
@@ -223,3 +247,3 @@ t.columnNumber = f.columnNumber; | ||
*/ | ||
function parseElementStartPart(source,start,el,currentNSMap,entityReplacer,errorHandler){ | ||
function parseElementStartPart(source, start, el, currentNSMap, errorHandler) { | ||
var attrName; | ||
@@ -229,164 +253,165 @@ var value; | ||
var s = S_TAG;//status | ||
while(true){ | ||
while (true) { | ||
var c = source.charAt(p); | ||
switch(c){ | ||
case '=': | ||
if(s === S_ATTR){//attrName | ||
attrName = source.slice(start,p); | ||
s = S_EQ; | ||
}else if(s === S_ATTR_SPACE){ | ||
s = S_EQ; | ||
}else{ | ||
//fatalError: equal must after attrName or space after attrName | ||
throw new Error('attribute equal must after attrName'); | ||
} | ||
break; | ||
case '\'': | ||
case '"': | ||
if(s === S_EQ || s === S_ATTR //|| s == S_ATTR_SPACE | ||
){//equal | ||
if(s === S_ATTR){ | ||
errorHandler.warning('attribute value must after "="') | ||
attrName = source.slice(start,p) | ||
switch (c) { | ||
case '=': | ||
if (s === S_ATTR) {//attrName | ||
attrName = source.slice(start, p); | ||
s = S_EQ; | ||
} else if (s === S_ATTR_SPACE) { | ||
s = S_EQ; | ||
} else { | ||
//fatalError: equal must after attrName or space after attrName | ||
throw new Error('attribute equal must after attrName'); | ||
} | ||
start = p+1; | ||
p = source.indexOf(c,start) | ||
if(p>0){ | ||
value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer); | ||
el.add(attrName,value,start-1); | ||
s = S_ATTR_END; | ||
}else{ | ||
//fatalError: no end quot match | ||
throw new Error('attribute value no end \''+c+'\' match'); | ||
break; | ||
case '\'': | ||
case '"': | ||
if (s === S_EQ || s === S_ATTR //|| s == S_ATTR_SPACE | ||
) {//equal | ||
if (s === S_ATTR) { | ||
errorHandler.warning('attribute value must after "="') | ||
attrName = source.slice(start, p) | ||
} | ||
start = p + 1; | ||
p = source.indexOf(c, start) | ||
if (p > 0) { | ||
value = source.slice(start, p).entityReplacer(); | ||
el.add(attrName, value, start - 1); | ||
s = S_ATTR_END; | ||
} else { | ||
//fatalError: no end quot match | ||
throw new Error('attribute value no end \'' + c + '\' match'); | ||
} | ||
} else if (s == S_ATTR_NOQUOT_VALUE) { | ||
value = source.slice(start, p).entityReplacer(); | ||
//console.log(attrName,value,start,p) | ||
el.add(attrName, value, start); | ||
//console.dir(el) | ||
errorHandler.warning('attribute "' + attrName + '" missed start quot(' + c + ')!!'); | ||
start = p + 1; | ||
s = S_ATTR_END | ||
} else { | ||
//fatalError: no equal before | ||
throw new Error('attribute value must after "="'); | ||
} | ||
}else if(s == S_ATTR_NOQUOT_VALUE){ | ||
value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer); | ||
//console.log(attrName,value,start,p) | ||
el.add(attrName,value,start); | ||
//console.dir(el) | ||
errorHandler.warning('attribute "'+attrName+'" missed start quot('+c+')!!'); | ||
start = p+1; | ||
s = S_ATTR_END | ||
}else{ | ||
//fatalError: no equal before | ||
throw new Error('attribute value must after "="'); | ||
} | ||
break; | ||
case '/': | ||
switch(s){ | ||
case S_TAG: | ||
el.setTagName(source.slice(start,p)); | ||
case S_ATTR_END: | ||
case S_TAG_SPACE: | ||
case S_TAG_CLOSE: | ||
s =S_TAG_CLOSE; | ||
el.closed = true; | ||
case S_ATTR_NOQUOT_VALUE: | ||
case S_ATTR: | ||
case S_ATTR_SPACE: | ||
break; | ||
//case S_EQ: | ||
default: | ||
throw new Error("attribute invalid close char('/')") | ||
} | ||
break; | ||
case ''://end document | ||
//throw new Error('unexpected end of input') | ||
errorHandler.error('unexpected end of input'); | ||
if(s == S_TAG){ | ||
el.setTagName(source.slice(start,p)); | ||
} | ||
return p; | ||
case '>': | ||
switch(s){ | ||
case S_TAG: | ||
el.setTagName(source.slice(start,p)); | ||
case S_ATTR_END: | ||
case S_TAG_SPACE: | ||
case S_TAG_CLOSE: | ||
break;//normal | ||
case S_ATTR_NOQUOT_VALUE://Compatible state | ||
case S_ATTR: | ||
value = source.slice(start,p); | ||
if(value.slice(-1) === '/'){ | ||
el.closed = true; | ||
value = value.slice(0,-1) | ||
case '/': | ||
switch (s) { | ||
case S_TAG: | ||
el.setTagName(source.slice(start, p)); | ||
case S_ATTR_END: | ||
case S_TAG_SPACE: | ||
case S_TAG_CLOSE: | ||
s = S_TAG_CLOSE; | ||
el.closed = true; | ||
case S_ATTR_NOQUOT_VALUE: | ||
case S_ATTR: | ||
case S_ATTR_SPACE: | ||
break; | ||
//case S_EQ: | ||
default: | ||
throw new Error("attribute invalid close char('/')") | ||
} | ||
case S_ATTR_SPACE: | ||
if(s === S_ATTR_SPACE){ | ||
value = attrName; | ||
break; | ||
case ''://end document | ||
//throw new Error('unexpected end of input') | ||
errorHandler.error('unexpected end of input'); | ||
if (s == S_TAG) { | ||
el.setTagName(source.slice(start, p)); | ||
} | ||
if(s == S_ATTR_NOQUOT_VALUE){ | ||
errorHandler.warning('attribute "'+value+'" missed quot(")!!'); | ||
el.add(attrName,value.replace(/&#?\w+;/g,entityReplacer),start) | ||
}else{ | ||
if(currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !value.match(/^(?:disabled|checked|selected)$/i)){ | ||
errorHandler.warning('attribute "'+value+'" missed value!! "'+value+'" instead!!') | ||
} | ||
el.add(value,value,start) | ||
return p; | ||
case '>': | ||
switch (s) { | ||
case S_TAG: | ||
el.setTagName(source.slice(start, p)); | ||
case S_ATTR_END: | ||
case S_TAG_SPACE: | ||
case S_TAG_CLOSE: | ||
break;//normal | ||
case S_ATTR_NOQUOT_VALUE://Compatible state | ||
case S_ATTR: | ||
value = source.slice(start, p); | ||
if (value.slice(-1) === '/') { | ||
el.closed = true; | ||
value = value.slice(0, -1) | ||
} | ||
case S_ATTR_SPACE: | ||
if (s === S_ATTR_SPACE) { | ||
value = attrName; | ||
} | ||
if (s == S_ATTR_NOQUOT_VALUE) { | ||
errorHandler.warning('attribute "' + value + '" missed quot(")!!'); | ||
el.add(attrName, value.entityReplacer(), start) | ||
} else { | ||
if (currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !value.match(/^(?:disabled|checked|selected)$/i)) { | ||
errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!') | ||
} | ||
el.add(value, value, start) | ||
} | ||
break; | ||
case S_EQ: | ||
throw new Error('attribute value missed!!'); | ||
} | ||
break; | ||
case S_EQ: | ||
throw new Error('attribute value missed!!'); | ||
} | ||
// console.log(tagName,tagNamePattern,tagNamePattern.test(tagName)) | ||
return p; | ||
/*xml space '\x20' | #x9 | #xD | #xA; */ | ||
case '\u0080': | ||
c = ' '; | ||
default: | ||
if(c<= ' '){//space | ||
switch(s){ | ||
case S_TAG: | ||
el.setTagName(source.slice(start,p));//tagName | ||
s = S_TAG_SPACE; | ||
break; | ||
case S_ATTR: | ||
attrName = source.slice(start,p) | ||
s = S_ATTR_SPACE; | ||
break; | ||
case S_ATTR_NOQUOT_VALUE: | ||
var value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer); | ||
errorHandler.warning('attribute "'+value+'" missed quot(")!!'); | ||
el.add(attrName,value,start) | ||
case S_ATTR_END: | ||
s = S_TAG_SPACE; | ||
break; | ||
//case S_TAG_SPACE: | ||
//case S_EQ: | ||
//case S_ATTR_SPACE: | ||
// void();break; | ||
//case S_TAG_CLOSE: | ||
//ignore warning | ||
} | ||
}else{//not space | ||
//S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE | ||
//S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE | ||
switch(s){ | ||
//case S_TAG:void();break; | ||
//case S_ATTR:void();break; | ||
//case S_ATTR_NOQUOT_VALUE:void();break; | ||
case S_ATTR_SPACE: | ||
var tagName = el.tagName; | ||
if(currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !attrName.match(/^(?:disabled|checked|selected)$/i)){ | ||
errorHandler.warning('attribute "'+attrName+'" missed value!! "'+attrName+'" instead2!!') | ||
// console.log(tagName,tagNamePattern,tagNamePattern.test(tagName)) | ||
return p; | ||
/*xml space '\x20' | #x9 | #xD | #xA; */ | ||
case '\u0080': | ||
c = ' '; | ||
default: | ||
if (c <= ' ') {//space | ||
switch (s) { | ||
case S_TAG: | ||
el.setTagName(source.slice(start, p));//tagName | ||
s = S_TAG_SPACE; | ||
break; | ||
case S_ATTR: | ||
attrName = source.slice(start, p) | ||
s = S_ATTR_SPACE; | ||
break; | ||
case S_ATTR_NOQUOT_VALUE: | ||
var value = source.slice(start, p).entityReplacer() | ||
errorHandler.warning('attribute "' + value + '" missed quot(")!!'); | ||
el.add(attrName, value, start) | ||
case S_ATTR_END: | ||
s = S_TAG_SPACE; | ||
break; | ||
//case S_TAG_SPACE: | ||
//case S_EQ: | ||
//case S_ATTR_SPACE: | ||
// void();break; | ||
//case S_TAG_CLOSE: | ||
//ignore warning | ||
} | ||
el.add(attrName,attrName,start); | ||
start = p; | ||
s = S_ATTR; | ||
break; | ||
case S_ATTR_END: | ||
errorHandler.warning('attribute space is required"'+attrName+'"!!') | ||
case S_TAG_SPACE: | ||
s = S_ATTR; | ||
start = p; | ||
break; | ||
case S_EQ: | ||
s = S_ATTR_NOQUOT_VALUE; | ||
start = p; | ||
break; | ||
case S_TAG_CLOSE: | ||
throw new Error("elements closed character '/' and '>' must be connected to"); | ||
} else {//not space | ||
//S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE | ||
//S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE | ||
switch (s) { | ||
//case S_TAG:void();break; | ||
//case S_ATTR:void();break; | ||
//case S_ATTR_NOQUOT_VALUE:void();break; | ||
case S_ATTR_SPACE: | ||
var tagName = el.tagName; | ||
if (currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !attrName.match(/^(?:disabled|checked|selected)$/i)) { | ||
errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead2!!') | ||
} | ||
el.add(attrName, attrName, start); | ||
start = p; | ||
s = S_ATTR; | ||
break; | ||
case S_ATTR_END: | ||
errorHandler.warning('attribute space is required"' + attrName + '"!!') | ||
case S_TAG_SPACE: | ||
s = S_ATTR; | ||
start = p; | ||
break; | ||
case S_EQ: | ||
s = S_ATTR_NOQUOT_VALUE; | ||
start = p; | ||
break; | ||
case S_TAG_CLOSE: | ||
throw new Error("elements closed character '/' and '>' must be connected to"); | ||
} | ||
} | ||
} | ||
}//end outer switch | ||
@@ -400,3 +425,3 @@ //console.log('p++',p) | ||
*/ | ||
function appendElement(el,domBuilder,currentNSMap){ | ||
function appendElement(el, domBuilder, currentNSMap) { | ||
var tagName = el.tagName; | ||
@@ -407,3 +432,3 @@ var localNSMap = null; | ||
var i = el.length; | ||
while(i--){ | ||
while (i--) { | ||
var a = el[i]; | ||
@@ -413,7 +438,7 @@ var qName = a.qName; | ||
var nsp = qName.indexOf(':'); | ||
if(nsp>0){ | ||
var prefix = a.prefix = qName.slice(0,nsp); | ||
var localName = qName.slice(nsp+1); | ||
if (nsp > 0) { | ||
var prefix = a.prefix = qName.slice(0, nsp); | ||
var localName = qName.slice(nsp + 1); | ||
var nsPrefix = prefix === 'xmlns' && localName | ||
}else{ | ||
} else { | ||
localName = qName; | ||
@@ -424,9 +449,9 @@ prefix = null | ||
//can not set prefix,because prefix !== '' | ||
a.localName = localName ; | ||
a.localName = localName; | ||
//prefix == null for no ns prefix attribute | ||
if(nsPrefix !== false){//hack!! | ||
if(localNSMap == null){ | ||
if (nsPrefix !== false) {//hack!! | ||
if (localNSMap == null) { | ||
localNSMap = {} | ||
//console.log(currentNSMap,0) | ||
_copy(currentNSMap,currentNSMap={}) | ||
_copy(currentNSMap, currentNSMap = {}) | ||
//console.log(currentNSMap,1) | ||
@@ -436,15 +461,15 @@ } | ||
a.uri = 'http://www.w3.org/2000/xmlns/' | ||
domBuilder.startPrefixMapping(nsPrefix, value) | ||
domBuilder.startPrefixMapping(nsPrefix, value) | ||
} | ||
} | ||
var i = el.length; | ||
while(i--){ | ||
while (i--) { | ||
a = el[i]; | ||
var prefix = a.prefix; | ||
if(prefix){//no prefix attribute has no namespace | ||
if(prefix === 'xml'){ | ||
if (prefix) {//no prefix attribute has no namespace | ||
if (prefix === 'xml') { | ||
a.uri = 'http://www.w3.org/XML/1998/namespace'; | ||
}if(prefix !== 'xmlns'){ | ||
} if (prefix !== 'xmlns') { | ||
a.uri = currentNSMap[prefix || ''] | ||
//{console.log('###'+a.qName,domBuilder.locator.systemId+'',currentNSMap,a.uri)} | ||
@@ -455,6 +480,6 @@ } | ||
var nsp = tagName.indexOf(':'); | ||
if(nsp>0){ | ||
prefix = el.prefix = tagName.slice(0,nsp); | ||
localName = el.localName = tagName.slice(nsp+1); | ||
}else{ | ||
if (nsp > 0) { | ||
prefix = el.prefix = tagName.slice(0, nsp); | ||
localName = el.localName = tagName.slice(nsp + 1); | ||
} else { | ||
prefix = null;//important!! | ||
@@ -466,13 +491,13 @@ localName = el.localName = tagName; | ||
domBuilder.startElement(ns,localName,tagName ,el); | ||
domBuilder.startElement(ns, localName, tagName, el); | ||
//endPrefixMapping and startPrefixMapping have not any help for dom builder | ||
//localNSMap = null | ||
if(el.closed){ | ||
domBuilder.endElement(ns,localName,tagName); | ||
if(localNSMap){ | ||
for(prefix in localNSMap){ | ||
domBuilder.endPrefixMapping(prefix) | ||
if (el.closed) { | ||
domBuilder.endElement(ns, localName, tagName); | ||
if (localNSMap) { | ||
for (prefix in localNSMap) { | ||
domBuilder.endPrefixMapping(prefix) | ||
} | ||
} | ||
}else{ | ||
} else { | ||
el.currentNSMap = currentNSMap; | ||
@@ -484,82 +509,82 @@ el.localNSMap = localNSMap; | ||
} | ||
/**
 * Handle elements whose content is taken verbatim up to the closing tag
 * (script and textarea) instead of being parsed as markup.
 *
 * @param {string} source - full document text
 * @param {number} elStartEnd - index of the '>' that ends the start tag
 * @param {string} tagName - tag name as written in the source
 * @param {Object} domBuilder - receives the content via characters(text, 0, length)
 * @returns {number} index of the closing '</tagName>' when the content was
 *   emitted here, otherwise elStartEnd + 1 so the caller continues parsing
 *   right after the start tag
 */
function parseHtmlSpecialContent(source, elStartEnd, tagName, domBuilder) {
    // Own-property check: a bare `attCheck[tagName]` lookup would resolve names
    // such as "constructor" to inherited Object.prototype members.
    if (!Object.prototype.hasOwnProperty.call(attCheck, tagName)) {
        attCheck[tagName] = /^(?:script|textarea)$/i.test(tagName);
    }
    if (attCheck[tagName] !== true) {
        return elStartEnd + 1;
    }
    var elEndStart = source.indexOf('</' + tagName + '>', elStartEnd);
    if (elEndStart < 0) {
        // No literal closing tag: substring(elStartEnd + 1, -1) would hand back
        // the document prefix, so fall back to normal parsing instead.
        return elStartEnd + 1;
    }
    var text = source.substring(elStartEnd + 1, elEndStart);
    if (!/[&<]/.test(text)) {
        // Nothing that needs special treatment; let the main loop read it.
        return elStartEnd + 1;
    }
    if (!/^script$/i.test(tagName)) {
        // textarea: entity references are decoded; script content stays raw.
        text = text.entityReplacer();
    }
    domBuilder.characters(text, 0, text.length);
    return elEndStart;
}
/**
 * Decide whether an element that was not explicitly closed should be treated
 * as self-closed because no closing tag for it occurs after its start tag.
 *
 * @param {string} source - full document text
 * @param {number} elStartEnd - index where the element's start tag ends
 * @param {string} tagName - tag name as written in the source
 * @param {Object} closeMap - per-parse cache: tagName -> position of the last
 *   closing tag found for it (-1 when none)
 * @returns {boolean} true when the last closing tag lies before elStartEnd
 */
function fixSelfClosed(source, elStartEnd, tagName, closeMap) {
    // Read only own entries: for names like "toString" a bare closeMap[tagName]
    // would return an inherited function and the comparison below would
    // always be false.
    var pos = Object.prototype.hasOwnProperty.call(closeMap, tagName) ? closeMap[tagName] : null;
    if (pos == null) {
        pos = source.lastIndexOf('</' + tagName + '>');
        if (pos < elStartEnd) {
            // Author forgot to finish the closing tag: accept "</tagName" without '>'.
            pos = source.lastIndexOf('</' + tagName);
        }
        closeMap[tagName] = pos;
    }
    return pos < elStartEnd;
}
/**
 * Shallow-copy every enumerable property reachable by `for...in` from
 * `source` onto `target`. Nothing is returned; `target` is modified in place.
 *
 * @param {Object} source - object to read from
 * @param {Object} target - object to write to
 */
function _copy(source, target) {
    var key;
    for (key in source) {
        target[key] = source[key];
    }
}
function parseDCC(source,start,domBuilder,errorHandler){//sure start with '<!' | ||
var next= source.charAt(start+2) | ||
switch(next){ | ||
case '-': | ||
if(source.charAt(start + 3) === '-'){ | ||
var end = source.indexOf('-->',start+4); | ||
//append comment source.substring(4,end)//<!-- | ||
if(end>start){ | ||
domBuilder.comment(source,start+4,end-start-4); | ||
return end+3; | ||
}else{ | ||
errorHandler.error("Unclosed comment"); | ||
function parseDCC(source, start, domBuilder, errorHandler) {//sure start with '<!' | ||
var next = source.charAt(start + 2) | ||
switch (next) { | ||
case '-': | ||
if (source.charAt(start + 3) === '-') { | ||
var end = source.indexOf('-->', start + 4); | ||
//append comment source.substring(4,end)//<!-- | ||
if (end > start) { | ||
domBuilder.comment(source, start + 4, end - start - 4); | ||
return end + 3; | ||
} else { | ||
errorHandler.error("Unclosed comment"); | ||
return -1; | ||
} | ||
} else { | ||
//error | ||
return -1; | ||
} | ||
}else{ | ||
//error | ||
return -1; | ||
} | ||
default: | ||
if(source.substr(start+3,6) == 'CDATA['){ | ||
var end = source.indexOf(']]>',start+9); | ||
domBuilder.startCDATA(); | ||
domBuilder.characters(source,start+9,end-start-9); | ||
domBuilder.endCDATA() | ||
return end+3; | ||
} | ||
//<!DOCTYPE | ||
//startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId) | ||
var matchs = split(source,start); | ||
var len = matchs.length; | ||
if(len>1 && /!doctype/i.test(matchs[0][0])){ | ||
var name = matchs[1][0]; | ||
var pubid = len>3 && /^public$/i.test(matchs[2][0]) && matchs[3][0] | ||
var sysid = len>4 && matchs[4][0]; | ||
var lastMatch = matchs[len-1] | ||
domBuilder.startDTD(name,pubid && pubid.replace(/^(['"])(.*?)\1$/,'$2'), | ||
sysid && sysid.replace(/^(['"])(.*?)\1$/,'$2')); | ||
domBuilder.endDTD(); | ||
return lastMatch.index+lastMatch[0].length | ||
} | ||
default: | ||
if (source.substr(start + 3, 6) == 'CDATA[') { | ||
var end = source.indexOf(']]>', start + 9); | ||
domBuilder.startCDATA(); | ||
domBuilder.characters(source, start + 9, end - start - 9); | ||
domBuilder.endCDATA() | ||
return end + 3; | ||
} | ||
//<!DOCTYPE | ||
//startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId) | ||
var matchs = split(source, start); | ||
var len = matchs.length; | ||
if (len > 1 && /!doctype/i.test(matchs[0][0])) { | ||
var name = matchs[1][0]; | ||
var pubid = len > 3 && /^public$/i.test(matchs[2][0]) && matchs[3][0] | ||
var sysid = len > 4 && matchs[4][0]; | ||
var lastMatch = matchs[len - 1] | ||
domBuilder.startDTD(name, pubid && pubid.replace(/^(['"])(.*?)\1$/, '$2'),sysid && sysid.replace(/^(['"])(.*?)\1$/, '$2')); | ||
domBuilder.endDTD(); | ||
return lastMatch.index + lastMatch[0].length | ||
} | ||
} | ||
@@ -571,11 +596,11 @@ return -1; | ||
function parseInstruction(source,start,domBuilder){ | ||
var end = source.indexOf('?>',start); | ||
if(end){ | ||
var match = source.substring(start,end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/); | ||
if(match){ | ||
function parseInstruction(source, start, domBuilder) { | ||
var end = source.indexOf('?>', start); | ||
if (end) { | ||
var match = source.substring(start, end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/); | ||
if (match) { | ||
var len = match[0].length; | ||
domBuilder.processingInstruction(match[1], match[2]) ; | ||
return end+2; | ||
}else{//error | ||
domBuilder.processingInstruction(match[1], match[2]); | ||
return end + 2; | ||
} else {//error | ||
return -1; | ||
@@ -590,34 +615,34 @@ } | ||
*/ | ||
function ElementAttributes(source){ | ||
function ElementAttributes(source) { | ||
} | ||
ElementAttributes.prototype = { | ||
setTagName:function(tagName){ | ||
if(!tagNamePattern.test(tagName)){ | ||
throw new Error('invalid tagName:'+tagName) | ||
setTagName: function (tagName) { | ||
if (!tagNamePattern.test(tagName)) { | ||
throw new Error('invalid tagName:' + tagName) | ||
} | ||
this.tagName = tagName | ||
}, | ||
add:function(qName,value,offset){ | ||
if(!tagNamePattern.test(qName)){ | ||
throw new Error('invalid attribute:'+qName) | ||
add: function (qName, value, offset) { | ||
if (!tagNamePattern.test(qName)) { | ||
throw new Error('invalid attribute:' + qName) | ||
} | ||
this[this.length++] = {qName:qName,value:value,offset:offset} | ||
this[this.length++] = { qName: qName, value: value, offset: offset } | ||
}, | ||
length:0, | ||
getLocalName:function(i){return this[i].localName}, | ||
getLocator:function(i){return this[i].locator}, | ||
getQName:function(i){return this[i].qName}, | ||
getURI:function(i){return this[i].uri}, | ||
getValue:function(i){return this[i].value} | ||
// ,getIndex:function(uri, localName)){ | ||
// if(localName){ | ||
// | ||
// }else{ | ||
// var qName = uri | ||
// } | ||
// }, | ||
// getValue:function(){return this.getValue(this.getIndex.apply(this,arguments))}, | ||
// getType:function(uri,localName){} | ||
// getType:function(i){}, | ||
length: 0, | ||
getLocalName: function (i) { return this[i].localName }, | ||
getLocator: function (i) { return this[i].locator }, | ||
getQName: function (i) { return this[i].qName }, | ||
getURI: function (i) { return this[i].uri }, | ||
getValue: function (i) { return this[i].value } | ||
// ,getIndex:function(uri, localName)){ | ||
// if(localName){ | ||
// | ||
// }else{ | ||
// var qName = uri | ||
// } | ||
// }, | ||
// getValue:function(){return this.getValue(this.getIndex.apply(this,arguments))}, | ||
// getType:function(uri,localName){} | ||
// getType:function(i){}, | ||
} | ||
@@ -627,3 +652,3 @@ | ||
function split(source,start){ | ||
function split(source, start) { | ||
var match; | ||
@@ -634,5 +659,5 @@ var buf = []; | ||
reg.exec(source);//skip < | ||
while(match = reg.exec(source)){ | ||
while (match = reg.exec(source)) { | ||
buf.push(match); | ||
if(match[1])return buf; | ||
if (match[1]) return buf; | ||
} | ||
@@ -639,0 +664,0 @@ } |
{ | ||
"name": "advanced-html-parser", | ||
"version": "1.0.8", | ||
"version": "1.0.9", | ||
"description": "Can use html parser in react-native, titanium, and anywhere. This is based on [xmldom](https://github.com/jindw/xmldom).", | ||
@@ -5,0 +5,0 @@ "main": "index.js", |
# advanced-html-parser | ||
Can use html parser in react-native, titanium, and anywhere. This is based on [xmldom](https://github.com/jindw/xmldom). | ||
This library is based on [react-native-html-parser](https://github.com/g6ling/react-native-html-parser), but it has many more features and can decode almost all character sets; it also | |
@@ -34,2 +33,3 @@ added support to CSS3 selectors and typescript. | ||
ignoreTags: ["script", "style", "head"], // This will remove all those tags before beginning to parse the string. | |
onlyBody: true | ||
}); | ||
@@ -162,2 +162,8 @@ ``` | ||
/* | ||
Parse only the body. | |
This extracts the <body> element from the HTML string and parses only that element. | |
*/ | ||
onlyBody?: boolean; | ||
/* | ||
@@ -164,0 +170,0 @@ override the errorHandler. |
@@ -21,3 +21,3 @@ const IDOMParser = require("./../") | ||
console.log(doc.documentElement.querySelector(".col-info-desc > .desc > .small strong:first-child span")) | ||
doc.documentElement.querySelector(".col-info-desc > .desc > .small strong:first-child span").text().should.eql("8.3"); | ||
}); | ||
@@ -29,3 +29,2 @@ | ||
const doc = IDOMParser.parse(`<div>test</div>`); | ||
console.log(doc.documentElement.cloneNode(true).outerHTML) | ||
doc.documentElement.cloneNode(true).text().should.eql("test"); | ||
@@ -32,0 +31,0 @@ }); |
@@ -17,2 +17,8 @@ export interface Options { | ||
/* | ||
Parse only the body. | |
This extracts the <body> element from the HTML string and parses only that element. | |
*/ | ||
onlyBody?: boolean; | ||
/* | ||
override the errorHandler. | ||
@@ -19,0 +25,0 @@ |
Sorry, the diff of this file is too big to display
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
428815
7982
174