Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

advanced-html-parser

Package Overview
Dependencies
Maintainers
1
Versions
11
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

advanced-html-parser - npm Package Compare versions

Comparing version 1.0.8 to 1.0.9

35

components/dom-parser.js

@@ -0,2 +1,21 @@

const enMap = {};
if (typeof require == 'function') {
const enMap = require('./entity-map').EntityMap;
}
function DOMParser(options) {
if (String.prototype.substr === undefined)
String.prototype.substr = function (start, length) {
var str = this.toString();
if (length === undefined || length > str.length)
length = str.length;
if (length < 0)
length = 0;
if (start < 0)
start = str.length + start;
if (start < 0)
start = 0;
return str.substring(start, length + start)
}
this.options = options || { locator: {} };

@@ -15,3 +34,11 @@

var entityMap = options.entityMap || {};
if (options.ignoreTags && options.ignoreTags.length>0){
if (options.onlyBody === true){
var expression = `<(body)(.*?)>(.|\n)*?<\/(body)>`;
var body = new RegExp(expression, "gmi").exec(source);
if (body && body.length != undefined && body.length > 0)
source = body[0];
}
if (options.ignoreTags && options.ignoreTags.length > 0) {
var expression = `<(${options.ignoreTags.join("|")})(.*?)>(.|\n)*?<\/(${options.ignoreTags.join("|")})>`;

@@ -22,6 +49,4 @@ console.log("Found ignoreTags executing", expression)

if (typeof require == 'function') {
const enMap = require('./entity-map').EntityMap;
entityMap = Object.assign(enMap, entityMap);
}
entityMap = Object.assign(enMap, entityMap);
if (locator) {

@@ -28,0 +53,0 @@ domBuilder.setDocumentLocator(locator)

869

components/sax.js

@@ -5,4 +5,4 @@ //[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]

var nameStartChar = /[A-Z_a-z\xC0-\xD6\xD8-\xF6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]///\u10000-\uEFFFF
var nameChar = new RegExp("[\\-\\.0-9"+nameStartChar.source.slice(1,-1)+"\\u00B7\\u0300-\\u036F\\u203F-\\u2040]");
var tagNamePattern = new RegExp('^'+nameStartChar.source+nameChar.source+'*(?:\:'+nameStartChar.source+nameChar.source+'*)?$');
var nameChar = new RegExp("[\\-\\.0-9" + nameStartChar.source.slice(1, -1) + "\\u00B7\\u0300-\\u036F\\u203F-\\u2040]");
var tagNamePattern = new RegExp('^' + nameStartChar.source + nameChar.source + '*(?:\:' + nameStartChar.source + nameChar.source + '*)?$');
//var tagNamePattern = /^[a-zA-Z_][\w\-\.]*(?:\:[a-zA-Z_][\w\-\.]*)?$/

@@ -15,3 +15,3 @@ //var handlers = 'resolveEntity,getExternalSubset,characters,endDocument,endElement,endPrefixMapping,ignorableWhitespace,processingInstruction,setDocumentLocator,skippedEntity,startDocument,startElement,startPrefixMapping,notationDecl,unparsedEntityDecl,error,fatalError,warning,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,comment,endCDATA,endDTD,endEntity,startCDATA,startDTD,startEntity'.split(',')

var S_ATTR = 1;//attr name offerring
var S_ATTR_SPACE=2;//attr name end and space offer
var S_ATTR_SPACE = 2;//attr name end and space offer
var S_EQ = 3;//=space?

@@ -23,53 +23,77 @@ var S_ATTR_NOQUOT_VALUE = 4;//attr value(no quot value only)

const textParser = require("he");
const cachedChars = {}
// Memo table: tag name -> whether parseHtmlSpecialContent should treat the
// element's content as raw text (true only for script/textarea).
// parseHtmlSpecialContent fills in missing entries lazily, keyed by whatever
// tag name appears in the input. The table therefore has a null prototype so
// that tag names such as "constructor" or "valueOf" are not mistaken for
// already-cached (truthy) entries inherited from Object.prototype.
const attCheck = Object.assign(Object.create(null), {
    html: false,
    body: false,
    div: false,
    a: false,
    span: false,
    i: false,
    select: false,
    option: false,
    form: false,
    nav: false,
    ul: false,
    li: false,
    dl: false,
    dd: false,
    h5: false,
    table: false,
    tbody: false,
    tr: false,
    td: false,
    b: false,
    h4: false,
    p: false,
    ol: false,
    thead: false,
    th: false,
    em: false,
    button: false,
    del: false,
    h3: false,
    head: false,
    title: false,
    script: true,
    textarea: true,
    style: false
});
/**
 * Decode a single entity token, memoising the result.
 *
 * @param {string} a - the matched entity text passed to textParser.decode.
 * @returns {*} the cached value for `a`; computed with textParser.decode on
 *     first use and stored in cachedChars.
 */
function entityReplacer(a) {
    var decoded = cachedChars[a];
    if (decoded === undefined) {
        // First time this token is seen: decode once and remember it.
        decoded = cachedChars[a] = textParser.decode(a);
    }
    return decoded;
}
function XMLReader(){
/**
 * Adds an `entityReplacer()` method to strings. It returns the string with
 * every `&name;` / `&#name;` token replaced via the module-level
 * entityReplacer function. Strings without "&" are returned unchanged without
 * running the regex.
 *
 * The method is installed with Object.defineProperty as a non-enumerable
 * property, so that patching the native prototype does not add an extra key
 * to `for...in` loops over String objects. It stays writable and configurable,
 * like a normally assigned method.
 */
Object.defineProperty(String.prototype, 'entityReplacer', {
    value: function () {
        const str = this.toString();
        if (str.indexOf("&") !== -1)
            return str.replace(/&#?\w+;/g, entityReplacer);
        return str;
    },
    writable: true,
    configurable: true,
    enumerable: false
});
function XMLReader() {
}
XMLReader.prototype = {
parse:function(source,defaultNSMap,entityMap){
parse: function (source, defaultNSMap, entityMap) {
var domBuilder = this.domBuilder;
domBuilder.startDocument();
_copy(defaultNSMap ,defaultNSMap = {})
parse(source,defaultNSMap,entityMap, domBuilder,this.errorHandler);
_copy(defaultNSMap, defaultNSMap = {});
parse(source, defaultNSMap, entityMap, domBuilder, this.errorHandler);
domBuilder.endDocument();
}
}
function parse(source,defaultNSMapCopy,entityMap,domBuilder,errorHandler){
function fixedFromCharCode(code) {
// String.prototype.fromCharCode does not supports
// > 2 bytes unicode chars directly
if (code > 0xffff) {
code -= 0x10000;
var surrogate1 = 0xd800 + (code >> 10)
, surrogate2 = 0xdc00 + (code & 0x3ff);
function parse(source, defaultNSMapCopy, entityMap, domBuilder, errorHandler) {
return String.fromCharCode(surrogate1, surrogate2);
} else {
return String.fromCharCode(code);
}
}
function entityReplacer(a){
return textParser.decode(a);
/* var k = a.slice(1,-1);
console.log(a,"decode", textParser.decode(k));
if(k in entityMap){
return entityMap[k];
}else if(k.charAt(0) === '#'){
return fixedFromCharCode(parseInt(k.substr(1).replace('x','0x')))
}else{
errorHandler.error('entity not found:'+a);
return a;
}*/
}
function appendText(end){//has some bugs
if(end>start){
var xt = source.substring(start,end).replace(/&#?\w+;/g,entityReplacer);
locator&&position(start);
domBuilder.characters(xt,0,end-start);
function appendText(end) {//has some bugs
if (end > start) {
var xt = source.substring(start, end).entityReplacer();
locator && position(start);
domBuilder.characters(xt, 0, end - start);
start = end
}
}
function position(p,m){
while(p>=lineEnd && (m = linePattern.exec(source))){
function position(p, m) {
while (p >= lineEnd && (m = linePattern.exec(source))) {
lineStart = m.index;

@@ -80,3 +104,3 @@ lineEnd = lineStart + m[0].length;

}
locator.columnNumber = p-lineStart+1;
locator.columnNumber = p - lineStart + 1;
}

@@ -87,114 +111,114 @@ var lineStart = 0;

var locator = domBuilder.locator;
var parseStack = [{currentNSMap:defaultNSMapCopy}]
var parseStack = [{ currentNSMap: defaultNSMapCopy }]
var closeMap = {};
var start = 0;
while(true){
try{
var tagStart = source.indexOf('<',start);
if(tagStart<0){
if(!source.substr(start).match(/^\s*$/)){
while (true) {
try {
var tagStart = source.indexOf('<', start);
if (tagStart < 0) {
if (!source.substr(start).match(/^\s*$/)) {
var doc = domBuilder.doc;
var text = doc.createTextNode(source.substr(start));
var text = doc.createTextNode(source.substr(start));
text.tagName = tagName;
doc.appendChild(text);
domBuilder.currentElement = text;
doc.appendChild(text);
domBuilder.currentElement = text;
}
return;
}
if(tagStart>start){
if (tagStart > start) {
appendText(tagStart);
}
switch(source.charAt(tagStart+1)){
case '/':
var end = source.indexOf('>',tagStart+3);
var tagName = source.substring(tagStart+2,end);
var config = parseStack.pop();
if(end<0){
tagName = source.substring(tagStart+2).replace(/[\s<].*/,'');
//console.error('#@@@@@@'+tagName)
errorHandler.error("end tag name: "+tagName+' is not complete:'+config.tagName);
end = tagStart+1+tagName.length;
}else if(tagName.match(/\s</)){
tagName = tagName.replace(/[\s<].*/,'');
errorHandler.error("end tag name: "+tagName+' maybe not complete');
end = tagStart+1+tagName.length;
}
//console.error(parseStack.length,parseStack)
//console.error(config);
var localNSMap = config.localNSMap;
var endMatch = config.tagName == tagName;
var endIgnoreCaseMach = endMatch || config.tagName&&config.tagName.toLowerCase() == tagName.toLowerCase()
if(endIgnoreCaseMach){
domBuilder.endElement(config.uri,config.localName,tagName);
if(localNSMap){
for(var prefix in localNSMap){
domBuilder.endPrefixMapping(prefix) ;
switch (source.charAt(tagStart + 1)) {
case '/':
var end = source.indexOf('>', tagStart + 3);
var tagName = source.substring(tagStart + 2, end);
var config = parseStack.pop();
if (end < 0) {
tagName = source.substring(tagStart + 2).replace(/[\s<].*/, '');
//console.error('#@@@@@@'+tagName)
errorHandler.error("end tag name: " + tagName + ' is not complete:' + config.tagName);
end = tagStart + 1 + tagName.length;
} else if (tagName.match(/\s</)) {
tagName = tagName.replace(/[\s<].*/, '');
errorHandler.error("end tag name: " + tagName + ' maybe not complete');
end = tagStart + 1 + tagName.length;
}
//console.error(parseStack.length,parseStack)
//console.error(config);
var localNSMap = config.localNSMap;
var endMatch = config.tagName == tagName;
var endIgnoreCaseMach = endMatch || config.tagName && config.tagName.toLowerCase() == tagName.toLowerCase()
if (endIgnoreCaseMach) {
domBuilder.endElement(config.uri, config.localName, tagName);
if (localNSMap) {
for (var prefix in localNSMap) {
domBuilder.endPrefixMapping(prefix);
}
}
if (!endMatch) {
errorHandler.fatalError("end tag name: " + tagName + ' is not match the current start tagName:' + config.tagName);
}
} else {
parseStack.push(config)
}
if(!endMatch){
errorHandler.fatalError("end tag name: "+tagName+' is not match the current start tagName:'+config.tagName );
}
}else{
parseStack.push(config)
}
end++;
break;
end++;
break;
// end elment
case '?':// <?...?>
locator&&position(tagStart);
end = parseInstruction(source,tagStart,domBuilder);
break;
case '!':// <!doctype,<![CDATA,<!--
locator&&position(tagStart);
end = parseDCC(source,tagStart,domBuilder,errorHandler);
break;
default:
locator&&position(tagStart);
var el = new ElementAttributes();
var currentNSMap = parseStack[parseStack.length-1].currentNSMap;
//elStartEnd
var end = parseElementStartPart(source,tagStart,el,currentNSMap,entityReplacer,errorHandler);
var len = el.length;
if(!el.closed && fixSelfClosed(source,end,el.tagName,closeMap)){
el.closed = true;
if(!entityMap.nbsp){
errorHandler.warning('unclosed xml attribute');
case '?':// <?...?>
locator && position(tagStart);
end = parseInstruction(source, tagStart, domBuilder);
break;
case '!':// <!doctype,<![CDATA,<!--
locator && position(tagStart);
end = parseDCC(source, tagStart, domBuilder, errorHandler);
break;
default:
locator && position(tagStart);
var el = new ElementAttributes();
var currentNSMap = parseStack[parseStack.length - 1].currentNSMap;
//elStartEnd
var end = parseElementStartPart(source, tagStart, el, currentNSMap, errorHandler);
var len = el.length;
if (!el.closed && fixSelfClosed(source, end, el.tagName, closeMap)) {
el.closed = true;
if (!entityMap.nbsp) {
errorHandler.warning('unclosed xml attribute');
}
}
}
if(locator && len){
var locator2 = copyLocator(locator,{});
//try{//attribute position fixed
for(var i = 0;i<len;i++){
var a = el[i];
position(a.offset);
a.locator = copyLocator(locator,{});
if (locator && len) {
var locator2 = copyLocator(locator, {});
//try{//attribute position fixed
for (var i = 0; i < len; i++) {
var a = el[i];
position(a.offset);
a.locator = copyLocator(locator, {});
}
//}catch(e){console.error('@@@@@'+e)}
domBuilder.locator = locator2
if (appendElement(el, domBuilder, currentNSMap)) {
parseStack.push(el)
}
domBuilder.locator = locator;
} else {
if (appendElement(el, domBuilder, currentNSMap)) {
parseStack.push(el)
}
}
//}catch(e){console.error('@@@@@'+e)}
domBuilder.locator = locator2
if(appendElement(el,domBuilder,currentNSMap)){
parseStack.push(el)
if (el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed) {
end = parseHtmlSpecialContent(source, end, el.tagName, domBuilder)
} else {
end++;
}
domBuilder.locator = locator;
}else{
if(appendElement(el,domBuilder,currentNSMap)){
parseStack.push(el)
}
}
if(el.uri === 'http://www.w3.org/1999/xhtml' && !el.closed){
end = parseHtmlSpecialContent(source,end,el.tagName,entityReplacer,domBuilder)
}else{
end++;
}
}
}catch(e){
errorHandler.error('element parse error: '+e)
} catch (e) {
errorHandler.error('element parse error: ' + e)
//errorHandler.error('element parse error: '+e);

@@ -204,11 +228,11 @@ end = -1;

}
if(end>start){
if (end > start) {
start = end;
}else{
} else {
//TODO: 这里有可能sax回退,有位置错误风险
appendText(Math.max(tagStart,start)+1);
appendText(Math.max(tagStart, start) + 1);
}
}
}
function copyLocator(f,t){
function copyLocator(f, t) {
t.lineNumber = f.lineNumber;

@@ -223,3 +247,3 @@ t.columnNumber = f.columnNumber;

*/
function parseElementStartPart(source,start,el,currentNSMap,entityReplacer,errorHandler){
function parseElementStartPart(source, start, el, currentNSMap, errorHandler) {
var attrName;

@@ -229,164 +253,165 @@ var value;

var s = S_TAG;//status
while(true){
while (true) {
var c = source.charAt(p);
switch(c){
case '=':
if(s === S_ATTR){//attrName
attrName = source.slice(start,p);
s = S_EQ;
}else if(s === S_ATTR_SPACE){
s = S_EQ;
}else{
//fatalError: equal must after attrName or space after attrName
throw new Error('attribute equal must after attrName');
}
break;
case '\'':
case '"':
if(s === S_EQ || s === S_ATTR //|| s == S_ATTR_SPACE
){//equal
if(s === S_ATTR){
errorHandler.warning('attribute value must after "="')
attrName = source.slice(start,p)
switch (c) {
case '=':
if (s === S_ATTR) {//attrName
attrName = source.slice(start, p);
s = S_EQ;
} else if (s === S_ATTR_SPACE) {
s = S_EQ;
} else {
//fatalError: equal must after attrName or space after attrName
throw new Error('attribute equal must after attrName');
}
start = p+1;
p = source.indexOf(c,start)
if(p>0){
value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer);
el.add(attrName,value,start-1);
s = S_ATTR_END;
}else{
//fatalError: no end quot match
throw new Error('attribute value no end \''+c+'\' match');
break;
case '\'':
case '"':
if (s === S_EQ || s === S_ATTR //|| s == S_ATTR_SPACE
) {//equal
if (s === S_ATTR) {
errorHandler.warning('attribute value must after "="')
attrName = source.slice(start, p)
}
start = p + 1;
p = source.indexOf(c, start)
if (p > 0) {
value = source.slice(start, p).entityReplacer();
el.add(attrName, value, start - 1);
s = S_ATTR_END;
} else {
//fatalError: no end quot match
throw new Error('attribute value no end \'' + c + '\' match');
}
} else if (s == S_ATTR_NOQUOT_VALUE) {
value = source.slice(start, p).entityReplacer();
//console.log(attrName,value,start,p)
el.add(attrName, value, start);
//console.dir(el)
errorHandler.warning('attribute "' + attrName + '" missed start quot(' + c + ')!!');
start = p + 1;
s = S_ATTR_END
} else {
//fatalError: no equal before
throw new Error('attribute value must after "="');
}
}else if(s == S_ATTR_NOQUOT_VALUE){
value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer);
//console.log(attrName,value,start,p)
el.add(attrName,value,start);
//console.dir(el)
errorHandler.warning('attribute "'+attrName+'" missed start quot('+c+')!!');
start = p+1;
s = S_ATTR_END
}else{
//fatalError: no equal before
throw new Error('attribute value must after "="');
}
break;
case '/':
switch(s){
case S_TAG:
el.setTagName(source.slice(start,p));
case S_ATTR_END:
case S_TAG_SPACE:
case S_TAG_CLOSE:
s =S_TAG_CLOSE;
el.closed = true;
case S_ATTR_NOQUOT_VALUE:
case S_ATTR:
case S_ATTR_SPACE:
break;
//case S_EQ:
default:
throw new Error("attribute invalid close char('/')")
}
break;
case ''://end document
//throw new Error('unexpected end of input')
errorHandler.error('unexpected end of input');
if(s == S_TAG){
el.setTagName(source.slice(start,p));
}
return p;
case '>':
switch(s){
case S_TAG:
el.setTagName(source.slice(start,p));
case S_ATTR_END:
case S_TAG_SPACE:
case S_TAG_CLOSE:
break;//normal
case S_ATTR_NOQUOT_VALUE://Compatible state
case S_ATTR:
value = source.slice(start,p);
if(value.slice(-1) === '/'){
el.closed = true;
value = value.slice(0,-1)
case '/':
switch (s) {
case S_TAG:
el.setTagName(source.slice(start, p));
case S_ATTR_END:
case S_TAG_SPACE:
case S_TAG_CLOSE:
s = S_TAG_CLOSE;
el.closed = true;
case S_ATTR_NOQUOT_VALUE:
case S_ATTR:
case S_ATTR_SPACE:
break;
//case S_EQ:
default:
throw new Error("attribute invalid close char('/')")
}
case S_ATTR_SPACE:
if(s === S_ATTR_SPACE){
value = attrName;
break;
case ''://end document
//throw new Error('unexpected end of input')
errorHandler.error('unexpected end of input');
if (s == S_TAG) {
el.setTagName(source.slice(start, p));
}
if(s == S_ATTR_NOQUOT_VALUE){
errorHandler.warning('attribute "'+value+'" missed quot(")!!');
el.add(attrName,value.replace(/&#?\w+;/g,entityReplacer),start)
}else{
if(currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !value.match(/^(?:disabled|checked|selected)$/i)){
errorHandler.warning('attribute "'+value+'" missed value!! "'+value+'" instead!!')
}
el.add(value,value,start)
return p;
case '>':
switch (s) {
case S_TAG:
el.setTagName(source.slice(start, p));
case S_ATTR_END:
case S_TAG_SPACE:
case S_TAG_CLOSE:
break;//normal
case S_ATTR_NOQUOT_VALUE://Compatible state
case S_ATTR:
value = source.slice(start, p);
if (value.slice(-1) === '/') {
el.closed = true;
value = value.slice(0, -1)
}
case S_ATTR_SPACE:
if (s === S_ATTR_SPACE) {
value = attrName;
}
if (s == S_ATTR_NOQUOT_VALUE) {
errorHandler.warning('attribute "' + value + '" missed quot(")!!');
el.add(attrName, value.entityReplacer(), start)
} else {
if (currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !value.match(/^(?:disabled|checked|selected)$/i)) {
errorHandler.warning('attribute "' + value + '" missed value!! "' + value + '" instead!!')
}
el.add(value, value, start)
}
break;
case S_EQ:
throw new Error('attribute value missed!!');
}
break;
case S_EQ:
throw new Error('attribute value missed!!');
}
// console.log(tagName,tagNamePattern,tagNamePattern.test(tagName))
return p;
/*xml space '\x20' | #x9 | #xD | #xA; */
case '\u0080':
c = ' ';
default:
if(c<= ' '){//space
switch(s){
case S_TAG:
el.setTagName(source.slice(start,p));//tagName
s = S_TAG_SPACE;
break;
case S_ATTR:
attrName = source.slice(start,p)
s = S_ATTR_SPACE;
break;
case S_ATTR_NOQUOT_VALUE:
var value = source.slice(start,p).replace(/&#?\w+;/g,entityReplacer);
errorHandler.warning('attribute "'+value+'" missed quot(")!!');
el.add(attrName,value,start)
case S_ATTR_END:
s = S_TAG_SPACE;
break;
//case S_TAG_SPACE:
//case S_EQ:
//case S_ATTR_SPACE:
// void();break;
//case S_TAG_CLOSE:
//ignore warning
}
}else{//not space
//S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE
//S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE
switch(s){
//case S_TAG:void();break;
//case S_ATTR:void();break;
//case S_ATTR_NOQUOT_VALUE:void();break;
case S_ATTR_SPACE:
var tagName = el.tagName;
if(currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !attrName.match(/^(?:disabled|checked|selected)$/i)){
errorHandler.warning('attribute "'+attrName+'" missed value!! "'+attrName+'" instead2!!')
// console.log(tagName,tagNamePattern,tagNamePattern.test(tagName))
return p;
/*xml space '\x20' | #x9 | #xD | #xA; */
case '\u0080':
c = ' ';
default:
if (c <= ' ') {//space
switch (s) {
case S_TAG:
el.setTagName(source.slice(start, p));//tagName
s = S_TAG_SPACE;
break;
case S_ATTR:
attrName = source.slice(start, p)
s = S_ATTR_SPACE;
break;
case S_ATTR_NOQUOT_VALUE:
var value = source.slice(start, p).entityReplacer()
errorHandler.warning('attribute "' + value + '" missed quot(")!!');
el.add(attrName, value, start)
case S_ATTR_END:
s = S_TAG_SPACE;
break;
//case S_TAG_SPACE:
//case S_EQ:
//case S_ATTR_SPACE:
// void();break;
//case S_TAG_CLOSE:
//ignore warning
}
el.add(attrName,attrName,start);
start = p;
s = S_ATTR;
break;
case S_ATTR_END:
errorHandler.warning('attribute space is required"'+attrName+'"!!')
case S_TAG_SPACE:
s = S_ATTR;
start = p;
break;
case S_EQ:
s = S_ATTR_NOQUOT_VALUE;
start = p;
break;
case S_TAG_CLOSE:
throw new Error("elements closed character '/' and '>' must be connected to");
} else {//not space
//S_TAG, S_ATTR, S_EQ, S_ATTR_NOQUOT_VALUE
//S_ATTR_SPACE, S_ATTR_END, S_TAG_SPACE, S_TAG_CLOSE
switch (s) {
//case S_TAG:void();break;
//case S_ATTR:void();break;
//case S_ATTR_NOQUOT_VALUE:void();break;
case S_ATTR_SPACE:
var tagName = el.tagName;
if (currentNSMap[''] !== 'http://www.w3.org/1999/xhtml' || !attrName.match(/^(?:disabled|checked|selected)$/i)) {
errorHandler.warning('attribute "' + attrName + '" missed value!! "' + attrName + '" instead2!!')
}
el.add(attrName, attrName, start);
start = p;
s = S_ATTR;
break;
case S_ATTR_END:
errorHandler.warning('attribute space is required"' + attrName + '"!!')
case S_TAG_SPACE:
s = S_ATTR;
start = p;
break;
case S_EQ:
s = S_ATTR_NOQUOT_VALUE;
start = p;
break;
case S_TAG_CLOSE:
throw new Error("elements closed character '/' and '>' must be connected to");
}
}
}
}//end outer switch

@@ -400,3 +425,3 @@ //console.log('p++',p)

*/
function appendElement(el,domBuilder,currentNSMap){
function appendElement(el, domBuilder, currentNSMap) {
var tagName = el.tagName;

@@ -407,3 +432,3 @@ var localNSMap = null;

var i = el.length;
while(i--){
while (i--) {
var a = el[i];

@@ -413,7 +438,7 @@ var qName = a.qName;

var nsp = qName.indexOf(':');
if(nsp>0){
var prefix = a.prefix = qName.slice(0,nsp);
var localName = qName.slice(nsp+1);
if (nsp > 0) {
var prefix = a.prefix = qName.slice(0, nsp);
var localName = qName.slice(nsp + 1);
var nsPrefix = prefix === 'xmlns' && localName
}else{
} else {
localName = qName;

@@ -424,9 +449,9 @@ prefix = null

//can not set prefix,because prefix !== ''
a.localName = localName ;
a.localName = localName;
//prefix == null for no ns prefix attribute
if(nsPrefix !== false){//hack!!
if(localNSMap == null){
if (nsPrefix !== false) {//hack!!
if (localNSMap == null) {
localNSMap = {}
//console.log(currentNSMap,0)
_copy(currentNSMap,currentNSMap={})
_copy(currentNSMap, currentNSMap = {})
//console.log(currentNSMap,1)

@@ -436,15 +461,15 @@ }

a.uri = 'http://www.w3.org/2000/xmlns/'
domBuilder.startPrefixMapping(nsPrefix, value)
domBuilder.startPrefixMapping(nsPrefix, value)
}
}
var i = el.length;
while(i--){
while (i--) {
a = el[i];
var prefix = a.prefix;
if(prefix){//no prefix attribute has no namespace
if(prefix === 'xml'){
if (prefix) {//no prefix attribute has no namespace
if (prefix === 'xml') {
a.uri = 'http://www.w3.org/XML/1998/namespace';
}if(prefix !== 'xmlns'){
} if (prefix !== 'xmlns') {
a.uri = currentNSMap[prefix || '']
//{console.log('###'+a.qName,domBuilder.locator.systemId+'',currentNSMap,a.uri)}

@@ -455,6 +480,6 @@ }

var nsp = tagName.indexOf(':');
if(nsp>0){
prefix = el.prefix = tagName.slice(0,nsp);
localName = el.localName = tagName.slice(nsp+1);
}else{
if (nsp > 0) {
prefix = el.prefix = tagName.slice(0, nsp);
localName = el.localName = tagName.slice(nsp + 1);
} else {
prefix = null;//important!!

@@ -466,13 +491,13 @@ localName = el.localName = tagName;

domBuilder.startElement(ns,localName,tagName ,el);
domBuilder.startElement(ns, localName, tagName, el);
//endPrefixMapping and startPrefixMapping have not any help for dom builder
//localNSMap = null
if(el.closed){
domBuilder.endElement(ns,localName,tagName);
if(localNSMap){
for(prefix in localNSMap){
domBuilder.endPrefixMapping(prefix)
if (el.closed) {
domBuilder.endElement(ns, localName, tagName);
if (localNSMap) {
for (prefix in localNSMap) {
domBuilder.endPrefixMapping(prefix)
}
}
}else{
} else {
el.currentNSMap = currentNSMap;

@@ -484,82 +509,82 @@ el.localNSMap = localNSMap;

}
function parseHtmlSpecialContent(source,elStartEnd,tagName,entityReplacer,domBuilder){
if(/^(?:script|textarea)$/i.test(tagName)){
var elEndStart = source.indexOf('</'+tagName+'>',elStartEnd);
var text = source.substring(elStartEnd+1,elEndStart);
if(/[&<]/.test(text)){
if(/^script$/i.test(tagName)){
function parseHtmlSpecialContent(source, elStartEnd, tagName, domBuilder) {
if (attCheck[tagName] === undefined)
attCheck[tagName] = /^(?:script|textarea)$/i.test(tagName)
if (attCheck[tagName]) {
var elEndStart = source.indexOf('</' + tagName + '>', elStartEnd);
var text = source.substring(elStartEnd + 1, elEndStart);
if (/[&<]/.test(text)) {
if (/^script$/i.test(tagName)) {
//if(!/\]\]>/.test(text)){
//lexHandler.startCDATA();
domBuilder.characters(text,0,text.length);
//lexHandler.endCDATA();
return elEndStart;
//lexHandler.startCDATA();
domBuilder.characters(text, 0, text.length);
//lexHandler.endCDATA();
return elEndStart;
//}
}//}else{//text area
text = text.replace(/&#?\w+;/g,entityReplacer);
domBuilder.characters(text,0,text.length);
return elEndStart;
text = text.entityReplacer();
domBuilder.characters(text, 0, text.length);
return elEndStart;
//}
}
}
return elStartEnd+1;
return elStartEnd + 1;
}
function fixSelfClosed(source,elStartEnd,tagName,closeMap){
function fixSelfClosed(source, elStartEnd, tagName, closeMap) {
//if(tagName in closeMap){
var pos = closeMap[tagName];
if(pos == null){
if (pos == null) {
//console.log(tagName)
pos = source.lastIndexOf('</'+tagName+'>')
if(pos<elStartEnd){//忘记闭合
pos = source.lastIndexOf('</'+tagName)
pos = source.lastIndexOf('</' + tagName + '>')
if (pos < elStartEnd) {//忘记闭合
pos = source.lastIndexOf('</' + tagName)
}
closeMap[tagName] =pos
closeMap[tagName] = pos
}
return pos<elStartEnd;
return pos < elStartEnd;
//}
}
function _copy(source,target){
for(var n in source){target[n] = source[n]}
function _copy(source, target) {
for (var n in source) { target[n] = source[n] }
}
function parseDCC(source,start,domBuilder,errorHandler){//sure start with '<!'
var next= source.charAt(start+2)
switch(next){
case '-':
if(source.charAt(start + 3) === '-'){
var end = source.indexOf('-->',start+4);
//append comment source.substring(4,end)//<!--
if(end>start){
domBuilder.comment(source,start+4,end-start-4);
return end+3;
}else{
errorHandler.error("Unclosed comment");
function parseDCC(source, start, domBuilder, errorHandler) {//sure start with '<!'
var next = source.charAt(start + 2)
switch (next) {
case '-':
if (source.charAt(start + 3) === '-') {
var end = source.indexOf('-->', start + 4);
//append comment source.substring(4,end)//<!--
if (end > start) {
domBuilder.comment(source, start + 4, end - start - 4);
return end + 3;
} else {
errorHandler.error("Unclosed comment");
return -1;
}
} else {
//error
return -1;
}
}else{
//error
return -1;
}
default:
if(source.substr(start+3,6) == 'CDATA['){
var end = source.indexOf(']]>',start+9);
domBuilder.startCDATA();
domBuilder.characters(source,start+9,end-start-9);
domBuilder.endCDATA()
return end+3;
}
//<!DOCTYPE
//startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId)
var matchs = split(source,start);
var len = matchs.length;
if(len>1 && /!doctype/i.test(matchs[0][0])){
var name = matchs[1][0];
var pubid = len>3 && /^public$/i.test(matchs[2][0]) && matchs[3][0]
var sysid = len>4 && matchs[4][0];
var lastMatch = matchs[len-1]
domBuilder.startDTD(name,pubid && pubid.replace(/^(['"])(.*?)\1$/,'$2'),
sysid && sysid.replace(/^(['"])(.*?)\1$/,'$2'));
domBuilder.endDTD();
return lastMatch.index+lastMatch[0].length
}
default:
if (source.substr(start + 3, 6) == 'CDATA[') {
var end = source.indexOf(']]>', start + 9);
domBuilder.startCDATA();
domBuilder.characters(source, start + 9, end - start - 9);
domBuilder.endCDATA()
return end + 3;
}
//<!DOCTYPE
//startDTD(java.lang.String name, java.lang.String publicId, java.lang.String systemId)
var matchs = split(source, start);
var len = matchs.length;
if (len > 1 && /!doctype/i.test(matchs[0][0])) {
var name = matchs[1][0];
var pubid = len > 3 && /^public$/i.test(matchs[2][0]) && matchs[3][0]
var sysid = len > 4 && matchs[4][0];
var lastMatch = matchs[len - 1]
domBuilder.startDTD(name, pubid && pubid.replace(/^(['"])(.*?)\1$/, '$2'),sysid && sysid.replace(/^(['"])(.*?)\1$/, '$2'));
domBuilder.endDTD();
return lastMatch.index + lastMatch[0].length
}
}

@@ -571,11 +596,11 @@ return -1;

function parseInstruction(source,start,domBuilder){
var end = source.indexOf('?>',start);
if(end){
var match = source.substring(start,end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/);
if(match){
function parseInstruction(source, start, domBuilder) {
var end = source.indexOf('?>', start);
if (end) {
var match = source.substring(start, end).match(/^<\?(\S*)\s*([\s\S]*?)\s*$/);
if (match) {
var len = match[0].length;
domBuilder.processingInstruction(match[1], match[2]) ;
return end+2;
}else{//error
domBuilder.processingInstruction(match[1], match[2]);
return end + 2;
} else {//error
return -1;

@@ -590,34 +615,34 @@ }

*/
function ElementAttributes(source){
function ElementAttributes(source) {
}
ElementAttributes.prototype = {
setTagName:function(tagName){
if(!tagNamePattern.test(tagName)){
throw new Error('invalid tagName:'+tagName)
setTagName: function (tagName) {
if (!tagNamePattern.test(tagName)) {
throw new Error('invalid tagName:' + tagName)
}
this.tagName = tagName
},
add:function(qName,value,offset){
if(!tagNamePattern.test(qName)){
throw new Error('invalid attribute:'+qName)
add: function (qName, value, offset) {
if (!tagNamePattern.test(qName)) {
throw new Error('invalid attribute:' + qName)
}
this[this.length++] = {qName:qName,value:value,offset:offset}
this[this.length++] = { qName: qName, value: value, offset: offset }
},
length:0,
getLocalName:function(i){return this[i].localName},
getLocator:function(i){return this[i].locator},
getQName:function(i){return this[i].qName},
getURI:function(i){return this[i].uri},
getValue:function(i){return this[i].value}
// ,getIndex:function(uri, localName)){
// if(localName){
//
// }else{
// var qName = uri
// }
// },
// getValue:function(){return this.getValue(this.getIndex.apply(this,arguments))},
// getType:function(uri,localName){}
// getType:function(i){},
length: 0,
getLocalName: function (i) { return this[i].localName },
getLocator: function (i) { return this[i].locator },
getQName: function (i) { return this[i].qName },
getURI: function (i) { return this[i].uri },
getValue: function (i) { return this[i].value }
// ,getIndex:function(uri, localName)){
// if(localName){
//
// }else{
// var qName = uri
// }
// },
// getValue:function(){return this.getValue(this.getIndex.apply(this,arguments))},
// getType:function(uri,localName){}
// getType:function(i){},
}

@@ -627,3 +652,3 @@

function split(source,start){
function split(source, start) {
var match;

@@ -634,5 +659,5 @@ var buf = [];

reg.exec(source);//skip <
while(match = reg.exec(source)){
while (match = reg.exec(source)) {
buf.push(match);
if(match[1])return buf;
if (match[1]) return buf;
}

@@ -639,0 +664,0 @@ }

{
"name": "advanced-html-parser",
"version": "1.0.8",
"version": "1.0.9",
"description": "Can use html parser in react-native, titanium, and anywhere. This is based on [xmldom](https://github.com/jindw/xmldom).",

@@ -5,0 +5,0 @@ "main": "index.js",

# advanced-html-parser
Can use html parser in react-native, titanium, and anywhere. This is based on [xmldom](https://github.com/jindw/xmldom).
This library is based on [react-native-html-parser](https://github.com/g6ling/react-native-html-parser), but it has many more features, can decode almost all character sets, and has also

@@ -34,2 +33,3 @@ added support to CSS3 selectors and typescript.

ignoreTags: ["script", "style", "head"] // This will remove all those tags before beginning to parse the string.
onlyBody: true
});

@@ -162,2 +162,8 @@ ```

/*
Parse only the body.
This extracts the <body> element from the HTML string and parses only that.
*/
onlyBody?: boolean;
/*

@@ -164,0 +170,0 @@ override the errorHandler.

@@ -21,3 +21,3 @@ const IDOMParser = require("./../")

console.log(doc.documentElement.querySelector(".col-info-desc > .desc > .small strong:first-child span"))
doc.documentElement.querySelector(".col-info-desc > .desc > .small strong:first-child span").text().should.eql("8.3");
});

@@ -29,3 +29,2 @@

const doc = IDOMParser.parse(`<div>test</div>`);
console.log(doc.documentElement.cloneNode(true).outerHTML)
doc.documentElement.cloneNode(true).text().should.eql("test");

@@ -32,0 +31,0 @@ });

@@ -17,2 +17,8 @@ export interface Options {

/*
Parse only the body.
This extracts the <body> element from the HTML string and parses only that.
*/
onlyBody?: boolean;
/*
override the errorHandler.

@@ -19,0 +25,0 @@

Sorry, the diff of this file is too big to display

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc