node-html-parser
Advanced tools
Comparing version 4.1.5 to 5.0.0
235
dist/main.js
@@ -37,3 +37,3 @@ var __extends = (this && this.__extends) || (function () { | ||
} | ||
return to.concat(ar || from); | ||
return to.concat(ar || Array.prototype.slice.call(from)); | ||
}; | ||
@@ -297,2 +297,3 @@ define("back", ["require", "exports"], function (require, exports) { | ||
comment_1 = __importDefault(comment_1); | ||
var voidTags = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']); | ||
function decode(val) { | ||
@@ -442,3 +443,3 @@ // clone string | ||
HTMLElement.prototype.quoteAttribute = function (attr) { | ||
if (attr === null) { | ||
if (attr == null) { | ||
return 'null'; | ||
@@ -497,2 +498,9 @@ } | ||
}); | ||
// Getter-only accessor `isVoidElement` on HTMLElement.prototype (no setter is defined). | |
// Evaluates to true when this element's `localName` is a member of the module-level | |
// `voidTags` Set, and false otherwise. | |
// NOTE(review): `voidTags` holds lower-case names and Set#has is case-sensitive, so this | |
// presumably relies on `localName` being lower-cased already - confirm against the getter. | |
Object.defineProperty(HTMLElement.prototype, "isVoidElement", { | |
get: function () { | |
return voidTags.has(this.localName); | |
}, | |
// not listed during property enumeration, but may be redefined or deleted later | |
enumerable: false, | |
configurable: true | |
}); | |
Object.defineProperty(HTMLElement.prototype, "rawText", { | ||
@@ -585,10 +593,4 @@ /** | ||
if (tag) { | ||
// const void_tags = new Set('area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'.split('|')); | ||
// const is_void = void_tags.has(tag); | ||
var is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag); | ||
var attrs = this.rawAttrs ? " " + this.rawAttrs : ''; | ||
if (is_void) { | ||
return "<" + tag + attrs + ">"; | ||
} | ||
return "<" + tag + attrs + ">" + this.innerHTML + "</" + tag + ">"; | ||
return this.isVoidElement ? "<" + tag + attrs + ">" : "<" + tag + attrs + ">" + this.innerHTML + "</" + tag + ">"; | ||
} | ||
@@ -743,59 +745,2 @@ return this.innerHTML; | ||
}); | ||
// let matcher: Matcher; | ||
// if (selector instanceof Matcher) { | ||
// matcher = selector; | ||
// matcher.reset(); | ||
// } else { | ||
// if (selector.includes(',')) { | ||
// const selectors = selector.split(','); | ||
// return Array.from(selectors.reduce((pre, cur) => { | ||
// const result = this.querySelectorAll(cur.trim()); | ||
// return result.reduce((p, c) => { | ||
// return p.add(c); | ||
// }, pre); | ||
// }, new Set<HTMLElement>())); | ||
// } | ||
// matcher = new Matcher(selector); | ||
// } | ||
// interface IStack { | ||
// 0: Node; // node | ||
// 1: number; // children | ||
// 2: boolean; // found flag | ||
// } | ||
// const stack = [] as IStack[]; | ||
// return this.childNodes.reduce((res, cur) => { | ||
// stack.push([cur, 0, false]); | ||
// while (stack.length) { | ||
// const state = arr_back(stack); // get last element | ||
// const el = state[0]; | ||
// if (state[1] === 0) { | ||
// // Seen for first time. | ||
// if (el.nodeType !== NodeType.ELEMENT_NODE) { | ||
// stack.pop(); | ||
// continue; | ||
// } | ||
// const html_el = el as HTMLElement; | ||
// state[2] = matcher.advance(html_el); | ||
// if (state[2]) { | ||
// if (matcher.matched) { | ||
// res.push(html_el); | ||
// res.push(...(html_el.querySelectorAll(selector))); | ||
// // no need to go further. | ||
// matcher.rewind(); | ||
// stack.pop(); | ||
// continue; | ||
// } | ||
// } | ||
// } | ||
// if (state[1] < el.childNodes.length) { | ||
// stack.push([el.childNodes[state[1]++], 0, false]); | ||
// } else { | ||
// if (state[2]) { | ||
// matcher.rewind(); | ||
// } | ||
// stack.pop(); | ||
// } | ||
// } | ||
// return res; | ||
// }, [] as HTMLElement[]); | ||
}; | ||
@@ -805,3 +750,3 @@ /** | ||
* @param {string} selector Simplified CSS selector | ||
* @return {HTMLElement} matching node | ||
* @return {(HTMLElement|null)} matching node | ||
*/ | ||
@@ -813,41 +758,42 @@ HTMLElement.prototype.querySelector = function (selector) { | ||
}); | ||
// let matcher: Matcher; | ||
// if (selector instanceof Matcher) { | ||
// matcher = selector; | ||
// matcher.reset(); | ||
// } else { | ||
// matcher = new Matcher(selector); | ||
// } | ||
// const stack = [] as { 0: Node; 1: 0 | 1; 2: boolean }[]; | ||
// for (const node of this.childNodes) { | ||
// stack.push([node, 0, false]); | ||
// while (stack.length) { | ||
// const state = arr_back(stack); | ||
// const el = state[0]; | ||
// if (state[1] === 0) { | ||
// // Seen for first time. | ||
// if (el.nodeType !== NodeType.ELEMENT_NODE) { | ||
// stack.pop(); | ||
// continue; | ||
// } | ||
// state[2] = matcher.advance(el as HTMLElement); | ||
// if (state[2]) { | ||
// if (matcher.matched) { | ||
// return el as HTMLElement; | ||
// } | ||
// } | ||
// } | ||
// if (state[1] < el.childNodes.length) { | ||
// stack.push([el.childNodes[state[1]++], 0, false]); | ||
// } else { | ||
// if (state[2]) { | ||
// matcher.rewind(); | ||
// } | ||
// stack.pop(); | ||
// } | ||
// } | ||
// } | ||
// return null; | ||
}; | ||
/** | |
* Find descendant elements by their tagName. | |
* | |
* Walks the subtree below this element iteratively (no recursion), visiting each | |
* element before its own children, and collects every element node that matches. | |
* The element the method is called on is never part of the result; only nodes | |
* reachable through `childNodes` are inspected. | |
* | |
* @param {string} tagName the tagName of the elements to select; it is upper-cased | |
*   before comparison, and the special value '*' selects every element | |
* @return {Array<HTMLElement>} matching elements in the order they were visited | |
*/ | |
HTMLElement.prototype.getElementsByTagName = function (tagName) { | |
// NOTE(review): the comparison below assumes `child.tagName` is already upper-case - confirm. | |
var upperCasedTagName = tagName.toUpperCase(); | |
// result list | |
var re = []; | |
// saved child indexes of the ancestors we still have to resume | |
var stack = []; | |
var currentNodeReference = this; | |
var index = 0; | |
// index turns to undefined once the stack is empty and the first condition occurs | |
// which happens once all relevant children are searched through | |
while (index !== undefined) { | |
var child = void 0; | |
// make it work with sparse arrays | |
do { | |
child = currentNodeReference.childNodes[index++]; | |
} while (index < currentNodeReference.childNodes.length && child === undefined); | |
// if the child does not exist we move on with the last provided index (which belongs to the parentNode) | |
if (child === undefined) { | |
// climb one level; popping an empty stack yields undefined, which ends the outer loop | |
currentNodeReference = currentNodeReference.parentNode; | |
index = stack.pop(); | |
continue; | |
} | |
// only element nodes are matched and descended into; other node types are skipped | |
if (child.nodeType === type_3.default.ELEMENT_NODE) { | |
// https://developer.mozilla.org/en-US/docs/Web/API/Element/getElementsByTagName#syntax | |
if (tagName === '*' || child.tagName === upperCasedTagName) | |
re.push(child); | |
// if children are existing push the current status to the stack and keep searching for elements in the level below | |
if (child.childNodes.length > 0) { | |
// index already points at the next sibling, so that is where we resume after the subtree | |
stack.push(index); | |
currentNodeReference = child; | |
index = 0; | |
} | |
} | |
} | |
return re; | |
}; | |
/** | ||
* traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null. | ||
@@ -970,3 +916,3 @@ * @param selector a DOMString containing a selector list | ||
/** | ||
* Get escaped (as-it) attributes | ||
* Get escaped (as-is) attributes | ||
* @return {Object} parsed attributes | ||
@@ -980,6 +926,10 @@ */ | ||
if (this.rawAttrs) { | ||
var re = /([a-z()#][a-z0-9-_:()#]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/gi; | ||
var re = /([a-zA-Z()#][a-zA-Z0-9-_:()#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?/g; | ||
var match = void 0; | ||
while ((match = re.exec(this.rawAttrs))) { | ||
attrs[match[1]] = match[2] || match[3] || match[4] || null; | ||
var key = match[1]; | ||
var val = match[2] || null; | ||
if (val && (val[0] === "'" || val[0] === "\"")) | ||
val = val.slice(1, val.length - 1); | ||
attrs[key] = val; | ||
} | ||
@@ -1180,8 +1130,4 @@ } | ||
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name | ||
var kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*((?=[/>]*?)|(?:.*?[\s\d/'"])|(?:.*?[\w]))(\/?)>/gi; | ||
// <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>> | ||
// <([a-z][-.:0-9_a-z]*)\s*\/> | ||
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?> | ||
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>> | ||
var kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/gi; | ||
var kMarkupPattern = /<!--[\s\S]*?-->|<(\/?)([a-zA-Z][-.:0-9_a-zA-Z]*)((?:\s+[^>]*?(?:(?:'[^']*')|(?:"[^"]*"))?)*)\s*(\/?)>/g; | ||
var kAttributePattern = /(?:^|\s)(id|class)\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+)/gi; | ||
var kSelfClosingElements = { | ||
@@ -1273,4 +1219,4 @@ area: true, | ||
var element_names = Object.keys(elements); | ||
var kBlockTextElements = element_names.map(function (it) { return new RegExp(it, 'i'); }); | ||
var kIgnoreElements = element_names.filter(function (it) { return elements[it]; }).map(function (it) { return new RegExp(it, 'i'); }); | ||
var kBlockTextElements = element_names.map(function (it) { return new RegExp("^" + it + "$", 'i'); }); | ||
var kIgnoreElements = element_names.filter(function (it) { return elements[it]; }).map(function (it) { return new RegExp("^" + it + "$", 'i'); }); | ||
function element_should_be_ignore(tag) { | ||
@@ -1291,10 +1237,15 @@ return kIgnoreElements.some(function (it) { return it.test(tag); }); | ||
data = "<" + frameflag + ">" + data + "</" + frameflag + ">"; | ||
var lowerCaseTagName = options.lowerCaseTagName; | ||
var dataEndPos = data.length - (frameflag.length + 2); | ||
var frameFlagOffset = frameflag.length + 2; | ||
while ((match = kMarkupPattern.exec(data))) { | ||
var tagStartPos = kMarkupPattern.lastIndex - match[0].length; | ||
// Note: Object destructuring here consistently tests as higher performance than array destructuring | ||
// eslint-disable-next-line prefer-const | ||
var matchText = match[0], leadingSlash = match[1], tagName = match[2], attributes = match[3], closingSlash = match[4]; | ||
var matchLength = matchText.length; | ||
var tagStartPos = kMarkupPattern.lastIndex - matchLength; | ||
var tagEndPos = kMarkupPattern.lastIndex; | ||
// Add TextNode if content | ||
if (lastTextPos > -1) { | ||
if (lastTextPos + match[0].length < tagEndPos) { | ||
if (lastTextPos + matchLength < tagEndPos) { | ||
var text = data.substring(lastTextPos, tagStartPos); | ||
@@ -1307,6 +1258,6 @@ currentParent.appendChild(new text_1.default(text, currentParent, createRange(lastTextPos, tagStartPos))); | ||
// Skip frameflag node | ||
if (match[2] === frameflag) | ||
if (tagName === frameflag) | ||
continue; | ||
// Handle comments | ||
if (match[0][1] === '!') { | ||
if (matchText[1] === '!') { | ||
if (options.comment) { | ||
@@ -1321,14 +1272,16 @@ // Only keep what is in between <!-- and --> | ||
// Fix tag casing if necessary | ||
if (options.lowerCaseTagName) | ||
match[2] = match[2].toLowerCase(); | ||
if (lowerCaseTagName) | ||
tagName = tagName.toLowerCase(); | ||
// Handle opening tags (ie. <this> not </that>) | ||
if (!match[1]) { | ||
if (!leadingSlash) { | ||
/* Populate attributes */ | ||
var attrs = {}; | ||
for (var attMatch = void 0; (attMatch = kAttributePattern.exec(match[3]));) { | ||
attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6]; | ||
for (var attMatch = void 0; (attMatch = kAttributePattern.exec(attributes));) { | ||
var key = attMatch[1], val = attMatch[2]; | ||
var isQuoted = val[0] === "'" || val[0] === "\""; | ||
attrs[key.toLowerCase()] = isQuoted ? val.slice(1, val.length - 1) : val; | ||
} | ||
var tagName = currentParent.rawTagName; | ||
if (!match[4] && kElementsClosedByOpening[tagName]) { | ||
if (kElementsClosedByOpening[tagName][match[2]]) { | ||
var parentTagName = currentParent.rawTagName; | ||
if (!closingSlash && kElementsClosedByOpening[parentTagName]) { | ||
if (kElementsClosedByOpening[parentTagName][tagName]) { | ||
stack.pop(); | ||
@@ -1339,3 +1292,3 @@ currentParent = (0, back_1.default)(stack); | ||
// Prevent nested A tags by terminating the last A and starting a new one : see issue #144 | ||
if (match[2] === 'a' || match[2] === 'A') { | ||
if (tagName === 'a' || tagName === 'A') { | ||
if (noNestedTagIndex !== undefined) { | ||
@@ -1348,15 +1301,15 @@ stack.splice(noNestedTagIndex); | ||
var tagEndPos_1 = kMarkupPattern.lastIndex; | ||
var tagStartPos_1 = tagEndPos_1 - match[0].length; | ||
var tagStartPos_1 = tagEndPos_1 - matchLength; | ||
currentParent = currentParent.appendChild( | ||
// Initialize range (end position updated later for closed tags) | ||
new HTMLElement(match[2], attrs, match[3], null, createRange(tagStartPos_1, tagEndPos_1))); | ||
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1))); | ||
stack.push(currentParent); | ||
if (is_block_text_element(match[2])) { | ||
if (is_block_text_element(tagName)) { | ||
// Find closing tag | ||
var closeMarkup = "</" + match[2] + ">"; | ||
var closeIndex = options.lowerCaseTagName | ||
var closeMarkup = "</" + tagName + ">"; | ||
var closeIndex = lowerCaseTagName | ||
? data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex) | ||
: data.indexOf(closeMarkup, kMarkupPattern.lastIndex); | ||
var textEndPos = closeIndex === -1 ? dataEndPos : closeIndex; | ||
if (element_should_be_ignore(match[2])) { | ||
if (element_should_be_ignore(tagName)) { | ||
var text = data.substring(tagEndPos_1, textEndPos); | ||
@@ -1373,3 +1326,3 @@ if (text.length > 0 && /\S/.test(text)) { | ||
// Cause to be treated as self-closing, because no close found | ||
match[1] = 'true'; | ||
leadingSlash = '/'; | ||
} | ||
@@ -1379,7 +1332,7 @@ } | ||
// Handle closing tags or self-closed elements (ie </tag> or <br>) | ||
if (match[1] || match[4] || kSelfClosingElements[match[2]]) { | ||
if (leadingSlash || closingSlash || kSelfClosingElements[tagName]) { | ||
while (true) { | ||
if (match[2] === 'a' || match[2] === 'A') | ||
if (tagName === 'a' || tagName === 'A') | ||
noNestedTagIndex = undefined; | ||
if (currentParent.rawTagName === match[2]) { | ||
if (currentParent.rawTagName === tagName) { | ||
// Update range end for closed tag | ||
@@ -1392,6 +1345,6 @@ currentParent.range[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1]; | ||
else { | ||
var tagName = currentParent.tagName; | ||
var parentTagName = currentParent.tagName; | ||
// Trying to close current tag, and move on | ||
if (kElementsClosedByClosing[tagName]) { | ||
if (kElementsClosedByClosing[tagName][match[2]]) { | ||
if (kElementsClosedByClosing[parentTagName]) { | ||
if (kElementsClosedByClosing[parentTagName][tagName]) { | ||
stack.pop(); | ||
@@ -1398,0 +1351,0 @@ currentParent = (0, back_1.default)(stack); |
@@ -80,2 +80,3 @@ import Node from './node'; | ||
get localName(): string; | ||
get isVoidElement(): boolean; | ||
/** | ||
@@ -129,6 +130,11 @@ * Get escpaed (as-it) text value of current node and its children. | ||
* @param {string} selector Simplified CSS selector | ||
* @return {HTMLElement} matching node | ||
* @return {(HTMLElement|null)} matching node | ||
*/ | ||
querySelector(selector: string): HTMLElement; | ||
querySelector(selector: string): HTMLElement | null; | ||
/** | ||
* find elements by their tagName | ||
* @param {string} tagName the tagName of the elements to select | ||
*/ | ||
getElementsByTagName(tagName: string): Array<HTMLElement>; | ||
/** | ||
* traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null. | ||
@@ -162,3 +168,3 @@ * @param selector a DOMString containing a selector list | ||
/** | ||
* Get escaped (as-it) attributes | ||
* Get escaped (as-is) attributes | ||
* @return {Object} parsed attributes | ||
@@ -165,0 +171,0 @@ */ |
@@ -35,3 +35,3 @@ "use strict"; | ||
} | ||
return to.concat(ar || from); | ||
return to.concat(ar || Array.prototype.slice.call(from)); | ||
}; | ||
@@ -51,2 +51,3 @@ var __importDefault = (this && this.__importDefault) || function (mod) { | ||
var comment_1 = __importDefault(require("./comment")); | ||
var voidTags = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr']); | ||
function decode(val) { | ||
@@ -196,3 +197,3 @@ // clone string | ||
HTMLElement.prototype.quoteAttribute = function (attr) { | ||
if (attr === null) { | ||
if (attr == null) { | ||
return 'null'; | ||
@@ -251,2 +252,9 @@ } | ||
}); | ||
// Getter-only accessor `isVoidElement` on HTMLElement.prototype (no setter is defined). | |
// Evaluates to true when this element's `localName` is a member of the module-level | |
// `voidTags` Set, and false otherwise. | |
// NOTE(review): `voidTags` holds lower-case names and Set#has is case-sensitive, so this | |
// presumably relies on `localName` being lower-cased already - confirm against the getter. | |
Object.defineProperty(HTMLElement.prototype, "isVoidElement", { | |
get: function () { | |
return voidTags.has(this.localName); | |
}, | |
// not listed during property enumeration, but may be redefined or deleted later | |
enumerable: false, | |
configurable: true | |
}); | |
Object.defineProperty(HTMLElement.prototype, "rawText", { | ||
@@ -339,10 +347,4 @@ /** | ||
if (tag) { | ||
// const void_tags = new Set('area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'.split('|')); | ||
// const is_void = void_tags.has(tag); | ||
var is_void = /^(area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i.test(tag); | ||
var attrs = this.rawAttrs ? " " + this.rawAttrs : ''; | ||
if (is_void) { | ||
return "<" + tag + attrs + ">"; | ||
} | ||
return "<" + tag + attrs + ">" + this.innerHTML + "</" + tag + ">"; | ||
return this.isVoidElement ? "<" + tag + attrs + ">" : "<" + tag + attrs + ">" + this.innerHTML + "</" + tag + ">"; | ||
} | ||
@@ -497,59 +499,2 @@ return this.innerHTML; | ||
}); | ||
// let matcher: Matcher; | ||
// if (selector instanceof Matcher) { | ||
// matcher = selector; | ||
// matcher.reset(); | ||
// } else { | ||
// if (selector.includes(',')) { | ||
// const selectors = selector.split(','); | ||
// return Array.from(selectors.reduce((pre, cur) => { | ||
// const result = this.querySelectorAll(cur.trim()); | ||
// return result.reduce((p, c) => { | ||
// return p.add(c); | ||
// }, pre); | ||
// }, new Set<HTMLElement>())); | ||
// } | ||
// matcher = new Matcher(selector); | ||
// } | ||
// interface IStack { | ||
// 0: Node; // node | ||
// 1: number; // children | ||
// 2: boolean; // found flag | ||
// } | ||
// const stack = [] as IStack[]; | ||
// return this.childNodes.reduce((res, cur) => { | ||
// stack.push([cur, 0, false]); | ||
// while (stack.length) { | ||
// const state = arr_back(stack); // get last element | ||
// const el = state[0]; | ||
// if (state[1] === 0) { | ||
// // Seen for first time. | ||
// if (el.nodeType !== NodeType.ELEMENT_NODE) { | ||
// stack.pop(); | ||
// continue; | ||
// } | ||
// const html_el = el as HTMLElement; | ||
// state[2] = matcher.advance(html_el); | ||
// if (state[2]) { | ||
// if (matcher.matched) { | ||
// res.push(html_el); | ||
// res.push(...(html_el.querySelectorAll(selector))); | ||
// // no need to go further. | ||
// matcher.rewind(); | ||
// stack.pop(); | ||
// continue; | ||
// } | ||
// } | ||
// } | ||
// if (state[1] < el.childNodes.length) { | ||
// stack.push([el.childNodes[state[1]++], 0, false]); | ||
// } else { | ||
// if (state[2]) { | ||
// matcher.rewind(); | ||
// } | ||
// stack.pop(); | ||
// } | ||
// } | ||
// return res; | ||
// }, [] as HTMLElement[]); | ||
}; | ||
@@ -559,3 +504,3 @@ /** | ||
* @param {string} selector Simplified CSS selector | ||
* @return {HTMLElement} matching node | ||
* @return {(HTMLElement|null)} matching node | ||
*/ | ||
@@ -567,41 +512,42 @@ HTMLElement.prototype.querySelector = function (selector) { | ||
}); | ||
// let matcher: Matcher; | ||
// if (selector instanceof Matcher) { | ||
// matcher = selector; | ||
// matcher.reset(); | ||
// } else { | ||
// matcher = new Matcher(selector); | ||
// } | ||
// const stack = [] as { 0: Node; 1: 0 | 1; 2: boolean }[]; | ||
// for (const node of this.childNodes) { | ||
// stack.push([node, 0, false]); | ||
// while (stack.length) { | ||
// const state = arr_back(stack); | ||
// const el = state[0]; | ||
// if (state[1] === 0) { | ||
// // Seen for first time. | ||
// if (el.nodeType !== NodeType.ELEMENT_NODE) { | ||
// stack.pop(); | ||
// continue; | ||
// } | ||
// state[2] = matcher.advance(el as HTMLElement); | ||
// if (state[2]) { | ||
// if (matcher.matched) { | ||
// return el as HTMLElement; | ||
// } | ||
// } | ||
// } | ||
// if (state[1] < el.childNodes.length) { | ||
// stack.push([el.childNodes[state[1]++], 0, false]); | ||
// } else { | ||
// if (state[2]) { | ||
// matcher.rewind(); | ||
// } | ||
// stack.pop(); | ||
// } | ||
// } | ||
// } | ||
// return null; | ||
}; | ||
/** | |
* Find descendant elements by their tagName. | |
* | |
* Walks the subtree below this element iteratively (no recursion), visiting each | |
* element before its own children, and collects every element node that matches. | |
* The element the method is called on is never part of the result; only nodes | |
* reachable through `childNodes` are inspected. | |
* | |
* @param {string} tagName the tagName of the elements to select; it is upper-cased | |
*   before comparison, and the special value '*' selects every element | |
* @return {Array<HTMLElement>} matching elements in the order they were visited | |
*/ | |
HTMLElement.prototype.getElementsByTagName = function (tagName) { | |
// NOTE(review): the comparison below assumes `child.tagName` is already upper-case - confirm. | |
var upperCasedTagName = tagName.toUpperCase(); | |
// result list | |
var re = []; | |
// saved child indexes of the ancestors we still have to resume | |
var stack = []; | |
var currentNodeReference = this; | |
var index = 0; | |
// index turns to undefined once the stack is empty and the first condition occurs | |
// which happens once all relevant children are searched through | |
while (index !== undefined) { | |
var child = void 0; | |
// make it work with sparse arrays | |
do { | |
child = currentNodeReference.childNodes[index++]; | |
} while (index < currentNodeReference.childNodes.length && child === undefined); | |
// if the child does not exist we move on with the last provided index (which belongs to the parentNode) | |
if (child === undefined) { | |
// climb one level; popping an empty stack yields undefined, which ends the outer loop | |
currentNodeReference = currentNodeReference.parentNode; | |
index = stack.pop(); | |
continue; | |
} | |
// only element nodes are matched and descended into; other node types are skipped | |
if (child.nodeType === type_1.default.ELEMENT_NODE) { | |
// https://developer.mozilla.org/en-US/docs/Web/API/Element/getElementsByTagName#syntax | |
if (tagName === '*' || child.tagName === upperCasedTagName) | |
re.push(child); | |
// if children are existing push the current status to the stack and keep searching for elements in the level below | |
if (child.childNodes.length > 0) { | |
// index already points at the next sibling, so that is where we resume after the subtree | |
stack.push(index); | |
currentNodeReference = child; | |
index = 0; | |
} | |
} | |
} | |
return re; | |
}; | |
/** | ||
* traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null. | ||
@@ -724,3 +670,3 @@ * @param selector a DOMString containing a selector list | ||
/** | ||
* Get escaped (as-it) attributes | ||
* Get escaped (as-is) attributes | ||
* @return {Object} parsed attributes | ||
@@ -734,6 +680,10 @@ */ | ||
if (this.rawAttrs) { | ||
var re = /([a-z()#][a-z0-9-_:()#]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/gi; | ||
var re = /([a-zA-Z()#][a-zA-Z0-9-_:()#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?/g; | ||
var match = void 0; | ||
while ((match = re.exec(this.rawAttrs))) { | ||
attrs[match[1]] = match[2] || match[3] || match[4] || null; | ||
var key = match[1]; | ||
var val = match[2] || null; | ||
if (val && (val[0] === "'" || val[0] === "\"")) | ||
val = val.slice(1, val.length - 1); | ||
attrs[key] = val; | ||
} | ||
@@ -934,8 +884,4 @@ } | ||
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name | ||
var kMarkupPattern = /<!--[^]*?(?=-->)-->|<(\/?)([a-z][-.:0-9_a-z]*)\s*((?=[/>]*?)|(?:.*?[\s\d/'"])|(?:.*?[\w]))(\/?)>/gi; | ||
// <(?<tag>[^\s]*)(.*)>(.*)</\k<tag>> | ||
// <([a-z][-.:0-9_a-z]*)\s*\/> | ||
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?> | ||
// <(area|base|br|col|hr|img|input|link|meta|source)\s*(.*)\/?>|<(?<tag>[^\s]*)(.*)>(.*)</\k<tag>> | ||
var kAttributePattern = /(^|\s)(id|class)\s*=\s*("([^"]*)"|'([^']*)'|(\S+))/gi; | ||
var kMarkupPattern = /<!--[\s\S]*?-->|<(\/?)([a-zA-Z][-.:0-9_a-zA-Z]*)((?:\s+[^>]*?(?:(?:'[^']*')|(?:"[^"]*"))?)*)\s*(\/?)>/g; | ||
var kAttributePattern = /(?:^|\s)(id|class)\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+)/gi; | ||
var kSelfClosingElements = { | ||
@@ -1027,4 +973,4 @@ area: true, | ||
var element_names = Object.keys(elements); | ||
var kBlockTextElements = element_names.map(function (it) { return new RegExp(it, 'i'); }); | ||
var kIgnoreElements = element_names.filter(function (it) { return elements[it]; }).map(function (it) { return new RegExp(it, 'i'); }); | ||
var kBlockTextElements = element_names.map(function (it) { return new RegExp("^" + it + "$", 'i'); }); | ||
var kIgnoreElements = element_names.filter(function (it) { return elements[it]; }).map(function (it) { return new RegExp("^" + it + "$", 'i'); }); | ||
function element_should_be_ignore(tag) { | ||
@@ -1045,10 +991,15 @@ return kIgnoreElements.some(function (it) { return it.test(tag); }); | ||
data = "<" + frameflag + ">" + data + "</" + frameflag + ">"; | ||
var lowerCaseTagName = options.lowerCaseTagName; | ||
var dataEndPos = data.length - (frameflag.length + 2); | ||
var frameFlagOffset = frameflag.length + 2; | ||
while ((match = kMarkupPattern.exec(data))) { | ||
var tagStartPos = kMarkupPattern.lastIndex - match[0].length; | ||
// Note: Object destructuring here consistently tests as higher performance than array destructuring | ||
// eslint-disable-next-line prefer-const | ||
var matchText = match[0], leadingSlash = match[1], tagName = match[2], attributes = match[3], closingSlash = match[4]; | ||
var matchLength = matchText.length; | ||
var tagStartPos = kMarkupPattern.lastIndex - matchLength; | ||
var tagEndPos = kMarkupPattern.lastIndex; | ||
// Add TextNode if content | ||
if (lastTextPos > -1) { | ||
if (lastTextPos + match[0].length < tagEndPos) { | ||
if (lastTextPos + matchLength < tagEndPos) { | ||
var text = data.substring(lastTextPos, tagStartPos); | ||
@@ -1061,6 +1012,6 @@ currentParent.appendChild(new text_1.default(text, currentParent, createRange(lastTextPos, tagStartPos))); | ||
// Skip frameflag node | ||
if (match[2] === frameflag) | ||
if (tagName === frameflag) | ||
continue; | ||
// Handle comments | ||
if (match[0][1] === '!') { | ||
if (matchText[1] === '!') { | ||
if (options.comment) { | ||
@@ -1075,14 +1026,16 @@ // Only keep what is in between <!-- and --> | ||
// Fix tag casing if necessary | ||
if (options.lowerCaseTagName) | ||
match[2] = match[2].toLowerCase(); | ||
if (lowerCaseTagName) | ||
tagName = tagName.toLowerCase(); | ||
// Handle opening tags (ie. <this> not </that>) | ||
if (!match[1]) { | ||
if (!leadingSlash) { | ||
/* Populate attributes */ | ||
var attrs = {}; | ||
for (var attMatch = void 0; (attMatch = kAttributePattern.exec(match[3]));) { | ||
attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6]; | ||
for (var attMatch = void 0; (attMatch = kAttributePattern.exec(attributes));) { | ||
var key = attMatch[1], val = attMatch[2]; | ||
var isQuoted = val[0] === "'" || val[0] === "\""; | ||
attrs[key.toLowerCase()] = isQuoted ? val.slice(1, val.length - 1) : val; | ||
} | ||
var tagName = currentParent.rawTagName; | ||
if (!match[4] && kElementsClosedByOpening[tagName]) { | ||
if (kElementsClosedByOpening[tagName][match[2]]) { | ||
var parentTagName = currentParent.rawTagName; | ||
if (!closingSlash && kElementsClosedByOpening[parentTagName]) { | ||
if (kElementsClosedByOpening[parentTagName][tagName]) { | ||
stack.pop(); | ||
@@ -1093,3 +1046,3 @@ currentParent = (0, back_1.default)(stack); | ||
// Prevent nested A tags by terminating the last A and starting a new one : see issue #144 | ||
if (match[2] === 'a' || match[2] === 'A') { | ||
if (tagName === 'a' || tagName === 'A') { | ||
if (noNestedTagIndex !== undefined) { | ||
@@ -1102,15 +1055,15 @@ stack.splice(noNestedTagIndex); | ||
var tagEndPos_1 = kMarkupPattern.lastIndex; | ||
var tagStartPos_1 = tagEndPos_1 - match[0].length; | ||
var tagStartPos_1 = tagEndPos_1 - matchLength; | ||
currentParent = currentParent.appendChild( | ||
// Initialize range (end position updated later for closed tags) | ||
new HTMLElement(match[2], attrs, match[3], null, createRange(tagStartPos_1, tagEndPos_1))); | ||
new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos_1, tagEndPos_1))); | ||
stack.push(currentParent); | ||
if (is_block_text_element(match[2])) { | ||
if (is_block_text_element(tagName)) { | ||
// Find closing tag | ||
var closeMarkup = "</" + match[2] + ">"; | ||
var closeIndex = options.lowerCaseTagName | ||
var closeMarkup = "</" + tagName + ">"; | ||
var closeIndex = lowerCaseTagName | ||
? data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex) | ||
: data.indexOf(closeMarkup, kMarkupPattern.lastIndex); | ||
var textEndPos = closeIndex === -1 ? dataEndPos : closeIndex; | ||
if (element_should_be_ignore(match[2])) { | ||
if (element_should_be_ignore(tagName)) { | ||
var text = data.substring(tagEndPos_1, textEndPos); | ||
@@ -1127,3 +1080,3 @@ if (text.length > 0 && /\S/.test(text)) { | ||
// Cause to be treated as self-closing, because no close found | ||
match[1] = 'true'; | ||
leadingSlash = '/'; | ||
} | ||
@@ -1133,7 +1086,7 @@ } | ||
// Handle closing tags or self-closed elements (ie </tag> or <br>) | ||
if (match[1] || match[4] || kSelfClosingElements[match[2]]) { | ||
if (leadingSlash || closingSlash || kSelfClosingElements[tagName]) { | ||
while (true) { | ||
if (match[2] === 'a' || match[2] === 'A') | ||
if (tagName === 'a' || tagName === 'A') | ||
noNestedTagIndex = undefined; | ||
if (currentParent.rawTagName === match[2]) { | ||
if (currentParent.rawTagName === tagName) { | ||
// Update range end for closed tag | ||
@@ -1146,6 +1099,6 @@ currentParent.range[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1]; | ||
else { | ||
var tagName = currentParent.tagName; | ||
var parentTagName = currentParent.tagName; | ||
// Trying to close current tag, and move on | ||
if (kElementsClosedByClosing[tagName]) { | ||
if (kElementsClosedByClosing[tagName][match[2]]) { | ||
if (kElementsClosedByClosing[parentTagName]) { | ||
if (kElementsClosedByClosing[parentTagName][tagName]) { | ||
stack.pop(); | ||
@@ -1152,0 +1105,0 @@ currentParent = (0, back_1.default)(stack); |
{ | ||
"name": "node-html-parser", | ||
"version": "4.1.5", | ||
"version": "5.0.0", | ||
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.", | ||
"main": "dist/index.js", | ||
"module": "dist/esm/index.js", | ||
"types": "dist/index.d.ts", | ||
"scripts": { | ||
"test": "mocha", | ||
"compile": "tsc", | ||
"build": "npm run lint && npm run clean && npm run compile:cjs && npm run compile:amd", | ||
"compile:cjs": "tsc -m commonjs", | ||
"compile:amd": "tsc -t es5 -m amd -d false --outFile ./dist/main.js", | ||
"lint": "eslint ./src/*.ts ./src/**/*.ts", | ||
"clean": "del-cli ./dist/", | ||
"ts:cjs": "tsc -m commonjs", | ||
"ts:amd": "tsc -t es5 -m amd -d false --outFile ./dist/main.js", | ||
"ts:esm": "tsc -t es2019 -m esnext -d false --outDir ./dist/esm/", | ||
"build": "npm run lint && npm run clean && npm run ts:cjs && npm run ts:amd && npm run ts:esm", | ||
"dev": "tsc -w & mocha -w ./test/*.js", | ||
"pretest": "tsc -m commonjs", | ||
"release": "yarn build && np", | ||
"prepare": "npm run build" | ||
"---------------": "", | ||
"test": "mocha ./test/tests/**/*.js", | ||
"test:src": "cross-env TEST_TARGET=src mocha ./test/tests", | ||
"test:dist": "cross-env TEST_TARGET=dist mocha ./test/tests", | ||
"benchmark": "node ./test/benchmark/compare.mjs", | ||
"--------------- ": "", | ||
"clean": "npx rimraf ./dist/", | ||
"clean:global": "yarn run clean && npx rimraf yarn.lock test/yarn.lock test/node_modules node_modules", | ||
"reset": "yarn run clean:global && yarn install && yarn build", | ||
"--------------- ": "", | ||
"posttest": "yarn run benchmark", | ||
"prepare": "cd test && yarn install" | ||
}, | ||
@@ -27,5 +32,13 @@ "keywords": [ | ||
], | ||
"files": [ | ||
"dist", | ||
"esm", | ||
"README.md", | ||
"LICENSE", | ||
"CHANGELOG.md" | ||
], | ||
"author": "Xiaoyi Shi <ashi009@gmail.com>", | ||
"contributors": [ | ||
"taoqf<tao_qiufeng@126.com>" | ||
"taoqf <tao_qiufeng@126.com>", | ||
"Ron S. <ron@nonara.com>" | ||
], | ||
@@ -49,4 +62,4 @@ "license": "MIT", | ||
"cheerio": "^1.0.0-rc.5", | ||
"del-cli": "latest", | ||
"eslint": "latest", | ||
"rimraf": "^3.0.2", | ||
"eslint": "^7.32.0", | ||
"eslint-config-prettier": "latest", | ||
@@ -64,4 +77,7 @@ "eslint-plugin-import": "latest", | ||
"spec": "latest", | ||
"standard-version": "^9.3.1", | ||
"travis-cov": "latest", | ||
"typescript": "next" | ||
"ts-node": "^10.2.1", | ||
"typescript": "latest", | ||
"cross-env": "^7.0.3" | ||
}, | ||
@@ -90,3 +106,7 @@ "config": { | ||
"homepage": "https://github.com/taoqf/node-fast-html-parser", | ||
"sideEffects": false | ||
"sideEffects": false, | ||
"exports": { | ||
"require": "./dist/index.js", | ||
"import": "./esm/index.js" | ||
} | ||
} |
@@ -1,2 +0,2 @@ | ||
# Fast HTML Parser [![NPM version](https://badge.fury.io/js/node-html-parser.png)](http://badge.fury.io/js/node-html-parser) [![Build Status](https://travis-ci.org/taoqf/node-html-parser.svg?branch=master)](https://travis-ci.org/taoqf/node-html-parser) | ||
# Fast HTML Parser [![NPM version](https://badge.fury.io/js/node-html-parser.png)](http://badge.fury.io/js/node-html-parser) [![Build Status](https://img.shields.io/endpoint.svg?url=https%3A%2F%2Factions-badge.atrox.dev%2Ftaoqf%2Fnode-html-parser%2Fbadge%3Fref%3Dmaster&style=flat)](https://actions-badge.atrox.dev/taoqf/node-html-parser/goto?ref=master) | ||
@@ -81,3 +81,3 @@ Fast HTML Parser is a _very fast_ HTML parser. Which will generate a simplified | ||
lowerCaseTagName: false, // convert tag name to lower case (hurt performance heavily) | ||
comment: false // retrieve comments (hurt performance slightly) | ||
comment: false, // retrieve comments (hurt performance slightly) | ||
blockTextElements: { | ||
@@ -116,2 +116,8 @@ script: true, // keep text content when parsing | ||
### HTMLElement#getElementsByTagName(tagName) | ||
Get all elements with the specified tagName. | ||
Note: pass `*` as the tagName to match all elements. | |
### HTMLElement#closest(selector) | ||
@@ -204,3 +210,3 @@ | ||
Get escpaed (as-it) text value of current node and its children. May have | ||
Get escaped (as-is) text value of current node and its children. May have | |
`&` in it. (fast) | ||
@@ -258,2 +264,2 @@ | ||
Corresponding source code start and end indexes (ie [ 0, 40 ]) | ||
Corresponding source code start and end indexes (ie [ 0, 40 ]) |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
262
140218
27
27
3351