node-html-better-parser
Advanced tools
Comparing version
@@ -171,2 +171,8 @@ export declare enum NodeType { | ||
/** | ||
* Query CSS selector implementation | ||
* @param matcher The matcher to use for selection | ||
* @param all Whether to return all matches or just the first one | ||
*/ | ||
private querySelectorImpl; | ||
/** | ||
* Query CSS Selector to find matching node. | ||
@@ -229,3 +235,3 @@ * @param {string} selector Simplified CSS selector | ||
export declare class Matcher { | ||
private matchers; | ||
private checkers; | ||
private nextMatch; | ||
@@ -235,12 +241,26 @@ /** | ||
* @param {string} selector | ||
* | ||
* @memberof Matcher | ||
*/ | ||
constructor(selector: string); | ||
/** | ||
* Parse complete CSS selector using regex to extract all parts | ||
*/ | ||
private parseCompleteSelector; | ||
/** | ||
* Parse attributes string like "[attr1][attr2=value]" into structured data | ||
*/ | ||
private parseAttributes; | ||
/** | ||
* Create a checker function from parsed selector data | ||
*/ | ||
private createCheckerFromParsed; | ||
/** | ||
* Create attribute checker function | ||
*/ | ||
private createAttributeChecker; | ||
/** | ||
* Trying to advance match pointer | ||
* @param {Node} el element to make the match | ||
* @param {HTMLElement} el element to make the match | ||
* @return {bool} true when pointer advanced. | ||
*/ | ||
advance(el: Node): boolean; | ||
advance(el: HTMLElement): boolean; | ||
/** | ||
@@ -256,10 +276,13 @@ * Rewind the match pointer | ||
/** | ||
* Rest match pointer. | ||
* @return {[type]} [description] | ||
* Reset match pointer. | ||
*/ | ||
reset(): void; | ||
/** | ||
* flush cache to free memory | ||
* Get current match level (for debugging) | ||
*/ | ||
flushCache(): void; | ||
get level(): number; | ||
/** | ||
* Clone this matcher with current state | ||
*/ | ||
clone(): Matcher; | ||
} | ||
@@ -266,0 +289,0 @@ /** |
@@ -348,34 +348,50 @@ "use strict"; | ||
querySelectorAll(selector) { | ||
if (!(selector instanceof Matcher)) { | ||
if (selector.includes(',')) { | ||
const selectors = selector.split(','); | ||
const results = new Set(selectors.map(selector => this.querySelectorAll(selector.trim())).flat()); | ||
return Array.from(results); | ||
let matcher; | ||
if (selector instanceof Matcher) { | ||
matcher = selector; | ||
matcher.reset(); | ||
return this.querySelectorImpl(selector, true); | ||
} | ||
else { | ||
const parts = selector.split(',').filter(part => part.trim()).map(part => part.trim()); | ||
const result = new Set(parts.map(part => this.querySelectorImpl(new Matcher(part), true)).flat()); | ||
return Array.from(result); | ||
} | ||
} | ||
querySelectorImpl(matcher, all) { | ||
function explore(node, currentMatcher, all) { | ||
if (debug) | ||
console.log('exploring:', node.toString().replace(/\s+/g, ' '), 'matcher level:', currentMatcher.level); | ||
const advanced = node.tagName ? currentMatcher.advance(node) : false; | ||
if (debug) | ||
console.log('advanced:', advanced, 'matched:', currentMatcher.matched); | ||
// If we look for a single node and we found it, return it | ||
if (!all && advanced && currentMatcher.matched) | ||
return node; | ||
// If we look for all matching nodes and advanced the matcher, we need to create a new matcher to explore the descendants | ||
// with the same level too | ||
else if (advanced && all) { | ||
const clonedMatcher = currentMatcher.clone(); | ||
clonedMatcher.rewind(); | ||
const childrenResults = node.children.map(child => explore(child, clonedMatcher.clone(), true)).flat(); | ||
if (currentMatcher.matched) | ||
return [node, ...childrenResults]; | ||
else | ||
return childrenResults.concat(...node.children.map(child => explore(child, currentMatcher.clone(), true)).flat()); | ||
} | ||
else | ||
return this.querySelectorAll(new Matcher(selector)); | ||
} | ||
const matcher = selector; | ||
const res = new Set(); | ||
const stack = []; | ||
this.childNodes.forEach((node) => stack.push(node)); | ||
while (stack.length > 0) { | ||
const node = stack.shift(); | ||
if (node.nodeType === NodeType.ELEMENT_NODE) { | ||
// If the node matches | ||
if (matcher.advance(node)) { | ||
if (matcher.matched) { | ||
// Add the matched node to the results | ||
res.add(node); | ||
// We keep looking for children | ||
matcher.rewind(); | ||
} | ||
// If we look for all nodes but we didn't advance the matcher, we need to explore the children | ||
else if (all) | ||
return node.children.map(child => explore(child, currentMatcher.clone(), true)).flat(); | ||
// If we look for a single node and didn't advance the matcher, find the target in the children | ||
else { | ||
for (const child of node.children) { | ||
const result = explore(child, currentMatcher.clone(), false); | ||
if (result) | ||
return result; | ||
} | ||
// Add the children nodes to the stack | ||
node.childNodes.forEach((childNode) => { | ||
stack.push(childNode); | ||
}); | ||
return null; | ||
} | ||
} | ||
return Array.from(res); | ||
; | ||
return all ? explore(this, matcher, true) : explore(this, matcher, false); | ||
} | ||
@@ -393,35 +409,13 @@ /** | ||
matcher.reset(); | ||
return this.querySelectorImpl(selector, false); | ||
} | ||
else { | ||
matcher = new Matcher(selector); | ||
} | ||
const stack = []; | ||
for (let i = 0; i < this.childNodes.length; i++) { | ||
stack.push([this.childNodes[i], 0, false]); | ||
while (stack.length) { | ||
const state = arr_back(stack); | ||
const el = state[0]; | ||
if (state[1] === 0) { | ||
// Seen for first time. | ||
if (el.nodeType !== NodeType.ELEMENT_NODE) { | ||
stack.pop(); | ||
continue; | ||
} | ||
if (state[2] = matcher.advance(el)) { | ||
if (matcher.matched) { | ||
return el; | ||
} | ||
} | ||
} | ||
if (state[1] < el.childNodes.length) { | ||
stack.push([el.childNodes[state[1]++], 0, false]); | ||
} | ||
else { | ||
if (state[2]) | ||
matcher.rewind(); | ||
stack.pop(); | ||
} | ||
const parts = selector.split(',').map(part => part.trim()).filter(part => part.length); | ||
for (const part of parts) { | ||
const result = this.querySelectorImpl(new Matcher(part), false); | ||
if (result) | ||
return result; | ||
} | ||
return null; | ||
} | ||
return null; | ||
} | ||
@@ -558,145 +552,4 @@ /** | ||
exports.HTMLElement = HTMLElement; | ||
// Removed old complex function cache system - now using simple composed functions in Matcher class | ||
/** | ||
* Cache to store generated match functions | ||
* @type {Object} | ||
*/ | ||
let pMatchFunctionCache = {}; | ||
/** | ||
* Function cache | ||
*/ | ||
const functionCache = { | ||
"f145": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
if (el.id != tagName.substr(1)) | ||
return false; | ||
for (let cls = classes, i = 0; i < cls.length; i++) | ||
if (el.classNames.indexOf(cls[i]) === -1) | ||
return false; | ||
return true; | ||
}, | ||
"f45": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
for (let cls = classes, i = 0; i < cls.length; i++) | ||
if (el.classNames.indexOf(cls[i]) === -1) | ||
return false; | ||
return true; | ||
}, | ||
"f15": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
if (el.id != tagName.substr(1)) | ||
return false; | ||
return true; | ||
}, | ||
"f1": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
if (el.id != tagName.substr(1)) | ||
return false; | ||
}, | ||
"f5": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
el = el || {}; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
return true; | ||
}, | ||
"f245": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
let attrs = el.attributes; | ||
for (let key in attrs) { | ||
const val = attrs[key]; | ||
if (key == attr_key && val == value) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
// for (let cls = classes, i = 0; i < cls.length; i++) {if (el.classNames.indexOf(cls[i]) === -1){ return false;}} | ||
// return true; | ||
}, | ||
"f25": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
let attrs = el.attributes; | ||
for (let key in attrs) { | ||
const val = attrs[key]; | ||
if (key == attr_key && val == value) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
//return true; | ||
}, | ||
"f2": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
let attrs = el.attributes; | ||
for (let key in attrs) { | ||
const val = attrs[key]; | ||
if (key == attr_key && val == value) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
}, | ||
"f345": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
if (el.tagName != tagName) | ||
return false; | ||
for (let cls = classes, i = 0; i < cls.length; i++) | ||
if (el.classNames.indexOf(cls[i]) === -1) | ||
return false; | ||
return true; | ||
}, | ||
"f35": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
if (el.tagName != tagName) | ||
return false; | ||
return true; | ||
}, | ||
"f3": function (el, tagName, classes, attr_key, value) { | ||
"use strict"; | ||
tagName = tagName || ""; | ||
classes = classes || []; | ||
attr_key = attr_key || ""; | ||
value = value || ""; | ||
if (el.tagName != tagName) | ||
return false; | ||
} | ||
}; | ||
/** | ||
* Matcher class to make CSS match | ||
@@ -710,68 +563,114 @@ * | ||
* @param {string} selector | ||
* | ||
* @memberof Matcher | ||
*/ | ||
constructor(selector) { | ||
this.checkers = []; | ||
this.nextMatch = 0; | ||
functionCache["f5"] = functionCache["f5"]; | ||
this.matchers = selector.split(' ').map((matcher) => { | ||
if (pMatchFunctionCache[matcher]) | ||
return pMatchFunctionCache[matcher]; | ||
const parts = matcher.split('.'); | ||
const tagName = parts[0]; | ||
const classes = parts.slice(1).sort(); | ||
let source = '"use strict";'; | ||
let function_name = 'f'; | ||
let attr_key = ""; | ||
let value = ""; | ||
if (tagName && tagName != '*') { | ||
let matcher; | ||
if (tagName[0] == '#') { | ||
source += 'if (el.id != ' + JSON.stringify(tagName.substr(1)) + ') return false;'; //1 | ||
function_name += '1'; | ||
// @ts-ignore | ||
} | ||
else if (matcher = tagName.match(/^\[\s*(\S+)\s*(=|!=)\s*((((["'])([^\6]*)\6))|(\S*?))\]\s*/)) { | ||
attr_key = matcher[1]; | ||
let method = matcher[2]; | ||
if (method !== '=' && method !== '!=') { | ||
throw new Error('Selector not supported, Expect [key${op}value].op must be =,!='); | ||
} | ||
if (method === '=') { | ||
method = '=='; | ||
} | ||
value = matcher[7] || matcher[8]; | ||
source += `let attrs = el.attributes;for (let key in attrs){const val = attrs[key]; if (key == "${attr_key}" && val == "${value}"){return true;}} return false;`; //2 | ||
function_name += '2'; | ||
} | ||
else { | ||
source += 'if (el.tagName != ' + JSON.stringify(tagName) + ') return false;'; //3 | ||
function_name += '3'; | ||
} | ||
this.checkers = selector ? this.parseCompleteSelector(selector) : []; | ||
} | ||
/** | ||
* Parse complete CSS selector using regex to extract all parts | ||
*/ | ||
parseCompleteSelector(selector) { | ||
// Regex to match complete selector parts (tag#id.class1.class2[attr1][attr2]) | ||
// This captures each descendant selector part as a whole | ||
const selectorPartRegex = /(?:^|\s+)([a-zA-Z*][\w-]*)?(?:#([\w-]+))?(?:\.([\w-]+(?:\.[\w-]+)*))?(\[(?:[^\]]+)\](?:\[(?:[^\]]+)\])*)?/g; | ||
const parsedSelectors = []; | ||
let match; | ||
while ((match = selectorPartRegex.exec(selector)) !== null) { | ||
if (match[0].trim()) { // Skip empty matches | ||
parsedSelectors.push({ | ||
tag: match[1] || '', | ||
id: match[2] || '', | ||
classes: match[3] ? match[3].split('.') : [], | ||
attrs: this.parseAttributes(match[4] || '') | ||
}); | ||
} | ||
if (classes.length > 0) { | ||
source += 'for (let cls = ' + JSON.stringify(classes) + ', i = 0; i < cls.length; i++) if (el.classNames.indexOf(cls[i]) === -1) return false;'; //4 | ||
function_name += '4'; | ||
} | ||
return parsedSelectors.map(part => this.createCheckerFromParsed(part)); | ||
} | ||
/** | ||
* Parse attributes string like "[attr1][attr2=value]" into structured data | ||
*/ | ||
parseAttributes(attrsString) { | ||
if (!attrsString) | ||
return []; | ||
const attrs = []; | ||
const attrRegex = /\[([^\s~|^$*!=]+)(?:\s*(=|!=|\^=|\$=|\*=|\|=|~=)\s*(?:["']?([^"'\]]*)["']?)?)?\]/g; | ||
let match; | ||
while ((match = attrRegex.exec(attrsString)) !== null) { | ||
attrs.push({ | ||
key: match[1], | ||
op: match[2] || '', | ||
value: match[3] || '' | ||
}); | ||
} | ||
return attrs; | ||
} | ||
/** | ||
* Create a checker function from parsed selector data | ||
*/ | ||
createCheckerFromParsed(parsed) { | ||
// Create array of check functions - only for what's needed | ||
const checks = []; | ||
if (parsed.tag && parsed.tag !== '*') | ||
checks.push(element => element.tagName === parsed.tag); | ||
if (parsed.id) | ||
checks.push(element => element.id === parsed.id); | ||
if (parsed.classes.length > 0) { | ||
for (const cls of parsed.classes) { | ||
checks.push(element => element.classNames.includes(cls)); | ||
} | ||
source += 'return true;'; //5 | ||
function_name += '5'; | ||
let obj = { | ||
func: functionCache[function_name], | ||
tagName: tagName || "", | ||
classes: classes || "", | ||
attr_key: attr_key || "", | ||
value: value || "" | ||
}; | ||
source = source || ""; | ||
return pMatchFunctionCache[matcher] = obj; | ||
}); | ||
} | ||
if (parsed.attrs.length > 0) { | ||
const attrChecks = parsed.attrs.map(attr => this.createAttributeChecker(attr.key, attr.op, attr.value)); | ||
checks.push(element => attrChecks.every(check => check(element))); | ||
} | ||
// Return a function that checks all conditions | ||
return element => checks.every(check => check(element)); | ||
} | ||
/** | ||
* Create attribute checker function | ||
*/ | ||
createAttributeChecker(attrKey, operator, value) { | ||
switch (operator) { | ||
case '=': | ||
return element => element.attributes[attrKey] === value; | ||
case '!=': | ||
return element => element.attributes[attrKey] !== value; | ||
case '^=': | ||
return element => { | ||
const attrValue = element.attributes[attrKey]; | ||
return attrValue !== undefined && attrValue.startsWith(value); | ||
}; | ||
case '$=': | ||
return element => { | ||
const attrValue = element.attributes[attrKey]; | ||
return attrValue !== undefined && attrValue.endsWith(value); | ||
}; | ||
case '*=': | ||
return element => { | ||
const attrValue = element.attributes[attrKey]; | ||
return attrValue !== undefined && attrValue.includes(value); | ||
}; | ||
case '|=': | ||
return element => { | ||
const attrValue = element.attributes[attrKey]; | ||
return attrValue !== undefined && (attrValue === value || attrValue.startsWith(value + '-')); | ||
}; | ||
case '~=': | ||
return element => { | ||
const attrValue = element.attributes[attrKey]; | ||
return attrValue !== undefined && attrValue.split(/\s+/).includes(value); | ||
}; | ||
default: | ||
return element => element.attributes[attrKey] !== undefined; | ||
} | ||
} | ||
/** | ||
* Trying to advance match pointer | ||
* @param {Node} el element to make the match | ||
* @param {HTMLElement} el element to make the match | ||
* @return {bool} true when pointer advanced. | ||
*/ | ||
advance(el) { | ||
if (this.nextMatch < this.matchers.length && | ||
this.matchers[this.nextMatch].func(el, this.matchers[this.nextMatch].tagName, this.matchers[this.nextMatch].classes, this.matchers[this.nextMatch].attr_key, this.matchers[this.nextMatch].value)) { | ||
if (this.nextMatch < this.checkers.length && this.checkers[this.nextMatch](el)) { | ||
this.nextMatch++; | ||
@@ -793,7 +692,6 @@ return true; | ||
get matched() { | ||
return this.nextMatch == this.matchers.length; | ||
return this.nextMatch === this.checkers.length; | ||
} | ||
/** | ||
* Rest match pointer. | ||
* @return {[type]} [description] | ||
* Reset match pointer. | ||
*/ | ||
@@ -804,7 +702,16 @@ reset() { | ||
/** | ||
* flush cache to free memory | ||
* Get current match level (for debugging) | ||
*/ | ||
flushCache() { | ||
pMatchFunctionCache = {}; | ||
get level() { | ||
return this.nextMatch; | ||
} | ||
/** | ||
* Clone this matcher with current state | ||
*/ | ||
clone() { | ||
const cloned = new Matcher(''); // Empty selector, we'll copy the checkers | ||
cloned.checkers = this.checkers; | ||
cloned.nextMatch = this.nextMatch; | ||
return cloned; | ||
} | ||
} | ||
@@ -830,12 +737,12 @@ exports.Matcher = Matcher; | ||
li: { li: true }, | ||
p: { p: true, div: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
p: { p: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
b: { div: true }, | ||
td: { td: true, th: true }, | ||
th: { td: true, th: true }, | ||
h1: { p: true, div: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h2: { p: true, div: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h3: { p: true, div: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h4: { p: true, div: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h5: { p: true, div: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h6: { p: true, div: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h1: { p: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h2: { p: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h3: { p: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h4: { p: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h5: { p: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
h6: { p: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true }, | ||
// Table elements | ||
@@ -850,12 +757,12 @@ tr: { tr: true, thead: true, tbody: true, tfoot: true }, | ||
// Section elements | ||
section: { section: true, div: true }, | ||
article: { article: true, div: true }, | ||
aside: { aside: true, div: true }, | ||
nav: { nav: true, div: true }, | ||
section: { section: true }, | ||
article: { article: true }, | ||
aside: { aside: true }, | ||
nav: { nav: true }, | ||
// Form elements | ||
form: { form: true }, | ||
// Header elements | ||
header: { header: true, div: true }, | ||
footer: { footer: true, div: true }, | ||
main: { main: true, div: true } | ||
header: { header: true }, | ||
footer: { footer: true }, | ||
main: { main: true } | ||
}; | ||
@@ -862,0 +769,0 @@ const kBlockTextElements = { |
{ | ||
"name": "node-html-better-parser", | ||
"version": "1.5.0", | ||
"version": "1.5.1", | ||
"description": "A very fast HTML parser, generating a simplified DOM, with basic element query support.", | ||
@@ -5,0 +5,0 @@ "main": "dist/src/index.js", |
54583
-2.55%1281
-5.18%