lezer-html
Comparing version 0.13.2 to 0.13.3
@@ -0,1 +1,7 @@
## 0.13.3 (2021-02-17)
### Bug fixes
Optimize the tokenizer by using a context tracker.
## 0.13.2 (2021-01-22)
@@ -2,0 +8,0 @@
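The 0.13.3 entry above refers to a change in the hand-written tokenizers: instead of scanning backwards over the input on every token (`stack.startOf` plus a regexp over re-read text, visible in the removed lines below), the grammar now registers a `ContextTracker` that maintains a stack of open element names as the parser shifts start tags and reduces `Element` nodes, so the tokenizers can simply consult `stack.context`. The sketch below illustrates that pattern in isolation; the term IDs and the `tagName` helper are simplified stand-ins, not the package's real ones (those are in the tokens.js diff further down).

```js
import {ContextTracker, ExternalTokenizer} from "lezer"

// Stand-in term IDs; the real ones are generated into parser.terms.js.
const StartTag = 1, missingCloseTag = 2, Element = 10

// One frame per open element, linked to the enclosing element.
function ElementContext(name, parent) {
  this.name = name
  this.parent = parent
}

export const elementContext = new ContextTracker({
  start: null,
  // Push a frame when a start-tag token is shifted...
  shift(context, term, input, stack) {
    return term == StartTag ? new ElementContext(tagName(input, stack.pos), context) : context
  },
  // ...and pop it again when the surrounding Element node is reduced.
  reduce(context, term) {
    return term == Element && context ? context.parent : context
  },
  hash() { return 0 }
})

// A tokenizer can now ask "which element am I inside?" via stack.context
// instead of re-reading earlier input on every call.
export const eofCloser = new ExternalTokenizer((input, token, stack) => {
  // End of input with an element still open: emit a zero-length close token.
  if (input.get(token.start) < 0 && stack.context) token.accept(missingCloseTag, token.start)
}, {contextual: true})

// Simplified stand-in for the package's cached tagNameAfter helper.
function tagName(input, pos) {
  let name = ""
  for (let ch; (ch = input.get(pos)) > 32 && ch != 47 && ch != 62; pos++) // stop at space, '/', '>'
    name += String.fromCharCode(ch)
  return name.toLowerCase()
}
```

For `stack.context` to be populated, the generated parser has to be built with the tracker, which is what the new `context: elementContext` entries in the parser-configuration hunks below do.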
@@ -1,6 +0,5 @@
import { ExternalTokenizer, Parser, NodeProp } from 'lezer';
import { ContextTracker, ExternalTokenizer, Parser, NodeProp } from 'lezer';
// This file was generated by lezer-generator. You probably shouldn't edit it.
const
StartTag = 1,
const StartTag = 1,
StartCloseTag = 2,
@@ -14,3 +13,2 @@ MismatchedStartCloseTag = 3,
OpenTag = 11,
SelfClosingTag = 20,
RawText = 25,
@@ -65,56 +63,65 @@ Dialect_noMatch = 0;
let cachedName = null, cachedInput = null, cachedPos = 0;
function tagNameAfter(input, pos) {
if (cachedPos == pos && cachedInput == input) return cachedName
let next = input.get(pos);
while (isSpace(next)) next = input.get(++pos);
let start = pos;
while (nameChar(next)) next = input.get(++pos);
// Undefined to signal there's a <? or <!, null for just missing
cachedInput = input; cachedPos = pos;
return cachedName = pos > start ? input.read(start, pos).toLowerCase() : next == question || next == bang ? undefined : null
}
const lessThan = 60, greaterThan = 62, slash = 47, question = 63, bang = 33;
const tagStartExpr = /^<\s*([\.\-\:\w\xa1-\uffff]+)/;
function ElementContext(name, parent) {
this.name = name;
this.parent = parent;
this.hash = parent ? parent.hash : 0;
for (let i = 0; i < name.length; i++) this.hash += (this.hash << 4) + name.charCodeAt(i) + (name.charCodeAt(i) << 8);
}
let elementQuery = [Element], openAt = 0;
const elementContext = new ContextTracker({
start: null,
shift(context, term, input, stack) {
return term == StartTag ? new ElementContext(tagNameAfter(input, stack.pos) || "", context) : context
},
reduce(context, term) {
return term == Element && context ? context.parent : context
},
reuse(context, node, input, stack) {
let type = node.type.id;
return type == StartTag || type == OpenTag
? new ElementContext(tagNameAfter(input, stack.pos - node.length + 1) || "", context) : context
},
// Always returns 0 to avoid interfering with reuse. May not be safe
// but I haven't found a counterexample yet.
hash() { return 0 }
});
function parentElement(input, stack, pos, len) {
openAt = stack.startOf(elementQuery, pos);
if (openAt == null) return null
let match = tagStartExpr.exec(input.read(openAt, openAt + len + 10));
return match ? match[1].toLowerCase() : ""
}
const tagStart = new ExternalTokenizer((input, token, stack) => {
let pos = token.start, first = input.get(pos);
// End of file, just close anything
if (first < 0) {
let contextStart = stack.startOf(elementQuery);
let match = contextStart == null ? null : tagStartExpr.exec(input.read(contextStart, contextStart + 30));
if (match && implicitlyClosed[match[1].toLowerCase()]) token.accept(missingCloseTag, token.start);
}
let pos = token.start, first = input.get(pos), close;
// End of file, close any open tags
if (first < 0 && stack.context) token.accept(missingCloseTag, token.start);
if (first != lessThan) return
pos++;
let close = false, tokEnd = pos;
for (let next; next = input.get(pos);) {
if (next == slash && !close) { close = true; pos++; tokEnd = pos; }
else if (next == question || next == bang) return
else if (isSpace(next)) pos++;
else break
}
let nameStart = pos;
while (nameChar(input.get(pos))) pos++;
if (pos == nameStart) return token.accept(close ? IncompleteCloseTag : StartTag, tokEnd)
if (close = (input.get(pos) == slash)) pos++;
let name = tagNameAfter(input, pos);
if (name === undefined) return
if (!name) return token.accept(close ? IncompleteCloseTag : StartTag, pos)
let name = input.read(nameStart, pos).toLowerCase();
let parent = parentElement(input, stack, stack.pos, name.length);
let parent = stack.context ? stack.context.name : null;
if (close) {
if (name == parent) return token.accept(StartCloseTag, tokEnd)
if (implicitlyClosed[parent]) return token.accept(missingCloseTag, token.start)
if (stack.dialectEnabled(Dialect_noMatch)) return token.accept(StartCloseTag, tokEnd)
while (parent != null) {
parent = parentElement(input, stack, openAt - 1, name.length);
if (parent == name) return
}
token.accept(MismatchedStartCloseTag, tokEnd);
if (name == parent) return token.accept(StartCloseTag, pos)
if (parent && implicitlyClosed[parent]) return token.accept(missingCloseTag, token.start)
if (stack.dialectEnabled(Dialect_noMatch)) return token.accept(StartCloseTag, pos)
for (let cx = stack.context; cx; cx = cx.parent) if (cx.name == name) return
token.accept(MismatchedStartCloseTag, pos);
} else {
if (parent && closeOnOpen[parent] && closeOnOpen[parent][name])
return token.accept(missingCloseTag, token.start)
token.accept(StartTag, tokEnd);
if (parent && closeOnOpen[parent] && closeOnOpen[parent][name]) token.accept(missingCloseTag, token.start);
else token.accept(StartTag, pos);
}
}, {contextual: true});
});
const tagQuery = [OpenTag, SelfClosingTag];
const selfClosed = new ExternalTokenizer((input, token, stack) => {
@@ -128,8 +135,6 @@ let next = input.get(token.start), end = token.start + 1;
}
let from = stack.startOf(tagQuery);
let match = from == null ? null : tagStartExpr.exec(input.read(from, token.start));
if (match && selfClosers[match[1].toLowerCase()]) token.accept(SelfCloseEndTag, end);
}, {contextual: true});
if (stack.context && selfClosers[stack.context.name]) token.accept(SelfCloseEndTag, end);
});
const commentContent$1 = new ExternalTokenizer((input, token, stack) => {
const commentContent$1 = new ExternalTokenizer((input, token) => {
let pos = token.start, endPos = 0;
@@ -218,2 +223,3 @@ for (;;) {
maxTerm: 44,
context: elementContext,
nodeProps: [
@@ -220,0 +226,0 @@ [NodeProp.closedBy, -2,1,2,"EndTag SelfCloseEndTag",11,"CloseTag"],
{
"name": "lezer-html",
"version": "0.13.2",
"version": "0.13.3",
"description": "lezer-based HTML grammar",
@@ -17,3 +17,3 @@ "main": "dist/index.cjs",
"lezer-javascript": "^0.13.0",
"lezer-generator": "^0.13.0",
"lezer-generator": "^0.13.3",
"mocha": "^8.1.3",
@@ -24,3 +24,3 @@ "rollup": "^2.27.1",
"dependencies": {
"lezer": "^0.13.0"
"lezer": "^0.13.2"
},
@@ -27,0 +27,0 @@ "repository": {
// This file was generated by lezer-generator. You probably shouldn't edit it.
import {Parser} from "lezer"
import {tagStart, selfClosed, commentContent} from "./tokens.js"
import {tagStart, selfClosed, commentContent, elementContext} from "./tokens.js"
import {elementContent} from "./content.js"
@@ -13,2 +13,3 @@ import {NodeProp} from "lezer"
maxTerm: 44,
context: elementContext,
nodeProps: [
@@ -15,0 +16,0 @@ [NodeProp.closedBy, -2,1,2,"EndTag SelfCloseEndTag",11,"CloseTag"],
/* Hand-written tokenizers for HTML. */
import {ExternalTokenizer} from "lezer"
import {ExternalTokenizer, ContextTracker} from "lezer"
import {StartTag, StartCloseTag, MismatchedStartCloseTag, missingCloseTag,
SelfCloseEndTag, IncompleteCloseTag, Element, OpenTag, SelfClosingTag,
SelfCloseEndTag, IncompleteCloseTag, Element, OpenTag,
Dialect_noMatch, commentContent as cmntContent} from "./parser.terms.js"
@@ -52,56 +52,65 @@
let cachedName = null, cachedInput = null, cachedPos = 0
function tagNameAfter(input, pos) {
if (cachedPos == pos && cachedInput == input) return cachedName
let next = input.get(pos)
while (isSpace(next)) next = input.get(++pos)
let start = pos
while (nameChar(next)) next = input.get(++pos)
// Undefined to signal there's a <? or <!, null for just missing
cachedInput = input; cachedPos = pos
return cachedName = pos > start ? input.read(start, pos).toLowerCase() : next == question || next == bang ? undefined : null
}
const lessThan = 60, greaterThan = 62, slash = 47, question = 63, bang = 33
const tagStartExpr = /^<\s*([\.\-\:\w\xa1-\uffff]+)/
function ElementContext(name, parent) {
this.name = name
this.parent = parent
this.hash = parent ? parent.hash : 0
for (let i = 0; i < name.length; i++) this.hash += (this.hash << 4) + name.charCodeAt(i) + (name.charCodeAt(i) << 8)
}
let elementQuery = [Element], openAt = 0
export const elementContext = new ContextTracker({
start: null,
shift(context, term, input, stack) {
return term == StartTag ? new ElementContext(tagNameAfter(input, stack.pos) || "", context) : context
},
reduce(context, term) {
return term == Element && context ? context.parent : context
},
reuse(context, node, input, stack) {
let type = node.type.id
return type == StartTag || type == OpenTag
? new ElementContext(tagNameAfter(input, stack.pos - node.length + 1) || "", context) : context
},
// Always returns 0 to avoid interfering with reuse. May not be safe
// but I haven't found a counterexample yet.
hash() { return 0 }
})
function parentElement(input, stack, pos, len) {
openAt = stack.startOf(elementQuery, pos)
if (openAt == null) return null
let match = tagStartExpr.exec(input.read(openAt, openAt + len + 10))
return match ? match[1].toLowerCase() : ""
}
export const tagStart = new ExternalTokenizer((input, token, stack) => {
let pos = token.start, first = input.get(pos)
// End of file, just close anything
if (first < 0) {
let contextStart = stack.startOf(elementQuery)
let match = contextStart == null ? null : tagStartExpr.exec(input.read(contextStart, contextStart + 30))
if (match && implicitlyClosed[match[1].toLowerCase()]) token.accept(missingCloseTag, token.start)
}
let pos = token.start, first = input.get(pos), close
// End of file, close any open tags
if (first < 0 && stack.context) token.accept(missingCloseTag, token.start)
if (first != lessThan) return
pos++
let close = false, tokEnd = pos
for (let next; next = input.get(pos);) {
if (next == slash && !close) { close = true; pos++; tokEnd = pos }
else if (next == question || next == bang) return
else if (isSpace(next)) pos++
else break
}
let nameStart = pos
while (nameChar(input.get(pos))) pos++
if (pos == nameStart) return token.accept(close ? IncompleteCloseTag : StartTag, tokEnd)
if (close = (input.get(pos) == slash)) pos++
let name = tagNameAfter(input, pos)
if (name === undefined) return
if (!name) return token.accept(close ? IncompleteCloseTag : StartTag, pos)
let name = input.read(nameStart, pos).toLowerCase()
let parent = parentElement(input, stack, stack.pos, name.length)
let parent = stack.context ? stack.context.name : null
if (close) {
if (name == parent) return token.accept(StartCloseTag, tokEnd)
if (implicitlyClosed[parent]) return token.accept(missingCloseTag, token.start)
if (stack.dialectEnabled(Dialect_noMatch)) return token.accept(StartCloseTag, tokEnd)
while (parent != null) {
parent = parentElement(input, stack, openAt - 1, name.length)
if (parent == name) return
}
token.accept(MismatchedStartCloseTag, tokEnd)
if (name == parent) return token.accept(StartCloseTag, pos)
if (parent && implicitlyClosed[parent]) return token.accept(missingCloseTag, token.start)
if (stack.dialectEnabled(Dialect_noMatch)) return token.accept(StartCloseTag, pos)
for (let cx = stack.context; cx; cx = cx.parent) if (cx.name == name) return
token.accept(MismatchedStartCloseTag, pos)
} else {
if (parent && closeOnOpen[parent] && closeOnOpen[parent][name])
return token.accept(missingCloseTag, token.start)
token.accept(StartTag, tokEnd)
if (parent && closeOnOpen[parent] && closeOnOpen[parent][name]) token.accept(missingCloseTag, token.start)
else token.accept(StartTag, pos)
}
}, {contextual: true})
})
const tagQuery = [OpenTag, SelfClosingTag]
export const selfClosed = new ExternalTokenizer((input, token, stack) => {
@@ -115,8 +124,6 @@ let next = input.get(token.start), end = token.start + 1
}
let from = stack.startOf(tagQuery)
let match = from == null ? null : tagStartExpr.exec(input.read(from, token.start))
if (match && selfClosers[match[1].toLowerCase()]) token.accept(SelfCloseEndTag, end)
}, {contextual: true})
if (stack.context && selfClosers[stack.context.name]) token.accept(SelfCloseEndTag, end)
})
export const commentContent = new ExternalTokenizer((input, token, stack) => {
export const commentContent = new ExternalTokenizer((input, token) => {
let pos = token.start, endPos = 0
@@ -123,0 +130,0 @@ for (;;) {
@@ -39,3 +39,3 @@ # Doesn't parse VB as JS
Element(OpenTag(StartTag,TagName,EndTag),
Script(ExpressionStatement(null)),⚠),⚠))
Script(ExpressionStatement(null)))))
@@ -42,0 +42,0 @@ # Error in JS
@@ -118,3 +118,3 @@ # Regular tag
Document(Element(OpenTag(StartTag,TagName,EndTag),MismatchedCloseTag(StartCloseTag,TagName,EndTag),⚠))
Document(Element(OpenTag(StartTag,TagName,EndTag),MismatchedCloseTag(StartCloseTag,TagName,EndTag)))
@@ -127,3 +127,3 @@ # Unclosed tag
Document(Element(OpenTag(StartTag,TagName,EndTag),⚠))
Document(Element(OpenTag(StartTag,TagName,EndTag)))
@@ -283,4 +283,4 @@ # Ignore pseudo-xml self-closers
foo=bar
>hi<
/body
>hi</
body
>
@@ -287,0 +287,0 @@
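The expectation trees above are produced by parsing the test documents with the exported parser and serializing the resulting tree; the updated expectations drop several ⚠ error nodes, presumably because matching close tags against the tracked element context is more precise than the old backwards scan. A minimal sketch of how such a tree is obtained, assuming the package's exported `parser` object and lezer 0.13's string-accepting `parse` method:

```js
import {parser} from "lezer-html"

// Parse a fragment and print its syntax tree in the same
// Document(Element(OpenTag(StartTag,TagName,EndTag),...)) notation
// used by the expectations above.
let tree = parser.parse("<p>one<p>two")
console.log(tree.toString())
```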