@bonniernews/atlas-html-stream
Advanced tools
Comparing version 1.2.2 to 2.0.0
{ | ||
"name": "@bonniernews/atlas-html-stream", | ||
"version": "1.2.2", | ||
"version": "2.0.0", | ||
"description": "A super fast html-parser stream that outputs tag, text and closing nodes.", | ||
"scripts": { | ||
"test-server": "mocha --colors --watch --recursive", | ||
"test": "mocha --colors --recursive" | ||
"test": "mocha --colors --recursive", | ||
"posttest": "eslint . --cache" | ||
}, | ||
@@ -26,4 +27,5 @@ "main": "./src/HtmlParser.js", | ||
"devDependencies": { | ||
"chai": "^4.1.2", | ||
"mocha": "^5.2.0" | ||
"chai": "^4.3.4", | ||
"eslint": "^8.4.1", | ||
"mocha": "^9.1.3" | ||
}, | ||
@@ -33,5 +35,2 @@ "engines": { | ||
}, | ||
"dependencies": { | ||
"@bonniernews/atlas-seq-matcher": "^1.0.3" | ||
}, | ||
"homepage": "https://github.com/BonnierNews/atlas-html-stream#readme", | ||
@@ -42,3 +41,6 @@ "bugs": "https://github.com/BonnierNews/atlas-html-stream/issues", | ||
"url": "https://github.com/BonnierNews/atlas-html-stream.git" | ||
} | ||
}, | ||
"files": [ | ||
"src" | ||
] | ||
} |
@@ -1,3 +0,6 @@ | ||
# atlas-html-stream | ||
atlas-html-stream | ||
================= | ||
[![Built latest](https://github.com/BonnierNews/atlas-html-stream/actions/workflows/build-latest.yaml/badge.svg)](https://github.com/BonnierNews/atlas-html-stream/actions/workflows/build-latest.yaml) | ||
Fork of [atlas-html-stream](https://github.com/atlassubbed/atlas-html-stream) which is a super fast html-parser stream that outputs tag, | ||
@@ -7,3 +10,3 @@ text and closing nodes. See `CHANGELOG.md` for changes introduced in the fork. | ||
--- | ||
## install | ||
## install | ||
@@ -16,3 +19,3 @@ ``` | ||
I didn't like htmlparser2's streaming API and I wanted an html parser that collapsed text-node whitespace by default. I also wanted to see if I could write a faster parser. | ||
I didn't like htmlparser2's streaming API and I wanted an html parser that collapsed text-node whitespace by default. I also wanted to see if I could write a faster parser. | ||
@@ -41,3 +44,3 @@ ## performance | ||
## examples | ||
## examples | ||
@@ -130,5 +133,5 @@ Using this parser is really easy since it `extends` the stream interface -- all you need to do is pipe or write HTML to it and listen to `"data"` events: | ||
const commentNodes = parseHtml(` | ||
<!-- | ||
this is | ||
a comment | ||
<!-- | ||
this is | ||
a comment | ||
--> | ||
@@ -150,5 +153,5 @@ `) | ||
<b> | ||
<b> | ||
Hola | ||
</b> | ||
</b> | ||
`) | ||
@@ -177,3 +180,3 @@ nodes.forEach(n => console.log(n)); | ||
I'd like to make this thing even faster. The parsing itself takes about `3.5 ms/file` (using [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark)) on my machine. Pushing nodes as `data` events to our stream adds around 40% more processing time, which is why the benchmark above shows around `4.9 ms/file` -- this can't be avoided, because we *want* the streaming interface. | ||
I'd like to make this thing even faster. The parsing itself takes about `3.5 ms/file` (using [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark)) on my machine. Pushing nodes as `data` events to our stream adds around 40% more processing time, which is why the benchmark above shows around `4.9 ms/file` -- this can't be avoided, because we *want* the streaming interface. | ||
@@ -180,0 +183,0 @@ The `SeqMatcher` slows down this parser (checking comment, script and style nodes); there might be a faster way to handle these special nodes. |
@@ -1,123 +0,186 @@ | ||
const SeqMatcher = require("@bonniernews/atlas-seq-matcher"); | ||
const { Transform } = require("stream"); | ||
const { TEXT, NODE, NAME, KEY, VALUE, SCRIPT, STYLE, COMMENT } = require("./states"); | ||
/**
 * Incremental matcher that reports when a fixed character sequence has just
 * been completed in a stream of char codes fed one at a time.
 *
 * On a mismatch the matcher falls back to the longest prefix of the sequence
 * that is still a suffix of the input seen so far (Knuth-Morris-Pratt), so a
 * terminator preceded by a partial copy of itself -- e.g. `--->` for `-->` or
 * `<</script>` for `</script>` -- is still detected instead of being skipped.
 */
class SeqMatcher {
  /**
   * @param {string} str - the sequence to look for
   */
  constructor(str) {
    this.str = str;
    this.max = str.length - 1;
    this.pos = 0;
    // fallback[i] is the length of the longest proper prefix of `str` that is
    // also a suffix of str.slice(0, i + 1); used to resume after a mismatch.
    this.fallback = new Array(str.length).fill(0);
    for (let i = 1, k = 0; i < str.length; i++) {
      const code = str.charCodeAt(i);
      while (k > 0 && code !== str.charCodeAt(k)) k = this.fallback[k - 1];
      if (code === str.charCodeAt(k)) k++;
      this.fallback[i] = k;
    }
  }

  /**
   * Feed the next char code.
   *
   * @param {number} code - UTF-16 code unit of the next input character
   * @returns {boolean} true exactly when `code` completes the sequence; the
   *   matcher is then rewound so the next match starts from scratch
   */
  found(code) {
    // Shrink the partial match until `code` can extend it (or nothing is left).
    while (this.pos > 0 && code !== this.str.charCodeAt(this.pos)) {
      this.pos = this.fallback[this.pos - 1];
    }
    // For an empty sequence charCodeAt(0) is NaN, so this never matches.
    if (code !== this.str.charCodeAt(this.pos)) return false;
    if (this.pos === this.max) {
      this.pos = 0;
      return true;
    }
    this.pos += 1;
    return false;
  }

  /**
   * Forget any partial match.
   *
   * @returns {boolean} always true
   */
  reset() {
    this.pos = 0;
    return true;
  }
}
module.exports = class HtmlParser extends Transform { | ||
constructor({preserveWS} = {preserveWS: false}){ | ||
super({readableObjectMode: true}) | ||
const endScript = SeqMatcher("</script>") | ||
const endStyle = SeqMatcher("</style>") | ||
const beginComment = SeqMatcher("!--") | ||
const endComment = SeqMatcher("-->") | ||
let cache, name, key, text, data, state, curPos, minPos; | ||
let isClose, isSelfClose, hasEqual, valStartChar; | ||
this.reset = () => { | ||
endStyle.reset(), endScript.reset(), endComment.reset(), beginComment.reset(); | ||
state = TEXT, data = {}, text = [], name = cache = key = ""; | ||
curPos = minPos = 0; | ||
isClose = isSelfClose = hasEqual = valStartChar = null; | ||
} | ||
this.reset(); | ||
this.cache = chunk => (cache+=chunk).length; | ||
const getCache = (start, end) => cache.substr(start, end-start); | ||
const flushKey = (v, i) => (key = data[key || cache.substr(v, i-v)] = "") | ||
const flushVal = (v, i) => (data[key] = cache.substr(v, i-v), key = "", valStartChar = hasEqual = null) | ||
const flushNode = () => { | ||
if (!isClose) this.push({name, data}) | ||
if (isSelfClose || isClose) this.push({name}) | ||
const s = name === "script" ? SCRIPT : name === "style" ? STYLE : name === "!--" ? COMMENT : TEXT | ||
data = {}, name = "", isClose = isSelfClose = null | ||
return s | ||
} | ||
const flushSpecialNode = (v, i, name) => { | ||
const text = cache.substr(v, i-v) | ||
text && this.push({text}), this.push({name}) | ||
return TEXT | ||
} | ||
const flushText = (v, i) => { | ||
if (v < i) { | ||
text.push(cache.substr(v, i-v)) | ||
this.push({text:text.join(" ")}) | ||
text = [] | ||
} else if (text.length) { | ||
this.push({text:text.join(" ")}) | ||
text = [] | ||
} | ||
} | ||
this.flushText = () => flushText(minPos, curPos) | ||
this.parse = cacheLen => { | ||
let i = curPos, v = minPos, s = state, c | ||
while (i < cacheLen){ | ||
c = cache.charCodeAt(i) | ||
if (s === TEXT){ | ||
if (!preserveWS && (c === 32 || c >= 9 && c <= 13)) // ws | ||
v < i && text.push(getCache(v, i)), v = i + 1 | ||
else if (c === 60) // < | ||
flushText(v, i), s = NODE, v = i + 1 | ||
} else if (s === NODE){ | ||
if (c === 62) // > | ||
key && flushKey(), s = flushNode(), v = i + 1 | ||
else if (c === 47 && !hasEqual) // / | ||
isClose = !(isSelfClose = !!name) | ||
else if (c !== 32 && (c < 9 || c > 13)){ // !ws | ||
if (!name) // name start | ||
beginComment.found(c), v = i, s = NAME | ||
else if (!key) // key start | ||
v = i, s = KEY | ||
else if (c === 61) // = | ||
hasEqual = true | ||
else if (!hasEqual) // next key | ||
flushKey(), v = i, s = KEY | ||
else if (c === 34 || c === 39) // ', " | ||
v = i + 1, valStartChar = c, s = VALUE | ||
else // un-quoted val | ||
v = i, s = VALUE | ||
} | ||
} else if (s === NAME){ | ||
if (beginComment.found(c)) // start comment | ||
name = getCache(v, i + 1), s = flushNode(), v = i + 1 | ||
else if (c === 32 || c >= 9 && c <= 13) // ws | ||
name = getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 47) // / | ||
isSelfClose = true, name = getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 62) // > | ||
name = getCache(v, i), s = flushNode(), v = i + 1 | ||
} else if (s === KEY){ | ||
if (c === 32 || c >= 9 && c <= 13) // ws | ||
key = getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 61) // = | ||
hasEqual = true, key = getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 47) // / | ||
isSelfClose = true, key = getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 62) // > | ||
flushKey(v,i), s = flushNode(), v = i + 1 | ||
} else if (s === VALUE){ | ||
if (valStartChar != null){ | ||
if (c === valStartChar) // found end quote | ||
flushVal(v,i), s = NODE, v = i + 1 | ||
} else if (c === 32 || c >= 9 && c <= 13) // ws | ||
flushVal(v,i), s = NODE, v = i + 1 | ||
else if (c === 62) // > | ||
flushVal(v,i), s = flushNode(), v = i + 1 | ||
} else if (s === COMMENT && endComment.found(c)) | ||
s = flushSpecialNode(v, i-2, "!--"), v = i + 1 | ||
else if (s === SCRIPT && endScript.found(c)) | ||
s = flushSpecialNode(v, i-8, "script"), v = i + 1 | ||
else if (s === STYLE && endStyle.found(c)) | ||
s = flushSpecialNode(v, i-7, "style"), v = i + 1 | ||
i = i + 1 | ||
} | ||
cache = cache.substr(v), curPos = i - v, minPos = 0, state = s; | ||
} | ||
this.preserveWS = preserveWS; | ||
this.endScript = new SeqMatcher("</script>") | ||
this.endStyle = new SeqMatcher("</style>") | ||
this.beginComment = new SeqMatcher("!--") | ||
this.endComment = new SeqMatcher("-->") | ||
this.curPos = 0; | ||
this.minPos = 0; | ||
this.state = TEXT; | ||
this.cache = ""; | ||
this.name = ""; | ||
this.key = ""; | ||
this.text = []; | ||
this.data = {}; | ||
this.flags = { | ||
isClose: null, | ||
isSelfClose: null, | ||
hasEqual: null, | ||
valStartChar: null, | ||
}; | ||
} | ||
_transform(chunk, encoding, done){ | ||
if (chunk === null) return this.end(); | ||
this.parse(this.cache(chunk)); | ||
this.parse(this.addToCache(chunk)); | ||
done(null) | ||
} | ||
_flush(done){ | ||
this.flushText(); | ||
this.flushAllText(); | ||
this.reset(); | ||
done(null) | ||
} | ||
/**
 * Runs the tokenizer state machine over the not-yet-scanned part of `this.cache`.
 *
 * Scanning resumes at `this.curPos` in state `this.state` and stops at `cacheLen`.
 * Locals: `i` is the scan index, `v` the start index of the token currently being
 * collected, `s` the current state and `c` the char code at `i`.
 * When the loop ends, everything before `v` is dropped from the cache and the scan
 * position and state are stored back on the instance, so the next call can continue
 * in the middle of a token.
 *
 * @param {number} cacheLen - index at which scanning stops; `_transform` passes the
 *   cache length returned by `addToCache`
 */
parse(cacheLen) { | ||
let i = this.curPos, v = this.minPos, s = this.state, c | ||
while (i < cacheLen){ | ||
c = this.cache.charCodeAt(i) | ||
// TEXT: outside of any tag. Unless preserveWS is set, whitespace (space or char
// codes 9-13) ends the current word, which is queued on this.text.
if (s === TEXT){ | ||
if (!this.preserveWS && (c === 32 || c >= 9 && c <= 13)) // ws | ||
v < i && this.text.push(this.getCache(v, i)), v = i + 1 | ||
else if (c === 60) // < | ||
this.flushText(v, i), s = NODE, v = i + 1 | ||
// NODE: inside a tag, between tokens.
} else if (s === NODE){ | ||
if (c === 62) // > | ||
this.key && this.flushKey(), s = this.flushNode(), v = i + 1 | ||
// A slash seen before the tag name marks a closing tag; after the name it marks
// a self-closing tag. Slashes after `=` are left for the attribute value.
else if (c === 47 && !this.flags.hasEqual) // / | ||
this.flags.isClose = !(this.flags.isSelfClose = !!this.name) | ||
else if (c !== 32 && (c < 9 || c > 13)){ // !ws | ||
if (!this.name) // name start | ||
this.beginComment.found(c), v = i, s = NAME | ||
else if (!this.key) // key start | ||
v = i, s = KEY | ||
else if (c === 61) // = | ||
this.flags.hasEqual = true | ||
else if (!this.flags.hasEqual) // next key | ||
this.flushKey(), v = i, s = KEY | ||
else if (c === 34 || c === 39) // ', " | ||
v = i + 1, this.flags.valStartChar = c, s = VALUE | ||
else // un-quoted val | ||
v = i, s = VALUE | ||
} | ||
// NAME: collecting the tag name. Completing the "!--" sequence flushes the node
// immediately, without waiting for whitespace or `>`.
} else if (s === NAME){ | ||
if (this.beginComment.found(c)) // start comment | ||
this.name = this.getCache(v, i + 1), s = this.flushNode(), v = i + 1 | ||
else if (c === 32 || c >= 9 && c <= 13) // ws | ||
this.name = this.getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 47) // / | ||
this.flags.isSelfClose = true, this.name = this.getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 62) // > | ||
this.name = this.getCache(v, i), s = this.flushNode(), v = i + 1 | ||
// KEY: collecting an attribute name.
} else if (s === KEY){ | ||
if (c === 32 || c >= 9 && c <= 13) // ws | ||
this.key = this.getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 61) // = | ||
this.flags.hasEqual = true, this.key = this.getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 47) // / | ||
this.flags.isSelfClose = true, this.key = this.getCache(v, i), s = NODE, v = i + 1 | ||
else if (c === 62) // > | ||
this.flushKey(v,i), s = this.flushNode(), v = i + 1 | ||
// VALUE: collecting an attribute value. A quoted value ends only at the matching
// quote char; an un-quoted value ends at whitespace or `>`.
} else if (s === VALUE){ | ||
if (this.flags.valStartChar != null){ | ||
if (c === this.flags.valStartChar) // found end quote | ||
this.flushVal(v,i), s = NODE, v = i + 1 | ||
} else if (c === 32 || c >= 9 && c <= 13) // ws | ||
this.flushVal(v,i), s = NODE, v = i + 1 | ||
else if (c === 62) // > | ||
this.flushVal(v,i), s = this.flushNode(), v = i + 1 | ||
// COMMENT / SCRIPT / STYLE: content is skipped until its terminator is complete.
// `i` is then on the terminator's last char, so i-2, i-8 and i-7 are the index of
// its first char and the flushed text excludes the terminator.
} else if (s === COMMENT && this.endComment.found(c)) | ||
s = this.flushSpecialNode(v, i-2, "!--"), v = i + 1 | ||
else if (s === SCRIPT && this.endScript.found(c)) | ||
s = this.flushSpecialNode(v, i-8, "script"), v = i + 1 | ||
else if (s === STYLE && this.endStyle.found(c)) | ||
s = this.flushSpecialNode(v, i-7, "style"), v = i + 1 | ||
i = i + 1 | ||
} | ||
// Drop the consumed prefix and re-base the scan position onto the trimmed cache.
this.cache = this.cache.substr(v); | ||
this.curPos = i - v; | ||
this.minPos = 0; | ||
this.state = s; | ||
} | ||
reset() { | ||
this.endStyle.reset(); | ||
this.endScript.reset(); | ||
this.endComment.reset(); | ||
this.beginComment.reset(); | ||
this.state = TEXT; | ||
this.cache = ""; | ||
this.name = ""; | ||
this.key = ""; | ||
this.text = []; | ||
this.data = {}; | ||
this.curPos = 0; | ||
this.minPos = 0; | ||
this.flags = { | ||
isClose: null, | ||
isSelfClose: null, | ||
hasEqual: null, | ||
valStartChar: null, | ||
}; | ||
} | ||
getCache(start, end) { | ||
return this.cache.substr(start, end-start); | ||
} | ||
addToCache(chunk) { | ||
return (this.cache += chunk).length; | ||
} | ||
flushKey(v, i) { | ||
return (this.key = this.data[this.key || this.cache.substr(v, i-v)] = ""); | ||
} | ||
flushVal(v, i) { | ||
return (this.data[this.key] = this.cache.substr(v, i-v), this.key = "", this.flags.valStartChar = this.flags.hasEqual = null); | ||
} | ||
flushNode() { | ||
const name = this.name; | ||
if (!this.flags.isClose) this.push({name, data: this.data}) | ||
if (this.flags.isSelfClose || this.flags.isClose) this.push({name}) | ||
const s = name === "script" ? SCRIPT : name === "style" ? STYLE : name === "!--" ? COMMENT : TEXT | ||
this.data = {}; | ||
this.name = ""; | ||
this.flags.isClose = this.flags.isSelfClose = null | ||
return s | ||
} | ||
flushSpecialNode(v, i, name) { | ||
const text = this.cache.substr(v, i-v) | ||
text && this.push({text}), this.push({name}) | ||
return TEXT | ||
} | ||
flushText(v, i) { | ||
if (v < i) { | ||
this.text.push(this.cache.substr(v, i-v)) | ||
this.push({text: this.text.join(" ")}) | ||
this.text = [] | ||
} else if (this.text.length) { | ||
this.push({text: this.text.join(" ")}) | ||
this.text = [] | ||
} | ||
} | ||
flushAllText() { | ||
return this.flushText(this.minPos, this.curPos); | ||
} | ||
} |
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
0
200
0
16812
3
5
187
2
- Removed @bonniernews/atlas-seq-matcher@1.0.4 (transitive)