Huge News!Announcing our $40M Series B led by Abstract Ventures.Learn More
Socket
Sign inDemoInstall
Socket

@bonniernews/atlas-html-stream

Package Overview
Dependencies
Maintainers
8
Versions
4
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@bonniernews/atlas-html-stream - npm Package Compare versions

Comparing version 1.2.2 to 2.0.0

18

package.json
{
"name": "@bonniernews/atlas-html-stream",
"version": "1.2.2",
"version": "2.0.0",
"description": "A super fast html-parser stream that outputs tag, text and closing nodes.",
"scripts": {
"test-server": "mocha --colors --watch --recursive",
"test": "mocha --colors --recursive"
"test": "mocha --colors --recursive",
"posttest": "eslint . --cache"
},

@@ -26,4 +27,5 @@ "main": "./src/HtmlParser.js",

"devDependencies": {
"chai": "^4.1.2",
"mocha": "^5.2.0"
"chai": "^4.3.4",
"eslint": "^8.4.1",
"mocha": "^9.1.3"
},

@@ -33,5 +35,2 @@ "engines": {

},
"dependencies": {
"@bonniernews/atlas-seq-matcher": "^1.0.3"
},
"homepage": "https://github.com/BonnierNews/atlas-html-stream#readme",

@@ -42,3 +41,6 @@ "bugs": "https://github.com/BonnierNews/atlas-html-stream/issues",

"url": "https://github.com/BonnierNews/atlas-html-stream.git"
}
},
"files": [
"src"
]
}

@@ -1,3 +0,6 @@

# atlas-html-stream
atlas-html-stream
=================
[![Built latest](https://github.com/BonnierNews/atlas-html-stream/actions/workflows/build-latest.yaml/badge.svg)](https://github.com/BonnierNews/atlas-html-stream/actions/workflows/build-latest.yaml)
Fork of [atlas-html-stream](https://github.com/atlassubbed/atlas-html-stream) which is a super fast html-parser stream that outputs tag,

@@ -7,3 +10,3 @@ text and closing nodes. See `CHANGELOG.md` for changes introduced in the fork.

---
## install
## install

@@ -16,3 +19,3 @@ ```

I didn't like htmlparser2's streaming API and I wanted an html parser that collapsed text-node whitespace by default. I also wanted to see if I could write a faster parser.
I didn't like htmlparser2's streaming API and I wanted an html parser that collapsed text-node whitespace by default. I also wanted to see if I could write a faster parser.

@@ -41,3 +44,3 @@ ## performance

## examples
## examples

@@ -130,5 +133,5 @@ Using this parser is really easy since it `extends` the stream interface -- all you need to do is pipe or write HTML to it and listen to `"data"` events:

const commentNodes = parseHtml(`
<!--
this is
a comment
<!--
this is
a comment
-->

@@ -150,5 +153,5 @@ `)

<b>
<b>
Hola
</b>
</b>
`)

@@ -177,3 +180,3 @@ nodes.forEach(n => console.log(n));

I'd like to make this thing even faster. The parsing itself takes about `3.5 ms/file` (using [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark)) on my machine. Pushing nodes as `data` events to our stream adds around 40% more processing time, which is why the benchmark above shows around `4.9 ms/file` -- this can't be avoided, because we *want* the streaming interface.
I'd like to make this thing even faster. The parsing itself takes about `3.5 ms/file` (using [htmlparser-benchmark](https://github.com/AndreasMadsen/htmlparser-benchmark)) on my machine. Pushing nodes as `data` events to our stream adds around 40% more processing time, which is why the benchmark above shows around `4.9 ms/file` -- this can't be avoided, because we *want* the streaming interface.

@@ -180,0 +183,0 @@ The `SeqMatcher` slows down this parser (checking comment, script and style nodes); there might be a faster way to handle these special nodes.

@@ -1,123 +0,186 @@

const SeqMatcher = require("@bonniernews/atlas-seq-matcher");
const { Transform } = require("stream");
const { TEXT, NODE, NAME, KEY, VALUE, SCRIPT, STYLE, COMMENT } = require("./states");
/**
 * Incremental matcher for a fixed character sequence.
 *
 * Characters are fed one at a time (as UTF-16 char codes) through `found`,
 * which reports `true` on the character that completes the sequence.
 *
 * Mismatches are handled with a Knuth-Morris-Pratt failure table so that
 * overlapping input is not missed: feeding "--->" to a matcher for "-->"
 * still matches on the final ">", and "<</script>" still matches
 * "</script>". A plain "reset to 0 on mismatch" strategy drops those.
 */
class SeqMatcher {
  /**
   * @param {string} str - the sequence to look for.
   */
  constructor(str) {
    this.str = str;
    this.max = str.length - 1;
    this.pos = 0;

    // fallback[i] = length of the longest proper prefix of str that is also
    // a suffix of str.slice(0, i + 1); tells us where to resume on mismatch.
    const fallback = new Array(str.length).fill(0);
    for (let i = 1, k = 0; i < str.length; i++) {
      const code = str.charCodeAt(i);
      while (k > 0 && code !== str.charCodeAt(k)) k = fallback[k - 1];
      if (code === str.charCodeAt(k)) k++;
      fallback[i] = k;
    }
    this.fallback = fallback;
  }

  /**
   * Feed the next character code.
   *
   * @param {number} code - UTF-16 code unit of the next input character.
   * @returns {boolean} `true` only when this character completes the
   *   sequence; the matcher is then rewound to the start.
   */
  found(code) {
    let pos = this.pos;
    // Fall back to the longest shorter prefix the input still agrees with.
    while (pos > 0 && code !== this.str.charCodeAt(pos)) {
      pos = this.fallback[pos - 1];
    }
    if (code !== this.str.charCodeAt(pos)) {
      this.pos = 0;
      return false;
    }
    if (pos === this.max) {
      this.pos = 0;
      return true;
    }
    this.pos = pos + 1;
    return false;
  }

  /**
   * Discard any partial match.
   *
   * @returns {boolean} always `true`.
   */
  reset() {
    this.pos = 0;
    return true;
  }
}
module.exports = class HtmlParser extends Transform {
constructor({preserveWS} = {preserveWS: false}){
super({readableObjectMode: true})
const endScript = SeqMatcher("</script>")
const endStyle = SeqMatcher("</style>")
const beginComment = SeqMatcher("!--")
const endComment = SeqMatcher("-->")
let cache, name, key, text, data, state, curPos, minPos;
let isClose, isSelfClose, hasEqual, valStartChar;
this.reset = () => {
endStyle.reset(), endScript.reset(), endComment.reset(), beginComment.reset();
state = TEXT, data = {}, text = [], name = cache = key = "";
curPos = minPos = 0;
isClose = isSelfClose = hasEqual = valStartChar = null;
}
this.reset();
this.cache = chunk => (cache+=chunk).length;
const getCache = (start, end) => cache.substr(start, end-start);
const flushKey = (v, i) => (key = data[key || cache.substr(v, i-v)] = "")
const flushVal = (v, i) => (data[key] = cache.substr(v, i-v), key = "", valStartChar = hasEqual = null)
const flushNode = () => {
if (!isClose) this.push({name, data})
if (isSelfClose || isClose) this.push({name})
const s = name === "script" ? SCRIPT : name === "style" ? STYLE : name === "!--" ? COMMENT : TEXT
data = {}, name = "", isClose = isSelfClose = null
return s
}
const flushSpecialNode = (v, i, name) => {
const text = cache.substr(v, i-v)
text && this.push({text}), this.push({name})
return TEXT
}
const flushText = (v, i) => {
if (v < i) {
text.push(cache.substr(v, i-v))
this.push({text:text.join(" ")})
text = []
} else if (text.length) {
this.push({text:text.join(" ")})
text = []
}
}
this.flushText = () => flushText(minPos, curPos)
this.parse = cacheLen => {
let i = curPos, v = minPos, s = state, c
while (i < cacheLen){
c = cache.charCodeAt(i)
if (s === TEXT){
if (!preserveWS && (c === 32 || c >= 9 && c <= 13)) // ws
v < i && text.push(getCache(v, i)), v = i + 1
else if (c === 60) // <
flushText(v, i), s = NODE, v = i + 1
} else if (s === NODE){
if (c === 62) // >
key && flushKey(), s = flushNode(), v = i + 1
else if (c === 47 && !hasEqual) // /
isClose = !(isSelfClose = !!name)
else if (c !== 32 && (c < 9 || c > 13)){ // !ws
if (!name) // name start
beginComment.found(c), v = i, s = NAME
else if (!key) // key start
v = i, s = KEY
else if (c === 61) // =
hasEqual = true
else if (!hasEqual) // next key
flushKey(), v = i, s = KEY
else if (c === 34 || c === 39) // ', "
v = i + 1, valStartChar = c, s = VALUE
else // un-quoted val
v = i, s = VALUE
}
} else if (s === NAME){
if (beginComment.found(c)) // start comment
name = getCache(v, i + 1), s = flushNode(), v = i + 1
else if (c === 32 || c >= 9 && c <= 13) // ws
name = getCache(v, i), s = NODE, v = i + 1
else if (c === 47) // /
isSelfClose = true, name = getCache(v, i), s = NODE, v = i + 1
else if (c === 62) // >
name = getCache(v, i), s = flushNode(), v = i + 1
} else if (s === KEY){
if (c === 32 || c >= 9 && c <= 13) // ws
key = getCache(v, i), s = NODE, v = i + 1
else if (c === 61) // =
hasEqual = true, key = getCache(v, i), s = NODE, v = i + 1
else if (c === 47) // /
isSelfClose = true, key = getCache(v, i), s = NODE, v = i + 1
else if (c === 62) // >
flushKey(v,i), s = flushNode(), v = i + 1
} else if (s === VALUE){
if (valStartChar != null){
if (c === valStartChar) // found end quote
flushVal(v,i), s = NODE, v = i + 1
} else if (c === 32 || c >= 9 && c <= 13) // ws
flushVal(v,i), s = NODE, v = i + 1
else if (c === 62) // >
flushVal(v,i), s = flushNode(), v = i + 1
} else if (s === COMMENT && endComment.found(c))
s = flushSpecialNode(v, i-2, "!--"), v = i + 1
else if (s === SCRIPT && endScript.found(c))
s = flushSpecialNode(v, i-8, "script"), v = i + 1
else if (s === STYLE && endStyle.found(c))
s = flushSpecialNode(v, i-7, "style"), v = i + 1
i = i + 1
}
cache = cache.substr(v), curPos = i - v, minPos = 0, state = s;
}
this.preserveWS = preserveWS;
this.endScript = new SeqMatcher("</script>")
this.endStyle = new SeqMatcher("</style>")
this.beginComment = new SeqMatcher("!--")
this.endComment = new SeqMatcher("-->")
this.curPos = 0;
this.minPos = 0;
this.state = TEXT;
this.cache = "";
this.name = "";
this.key = "";
this.text = [];
this.data = {};
this.flags = {
isClose: null,
isSelfClose: null,
hasEqual: null,
valStartChar: null,
};
}
_transform(chunk, encoding, done){
if (chunk === null) return this.end();
this.parse(this.cache(chunk));
this.parse(this.addToCache(chunk));
done(null)
}
_flush(done){
this.flushText();
this.flushAllText();
this.reset();
done(null)
}
/**
 * Scan the cached input from `this.curPos` up to `cacheLen`, one character
 * code at a time, driving a state machine over the TEXT, NODE, NAME, KEY,
 * VALUE, COMMENT, SCRIPT and STYLE states. Nodes are emitted through the
 * flush helpers as they complete. When the scan stops, the consumed prefix
 * is dropped from the cache and the scan position/state are stored on the
 * instance so the next call can continue where this one stopped.
 *
 * @param {number} cacheLen - index at which scanning stops; `_transform`
 *   passes the value returned by `addToCache`, i.e. the cache length.
 */
parse(cacheLen) {
// i: scan index, v: start of the token currently being collected,
// s: current state, c: char code at i.
let i = this.curPos, v = this.minPos, s = this.state, c
while (i < cacheLen){
c = this.cache.charCodeAt(i)
if (s === TEXT){
// Outside any tag: whitespace (space or codes 9-13) splits the text into
// words collected in this.text unless preserveWS is set; "<" flushes the
// pending text and opens a tag.
if (!this.preserveWS && (c === 32 || c >= 9 && c <= 13)) // ws
v < i && this.text.push(this.getCache(v, i)), v = i + 1
else if (c === 60) // <
this.flushText(v, i), s = NODE, v = i + 1
} else if (s === NODE){
// Inside a tag, between tokens.
if (c === 62) // >
this.key && this.flushKey(), s = this.flushNode(), v = i + 1
else if (c === 47 && !this.flags.hasEqual) // /
// "/" before a name marks a closing tag; after a name it marks self-close.
this.flags.isClose = !(this.flags.isSelfClose = !!this.name)
else if (c !== 32 && (c < 9 || c > 13)){ // !ws
if (!this.name) // name start
// NOTE(review): beginComment is fed here without being reset first, so a
// partial match left over from an earlier tag name may carry over — confirm.
this.beginComment.found(c), v = i, s = NAME
else if (!this.key) // key start
v = i, s = KEY
else if (c === 61) // =
this.flags.hasEqual = true
else if (!this.flags.hasEqual) // next key
this.flushKey(), v = i, s = KEY
else if (c === 34 || c === 39) // ', "
v = i + 1, this.flags.valStartChar = c, s = VALUE
else // un-quoted val
v = i, s = VALUE
}
} else if (s === NAME){
// Collecting the tag name; completing "!--" ends the name immediately.
if (this.beginComment.found(c)) // start comment
this.name = this.getCache(v, i + 1), s = this.flushNode(), v = i + 1
else if (c === 32 || c >= 9 && c <= 13) // ws
this.name = this.getCache(v, i), s = NODE, v = i + 1
else if (c === 47) // /
this.flags.isSelfClose = true, this.name = this.getCache(v, i), s = NODE, v = i + 1
else if (c === 62) // >
this.name = this.getCache(v, i), s = this.flushNode(), v = i + 1
} else if (s === KEY){
// Collecting an attribute name.
if (c === 32 || c >= 9 && c <= 13) // ws
this.key = this.getCache(v, i), s = NODE, v = i + 1
else if (c === 61) // =
this.flags.hasEqual = true, this.key = this.getCache(v, i), s = NODE, v = i + 1
else if (c === 47) // /
this.flags.isSelfClose = true, this.key = this.getCache(v, i), s = NODE, v = i + 1
else if (c === 62) // >
this.flushKey(v,i), s = this.flushNode(), v = i + 1
} else if (s === VALUE){
// Collecting an attribute value; a quoted value ends only at the same quote
// character, an un-quoted one ends at whitespace or ">".
if (this.flags.valStartChar != null){
if (c === this.flags.valStartChar) // found end quote
this.flushVal(v,i), s = NODE, v = i + 1
} else if (c === 32 || c >= 9 && c <= 13) // ws
this.flushVal(v,i), s = NODE, v = i + 1
else if (c === 62) // >
this.flushVal(v,i), s = this.flushNode(), v = i + 1
// Special nodes: content is kept verbatim until the end sequence completes.
// i points at the last character of the end sequence, so i-2 / i-8 / i-7
// is where "-->" / "</script>" / "</style>" begins.
} else if (s === COMMENT && this.endComment.found(c))
s = this.flushSpecialNode(v, i-2, "!--"), v = i + 1
else if (s === SCRIPT && this.endScript.found(c))
s = this.flushSpecialNode(v, i-8, "script"), v = i + 1
else if (s === STYLE && this.endStyle.found(c))
s = this.flushSpecialNode(v, i-7, "style"), v = i + 1
i = i + 1
}
// Drop everything before v and re-base the scan position on the
// shortened cache.
this.cache = this.cache.substr(v);
this.curPos = i - v;
this.minPos = 0;
this.state = s;
}
reset() {
this.endStyle.reset();
this.endScript.reset();
this.endComment.reset();
this.beginComment.reset();
this.state = TEXT;
this.cache = "";
this.name = "";
this.key = "";
this.text = [];
this.data = {};
this.curPos = 0;
this.minPos = 0;
this.flags = {
isClose: null,
isSelfClose: null,
hasEqual: null,
valStartChar: null,
};
}
getCache(start, end) {
return this.cache.substr(start, end-start);
}
addToCache(chunk) {
return (this.cache += chunk).length;
}
flushKey(v, i) {
return (this.key = this.data[this.key || this.cache.substr(v, i-v)] = "");
}
flushVal(v, i) {
return (this.data[this.key] = this.cache.substr(v, i-v), this.key = "", this.flags.valStartChar = this.flags.hasEqual = null);
}
flushNode() {
const name = this.name;
if (!this.flags.isClose) this.push({name, data: this.data})
if (this.flags.isSelfClose || this.flags.isClose) this.push({name})
const s = name === "script" ? SCRIPT : name === "style" ? STYLE : name === "!--" ? COMMENT : TEXT
this.data = {};
this.name = "";
this.flags.isClose = this.flags.isSelfClose = null
return s
}
flushSpecialNode(v, i, name) {
const text = this.cache.substr(v, i-v)
text && this.push({text}), this.push({name})
return TEXT
}
flushText(v, i) {
if (v < i) {
this.text.push(this.cache.substr(v, i-v))
this.push({text: this.text.join(" ")})
this.text = []
} else if (this.text.length) {
this.push({text: this.text.join(" ")})
this.text = []
}
}
flushAllText() {
return this.flushText(this.minPos, this.curPos);
}
}
SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc