Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

htmlparser2

Package Overview
Dependencies
Maintainers
1
Versions
76
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

htmlparser2 - npm Package Compare versions

Comparing version 7.1.2 to 7.2.0

100

lib/Tokenizer.d.ts

@@ -21,50 +21,13 @@ /** All the states the tokenizer can be in. */

BeforeComment = 18,
InComment = 19,
CDATASequence = 19,
InSpecialComment = 20,
AfterComment1 = 21,
AfterComment2 = 22,
BeforeCdata1 = 23,
BeforeCdata2 = 24,
BeforeCdata3 = 25,
BeforeCdata4 = 26,
BeforeCdata5 = 27,
BeforeCdata6 = 28,
InCdata = 29,
AfterCdata1 = 30,
AfterCdata2 = 31,
BeforeSpecialS = 32,
BeforeSpecialSEnd = 33,
BeforeScript1 = 34,
BeforeScript2 = 35,
BeforeScript3 = 36,
BeforeScript4 = 37,
BeforeScript5 = 38,
AfterScript1 = 39,
AfterScript2 = 40,
AfterScript3 = 41,
AfterScript4 = 42,
AfterScript5 = 43,
BeforeStyle1 = 44,
BeforeStyle2 = 45,
BeforeStyle3 = 46,
BeforeStyle4 = 47,
AfterStyle1 = 48,
AfterStyle2 = 49,
AfterStyle3 = 50,
AfterStyle4 = 51,
BeforeSpecialT = 52,
BeforeSpecialTEnd = 53,
BeforeTitle1 = 54,
BeforeTitle2 = 55,
BeforeTitle3 = 56,
BeforeTitle4 = 57,
AfterTitle1 = 58,
AfterTitle2 = 59,
AfterTitle3 = 60,
AfterTitle4 = 61,
BeforeEntity = 62,
BeforeNumericEntity = 63,
InNamedEntity = 64,
InNumericEntity = 65,
InHexEntity = 66
InCommentLike = 21,
BeforeSpecialS = 22,
SpecialStartSequence = 23,
InSpecialTag = 24,
BeforeEntity = 25,
BeforeNumericEntity = 26,
InNamedEntity = 27,
InNumericEntity = 28,
InHexEntity = 29
}

@@ -90,3 +53,3 @@ export interface Callbacks {

/** The current state the tokenizer is in. */
_state: State;
private _state;
/** The read buffer. */

@@ -97,3 +60,3 @@ private buffer;

/** The index within the buffer that we are currently looking at. */
_index: number;
private _index;
/**

@@ -107,3 +70,3 @@ * Data that has already been processed will be removed from the buffer occasionally.

/** For special parsing behavior inside of script and style tags. */
private special;
private isSpecial;
/** Indicates whether the tokenizer has been paused. */

@@ -134,3 +97,25 @@ private running;

private stateText;
private currentSequence;
private sequenceIndex;
private stateSpecialStartSequence;
/** Look for an end tag. For <title> tags, also decode entities. */
private stateInSpecialTag;
private stateCDATASequence;
/**
* When we wait for one specific character, we can speed things up
* by skipping through the buffer until we find it.
*
* @returns Whether the character was found.
*/
private fastForwardTo;
/**
* Comments and CDATA end with `-->` and `]]>`.
*
* Their common qualities are:
* - Their end sequences have a distinct character they start with.
* - That character is then repeated, so we have to check multiple repeats.
* - All characters but the start character of the sequence can be skipped.
*/
private stateInCommentLike;
/**
* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.

@@ -142,2 +127,3 @@ *

private isTagStartChar;
private startSpecial;
private stateBeforeTagName;

@@ -161,21 +147,12 @@ private stateInTagName;

private stateBeforeComment;
private stateInComment;
private stateInSpecialComment;
private stateAfterComment1;
private stateAfterComment2;
private stateBeforeCdata6;
private stateInCdata;
private stateAfterCdata1;
private stateAfterCdata2;
private stateBeforeSpecialS;
private stateBeforeSpecialSEnd;
private stateBeforeSpecialLast;
private stateAfterSpecialLast;
private trieIndex;
private trieCurrent;
private trieResult;
private trieExcess;
private entityExcess;
private stateBeforeEntity;
private stateInNamedEntity;
private emitNamedEntity;
private stateBeforeNumericEntity;
private decodeNumericEntity;

@@ -189,2 +166,3 @@ private stateInNumericEntity;

private cleanup;
private shouldContinue;
/**

@@ -191,0 +169,0 @@ * Iterates through the buffer, calling the function corresponding to the current state.

@@ -8,3 +8,3 @@ "use strict";

var decode_1 = require("entities/lib/decode");
function whitespace(c) {
function isWhitespace(c) {
return (c === 32 /* Space */ ||

@@ -16,2 +16,8 @@ c === 10 /* NewLine */ ||

}
/**
 * Tells whether a character code ends a section of a tag.
 *
 * @param c Character code to test.
 * @returns `true` for `/` (47) and `>` (62); otherwise whatever
 *     `isWhitespace(c)` returns.
 */
function isEndOfTagSection(c) {
    if (c === 47 /* Slash */) {
        return true;
    }
    if (c === 62 /* Gt */) {
        return true;
    }
    return isWhitespace(c);
}
/**
 * Checks whether a character code is an ASCII decimal digit (`0`-`9`).
 *
 * @param c Character code to test.
 * @returns `true` for codes 48 through 57 inclusive, `false` otherwise.
 */
function isNumber(c) {
    var atLeastZero = c >= 48 /* Zero */;
    if (!atLeastZero) {
        return false;
    }
    return c <= 57 /* Nine */;
}
function isASCIIAlpha(c) {

@@ -21,43 +27,18 @@ return ((c >= 97 /* LowerA */ && c <= 122 /* LowerZ */) ||

}
/**
 * Builds a state handler that tests a single letter, ignoring case.
 *
 * @param upper Letter to match; its first character is compared both as
 *     given and lower-cased.
 * @param SUCCESS State stored in `t._state` when the character matches.
 * @param FAILURE State stored in `t._state` when it does not match.
 * @returns A function `(t, c)` that updates `t._state` and, on a mismatch,
 *     also decrements `t._index`.
 */
function ifElseState(upper, SUCCESS, FAILURE) {
    var codeAsGiven = upper.charCodeAt(0);
    var codeLowered = upper.toLowerCase().charCodeAt(0);
    return function (t, c) {
        var matched = c === codeLowered || c === codeAsGiven;
        t._state = matched ? SUCCESS : FAILURE;
        if (!matched) {
            // Step the index back by one on a mismatch.
            t._index--;
        }
    };
}
/*
 * Single-character transition handlers built with `ifElseState`.
 *
 * Each handler accepts one letter in either case. On a match it moves to the
 * first listed state; otherwise it moves to the second listed state and steps
 * the index back by one. The numeric state ids are annotated with their names.
 */
var stateBeforeCdata1 = ifElseState("C", 24 /* BeforeCdata2 */, 16 /* InDeclaration */);
var stateBeforeCdata2 = ifElseState("D", 25 /* BeforeCdata3 */, 16 /* InDeclaration */);
var stateBeforeCdata3 = ifElseState("A", 26 /* BeforeCdata4 */, 16 /* InDeclaration */);
var stateBeforeCdata4 = ifElseState("T", 27 /* BeforeCdata5 */, 16 /* InDeclaration */);
var stateBeforeCdata5 = ifElseState("A", 28 /* BeforeCdata6 */, 16 /* InDeclaration */);
var stateBeforeScript1 = ifElseState("R", 35 /* BeforeScript2 */, 3 /* InTagName */);
var stateBeforeScript2 = ifElseState("I", 36 /* BeforeScript3 */, 3 /* InTagName */);
var stateBeforeScript3 = ifElseState("P", 37 /* BeforeScript4 */, 3 /* InTagName */);
var stateBeforeScript4 = ifElseState("T", 38 /* BeforeScript5 */, 3 /* InTagName */);
var stateAfterScript1 = ifElseState("R", 40 /* AfterScript2 */, 1 /* Text */);
var stateAfterScript2 = ifElseState("I", 41 /* AfterScript3 */, 1 /* Text */);
var stateAfterScript3 = ifElseState("P", 42 /* AfterScript4 */, 1 /* Text */);
var stateAfterScript4 = ifElseState("T", 43 /* AfterScript5 */, 1 /* Text */);
var stateBeforeStyle1 = ifElseState("Y", 45 /* BeforeStyle2 */, 3 /* InTagName */);
var stateBeforeStyle2 = ifElseState("L", 46 /* BeforeStyle3 */, 3 /* InTagName */);
var stateBeforeStyle3 = ifElseState("E", 47 /* BeforeStyle4 */, 3 /* InTagName */);
var stateAfterStyle1 = ifElseState("Y", 49 /* AfterStyle2 */, 1 /* Text */);
var stateAfterStyle2 = ifElseState("L", 50 /* AfterStyle3 */, 1 /* Text */);
var stateAfterStyle3 = ifElseState("E", 51 /* AfterStyle4 */, 1 /* Text */);
var stateBeforeSpecialT = ifElseState("I", 54 /* BeforeTitle1 */, 3 /* InTagName */);
var stateBeforeTitle1 = ifElseState("T", 55 /* BeforeTitle2 */, 3 /* InTagName */);
var stateBeforeTitle2 = ifElseState("L", 56 /* BeforeTitle3 */, 3 /* InTagName */);
var stateBeforeTitle3 = ifElseState("E", 57 /* BeforeTitle4 */, 3 /* InTagName */);
var stateBeforeSpecialTEnd = ifElseState("I", 58 /* AfterTitle1 */, 1 /* Text */);
var stateAfterTitle1 = ifElseState("T", 59 /* AfterTitle2 */, 1 /* Text */);
var stateAfterTitle2 = ifElseState("L", 60 /* AfterTitle3 */, 1 /* Text */);
var stateAfterTitle3 = ifElseState("E", 61 /* AfterTitle4 */, 1 /* Text */);
var stateBeforeNumericEntity = ifElseState("X", 66 /* InHexEntity */, 65 /* InNumericEntity */);
/**
* Sequences used to match longer strings.
*
* We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End
* sequences with an increased offset.
*
* Each entry holds the character codes of the string noted beside it.
* The letters in the `*End` tag sequences are stored in lower case.
*/
var Sequences = {
Cdata: new Uint16Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // `CDATA[`
CdataEnd: new Uint16Array([0x5d, 0x5d, 0x3e]), // `]]>`
CommentEnd: new Uint16Array([0x2d, 0x2d, 0x3e]), // `-->`
ScriptEnd: new Uint16Array([
0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74,
]), // `</script`
StyleEnd: new Uint16Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), // `</style`
TitleEnd: new Uint16Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title`
};
var Tokenizer = /** @class */ (function () {

@@ -83,3 +64,3 @@ function Tokenizer(_a, cbs) {

/** For special parsing behavior inside of script and style tags. */
this.special = 1 /* None */;
this.isSpecial = false;
/** Indicates whether the tokenizer has been paused. */

@@ -89,6 +70,7 @@ this.running = true;

this.ended = false;
this.sequenceIndex = 0;
this.trieIndex = 0;
this.trieCurrent = 0;
this.trieResult = null;
this.trieExcess = 0;
this.entityExcess = 0;
this.xmlMode = xmlMode;

@@ -105,3 +87,3 @@ this.decodeEntities = decodeEntities;

this.baseState = 1 /* Text */;
this.special = 1 /* None */;
this.currentSequence = undefined;
this.running = true;

@@ -113,6 +95,3 @@ this.ended = false;

return this.cbs.onerror(Error(".write() after done!"));
if (this.buffer.length)
this.buffer += chunk;
else
this.buffer = chunk;
this.buffer += chunk;
this.parse();

@@ -154,3 +133,4 @@ };

Tokenizer.prototype.stateText = function (c) {
if (c === 60 /* Lt */) {
if (c === 60 /* Lt */ ||
(!this.decodeEntities && this.fastForwardTo(60 /* Lt */))) {
if (this._index > this.sectionStart) {

@@ -162,14 +142,135 @@ this.cbs.ontext(this.getSection());

}
else if (this.decodeEntities &&
c === 38 /* Amp */ &&
(this.special === 1 /* None */ || this.special === 4 /* Title */)) {
if (this._index > this.sectionStart) {
this.cbs.ontext(this.getSection());
else if (this.decodeEntities && c === 38 /* Amp */) {
this._state = 25 /* BeforeEntity */;
}
};
/**
* Checks an opening tag name against `currentSequence` one character at a time.
*
* While characters keep matching, only `sequenceIndex` advances. A mismatch
* clears `isSpecial`. After a mismatch, or once the whole sequence has matched
* and the tag name has ended, the tokenizer switches to `InTagName` and
* handles the current character there.
*
* @param c Character code at the current index.
*/
Tokenizer.prototype.stateSpecialStartSequence = function (c) {
var isEnd = this.sequenceIndex === this.currentSequence.length;
var isMatch = isEnd
? // If we are at the end of the sequence, make sure the tag name has ended
isEndOfTagSection(c)
: // Otherwise, do a case-insensitive comparison
(c | 0x20) === this.currentSequence[this.sequenceIndex];
if (!isMatch) {
// Not the tag we were looking for
this.isSpecial = false;
}
else if (!isEnd) {
// Matched so far; wait for the next character
this.sequenceIndex++;
return;
}
// Either way, continue as a regular tag name and let it handle `c`
this.sequenceIndex = 0;
this._state = 3 /* InTagName */;
this.stateInTagName(c);
};
/** Look for an end tag. For <title> tags, also decode entities. */
Tokenizer.prototype.stateInSpecialTag = function (c) {
if (this.sequenceIndex === this.currentSequence.length) {
if (c === 62 /* Gt */ || isWhitespace(c)) {
var endOfText = this._index - this.currentSequence.length;
if (this.sectionStart < endOfText) {
// Spoof the index so that reported locations match up.
var actualIndex = this._index;
this._index = endOfText;
this.cbs.ontext(this.getSection());
this._index = actualIndex;
}
this.isSpecial = false;
this.sectionStart = endOfText + 2; // Skip over the `</`
this.stateInClosingTagName(c);
return; // We are done; skip the rest of the function.
}
this.baseState = 1 /* Text */;
this._state = 62 /* BeforeEntity */;
this.sectionStart = this._index;
this.sequenceIndex = 0;
}
if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) {
this.sequenceIndex += 1;
}
else if (this.sequenceIndex === 0) {
if (this.currentSequence === Sequences.TitleEnd) {
// We have to parse entities in <title> tags.
if (this.decodeEntities && c === 38 /* Amp */) {
this._state = 25 /* BeforeEntity */;
}
}
else if (this.fastForwardTo(60 /* Lt */)) {
// Outside of <title> tags, we can fast-forward.
this.sequenceIndex = 1;
}
}
else {
// If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
this.sequenceIndex = Number(c === 60 /* Lt */);
}
};
/**
* Matches the characters of `Sequences.Cdata` one at a time (exact comparison).
*
* Once the full sequence has been seen, switches to `InCommentLike` with
* `Sequences.CdataEnd` as the sequence to look for. On the first mismatch,
* falls back to `InDeclaration` and passes the character to that state.
*
* @param c Character code at the current index.
*/
Tokenizer.prototype.stateCDATASequence = function (c) {
if (c === Sequences.Cdata[this.sequenceIndex]) {
if (++this.sequenceIndex === Sequences.Cdata.length) {
this._state = 21 /* InCommentLike */;
this.currentSequence = Sequences.CdataEnd;
this.sequenceIndex = 0;
// The section starts right after the current character
this.sectionStart = this._index + 1;
}
}
else {
this.sequenceIndex = 0;
this._state = 16 /* InDeclaration */;
this.stateInDeclaration(c); // Reconsume the character
}
};
/**
 * Skips ahead in the buffer to the next occurrence of a character code.
 *
 * The search begins one position after the current index. If the character
 * is found, the index is left pointing at it.
 *
 * @param c Character code to look for.
 * @returns Whether the character was found.
 */
Tokenizer.prototype.fastForwardTo = function (c) {
    for (this._index++; this._index < this.buffer.length; this._index++) {
        if (this.buffer.charCodeAt(this._index) === c) {
            return true;
        }
    }
    /*
     * The `parse` loop increments the index once the state function
     * returns, so park it on the last buffer position here.
     *
     * TODO: Refactor `parse` to increment index before calling states.
     */
    this._index = this.buffer.length - 1;
    return false;
};
/**
* Comments and CDATA end with `-->` and `]]>`.
*
* Their common qualities are:
* - Their end sequences have a distinct character they start with.
* - That character is then repeated, so we have to check multiple repeats.
* - All characters but the start character of the sequence can be skipped.
*
* When the end sequence completes, the section is passed to `cbs.oncdata`
* (if `currentSequence` is `Sequences.CdataEnd`) or `cbs.oncomment`, and the
* tokenizer returns to the `Text` state.
*/
Tokenizer.prototype.stateInCommentLike = function (c) {
if (c === this.currentSequence[this.sequenceIndex]) {
if (++this.sequenceIndex === this.currentSequence.length) {
// Remove 2 trailing chars
var section = this.buffer.slice(this.sectionStart, this._index - 2);
if (this.currentSequence === Sequences.CdataEnd) {
this.cbs.oncdata(section);
}
else {
this.cbs.oncomment(section);
}
this.sequenceIndex = 0;
// The next section begins after the current character
this.sectionStart = this._index + 1;
this._state = 1 /* Text */;
}
}
else if (this.sequenceIndex === 0) {
// Fast-forward to the first character of the sequence
if (this.fastForwardTo(this.currentSequence[0])) {
// Found it: the first character of the sequence is matched
this.sequenceIndex = 1;
}
}
else if (c !== this.currentSequence[this.sequenceIndex - 1]) {
// Allow long sequences, eg. --->, ]]]>
this.sequenceIndex = 0;
}
};
/**
* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.

@@ -181,22 +282,12 @@ *

Tokenizer.prototype.isTagStartChar = function (c) {
return (isASCIIAlpha(c) ||
(this.xmlMode &&
!whitespace(c) &&
c !== 47 /* Slash */ &&
c !== 62 /* Gt */));
return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c);
};
/**
 * Begins matching the name of a tag that gets special handling.
 *
 * Switches the tokenizer to the `SpecialStartSequence` state and records
 * which sequence to compare against and where in it to start.
 *
 * @param sequence Sequence the following characters are compared with.
 * @param offset Index into `sequence` at which the comparison starts.
 */
Tokenizer.prototype.startSpecial = function (sequence, offset) {
    this._state = 23 /* SpecialStartSequence */;
    this.currentSequence = sequence;
    this.sequenceIndex = offset;
    this.isSpecial = true;
};
Tokenizer.prototype.stateBeforeTagName = function (c) {
if (c === 47 /* Slash */) {
this._state = 5 /* BeforeClosingTagName */;
}
else if (c === 60 /* Lt */) {
this.cbs.ontext(this.getSection());
this.sectionStart = this._index;
}
else if (c === 62 /* Gt */ ||
this.special !== 1 /* None */ ||
whitespace(c)) {
this._state = 1 /* Text */;
}
else if (c === 33 /* ExclamationMark */) {
if (c === 33 /* ExclamationMark */) {
this._state = 15 /* BeforeDeclaration */;

@@ -209,19 +300,25 @@ this.sectionStart = this._index + 1;

}
else if (!this.isTagStartChar(c)) {
this._state = 1 /* Text */;
else if (this.isTagStartChar(c)) {
var lower = c | 0x20;
this.sectionStart = this._index;
if (!this.xmlMode && lower === Sequences.TitleEnd[2]) {
this.startSpecial(Sequences.TitleEnd, 3);
}
else {
this._state =
!this.xmlMode && lower === Sequences.ScriptEnd[2]
? 22 /* BeforeSpecialS */
: 3 /* InTagName */;
}
}
else if (c === 47 /* Slash */) {
this._state = 5 /* BeforeClosingTagName */;
}
else {
this._state =
!this.xmlMode &&
(c === 115 /* LowerS */ || c === 83 /* UpperS */)
? 32 /* BeforeSpecialS */
: !this.xmlMode &&
(c === 116 /* LowerT */ || c === 84 /* UpperT */)
? 52 /* BeforeSpecialT */
: 3 /* InTagName */;
this.sectionStart = this._index;
this._state = 1 /* Text */;
this.stateText(c);
}
};
Tokenizer.prototype.stateInTagName = function (c) {
if (c === 47 /* Slash */ || c === 62 /* Gt */ || whitespace(c)) {
if (isEndOfTagSection(c)) {
this.cbs.onopentagname(this.getSection());

@@ -234,3 +331,3 @@ this.sectionStart = -1;

Tokenizer.prototype.stateBeforeClosingTagName = function (c) {
if (whitespace(c)) {
if (isWhitespace(c)) {
// Ignore

@@ -241,22 +338,6 @@ }

}
else if (this.special !== 1 /* None */) {
if (this.special !== 4 /* Title */ &&
(c === 115 /* LowerS */ || c === 83 /* UpperS */)) {
this._state = 33 /* BeforeSpecialSEnd */;
}
else if (this.special === 4 /* Title */ &&
(c === 116 /* LowerT */ || c === 84 /* UpperT */)) {
this._state = 53 /* BeforeSpecialTEnd */;
}
else {
this._state = 1 /* Text */;
this.stateText(c);
}
}
else if (!this.isTagStartChar(c)) {
this._state = 20 /* InSpecialComment */;
this.sectionStart = this._index;
}
else {
this._state = 6 /* InClosingTagName */;
this._state = this.isTagStartChar(c)
? 6 /* InClosingTagName */
: 20 /* InSpecialComment */;
this.sectionStart = this._index;

@@ -266,3 +347,3 @@ }

Tokenizer.prototype.stateInClosingTagName = function (c) {
if (c === 62 /* Gt */ || whitespace(c)) {
if (c === 62 /* Gt */ || isWhitespace(c)) {
this.cbs.onclosetag(this.getSection());

@@ -276,3 +357,3 @@ this.sectionStart = -1;

// Skip everything until ">"
if (c === 62 /* Gt */) {
if (c === 62 /* Gt */ || this.fastForwardTo(62 /* Gt */)) {
this._state = 1 /* Text */;

@@ -285,3 +366,10 @@ this.sectionStart = this._index + 1;

this.cbs.onopentagend();
this._state = 1 /* Text */;
if (this.isSpecial) {
this._state = 24 /* InSpecialTag */;
this.sequenceIndex = 0;
}
else {
this._state = 1 /* Text */;
}
this.baseState = this._state;
this.sectionStart = this._index + 1;

@@ -292,3 +380,3 @@ }

}
else if (!whitespace(c)) {
else if (!isWhitespace(c)) {
this._state = 9 /* InAttributeName */;

@@ -302,6 +390,7 @@ this.sectionStart = this._index;

this._state = 1 /* Text */;
this.baseState = 1 /* Text */;
this.sectionStart = this._index + 1;
this.special = 1 /* None */; // Reset special state, in case of self-closing special tags
this.isSpecial = false; // Reset special state, in case of self-closing special tags
}
else if (!whitespace(c)) {
else if (!isWhitespace(c)) {
this._state = 8 /* BeforeAttributeName */;

@@ -312,6 +401,3 @@ this.stateBeforeAttributeName(c);

Tokenizer.prototype.stateInAttributeName = function (c) {
if (c === 61 /* Eq */ ||
c === 47 /* Slash */ ||
c === 62 /* Gt */ ||
whitespace(c)) {
if (c === 61 /* Eq */ || isEndOfTagSection(c)) {
this.cbs.onattribname(this.getSection());

@@ -332,3 +418,3 @@ this.sectionStart = -1;

}
else if (!whitespace(c)) {
else if (!isWhitespace(c)) {
this.cbs.onattribend(undefined);

@@ -348,3 +434,3 @@ this._state = 9 /* InAttributeName */;

}
else if (!whitespace(c)) {
else if (!isWhitespace(c)) {
this.sectionStart = this._index;

@@ -356,3 +442,4 @@ this._state = 14 /* InAttributeValueNq */;

Tokenizer.prototype.handleInAttributeValue = function (c, quote) {
if (c === quote) {
if (c === quote ||
(!this.decodeEntities && this.fastForwardTo(quote))) {
this.cbs.onattribdata(this.getSection());

@@ -364,6 +451,4 @@ this.sectionStart = -1;

else if (this.decodeEntities && c === 38 /* Amp */) {
this.cbs.onattribdata(this.getSection());
this.baseState = this._state;
this._state = 62 /* BeforeEntity */;
this.sectionStart = this._index;
this._state = 25 /* BeforeEntity */;
}

@@ -378,3 +463,3 @@ };

Tokenizer.prototype.stateInAttributeValueNoQuotes = function (c) {
if (whitespace(c) || c === 62 /* Gt */) {
if (isWhitespace(c) || c === 62 /* Gt */) {
this.cbs.onattribdata(this.getSection());

@@ -387,18 +472,20 @@ this.sectionStart = -1;

else if (this.decodeEntities && c === 38 /* Amp */) {
this.cbs.onattribdata(this.getSection());
this.baseState = this._state;
this._state = 62 /* BeforeEntity */;
this.sectionStart = this._index;
this._state = 25 /* BeforeEntity */;
}
};
Tokenizer.prototype.stateBeforeDeclaration = function (c) {
this._state =
c === 91 /* OpeningSquareBracket */
? 23 /* BeforeCdata1 */
: c === 45 /* Dash */
if (c === 91 /* OpeningSquareBracket */) {
this._state = 19 /* CDATASequence */;
this.sequenceIndex = 0;
}
else {
this._state =
c === 45 /* Dash */
? 18 /* BeforeComment */
: 16 /* InDeclaration */;
}
};
Tokenizer.prototype.stateInDeclaration = function (c) {
if (c === 62 /* Gt */) {
if (c === 62 /* Gt */ || this.fastForwardTo(62 /* Gt */)) {
this.cbs.ondeclaration(this.getSection());

@@ -410,3 +497,3 @@ this._state = 1 /* Text */;

Tokenizer.prototype.stateInProcessingInstruction = function (c) {
if (c === 62 /* Gt */) {
if (c === 62 /* Gt */ || this.fastForwardTo(62 /* Gt */)) {
this.cbs.onprocessinginstruction(this.getSection());

@@ -419,3 +506,6 @@ this._state = 1 /* Text */;

if (c === 45 /* Dash */) {
this._state = 19 /* InComment */;
this._state = 21 /* InCommentLike */;
this.currentSequence = Sequences.CommentEnd;
// Allow short comments (eg. <!-->)
this.sequenceIndex = 2;
this.sectionStart = this._index + 1;

@@ -427,9 +517,5 @@ }

};
Tokenizer.prototype.stateInComment = function (c) {
if (c === 45 /* Dash */)
this._state = 21 /* AfterComment1 */;
};
Tokenizer.prototype.stateInSpecialComment = function (c) {
if (c === 62 /* Gt */) {
this.cbs.oncomment(this.buffer.substring(this.sectionStart, this._index));
if (c === 62 /* Gt */ || this.fastForwardTo(62 /* Gt */)) {
this.cbs.oncomment(this.getSection());
this._state = 1 /* Text */;

@@ -439,60 +525,9 @@ this.sectionStart = this._index + 1;

};
Tokenizer.prototype.stateAfterComment1 = function (c) {
if (c === 45 /* Dash */) {
this._state = 22 /* AfterComment2 */;
}
else {
this._state = 19 /* InComment */;
}
};
Tokenizer.prototype.stateAfterComment2 = function (c) {
if (c === 62 /* Gt */) {
// Remove 2 trailing chars
this.cbs.oncomment(this.buffer.substring(this.sectionStart, this._index - 2));
this._state = 1 /* Text */;
this.sectionStart = this._index + 1;
}
else if (c !== 45 /* Dash */) {
this._state = 19 /* InComment */;
}
// Else: stay in AFTER_COMMENT_2 (`--->`)
};
Tokenizer.prototype.stateBeforeCdata6 = function (c) {
if (c === 91 /* OpeningSquareBracket */) {
this._state = 29 /* InCdata */;
this.sectionStart = this._index + 1;
}
else {
this._state = 16 /* InDeclaration */;
this.stateInDeclaration(c);
}
};
Tokenizer.prototype.stateInCdata = function (c) {
if (c === 93 /* ClosingSquareBracket */)
this._state = 30 /* AfterCdata1 */;
};
Tokenizer.prototype.stateAfterCdata1 = function (c) {
if (c === 93 /* ClosingSquareBracket */)
this._state = 31 /* AfterCdata2 */;
else
this._state = 29 /* InCdata */;
};
Tokenizer.prototype.stateAfterCdata2 = function (c) {
if (c === 62 /* Gt */) {
// Remove 2 trailing chars
this.cbs.oncdata(this.buffer.substring(this.sectionStart, this._index - 2));
this._state = 1 /* Text */;
this.sectionStart = this._index + 1;
}
else if (c !== 93 /* ClosingSquareBracket */) {
this._state = 29 /* InCdata */;
}
// Else: stay in AFTER_CDATA_2 (`]]]>`)
};
Tokenizer.prototype.stateBeforeSpecialS = function (c) {
if (c === 99 /* LowerC */ || c === 67 /* UpperC */) {
this._state = 34 /* BeforeScript1 */;
var lower = c | 0x20;
if (lower === Sequences.ScriptEnd[3]) {
this.startSpecial(Sequences.ScriptEnd, 4);
}
else if (c === 116 /* LowerT */ || c === 84 /* UpperT */) {
this._state = 44 /* BeforeStyle1 */;
else if (lower === Sequences.StyleEnd[3]) {
this.startSpecial(Sequences.StyleEnd, 4);
}

@@ -504,52 +539,21 @@ else {

};
Tokenizer.prototype.stateBeforeSpecialSEnd = function (c) {
if (this.special === 2 /* Script */ &&
(c === 99 /* LowerC */ || c === 67 /* UpperC */)) {
this._state = 39 /* AfterScript1 */;
}
else if (this.special === 3 /* Style */ &&
(c === 116 /* LowerT */ || c === 84 /* UpperT */)) {
this._state = 48 /* AfterStyle1 */;
}
else
this._state = 1 /* Text */;
};
Tokenizer.prototype.stateBeforeSpecialLast = function (c, special) {
if (c === 47 /* Slash */ || c === 62 /* Gt */ || whitespace(c)) {
this.special = special;
}
this._state = 3 /* InTagName */;
this.stateInTagName(c); // Consume the token again
};
Tokenizer.prototype.stateAfterSpecialLast = function (c, sectionStartOffset) {
if (c === 62 /* Gt */ || whitespace(c)) {
this.sectionStart = this._index - sectionStartOffset;
this.special = 1 /* None */;
this._state = 6 /* InClosingTagName */;
this.stateInClosingTagName(c); // Reconsume the token
}
else
this._state = 1 /* Text */;
};
Tokenizer.prototype.stateBeforeEntity = function (c) {
// Start excess with 1 to include the '&'
this.entityExcess = 1;
if (c === 35 /* Num */) {
this._state = 63 /* BeforeNumericEntity */;
this._state = 26 /* BeforeNumericEntity */;
}
else if (c === 38 /* Amp */) {
// We have two `&` characters in a row. Emit the first one.
this.emitPartial(this.getSection());
this.sectionStart = this._index;
// We have two `&` characters in a row. Stay in the current state.
}
else {
this._state = 64 /* InNamedEntity */;
this.trieIndex = 0;
this.trieCurrent = this.entityTrie[0];
this.trieResult = null;
// Start excess with 1 to include the '&'
this.trieExcess = 1;
this._index--;
this._state = 27 /* InNamedEntity */;
this.stateInNamedEntity(c);
}
};
Tokenizer.prototype.stateInNamedEntity = function (c) {
this.trieExcess += 1;
this.entityExcess += 1;
this.trieIndex = (0, decode_1.determineBranch)(this.entityTrie, this.trieCurrent, this.trieIndex + 1, c);

@@ -570,2 +574,7 @@ if (this.trieIndex < 0) {

else {
// Add 1 as we have already incremented the excess
var entityStart = this._index - this.entityExcess + 1;
if (entityStart > this.sectionStart) {
this.emitPartial(this.buffer.substring(this.sectionStart, entityStart));
}
// If this is a surrogate pair, combine the higher bits from the node with the next byte

@@ -576,3 +585,4 @@ this.trieResult =

: String.fromCharCode(this.entityTrie[++this.trieIndex]);
this.trieExcess = 0;
this.entityExcess = 0;
this.sectionStart = this._index + 1;
}

@@ -585,10 +595,24 @@ }

}
this.sectionStart = this._index - this.trieExcess + 1;
this._state = this.baseState;
};
/**
 * Decides between a hexadecimal and a decimal numeric entity.
 *
 * An `x` or `X` selects the `InHexEntity` state and is counted in
 * `entityExcess`. Any other character switches to `InNumericEntity` and is
 * handed to `stateInNumericEntity` straight away.
 *
 * @param c Character code at the current index.
 */
Tokenizer.prototype.stateBeforeNumericEntity = function (c) {
    var isHexMarker = (c | 0x20) === 120 /* LowerX */;
    if (!isHexMarker) {
        this._state = 28 /* InNumericEntity */;
        this.stateInNumericEntity(c);
        return;
    }
    this.entityExcess++;
    this._state = 29 /* InHexEntity */;
};
Tokenizer.prototype.decodeNumericEntity = function (base, strict) {
var sectionStart = this.sectionStart + 2 + (base >> 4);
if (sectionStart !== this._index) {
var entityStart = this._index - this.entityExcess - 1;
var numberStart = entityStart + 2 + (base >> 4);
if (numberStart !== this._index) {
// Emit leading data if any
if (entityStart > this.sectionStart) {
this.emitPartial(this.buffer.substring(this.sectionStart, entityStart));
}
// Parse entity
var entity = this.buffer.substring(sectionStart, this._index);
var entity = this.buffer.substring(numberStart, this._index);
var parsed = parseInt(entity, base);

@@ -604,3 +628,3 @@ this.emitPartial((0, decode_codepoint_1.default)(parsed));

}
else if (c < 48 /* Zero */ || c > 57 /* Nine */) {
else if (!isNumber(c)) {
if (this.allowLegacyEntity()) {

@@ -614,2 +638,5 @@ this.decodeNumericEntity(10, false);

}
else {
this.entityExcess++;
}
};

@@ -622,3 +649,3 @@ Tokenizer.prototype.stateInHexEntity = function (c) {

(c < 65 /* UpperA */ || c > 70 /* UpperF */) &&
(c < 48 /* Zero */ || c > 57 /* Nine */)) {
!isNumber(c)) {
if (this.allowLegacyEntity()) {

@@ -632,5 +659,10 @@ this.decodeNumericEntity(16, false);

}
else {
this.entityExcess++;
}
};
Tokenizer.prototype.allowLegacyEntity = function () {
return !this.xmlMode && this.baseState === 1 /* Text */;
return (!this.xmlMode &&
(this.baseState === 1 /* Text */ ||
this.baseState === 24 /* InSpecialTag */));
};

@@ -643,4 +675,6 @@ /**

if (this.running &&
this._state === 1 /* Text */ &&
this.sectionStart !== this._index) {
this.sectionStart !== this._index &&
(this._state === 1 /* Text */ ||
(this._state === 24 /* InSpecialTag */ &&
this.sequenceIndex === 0))) {
// TODO: We could emit attribute data here as well.

@@ -659,2 +693,5 @@ this.cbs.ontext(this.buffer.substr(this.sectionStart));

};
/**
 * Reports whether there is more to process right now.
 *
 * @returns `false` when the index is not below the buffer length; otherwise
 *     the value of `running`.
 */
Tokenizer.prototype.shouldContinue = function () {
    var hasUnreadInput = this._index < this.buffer.length;
    if (!hasUnreadInput) {
        return false;
    }
    return this.running;
};
/**

@@ -666,3 +703,3 @@ * Iterates through the buffer, calling the function corresponding to the current state.

Tokenizer.prototype.parse = function () {
while (this._index < this.buffer.length && this.running) {
while (this.shouldContinue()) {
var c = this.buffer.charCodeAt(this._index);

@@ -672,2 +709,11 @@ if (this._state === 1 /* Text */) {

}
else if (this._state === 23 /* SpecialStartSequence */) {
this.stateSpecialStartSequence(c);
}
else if (this._state === 24 /* InSpecialTag */) {
this.stateInSpecialTag(c);
}
else if (this._state === 19 /* CDATASequence */) {
this.stateCDATASequence(c);
}
else if (this._state === 12 /* InAttributeValueDq */) {

@@ -679,4 +725,4 @@ this.stateInAttributeValueDoubleQuotes(c);

}
else if (this._state === 19 /* InComment */) {
this.stateInComment(c);
else if (this._state === 21 /* InCommentLike */) {
this.stateInCommentLike(c);
}

@@ -713,8 +759,5 @@ else if (this._state === 20 /* InSpecialComment */) {

}
else if (this._state === 32 /* BeforeSpecialS */) {
else if (this._state === 22 /* BeforeSpecialS */) {
this.stateBeforeSpecialS(c);
}
else if (this._state === 21 /* AfterComment1 */) {
this.stateAfterComment1(c);
}
else if (this._state === 14 /* InAttributeValueNq */) {

@@ -732,135 +775,18 @@ this.stateInAttributeValueNoQuotes(c);

}
else if (this._state === 22 /* AfterComment2 */) {
this.stateAfterComment2(c);
}
else if (this._state === 18 /* BeforeComment */) {
this.stateBeforeComment(c);
}
else if (this._state === 33 /* BeforeSpecialSEnd */) {
this.stateBeforeSpecialSEnd(c);
}
else if (this._state === 53 /* BeforeSpecialTEnd */) {
stateBeforeSpecialTEnd(this, c);
}
else if (this._state === 39 /* AfterScript1 */) {
stateAfterScript1(this, c);
}
else if (this._state === 40 /* AfterScript2 */) {
stateAfterScript2(this, c);
}
else if (this._state === 41 /* AfterScript3 */) {
stateAfterScript3(this, c);
}
else if (this._state === 34 /* BeforeScript1 */) {
stateBeforeScript1(this, c);
}
else if (this._state === 35 /* BeforeScript2 */) {
stateBeforeScript2(this, c);
}
else if (this._state === 36 /* BeforeScript3 */) {
stateBeforeScript3(this, c);
}
else if (this._state === 37 /* BeforeScript4 */) {
stateBeforeScript4(this, c);
}
else if (this._state === 38 /* BeforeScript5 */) {
this.stateBeforeSpecialLast(c, 2 /* Script */);
}
else if (this._state === 42 /* AfterScript4 */) {
stateAfterScript4(this, c);
}
else if (this._state === 43 /* AfterScript5 */) {
this.stateAfterSpecialLast(c, 6);
}
else if (this._state === 44 /* BeforeStyle1 */) {
stateBeforeStyle1(this, c);
}
else if (this._state === 29 /* InCdata */) {
this.stateInCdata(c);
}
else if (this._state === 45 /* BeforeStyle2 */) {
stateBeforeStyle2(this, c);
}
else if (this._state === 46 /* BeforeStyle3 */) {
stateBeforeStyle3(this, c);
}
else if (this._state === 47 /* BeforeStyle4 */) {
this.stateBeforeSpecialLast(c, 3 /* Style */);
}
else if (this._state === 48 /* AfterStyle1 */) {
stateAfterStyle1(this, c);
}
else if (this._state === 49 /* AfterStyle2 */) {
stateAfterStyle2(this, c);
}
else if (this._state === 50 /* AfterStyle3 */) {
stateAfterStyle3(this, c);
}
else if (this._state === 51 /* AfterStyle4 */) {
this.stateAfterSpecialLast(c, 5);
}
else if (this._state === 52 /* BeforeSpecialT */) {
stateBeforeSpecialT(this, c);
}
else if (this._state === 54 /* BeforeTitle1 */) {
stateBeforeTitle1(this, c);
}
else if (this._state === 55 /* BeforeTitle2 */) {
stateBeforeTitle2(this, c);
}
else if (this._state === 56 /* BeforeTitle3 */) {
stateBeforeTitle3(this, c);
}
else if (this._state === 57 /* BeforeTitle4 */) {
this.stateBeforeSpecialLast(c, 4 /* Title */);
}
else if (this._state === 58 /* AfterTitle1 */) {
stateAfterTitle1(this, c);
}
else if (this._state === 59 /* AfterTitle2 */) {
stateAfterTitle2(this, c);
}
else if (this._state === 60 /* AfterTitle3 */) {
stateAfterTitle3(this, c);
}
else if (this._state === 61 /* AfterTitle4 */) {
this.stateAfterSpecialLast(c, 5);
}
else if (this._state === 17 /* InProcessingInstruction */) {
this.stateInProcessingInstruction(c);
}
else if (this._state === 64 /* InNamedEntity */) {
else if (this._state === 27 /* InNamedEntity */) {
this.stateInNamedEntity(c);
}
else if (this._state === 23 /* BeforeCdata1 */) {
stateBeforeCdata1(this, c);
}
else if (this._state === 62 /* BeforeEntity */) {
else if (this._state === 25 /* BeforeEntity */) {
this.stateBeforeEntity(c);
}
else if (this._state === 24 /* BeforeCdata2 */) {
stateBeforeCdata2(this, c);
}
else if (this._state === 25 /* BeforeCdata3 */) {
stateBeforeCdata3(this, c);
}
else if (this._state === 30 /* AfterCdata1 */) {
this.stateAfterCdata1(c);
}
else if (this._state === 31 /* AfterCdata2 */) {
this.stateAfterCdata2(c);
}
else if (this._state === 26 /* BeforeCdata4 */) {
stateBeforeCdata4(this, c);
}
else if (this._state === 27 /* BeforeCdata5 */) {
stateBeforeCdata5(this, c);
}
else if (this._state === 28 /* BeforeCdata6 */) {
this.stateBeforeCdata6(c);
}
else if (this._state === 66 /* InHexEntity */) {
else if (this._state === 29 /* InHexEntity */) {
this.stateInHexEntity(c);
}
else if (this._state === 65 /* InNumericEntity */) {
else if (this._state === 28 /* InNumericEntity */) {
this.stateInNumericEntity(c);

@@ -870,3 +796,3 @@ }

// `this._state === State.BeforeNumericEntity`
stateBeforeNumericEntity(this, c);
this.stateBeforeNumericEntity(c);
}

@@ -878,2 +804,5 @@ this._index++;

Tokenizer.prototype.finish = function () {
if (this._state === 27 /* InNamedEntity */) {
this.emitNamedEntity();
}
// If there is remaining data, emit it in a reasonable way

@@ -888,26 +817,17 @@ if (this.sectionStart < this._index) {

var data = this.buffer.substr(this.sectionStart);
if (this._state === 29 /* InCdata */ ||
this._state === 30 /* AfterCdata1 */ ||
this._state === 31 /* AfterCdata2 */) {
this.cbs.oncdata(data);
}
else if (this._state === 19 /* InComment */ ||
this._state === 21 /* AfterComment1 */ ||
this._state === 22 /* AfterComment2 */) {
this.cbs.oncomment(data);
}
else if (this._state === 64 /* InNamedEntity */ && !this.xmlMode) {
// Increase excess for EOF
this.trieExcess++;
this.emitNamedEntity();
if (this.sectionStart < this._index) {
this._state = this.baseState;
this.handleTrailingData();
if (this._state === 21 /* InCommentLike */) {
if (this.currentSequence === Sequences.CdataEnd) {
this.cbs.oncdata(data);
}
else {
this.cbs.oncomment(data);
}
}
else if (this._state === 65 /* InNumericEntity */ && !this.xmlMode) {
else if (this._state === 28 /* InNumericEntity */ &&
this.allowLegacyEntity()) {
this.decodeNumericEntity(10, false);
// All trailing data will have been consumed
}
else if (this._state === 66 /* InHexEntity */ && !this.xmlMode) {
else if (this._state === 29 /* InHexEntity */ &&
this.allowLegacyEntity()) {
this.decodeNumericEntity(16, false);

@@ -938,3 +858,4 @@ // All trailing data will have been consumed

Tokenizer.prototype.emitPartial = function (value) {
if (this.baseState !== 1 /* Text */) {
if (this.baseState !== 1 /* Text */ &&
this.baseState !== 24 /* InSpecialTag */) {
this.cbs.onattribdata(value);

@@ -941,0 +862,0 @@ }

{
"name": "htmlparser2",
"description": "Fast & forgiving HTML/XML parser",
"version": "7.1.2",
"version": "7.2.0",
"author": "Felix Boehm <me@feedic.com>",

@@ -57,12 +57,12 @@ "funding": [

"devDependencies": {
"@types/jest": "^27.0.1",
"@types/node": "^16.9.1",
"@typescript-eslint/eslint-plugin": "^4.31.0",
"@typescript-eslint/parser": "^4.31.0",
"eslint": "^7.32.0",
"@types/jest": "^27.0.2",
"@types/node": "^16.11.7",
"@typescript-eslint/eslint-plugin": "^5.3.1",
"@typescript-eslint/parser": "^5.3.1",
"eslint": "^8.2.0",
"eslint-config-prettier": "^8.1.0",
"jest": "^27.1.1",
"prettier": "^2.4.0",
"ts-jest": "^27.0.5",
"typescript": "^4.4.2"
"jest": "^27.3.1",
"prettier": "^2.4.1",
"ts-jest": "^27.0.7",
"typescript": "^4.4.4"
},

@@ -69,0 +69,0 @@ "jest": {

@@ -110,3 +110,3 @@ # htmlparser2

const dom = htmlparser2.parseDocument();
const dom = htmlparser2.parseDocument(htmlString);
```

@@ -113,0 +113,0 @@

Sorry, the diff of this file is not supported yet

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc