htmlparser2 - npm Package Compare versions

Comparing version 7.1.2 to 7.2.0

100

lib/Tokenizer.d.ts

		@@ -21,50 +21,13 @@ /** All the states the tokenizer can be in. */
		BeforeComment = 18,
		InComment = 19,
		CDATASequence = 19,
		InSpecialComment = 20,
		AfterComment1 = 21,
		AfterComment2 = 22,
		BeforeCdata1 = 23,
		BeforeCdata2 = 24,
		BeforeCdata3 = 25,
		BeforeCdata4 = 26,
		BeforeCdata5 = 27,
		BeforeCdata6 = 28,
		InCdata = 29,
		AfterCdata1 = 30,
		AfterCdata2 = 31,
		BeforeSpecialS = 32,
		BeforeSpecialSEnd = 33,
		BeforeScript1 = 34,
		BeforeScript2 = 35,
		BeforeScript3 = 36,
		BeforeScript4 = 37,
		BeforeScript5 = 38,
		AfterScript1 = 39,
		AfterScript2 = 40,
		AfterScript3 = 41,
		AfterScript4 = 42,
		AfterScript5 = 43,
		BeforeStyle1 = 44,
		BeforeStyle2 = 45,
		BeforeStyle3 = 46,
		BeforeStyle4 = 47,
		AfterStyle1 = 48,
		AfterStyle2 = 49,
		AfterStyle3 = 50,
		AfterStyle4 = 51,
		BeforeSpecialT = 52,
		BeforeSpecialTEnd = 53,
		BeforeTitle1 = 54,
		BeforeTitle2 = 55,
		BeforeTitle3 = 56,
		BeforeTitle4 = 57,
		AfterTitle1 = 58,
		AfterTitle2 = 59,
		AfterTitle3 = 60,
		AfterTitle4 = 61,
		BeforeEntity = 62,
		BeforeNumericEntity = 63,
		InNamedEntity = 64,
		InNumericEntity = 65,
		InHexEntity = 66
		InCommentLike = 21,
		BeforeSpecialS = 22,
		SpecialStartSequence = 23,
		InSpecialTag = 24,
		BeforeEntity = 25,
		BeforeNumericEntity = 26,
		InNamedEntity = 27,
		InNumericEntity = 28,
		InHexEntity = 29
		}
		@@ -90,3 +53,3 @@ export interface Callbacks {
		/** The current state the tokenizer is in. */
		_state: State;
		private _state;
		/** The read buffer. */
		@@ -97,3 +60,3 @@ private buffer;
		/** The index within the buffer that we are currently looking at. */
		_index: number;
		private _index;
		/**
		@@ -107,3 +70,3 @@ * Data that has already been processed will be removed from the buffer occasionally.
		/** For special parsing behavior inside of script and style tags. */
		private special;
		private isSpecial;
		/** Indicates whether the tokenizer has been paused. */
		@@ -134,3 +97,25 @@ private running;
		private stateText;
		private currentSequence;
		private sequenceIndex;
		private stateSpecialStartSequence;
		/** Look for an end tag. For <title> tags, also decode entities. */
		private stateInSpecialTag;
		private stateCDATASequence;
		/**
		* When we wait for one specific character, we can speed things up
		* by skipping through the buffer until we find it.
		*
		* @returns Whether the character was found.
		*/
		private fastForwardTo;
		/**
		* Comments and CDATA end with `-->` and `]]>`.
		*
		* Their common qualities are:
		* - Their end sequences have a distinct character they start with.
		* - That character is then repeated, so we have to check multiple repeats.
		* - All characters but the start character of the sequence can be skipped.
		*/
		private stateInCommentLike;
		/**
		* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
		@@ -142,2 +127,3 @@ *
		private isTagStartChar;
		private startSpecial;
		private stateBeforeTagName;
		@@ -161,21 +147,12 @@ private stateInTagName;
		private stateBeforeComment;
		private stateInComment;
		private stateInSpecialComment;
		private stateAfterComment1;
		private stateAfterComment2;
		private stateBeforeCdata6;
		private stateInCdata;
		private stateAfterCdata1;
		private stateAfterCdata2;
		private stateBeforeSpecialS;
		private stateBeforeSpecialSEnd;
		private stateBeforeSpecialLast;
		private stateAfterSpecialLast;
		private trieIndex;
		private trieCurrent;
		private trieResult;
		private trieExcess;
		private entityExcess;
		private stateBeforeEntity;
		private stateInNamedEntity;
		private emitNamedEntity;
		private stateBeforeNumericEntity;
		private decodeNumericEntity;
		@@ -189,2 +166,3 @@ private stateInNumericEntity;
		private cleanup;
		private shouldContinue;
		/**
		@@ -191,0 +169,0 @@ * Iterates through the buffer, calling the function corresponding to the current state.

709

lib/Tokenizer.js

		@@ -8,3 +8,3 @@ "use strict";
		var decode_1 = require("entities/lib/decode");
		function whitespace(c) {
		function isWhitespace(c) {
		return (c === 32 /* Space */ \|\|
		@@ -16,2 +16,8 @@ c === 10 /* NewLine */ \|\|
		}
		/**
		 * Tells whether a character code ends a tag-name or attribute-name section.
		 *
		 * @param {number} c Character code to test.
		 * @returns {boolean} `true` for `/` (47), `>` (62), or any code that
		 *     `isWhitespace` accepts.
		 */
		function isEndOfTagSection(c) {
		    return c === 47 /* Slash */ || c === 62 /* Gt */ || isWhitespace(c);
		}
		/**
		 * Tells whether a character code is an ASCII decimal digit.
		 *
		 * @param {number} c Character code to test.
		 * @returns {boolean} `true` when `c` lies in 48 (`0`) through 57 (`9`).
		 */
		function isNumber(c) {
		    return c >= 48 /* Zero */ && c <= 57 /* Nine */;
		}
		function isASCIIAlpha(c) {
		@@ -21,43 +27,18 @@ return ((c >= 97 /* LowerA */ && c <= 122 /* LowerZ */) ||
		}
		/**
		 * Builds a state handler that checks one character case-insensitively.
		 *
		 * @param {string} upper Expected letter; its first character is compared in
		 *     both the given form and its lower-case form.
		 * @param {number} SUCCESS State stored in `t._state` when the character matches.
		 * @param {number} FAILURE State stored in `t._state` when it does not.
		 * @returns {function(Object, number): void} Handler taking the tokenizer `t`
		 *     and the current character code `c`.
		 */
		function ifElseState(upper, SUCCESS, FAILURE) {
		    var upperCode = upper.charCodeAt(0);
		    var lowerCode = upper.toLowerCase().charCodeAt(0);
		    return function (t, c) {
		        if (c === lowerCode || c === upperCode) {
		            t._state = SUCCESS;
		        }
		        else {
		            t._state = FAILURE;
		            // Step back one position; presumably so the FAILURE state sees
		            // this same character again once the caller advances the index.
		            t._index--;
		        }
		    };
		}
		// Single-letter matching states generated with `ifElseState`. Each call passes
		// the expected letter, the state to enter on a match, and the fallback state
		// on a mismatch; the trailing comments name the numeric state values.
		var stateBeforeCdata1 = ifElseState("C", 24 /* BeforeCdata2 */, 16 /* InDeclaration */);
		var stateBeforeCdata2 = ifElseState("D", 25 /* BeforeCdata3 */, 16 /* InDeclaration */);
		var stateBeforeCdata3 = ifElseState("A", 26 /* BeforeCdata4 */, 16 /* InDeclaration */);
		var stateBeforeCdata4 = ifElseState("T", 27 /* BeforeCdata5 */, 16 /* InDeclaration */);
		var stateBeforeCdata5 = ifElseState("A", 28 /* BeforeCdata6 */, 16 /* InDeclaration */);
		var stateBeforeScript1 = ifElseState("R", 35 /* BeforeScript2 */, 3 /* InTagName */);
		var stateBeforeScript2 = ifElseState("I", 36 /* BeforeScript3 */, 3 /* InTagName */);
		var stateBeforeScript3 = ifElseState("P", 37 /* BeforeScript4 */, 3 /* InTagName */);
		var stateBeforeScript4 = ifElseState("T", 38 /* BeforeScript5 */, 3 /* InTagName */);
		var stateAfterScript1 = ifElseState("R", 40 /* AfterScript2 */, 1 /* Text */);
		var stateAfterScript2 = ifElseState("I", 41 /* AfterScript3 */, 1 /* Text */);
		var stateAfterScript3 = ifElseState("P", 42 /* AfterScript4 */, 1 /* Text */);
		var stateAfterScript4 = ifElseState("T", 43 /* AfterScript5 */, 1 /* Text */);
		var stateBeforeStyle1 = ifElseState("Y", 45 /* BeforeStyle2 */, 3 /* InTagName */);
		var stateBeforeStyle2 = ifElseState("L", 46 /* BeforeStyle3 */, 3 /* InTagName */);
		var stateBeforeStyle3 = ifElseState("E", 47 /* BeforeStyle4 */, 3 /* InTagName */);
		var stateAfterStyle1 = ifElseState("Y", 49 /* AfterStyle2 */, 1 /* Text */);
		var stateAfterStyle2 = ifElseState("L", 50 /* AfterStyle3 */, 1 /* Text */);
		var stateAfterStyle3 = ifElseState("E", 51 /* AfterStyle4 */, 1 /* Text */);
		var stateBeforeSpecialT = ifElseState("I", 54 /* BeforeTitle1 */, 3 /* InTagName */);
		var stateBeforeTitle1 = ifElseState("T", 55 /* BeforeTitle2 */, 3 /* InTagName */);
		var stateBeforeTitle2 = ifElseState("L", 56 /* BeforeTitle3 */, 3 /* InTagName */);
		var stateBeforeTitle3 = ifElseState("E", 57 /* BeforeTitle4 */, 3 /* InTagName */);
		var stateBeforeSpecialTEnd = ifElseState("I", 58 /* AfterTitle1 */, 1 /* Text */);
		var stateAfterTitle1 = ifElseState("T", 59 /* AfterTitle2 */, 1 /* Text */);
		var stateAfterTitle2 = ifElseState("L", 60 /* AfterTitle3 */, 1 /* Text */);
		var stateAfterTitle3 = ifElseState("E", 61 /* AfterTitle4 */, 1 /* Text */);
		var stateBeforeNumericEntity = ifElseState("X", 66 /* InHexEntity */, 65 /* InNumericEntity */);
		/**
		 * Character-code sequences the tokenizer matches one code unit at a time.
		 *
		 * There are no separate entries for the opening `script`, `style`, or `title`
		 * names; the *End sequences are re-used for those with a larger start offset.
		 */
		var Sequences = (function () {
		    // Turns a literal into the Uint16Array of its UTF-16 code units.
		    function toCodeUnits(text) {
		        var units = new Uint16Array(text.length);
		        for (var i = 0; i < text.length; i++) {
		            units[i] = text.charCodeAt(i);
		        }
		        return units;
		    }
		    return {
		        Cdata: toCodeUnits("CDATA["),
		        CdataEnd: toCodeUnits("]]>"),
		        CommentEnd: toCodeUnits("-->"),
		        ScriptEnd: toCodeUnits("</script"),
		        StyleEnd: toCodeUnits("</style"),
		        TitleEnd: toCodeUnits("</title"),
		    };
		})();
		var Tokenizer = /** @class */ (function () {
		@@ -83,3 +64,3 @@ function Tokenizer(_a, cbs) {
		/** For special parsing behavior inside of script and style tags. */
		this.special = 1 /* None */;
		this.isSpecial = false;
		/** Indicates whether the tokenizer has been paused. */
		@@ -89,6 +70,7 @@ this.running = true;
		this.ended = false;
		this.sequenceIndex = 0;
		this.trieIndex = 0;
		this.trieCurrent = 0;
		this.trieResult = null;
		this.trieExcess = 0;
		this.entityExcess = 0;
		this.xmlMode = xmlMode;
		@@ -105,3 +87,3 @@ this.decodeEntities = decodeEntities;
		this.baseState = 1 /* Text */;
		this.special = 1 /* None */;
		this.currentSequence = undefined;
		this.running = true;
		@@ -113,6 +95,3 @@ this.ended = false;
		return this.cbs.onerror(Error(".write() after done!"));
		if (this.buffer.length)
		this.buffer += chunk;
		else
		this.buffer = chunk;
		this.buffer += chunk;
		this.parse();
		@@ -154,3 +133,4 @@ };
		Tokenizer.prototype.stateText = function (c) {
		if (c === 60 /* Lt */) {
		if (c === 60 /* Lt */ \|\|
		(!this.decodeEntities && this.fastForwardTo(60 /* Lt */))) {
		if (this._index > this.sectionStart) {
		@@ -162,14 +142,135 @@ this.cbs.ontext(this.getSection());
		}
		else if (this.decodeEntities &&
		c === 38 /* Amp */ &&
		(this.special === 1 /* None */ || this.special === 4 /* Title */)) {
		if (this._index > this.sectionStart) {
		this.cbs.ontext(this.getSection());
		else if (this.decodeEntities && c === 38 /* Amp */) {
		this._state = 25 /* BeforeEntity */;
		}
		};
		/**
		 * Matches the rest of a special tag's name against `currentSequence`.
		 *
		 * While characters keep matching, only `sequenceIndex` advances. On a mismatch
		 * `isSpecial` is cleared. On a mismatch, or once the whole sequence has been
		 * matched and checked for a terminator, the character is handed to
		 * `stateInTagName` and the state becomes InTagName.
		 *
		 * @param {number} c Current character code.
		 */
		Tokenizer.prototype.stateSpecialStartSequence = function (c) {
		    var isEnd = this.sequenceIndex === this.currentSequence.length;
		    var isMatch = isEnd
		        ? // If we are at the end of the sequence, make sure the tag name has ended
		            isEndOfTagSection(c)
		        : // Otherwise, do a case-insensitive comparison
		            (c | 0x20) === this.currentSequence[this.sequenceIndex];
		    if (!isMatch) {
		        this.isSpecial = false;
		    }
		    else if (!isEnd) {
		        this.sequenceIndex++;
		        return;
		    }
		    this.sequenceIndex = 0;
		    this._state = 3 /* InTagName */;
		    // Let the regular tag-name state process this same character.
		    this.stateInTagName(c);
		};
		/** Look for an end tag. For <title> tags, also decode entities. */
		Tokenizer.prototype.stateInSpecialTag = function (c) {
		if (this.sequenceIndex === this.currentSequence.length) {
		if (c === 62 /* Gt */ \|\| isWhitespace(c)) {
		var endOfText = this._index - this.currentSequence.length;
		if (this.sectionStart < endOfText) {
		// Spoof the index so that reported locations match up.
		var actualIndex = this._index;
		this._index = endOfText;
		this.cbs.ontext(this.getSection());
		this._index = actualIndex;
		}
		this.isSpecial = false;
		this.sectionStart = endOfText + 2; // Skip over the `</`
		this.stateInClosingTagName(c);
		return; // We are done; skip the rest of the function.
		}
		this.baseState = 1 /* Text */;
		this._state = 62 /* BeforeEntity */;
		this.sectionStart = this._index;
		this.sequenceIndex = 0;
		}
		if ((c \| 0x20) === this.currentSequence[this.sequenceIndex]) {
		this.sequenceIndex += 1;
		}
		else if (this.sequenceIndex === 0) {
		if (this.currentSequence === Sequences.TitleEnd) {
		// We have to parse entities in <title> tags.
		if (this.decodeEntities && c === 38 /* Amp */) {
		this._state = 25 /* BeforeEntity */;
		}
		}
		else if (this.fastForwardTo(60 /* Lt */)) {
		// Outside of <title> tags, we can fast-forward.
		this.sequenceIndex = 1;
		}
		}
		else {
		// If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
		this.sequenceIndex = Number(c === 60 /* Lt */);
		}
		};
		/**
		 * Matches the characters of `Sequences.Cdata` one at a time.
		 *
		 * A mismatch falls back to the declaration state and replays the character
		 * there. A full match switches to InCommentLike with `Sequences.CdataEnd` as
		 * the terminator to look for.
		 *
		 * @param {number} c Current character code.
		 */
		Tokenizer.prototype.stateCDATASequence = function (c) {
		    var opener = Sequences.Cdata;
		    if (c !== opener[this.sequenceIndex]) {
		        this.sequenceIndex = 0;
		        this._state = 16 /* InDeclaration */;
		        this.stateInDeclaration(c); // Reconsume the character
		        return;
		    }
		    this.sequenceIndex += 1;
		    if (this.sequenceIndex !== opener.length) {
		        return;
		    }
		    // Whole opener seen: the section starts after the current character.
		    this._state = 21 /* InCommentLike */;
		    this.currentSequence = Sequences.CdataEnd;
		    this.sequenceIndex = 0;
		    this.sectionStart = this._index + 1;
		};
		/**
		 * Skips ahead in the buffer to the next occurrence of one character code,
		 * starting at the position after the current index.
		 *
		 * @param {number} c Character code to look for.
		 * @returns {boolean} Whether the character was found. When found, `_index`
		 *     points at it; otherwise `_index` is left at `buffer.length - 1`.
		 */
		Tokenizer.prototype.fastForwardTo = function (c) {
		    var end = this.buffer.length;
		    for (var cursor = this._index + 1; cursor < end; cursor++) {
		        if (this.buffer.charCodeAt(cursor) === c) {
		            this._index = cursor;
		            return true;
		        }
		    }
		    /*
		     * The `parse` loop increments the index after each state call,
		     * so park it on the last position of the buffer.
		     *
		     * TODO: Refactor `parse` to increment index before calling states.
		     */
		    this._index = end - 1;
		    return false;
		};
		/**
		* Comments and CDATA end with `-->` and `]]>`.
		*
		* Their common qualities are:
		* - Their end sequences have a distinct character they start with.
		* - That character is then repeated, so we have to check multiple repeats.
		* - All characters but the start character of the sequence can be skipped.
		*
		* Matches `currentSequence` one character at a time, tracking progress in
		* `sequenceIndex`. When the whole sequence has been seen, the section is
		* reported through `oncdata` or `oncomment` and the state returns to Text.
		*
		* @param c Current character code.
		*/
		Tokenizer.prototype.stateInCommentLike = function (c) {
		// The character is the next one expected in the end sequence.
		if (c === this.currentSequence[this.sequenceIndex]) {
		if (++this.sequenceIndex === this.currentSequence.length) {
		// Remove 2 trailing chars
		// (`_index` is on the last character of the end sequence, so this
		// drops the two sequence characters before it from the section.)
		var section = this.buffer.slice(this.sectionStart, this._index - 2);
		// The active end sequence tells CDATA apart from a comment.
		if (this.currentSequence === Sequences.CdataEnd) {
		this.cbs.oncdata(section);
		}
		else {
		this.cbs.oncomment(section);
		}
		this.sequenceIndex = 0;
		this.sectionStart = this._index + 1;
		this._state = 1 /* Text */;
		}
		}
		else if (this.sequenceIndex === 0) {
		// Fast-forward to the first character of the sequence
		if (this.fastForwardTo(this.currentSequence[0])) {
		// The first character is already consumed by the fast-forward.
		this.sequenceIndex = 1;
		}
		}
		// Mid-sequence mismatch: progress is kept only when the character repeats
		// the previous sequence character.
		else if (c !== this.currentSequence[this.sequenceIndex - 1]) {
		// Allow long sequences, eg. --->, ]]]>
		this.sequenceIndex = 0;
		}
		};
		/**
		* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
		@@ -181,22 +282,12 @@ *
		Tokenizer.prototype.isTagStartChar = function (c) {
		return (isASCIIAlpha(c) \|\|
		(this.xmlMode &&
		!whitespace(c) &&
		c !== 47 /* Slash */ &&
		c !== 62 /* Gt */));
		return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c);
		};
		/**
		 * Begins matching a special tag name.
		 *
		 * @param {Uint16Array} sequence Sequence to match against; stored as `currentSequence`.
		 * @param {number} offset Position in `sequence` at which matching starts.
		 */
		Tokenizer.prototype.startSpecial = function (sequence, offset) {
		    this.currentSequence = sequence;
		    this.sequenceIndex = offset;
		    this.isSpecial = true;
		    this._state = 23 /* SpecialStartSequence */;
		};
		Tokenizer.prototype.stateBeforeTagName = function (c) {
		if (c === 47 /* Slash */) {
		this._state = 5 /* BeforeClosingTagName */;
		}
		else if (c === 60 /* Lt */) {
		this.cbs.ontext(this.getSection());
		this.sectionStart = this._index;
		}
		else if (c === 62 /* Gt */ \|\|
		this.special !== 1 /* None */ \|\|
		whitespace(c)) {
		this._state = 1 /* Text */;
		}
		else if (c === 33 /* ExclamationMark */) {
		if (c === 33 /* ExclamationMark */) {
		this._state = 15 /* BeforeDeclaration */;
		@@ -209,19 +300,25 @@ this.sectionStart = this._index + 1;
		}
		else if (!this.isTagStartChar(c)) {
		this._state = 1 /* Text */;
		else if (this.isTagStartChar(c)) {
		var lower = c \| 0x20;
		this.sectionStart = this._index;
		if (!this.xmlMode && lower === Sequences.TitleEnd[2]) {
		this.startSpecial(Sequences.TitleEnd, 3);
		}
		else {
		this._state =
		!this.xmlMode && lower === Sequences.ScriptEnd[2]
		? 22 /* BeforeSpecialS */
		: 3 /* InTagName */;
		}
		}
		else if (c === 47 /* Slash */) {
		this._state = 5 /* BeforeClosingTagName */;
		}
		else {
		this._state =
		!this.xmlMode &&
		(c === 115 /* LowerS */ || c === 83 /* UpperS */)
		? 32 /* BeforeSpecialS */
		: !this.xmlMode &&
		(c === 116 /* LowerT */ || c === 84 /* UpperT */)
		? 52 /* BeforeSpecialT */
		: 3 /* InTagName */;
		this.sectionStart = this._index;
		this._state = 1 /* Text */;
		this.stateText(c);
		}
		};
		Tokenizer.prototype.stateInTagName = function (c) {
		if (c === 47 /* Slash */ || c === 62 /* Gt */ || whitespace(c)) {
		if (isEndOfTagSection(c)) {
		this.cbs.onopentagname(this.getSection());
		@@ -234,3 +331,3 @@ this.sectionStart = -1;
		Tokenizer.prototype.stateBeforeClosingTagName = function (c) {
		if (whitespace(c)) {
		if (isWhitespace(c)) {
		// Ignore
		@@ -241,22 +338,6 @@ }
		}
		else if (this.special !== 1 /* None */) {
		if (this.special !== 4 /* Title */ &&
		(c === 115 /* LowerS */ || c === 83 /* UpperS */)) {
		this._state = 33 /* BeforeSpecialSEnd */;
		}
		else if (this.special === 4 /* Title */ &&
		(c === 116 /* LowerT */ || c === 84 /* UpperT */)) {
		this._state = 53 /* BeforeSpecialTEnd */;
		}
		else {
		this._state = 1 /* Text */;
		this.stateText(c);
		}
		}
		else if (!this.isTagStartChar(c)) {
		this._state = 20 /* InSpecialComment */;
		this.sectionStart = this._index;
		}
		else {
		this._state = 6 /* InClosingTagName */;
		this._state = this.isTagStartChar(c)
		? 6 /* InClosingTagName */
		: 20 /* InSpecialComment */;
		this.sectionStart = this._index;
		@@ -266,3 +347,3 @@ }
		Tokenizer.prototype.stateInClosingTagName = function (c) {
		if (c === 62 /* Gt */ \|\| whitespace(c)) {
		if (c === 62 /* Gt */ \|\| isWhitespace(c)) {
		this.cbs.onclosetag(this.getSection());
		@@ -276,3 +357,3 @@ this.sectionStart = -1;
		// Skip everything until ">"
		if (c === 62 /* Gt */) {
		if (c === 62 /* Gt */ || this.fastForwardTo(62 /* Gt */)) {
		this._state = 1 /* Text */;
		@@ -285,3 +366,10 @@ this.sectionStart = this._index + 1;
		this.cbs.onopentagend();
		this._state = 1 /* Text */;
		if (this.isSpecial) {
		this._state = 24 /* InSpecialTag */;
		this.sequenceIndex = 0;
		}
		else {
		this._state = 1 /* Text */;
		}
		this.baseState = this._state;
		this.sectionStart = this._index + 1;
		@@ -292,3 +380,3 @@ }
		}
		else if (!whitespace(c)) {
		else if (!isWhitespace(c)) {
		this._state = 9 /* InAttributeName */;
		@@ -302,6 +390,7 @@ this.sectionStart = this._index;
		this._state = 1 /* Text */;
		this.baseState = 1 /* Text */;
		this.sectionStart = this._index + 1;
		this.special = 1 /* None */; // Reset special state, in case of self-closing special tags
		this.isSpecial = false; // Reset special state, in case of self-closing special tags
		}
		else if (!whitespace(c)) {
		else if (!isWhitespace(c)) {
		this._state = 8 /* BeforeAttributeName */;
		@@ -312,6 +401,3 @@ this.stateBeforeAttributeName(c);
		Tokenizer.prototype.stateInAttributeName = function (c) {
		if (c === 61 /* Eq */ \|\|
		c === 47 /* Slash */ \|\|
		c === 62 /* Gt */ \|\|
		whitespace(c)) {
		if (c === 61 /* Eq */ \|\| isEndOfTagSection(c)) {
		this.cbs.onattribname(this.getSection());
		@@ -332,3 +418,3 @@ this.sectionStart = -1;
		}
		else if (!whitespace(c)) {
		else if (!isWhitespace(c)) {
		this.cbs.onattribend(undefined);
		@@ -348,3 +434,3 @@ this._state = 9 /* InAttributeName */;
		}
		else if (!whitespace(c)) {
		else if (!isWhitespace(c)) {
		this.sectionStart = this._index;
		@@ -356,3 +442,4 @@ this._state = 14 /* InAttributeValueNq */;
		Tokenizer.prototype.handleInAttributeValue = function (c, quote) {
		if (c === quote) {
		if (c === quote \|\|
		(!this.decodeEntities && this.fastForwardTo(quote))) {
		this.cbs.onattribdata(this.getSection());
		@@ -364,6 +451,4 @@ this.sectionStart = -1;
		else if (this.decodeEntities && c === 38 /* Amp */) {
		this.cbs.onattribdata(this.getSection());
		this.baseState = this._state;
		this._state = 62 /* BeforeEntity */;
		this.sectionStart = this._index;
		this._state = 25 /* BeforeEntity */;
		}
		@@ -378,3 +463,3 @@ };
		Tokenizer.prototype.stateInAttributeValueNoQuotes = function (c) {
		if (whitespace(c) \|\| c === 62 /* Gt */) {
		if (isWhitespace(c) \|\| c === 62 /* Gt */) {
		this.cbs.onattribdata(this.getSection());
		@@ -387,18 +472,20 @@ this.sectionStart = -1;
		else if (this.decodeEntities && c === 38 /* Amp */) {
		this.cbs.onattribdata(this.getSection());
		this.baseState = this._state;
		this._state = 62 /* BeforeEntity */;
		this.sectionStart = this._index;
		this._state = 25 /* BeforeEntity */;
		}
		};
		Tokenizer.prototype.stateBeforeDeclaration = function (c) {
		this._state =
		c === 91 /* OpeningSquareBracket */
		? 23 /* BeforeCdata1 */
		: c === 45 /* Dash */
		if (c === 91 /* OpeningSquareBracket */) {
		this._state = 19 /* CDATASequence */;
		this.sequenceIndex = 0;
		}
		else {
		this._state =
		c === 45 /* Dash */
		? 18 /* BeforeComment */
		: 16 /* InDeclaration */;
		}
		};
		Tokenizer.prototype.stateInDeclaration = function (c) {
		if (c === 62 /* Gt */) {
		if (c === 62 /* Gt */ || this.fastForwardTo(62 /* Gt */)) {
		this.cbs.ondeclaration(this.getSection());
		@@ -410,3 +497,3 @@ this._state = 1 /* Text */;
		Tokenizer.prototype.stateInProcessingInstruction = function (c) {
		if (c === 62 /* Gt */) {
		if (c === 62 /* Gt */ || this.fastForwardTo(62 /* Gt */)) {
		this.cbs.onprocessinginstruction(this.getSection());
		@@ -419,3 +506,6 @@ this._state = 1 /* Text */;
		if (c === 45 /* Dash */) {
		this._state = 19 /* InComment */;
		this._state = 21 /* InCommentLike */;
		this.currentSequence = Sequences.CommentEnd;
		// Allow short comments (eg. <!-->)
		this.sequenceIndex = 2;
		this.sectionStart = this._index + 1;
		@@ -427,9 +517,5 @@ }
		};
		Tokenizer.prototype.stateInComment = function (c) {
		if (c === 45 /* Dash */)
		this._state = 21 /* AfterComment1 */;
		};
		Tokenizer.prototype.stateInSpecialComment = function (c) {
		if (c === 62 /* Gt */) {
		this.cbs.oncomment(this.buffer.substring(this.sectionStart, this._index));
		if (c === 62 /* Gt */ || this.fastForwardTo(62 /* Gt */)) {
		this.cbs.oncomment(this.getSection());
		this._state = 1 /* Text */;
		@@ -439,60 +525,9 @@ this.sectionStart = this._index + 1;
		};
		Tokenizer.prototype.stateAfterComment1 = function (c) {
		if (c === 45 /* Dash */) {
		this._state = 22 /* AfterComment2 */;
		}
		else {
		this._state = 19 /* InComment */;
		}
		};
		Tokenizer.prototype.stateAfterComment2 = function (c) {
		if (c === 62 /* Gt */) {
		// Remove 2 trailing chars
		this.cbs.oncomment(this.buffer.substring(this.sectionStart, this._index - 2));
		this._state = 1 /* Text */;
		this.sectionStart = this._index + 1;
		}
		else if (c !== 45 /* Dash */) {
		this._state = 19 /* InComment */;
		}
		// Else: stay in AFTER_COMMENT_2 (`--->`)
		};
		Tokenizer.prototype.stateBeforeCdata6 = function (c) {
		if (c === 91 /* OpeningSquareBracket */) {
		this._state = 29 /* InCdata */;
		this.sectionStart = this._index + 1;
		}
		else {
		this._state = 16 /* InDeclaration */;
		this.stateInDeclaration(c);
		}
		};
		Tokenizer.prototype.stateInCdata = function (c) {
		if (c === 93 /* ClosingSquareBracket */)
		this._state = 30 /* AfterCdata1 */;
		};
		Tokenizer.prototype.stateAfterCdata1 = function (c) {
		if (c === 93 /* ClosingSquareBracket */)
		this._state = 31 /* AfterCdata2 */;
		else
		this._state = 29 /* InCdata */;
		};
		Tokenizer.prototype.stateAfterCdata2 = function (c) {
		if (c === 62 /* Gt */) {
		// Remove 2 trailing chars
		this.cbs.oncdata(this.buffer.substring(this.sectionStart, this._index - 2));
		this._state = 1 /* Text */;
		this.sectionStart = this._index + 1;
		}
		else if (c !== 93 /* ClosingSquareBracket */) {
		this._state = 29 /* InCdata */;
		}
		// Else: stay in AFTER_CDATA_2 (`]]]>`)
		};
		Tokenizer.prototype.stateBeforeSpecialS = function (c) {
		if (c === 99 /* LowerC */ || c === 67 /* UpperC */) {
		this._state = 34 /* BeforeScript1 */;
		var lower = c \| 0x20;
		if (lower === Sequences.ScriptEnd[3]) {
		this.startSpecial(Sequences.ScriptEnd, 4);
		}
		else if (c === 116 /* LowerT */ || c === 84 /* UpperT */) {
		this._state = 44 /* BeforeStyle1 */;
		else if (lower === Sequences.StyleEnd[3]) {
		this.startSpecial(Sequences.StyleEnd, 4);
		}
		@@ -504,52 +539,21 @@ else {
		};
		Tokenizer.prototype.stateBeforeSpecialSEnd = function (c) {
		if (this.special === 2 /* Script */ &&
		(c === 99 /* LowerC */ || c === 67 /* UpperC */)) {
		this._state = 39 /* AfterScript1 */;
		}
		else if (this.special === 3 /* Style */ &&
		(c === 116 /* LowerT */ || c === 84 /* UpperT */)) {
		this._state = 48 /* AfterStyle1 */;
		}
		else
		this._state = 1 /* Text */;
		};
		Tokenizer.prototype.stateBeforeSpecialLast = function (c, special) {
		if (c === 47 /* Slash */ || c === 62 /* Gt */ || whitespace(c)) {
		this.special = special;
		}
		this._state = 3 /* InTagName */;
		this.stateInTagName(c); // Consume the token again
		};
		Tokenizer.prototype.stateAfterSpecialLast = function (c, sectionStartOffset) {
		if (c === 62 /* Gt */ \|\| whitespace(c)) {
		this.sectionStart = this._index - sectionStartOffset;
		this.special = 1 /* None */;
		this._state = 6 /* InClosingTagName */;
		this.stateInClosingTagName(c); // Reconsume the token
		}
		else
		this._state = 1 /* Text */;
		};
		Tokenizer.prototype.stateBeforeEntity = function (c) {
		// Start excess with 1 to include the '&'
		this.entityExcess = 1;
		if (c === 35 /* Num */) {
		this._state = 63 /* BeforeNumericEntity */;
		this._state = 26 /* BeforeNumericEntity */;
		}
		else if (c === 38 /* Amp */) {
		// We have two `&` characters in a row. Emit the first one.
		this.emitPartial(this.getSection());
		this.sectionStart = this._index;
		// We have two `&` characters in a row. Stay in the current state.
		}
		else {
		this._state = 64 /* InNamedEntity */;
		this.trieIndex = 0;
		this.trieCurrent = this.entityTrie[0];
		this.trieResult = null;
		// Start excess with 1 to include the '&'
		this.trieExcess = 1;
		this._index--;
		this._state = 27 /* InNamedEntity */;
		this.stateInNamedEntity(c);
		}
		};
		Tokenizer.prototype.stateInNamedEntity = function (c) {
		this.trieExcess += 1;
		this.entityExcess += 1;
		this.trieIndex = (0, decode_1.determineBranch)(this.entityTrie, this.trieCurrent, this.trieIndex + 1, c);
		@@ -570,2 +574,7 @@ if (this.trieIndex < 0) {
		else {
		// Add 1 as we have already incremented the excess
		var entityStart = this._index - this.entityExcess + 1;
		if (entityStart > this.sectionStart) {
		this.emitPartial(this.buffer.substring(this.sectionStart, entityStart));
		}
		// If this is a surrogate pair, combine the higher bits from the node with the next byte
		@@ -576,3 +585,4 @@ this.trieResult =
		: String.fromCharCode(this.entityTrie[++this.trieIndex]);
		this.trieExcess = 0;
		this.entityExcess = 0;
		this.sectionStart = this._index + 1;
		}
		@@ -585,10 +595,24 @@ }
		}
		this.sectionStart = this._index - this.trieExcess + 1;
		this._state = this.baseState;
		};
		/**
		 * Chooses between the hexadecimal and decimal numeric-entity states.
		 *
		 * An `x` or `X` is counted in `entityExcess` and selects InHexEntity. Any
		 * other character selects InNumericEntity and is passed straight to
		 * `stateInNumericEntity`.
		 *
		 * @param {number} c Current character code.
		 */
		Tokenizer.prototype.stateBeforeNumericEntity = function (c) {
		    // OR-ing with 0x20 folds `X` (88) onto `x` (120).
		    if ((c | 0x20) === 120 /* LowerX */) {
		        this.entityExcess++;
		        this._state = 29 /* InHexEntity */;
		    }
		    else {
		        this._state = 28 /* InNumericEntity */;
		        this.stateInNumericEntity(c);
		    }
		};
		Tokenizer.prototype.decodeNumericEntity = function (base, strict) {
		var sectionStart = this.sectionStart + 2 + (base >> 4);
		if (sectionStart !== this._index) {
		var entityStart = this._index - this.entityExcess - 1;
		var numberStart = entityStart + 2 + (base >> 4);
		if (numberStart !== this._index) {
		// Emit leading data if any
		if (entityStart > this.sectionStart) {
		this.emitPartial(this.buffer.substring(this.sectionStart, entityStart));
		}
		// Parse entity
		var entity = this.buffer.substring(sectionStart, this._index);
		var entity = this.buffer.substring(numberStart, this._index);
		var parsed = parseInt(entity, base);
		@@ -604,3 +628,3 @@ this.emitPartial((0, decode_codepoint_1.default)(parsed));
		}
		else if (c < 48 /* Zero */ || c > 57 /* Nine */) {
		else if (!isNumber(c)) {
		if (this.allowLegacyEntity()) {
		@@ -614,2 +638,5 @@ this.decodeNumericEntity(10, false);
		}
		else {
		this.entityExcess++;
		}
		};
		@@ -622,3 +649,3 @@ Tokenizer.prototype.stateInHexEntity = function (c) {
		(c < 65 /* UpperA */ || c > 70 /* UpperF */) &&
		(c < 48 /* Zero */ || c > 57 /* Nine */)) {
		!isNumber(c)) {
		if (this.allowLegacyEntity()) {
		@@ -632,5 +659,10 @@ this.decodeNumericEntity(16, false);
		}
		else {
		this.entityExcess++;
		}
		};
		Tokenizer.prototype.allowLegacyEntity = function () {
		return !this.xmlMode && this.baseState === 1 /* Text */;
		return (!this.xmlMode &&
		(this.baseState === 1 /* Text */ \|\|
		this.baseState === 24 /* InSpecialTag */));
		};
		@@ -643,4 +675,6 @@ /**
		if (this.running &&
		this._state === 1 /* Text */ &&
		this.sectionStart !== this._index) {
		this.sectionStart !== this._index &&
		(this._state === 1 /* Text */ \|\|
		(this._state === 24 /* InSpecialTag */ &&
		this.sequenceIndex === 0))) {
		// TODO: We could emit attribute data here as well.
		@@ -659,2 +693,5 @@ this.cbs.ontext(this.buffer.substr(this.sectionStart));
		};
		/**
		 * Loop condition for `parse`.
		 *
		 * @returns Truthy while `_index` is inside the buffer and `running` is set.
		 */
		Tokenizer.prototype.shouldContinue = function () {
		    var hasUnreadInput = this._index < this.buffer.length;
		    return hasUnreadInput && this.running;
		};
		/**
		@@ -666,3 +703,3 @@ * Iterates through the buffer, calling the function corresponding to the current state.
		Tokenizer.prototype.parse = function () {
		while (this._index < this.buffer.length && this.running) {
		while (this.shouldContinue()) {
		var c = this.buffer.charCodeAt(this._index);
		@@ -672,2 +709,11 @@ if (this._state === 1 /* Text */) {
		}
		else if (this._state === 23 /* SpecialStartSequence */) {
		this.stateSpecialStartSequence(c);
		}
		else if (this._state === 24 /* InSpecialTag */) {
		this.stateInSpecialTag(c);
		}
		else if (this._state === 19 /* CDATASequence */) {
		this.stateCDATASequence(c);
		}
		else if (this._state === 12 /* InAttributeValueDq */) {
		@@ -679,4 +725,4 @@ this.stateInAttributeValueDoubleQuotes(c);
		}
		else if (this._state === 19 /* InComment */) {
		this.stateInComment(c);
		else if (this._state === 21 /* InCommentLike */) {
		this.stateInCommentLike(c);
		}
		@@ -713,8 +759,5 @@ else if (this._state === 20 /* InSpecialComment */) {
		}
		else if (this._state === 32 /* BeforeSpecialS */) {
		else if (this._state === 22 /* BeforeSpecialS */) {
		this.stateBeforeSpecialS(c);
		}
		else if (this._state === 21 /* AfterComment1 */) {
		this.stateAfterComment1(c);
		}
		else if (this._state === 14 /* InAttributeValueNq */) {
		@@ -732,135 +775,18 @@ this.stateInAttributeValueNoQuotes(c);
		}
		else if (this._state === 22 /* AfterComment2 */) {
		this.stateAfterComment2(c);
		}
		else if (this._state === 18 /* BeforeComment */) {
		this.stateBeforeComment(c);
		}
		else if (this._state === 33 /* BeforeSpecialSEnd */) {
		this.stateBeforeSpecialSEnd(c);
		}
		else if (this._state === 53 /* BeforeSpecialTEnd */) {
		stateBeforeSpecialTEnd(this, c);
		}
		else if (this._state === 39 /* AfterScript1 */) {
		stateAfterScript1(this, c);
		}
		else if (this._state === 40 /* AfterScript2 */) {
		stateAfterScript2(this, c);
		}
		else if (this._state === 41 /* AfterScript3 */) {
		stateAfterScript3(this, c);
		}
		else if (this._state === 34 /* BeforeScript1 */) {
		stateBeforeScript1(this, c);
		}
		else if (this._state === 35 /* BeforeScript2 */) {
		stateBeforeScript2(this, c);
		}
		else if (this._state === 36 /* BeforeScript3 */) {
		stateBeforeScript3(this, c);
		}
		else if (this._state === 37 /* BeforeScript4 */) {
		stateBeforeScript4(this, c);
		}
		else if (this._state === 38 /* BeforeScript5 */) {
		this.stateBeforeSpecialLast(c, 2 /* Script */);
		}
		else if (this._state === 42 /* AfterScript4 */) {
		stateAfterScript4(this, c);
		}
		else if (this._state === 43 /* AfterScript5 */) {
		this.stateAfterSpecialLast(c, 6);
		}
		else if (this._state === 44 /* BeforeStyle1 */) {
		stateBeforeStyle1(this, c);
		}
		else if (this._state === 29 /* InCdata */) {
		this.stateInCdata(c);
		}
		else if (this._state === 45 /* BeforeStyle2 */) {
		stateBeforeStyle2(this, c);
		}
		else if (this._state === 46 /* BeforeStyle3 */) {
		stateBeforeStyle3(this, c);
		}
		else if (this._state === 47 /* BeforeStyle4 */) {
		this.stateBeforeSpecialLast(c, 3 /* Style */);
		}
		else if (this._state === 48 /* AfterStyle1 */) {
		stateAfterStyle1(this, c);
		}
		else if (this._state === 49 /* AfterStyle2 */) {
		stateAfterStyle2(this, c);
		}
		else if (this._state === 50 /* AfterStyle3 */) {
		stateAfterStyle3(this, c);
		}
		else if (this._state === 51 /* AfterStyle4 */) {
		this.stateAfterSpecialLast(c, 5);
		}
		else if (this._state === 52 /* BeforeSpecialT */) {
		stateBeforeSpecialT(this, c);
		}
		else if (this._state === 54 /* BeforeTitle1 */) {
		stateBeforeTitle1(this, c);
		}
		else if (this._state === 55 /* BeforeTitle2 */) {
		stateBeforeTitle2(this, c);
		}
		else if (this._state === 56 /* BeforeTitle3 */) {
		stateBeforeTitle3(this, c);
		}
		else if (this._state === 57 /* BeforeTitle4 */) {
		this.stateBeforeSpecialLast(c, 4 /* Title */);
		}
		else if (this._state === 58 /* AfterTitle1 */) {
		stateAfterTitle1(this, c);
		}
		else if (this._state === 59 /* AfterTitle2 */) {
		stateAfterTitle2(this, c);
		}
		else if (this._state === 60 /* AfterTitle3 */) {
		stateAfterTitle3(this, c);
		}
		else if (this._state === 61 /* AfterTitle4 */) {
		this.stateAfterSpecialLast(c, 5);
		}
		else if (this._state === 17 /* InProcessingInstruction */) {
		this.stateInProcessingInstruction(c);
		}
		else if (this._state === 64 /* InNamedEntity */) {
		else if (this._state === 27 /* InNamedEntity */) {
		this.stateInNamedEntity(c);
		}
		else if (this._state === 23 /* BeforeCdata1 */) {
		stateBeforeCdata1(this, c);
		}
		else if (this._state === 62 /* BeforeEntity */) {
		else if (this._state === 25 /* BeforeEntity */) {
		this.stateBeforeEntity(c);
		}
		else if (this._state === 24 /* BeforeCdata2 */) {
		stateBeforeCdata2(this, c);
		}
		else if (this._state === 25 /* BeforeCdata3 */) {
		stateBeforeCdata3(this, c);
		}
		else if (this._state === 30 /* AfterCdata1 */) {
		this.stateAfterCdata1(c);
		}
		else if (this._state === 31 /* AfterCdata2 */) {
		this.stateAfterCdata2(c);
		}
		else if (this._state === 26 /* BeforeCdata4 */) {
		stateBeforeCdata4(this, c);
		}
		else if (this._state === 27 /* BeforeCdata5 */) {
		stateBeforeCdata5(this, c);
		}
		else if (this._state === 28 /* BeforeCdata6 */) {
		this.stateBeforeCdata6(c);
		}
		else if (this._state === 66 /* InHexEntity */) {
		else if (this._state === 29 /* InHexEntity */) {
		this.stateInHexEntity(c);
		}
		else if (this._state === 65 /* InNumericEntity */) {
		else if (this._state === 28 /* InNumericEntity */) {
		this.stateInNumericEntity(c);
		@@ -870,3 +796,3 @@ }
		// `this._state === State.BeforeNumericEntity`
		stateBeforeNumericEntity(this, c);
		this.stateBeforeNumericEntity(c);
		}
		@@ -878,2 +804,5 @@ this._index++;
		Tokenizer.prototype.finish = function () {
		if (this._state === 27 /* InNamedEntity */) {
		this.emitNamedEntity();
		}
		// If there is remaining data, emit it in a reasonable way
		@@ -888,26 +817,17 @@ if (this.sectionStart < this._index) {
		var data = this.buffer.substr(this.sectionStart);
		if (this._state === 29 /* InCdata */ ||
		this._state === 30 /* AfterCdata1 */ ||
		this._state === 31 /* AfterCdata2 */) {
		this.cbs.oncdata(data);
		}
		else if (this._state === 19 /* InComment */ \|\|
		this._state === 21 /* AfterComment1 */ \|\|
		this._state === 22 /* AfterComment2 */) {
		this.cbs.oncomment(data);
		}
		else if (this._state === 64 /* InNamedEntity */ && !this.xmlMode) {
		// Increase excess for EOF
		this.trieExcess++;
		this.emitNamedEntity();
		if (this.sectionStart < this._index) {
		this._state = this.baseState;
		this.handleTrailingData();
		if (this._state === 21 /* InCommentLike */) {
		if (this.currentSequence === Sequences.CdataEnd) {
		this.cbs.oncdata(data);
		}
		else {
		this.cbs.oncomment(data);
		}
		}
		else if (this._state === 65 /* InNumericEntity */ && !this.xmlMode) {
		else if (this._state === 28 /* InNumericEntity */ &&
		this.allowLegacyEntity()) {
		this.decodeNumericEntity(10, false);
		// All trailing data will have been consumed
		}
		else if (this._state === 66 /* InHexEntity */ && !this.xmlMode) {
		else if (this._state === 29 /* InHexEntity */ &&
		this.allowLegacyEntity()) {
		this.decodeNumericEntity(16, false);
		@@ -938,3 +858,4 @@ // All trailing data will have been consumed
		Tokenizer.prototype.emitPartial = function (value) {
		if (this.baseState !== 1 /* Text */) {
		if (this.baseState !== 1 /* Text */ &&
		this.baseState !== 24 /* InSpecialTag */) {
		this.cbs.onattribdata(value);
		@@ -941,0 +862,0 @@ }

package.json

		{
		"name": "htmlparser2",
		"description": "Fast & forgiving HTML/XML parser",
		"version": "7.1.2",
		"version": "7.2.0",
		"author": "Felix Boehm <me@feedic.com>",
		@@ -57,12 +57,12 @@ "funding": [
		"devDependencies": {
		"@types/jest": "^27.0.1",
		"@types/node": "^16.9.1",
		"@typescript-eslint/eslint-plugin": "^4.31.0",
		"@typescript-eslint/parser": "^4.31.0",
		"eslint": "^7.32.0",
		"@types/jest": "^27.0.2",
		"@types/node": "^16.11.7",
		"@typescript-eslint/eslint-plugin": "^5.3.1",
		"@typescript-eslint/parser": "^5.3.1",
		"eslint": "^8.2.0",
		"eslint-config-prettier": "^8.1.0",
		"jest": "^27.1.1",
		"prettier": "^2.4.0",
		"ts-jest": "^27.0.5",
		"typescript": "^4.4.2"
		"jest": "^27.3.1",
		"prettier": "^2.4.1",
		"ts-jest": "^27.0.7",
		"typescript": "^4.4.4"
		},
		@@ -69,0 +69,0 @@ "jest": {

README.md

		@@ -110,3 +110,3 @@ # htmlparser2

		const dom = htmlparser2.parseDocument();
		const dom = htmlparser2.parseDocument(htmlString);
		```
		@@ -113,0 +113,0 @@

lib/Tokenizer.d.ts.map

Sorry, the diff of this file is not supported yet

htmlparser2 - npm Package Compare versions

New alerts

Fixed alerts

Worsened metrics