@nodable/flexible-xml-parser
Advanced tools
+4
-3
| { | ||
| "name": "@nodable/flexible-xml-parser", | ||
| "version": "1.1.1", | ||
| "version": "1.2.0", | ||
| "description": "Fastest XML parser in pure JS with fully customizable ouput", | ||
@@ -48,5 +48,5 @@ "main": "./lib/fxp.cjs", | ||
| "@nodable/base-output-builder": "^1.0.5", | ||
| "@nodable/compact-builder": "^1.0.6", | ||
| "@nodable/compact-builder": "^1.0.8", | ||
| "path-expression-matcher": "^1.5.0", | ||
| "strnum": "^2.2.2" | ||
| "xml-naming": "^0.1.0" | ||
| }, | ||
@@ -58,2 +58,3 @@ "devDependencies": { | ||
| "@babel/register": "^7.28.6", | ||
| "@byspec/xml": "^0.1.0", | ||
| "@nodable/entities": "^2.1.0", | ||
@@ -60,0 +61,0 @@ "@types/node": "^20.19.37", |
| 'use strict'; | ||
| import { ParseError, ErrorCode } from './ParseError.js'; | ||
| import { isSpaceCode } from "./util.js" | ||
@@ -24,7 +25,85 @@ /** | ||
| // Module-level regex. Stateless between calls because getAllMatches() always | ||
| // resets lastIndex to 0 before iterating — see getAllMatches() below. | ||
| const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm'); | ||
| // Module-level regex kept for reference only — no longer called from this | ||
| // module. parseAttributes() below replaces it with an O(n) linear scanner | ||
| // that is immune to catastrophic backtracking and stack overflow. | ||
| // const attrsRegx = new RegExp('([^\\s=]+)\\s*(=\\s*([\'"])([\\s\\S]*?)\\3)?', 'gm'); | ||
/**
 * Parse an attribute expression string into an array of match tuples.
 *
 * Each element has the same shape the old regex-based getAllMatches() returned,
 * so callers are unchanged:
 *   [fullMatch, name, '=value' | undefined, quote | undefined, value | undefined]
 * and every tuple carries a `startIndex` property: the offset of the attribute
 * name inside `attrStr`.
 *
 * The scanner walks char codes left to right, with no regex and no recursion.
 * Each loop iteration:
 *   1. skips whitespace,
 *   2. reads a name token up to whitespace or '=',
 *   3. if no '=' follows, records a boolean attribute,
 *   4. otherwise reads a single- or double-quoted value up to the matching quote.
 *
 * Malformed input is reported the way the legacy regex reported it, so that
 * downstream name validation can reject it instead of it being silently
 * accepted or dropped:
 *   - `a=b` / `a=`   → `a` is emitted as a boolean attribute; scanning resumes
 *                      right after the '=' (so `b` becomes its own token).
 *   - `a="unclosed`  → `a` is emitted as a boolean attribute; scanning resumes
 *                      at the opening quote (so `"unclosed` becomes a name token).
 *   - a stray '=' with no name in front of it is skipped.
 *
 * Line breaks inside a value are replaced by a single space; a CR LF pair counts
 * as ONE line break (XML 1.0 §2.11 end-of-line handling followed by §3.3.3
 * attribute-value normalization). Other characters, including tabs, are kept
 * verbatim.
 *
 * Cost stays linear in the input length: a failed search for a closing quote
 * can happen at most once per quote character kind, because a failure proves
 * that no further quote of that kind exists in the rest of the string.
 *
 * @param {string} attrStr raw attribute text of a tag (everything after the tag name)
 * @returns {Array} array of match tuples (see shape above)
 */
function parseAttributes(attrStr) {
  const EQUALS = 61; // '='
  const DOUBLE_QUOTE = 34; // '"'
  const SINGLE_QUOTE = 39; // "'"
  const LF = 10;
  const CR = 13;

  const results = [];
  const len = attrStr.length;
  let i = 0;

  // Record an attribute that has no usable quoted value.
  const pushBoolean = (name, startIndex) => {
    const m = [name, name, undefined, undefined, undefined];
    m.startIndex = startIndex;
    results.push(m);
  };

  while (i < len) {
    // Skip whitespace between attributes
    while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;
    if (i >= len) break;

    // Read name
    const nameStart = i;
    while (i < len) {
      const code = attrStr.charCodeAt(i);
      if (code === EQUALS || isSpaceCode(code)) break;
      i++;
    }
    if (i === nameStart) {
      // Stray '=' with no name before it: skip it rather than emit an
      // empty-named attribute (the legacy regex never matched an empty name).
      i++;
      continue;
    }
    const name = attrStr.substring(nameStart, i);

    // Skip whitespace before '='
    while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;
    if (i >= len || attrStr.charCodeAt(i) !== EQUALS) {
      // Boolean attribute — no '='
      pushBoolean(name, nameStart);
      continue;
    }
    i++; // skip '='

    // Skip whitespace after '='
    while (i < len && isSpaceCode(attrStr.charCodeAt(i))) i++;

    // A value must start with a quote; charCodeAt() yields NaN past the end.
    const quote = attrStr.charCodeAt(i);
    if (quote !== DOUBLE_QUOTE && quote !== SINGLE_QUOTE) {
      // '=' without a quoted value: keep the name instead of dropping it.
      pushBoolean(name, nameStart);
      continue;
    }

    const openQuoteIndex = i;
    i++; // skip opening quote

    // Copy the value in segments so line breaks can be swapped for spaces
    // without building the string one character at a time.
    let value = '';
    let segStart = i;
    while (i < len) {
      const c = attrStr.charCodeAt(i);
      if (c === quote) break;
      if (c === LF || c === CR) {
        value += attrStr.substring(segStart, i) + ' ';
        // CR LF is a single line break: consume the LF together with the CR.
        if (c === CR && i + 1 < len && attrStr.charCodeAt(i + 1) === LF) i++;
        segStart = i + 1;
      }
      i++;
    }

    if (i >= len) {
      // Closing quote never found: do not invent one. Emit the bare name and
      // rescan from the opening quote so the remainder surfaces as name tokens.
      pushBoolean(name, nameStart);
      i = openQuoteIndex;
      continue;
    }

    value += attrStr.substring(segStart, i);
    i++; // skip closing quote

    const quoteChar = String.fromCharCode(quote);
    const quoted = quoteChar + value + quoteChar;
    const m = [name + '=' + quoted, name, '=' + quoted, quoteChar, value];
    m.startIndex = nameStart;
    results.push(m);
  }

  return results;
}
| /** | ||
| * Pass 1: extract raw (unparsed) attribute values into rawAttributes. | ||
@@ -37,5 +116,5 @@ * | ||
| export function collectRawAttributes(attrStr, parser, tagExp) { | ||
| if (!attrStr || attrStr.length === 0) return; | ||
| if (!attrStr || attrStr.length === 0) return; | ||
| const matches = getAllMatches(attrStr, attrsRegx); | ||
| const matches = parseAttributes(attrStr); | ||
| const len = matches.length; | ||
@@ -61,3 +140,3 @@ let count = 0; | ||
| if (!attrStr || attrStr.length === 0) return; | ||
| const matches = getAllMatches(attrStr, attrsRegx); | ||
| const matches = parseAttributes(attrStr); | ||
| const len = matches.length; | ||
@@ -84,28 +163,2 @@ | ||
| } | ||
| } | ||
/**
 * Execute `regex` repeatedly over `string` and collect every match.
 *
 * `regex.lastIndex` is forced back to 0 first, so a shared module-level
 * (stateful, global) regex can be reused safely from one call to the next.
 *
 * @param {string} string text to scan
 * @param {RegExp} regex  pattern to apply repeatedly via exec()
 * @returns {Array} one array per match holding the full match followed by its
 *   capture groups; each array also has a `startIndex` property computed as
 *   `regex.lastIndex - fullMatch.length`
 */
function getAllMatches(string, regex) {
  regex.lastIndex = 0;
  const collected = [];
  for (let found = regex.exec(string); found; found = regex.exec(string)) {
    // Copy only the indexed entries (full match + groups) into a plain array.
    const groups = Array.from(found);
    groups.startIndex = regex.lastIndex - found[0].length;
    collected.push(groups);
  }
  return collected;
}
@@ -1,3 +0,3 @@ | ||
| import { isName } from './util.js'; | ||
| import { ParseError, ErrorCode } from './ParseError.js'; | ||
| import { name as isName, qName as isQName } from 'xml-naming'; | ||
@@ -270,3 +270,3 @@ export function readDocType(parser) { | ||
| validateEntityName(entityName); | ||
| validateEntityName(entityName, parser.xmlVersion); | ||
| skipSourceWhitespace(source); | ||
@@ -350,3 +350,3 @@ | ||
| if (!isName(elementName)) { | ||
| if (!isName(elementName, parser.xmlVersion)) { | ||
| throw new ParseError(`Invalid element name: "${elementName}"`, | ||
@@ -439,3 +439,3 @@ ErrorCode.INVALID_TAG, | ||
| validateEntityName(notationName); | ||
| validateEntityName(notationName, parser.xmlVersion); | ||
| skipSourceWhitespace(source); | ||
@@ -518,4 +518,4 @@ | ||
| function validateEntityName(name) { | ||
| if (isName(name)) return name; | ||
| function validateEntityName(name, xmlVersion) { | ||
| if (isName(name, xmlVersion)) return name; | ||
| throw new ParseError( | ||
@@ -522,0 +522,0 @@ `Invalid entity name "${name}"`, |
@@ -135,2 +135,3 @@ import { CompactBuilderFactory } from '@nodable/compact-builder'; | ||
| flushThreshold: 1024, | ||
| bufferSize: 256 | ||
| }, | ||
@@ -137,0 +138,0 @@ |
+9
-10
@@ -1,8 +0,1 @@ | ||
| 'use strict'; | ||
// Character-class source for the first character of a name: ':', ASCII letters,
// '_', and a set of non-ASCII ranges within the Basic Multilingual Plane.
const nameStartChar = ':A-Za-z_\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD';

// Character-class source for the remaining characters: everything allowed at the
// start plus '-', '.', digits, U+00B7 and two combining/connector ranges.
const nameChar = nameStartChar + '\\-.\\d\\u00B7\\u0300-\\u036F\\u203F-\\u2040';

// Unanchored regex source for one name: a start character followed by zero or
// more name characters.
export const nameRegexp = `[${nameStartChar}][${nameChar}]*`;

// Anchored form of nameRegexp: matches only when the whole string is a name.
const regexName = new RegExp(`^${nameRegexp}$`);
| export function getAllMatches(string, regex) { | ||
@@ -24,7 +17,13 @@ const matches = []; | ||
| export const isName = function (string) { | ||
| const match = regexName.exec(string); | ||
| return !(match === null || typeof match === 'undefined'); | ||
/**
 * Tell whether a one-character string is one of the whitespace characters this
 * parser skips: space, tab, line feed, carriage return or form feed.
 *
 * @param {string} char single character to test
 * @returns {boolean} true when `char` is exactly one of those five characters
 */
export function isSpace(char) {
  switch (char) {
    case " ":
    case "\t":
    case "\n":
    case "\r":
    case "\f":
      return true;
    default:
      return false;
  }
}
/**
 * Char-code counterpart of isSpace(): tell whether a UTF-16 code unit is one of
 * space (32), tab (9), line feed (10), carriage return (13) or form feed (12).
 *
 * @param {number} code value such as the result of String.prototype.charCodeAt()
 * @returns {boolean} true when `code` is exactly one of those five values
 */
export function isSpaceCode(code) {
  switch (code) {
    case 32: // space
    case 9: // \t
    case 10: // \n
    case 13: // \r
    case 12: // \f
      return true;
    default:
      return false;
  }
}
| export function isExist(v) { | ||
@@ -31,0 +30,0 @@ return typeof v !== 'undefined'; |
+23
-5
@@ -8,5 +8,6 @@ import StringSource from './InputSource/StringSource.js'; | ||
| import { readDocType } from './DocTypeReader.js'; | ||
| import { isName, DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js'; | ||
| import { DANGEROUS_PROPERTY_NAMES, criticalProperties } from './util.js'; | ||
| import AutoCloseHandler from './AutoCloseHandler.js'; | ||
| import { ParseError, ErrorCode } from './ParseError.js'; | ||
| import { name as isName, qName as isQName } from 'xml-naming'; | ||
@@ -64,2 +65,3 @@ class TagDetail { | ||
| this._exitIfTriggered = false; | ||
| this.xmlVersion = '1.0'; | ||
@@ -288,2 +290,14 @@ if (!this.matcher) { | ||
| // Extract namespace prefix and local name from raw tag name (e.g. "ns:tag" → "ns", "tag"). | ||
| // Always done from the raw name (tagExp.tagName), before processTagName strips the prefix, | ||
| // so these values are stable regardless of skip.nsPrefix. | ||
| const colonIdx = tagExp.tagName.indexOf(':'); | ||
| const tagNamespace = colonIdx !== -1 ? tagExp.tagName.slice(0, colonIdx) : undefined; | ||
| // Local name for the matcher: prefix-free always (e.g. "code" from "ns:code"). | ||
| // The matcher library tracks namespace separately via the 3rd push() argument — | ||
| // passing the full "ns:code" as the tag name would break ns::code expression matching. | ||
| const matcherTagName = tagNamespace !== undefined | ||
| ? tagExp.tagName.slice(colonIdx + 1) | ||
| : processedTagName; | ||
| // ── Limit: maxNestedTags ───────────────────────────────────────────────── | ||
@@ -310,3 +324,3 @@ const maxNested = options.limits?.maxNestedTags; | ||
| this.matcher.push(processedTagName, {}); | ||
| this.matcher.push(matcherTagName, {}, tagNamespace); | ||
| if (raeAttrLen > 0) { | ||
@@ -341,3 +355,6 @@ this.matcher.updateCurrent(rawAttributes); | ||
| // Create a fresh processor with the matching nested + skipEnclosures config. | ||
| this._stopNodeProcessor = new StopNodeProcessor(processedTagName, { | ||
| // Raw tag name (tagExp.tagName) is used — the processor scans the source | ||
| // character-by-character and must match the prefix-as-written (e.g. "ns:code"), | ||
| // independent of what skip.nsPrefix does to the processed output name. | ||
| this._stopNodeProcessor = new StopNodeProcessor(tagExp.tagName, { | ||
| nested: stopNodeConfig.nested, | ||
@@ -359,3 +376,4 @@ skipEnclosures: stopNodeConfig.skipEnclosures, | ||
| // but call no output builder methods — the tag is silently dropped. | ||
| this._stopNodeProcessor = new StopNodeProcessor(processedTagName, { | ||
| // Raw tag name used for the same reason as the stop-node branch above. | ||
| this._stopNodeProcessor = new StopNodeProcessor(tagExp.tagName, { | ||
| nested: skipTagConfig.nested, | ||
@@ -469,3 +487,3 @@ skipEnclosures: skipTagConfig.skipEnclosures, | ||
| attrName = resolveNsPrefix(attrName, options.skip.nsPrefix); | ||
| if (!isName(attrName)) { //TODO: make it optional | ||
| if (!isQName(attrName, this.xmlVersion)) { //TODO: make it optional | ||
| throw new ParseError(`Invalid attribute name: ${attrName}`, ErrorCode.INVALID_ATTRIBUTE_NAME); | ||
@@ -472,0 +490,0 @@ } |
+42
-12
@@ -16,2 +16,6 @@ import { buildOptions } from './OptionsBuilder.js'; | ||
| this._isFeeding = false; | ||
| // ── Batching state ────────────────────────────────── | ||
| this._pendingBytes = 0; | ||
| this._batchThreshold = this.options.feedable?.bufferSize; | ||
| } | ||
@@ -130,2 +134,33 @@ | ||
| _runParse() { | ||
| if (!this._feedParser) return; | ||
| const beforePos = this._feedSource.startIndex; // bytes consumed so far | ||
| try { | ||
| this._feedParser.parseXml(); | ||
| } catch (err) { | ||
| if (err.code === ErrorCode.UNEXPECTED_END) { | ||
| this._feedSource.rewindToMark(); | ||
| } else { | ||
| throw err; | ||
| } | ||
| } | ||
| const afterPos = this._feedSource.startIndex; | ||
| const didAdvance = afterPos > beforePos; | ||
| if (didAdvance) { | ||
| // Real progress made — reset threshold normally | ||
| this._pendingBytes = 0; | ||
| } else { | ||
| // Parser is stuck mid-token — grow the threshold to avoid | ||
| // hammering parseXml() until significantly more data arrives | ||
| this._batchThreshold = Math.min( | ||
| this._batchThreshold * 2, | ||
| this.options.feedable.maxBufferSize | ||
| ); | ||
| } | ||
| } | ||
| /** | ||
@@ -165,16 +200,8 @@ * Feed an XML data chunk for incremental parsing. | ||
| this._feedSource.feed(str); | ||
| this._pendingBytes += str.length; | ||
| try { | ||
| this._feedParser.parseXml(); | ||
| } catch (err) { | ||
| if (err.code === ErrorCode.UNEXPECTED_END) { | ||
| // Chunk boundary fell mid-token. Rewind to the token start so the | ||
| // incomplete bytes are re-parsed when the next chunk arrives. | ||
| this._feedSource.rewindToMark(); | ||
| } else { | ||
| // Real parse error — clean up and propagate. | ||
| this._cleanupFeedSession(); | ||
| throw err; | ||
| } | ||
| if (this._pendingBytes >= this._batchThreshold) { | ||
| this._runParse(); | ||
| } | ||
| // Otherwise, delay parsing until next feed() or end() | ||
@@ -207,2 +234,5 @@ return this; | ||
| // Force a final parse (any pending bytes are now processed) | ||
| this._runParse(); | ||
| try { | ||
@@ -209,0 +239,0 @@ // Mark the source as complete so readers know there is no more data. |
+10
-7
| 'use strict'; | ||
| import { ParseError, ErrorCode } from './ParseError.js'; | ||
| import { collectRawAttributes } from './AttributeProcessor.js'; | ||
| import { isName } from "./util.js" | ||
| import { isSpace } from "./util.js" | ||
| import { name as isName, qName as isQName } from 'xml-naming'; | ||
| // Re-export flushAttributes so Xml2JsParser and XmlSpecialTagsReader can | ||
@@ -160,4 +161,5 @@ // continue to import it from here without changing their import lines. | ||
| for (; i < expLen; i++) { | ||
| if (exp[i] === " ") { | ||
| for (; i < exp.length; i++) { | ||
| const c = exp[i]; | ||
| if (isSpace(c)) { | ||
| tagExp.tagName = exp.substring(0, i); | ||
@@ -169,7 +171,7 @@ attrsExp = exp.substring(i + 1); | ||
| //only tag | ||
| if (tagExp.tagName.length === 0 && i === expLen) tagExp.tagName = exp; | ||
| if (tagExp.tagName.length === 0 && i === exp.length) tagExp.tagName = exp; | ||
| tagExp.tagName = tagExp.tagName.trimEnd(); | ||
| tagExp._attrsExp = attrsExp; | ||
| if (!isName(tagExp.tagName)) { | ||
| if (!isQName(tagExp.tagName, parser.xmlVersion)) { | ||
| throw new ParseError("Invalid tag name", ErrorCode.INVALID_TAG_NAME); | ||
@@ -183,4 +185,5 @@ } | ||
| } | ||
| // console.log(tagExp) | ||
| return tagExp; | ||
| } | ||
| return tagExp; | ||
| } |
@@ -39,7 +39,17 @@ import { readPiExp, flushAttributes } from './XmlPartReader.js'; | ||
| let tagExp = readPiExp(parser, "?>"); | ||
| if (!tagExp) throw new ParseError( | ||
| "Invalid Pi Tag expression.", | ||
| ErrorCode.INVALID_TAG, | ||
| { line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex } | ||
| ); | ||
| if (!tagExp) { | ||
| throw new ParseError( | ||
| "Invalid Pi Tag expression.", | ||
| ErrorCode.INVALID_TAG, | ||
| { line: parser.source.line, col: parser.source.cols, index: parser.source.startIndex } | ||
| ) | ||
| } else if (tagExp.tagName === "xml") { | ||
| // Read version from the declaration and store it on the parser for validators. | ||
| const version = tagExp.rawAttributes?.version; | ||
| if (version === '1.1') { | ||
| parser.xmlVersion = 1.1; | ||
| } else { | ||
| parser.xmlVersion = 1.0; // default | ||
| } | ||
| } | ||
@@ -46,0 +56,0 @@ // Flush attributes into the output builder's this.attributes accumulator |
205255
2.37%4202
2.36%13
8.33%+ Added
+ Added
+ Added
+ Added
- Removed
- Removed
- Removed