+133
-7
@@ -66,2 +66,3 @@ ;(function (sax) { | ||
| parser.bufferCheckPosition = sax.MAX_BUFFER_LENGTH | ||
| parser.encoding = null; | ||
| parser.opt = opt || {} | ||
@@ -211,2 +212,35 @@ parser.opt.lowercase = parser.opt.lowercase || parser.opt.lowercasetags | ||
/**
 * Infer the character encoding of a byte chunk from its leading bytes.
 *
 * Byte-order marks are checked first (UTF-16LE, UTF-16BE, UTF-8). Without a
 * BOM, at least four bytes are needed: the UTF-16 byte patterns of the
 * characters "<?" select UTF-16LE/BE and anything else is treated as UTF-8.
 *
 * @param {Uint8Array|Buffer} data - Bytes seen so far at the start of the stream.
 * @param {boolean} isEnd - True when no more bytes will follow.
 * @returns {string|null} 'utf-16le', 'utf-16be' or 'utf8'; null when fewer
 *   than four bytes without a BOM are available and the stream has not ended.
 */
function determineBufferEncoding(data, isEnd) {
  var size = data.length

  // True when the chunk begins with exactly the given byte sequence.
  var startsWith = function (signature) {
    if (size < signature.length) {
      return false
    }
    for (var i = 0; i < signature.length; i++) {
      if (data[i] !== signature[i]) {
        return false
      }
    }
    return true
  }

  // A byte-order mark is the strongest hint, so it is consulted first.
  if (startsWith([0xff, 0xfe])) {
    return 'utf-16le'
  }
  if (startsWith([0xfe, 0xff])) {
    return 'utf-16be'
  }
  if (startsWith([0xef, 0xbb, 0xbf])) {
    return 'utf8'
  }

  // Too short to inspect further: fall back to UTF-8 only once the stream
  // is finished, otherwise ask the caller to supply more bytes.
  if (size < 4) {
    return isEnd ? 'utf8' : null
  }

  // No BOM: "<?" encoded as UTF-16 exposes its byte order via the zero bytes.
  if (startsWith([0x3c, 0x00, 0x3f, 0x00])) {
    return 'utf-16le'
  }
  if (startsWith([0x00, 0x3c, 0x00, 0x3f])) {
    return 'utf-16be'
  }
  return 'utf8'
}
| function SAXStream(strict, opt) { | ||
@@ -238,3 +272,3 @@ if (!(this instanceof SAXStream)) { | ||
| this._decoder = null | ||
| this._decoderBuffer = null | ||
| streamWraps.forEach(function (ev) { | ||
@@ -265,2 +299,28 @@ Object.defineProperty(me, 'on' + ev, { | ||
/**
 * Decode a binary chunk to text, choosing the decoder lazily from the first
 * bytes of the stream.
 *
 * @param {Buffer} data - Next binary chunk (may be empty when flushing).
 * @param {boolean} isEnd - True when this is the final call for the stream.
 * @returns {string} Decoded text, or '' while the encoding is still unknown.
 */
SAXStream.prototype._decodeBuffer = function (data, isEnd) {
  // Bytes held back by an earlier call are prepended so that detection and
  // decoding both see the stream from its very first byte.
  if (this._decoderBuffer) {
    data = Buffer.concat([this._decoderBuffer, data])
    this._decoderBuffer = null
  }

  // Encoding already settled: simply continue the streaming decode.
  if (this._decoder) {
    return this._decoder.decode(data, { stream: !isEnd })
  }

  var detected = determineBufferEncoding(data, isEnd)
  if (!detected) {
    // Not enough bytes to decide yet; keep them for the next write/end call.
    this._decoderBuffer = data
    return ''
  }

  // Record the detected encoding on the parser before creating the decoder.
  this._parser.encoding = detected
  this._decoder = new TextDecoder(detected)
  return this._decoder.decode(data, { stream: !isEnd })
}
| SAXStream.prototype.write = function (data) { | ||
@@ -272,6 +332,11 @@ if ( | ||
| ) { | ||
| if (!this._decoder) { | ||
| this._decoder = new TextDecoder('utf8') | ||
| data = this._decodeBuffer(data, false) | ||
| } else if (this._decoderBuffer) { | ||
| // Flush any buffered binary prefix before handling a string chunk. | ||
| // This only matters if the caller mixes Buffer and string writes (used in test). | ||
| var remaining = this._decodeBuffer(Buffer.alloc(0), true) | ||
| if (remaining) { | ||
| this._parser.write(remaining) | ||
| this.emit('data', remaining) | ||
| } | ||
| data = this._decoder.decode(data, { stream: true }) | ||
| } | ||
@@ -289,3 +354,9 @@ | ||
| // Flush any remaining decoded data from the TextDecoder | ||
| if (this._decoder) { | ||
| if (this._decoderBuffer) { | ||
| var finalChunk = this._decodeBuffer(Buffer.alloc(0), true) | ||
| if (finalChunk) { | ||
| this._parser.write(finalChunk) | ||
| this.emit('data', finalChunk) | ||
| } | ||
| } else if (this._decoder) { | ||
| var remaining = this._decoder.decode() | ||
@@ -683,2 +754,55 @@ if (remaining) { | ||
/**
 * Extract the value of the `encoding` pseudo-attribute from the body of an
 * XML declaration.
 *
 * @param {string} body - Processing-instruction body, e.g. `version="1.0" encoding="UTF-8"`.
 * @returns {string|null} The quoted encoding value as written, or null when
 *   the body is empty/missing or carries no encoding pseudo-attribute.
 */
function getDeclaredEncoding(body) {
  if (!body) {
    return null
  }
  // Group 1 captures the opening quote so the closing quote must match it;
  // group 2 is the encoding name itself.
  var found = body.match(/(?:^|\s)encoding\s*=\s*(['"])([^'"]+)\1/i)
  if (!found) {
    return null
  }
  return found[2]
}
/**
 * Reduce an encoding label to a comparable form: lower-cased with every
 * character other than a-z and 0-9 removed (so "UTF-8" becomes "utf8").
 *
 * @param {string} encoding - Encoding label, in any letter case or punctuation.
 * @returns {string|null} Normalized label, or null for a missing/empty input.
 */
function normalizeEncodingName(encoding) {
  return encoding ? encoding.toLowerCase().replace(/[^a-z0-9]/g, '') : null
}
/**
 * Decide whether a detected stream encoding is compatible with the encoding
 * declared in the document.
 *
 * Both labels are normalized before comparison. A missing label on either
 * side counts as a match, and a declared "utf16" (no byte order) accepts
 * either UTF-16 byte order.
 *
 * @param {string} detectedEncoding - Encoding inferred from the byte stream.
 * @param {string} declaredEncoding - Encoding named in the XML declaration.
 * @returns {boolean} True when the two are compatible or cannot be compared.
 */
function encodingsMatch(detectedEncoding, declaredEncoding) {
  const actual = normalizeEncodingName(detectedEncoding)
  const expected = normalizeEncodingName(declaredEncoding)

  // Nothing to compare against: treat as compatible.
  if (!(actual && expected)) {
    return true
  }

  if (expected !== 'utf16') {
    return actual === expected
  }

  // A declaration of plain UTF-16 does not pin the byte order.
  return actual === 'utf16le' || actual === 'utf16be'
}
/**
 * In strict mode, report an error when the XML declaration names an encoding
 * that disagrees with the encoding detected for the stream.
 *
 * Does nothing unless the parser is strict, a detected encoding is recorded
 * on the parser, and the processing instruction is the `xml` declaration
 * with an encoding pseudo-attribute.
 *
 * @param {object} parser - Parser state; reads `strict` and `encoding`.
 * @param {{name: string, body: string}} data - Processing-instruction data.
 * @returns {void}
 */
function validateXmlDeclarationEncoding(parser, data) {
  var isCheckable =
    parser.strict && parser.encoding && data && data.name === 'xml'
  if (!isCheckable) {
    return
  }

  var declared = getDeclaredEncoding(data.body)
  if (!declared || encodingsMatch(parser.encoding, declared)) {
    return
  }

  var message =
    'XML declaration encoding ' +
    declared +
    ' does not match detected stream encoding ' +
    parser.encoding.toUpperCase()
  strictFail(parser, message)
}
| function emitNode(parser, nodeType, data) { | ||
@@ -1389,6 +1513,8 @@ if (parser.textNode) closeText(parser) | ||
| if (c === '>') { | ||
| emitNode(parser, 'onprocessinginstruction', { | ||
| const procInstEndData = { | ||
| name: parser.procInstName, | ||
| body: parser.procInstBody, | ||
| }) | ||
| } | ||
| validateXmlDeclarationEncoding(parser, procInstEndData) | ||
| emitNode(parser, 'onprocessinginstruction', procInstEndData) | ||
| parser.procInstName = parser.procInstBody = '' | ||
@@ -1395,0 +1521,0 @@ parser.state = S.TEXT |
+5
-2
@@ -5,3 +5,3 @@ { | ||
| "author": "Isaac Z. Schlueter <i@izs.me> (http://blog.izs.me/)", | ||
| "version": "1.5.0", | ||
| "version": "1.6.0", | ||
| "main": "lib/sax.js", | ||
@@ -16,3 +16,6 @@ "license": "BlueOak-1.0.0", | ||
| }, | ||
| "repository": "git://github.com/isaacs/sax-js.git", | ||
| "repository": { | ||
| "type": "git", | ||
| "url": "git+ssh://git@github.com/isaacs/sax-js.git" | ||
| }, | ||
| "files": [ | ||
@@ -19,0 +22,0 @@ "lib/sax.js", |
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
Long strings
Supply chain risk: Contains long string literals, which may be a sign of obfuscated or packed code.
Found 1 instance in 1 package
No repository
Supply chain risk: Package does not have a linked source code repository. Without this field, a package will have no reference to the location of the source code used to generate the package.
Found 1 instance in 1 package
61006
6.81%1696
6.73%