html-encoding-sniffer
Advanced tools
Comparing version 2.0.1 to 3.0.0
@@ -5,4 +5,4 @@ "use strict"; | ||
// https://html.spec.whatwg.org/#encoding-sniffing-algorithm | ||
module.exports = (buffer, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { | ||
let encoding = whatwgEncoding.getBOMEncoding(buffer); // see https://github.com/whatwg/html/issues/1910 | ||
module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => { | ||
let encoding = whatwgEncoding.getBOMEncoding(uint8Array); | ||
@@ -14,3 +14,3 @@ if (encoding === null && transportLayerEncodingLabel !== undefined) { | ||
if (encoding === null) { | ||
encoding = prescanMetaCharset(buffer); | ||
encoding = prescanMetaCharset(uint8Array); | ||
} | ||
@@ -26,13 +26,13 @@ | ||
// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding | ||
function prescanMetaCharset(buffer) { | ||
const l = Math.min(buffer.length, 1024); | ||
function prescanMetaCharset(uint8Array) { | ||
const l = Math.min(uint8Array.byteLength, 1024); | ||
for (let i = 0; i < l; i++) { | ||
let c = buffer[i]; | ||
let c = uint8Array[i]; | ||
if (c === 0x3C) { | ||
// "<" | ||
const c1 = buffer[i + 1]; | ||
const c2 = buffer[i + 2]; | ||
const c3 = buffer[i + 3]; | ||
const c4 = buffer[i + 4]; | ||
const c5 = buffer[i + 5]; | ||
const c1 = uint8Array[i + 1]; | ||
const c2 = uint8Array[i + 2]; | ||
const c3 = uint8Array[i + 3]; | ||
const c4 = uint8Array[i + 4]; | ||
const c5 = uint8Array[i + 5]; | ||
// !-- (comment start) | ||
@@ -42,5 +42,5 @@ if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) { | ||
for (; i < l; i++) { | ||
c = buffer[i]; | ||
const cMinus1 = buffer[i - 1]; | ||
const cMinus2 = buffer[i - 2]; | ||
c = uint8Array[i]; | ||
const cMinus1 = uint8Array[i - 1]; | ||
const cMinus2 = uint8Array[i - 2]; | ||
// --> (comment end) | ||
@@ -65,3 +65,3 @@ if (c === 0x3E && cMinus1 === 0x2D && cMinus2 === 0x2D) { | ||
do { | ||
attrRes = getAttribute(buffer, i, l); | ||
attrRes = getAttribute(uint8Array, i, l); | ||
if (attrRes.attr && !attributeList.has(attrRes.attr.name)) { | ||
@@ -105,3 +105,3 @@ attributeList.add(attrRes.attr.name); | ||
for (i += 2; i < l; i++) { | ||
c = buffer[i]; | ||
c = uint8Array[i]; | ||
// space or > | ||
@@ -114,3 +114,3 @@ if (isSpaceCharacter(c) || c === 0x3E) { | ||
do { | ||
attrRes = getAttribute(buffer, i, l); | ||
attrRes = getAttribute(uint8Array, i, l); | ||
i = attrRes.i; | ||
@@ -121,3 +121,3 @@ } while (attrRes.attr); | ||
for (i += 2; i < l; i++) { | ||
c = buffer[i]; | ||
c = uint8Array[i]; | ||
// > | ||
@@ -135,5 +135,5 @@ if (c === 0x3E) { | ||
// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing | ||
function getAttribute(buffer, i, l) { | ||
function getAttribute(uint8Array, i, l) { | ||
for (; i < l; i++) { | ||
let c = buffer[i]; | ||
let c = uint8Array[i]; | ||
// space or / | ||
@@ -150,3 +150,3 @@ if (isSpaceCharacter(c) || c === 0x2F) { | ||
nameLoop:for (; i < l; i++) { | ||
c = buffer[i]; | ||
c = uint8Array[i]; | ||
// "=" | ||
@@ -160,3 +160,3 @@ if (c === 0x3D && name !== "") { | ||
for (i++; i < l; i++) { | ||
c = buffer[i]; | ||
c = uint8Array[i]; | ||
// space | ||
@@ -187,7 +187,7 @@ if (isSpaceCharacter(c)) { | ||
} | ||
c = buffer[i]; | ||
c = uint8Array[i]; | ||
// space | ||
if (isSpaceCharacter(c)) { | ||
for (i++; i < l; i++) { | ||
c = buffer[i]; | ||
c = uint8Array[i]; | ||
// space | ||
@@ -205,3 +205,3 @@ if (isSpaceCharacter(c)) { | ||
for (i++; i < l; i++) { | ||
c = buffer[i]; | ||
c = uint8Array[i]; | ||
@@ -235,3 +235,3 @@ if (c === quote) { | ||
for (i++; i < l; i++) { | ||
c = buffer[i]; | ||
c = uint8Array[i]; | ||
@@ -258,3 +258,3 @@ // space or > | ||
while (true) { | ||
const indexOfCharset = string.substring(position).search(/charset/i); | ||
const indexOfCharset = string.substring(position).search(/charset/ui); | ||
@@ -300,3 +300,3 @@ if (indexOfCharset === -1) { | ||
const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/); | ||
const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/u); | ||
const end = indexOfASCIIWhitespaceOrSemicolon === -1 ? | ||
@@ -303,0 +303,0 @@ string.length : |
@@ -1,2 +0,2 @@ | ||
Copyright © 2016–2020 Domenic Denicola <d@domenic.me> | ||
Copyright © Domenic Denicola <d@domenic.me> | ||
@@ -3,0 +3,0 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: |
@@ -8,3 +8,3 @@ { | ||
], | ||
"version": "2.0.1", | ||
"version": "3.0.0", | ||
"author": "Domenic Denicola <d@domenic.me> (https://domenic.me/)", | ||
@@ -22,11 +22,12 @@ "license": "MIT", | ||
"dependencies": { | ||
"whatwg-encoding": "^1.0.5" | ||
"whatwg-encoding": "^2.0.0" | ||
}, | ||
"devDependencies": { | ||
"eslint": "^6.8.0", | ||
"mocha": "^7.0.0" | ||
"@domenic/eslint-config": "^1.4.0", | ||
"eslint": "^7.32.0", | ||
"mocha": "^9.1.1" | ||
}, | ||
"engines": { | ||
"node": ">=10" | ||
"node": ">=12" | ||
} | ||
} |
@@ -9,6 +9,8 @@ # Determine the Encoding of a HTML Byte Stream | ||
const htmlBuffer = fs.readFileSync("./html-page.html"); | ||
const sniffedEncoding = htmlEncodingSniffer(htmlBuffer); | ||
const htmlBytes = fs.readFileSync("./html-page.html"); | ||
const sniffedEncoding = htmlEncodingSniffer(htmlBytes); | ||
``` | ||
The passed bytes are given as a `Uint8Array`; the Node.js `Buffer` subclass of `Uint8Array` will also work, as shown above. | ||
The returned value will be a canonical [encoding name](https://encoding.spec.whatwg.org/#names-and-labels) (not a label). You might then combine this with the [whatwg-encoding](https://github.com/jsdom/whatwg-encoding) package to decode the result: | ||
@@ -18,3 +20,3 @@ | ||
const whatwgEncoding = require("whatwg-encoding"); | ||
const htmlString = whatwgEncoding.decode(htmlBuffer, sniffedEncoding); | ||
const htmlString = whatwgEncoding.decode(htmlBytes, sniffedEncoding); | ||
``` | ||
@@ -27,3 +29,3 @@ | ||
```js | ||
const sniffedEncoding = htmlEncodingSniffer(htmlBuffer, { | ||
const sniffedEncoding = htmlEncodingSniffer(htmlBytes, { | ||
transportLayerEncodingLabel, | ||
@@ -30,0 +32,0 @@ defaultEncoding |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
11679
41
3
+ Added iconv-lite@0.6.3 (transitive)
+ Added whatwg-encoding@2.0.0 (transitive)
- Removed iconv-lite@0.4.24 (transitive)
- Removed whatwg-encoding@1.0.5 (transitive)
Updated whatwg-encoding@^2.0.0