jschardet - npm Package Compare versions

.github/workflows/build.yml

.github/workflows/npm-publish.yml

scripts/run-workflow.sh

scripts/show-size-changes.sh

8

index.d.ts

		@@ -5,4 +5,10 @@ export interface IDetectedMap {
		}
		export function detect(buffer: Buffer \| string, options?: { minimumThreshold: number }): IDetectedMap;
		/** Optional settings accepted by `detect` and `detectAll`. */
		export interface IOptionsMap {
		/** NOTE(review): presumably the lowest confidence a result may have and still be reported — confirm against the detector. */
		minimumThreshold?: number,
		/** Encoding names that detection should be restricted to, e.g. `["UTF-8", "windows-1252"]`. */
		detectEncodings?: Array<string>
		}
		/**
		 * Detects the character encoding of `buffer`.
		 *
		 * @param buffer  Data to analyse.
		 * @param options Optional detection settings.
		 * @returns The detection result.
		 */
		export function detect(buffer: Buffer | string, options?: IOptionsMap): IDetectedMap;

		/**
		 * Like `detect`, but returns a list of detection results instead of a single one.
		 *
		 * @param buffer  Data to analyse.
		 * @param options Optional detection settings.
		 * @returns The detection results as an array.
		 */
		export function detectAll(buffer: Buffer | string, options?: IOptionsMap): IDetectedMap[];

		/** NOTE(review): presumably switches on the library's debug logging — confirm against the implementation. */
		export function enableDebug(): void;

14

package.json

		{
		"name": "jschardet",
		"version": "3.0.0",
		"version": "3.1.0",
		"description": "Character encoding auto-detection in JavaScript (port of python's chardet)",
		@@ -20,10 +20,12 @@ "author": "António Afonso",
		},
		"dependencies": {},
		"devDependencies": {
		"browserify": "~12.0.1",
		"google-closure-compiler": "20151015.0.0"
		"browserify": "~17.0.0",
		"google-closure-compiler": "20151015.0.0",
		"jest": "^29.7.0"
		},
		"scripts": {
		"dist-dev": "mkdir -p dist && browserify index.js -s jschardet --detect-globals false -o dist/jschardet.js",
		"dist": "npm run dist-dev && java -jar node_modules/google-closure-compiler/compiler.jar --warning_level QUIET --compilation_level SIMPLE_OPTIMIZATIONS --js dist/jschardet.js > dist/jschardet.min.js"
		"dist-dev": "mkdir -p dist && browserify index.js -s jschardet --detect-globals false -o dist/jschardet.js && ./scripts/show-size-changes.sh dist/jschardet.js",
		"dist": "npm run dist-dev && java -jar node_modules/google-closure-compiler/compiler.jar --warning_level QUIET --compilation_level SIMPLE_OPTIMIZATIONS --language_in=ECMASCRIPT6_STRICT --language_out=ES5 --js dist/jschardet.js > dist/jschardet.min.js && ./scripts/show-size-changes.sh dist/jschardet.min.js",
		"dist-size-changes": "./scripts/show-size-changes.sh dist/*",
		"test": "jest"
		},
		@@ -30,0 +32,0 @@ "engines": {

6

README.md

		@@ -17,3 +17,3 @@ [![NPM](https://nodei.co/npm/jschardet.png?downloads=true&downloadRank=true)](https://nodei.co/npm/jschardet/)
		### Node
		```
		```
		npm install jschardet
		@@ -58,2 +58,6 @@ ```
		jschardet.detect(str, { minimumThreshold: 0 });

		// Restrict detection to a fixed set of encodings. This is useful when jschardet
		// assigns a higher probability to encodings that you never use.
		jschardet.detect(str, { detectEncodings: ["UTF-8", "windows-1252"] });
		```
		@@ -60,0 +64,0 @@

4

src/charsetgroupprober.js

		@@ -66,2 +66,6 @@ /*

		/**
		 * Base implementation for group probers: it has no charset list of its
		 * own and always throws. Concrete group probers assign their own
		 * getSupportedCharsetNames.
		 *
		 * @throws {Error} Always.
		 */
		this.getSupportedCharsetNames = function() {
			var unimplemented = new Error("Unimplemented method getSupportedCharsetNames()");
			throw unimplemented;
		}

		this.feed = function(aBuf) {
		@@ -68,0 +72,0 @@ for( var i = 0, prober; prober = this._mProbers[i]; i++ ) {

4

src/charsetprober.js

		@@ -41,2 +41,6 @@ /*

		/**
		 * Base implementation: it has no charset list of its own and always
		 * throws. Concrete probers assign their own getSupportedCharsetNames.
		 *
		 * @throws {Error} Always.
		 */
		this.getSupportedCharsetNames = function() {
			var unimplemented = new Error("Unimplemented method getSupportedCharsetNames()");
			throw unimplemented;
		}

		this.feed = function(aBuf) {
		@@ -43,0 +47,0 @@ }

8

src/escprober.js

		@@ -47,2 +47,6 @@ /*
		];
		// Collect, once, the value each coding state machine reports through
		// getCodingStateMachine(); getSupportedCharsetNames() hands this list out.
		self._supportedCharsetNames = Array.from(self._mCodingSM, function(stateMachine) {
			return stateMachine.getCodingStateMachine();
		});
		self.reset();
		@@ -66,2 +70,6 @@ }

		/**
		 * Returns the list stored in self._supportedCharsetNames.
		 * The same array instance is returned on every call (no copy is made).
		 */
		this.getSupportedCharsetNames = function() {
		return self._supportedCharsetNames;
		}

		this.getConfidence = function() {
		@@ -68,0 +76,0 @@ if( this._mDetectedCharset ) {

5

src/eucjpprober.js

		@@ -74,4 +74,5 @@ /*
		this._mLastChar[1] = aBuf[0];
		this._mContextAnalyzer.feed(this._mLastChar, charLen);
		this._mDistributionAnalyzer.feed(this._mLastChar, charLen);
		var lastCharStr = this._mLastChar.join('');
		this._mContextAnalyzer.feed(lastCharStr, charLen);
		this._mDistributionAnalyzer.feed(lastCharStr, charLen);
		} else {
		@@ -78,0 +79,0 @@ this._mContextAnalyzer.feed(aBuf.slice(i-1,i+1), charLen);

4

src/latin1prober.js

		@@ -115,2 +115,6 @@ /*

		/**
		 * Returns a new one-element array holding whatever getCharsetName()
		 * reports for this prober.
		 */
		this.getSupportedCharsetNames = function() {
			var charsetName = this.getCharsetName();
			return [charsetName];
		}

		this.feed = function(aBuf) {
		@@ -117,0 +121,0 @@ aBuf = this.filterWithEnglishLetters(aBuf);

8

src/mbcharsetprober.js

		@@ -42,4 +42,3 @@ /*
		self._mCodingSM = null;
		//self._mLastChar = ["\x00", "\x00"];
		self._mLastChar = "\x00\x00";
		self._mLastChar = ["\x00", "\x00"];
		}
		@@ -55,4 +54,3 @@
		}
		//this._mLastChar = ["\x00", "\x00"];
		this._mLastChar = "\x00\x00";
		this._mLastChar = ["\x00", "\x00"];
		}
		@@ -78,3 +76,3 @@
		this._mLastChar[1] = aBuf[0];
		this._mDistributionAnalyzer.feed(this._mLastChar, charLen);
		this._mDistributionAnalyzer.feed(this._mLastChar.join(''), charLen);
		} else {
		@@ -81,0 +79,0 @@ this._mDistributionAnalyzer.feed(aBuf.slice(i-1,i+1), charLen);

10

src/mbcsgroupprober.js

		@@ -50,2 +50,12 @@ /*
		];
		// Charset names of all child probers, collected once at construction.
		// The list is built inline so that `this` is the prober being
		// constructed and the result is an array: the previous version wrapped
		// the loop in a function expression that was never invoked, so callers
		// received a function instead of the names.
		const supportedCharsetNames = [];
		for (const prober of this._mProbers) {
			supportedCharsetNames.push(prober.getCharsetName());
		}
		/**
		 * Returns the charset names gathered from this group's child probers.
		 * The same array instance is returned on every call.
		 */
		this.getSupportedCharsetNames = function() {
			return supportedCharsetNames;
		}
		this.reset();
		@@ -52,0 +62,0 @@ }

9

src/sbcsgroupprober.js

		@@ -67,5 +67,14 @@ /*

		// Collect, once, the charset name of every child prober;
		// getSupportedCharsetNames() hands this list out.
		self._supportedCharsetNames = Array.from(self._mProbers, function(childProber) {
			return childProber.getCharsetName();
		});

		self.reset();
		}

		/**
		 * Returns the list stored in self._supportedCharsetNames.
		 * The same array instance is returned on every call (no copy is made).
		 */
		this.getSupportedCharsetNames = function() {
		return self._supportedCharsetNames;
		}

		init();
		@@ -72,0 +81,0 @@ }

4

src/sjisprober.js

		@@ -74,4 +74,4 @@ /*
		this._mLastChar[1] = aBuf[0];
		this._mContextAnalyzer.feed(this._mLastChar.slice(2 - charLen), charLen);
		this._mDistributionAnalyzer.feed(this._mLastChar, charLen);
		this._mContextAnalyzer.feed(this._mLastChar.slice(2 - charLen).join(''), charLen);
		this._mDistributionAnalyzer.feed(this._mLastChar.join(''), charLen);
		} else {
		@@ -78,0 +78,0 @@ this._mContextAnalyzer.feed(aBuf.slice(i + 1 - charLen, i + 3 - charLen), charLen);

76

src/universaldetector.js

		@@ -38,5 +38,34 @@ /*
		var Latin1Prober = require('./latin1prober');
		var EscCharSetProber = require('./escprober')
		var EscCharSetProber = require('./escprober');
		var logger = require('./logger');

		// Every encoding name accepted in `options.detectEncodings`: the
		// encodings recognised from a byte-order mark plus whatever each
		// prober reports through getSupportedCharsetNames(). Computed once
		// when the module loads.
		const supportedEncodings = (function() {
			// Encodings identified from a BOM. Each name appears once so the
			// "Supported encodings" error message does not repeat entries.
			const BOM_UTF = [
				"UTF-8", "UTF-32LE", "UTF-32BE", "UTF-16LE", "UTF-16BE",
				"X-ISO-10646-UCS-4-3412", "X-ISO-10646-UCS-4-2143"
			];
			const probers = [
				new EscCharSetProber(),
				new MBCSGroupProber(),
				new SBCSGroupProber(),
				new Latin1Prober()
			];
			const encodings = BOM_UTF.slice(0);
			for (const prober of probers) {
				// Append all names in one call without creating an
				// intermediate array.
				[].push.apply(encodings, prober.getSupportedCharsetNames());
			}
			// NOTE(review): "ascii" is passed to canDetectEncoding() elsewhere
			// in this file but is not part of this list — confirm whether it
			// should be selectable through detectEncodings.
			return encodings;
		})();

		// Lookup list used to validate user-supplied encoding names. For every
		// supported encoding it holds two spellings: lower-cased, and
		// lower-cased with all dashes removed (e.g. "utf-8" and "utf8").
		const supportedEncodingsDenormalized = (function() {
			// Declared locally; an undeclared assignment would leak a global
			// and throw in strict mode.
			const denormalizedEncodings = [];
			for (const encoding of supportedEncodings) {
				// toLowerCase() is locale-independent and matches the
				// toLowerCase() applied to user input before the lookup.
				const lowerCased = encoding.toLowerCase();
				denormalizedEncodings.push(
					lowerCased,
					lowerCased.replace(/-/g, "")
				);
			}
			return denormalizedEncodings;
		})();

		function UniversalDetector(options) {
		@@ -46,2 +75,10 @@ if (!options) options = {};

		// Reject unknown encoding names up front: the first entry of
		// options.detectEncodings whose lower-cased form is not in
		// supportedEncodingsDenormalized aborts construction with an Error.
		const requestedEncodings = options.detectEncodings;
		if (requestedEncodings) {
			for (const requested of requestedEncodings) {
				const isKnown = supportedEncodingsDenormalized.includes(requested.toLowerCase());
				if (isKnown) continue;
				throw new Error(`Encoding ${requested} is not supported. Supported encodings: ${supportedEncodings}.`);
			}
		}

		var _state = {
		@@ -62,2 +99,9 @@ pureAscii : 0,

		/**
		 * Tells whether `encoding` may be reported given options.detectEncodings.
		 *
		 * Names are compared case-insensitively and ignoring dashes, so that
		 * "UTF-8", "utf-8" and "utf8" all refer to the same encoding. Comparing
		 * the lower-cased candidate against the caller's raw list would never
		 * match entries written in their canonical upper-case form.
		 *
		 * @param {string} encoding Encoding name produced by the detector.
		 * @returns {boolean} true when no restriction is configured or when
		 *     `encoding` is one of the requested encodings.
		 */
		function canDetectEncoding(encoding) {
			if (!options.detectEncodings) {
				return true;
			}
			var normalize = function(name) {
				return name.toLowerCase().replace(/-/g, "");
			};
			var wanted = normalize(encoding);
			return options.detectEncodings.some(function(requested) {
				return normalize(requested) === wanted;
			});
		}

		this.reset = function() {
		@@ -70,3 +114,3 @@ this.result = {"encoding": null, "confidence": 0.0};
		this._mInputState = _state.pureAscii;
		this._mLastChar = "";
		this._mLastChar = [];
		this._mBOM = "";
		@@ -90,21 +134,21 @@ if( this._mEscCharsetProber ) {
		// If the data starts with BOM, we know it is UTF
		if( this._mBOM.slice(0,3) == "\xEF\xBB\xBF" ) {
		if( this._mBOM.slice(0,3) == "\xEF\xBB\xBF" && canDetectEncoding("UTF-8")) {
		// EF BB BF UTF-8 with BOM
		this.result = {"encoding": "UTF-8", "confidence": 1.0};
		} else if( this._mBOM.slice(0,4) == "\xFF\xFE\x00\x00" ) {
		} else if( this._mBOM.slice(0,4) == "\xFF\xFE\x00\x00" && canDetectEncoding("UTF-32LE") ) {
		// FF FE 00 00 UTF-32, little-endian BOM
		this.result = {"encoding": "UTF-32LE", "confidence": 1.0};
		} else if( this._mBOM.slice(0,4) == "\x00\x00\xFE\xFF" ) {
		} else if( this._mBOM.slice(0,4) == "\x00\x00\xFE\xFF" && canDetectEncoding("UTF-32BE")) {
		// 00 00 FE FF UTF-32, big-endian BOM
		this.result = {"encoding": "UTF-32BE", "confidence": 1.0};
		} else if( this._mBOM.slice(0,4) == "\xFE\xFF\x00\x00" ) {
		} else if( this._mBOM.slice(0,4) == "\xFE\xFF\x00\x00" && canDetectEncoding("X-ISO-10646-UCS-4-3412")) {
		// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
		this.result = {"encoding": "X-ISO-10646-UCS-4-3412", "confidence": 1.0};
		} else if( this._mBOM.slice(0,4) == "\x00\x00\xFF\xFE" ) {
		} else if( this._mBOM.slice(0,4) == "\x00\x00\xFF\xFE" && canDetectEncoding("X-ISO-10646-UCS-4-2143")) {
		// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
		this.result = {"encoding": "X-ISO-10646-UCS-4-2143", "confidence": 1.0};
		} else if( this._mBOM.slice(0,2) == "\xFF\xFE" ) {
		} else if( this._mBOM.slice(0,2) == "\xFF\xFE" && canDetectEncoding("UTF-16LE")) {
		// FF FE UTF-16, little endian BOM
		this.result = {"encoding": "UTF-16LE", "confidence": 1.0};
		} else if( this._mBOM.slice(0,2) == "\xFE\xFF" ) {
		} else if( this._mBOM.slice(0,2) == "\xFE\xFF" && canDetectEncoding("UTF-16BE")) {
		// FE FF UTF-16, big endian BOM
		@@ -133,3 +177,3 @@ this.result = {"encoding": "UTF-16BE", "confidence": 1.0};
		this._mInputState = _state.highbyte;
		} else if( this._escDetector.test(this._mLastChar + aBuf) ) {
		} else if( this._escDetector.test(this._mLastChar.join('') + aBuf) ) {
		this._mInputState = _state.escAscii;
		@@ -139,3 +183,3 @@ }

		this._mLastChar = aBuf.slice(-1);
		this._mLastChar = aBuf.slice(-1).split('');

		@@ -146,3 +190,3 @@ if( this._mInputState == _state.escAscii ) {
		}
		if( this._mEscCharsetProber.feed(aBuf) == constants.foundIt ) {
		if( this._mEscCharsetProber.feed(aBuf) == constants.foundIt && canDetectEncoding(this._mEscCharsetProber.getCharsetName()) ) {
		this.result = {
		@@ -164,3 +208,3 @@ "encoding": this._mEscCharsetProber.getCharsetName(),
		for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) {
		if( prober.feed(aBuf) == constants.foundIt ) {
		if( prober.feed(aBuf) == constants.foundIt && canDetectEncoding(prober.getCharsetName()) ) {
		this.result = {
		@@ -186,3 +230,3 @@ "encoding": prober.getCharsetName(),

		if( this._mInputState == _state.pureAscii ) {
		if( this._mInputState == _state.pureAscii && canDetectEncoding("ascii") ) {
		logger.log("pure ascii")
		@@ -196,3 +240,3 @@ this.result = {"encoding": "ascii", "confidence": 1.0};
		for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) {
		if( !prober \|\| !prober.getCharsetName()) continue;
		if( !prober \|\| !prober.getCharsetName() \|\| !canDetectEncoding(prober.getCharsetName()) ) continue;
		this.results.push({
		@@ -219,3 +263,3 @@ "encoding": prober.getCharsetName(),
		for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) {
		if( !prober ) continue;
		if( !prober \|\| !canDetectEncoding(prober.getCharsetName()) ) continue;
		logger.log(prober.getCharsetName() + " confidence = " +
		@@ -222,0 +266,0 @@ prober.getConfidence() + "\n");

dist/jschardet.js

Sorry, the diff of this file is too big to display

dist/jschardet.min.js

Sorry, the diff of this file is too big to display

jschardet - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics

Worsened metrics