Comparing version
@@ -5,4 +5,10 @@ export interface IDetectedMap { | ||
} | ||
export function detect(buffer: Buffer | string, options?: { minimumThreshold: number }): IDetectedMap; | ||
/**
 * Options accepted by `detect` and `detectAll`.
 */
export interface IOptionsMap {
    /**
     * Confidence threshold applied to detection results.
     * NOTE(review): the exact cut-off semantics are not visible in this
     * declaration file — confirm against the detector implementation.
     */
    minimumThreshold?: number;
    /**
     * Restricts detection to the listed encodings,
     * e.g. `["UTF-8", "windows-1252"]`.
     */
    detectEncodings?: Array<string>;
}
export function detect(buffer: Buffer | string, options?: IOptionsMap): IDetectedMap; | ||
export function detectAll(buffer: Buffer | string, options?: IOptionsMap): IDetectedMap[]; | ||
export function enableDebug(): void; |
{ | ||
"name": "jschardet", | ||
"version": "3.0.0", | ||
"version": "3.1.0", | ||
"description": "Character encoding auto-detection in JavaScript (port of python's chardet)", | ||
@@ -20,10 +20,12 @@ "author": "António Afonso", | ||
}, | ||
"dependencies": {}, | ||
"devDependencies": { | ||
"browserify": "~12.0.1", | ||
"google-closure-compiler": "20151015.0.0" | ||
"browserify": "~17.0.0", | ||
"google-closure-compiler": "20151015.0.0", | ||
"jest": "^29.7.0" | ||
}, | ||
"scripts": { | ||
"dist-dev": "mkdir -p dist && browserify index.js -s jschardet --detect-globals false -o dist/jschardet.js", | ||
"dist": "npm run dist-dev && java -jar node_modules/google-closure-compiler/compiler.jar --warning_level QUIET --compilation_level SIMPLE_OPTIMIZATIONS --js dist/jschardet.js > dist/jschardet.min.js" | ||
"dist-dev": "mkdir -p dist && browserify index.js -s jschardet --detect-globals false -o dist/jschardet.js && ./scripts/show-size-changes.sh dist/jschardet.js", | ||
"dist": "npm run dist-dev && java -jar node_modules/google-closure-compiler/compiler.jar --warning_level QUIET --compilation_level SIMPLE_OPTIMIZATIONS --language_in=ECMASCRIPT6_STRICT --language_out=ES5 --js dist/jschardet.js > dist/jschardet.min.js && ./scripts/show-size-changes.sh dist/jschardet.min.js", | ||
"dist-size-changes": "./scripts/show-size-changes.sh dist/*", | ||
"test": "jest" | ||
}, | ||
@@ -30,0 +32,0 @@ "engines": { |
@@ -17,3 +17,3 @@ [](https://nodei.co/npm/jschardet/) | ||
### Node | ||
``` | ||
``` | ||
npm install jschardet | ||
@@ -58,2 +58,6 @@ ``` | ||
jschardet.detect(str, { minimumThreshold: 0 }); | ||
// Restrict detection to a fixed set of encodings. This is useful when jschardet
// assigns a higher probability to encodings that you never use.
jschardet.detect(str, { detectEncodings: ["UTF-8", "windows-1252"] }); | ||
``` | ||
@@ -60,0 +64,0 @@ |
@@ -66,2 +66,6 @@ /* | ||
this.getSupportedCharsetNames = function() { | ||
throw new Error("Unimplemented method getSupportedCharsetNames()"); | ||
} | ||
this.feed = function(aBuf) { | ||
@@ -68,0 +72,0 @@ for( var i = 0, prober; prober = this._mProbers[i]; i++ ) { |
@@ -41,2 +41,6 @@ /* | ||
this.getSupportedCharsetNames = function() { | ||
throw new Error("Unimplemented method getSupportedCharsetNames()"); | ||
} | ||
this.feed = function(aBuf) { | ||
@@ -43,0 +47,0 @@ } |
@@ -47,2 +47,6 @@ /* | ||
]; | ||
self._supportedCharsetNames = []; | ||
for (const codingSM of self._mCodingSM) { | ||
self._supportedCharsetNames.push(codingSM.getCodingStateMachine()); | ||
} | ||
self.reset(); | ||
@@ -66,2 +70,6 @@ } | ||
this.getSupportedCharsetNames = function() { | ||
return self._supportedCharsetNames; | ||
} | ||
this.getConfidence = function() { | ||
@@ -68,0 +76,0 @@ if( this._mDetectedCharset ) { |
@@ -74,4 +74,5 @@ /* | ||
this._mLastChar[1] = aBuf[0]; | ||
this._mContextAnalyzer.feed(this._mLastChar, charLen); | ||
this._mDistributionAnalyzer.feed(this._mLastChar, charLen); | ||
var lastCharStr = this._mLastChar.join(''); | ||
this._mContextAnalyzer.feed(lastCharStr, charLen); | ||
this._mDistributionAnalyzer.feed(lastCharStr, charLen); | ||
} else { | ||
@@ -78,0 +79,0 @@ this._mContextAnalyzer.feed(aBuf.slice(i-1,i+1), charLen); |
@@ -115,2 +115,6 @@ /* | ||
this.getSupportedCharsetNames = function() { | ||
return [this.getCharsetName()]; | ||
} | ||
this.feed = function(aBuf) { | ||
@@ -117,0 +121,0 @@ aBuf = this.filterWithEnglishLetters(aBuf); |
@@ -42,4 +42,3 @@ /* | ||
self._mCodingSM = null; | ||
//self._mLastChar = ["\x00", "\x00"]; | ||
self._mLastChar = "\x00\x00"; | ||
self._mLastChar = ["\x00", "\x00"]; | ||
} | ||
@@ -55,4 +54,3 @@ | ||
} | ||
//this._mLastChar = ["\x00", "\x00"]; | ||
this._mLastChar = "\x00\x00"; | ||
this._mLastChar = ["\x00", "\x00"]; | ||
} | ||
@@ -78,3 +76,3 @@ | ||
this._mLastChar[1] = aBuf[0]; | ||
this._mDistributionAnalyzer.feed(this._mLastChar, charLen); | ||
this._mDistributionAnalyzer.feed(this._mLastChar.join(''), charLen); | ||
} else { | ||
@@ -81,0 +79,0 @@ this._mDistributionAnalyzer.feed(aBuf.slice(i-1,i+1), charLen); |
@@ -50,2 +50,12 @@ /* | ||
]; | ||
const supportedCharsetNames = (function() { | ||
const charsetNames = []; | ||
for (const prober of this._mProbers) { | ||
charsetNames.push(prober.getCharsetName()) | ||
} | ||
return charsetNames; | ||
}); | ||
this.getSupportedCharsetNames = function() { | ||
return supportedCharsetNames; | ||
} | ||
this.reset(); | ||
@@ -52,0 +62,0 @@ } |
@@ -67,5 +67,14 @@ /* | ||
self._supportedCharsetNames = []; | ||
for (const prober of self._mProbers) { | ||
self._supportedCharsetNames.push(prober.getCharsetName()) | ||
} | ||
self.reset(); | ||
} | ||
this.getSupportedCharsetNames = function() { | ||
return self._supportedCharsetNames; | ||
} | ||
init(); | ||
@@ -72,0 +81,0 @@ } |
@@ -74,4 +74,4 @@ /* | ||
this._mLastChar[1] = aBuf[0]; | ||
this._mContextAnalyzer.feed(this._mLastChar.slice(2 - charLen), charLen); | ||
this._mDistributionAnalyzer.feed(this._mLastChar, charLen); | ||
this._mContextAnalyzer.feed(this._mLastChar.slice(2 - charLen).join(''), charLen); | ||
this._mDistributionAnalyzer.feed(this._mLastChar.join(''), charLen); | ||
} else { | ||
@@ -78,0 +78,0 @@ this._mContextAnalyzer.feed(aBuf.slice(i + 1 - charLen, i + 3 - charLen), charLen); |
@@ -38,5 +38,34 @@ /* | ||
var Latin1Prober = require('./latin1prober'); | ||
var EscCharSetProber = require('./escprober') | ||
var EscCharSetProber = require('./escprober'); | ||
var logger = require('./logger'); | ||
/**
 * Every encoding name the detector can report, computed once at module load:
 * the encodings identified from a byte-order mark, followed by whatever each
 * prober lists through getSupportedCharsetNames(). Each name appears once, in
 * first-seen order, so the "Supported encodings" error message stays readable.
 */
const supportedEncodings = (function() {
    // Encodings recognised from a BOM rather than by a prober.
    const BOM_UTF = [
        "UTF-8", "UTF-32LE", "UTF-32BE", "UTF-16LE", "UTF-16BE",
        "X-ISO-10646-UCS-4-3412", "X-ISO-10646-UCS-4-2143"
    ];
    const probers = [
        new EscCharSetProber(),
        new MBCSGroupProber(),
        new SBCSGroupProber(),
        new Latin1Prober()
    ];
    const collected = BOM_UTF.slice(0);
    for (const prober of probers) {
        [].push.apply(collected, prober.getSupportedCharsetNames());
    }
    // Drop repeats (a name can come from more than one source) while keeping
    // the position of the first occurrence.
    return collected.filter(function(name, index) {
        return collected.indexOf(name) === index;
    });
})();
/**
 * Lookup list used to validate user-supplied encoding names. For every entry
 * of supportedEncodings it holds two spellings: the lower-cased name and the
 * lower-cased name with all dashes removed (e.g. "utf-8" and "utf8").
 */
const supportedEncodingsDenormalized = (function() {
    // Declared locally: assigning an undeclared name would leak a global and
    // throws a ReferenceError in strict-mode code.
    const denormalizedEncodings = [];
    for (const encoding of supportedEncodings) {
        // toLowerCase() is locale-independent, so the result matches the
        // lower-casing applied to user input regardless of the host locale.
        const lowered = encoding.toLowerCase();
        denormalizedEncodings.push(lowered, lowered.replace(/-/g, ""));
    }
    return denormalizedEncodings;
})();
function UniversalDetector(options) { | ||
@@ -46,2 +75,10 @@ if (!options) options = {}; | ||
if (options.detectEncodings) { | ||
for (const encoding of options.detectEncodings) { | ||
if (!supportedEncodingsDenormalized.includes(encoding.toLowerCase())) { | ||
throw new Error(`Encoding ${encoding} is not supported. Supported encodings: ${supportedEncodings}.`); | ||
} | ||
} | ||
} | ||
var _state = { | ||
@@ -62,2 +99,9 @@ pureAscii : 0, | ||
function canDetectEncoding(encoding) { | ||
if (!options.detectEncodings) { | ||
return true; | ||
} | ||
return options.detectEncodings.includes(encoding.toLowerCase()); | ||
} | ||
this.reset = function() { | ||
@@ -70,3 +114,3 @@ this.result = {"encoding": null, "confidence": 0.0}; | ||
this._mInputState = _state.pureAscii; | ||
this._mLastChar = ""; | ||
this._mLastChar = []; | ||
this._mBOM = ""; | ||
@@ -90,21 +134,21 @@ if( this._mEscCharsetProber ) { | ||
// If the data starts with BOM, we know it is UTF | ||
if( this._mBOM.slice(0,3) == "\xEF\xBB\xBF" ) { | ||
if( this._mBOM.slice(0,3) == "\xEF\xBB\xBF" && canDetectEncoding("UTF-8")) { | ||
// EF BB BF UTF-8 with BOM | ||
this.result = {"encoding": "UTF-8", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,4) == "\xFF\xFE\x00\x00" ) { | ||
} else if( this._mBOM.slice(0,4) == "\xFF\xFE\x00\x00" && canDetectEncoding("UTF-32LE") ) { | ||
// FF FE 00 00 UTF-32, little-endian BOM | ||
this.result = {"encoding": "UTF-32LE", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFE\xFF" ) { | ||
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFE\xFF" && canDetectEncoding("UTF-32BE")) { | ||
// 00 00 FE FF UTF-32, big-endian BOM | ||
this.result = {"encoding": "UTF-32BE", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,4) == "\xFE\xFF\x00\x00" ) { | ||
} else if( this._mBOM.slice(0,4) == "\xFE\xFF\x00\x00" && canDetectEncoding("X-ISO-10646-UCS-4-3412")) { | ||
// FE FF 00 00 UCS-4, unusual octet order BOM (3412) | ||
this.result = {"encoding": "X-ISO-10646-UCS-4-3412", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFF\xFE" ) { | ||
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFF\xFE" && canDetectEncoding("X-ISO-10646-UCS-4-2143")) { | ||
// 00 00 FF FE UCS-4, unusual octet order BOM (2143) | ||
this.result = {"encoding": "X-ISO-10646-UCS-4-2143", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,2) == "\xFF\xFE" ) { | ||
} else if( this._mBOM.slice(0,2) == "\xFF\xFE" && canDetectEncoding("UTF-16LE")) { | ||
// FF FE UTF-16, little endian BOM | ||
this.result = {"encoding": "UTF-16LE", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,2) == "\xFE\xFF" ) { | ||
} else if( this._mBOM.slice(0,2) == "\xFE\xFF" && canDetectEncoding("UTF-16BE")) { | ||
// FE FF UTF-16, big endian BOM | ||
@@ -133,3 +177,3 @@ this.result = {"encoding": "UTF-16BE", "confidence": 1.0}; | ||
this._mInputState = _state.highbyte; | ||
} else if( this._escDetector.test(this._mLastChar + aBuf) ) { | ||
} else if( this._escDetector.test(this._mLastChar.join('') + aBuf) ) { | ||
this._mInputState = _state.escAscii; | ||
@@ -139,3 +183,3 @@ } | ||
this._mLastChar = aBuf.slice(-1); | ||
this._mLastChar = aBuf.slice(-1).split(''); | ||
@@ -146,3 +190,3 @@ if( this._mInputState == _state.escAscii ) { | ||
} | ||
if( this._mEscCharsetProber.feed(aBuf) == constants.foundIt ) { | ||
if( this._mEscCharsetProber.feed(aBuf) == constants.foundIt && canDetectEncoding(this._mEscCharsetProber.getCharsetName()) ) { | ||
this.result = { | ||
@@ -164,3 +208,3 @@ "encoding": this._mEscCharsetProber.getCharsetName(), | ||
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { | ||
if( prober.feed(aBuf) == constants.foundIt ) { | ||
if( prober.feed(aBuf) == constants.foundIt && canDetectEncoding(prober.getCharsetName()) ) { | ||
this.result = { | ||
@@ -186,3 +230,3 @@ "encoding": prober.getCharsetName(), | ||
if( this._mInputState == _state.pureAscii ) { | ||
if( this._mInputState == _state.pureAscii && canDetectEncoding("ascii") ) { | ||
logger.log("pure ascii") | ||
@@ -196,3 +240,3 @@ this.result = {"encoding": "ascii", "confidence": 1.0}; | ||
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { | ||
if( !prober || !prober.getCharsetName()) continue; | ||
if( !prober || !prober.getCharsetName() || !canDetectEncoding(prober.getCharsetName()) ) continue; | ||
this.results.push({ | ||
@@ -219,3 +263,3 @@ "encoding": prober.getCharsetName(), | ||
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { | ||
if( !prober ) continue; | ||
if( !prober || !canDetectEncoding(prober.getCharsetName()) ) continue; | ||
logger.log(prober.getCharsetName() + " confidence = " + | ||
@@ -222,0 +266,0 @@ prober.getConfidence() + "\n"); |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
1316392
1.42%56
7.69%16257
1.01%102
4.08%3
50%