Comparing version 3.0.0 to 3.1.0
@@ -5,4 +5,10 @@ export interface IDetectedMap { | ||
} | ||
export function detect(buffer: Buffer | string, options?: { minimumThreshold: number }): IDetectedMap; | ||
export interface IOptionsMap { | ||
minimumThreshold?: number, | ||
detectEncodings?: Array<string> | ||
} | ||
export function detect(buffer: Buffer | string, options?: IOptionsMap): IDetectedMap; | ||
export function detectAll(buffer: Buffer | string, options?: IOptionsMap): IDetectedMap[]; | ||
export function enableDebug(): void; |
{ | ||
"name": "jschardet", | ||
"version": "3.0.0", | ||
"version": "3.1.0", | ||
"description": "Character encoding auto-detection in JavaScript (port of python's chardet)", | ||
@@ -20,10 +20,12 @@ "author": "António Afonso", | ||
}, | ||
"dependencies": {}, | ||
"devDependencies": { | ||
"browserify": "~12.0.1", | ||
"google-closure-compiler": "20151015.0.0" | ||
"browserify": "~17.0.0", | ||
"google-closure-compiler": "20151015.0.0", | ||
"jest": "^29.7.0" | ||
}, | ||
"scripts": { | ||
"dist-dev": "mkdir -p dist && browserify index.js -s jschardet --detect-globals false -o dist/jschardet.js", | ||
"dist": "npm run dist-dev && java -jar node_modules/google-closure-compiler/compiler.jar --warning_level QUIET --compilation_level SIMPLE_OPTIMIZATIONS --js dist/jschardet.js > dist/jschardet.min.js" | ||
"dist-dev": "mkdir -p dist && browserify index.js -s jschardet --detect-globals false -o dist/jschardet.js && ./scripts/show-size-changes.sh dist/jschardet.js", | ||
"dist": "npm run dist-dev && java -jar node_modules/google-closure-compiler/compiler.jar --warning_level QUIET --compilation_level SIMPLE_OPTIMIZATIONS --language_in=ECMASCRIPT6_STRICT --language_out=ES5 --js dist/jschardet.js > dist/jschardet.min.js && ./scripts/show-size-changes.sh dist/jschardet.min.js", | ||
"dist-size-changes": "./scripts/show-size-changes.sh dist/*", | ||
"test": "jest" | ||
}, | ||
@@ -30,0 +32,0 @@ "engines": { |
@@ -17,3 +17,3 @@ [![NPM](https://nodei.co/npm/jschardet.png?downloads=true&downloadRank=true)](https://nodei.co/npm/jschardet/) | ||
### Node | ||
``` | ||
``` | ||
npm install jschardet | ||
@@ -58,2 +58,6 @@ ``` | ||
jschardet.detect(str, { minimumThreshold: 0 }); | ||
// Lock down which encodings to detect. This is useful when jschardet
// gives a higher probability to encodings that you never use.
jschardet.detect(str, { detectEncodings: ["UTF-8", "windows-1252"] }); | ||
``` | ||
@@ -60,0 +64,0 @@ |
@@ -66,2 +66,6 @@ /* | ||
this.getSupportedCharsetNames = function() { | ||
throw new Error("Unimplemented method getSupportedCharsetNames()"); | ||
} | ||
this.feed = function(aBuf) { | ||
@@ -68,0 +72,0 @@ for( var i = 0, prober; prober = this._mProbers[i]; i++ ) { |
@@ -41,2 +41,6 @@ /* | ||
this.getSupportedCharsetNames = function() { | ||
throw new Error("Unimplemented method getSupportedCharsetNames()"); | ||
} | ||
this.feed = function(aBuf) { | ||
@@ -43,0 +47,0 @@ } |
@@ -47,2 +47,6 @@ /* | ||
]; | ||
self._supportedCharsetNames = []; | ||
for (const codingSM of self._mCodingSM) { | ||
self._supportedCharsetNames.push(codingSM.getCodingStateMachine()); | ||
} | ||
self.reset(); | ||
@@ -66,2 +70,6 @@ } | ||
this.getSupportedCharsetNames = function() { | ||
return self._supportedCharsetNames; | ||
} | ||
this.getConfidence = function() { | ||
@@ -68,0 +76,0 @@ if( this._mDetectedCharset ) { |
@@ -74,4 +74,5 @@ /* | ||
this._mLastChar[1] = aBuf[0]; | ||
this._mContextAnalyzer.feed(this._mLastChar, charLen); | ||
this._mDistributionAnalyzer.feed(this._mLastChar, charLen); | ||
var lastCharStr = this._mLastChar.join(''); | ||
this._mContextAnalyzer.feed(lastCharStr, charLen); | ||
this._mDistributionAnalyzer.feed(lastCharStr, charLen); | ||
} else { | ||
@@ -78,0 +79,0 @@ this._mContextAnalyzer.feed(aBuf.slice(i-1,i+1), charLen); |
@@ -115,2 +115,6 @@ /* | ||
this.getSupportedCharsetNames = function() { | ||
return [this.getCharsetName()]; | ||
} | ||
this.feed = function(aBuf) { | ||
@@ -117,0 +121,0 @@ aBuf = this.filterWithEnglishLetters(aBuf); |
@@ -42,4 +42,3 @@ /* | ||
self._mCodingSM = null; | ||
//self._mLastChar = ["\x00", "\x00"]; | ||
self._mLastChar = "\x00\x00"; | ||
self._mLastChar = ["\x00", "\x00"]; | ||
} | ||
@@ -55,4 +54,3 @@ | ||
} | ||
//this._mLastChar = ["\x00", "\x00"]; | ||
this._mLastChar = "\x00\x00"; | ||
this._mLastChar = ["\x00", "\x00"]; | ||
} | ||
@@ -78,3 +76,3 @@ | ||
this._mLastChar[1] = aBuf[0]; | ||
this._mDistributionAnalyzer.feed(this._mLastChar, charLen); | ||
this._mDistributionAnalyzer.feed(this._mLastChar.join(''), charLen); | ||
} else { | ||
@@ -81,0 +79,0 @@ this._mDistributionAnalyzer.feed(aBuf.slice(i-1,i+1), charLen); |
@@ -50,2 +50,12 @@ /* | ||
]; | ||
// Charset names reported by the probers in this group, collected once at
// construction time by asking each entry of this._mProbers for its name.
// The previous code stored the collector function itself (it was never
// invoked), so getSupportedCharsetNames() handed callers a function instead
// of an array of names.
const supportedCharsetNames = this._mProbers.map(function(prober) {
    return prober.getCharsetName();
});

/**
 * @returns {string[]} the charset name of every prober in this group.
 */
this.getSupportedCharsetNames = function() {
    return supportedCharsetNames;
}
this.reset(); | ||
@@ -52,0 +62,0 @@ } |
@@ -67,5 +67,14 @@ /* | ||
self._supportedCharsetNames = []; | ||
for (const prober of self._mProbers) { | ||
self._supportedCharsetNames.push(prober.getCharsetName()) | ||
} | ||
self.reset(); | ||
} | ||
this.getSupportedCharsetNames = function() { | ||
return self._supportedCharsetNames; | ||
} | ||
init(); | ||
@@ -72,0 +81,0 @@ } |
@@ -74,4 +74,4 @@ /* | ||
this._mLastChar[1] = aBuf[0]; | ||
this._mContextAnalyzer.feed(this._mLastChar.slice(2 - charLen), charLen); | ||
this._mDistributionAnalyzer.feed(this._mLastChar, charLen); | ||
this._mContextAnalyzer.feed(this._mLastChar.slice(2 - charLen).join(''), charLen); | ||
this._mDistributionAnalyzer.feed(this._mLastChar.join(''), charLen); | ||
} else { | ||
@@ -78,0 +78,0 @@ this._mContextAnalyzer.feed(aBuf.slice(i + 1 - charLen, i + 3 - charLen), charLen); |
@@ -38,5 +38,34 @@ /* | ||
var Latin1Prober = require('./latin1prober'); | ||
var EscCharSetProber = require('./escprober') | ||
var EscCharSetProber = require('./escprober'); | ||
var logger = require('./logger'); | ||
/**
 * Encoding names that may be requested through the `detectEncodings` option:
 * the encodings identified from a byte-order mark plus every name the
 * top-level probers advertise through getSupportedCharsetNames().
 * Each name appears once, in first-seen order.
 */
const supportedEncodings = (function() {
    // Encodings recognised from a BOM ("UTF-32BE" used to be listed twice,
    // which leaked into the "not supported" error message).
    const BOM_UTF = [
        "UTF-8", "UTF-32LE", "UTF-32BE", "UTF-16LE", "UTF-16BE",
        "X-ISO-10646-UCS-4-3412", "X-ISO-10646-UCS-4-2143"
    ];
    const probers = [
        new EscCharSetProber(),
        new MBCSGroupProber(),
        new SBCSGroupProber(),
        new Latin1Prober()
    ];
    const collected = BOM_UTF.slice(0);
    for (const prober of probers) {
        [].push.apply(collected, prober.getSupportedCharsetNames());
    }
    // A Set keeps insertion order, so this only removes repeated names.
    return Array.from(new Set(collected));
})();
/**
 * Lookup list used to validate user-supplied encoding names. For every entry
 * of `supportedEncodings` it holds the lower-cased name and the lower-cased
 * name with all hyphens removed (e.g. "utf-8" and "utf8").
 */
const supportedEncodingsDenormalized = (function() {
    // Declared locally: the earlier bare assignment created an implicit
    // global and throws a ReferenceError in strict mode.
    const denormalizedEncodings = [];
    for (const encoding of supportedEncodings) {
        // toLowerCase() rather than toLocaleLowerCase(): lookups against this
        // list lower-case with toLowerCase(), and locale-specific rules
        // (e.g. Turkish dotless i) would make the two sides disagree.
        const lowerCased = encoding.toLowerCase();
        denormalizedEncodings.push(lowerCased, lowerCased.replace(/-/g, ""));
    }
    return denormalizedEncodings;
})();
function UniversalDetector(options) { | ||
@@ -46,2 +75,10 @@ if (!options) options = {}; | ||
if (options.detectEncodings) { | ||
for (const encoding of options.detectEncodings) { | ||
if (!supportedEncodingsDenormalized.includes(encoding.toLowerCase())) { | ||
throw new Error(`Encoding ${encoding} is not supported. Supported encodings: ${supportedEncodings}.`); | ||
} | ||
} | ||
} | ||
var _state = { | ||
@@ -62,2 +99,9 @@ pureAscii : 0, | ||
function canDetectEncoding(encoding) { | ||
if (!options.detectEncodings) { | ||
return true; | ||
} | ||
return options.detectEncodings.includes(encoding.toLowerCase()); | ||
} | ||
this.reset = function() { | ||
@@ -70,3 +114,3 @@ this.result = {"encoding": null, "confidence": 0.0}; | ||
this._mInputState = _state.pureAscii; | ||
this._mLastChar = ""; | ||
this._mLastChar = []; | ||
this._mBOM = ""; | ||
@@ -90,21 +134,21 @@ if( this._mEscCharsetProber ) { | ||
// If the data starts with BOM, we know it is UTF | ||
if( this._mBOM.slice(0,3) == "\xEF\xBB\xBF" ) { | ||
if( this._mBOM.slice(0,3) == "\xEF\xBB\xBF" && canDetectEncoding("UTF-8")) { | ||
// EF BB BF UTF-8 with BOM | ||
this.result = {"encoding": "UTF-8", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,4) == "\xFF\xFE\x00\x00" ) { | ||
} else if( this._mBOM.slice(0,4) == "\xFF\xFE\x00\x00" && canDetectEncoding("UTF-32LE") ) { | ||
// FF FE 00 00 UTF-32, little-endian BOM | ||
this.result = {"encoding": "UTF-32LE", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFE\xFF" ) { | ||
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFE\xFF" && canDetectEncoding("UTF-32BE")) { | ||
// 00 00 FE FF UTF-32, big-endian BOM | ||
this.result = {"encoding": "UTF-32BE", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,4) == "\xFE\xFF\x00\x00" ) { | ||
} else if( this._mBOM.slice(0,4) == "\xFE\xFF\x00\x00" && canDetectEncoding("X-ISO-10646-UCS-4-3412")) { | ||
// FE FF 00 00 UCS-4, unusual octet order BOM (3412) | ||
this.result = {"encoding": "X-ISO-10646-UCS-4-3412", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFF\xFE" ) { | ||
} else if( this._mBOM.slice(0,4) == "\x00\x00\xFF\xFE" && canDetectEncoding("X-ISO-10646-UCS-4-2143")) { | ||
// 00 00 FF FE UCS-4, unusual octet order BOM (2143) | ||
this.result = {"encoding": "X-ISO-10646-UCS-4-2143", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,2) == "\xFF\xFE" ) { | ||
} else if( this._mBOM.slice(0,2) == "\xFF\xFE" && canDetectEncoding("UTF-16LE")) { | ||
// FF FE UTF-16, little endian BOM | ||
this.result = {"encoding": "UTF-16LE", "confidence": 1.0}; | ||
} else if( this._mBOM.slice(0,2) == "\xFE\xFF" ) { | ||
} else if( this._mBOM.slice(0,2) == "\xFE\xFF" && canDetectEncoding("UTF-16BE")) { | ||
// FE FF UTF-16, big endian BOM | ||
@@ -133,3 +177,3 @@ this.result = {"encoding": "UTF-16BE", "confidence": 1.0}; | ||
this._mInputState = _state.highbyte; | ||
} else if( this._escDetector.test(this._mLastChar + aBuf) ) { | ||
} else if( this._escDetector.test(this._mLastChar.join('') + aBuf) ) { | ||
this._mInputState = _state.escAscii; | ||
@@ -139,3 +183,3 @@ } | ||
this._mLastChar = aBuf.slice(-1); | ||
this._mLastChar = aBuf.slice(-1).split(''); | ||
@@ -146,3 +190,3 @@ if( this._mInputState == _state.escAscii ) { | ||
} | ||
if( this._mEscCharsetProber.feed(aBuf) == constants.foundIt ) { | ||
if( this._mEscCharsetProber.feed(aBuf) == constants.foundIt && canDetectEncoding(this._mEscCharsetProber.getCharsetName()) ) { | ||
this.result = { | ||
@@ -164,3 +208,3 @@ "encoding": this._mEscCharsetProber.getCharsetName(), | ||
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { | ||
if( prober.feed(aBuf) == constants.foundIt ) { | ||
if( prober.feed(aBuf) == constants.foundIt && canDetectEncoding(prober.getCharsetName()) ) { | ||
this.result = { | ||
@@ -186,3 +230,3 @@ "encoding": prober.getCharsetName(), | ||
if( this._mInputState == _state.pureAscii ) { | ||
if( this._mInputState == _state.pureAscii && canDetectEncoding("ascii") ) { | ||
logger.log("pure ascii") | ||
@@ -196,3 +240,3 @@ this.result = {"encoding": "ascii", "confidence": 1.0}; | ||
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { | ||
if( !prober || !prober.getCharsetName()) continue; | ||
if( !prober || !prober.getCharsetName() || !canDetectEncoding(prober.getCharsetName()) ) continue; | ||
this.results.push({ | ||
@@ -219,3 +263,3 @@ "encoding": prober.getCharsetName(), | ||
for( var i = 0, prober; prober = this._mCharsetProbers[i]; i++ ) { | ||
if( !prober ) continue; | ||
if( !prober || !canDetectEncoding(prober.getCharsetName()) ) continue; | ||
logger.log(prober.getCharsetName() + " confidence = " + | ||
@@ -222,0 +266,0 @@ prober.getConfidence() + "\n"); |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
1316392
56
16257
102
3