iconv-lite
Advanced tools
Comparing version 0.4.8 to 0.4.9
# 0.4.9 / 2015-05-24 | ||
* Streamlined BOM handling: strip BOM by default, add BOM when encoding if | ||
addBOM: true. Added docs to Readme. | ||
* UTF16 now uses UTF16-LE by default. | ||
* Fixed minor issue with big5 encoding. | ||
* Added io.js testing on Travis; updated node-iconv version to test against. | ||
Now we just skip testing SBCS encodings that node-iconv doesn't support. | ||
* (internal refactoring) Updated codec interface to use classes. | ||
* Use strict mode in all files. | ||
# 0.4.8 / 2015-04-14 | ||
@@ -3,0 +15,0 @@ |
@@ -0,1 +1,2 @@ | ||
"use strict" | ||
@@ -6,5 +7,3 @@ // Multibyte codec. In this scheme, a character is represented by 1 or more bytes. | ||
exports._dbcs = function(options) { | ||
return new DBCSCodec(options); | ||
} | ||
exports._dbcs = DBCSCodec; | ||
@@ -23,11 +22,11 @@ var UNASSIGNED = -1, | ||
// Class DBCSCodec reads and initializes mapping tables. | ||
function DBCSCodec(options) { | ||
this.options = options; | ||
if (!options) | ||
function DBCSCodec(codecOptions, iconv) { | ||
this.encodingName = codecOptions.encodingName; | ||
if (!codecOptions) | ||
throw new Error("DBCS codec is called without the data.") | ||
if (!options.table) | ||
throw new Error("Encoding '" + options.encodingName + "' has no data."); | ||
if (!codecOptions.table) | ||
throw new Error("Encoding '" + this.encodingName + "' has no data."); | ||
// Load tables. | ||
var mappingTable = options.table(); | ||
var mappingTable = codecOptions.table(); | ||
@@ -54,3 +53,3 @@ | ||
this.defaultCharUnicode = options.iconv.defaultCharUnicode; | ||
this.defaultCharUnicode = iconv.defaultCharUnicode; | ||
@@ -75,7 +74,10 @@ | ||
var skipEncodeChars = {}; | ||
if (options.encodeSkipVals) | ||
for (var i = 0; i < options.encodeSkipVals.length; i++) { | ||
var range = options.encodeSkipVals[i]; | ||
for (var j = range.from; j <= range.to; j++) | ||
skipEncodeChars[j] = true; | ||
if (codecOptions.encodeSkipVals) | ||
for (var i = 0; i < codecOptions.encodeSkipVals.length; i++) { | ||
var val = codecOptions.encodeSkipVals[i]; | ||
if (typeof val === 'number') | ||
skipEncodeChars[val] = true; | ||
else | ||
for (var j = val.from; j <= val.to; j++) | ||
skipEncodeChars[j] = true; | ||
} | ||
@@ -87,9 +89,9 @@ | ||
// Add more encoding pairs when needed. | ||
if (options.encodeAdd) { | ||
for (var uChar in options.encodeAdd) | ||
if (Object.prototype.hasOwnProperty.call(options.encodeAdd, uChar)) | ||
this._setEncodeChar(uChar.charCodeAt(0), options.encodeAdd[uChar]); | ||
if (codecOptions.encodeAdd) { | ||
for (var uChar in codecOptions.encodeAdd) | ||
if (Object.prototype.hasOwnProperty.call(codecOptions.encodeAdd, uChar)) | ||
this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]); | ||
} | ||
this.defCharSB = this.encodeTable[0][options.iconv.defaultCharSingleByte.charCodeAt(0)]; | ||
this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)]; | ||
if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]['?']; | ||
@@ -100,4 +102,4 @@ if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0); | ||
// Load & create GB18030 tables when needed. | ||
if (typeof options.gb18030 === 'function') { | ||
this.gb18030 = options.gb18030(); // Load GB18030 ranges. | ||
if (typeof codecOptions.gb18030 === 'function') { | ||
this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges. | ||
@@ -124,45 +126,5 @@ // Add GB18030 decode tables. | ||
// Public interface: create encoder and decoder objects. | ||
// The methods (write, end) are simple functions to not inhibit optimizations. | ||
DBCSCodec.prototype.encoder = function encoderDBCS(options) { | ||
return { | ||
// Methods | ||
write: encoderDBCSWrite, | ||
end: encoderDBCSEnd, | ||
DBCSCodec.prototype.encoder = DBCSEncoder; | ||
DBCSCodec.prototype.decoder = DBCSDecoder; | ||
// Encoder state | ||
leadSurrogate: -1, | ||
seqObj: undefined, | ||
// Static data | ||
encodeTable: this.encodeTable, | ||
encodeTableSeq: this.encodeTableSeq, | ||
defaultCharSingleByte: this.defCharSB, | ||
gb18030: this.gb18030, | ||
// Export for testing | ||
findIdx: findIdx, | ||
} | ||
} | ||
DBCSCodec.prototype.decoder = function decoderDBCS(options) { | ||
return { | ||
// Methods | ||
write: decoderDBCSWrite, | ||
end: decoderDBCSEnd, | ||
// Decoder state | ||
nodeIdx: 0, | ||
prevBuf: new Buffer(0), | ||
// Static data | ||
decodeTables: this.decodeTables, | ||
decodeTableSeq: this.decodeTableSeq, | ||
defaultCharUnicode: this.defaultCharUnicode, | ||
gb18030: this.gb18030, | ||
} | ||
} | ||
// Decoder helpers | ||
@@ -188,3 +150,3 @@ DBCSCodec.prototype._getDecodeTrieNode = function(addr) { | ||
else | ||
throw new Error("Overwrite byte in " + this.options.encodingName + ", addr: " + addr.toString(16)); | ||
throw new Error("Overwrite byte in " + this.encodingName + ", addr: " + addr.toString(16)); | ||
} | ||
@@ -214,3 +176,3 @@ return node; | ||
else | ||
throw new Error("Incorrect surrogate pair in " + this.options.encodingName + " at chunk " + chunk[0]); | ||
throw new Error("Incorrect surrogate pair in " + this.encodingName + " at chunk " + chunk[0]); | ||
} | ||
@@ -236,6 +198,6 @@ else if (0x0FF0 < code && code <= 0x0FFF) { // Character sequence (our own encoding used) | ||
else | ||
throw new Error("Incorrect type '" + typeof part + "' given in " + this.options.encodingName + " at chunk " + chunk[0]); | ||
throw new Error("Incorrect type '" + typeof part + "' given in " + this.encodingName + " at chunk " + chunk[0]); | ||
} | ||
if (curAddr > 0xFF) | ||
throw new Error("Incorrect chunk in " + this.options.encodingName + " at addr " + chunk[0] + ": too long" + curAddr); | ||
throw new Error("Incorrect chunk in " + this.encodingName + " at addr " + chunk[0] + ": too long" + curAddr); | ||
} | ||
@@ -316,6 +278,17 @@ | ||
// == Actual Encoding ========================================================== | ||
// == Encoder ================================================================== | ||
// Constructor for a DBCS encoder; installed as DBCSCodec.prototype.encoder. | ||
// `codec` supplies the tables referenced below (expected to be the DBCSCodec instance). | ||
// `options` is accepted but not read in this constructor. | ||
function DBCSEncoder(options, codec) { | ||
// Encoder state | ||
// (-1 / undefined is the "nothing pending" state that end() checks for before returning early.) | ||
this.leadSurrogate = -1; | ||
this.seqObj = undefined; | ||
// Static data | ||
// (Assigned by reference from the codec, not copied. Note the rename: codec.defCharSB -> defaultCharSingleByte.) | ||
this.encodeTable = codec.encodeTable; | ||
this.encodeTableSeq = codec.encodeTableSeq; | ||
this.defaultCharSingleByte = codec.defCharSB; | ||
this.gb18030 = codec.gb18030; | ||
} | ||
function encoderDBCSWrite(str) { | ||
DBCSEncoder.prototype.write = function(str) { | ||
var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 3)), | ||
@@ -440,3 +413,3 @@ leadSurrogate = this.leadSurrogate, | ||
function encoderDBCSEnd() { | ||
DBCSEncoder.prototype.end = function() { | ||
if (this.leadSurrogate === -1 && this.seqObj === undefined) | ||
@@ -472,7 +445,21 @@ return; // All clean. Most often case. | ||
// Export for testing | ||
DBCSEncoder.prototype.findIdx = findIdx; | ||
// == Actual Decoding ========================================================== | ||
// == Decoder ================================================================== | ||
function decoderDBCSWrite(buf) { | ||
// Constructor for a DBCS decoder; installed as DBCSCodec.prototype.decoder. | ||
// `codec` supplies the tables referenced below (expected to be the DBCSCodec instance). | ||
// `options` is accepted but not read in this constructor. | ||
function DBCSDecoder(options, codec) { | ||
// Decoder state | ||
// (Starts at index 0 with an empty carry-over buffer. NOTE(review): nodeIdx presumably | ||
// indexes the current decode trie node - confirm against write().) | ||
this.nodeIdx = 0; | ||
this.prevBuf = new Buffer(0); | ||
// Static data | ||
// (Assigned by reference from the codec, not copied.) | ||
this.decodeTables = codec.decodeTables; | ||
this.decodeTableSeq = codec.decodeTableSeq; | ||
this.defaultCharUnicode = codec.defaultCharUnicode; | ||
this.gb18030 = codec.gb18030; | ||
} | ||
DBCSDecoder.prototype.write = function(buf) { | ||
var newBuf = new Buffer(buf.length*2), | ||
@@ -545,3 +532,3 @@ nodeIdx = this.nodeIdx, | ||
function decoderDBCSEnd() { | ||
DBCSDecoder.prototype.end = function() { | ||
var ret = ''; | ||
@@ -559,3 +546,3 @@ | ||
if (buf.length > 0) | ||
ret += decoderDBCSWrite.call(this, buf); | ||
ret += this.write(buf); | ||
} | ||
@@ -562,0 +549,0 @@ |
@@ -0,1 +1,2 @@ | ||
"use strict" | ||
@@ -162,2 +163,3 @@ // Description of supported double byte encodings and aliases. | ||
table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) }, | ||
encodeSkipVals: [0xa2cc], | ||
}, | ||
@@ -164,0 +166,0 @@ |
@@ -0,1 +1,2 @@ | ||
"use strict" | ||
@@ -2,0 +3,0 @@ // Update this array if you add/rename/remove files in this directory. |
@@ -0,32 +1,37 @@ | ||
"use strict" | ||
// Export Node.js internal encodings. | ||
var utf16lebom = new Buffer([0xFF, 0xFE]); | ||
module.exports = { | ||
// Encodings | ||
utf8: { type: "_internal", enc: "utf8" }, | ||
cesu8: { type: "_internal", enc: "utf8" }, | ||
unicode11utf8: { type: "_internal", enc: "utf8" }, | ||
ucs2: { type: "_internal", enc: "ucs2", bom: utf16lebom }, | ||
utf16le:{ type: "_internal", enc: "ucs2", bom: utf16lebom }, | ||
binary: { type: "_internal", enc: "binary" }, | ||
base64: { type: "_internal", enc: "base64" }, | ||
hex: { type: "_internal", enc: "hex" }, | ||
utf8: { type: "_internal", bomAware: true}, | ||
cesu8: "utf8", | ||
unicode11utf8: "utf8", | ||
// Codec. | ||
_internal: function(options) { | ||
if (!options || !options.enc) | ||
throw new Error("Internal codec is called without encoding type.") | ||
ucs2: { type: "_internal", bomAware: true}, | ||
utf16le: "ucs2", | ||
return { | ||
encoder: options.enc == "base64" ? encoderBase64 : encoderInternal, | ||
decoder: decoderInternal, | ||
binary: { type: "_internal" }, | ||
base64: { type: "_internal" }, | ||
hex: { type: "_internal" }, | ||
enc: options.enc, | ||
bom: options.bom, | ||
}; | ||
}, | ||
// Codec. | ||
_internal: InternalCodec, | ||
}; | ||
//------------------------------------------------------------------------------ | ||
// Codec for the encodings handled by Node.js itself. | ||
// `enc` is taken from codecOptions.encodingName and is later handed to Buffer (InternalEncoder) | ||
// and StringDecoder (InternalDecoder). `bomAware` is copied through from the encoding definition. | ||
function InternalCodec(codecOptions) { | ||
this.enc = codecOptions.encodingName; | ||
this.bomAware = codecOptions.bomAware; | ||
// base64 gets its own encoder class: this instance property shadows the prototype's | ||
// default `encoder` assigned below. | ||
if (this.enc === "base64") | ||
this.encoder = InternalEncoderBase64; | ||
} | ||
// Default encoder/decoder constructors shared by all internal encodings. | ||
InternalCodec.prototype.encoder = InternalEncoder; | ||
InternalCodec.prototype.decoder = InternalDecoder; | ||
//------------------------------------------------------------------------------ | ||
// We use node.js internal decoder. Its signature is the same as ours. | ||
@@ -38,34 +43,33 @@ var StringDecoder = require('string_decoder').StringDecoder; | ||
function decoderInternal() { | ||
return new StringDecoder(this.enc); | ||
function InternalDecoder(options, codec) { | ||
StringDecoder.call(this, codec.enc); | ||
} | ||
InternalDecoder.prototype = StringDecoder.prototype; | ||
//------------------------------------------------------------------------------ | ||
// Encoder is mostly trivial | ||
function encoderInternal() { | ||
return { | ||
write: encodeInternal, | ||
end: function() {}, | ||
enc: this.enc, | ||
} | ||
function InternalEncoder(options, codec) { | ||
this.enc = codec.enc; | ||
} | ||
function encodeInternal(str) { | ||
InternalEncoder.prototype.write = function(str) { | ||
return new Buffer(str, this.enc); | ||
} | ||
InternalEncoder.prototype.end = function() { | ||
} | ||
//------------------------------------------------------------------------------ | ||
// Except base64 encoder, which must keep its state. | ||
function encoderBase64() { | ||
return { | ||
write: encodeBase64Write, | ||
end: encodeBase64End, | ||
prevStr: '', | ||
}; | ||
function InternalEncoderBase64(options, codec) { | ||
this.prevStr = ''; | ||
} | ||
function encodeBase64Write(str) { | ||
InternalEncoderBase64.prototype.write = function(str) { | ||
str = this.prevStr + str; | ||
@@ -79,5 +83,5 @@ var completeQuads = str.length - (str.length % 4); | ||
function encodeBase64End() { | ||
InternalEncoderBase64.prototype.end = function() { | ||
return new Buffer(this.prevStr, "base64"); | ||
} | ||
@@ -0,1 +1,2 @@ | ||
"use strict" | ||
@@ -5,45 +6,39 @@ // Single-byte codec. Needs a 'chars' string parameter that contains 256 or 128 chars that | ||
exports._sbcs = function(options) { | ||
if (!options) | ||
exports._sbcs = SBCSCodec; | ||
function SBCSCodec(codecOptions, iconv) { | ||
if (!codecOptions) | ||
throw new Error("SBCS codec is called without the data.") | ||
// Prepare char buffer for decoding. | ||
if (!options.chars || (options.chars.length !== 128 && options.chars.length !== 256)) | ||
throw new Error("Encoding '"+options.type+"' has incorrect 'chars' (must be of len 128 or 256)"); | ||
if (!codecOptions.chars || (codecOptions.chars.length !== 128 && codecOptions.chars.length !== 256)) | ||
throw new Error("Encoding '"+codecOptions.type+"' has incorrect 'chars' (must be of len 128 or 256)"); | ||
if (options.chars.length === 128) { | ||
if (codecOptions.chars.length === 128) { | ||
var asciiString = ""; | ||
for (var i = 0; i < 128; i++) | ||
asciiString += String.fromCharCode(i); | ||
options.chars = asciiString + options.chars; | ||
codecOptions.chars = asciiString + codecOptions.chars; | ||
} | ||
var decodeBuf = new Buffer(options.chars, 'ucs2'); | ||
this.decodeBuf = new Buffer(codecOptions.chars, 'ucs2'); | ||
// Encoding buffer. | ||
var encodeBuf = new Buffer(65536); | ||
encodeBuf.fill(options.iconv.defaultCharSingleByte.charCodeAt(0)); | ||
encodeBuf.fill(iconv.defaultCharSingleByte.charCodeAt(0)); | ||
for (var i = 0; i < options.chars.length; i++) | ||
encodeBuf[options.chars.charCodeAt(i)] = i; | ||
for (var i = 0; i < codecOptions.chars.length; i++) | ||
encodeBuf[codecOptions.chars.charCodeAt(i)] = i; | ||
return { | ||
encoder: encoderSBCS, | ||
decoder: decoderSBCS, | ||
encodeBuf: encodeBuf, | ||
decodeBuf: decodeBuf, | ||
}; | ||
this.encodeBuf = encodeBuf; | ||
} | ||
function encoderSBCS(options) { | ||
return { | ||
write: encoderSBCSWrite, | ||
end: function() {}, | ||
SBCSCodec.prototype.encoder = SBCSEncoder; | ||
SBCSCodec.prototype.decoder = SBCSDecoder; | ||
encodeBuf: this.encodeBuf, | ||
}; | ||
function SBCSEncoder(options, codec) { | ||
this.encodeBuf = codec.encodeBuf; | ||
} | ||
function encoderSBCSWrite(str) { | ||
SBCSEncoder.prototype.write = function(str) { | ||
var buf = new Buffer(str.length); | ||
@@ -56,13 +51,11 @@ for (var i = 0; i < str.length; i++) | ||
SBCSEncoder.prototype.end = function() { | ||
} | ||
function decoderSBCS(options) { | ||
return { | ||
write: decoderSBCSWrite, | ||
end: function() {}, | ||
decodeBuf: this.decodeBuf, | ||
}; | ||
function SBCSDecoder(options, codec) { | ||
this.decodeBuf = codec.decodeBuf; | ||
} | ||
function decoderSBCSWrite(buf) { | ||
SBCSDecoder.prototype.write = function(buf) { | ||
// Strings are immutable in JS -> we use ucs2 buffer to speed up computations. | ||
@@ -72,3 +65,3 @@ var decodeBuf = this.decodeBuf; | ||
var idx1 = 0, idx2 = 0; | ||
for (var i = 0, _len = buf.length; i < _len; i++) { | ||
for (var i = 0; i < buf.length; i++) { | ||
idx1 = buf[i]*2; idx2 = i*2; | ||
@@ -80,1 +73,4 @@ newBuf[idx2] = decodeBuf[idx1]; | ||
} | ||
SBCSDecoder.prototype.end = function() { | ||
} |
@@ -0,1 +1,2 @@ | ||
"use strict" | ||
@@ -2,0 +3,0 @@ // Generated data for sbcs codec. Don't edit manually. Regenerate using generation/gen-sbcs.js script. |
@@ -0,1 +1,2 @@ | ||
"use strict" | ||
@@ -2,0 +3,0 @@ // Manually added data to be used by sbcs codec in addition to generated one. |
@@ -0,13 +1,12 @@ | ||
"use strict" | ||
// == UTF16-BE codec. ========================================================== | ||
exports.utf16be = function(options) { | ||
return { | ||
encoder: utf16beEncoder, | ||
decoder: utf16beDecoder, | ||
exports.utf16be = Utf16BECodec; | ||
function Utf16BECodec() { | ||
} | ||
bom: new Buffer([0xFE, 0xFF]), | ||
}; | ||
}; | ||
Utf16BECodec.prototype.encoder = Utf16BEEncoder; | ||
Utf16BECodec.prototype.decoder = Utf16BEDecoder; | ||
Utf16BECodec.prototype.bomAware = true; | ||
@@ -17,10 +16,6 @@ | ||
function utf16beEncoder(options) { | ||
return { | ||
write: utf16beEncoderWrite, | ||
end: function() {}, | ||
} | ||
function Utf16BEEncoder() { | ||
} | ||
function utf16beEncoderWrite(str) { | ||
Utf16BEEncoder.prototype.write = function(str) { | ||
var buf = new Buffer(str, 'ucs2'); | ||
@@ -33,15 +28,13 @@ for (var i = 0; i < buf.length; i += 2) { | ||
Utf16BEEncoder.prototype.end = function() { | ||
} | ||
// -- Decoding | ||
function utf16beDecoder(options) { | ||
return { | ||
write: utf16beDecoderWrite, | ||
end: function() {}, | ||
overflowByte: -1, | ||
}; | ||
function Utf16BEDecoder() { | ||
this.overflowByte = -1; | ||
} | ||
function utf16beDecoderWrite(buf) { | ||
Utf16BEDecoder.prototype.write = function(buf) { | ||
if (buf.length == 0) | ||
@@ -69,51 +62,38 @@ return ''; | ||
Utf16BEDecoder.prototype.end = function() { | ||
} | ||
// == UTF-16 codec ============================================================= | ||
// Decoder chooses automatically from UTF-16LE and UTF-16BE using BOM and space-based heuristic. | ||
// Defaults to UTF-16BE, according to RFC 2781, although it is against some industry practices, see | ||
// Defaults to UTF-16LE, as it's prevalent and default in Node. | ||
// http://en.wikipedia.org/wiki/UTF-16 and http://encoding.spec.whatwg.org/#utf-16le | ||
// Decoder default can be changed: iconv.decode(buf, 'utf16', {default: 'utf-16le'}); | ||
// Decoder default can be changed: iconv.decode(buf, 'utf16', {defaultEncoding: 'utf-16be'}); | ||
// Encoder prepends BOM and uses UTF-16BE. | ||
// Endianness can also be changed: iconv.encode(str, 'utf16', {use: 'utf-16le'}); | ||
// Encoder uses UTF-16LE and prepends BOM (which can be overridden with addBOM: false). | ||
exports.utf16 = function(options) { | ||
return { | ||
encoder: utf16Encoder, | ||
decoder: utf16Decoder, | ||
exports.utf16 = Utf16Codec; | ||
// Codec object for the generic 'utf16' encoding. | ||
// It only stores `iconv`, which Utf16Encoder and Utf16Decoder reach through `codec.iconv` | ||
// to obtain the concrete encoder/decoder. `codecOptions` is not read here. | ||
function Utf16Codec(codecOptions, iconv) { | ||
this.iconv = iconv; | ||
} | ||
getCodec: options.iconv.getCodec, | ||
}; | ||
}; | ||
Utf16Codec.prototype.encoder = Utf16Encoder; | ||
Utf16Codec.prototype.decoder = Utf16Decoder; | ||
// -- Encoding | ||
function utf16Encoder(options) { | ||
// -- Encoding (pass-through) | ||
function Utf16Encoder(options, codec) { | ||
options = options || {}; | ||
var codec = this.getCodec(options.use || 'utf-16be'); | ||
if (!codec.bom) | ||
throw new Error("iconv-lite: in UTF-16 encoder, 'use' parameter should be either UTF-16BE or UTF16-LE."); | ||
return { | ||
write: utf16EncoderWrite, | ||
end: utf16EncoderEnd, | ||
bom: codec.bom, | ||
internalEncoder: codec.encoder(options), | ||
}; | ||
if (options.addBOM === undefined) | ||
options.addBOM = true; | ||
this.encoder = codec.iconv.getEncoder('utf-16le', options); | ||
} | ||
function utf16EncoderWrite(str) { | ||
var buf = this.internalEncoder.write(str); | ||
if (this.bom) { | ||
buf = Buffer.concat([this.bom, buf]); | ||
this.bom = null; | ||
} | ||
return buf; | ||
Utf16Encoder.prototype.write = function(str) { | ||
return this.encoder.write(str); | ||
} | ||
function utf16EncoderEnd() { | ||
return this.internalEncoder.end(); | ||
Utf16Encoder.prototype.end = function() { | ||
return this.encoder.end(); | ||
} | ||
@@ -124,83 +104,75 @@ | ||
function utf16Decoder(options) { | ||
return { | ||
write: utf16DecoderWrite, | ||
end: utf16DecoderEnd, | ||
function Utf16Decoder(options, codec) { | ||
this.decoder = null; | ||
this.initialBytes = []; | ||
this.initialBytesLen = 0; | ||
internalDecoder: null, | ||
initialBytes: [], | ||
initialBytesLen: 0, | ||
options: options || {}, | ||
getCodec: this.getCodec, | ||
}; | ||
this.options = options || {}; | ||
this.iconv = codec.iconv; | ||
} | ||
function utf16DecoderWrite(buf) { | ||
if (this.internalDecoder) | ||
return this.internalDecoder.write(buf); | ||
Utf16Decoder.prototype.write = function(buf) { | ||
if (!this.decoder) { | ||
// Codec is not chosen yet. Accumulate initial bytes. | ||
this.initialBytes.push(buf); | ||
this.initialBytesLen += buf.length; | ||
if (this.initialBytesLen < 16) // We need more bytes to use space heuristic (see below) | ||
return ''; | ||
// Codec is not chosen yet. Accumulate initial bytes. | ||
this.initialBytes.push(buf); | ||
this.initialBytesLen += buf.length; | ||
if (this.initialBytesLen < 16) // We need > 2 bytes to use space heuristic (see below) | ||
return ''; | ||
// We have enough bytes -> detect endianness. | ||
var buf = Buffer.concat(this.initialBytes), | ||
encoding = detectEncoding(buf, this.options.defaultEncoding); | ||
this.decoder = this.iconv.getDecoder(encoding, this.options); | ||
this.initialBytes.length = this.initialBytesLen = 0; | ||
} | ||
// We have enough bytes -> decide endianness. | ||
return utf16DecoderDecideEndianness.call(this); | ||
return this.decoder.write(buf); | ||
} | ||
function utf16DecoderEnd() { | ||
if (this.internalDecoder) | ||
return this.internalDecoder.end(); | ||
Utf16Decoder.prototype.end = function() { | ||
if (!this.decoder) { | ||
var buf = Buffer.concat(this.initialBytes), | ||
encoding = detectEncoding(buf, this.options.defaultEncoding); | ||
this.decoder = this.iconv.getDecoder(encoding, this.options); | ||
var res = utf16DecoderDecideEndianness.call(this); | ||
var trail; | ||
var res = this.decoder.write(buf), | ||
trail = this.decoder.end(); | ||
if (this.internalDecoder) | ||
trail = this.internalDecoder.end(); | ||
return (trail && trail.length > 0) ? (res + trail) : res; | ||
return trail ? (res + trail) : res; | ||
} | ||
return this.decoder.end(); | ||
} | ||
function utf16DecoderDecideEndianness() { | ||
var buf = Buffer.concat(this.initialBytes); | ||
this.initialBytes.length = this.initialBytesLen = 0; | ||
function detectEncoding(buf, defaultEncoding) { | ||
var enc = defaultEncoding || 'utf-16le'; | ||
if (buf.length < 2) | ||
return ''; // Not a valid UTF-16 sequence anyway. | ||
if (buf.length >= 2) { | ||
// Check BOM. | ||
if (buf[0] == 0xFE && buf[1] == 0xFF) // UTF-16BE BOM | ||
enc = 'utf-16be'; | ||
else if (buf[0] == 0xFF && buf[1] == 0xFE) // UTF-16LE BOM | ||
enc = 'utf-16le'; | ||
else { | ||
// No BOM found. Try to deduce encoding from initial content. | ||
// Most of the time, the content has spaces (U+0020), but the opposite (U+2000) is very uncommon. | ||
// So, we count spaces as if it was LE or BE, and decide from that. | ||
var spacesLE = 0, spacesBE = 0, // Counts of space chars in both positions | ||
_len = Math.min(buf.length - (buf.length % 2), 64); // Len is always even. | ||
// Default encoding. | ||
var enc = this.options.default || 'utf-16be'; | ||
for (var i = 0; i < _len; i += 2) { | ||
if (buf[i] == 0x00 && buf[i+1] == 0x20) spacesBE++; | ||
if (buf[i] == 0x20 && buf[i+1] == 0x00) spacesLE++; | ||
} | ||
// Check BOM. | ||
if (buf[0] == 0xFE && buf[1] == 0xFF) { // UTF-16BE BOM | ||
enc = 'utf-16be'; buf = buf.slice(2); | ||
} | ||
else if (buf[0] == 0xFF && buf[1] == 0xFE) { // UTF-16LE BOM | ||
enc = 'utf-16le'; buf = buf.slice(2); | ||
} | ||
else { | ||
// No BOM found. Try to deduce encoding from initial content. | ||
// Most of the time, the content has spaces (U+0020), but the opposite (U+2000) is very uncommon. | ||
// So, we count spaces as if it was LE or BE, and decide from that. | ||
var spaces = [0, 0], // Counts of space chars in both positions | ||
_len = Math.min(buf.length - (buf.length % 2), 64); // Len is always even. | ||
for (var i = 0; i < _len; i += 2) { | ||
if (buf[i] == 0x00 && buf[i+1] == 0x20) spaces[0]++; | ||
if (buf[i] == 0x20 && buf[i+1] == 0x00) spaces[1]++; | ||
if (spacesBE > 0 && spacesLE == 0) | ||
enc = 'utf-16be'; | ||
else if (spacesBE == 0 && spacesLE > 0) | ||
enc = 'utf-16le'; | ||
} | ||
if (spaces[0] > 0 && spaces[1] == 0) | ||
enc = 'utf-16be'; | ||
else if (spaces[0] == 0 && spaces[1] > 0) | ||
enc = 'utf-16le'; | ||
} | ||
this.internalDecoder = this.getCodec(enc).decoder(this.options); | ||
return this.internalDecoder.write(buf); | ||
return enc; | ||
} | ||
@@ -0,34 +1,26 @@ | ||
"use strict" | ||
// UTF-7 codec, according to https://tools.ietf.org/html/rfc2152 | ||
// Below is UTF-7-IMAP codec, according to http://tools.ietf.org/html/rfc3501#section-5.1.3 | ||
// See also below a UTF-7-IMAP codec, according to http://tools.ietf.org/html/rfc3501#section-5.1.3 | ||
exports.utf7 = function(options) { | ||
return { | ||
encoder: function utf7Encoder() { | ||
return { | ||
write: utf7EncoderWrite, | ||
end: function() {}, | ||
exports.utf7 = Utf7Codec; | ||
exports.unicode11utf7 = 'utf7'; // Alias UNICODE-1-1-UTF-7 | ||
function Utf7Codec(codecOptions, iconv) { | ||
this.iconv = iconv; | ||
}; | ||
iconv: options.iconv, | ||
}; | ||
}, | ||
decoder: function utf7Decoder() { | ||
return { | ||
write: utf7DecoderWrite, | ||
end: utf7DecoderEnd, | ||
Utf7Codec.prototype.encoder = Utf7Encoder; | ||
Utf7Codec.prototype.decoder = Utf7Decoder; | ||
Utf7Codec.prototype.bomAware = true; | ||
iconv: options.iconv, | ||
inBase64: false, | ||
base64Accum: '', | ||
}; | ||
}, | ||
}; | ||
}; | ||
exports.unicode11utf7 = 'utf7'; // Alias UNICODE-1-1-UTF-7 | ||
// -- Encoding | ||
var nonDirectChars = /[^A-Za-z0-9'\(\),-\.\/:\? \n\r\t]+/g; | ||
function utf7EncoderWrite(str) { | ||
function Utf7Encoder(options, codec) { | ||
this.iconv = codec.iconv; | ||
} | ||
Utf7Encoder.prototype.write = function(str) { | ||
// Naive implementation. | ||
@@ -43,3 +35,14 @@ // Non-direct chars are encoded as "+<base64>-"; single "+" char is encoded as "+-". | ||
Utf7Encoder.prototype.end = function() { | ||
} | ||
// -- Decoding | ||
// Constructor for the UTF-7 decoder. `options` is not read here. | ||
// Initial state: not inside a base64 section, with an empty base64 accumulator; | ||
// end() inspects both fields to see whether input is still pending. | ||
function Utf7Decoder(options, codec) { | ||
this.iconv = codec.iconv; | ||
this.inBase64 = false; | ||
this.base64Accum = ''; | ||
} | ||
var base64Regex = /[A-Za-z0-9\/+]/; | ||
@@ -54,3 +57,3 @@ var base64Chars = []; | ||
function utf7DecoderWrite(buf) { | ||
Utf7Decoder.prototype.write = function(buf) { | ||
var res = "", lastI = 0, | ||
@@ -107,3 +110,3 @@ inBase64 = this.inBase64, | ||
function utf7DecoderEnd() { | ||
Utf7Decoder.prototype.end = function() { | ||
var res = ""; | ||
@@ -131,30 +134,22 @@ if (this.inBase64 && this.base64Accum.length > 0) | ||
exports.utf7imap = function(options) { | ||
return { | ||
encoder: function utf7ImapEncoder() { | ||
return { | ||
write: utf7ImapEncoderWrite, | ||
end: utf7ImapEncoderEnd, | ||
exports.utf7imap = Utf7IMAPCodec; | ||
function Utf7IMAPCodec(codecOptions, iconv) { | ||
this.iconv = iconv; | ||
}; | ||
iconv: options.iconv, | ||
inBase64: false, | ||
base64Accum: new Buffer(6), | ||
base64AccumIdx: 0, | ||
}; | ||
}, | ||
decoder: function utf7ImapDecoder() { | ||
return { | ||
write: utf7ImapDecoderWrite, | ||
end: utf7ImapDecoderEnd, | ||
Utf7IMAPCodec.prototype.encoder = Utf7IMAPEncoder; | ||
Utf7IMAPCodec.prototype.decoder = Utf7IMAPDecoder; | ||
Utf7IMAPCodec.prototype.bomAware = true; | ||
iconv: options.iconv, | ||
inBase64: false, | ||
base64Accum: '', | ||
}; | ||
}, | ||
}; | ||
}; | ||
// -- Encoding | ||
function utf7ImapEncoderWrite(str) { | ||
// Constructor for the UTF-7-IMAP encoder. `options` is not read here. | ||
// Initial state: not inside a base64 section, with a 6-byte accumulator buffer whose | ||
// write index starts at 0. | ||
function Utf7IMAPEncoder(options, codec) { | ||
this.iconv = codec.iconv; | ||
this.inBase64 = false; | ||
this.base64Accum = new Buffer(6); | ||
this.base64AccumIdx = 0; | ||
} | ||
Utf7IMAPEncoder.prototype.write = function(str) { | ||
var inBase64 = this.inBase64, | ||
@@ -208,3 +203,3 @@ base64Accum = this.base64Accum, | ||
function utf7ImapEncoderEnd() { | ||
Utf7IMAPEncoder.prototype.end = function() { | ||
var buf = new Buffer(10), bufIdx = 0; | ||
@@ -225,6 +220,14 @@ if (this.inBase64) { | ||
// -- Decoding | ||
// Constructor for the UTF-7-IMAP decoder. `options` is not read here. | ||
// Initial state: not inside a base64 section, with an empty (string) base64 accumulator; | ||
// end() inspects both fields to see whether input is still pending. | ||
function Utf7IMAPDecoder(options, codec) { | ||
this.iconv = codec.iconv; | ||
this.inBase64 = false; | ||
this.base64Accum = ''; | ||
} | ||
var base64IMAPChars = base64Chars.slice(); | ||
base64IMAPChars[','.charCodeAt(0)] = true; | ||
function utf7ImapDecoderWrite(buf) { | ||
Utf7IMAPDecoder.prototype.write = function(buf) { | ||
var res = "", lastI = 0, | ||
@@ -282,3 +285,3 @@ inBase64 = this.inBase64, | ||
function utf7ImapDecoderEnd() { | ||
Utf7IMAPDecoder.prototype.end = function() { | ||
var res = ""; | ||
@@ -285,0 +288,0 @@ if (this.inBase64 && this.base64Accum.length > 0) |
@@ -0,1 +1,2 @@ | ||
"use strict" | ||
@@ -17,3 +18,3 @@ // == Extend Node primitives to use iconv-lite ================================= | ||
Buffer.isNativeEncoding = function(enc) { | ||
return nodeNativeEncodings[enc && enc.toLowerCase()]; | ||
return enc && nodeNativeEncodings[enc.toLowerCase()]; | ||
} | ||
@@ -27,9 +28,3 @@ | ||
encoding = String(encoding || 'utf8').toLowerCase(); | ||
start = +start || 0; | ||
if (typeof end !== 'number') end = this.length; | ||
// Fastpath empty strings | ||
if (+end == start) | ||
return ''; | ||
// Use native conversion when possible | ||
@@ -173,8 +168,5 @@ if (Buffer.isNativeEncoding(encoding)) | ||
Readable.prototype.setEncoding = function setEncoding(enc, options) { | ||
// Try to use original function when possible. | ||
if (Buffer.isNativeEncoding(enc)) | ||
return original.ReadableSetEncoding.call(this, enc); | ||
// Try to use our own decoder, it has the same interface. | ||
this._readableState.decoder = iconv.getCodec(enc).decoder(options); | ||
// Use our own decoder, it has the same interface. | ||
// We cannot use original function as it doesn't handle BOM-s. | ||
this._readableState.decoder = iconv.getDecoder(enc, options); | ||
this._readableState.encoding = enc; | ||
@@ -181,0 +173,0 @@ } |
@@ -0,3 +1,5 @@ | ||
"use strict" | ||
var iconv = module.exports; | ||
var bomHandling = require('./bom-handling'), | ||
iconv = module.exports; | ||
@@ -16,3 +18,3 @@ // All codecs and aliases are kept here, keyed by encoding name/alias. | ||
var encoder = iconv.getCodec(encoding).encoder(options); | ||
var encoder = iconv.getEncoder(encoding, options); | ||
@@ -35,3 +37,3 @@ var res = encoder.write(str); | ||
var decoder = iconv.getCodec(encoding).decoder(options); | ||
var decoder = iconv.getDecoder(encoding, options); | ||
@@ -41,3 +43,3 @@ var res = decoder.write(buf); | ||
return (trail && trail.length > 0) ? (res + trail) : res; | ||
return trail ? (res + trail) : res; | ||
} | ||
@@ -68,39 +70,35 @@ | ||
// Traverse iconv.encodings to find actual codec. | ||
var codecData, codecOptions; | ||
var codecOptions = {}; | ||
while (true) { | ||
codecData = iconv._codecDataCache[enc]; | ||
if (codecData) | ||
return codecData; | ||
var codec = iconv._codecDataCache[enc]; | ||
if (codec) | ||
return codec; | ||
var codec = iconv.encodings[enc]; | ||
var codecDef = iconv.encodings[enc]; | ||
switch (typeof codec) { | ||
switch (typeof codecDef) { | ||
case "string": // Direct alias to other encoding. | ||
enc = codec; | ||
enc = codecDef; | ||
break; | ||
case "object": // Alias with options. Can be layered. | ||
if (!codecOptions) { | ||
codecOptions = codec; | ||
for (var key in codecDef) | ||
codecOptions[key] = codecDef[key]; | ||
if (!codecOptions.encodingName) | ||
codecOptions.encodingName = enc; | ||
} | ||
else { | ||
for (var key in codec) | ||
codecOptions[key] = codec[key]; | ||
} | ||
enc = codec.type; | ||
enc = codecDef.type; | ||
break; | ||
case "function": // Codec itself. | ||
if (!codecOptions) | ||
codecOptions = { encodingName: enc }; | ||
codecOptions.iconv = iconv; | ||
if (!codecOptions.encodingName) | ||
codecOptions.encodingName = enc; | ||
// The codec function must load all tables and return object with .encoder and .decoder methods. | ||
// It'll be called only once (for each different options object). | ||
codecData = codec.call(iconv.encodings, codecOptions); | ||
codec = new codecDef(codecOptions, iconv); | ||
iconv._codecDataCache[codecOptions.encodingName] = codecData; // Save it to be reused later. | ||
return codecData; | ||
iconv._codecDataCache[codecOptions.encodingName] = codec; // Save it to be reused later. | ||
return codec; | ||
@@ -113,2 +111,23 @@ default: | ||
// Creates an encoder object for `encoding`. | ||
// Looks the codec up via iconv.getCodec() and instantiates its `encoder` constructor | ||
// with (options, codec). The result is wrapped in bomHandling.PrependBOM only when the | ||
// codec is marked `bomAware` AND `options.addBOM` is truthy; otherwise the bare encoder is returned. | ||
iconv.getEncoder = function getEncoder(encoding, options) { | ||
var codec = iconv.getCodec(encoding), | ||
encoder = new codec.encoder(options, codec); | ||
if (codec.bomAware && options && options.addBOM) | ||
encoder = new bomHandling.PrependBOM(encoder, options); | ||
return encoder; | ||
} | ||
// Creates a decoder object for `encoding`. | ||
// Looks the codec up via iconv.getCodec() and instantiates its `decoder` constructor | ||
// with (options, codec). For `bomAware` codecs the result is wrapped in bomHandling.StripBOM | ||
// unless `options.stripBOM` is exactly `false`; missing options therefore mean "strip". | ||
iconv.getDecoder = function getDecoder(encoding, options) { | ||
var codec = iconv.getCodec(encoding), | ||
decoder = new codec.decoder(options, codec); | ||
if (codec.bomAware && !(options && options.stripBOM === false)) | ||
decoder = new bomHandling.StripBOM(decoder, options); | ||
return decoder; | ||
} | ||
// Load extensions in Node. All of them are omitted in Browserify build via 'browser' field in package.json. | ||
@@ -115,0 +134,0 @@ var nodeVer = typeof process !== 'undefined' && process.versions && process.versions.node; |
@@ -0,1 +1,3 @@ | ||
"use strict" | ||
var Transform = require("stream").Transform; | ||
@@ -9,7 +11,7 @@ | ||
iconv.encodeStream = function encodeStream(encoding, options) { | ||
return new IconvLiteEncoderStream(iconv.getCodec(encoding).encoder(options), options); | ||
return new IconvLiteEncoderStream(iconv.getEncoder(encoding, options), options); | ||
} | ||
iconv.decodeStream = function decodeStream(encoding, options) { | ||
return new IconvLiteDecoderStream(iconv.getCodec(encoding).decoder(options), options); | ||
return new IconvLiteDecoderStream(iconv.getDecoder(encoding, options), options); | ||
} | ||
@@ -16,0 +18,0 @@ |
{ | ||
"name": "iconv-lite", | ||
"description": "Convert character encodings in pure javascript.", | ||
"version": "0.4.8", | ||
"version": "0.4.9", | ||
"license": "MIT", | ||
@@ -50,4 +50,4 @@ | ||
"istanbul": "*", | ||
"iconv": "2.1.4" | ||
"iconv": "2.1" | ||
} | ||
} |
@@ -1,5 +0,3 @@ | ||
## Pure JS character encoding conversion | ||
## Pure JS character encoding conversion [![Build Status](https://travis-ci.org/ashtuchkin/iconv-lite.svg?branch=master)](https://travis-ci.org/ashtuchkin/iconv-lite) | ||
<!-- [![Build Status](https://secure.travis-ci.org/ashtuchkin/iconv-lite.png?branch=master)](http://travis-ci.org/ashtuchkin/iconv-lite) --> | ||
* Doesn't need native code compilation. Works on Windows and in sandboxed environments like [Cloud9](http://c9.io). | ||
@@ -118,8 +116,21 @@ * Used in popular projects like [Express.js (body_parser)](https://github.com/expressjs/body-parser), | ||
## BOM handling | ||
## Notes | ||
* Decoding: the BOM is stripped by default, unless overridden by passing `stripBOM: false` in options | ||
(e.g. `iconv.decode(buf, enc, {stripBOM: false})`). | ||
A callback may also be given as the `stripBOM` option; it will be called if a BOM character was actually found. | ||
* Encoding: no BOM is added, unless overridden by the `addBOM: true` option. | ||
## UTF-16 Encodings | ||
This library supports UTF-16LE, UTF-16BE and UTF-16 encodings. The first two are straightforward, but UTF-16 tries to be | ||
smart about endianness in the following ways: | ||
* Decoding: uses the BOM and a 'spaces heuristic' to determine input endianness. The default is UTF-16LE, but it can be | ||
overridden with the `defaultEncoding: 'utf-16be'` option. Strips the BOM unless `stripBOM: false` is passed. | ||
* Encoding: uses UTF-16LE and writes a BOM by default. Use `addBOM: false` to override. | ||
## Other notes | ||
When decoding, be sure to supply a Buffer to decode() method, otherwise [bad things usually happen](https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding). | ||
Untranslatable characters are set to � or ?. No transliteration is currently supported. | ||
Uses BOM to determine endianness, but doesn't remove it. Use ['strip-bom' module](https://github.com/sindresorhus/strip-bom). | ||
Node versions 0.10.31 and 0.11.13 are buggy; don't use them (see #65, #77). | ||
@@ -126,0 +137,0 @@ |
Sorry, the diff of this file is not supported yet
326594
27
3818
156