iconv-lite
Advanced tools
Comparing version 0.4.0-pre3 to 0.4.0
// Double-byte codec. This scheme is widespread and consists of 2 tables: | ||
// 1. Single-byte: mostly just ASCII, but can be more complex. | ||
// 2. Double-byte with leading byte not assigned in single-byte. | ||
// Multibyte codec. In this scheme, a character is represented by 1 or more bytes. | ||
// Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences. | ||
// To save memory and loading time, we read table files only when requested. | ||
// To save memory, we read table files only when requested. | ||
exports._dbcs = function(options) { | ||
return new DBCSCodec(options); | ||
} | ||
exports._dbcs = function(options) { | ||
var UNASSIGNED = -1, | ||
GB18030_CODE = -2, | ||
SEQ_START = -10, | ||
NODE_START = -1000, | ||
UNASSIGNED_NODE = new Array(0x100), | ||
DEF_CHAR = -1; | ||
for (var i = 0; i < 0x100; i++) | ||
UNASSIGNED_NODE[i] = UNASSIGNED; | ||
// Class DBCSCodec reads and initializes mapping tables. | ||
function DBCSCodec(options) { | ||
this.options = options; | ||
if (!options) | ||
throw new Error("DBCS codec is called without the data.") | ||
if (!options.table) | ||
throw new Error("Encoding '" + options.type + "' has no data."); | ||
// Fill out DBCS -> Unicode decoding tables | ||
var decodeLead = []; | ||
for (var i = 0; i < 0x100; i++) | ||
decodeLead[i] = -1; // Unassigned. | ||
throw new Error("Encoding '" + options.encodingName + "' has no data."); | ||
var decodeTable = []; | ||
for (var i = 0; i < 0x8000; i++) | ||
decodeTable[i] = -1; // Unassigned. | ||
// Load tables. | ||
var mappingTable = options.table(); | ||
var decodeTableSeq = [null, null, null]; // Sequences, start with 3. (they are designated by negative indexes and -1 is reserved for undefined, -2: leading byte) | ||
if (!options.table.map) options.table = [options.table]; | ||
for (var i = 0; i < options.table.length; i++) { | ||
var table = require(options.table[i]); | ||
for (var j = 0; j < table.length; j++) { // Chunks. | ||
var chunk = table[j]; | ||
var curAddr = parseInt(chunk[0], 16), writeTable; | ||
if (curAddr < 0x100) { | ||
writeTable = decodeLead; | ||
} | ||
else if (curAddr < 0x10000) { | ||
if (decodeLead[curAddr >> 8] >= 0) | ||
throw new Error("Overwrite lead byte in table " + options.table + ": " + chunk[0]); | ||
decodeLead[curAddr >> 8] = -2; // DBCS lead byte. | ||
writeTable = decodeTable; | ||
curAddr -= 0x8000; | ||
if (curAddr < 0) | ||
throw new Error("DB address < 0x8000 in table " + options.table + ": " + chunk[0]); | ||
} | ||
else | ||
throw new Error("Unsupported address in table " + options.table + ": " + chunk[0]); | ||
// Decode tables: MBCS -> Unicode. | ||
for (var k = 1; k < chunk.length; k++) { | ||
var part = chunk[k]; | ||
if (typeof part === "string") { // String, write as-is. | ||
for (var l = 0; l < part.length;) { | ||
var code = part.charCodeAt(l++); | ||
if (0xD800 <= code && code < 0xDC00) { // Surrogate | ||
var codeTrail = part.charCodeAt(l++); | ||
if (0xDC00 <= codeTrail && codeTrail < 0xE000) | ||
writeTable[curAddr++] = 0x10000 + (code - 0xD800) * 0x400 + (codeTrail - 0xDC00); | ||
else | ||
throw new Error("Incorrect surrogate pair in table " + options.table + ": " + chunk[0]); | ||
} | ||
else if (0x0FF0 < code && code <= 0x0FFF) { // Character sequence (our own encoding) | ||
var len = 0xFFF - code + 2; | ||
var seq = []; | ||
for (var m = 0; m < len; m++) | ||
seq.push(part.charCodeAt(l++)); // Simple variation: don't support surrogates or subsequences in seq. | ||
// decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256. | ||
// Trie root is decodeTables[0]. | ||
// Values: >= 0 -> unicode character code. can be > 0xFFFF | ||
// == UNASSIGNED -> unknown/unassigned sequence. | ||
// == GB18030_CODE -> this is the end of a GB18030 4-byte sequence. | ||
// <= NODE_START -> index of the next node in our trie to process next byte. | ||
// <= SEQ_START -> index of the start of a character code sequence, in decodeTableSeq. | ||
this.decodeTables = []; | ||
this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. | ||
decodeTableSeq.push(seq); | ||
writeTable[curAddr++] = -(decodeTableSeq.length-1); // negative char code -> sequence idx. | ||
} | ||
else | ||
writeTable[curAddr++] = code; // Basic char | ||
} | ||
} | ||
else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character. | ||
var charCode = writeTable[curAddr - 1] + 1; | ||
for (var l = 0; l < part; l++) | ||
writeTable[curAddr++] = charCode++; | ||
} | ||
else | ||
throw new Error("Incorrect value type '" + typeof part + "' in table " + options.table + ": " + chunk[0]); | ||
} | ||
} | ||
} | ||
// Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. | ||
this.decodeTableSeq = []; | ||
// Unicode -> DBCS. Split table in smaller tables by 256 chars each. | ||
var encodeTable = []; | ||
var encodeTableSeq = [null, null, null]; | ||
// for (var i = 0; i < 0x1100; i++) // Handle all 17 Unicode planes. | ||
// encodeTable[i] = null; // Unassigned | ||
// Actual mapping tables consist of chunks. Use them to fill up decode tables. | ||
for (var i = 0; i < mappingTable.length; i++) | ||
this._addDecodeChunk(mappingTable[i]); | ||
var tables = [[decodeTable, 0x8000], [decodeLead, 0]]; | ||
for (var t = 0; t < tables.length; t++) { | ||
var table = tables[t][0], offset = tables[t][1]; | ||
for (var i = 0; i < table.length; i++) { | ||
var uCode = table[i]; | ||
if (uCode >= 0) { | ||
var high = uCode >> 8; // This could be > 0xFF because of astral characters. | ||
var low = uCode & 0xFF; | ||
var subtable = encodeTable[high]; | ||
if (subtable === undefined) { | ||
encodeTable[high] = subtable = []; | ||
for (var j = 0; j < 0x100; j++) | ||
subtable[j] = -1; | ||
} | ||
if (subtable[low] < -2) | ||
encodeTableSeq[-subtable[low]][-1] = i + offset; | ||
else | ||
subtable[low] = i + offset; | ||
} | ||
else if (uCode < -2) { // Sequence. | ||
var seq = decodeTableSeq[-uCode]; | ||
//console.log((i+offset).toString(16), uCode, seq.map(function(uCode) {return uCode.toString(16)})); | ||
uCode = seq[0]; | ||
this.defaultCharUnicode = options.iconv.defaultCharUnicode; | ||
var high = uCode >> 8; | ||
var low = uCode & 0xFF; | ||
var subtable = encodeTable[high]; | ||
if (subtable === undefined) { | ||
encodeTable[high] = subtable = []; | ||
for (var j = 0; j < 0x100; j++) | ||
subtable[j] = -1; | ||
} | ||
// Encode tables: Unicode -> DBCS. | ||
var seqObj; | ||
if (subtable[low] < -1) | ||
seqObj = encodeTableSeq[-subtable[low]]; | ||
else { | ||
seqObj = {}; | ||
if (subtable[low] !== -1) seqObj[-1] = subtable[low]; | ||
encodeTableSeq.push(seqObj); | ||
subtable[low] = -(encodeTableSeq.length - 1); | ||
} | ||
// `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. | ||
// Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null. | ||
// Values: >= 0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.). | ||
// == UNASSIGNED -> no conversion found. Output a default char. | ||
// <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. | ||
this.encodeTable = []; | ||
// `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of | ||
// objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key | ||
// means end of sequence (needed when one sequence is a strict subsequence of another). | ||
// Objects are kept separately from encodeTable to increase performance. | ||
this.encodeTableSeq = []; | ||
for (var j = 1; j < seq.length; j++) { | ||
uCode = seq[j]; | ||
if (j === seq.length-1) { | ||
seqObj[uCode] = i + offset; | ||
} else { | ||
var oldVal = seqObj[uCode]; | ||
if (typeof oldVal === 'object') | ||
seqObj = oldVal; | ||
else { | ||
seqObj = seqObj[uCode] = {} | ||
if (oldVal !== undefined) | ||
seqObj[-1] = oldVal | ||
} | ||
} | ||
} | ||
} | ||
// Some chars can be decoded, but need not be encoded. | ||
var skipEncodeChars = {}; | ||
if (options.encodeSkipVals) | ||
for (var i = 0; i < options.encodeSkipVals.length; i++) { | ||
var range = options.encodeSkipVals[i]; | ||
for (var j = range.from; j <= range.to; j++) | ||
skipEncodeChars[j] = true; | ||
} | ||
} | ||
if (typeof options.gb18030 == 'string') { | ||
options.gb18030 = require(options.gb18030); | ||
for (var i = 0; i < 0x100; i++) | ||
if ((0x81 <= i && i <= 0xFE) != (decodeLead[i] == -2)) | ||
throw new Error("Invalid GB18030 double-byte table; leading byte is not in range 0x81-0xFE: ", i.toString(16)); | ||
} | ||
// Use decode trie to recursively fill out encode tables. | ||
this._fillEncodeTable(0, 0, skipEncodeChars); | ||
var defCharSB = encodeTable[0][options.iconv.defaultCharSingleByte.charCodeAt(0)]; | ||
if (defCharSB === -1) defCharSB = encodeTable[0]['?']; | ||
if (defCharSB === -1) defCharSB = "?".charCodeAt(0); | ||
// Add more encoding pairs when needed. | ||
for (var uChar in options.encodeAdd || {}) | ||
this._setEncodeChar(uChar.charCodeAt(0), options.encodeAdd[uChar]); | ||
return { | ||
encoder: encoderDBCS, | ||
decoder: decoderDBCS, | ||
this.defCharSB = this.encodeTable[0][options.iconv.defaultCharSingleByte.charCodeAt(0)]; | ||
if (this.defCharSB === UNASSIGNED) this.defCharSB = this.encodeTable[0]['?']; | ||
if (this.defCharSB === UNASSIGNED) this.defCharSB = "?".charCodeAt(0); | ||
decodeLead: decodeLead, | ||
decodeTable: decodeTable, | ||
decodeTableSeq: decodeTableSeq, | ||
defaultCharUnicode: options.iconv.defaultCharUnicode, | ||
encodeTable: encodeTable, | ||
encodeTableSeq: encodeTableSeq, | ||
defaultCharSingleByte: defCharSB, | ||
gb18030: options.gb18030, | ||
}; | ||
// Load & create GB18030 tables when needed. | ||
if (typeof options.gb18030 === 'function') { | ||
this.gb18030 = options.gb18030(); // Load GB18030 ranges. | ||
// Add GB18030 decode tables. | ||
var thirdByteNodeIdx = this.decodeTables.length; | ||
var thirdByteNode = this.decodeTables[thirdByteNodeIdx] = UNASSIGNED_NODE.slice(0); | ||
var fourthByteNodeIdx = this.decodeTables.length; | ||
var fourthByteNode = this.decodeTables[fourthByteNodeIdx] = UNASSIGNED_NODE.slice(0); | ||
for (var i = 0x81; i <= 0xFE; i++) { | ||
var secondByteNodeIdx = NODE_START - this.decodeTables[0][i]; | ||
var secondByteNode = this.decodeTables[secondByteNodeIdx]; | ||
for (var j = 0x30; j <= 0x39; j++) | ||
secondByteNode[j] = NODE_START - thirdByteNodeIdx; | ||
} | ||
for (var i = 0x81; i <= 0xFE; i++) | ||
thirdByteNode[i] = NODE_START - fourthByteNodeIdx; | ||
for (var i = 0x30; i <= 0x39; i++) | ||
fourthByteNode[i] = GB18030_CODE | ||
} | ||
} | ||
function encoderDBCS(options) { | ||
// Public interface: create encoder and decoder objects. | ||
// The methods (write, end) are simple functions to not inhibit optimizations. | ||
DBCSCodec.prototype.encoder = function encoderDBCS(options) { | ||
return { | ||
@@ -186,3 +123,3 @@ // Methods | ||
// Decoder state | ||
// Encoder state | ||
leadSurrogate: -1, | ||
@@ -194,3 +131,3 @@ seqObj: undefined, | ||
encodeTableSeq: this.encodeTableSeq, | ||
defaultCharSingleByte: this.defaultCharSingleByte, | ||
defaultCharSingleByte: this.defCharSB, | ||
gb18030: this.gb18030, | ||
@@ -203,4 +140,172 @@ | ||
// Creates a fresh decoder object for this codec.
// The returned object carries the `write`/`end` functions, the per-stream
// state (current trie node and leftover bytes), and references to the codec's
// shared, read-only tables. The `options` argument is not used here.
DBCSCodec.prototype.decoder = function decoderDBCS(options) {
    var decoder = {};

    // Methods (plain functions, invoked with the decoder as `this`).
    decoder.write = decoderDBCSWrite;
    decoder.end = decoderDBCSEnd;

    // Decoder state: start at the trie root with no buffered bytes.
    decoder.nodeIdx = 0;
    decoder.prevBuf = new Buffer(0);

    // Static data shared with the codec.
    decoder.decodeTables = this.decodeTables;
    decoder.decodeTableSeq = this.decodeTableSeq;
    decoder.defaultCharUnicode = this.defaultCharUnicode;
    decoder.gb18030 = this.gb18030;

    return decoder;
}
// Decoder helpers | ||
// Returns the decode-trie node into which the LAST byte of `addr` is written,
// creating intermediate nodes for the leading bytes as needed.
//   addr - multibyte code as a non-negative integer (e.g. 0x8140).
// Returns the 256-entry array for the final byte; for single-byte addresses
// (including 0) this is the trie root, this.decodeTables[0].
// Throws Error if a leading byte is already occupied by something other than
// a link to a child node.
DBCSCodec.prototype._getDecodeTrieNode = function(addr) {
    // Split the address into bytes, least significant first. We iterate over a
    // copy so that `addr` is still intact for the error message below, and use
    // an unsigned shift so that addresses >= 0x80000000 (4-byte codes) don't
    // turn negative and end the loop prematurely.
    var bytes = [];
    for (var rest = addr; rest > 0; rest >>>= 8)
        bytes.push(rest & 0xFF);
    if (bytes.length == 0)
        bytes.push(0);

    var node = this.decodeTables[0];
    for (var i = bytes.length-1; i > 0; i--) { // Traverse nodes deeper into the trie.
        var val = node[bytes[i]];

        if (val === UNASSIGNED) { // Create new node.
            node[bytes[i]] = NODE_START - this.decodeTables.length;
            this.decodeTables.push(node = UNASSIGNED_NODE.slice(0));
        }
        else if (val <= NODE_START) { // Existing node.
            node = this.decodeTables[NODE_START - val];
        }
        else
            throw new Error("Overwrite byte in " + this.options.encodingName + ", addr: " + addr.toString(16));
    }
    return node;
}
// Writes one mapping-table chunk into the decode trie.
//   chunk - array whose first element is the starting multibyte code as a hex
//           string; each following element is either
//             * a string: its characters are written to consecutive slots.
//               Surrogate pairs are combined into one code point; a char in
//               0x0FF1..0x0FFF introduces a sequence of (0xFFF - code + 2)
//               following chars stored in this.decodeTableSeq;
//             * a number N: N more slots are filled with values increasing by
//               one from the previously written slot.
// Throws Error on a broken surrogate pair, an element of another type, or a
// chunk that runs past the end of its 256-entry trie node.
DBCSCodec.prototype._addDecodeChunk = function(chunk) {
    // First element of chunk is the hex mbcs code where we start.
    var curAddr = parseInt(chunk[0], 16);

    // Choose the decoding node where we'll write our chars.
    var writeTable = this._getDecodeTrieNode(curAddr);
    curAddr = curAddr & 0xFF;

    // Write all other elements of the chunk to the table.
    for (var k = 1; k < chunk.length; k++) {
        var part = chunk[k];
        if (typeof part === "string") { // String, write as-is.
            for (var l = 0; l < part.length;) {
                var code = part.charCodeAt(l++);
                if (0xD800 <= code && code < 0xDC00) { // Decode surrogate
                    var codeTrail = part.charCodeAt(l++);
                    if (0xDC00 <= codeTrail && codeTrail < 0xE000)
                        writeTable[curAddr++] = 0x10000 + (code - 0xD800) * 0x400 + (codeTrail - 0xDC00);
                    else
                        throw new Error("Incorrect surrogate pair in "  + this.options.encodingName + " at chunk " + chunk[0]);
                }
                else if (0x0FF0 < code && code <= 0x0FFF) { // Character sequence (our own encoding used)
                    var len = 0xFFF - code + 2;
                    var seq = [];
                    for (var m = 0; m < len; m++)
                        seq.push(part.charCodeAt(l++)); // Simple variation: don't support surrogates or subsequences in seq.

                    writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length;
                    this.decodeTableSeq.push(seq);
                }
                else
                    writeTable[curAddr++] = code; // Basic char
            }
        }
        else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character.
            var charCode = writeTable[curAddr - 1] + 1;
            for (var l = 0; l < part; l++)
                writeTable[curAddr++] = charCode++;
        }
        else
            throw new Error("Incorrect type '" + typeof part + "' given in "  + this.options.encodingName + " at chunk " + chunk[0]);
    }

    // curAddr is now one past the last written slot, so 0x100 means the chunk
    // ended exactly on byte 0xFF, which still fits in the node.
    if (curAddr > 0x100)
        throw new Error("Incorrect chunk in "  + this.options.encodingName + " at addr " + chunk[0] + ": too long " + curAddr);
}
// Encoder helpers | ||
// Returns the 256-entry encode bucket that holds `uCode`, allocating it (as a
// copy of UNASSIGNED_NODE) the first time it is requested.
//   uCode - unicode code point; the bucket index is uCode >> 8, which can
//           exceed 0xFF for astral characters.
DBCSCodec.prototype._getEncodeBucket = function(uCode) {
    var bucketIdx = uCode >> 8;
    var bucket = this.encodeTable[bucketIdx];
    if (bucket === undefined)
        bucket = this.encodeTable[bucketIdx] = UNASSIGNED_NODE.slice(0); // Create bucket on demand.
    return bucket;
}
// Registers `dbcsCode` as the encoding of the single character `uCode`.
// If the character already starts a sequence, the code is stored under
// DEF_CHAR in that sequence's object. If the slot is unassigned it is filled.
// A slot that already holds a code is left untouched.
DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
    var chars = this._getEncodeBucket(uCode);
    var slot = uCode & 0xFF;
    var current = chars[slot];

    if (current <= SEQ_START) {
        // There's already a sequence, set a single-char subsequence of it.
        var seqRoot = this.encodeTableSeq[SEQ_START - current];
        seqRoot[DEF_CHAR] = dbcsCode;
        return;
    }

    if (current == UNASSIGNED)
        chars[slot] = dbcsCode;
}
// Registers `dbcsCode` as the encoding of a sequence of unicode characters.
//   seq      - array of char codes (at least 2 elements are expected).
//   dbcsCode - the multibyte code that the whole sequence encodes to.
// The first character selects (or allocates) a root object in
// this.encodeTableSeq; each following character except the last descends one
// level in a tree of plain objects; the last character is the leaf key.
// A value previously stored where a subtree is now needed is preserved under
// the DEF_CHAR key of the new subtree.
DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {

    // Get the root of character tree according to first character of the sequence.
    var uCode = seq[0];
    var bucket = this._getEncodeBucket(uCode);
    var low = uCode & 0xFF;

    var node;
    if (bucket[low] <= SEQ_START) {
        // There's already a sequence starting with this character - use it.
        node = this.encodeTableSeq[SEQ_START-bucket[low]];
    }
    else {
        // There was no sequence object - allocate a new one.
        node = {};
        if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence.
        bucket[low] = SEQ_START - this.encodeTableSeq.length;
        this.encodeTableSeq.push(node);
    }

    // Traverse the character tree, allocating new nodes as needed.
    for (var j = 1; j < seq.length-1; j++) {
        // Advance to the j-th character; without this every level would be
        // keyed on seq[0] and sequences of 3+ chars could never be matched.
        uCode = seq[j];
        var oldVal = node[uCode];
        if (typeof oldVal === 'object')
            node = oldVal;
        else {
            node = node[uCode] = {};
            if (oldVal !== undefined)
                node[DEF_CHAR] = oldVal;
        }
    }

    // Set the leaf to given dbcsCode.
    uCode = seq[seq.length-1];
    node[uCode] = dbcsCode;
}
// Walks the decode trie starting at node `nodeIdx` and records the reverse
// (unicode -> multibyte) mapping for every entry found.
//   nodeIdx         - index into this.decodeTables of the node to scan.
//   prefix          - multibyte code of the leading bytes, already shifted so
//                     that the current byte can simply be added to it.
//   skipEncodeChars - object whose truthy keys are multibyte codes to leave
//                     out (such entries are skipped entirely, including any
//                     subtree they point to).
DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars) {
    var trieNode = this.decodeTables[nodeIdx];
    for (var b = 0; b < 0x100; b++) {
        var mbCode = prefix + b;
        if (skipEncodeChars[mbCode])
            continue;

        var entry = trieNode[b];
        if (entry >= 0) {
            // Plain character.
            this._setEncodeChar(entry, mbCode);
            continue;
        }
        if (entry <= NODE_START) {
            // Link to a child node: recurse with this code as the new prefix.
            this._fillEncodeTable(NODE_START - entry, mbCode << 8, skipEncodeChars);
            continue;
        }
        if (entry <= SEQ_START) {
            // Character sequence stored in decodeTableSeq.
            this._setEncodeSequence(this.decodeTableSeq[SEQ_START - entry], mbCode);
        }
    }
}
// == Actual Encoding ========================================================== | ||
function encoderDBCSWrite(str) { | ||
var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 2)), | ||
var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 3)), | ||
leadSurrogate = this.leadSurrogate, | ||
@@ -230,3 +335,3 @@ seqObj = this.seqObj, nextChar = -1, | ||
// Double lead surrogate found. | ||
uCode = -1; | ||
uCode = UNASSIGNED; | ||
} | ||
@@ -239,3 +344,3 @@ } else { // We've got trail surrogate. | ||
// Incomplete surrogate pair - only trail surrogate found. | ||
uCode = -1; | ||
uCode = UNASSIGNED; | ||
} | ||
@@ -247,3 +352,3 @@ | ||
// Incomplete surrogate pair - only lead surrogate found. | ||
nextChar = uCode; uCode = -1; // Write an error, then current char. | ||
nextChar = uCode; uCode = UNASSIGNED; // Write an error, then current char. | ||
leadSurrogate = -1; | ||
@@ -253,4 +358,4 @@ } | ||
// 2. Convert uCode character. | ||
var dbcsCode = -1; | ||
if (seqObj !== undefined && uCode != -1) { // We are in the middle of the sequence | ||
var dbcsCode = UNASSIGNED; | ||
if (seqObj !== undefined && uCode != UNASSIGNED) { // We are in the middle of the sequence | ||
var resCode = seqObj[uCode]; | ||
@@ -267,3 +372,3 @@ if (typeof resCode === 'object') { // Sequence continues. | ||
// Try default character for this sequence | ||
resCode = seqObj[-1]; | ||
resCode = seqObj[DEF_CHAR]; | ||
if (resCode !== undefined) { | ||
@@ -287,8 +392,8 @@ dbcsCode = resCode; // Found. Write it. | ||
if (dbcsCode < -2) { // Sequence start | ||
seqObj = this.encodeTableSeq[-dbcsCode]; | ||
if (dbcsCode <= SEQ_START) { // Sequence start | ||
seqObj = this.encodeTableSeq[SEQ_START-dbcsCode]; | ||
continue; | ||
} | ||
if (dbcsCode == -1 && this.gb18030) { | ||
if (dbcsCode == UNASSIGNED && this.gb18030) { | ||
// Use GB18030 algorithm to find character(s) to write. | ||
@@ -308,3 +413,3 @@ var idx = findIdx(this.gb18030.uChars, uCode); | ||
// 3. Write dbcsCode character. | ||
if (dbcsCode === -1) | ||
if (dbcsCode === UNASSIGNED) | ||
dbcsCode = this.defaultCharSingleByte; | ||
@@ -315,6 +420,11 @@ | ||
} | ||
else { | ||
else if (dbcsCode < 0x10000) { | ||
newBuf[j++] = dbcsCode >> 8; // high byte | ||
newBuf[j++] = dbcsCode & 0xFF; // low byte | ||
} | ||
else { | ||
newBuf[j++] = dbcsCode >> 16; | ||
newBuf[j++] = (dbcsCode >> 8) & 0xFF; | ||
newBuf[j++] = dbcsCode & 0xFF; | ||
} | ||
} | ||
@@ -334,3 +444,3 @@ | ||
if (this.seqObj) { // We're in the sequence. | ||
var dbcsCode = this.seqObj[-1]; | ||
var dbcsCode = this.seqObj[DEF_CHAR]; | ||
if (dbcsCode !== undefined) { // Write beginning of the sequence. | ||
@@ -360,92 +470,43 @@ if (dbcsCode < 0x100) { | ||
// == Actual Decoding ========================================================== | ||
function decoderDBCS(options) { | ||
return { | ||
// Methods | ||
write: decoderDBCSWrite, | ||
end: decoderDBCSEnd, | ||
// Decoder state | ||
leadBytes: -1, | ||
// Static data | ||
decodeLead: this.decodeLead, | ||
decodeTable: this.decodeTable, | ||
decodeTableSeq: this.decodeTableSeq, | ||
defaultCharUnicode: this.defaultCharUnicode, | ||
gb18030: this.gb18030, | ||
} | ||
} | ||
function decoderDBCSWrite(buf) { | ||
var newBuf = new Buffer(buf.length*2), | ||
leadBytes = this.leadBytes, uCode; | ||
nodeIdx = this.nodeIdx, | ||
prevBuf = this.prevBuf, prevBufOffset = this.prevBuf.length, | ||
seqStart = -this.prevBuf.length, // idx of the start of current parsed sequence. | ||
uCode; | ||
if (prevBufOffset > 0) // Make prev buf overlap a little to make it easier to slice later. | ||
prevBuf = Buffer.concat([prevBuf, buf.slice(0, 10)]); | ||
for (var i = 0, j = 0; i < buf.length; i++) { | ||
var curByte = buf[i]; | ||
if (leadBytes === -1) { // We have no leading byte in buffer. | ||
uCode = this.decodeLead[curByte]; | ||
if (uCode === -2) { // Check if this is a leading byte of a double-byte char sequence. | ||
leadBytes = curByte; | ||
continue; | ||
} | ||
} else { // curByte is a trailing byte in double-byte char sequence. | ||
var curByte = (i >= 0) ? buf[i] : prevBuf[i + prevBufOffset]; | ||
if (this.gb18030) { | ||
if (leadBytes < 0x100) { // Single byte lead | ||
if (0x30 <= curByte && curByte <= 0x39) { | ||
leadBytes = leadBytes * 0x100 + curByte; // Move on. | ||
continue; | ||
} | ||
else // Usual decode table. | ||
uCode = this.decodeTable[(leadBytes << 8) + curByte - 0x8000]; | ||
} else if (leadBytes < 0x10000) { // Double byte lead | ||
if (0x81 <= curByte && curByte <= 0xFE) { | ||
leadBytes = leadBytes * 0x100 + curByte; // Move on. | ||
continue; | ||
// Lookup in current trie node. | ||
var uCode = this.decodeTables[nodeIdx][curByte]; | ||
} else { // Incorrect byte. | ||
uCode = this.defaultCharUnicode.charCodeAt(0); | ||
newBuf[j++] = uCode & 0xFF; // Emit 'incorrect sequence' char. | ||
newBuf[j++] = uCode >> 8; | ||
newBuf[j++] = leadBytes & 0xFF; // Throw out first char, emit second char (it'll be '0'-'9'). | ||
newBuf[j++] = 0; | ||
leadBytes = -1; i--; // Cur char will be processed once again, without leading. | ||
continue; | ||
} | ||
} else { // Triple byte lead: we're ready. | ||
if (0x30 <= curByte && curByte <= 0x39) { // Complete sequence. Decode it. | ||
var ptr = ((((leadBytes >> 16)-0x81)*10 + ((leadBytes >> 8) & 0xFF)-0x30)*126 + (leadBytes & 0xFF)-0x81)*10 + curByte-0x30; | ||
var idx = findIdx(this.gb18030.gbChars, ptr); | ||
uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; | ||
} else { // Incorrect 4-th byte. | ||
uCode = this.defaultCharUnicode.charCodeAt(0); | ||
newBuf[j++] = uCode & 0xFF; // Emit 'incorrect sequence' char. | ||
newBuf[j++] = uCode >> 8; | ||
newBuf[j++] = (leadBytes >> 8) & 0xFF; // Throw out first char, emit second char (it'll be '0'-'9'). | ||
newBuf[j++] = 0; | ||
leadBytes = leadBytes & 0xFF; // Make third char a leading byte - it was in 0x81-0xFE range. | ||
i--; // Cur char will be processed once again. | ||
continue; | ||
} | ||
} | ||
} else | ||
uCode = this.decodeTable[(leadBytes << 8) + curByte - 0x8000]; | ||
leadBytes = -1; | ||
if (uCode == -1) i--; // Try curByte one more time in the next iteration without the lead byte. | ||
if (uCode >= 0) { | ||
// Normal character, just use it. | ||
} | ||
// Decide what to do with character. | ||
if (uCode === -1) { // Undefined char. | ||
// TODO: Callback. | ||
else if (uCode === UNASSIGNED) { // Unknown char. | ||
// TODO: Callback with seq. | ||
//var curSeq = (seqStart >= 0) ? buf.slice(seqStart, i+1) : prevBuf.slice(seqStart + prevBufOffset, i+1 + prevBufOffset); | ||
i = seqStart; // Try to parse again, after skipping first byte of the sequence ('i' will be incremented by 'for' cycle). | ||
uCode = this.defaultCharUnicode.charCodeAt(0); | ||
} | ||
else if (uCode < 0) { // Sequence | ||
var seq = this.decodeTableSeq[-uCode]; | ||
if (!seq) throw new Error("Incorrect sequence table"); | ||
for (var k = 0; k < seq.length; k++) { | ||
else if (uCode === GB18030_CODE) { | ||
var curSeq = (seqStart >= 0) ? buf.slice(seqStart, i+1) : prevBuf.slice(seqStart + prevBufOffset, i+1 + prevBufOffset); | ||
var ptr = (curSeq[0]-0x81)*12600 + (curSeq[1]-0x30)*1260 + (curSeq[2]-0x81)*10 + (curSeq[3]-0x30); | ||
var idx = findIdx(this.gb18030.gbChars, ptr); | ||
uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; | ||
} | ||
else if (uCode <= NODE_START) { // Go to next trie node. | ||
nodeIdx = NODE_START - uCode; | ||
continue; | ||
} | ||
else if (uCode <= SEQ_START) { // Output a sequence of chars. | ||
var seq = this.decodeTableSeq[SEQ_START - uCode]; | ||
for (var k = 0; k < seq.length - 1; k++) { | ||
uCode = seq[k]; | ||
@@ -455,5 +516,9 @@ newBuf[j++] = uCode & 0xFF; | ||
} | ||
continue; | ||
uCode = seq[seq.length-1]; | ||
} | ||
else if (uCode > 0xFFFF) { // Surrogates | ||
else | ||
throw new Error("Unknown table value when decoding: " + val); | ||
// Write the character to buffer, handling higher planes using surrogate pair. | ||
if (uCode > 0xFFFF) { | ||
uCode -= 0x10000; | ||
@@ -466,9 +531,11 @@ var uCodeLead = 0xD800 + Math.floor(uCode / 0x400); | ||
} | ||
// Write the character to buffer. | ||
newBuf[j++] = uCode & 0xFF; | ||
newBuf[j++] = uCode >> 8; | ||
// Reset trie node. | ||
nodeIdx = 0; seqStart = i+1; | ||
} | ||
this.leadBytes = leadBytes; | ||
this.nodeIdx = nodeIdx; | ||
this.prevBuf = (seqStart >= 0) ? buf.slice(seqStart) : prevBuf.slice(seqStart + prevBufOffset); | ||
return newBuf.slice(0, j).toString('ucs2'); | ||
@@ -478,17 +545,18 @@ } | ||
function decoderDBCSEnd() { | ||
if (this.leadBytes === -1) | ||
return; | ||
var ret = ''; | ||
var ret = this.defaultCharUnicode; | ||
// Try to parse all remaining chars. | ||
while (this.prevBuf.length > 0) { | ||
// Skip 1 character in the buffer. | ||
ret += this.defaultCharUnicode; | ||
var buf = this.prevBuf.slice(1); | ||
if (this.gb18030 && this.leadBytes >= 0x100) { | ||
if (this.leadBytes < 0x10000) | ||
// Double byte lead: throw out first char, emit second char (it'll be '0'-'9'). | ||
ret += String.fromCharCode(this.leadBytes & 0xFF); | ||
else | ||
// Triple byte lead: throw out first char, emit second char (it'll be '0'-'9'), emit default for third char (its 0x81-0xFE). | ||
ret += String.fromCharCode((this.leadBytes >> 8) & 0xFF) + this.defaultCharUnicode; | ||
// Parse remaining as usual. | ||
this.prevBuf = new Buffer(0); | ||
this.nodeIdx = 0; | ||
if (buf.length > 0) | ||
ret += decoderDBCSWrite.call(this, buf); | ||
} | ||
this.leadBytes = -1; | ||
this.nodeIdx = 0; | ||
return ret; | ||
@@ -512,1 +580,2 @@ } | ||
} | ||
// Description of supported dbcs encodings and aliases. Tables are not require()-d | ||
// until they are needed. | ||
// Description of supported double byte encodings and aliases. | ||
// Tables are not require()-d until they are needed to speed up library load. | ||
// require()-s are direct to support Browserify. | ||
@@ -36,2 +37,4 @@ module.exports = { | ||
// After JIS X 0213 appeared, Shift_JIS-2004, EUC-JISX0213 and ISO2022-JP-2004 followed, with just changing the planes. | ||
// | ||
// Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html | ||
@@ -41,3 +44,5 @@ | ||
type: '_dbcs', | ||
table: './tables/shiftjis.json', | ||
table: function() { return require('./tables/shiftjis.json') }, | ||
encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E}, | ||
encodeSkipVals: [{from: 0xED40, to: 0xF940}], | ||
}, | ||
@@ -49,9 +54,10 @@ 'csshiftjis': 'shiftjis', | ||
'x-sjis': 'shiftjis', | ||
'windows932': 'shiftjis', | ||
'932': 'shiftjis', | ||
'cp932': 'shiftjis', | ||
// CP932 is an extension of Shift_JIS. | ||
'windows932': 'cp932', | ||
'932': 'cp932', | ||
'cp932': { | ||
'eucjp': { | ||
type: '_dbcs', | ||
table: ['./tables/shiftjis.json', './tables/cp932-added.json'], | ||
table: function() { return require('./tables/eucjp.json') }, | ||
encodeAdd: {'\u00a5': 0x5C, '\u203E': 0x7E}, | ||
}, | ||
@@ -72,2 +78,3 @@ | ||
'csiso58gb231280': 'cp936', | ||
'euccn': 'cp936', | ||
'isoir58': 'gbk', | ||
@@ -81,3 +88,3 @@ | ||
type: '_dbcs', | ||
table: './tables/cp936.json', | ||
table: function() { return require('./tables/cp936.json') }, | ||
}, | ||
@@ -88,3 +95,3 @@ | ||
type: '_dbcs', | ||
table: ['./tables/cp936.json', './tables/gbk-added.json'], | ||
table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, | ||
}, | ||
@@ -96,4 +103,4 @@ 'xgbk': 'gbk', | ||
type: '_dbcs', | ||
table: ['./tables/cp936.json', './tables/gbk-added.json'], | ||
gb18030: './tables/gb18030-ranges.json', | ||
table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, | ||
gb18030: function() { return require('./tables/gb18030-ranges.json') }, | ||
}, | ||
@@ -114,3 +121,3 @@ | ||
type: '_dbcs', | ||
table: './tables/cp949.json', | ||
table: function() { return require('./tables/cp949.json') }, | ||
}, | ||
@@ -155,3 +162,3 @@ | ||
type: '_dbcs', | ||
table: './tables/cp950.json', | ||
table: function() { return require('./tables/cp950.json') }, | ||
}, | ||
@@ -163,3 +170,3 @@ | ||
type: '_dbcs', | ||
table: ['./tables/cp950.json', './tables/big5-added.json'], | ||
table: function() { return require('./tables/cp950.json').concat(require('./tables/big5-added.json')) }, | ||
}, | ||
@@ -166,0 +173,0 @@ |
// Update this array if you add/rename/remove files in this directory. | ||
// We support Browserify by skipping automatic module discovery and requiring modules directly. | ||
var modules = [ | ||
"internal", | ||
"sbcs-codec", | ||
"sbcs-data", | ||
"sbcs-data-generated", | ||
"dbcs-codec", | ||
"dbcs-data", | ||
require("./internal"), | ||
require("./sbcs-codec"), | ||
require("./sbcs-data"), | ||
require("./sbcs-data-generated"), | ||
require("./dbcs-codec"), | ||
require("./dbcs-data"), | ||
]; | ||
// Load all encoding definition files. Support Browserify by skipping fs module. | ||
modules.forEach(function(moduleName) { | ||
var module = require("./"+moduleName); | ||
// Put all encoding/alias/codec definitions to single object and export it. | ||
for (var i = 0; i < modules.length; i++) { | ||
var module = modules[i]; | ||
for (var enc in module) | ||
exports[enc] = module[enc]; | ||
}); | ||
} |
@@ -13,2 +13,3 @@ | ||
base64: { type: "_internal", enc: "base64" }, | ||
hex: { type: "_internal", enc: "hex" }, | ||
@@ -15,0 +16,0 @@ // Codec. |
@@ -11,3 +11,3 @@ [ | ||
["a340","",62], | ||
["a380","",32], | ||
["a380","",31," "], | ||
["a440","",62], | ||
@@ -14,0 +14,0 @@ ["a480","",32], |
[ | ||
["0","\u0000",91,"¥]",32,"‾"], | ||
["0","\u0000",128], | ||
["a1","。",62], | ||
["8140"," 、。,.・:;?!゛゜´`¨^ ̄_ヽヾゝゞ〃仝々〆〇ー―‐/\\〜‖|…‥‘’“”()〔〕[]{}〈",9,"+−±×"], | ||
["8180","÷=≠<>≦≧∞∴♂♀°′″℃¥$¢£%#&*@§☆★○●◎◇◆□■△▲▽▼※〒→←↑↓〓"], | ||
["8140"," 、。,.・:;?!゛゜´`¨^ ̄_ヽヾゝゞ〃仝々〆〇ー―‐/\~∥|…‥‘’“”()〔〕[]{}〈",9,"+-±×"], | ||
["8180","÷=≠<>≦≧∞∴♂♀°′″℃¥$¢£%#&*@§☆★○●◎◇◆□■△▲▽▼※〒→←↑↓〓"], | ||
["81b8","∈∋⊆⊇⊂⊃∪∩"], | ||
["81c8","∧∨¬⇒⇔∀∃"], | ||
["81c8","∧∨¬⇒⇔∀∃"], | ||
["81da","∠⊥⌒∂∇≡≒≪≫√∽∝∵∫∬"], | ||
@@ -23,2 +23,6 @@ ["81f0","ʼn♯♭♪†‡¶"], | ||
["849f","─│┌┐┘└├┬┤┴┼━┃┏┓┛┗┣┳┫┻╋┠┯┨┷┿┝┰┥┸╂"], | ||
["8740","①",19,"Ⅰ",9], | ||
["875f","㍉㌔㌢㍍㌘㌧㌃㌶㍑㍗㌍㌦㌣㌫㍊㌻㎜㎝㎞㎎㎏㏄㎡"], | ||
["877e","㍻"], | ||
["8780","〝〟№㏍℡㊤",4,"㈱㈲㈹㍾㍽㍼≒≡∫∮∑√⊥∠∟⊿∵∩∪"], | ||
["889f","亜唖娃阿哀愛挨姶逢葵茜穐悪握渥旭葦芦鯵梓圧斡扱宛姐虻飴絢綾鮎或粟袷安庵按暗案闇鞍杏以伊位依偉囲夷委威尉惟意慰易椅為畏異移維緯胃萎衣謂違遺医井亥域育郁磯一壱溢逸稲茨芋鰯允印咽員因姻引飲淫胤蔭"], | ||
@@ -92,3 +96,32 @@ ["8940","院陰隠韻吋右宇烏羽迂雨卯鵜窺丑碓臼渦嘘唄欝蔚鰻姥厩浦瓜閏噂云運雲荏餌叡営嬰影映曳栄永泳洩瑛盈穎頴英衛詠鋭液疫益駅悦謁越閲榎厭円"], | ||
["ea40","鵝鵞鵤鵑鵐鵙鵲鶉鶇鶫鵯鵺鶚鶤鶩鶲鷄鷁鶻鶸鶺鷆鷏鷂鷙鷓鷸鷦鷭鷯鷽鸚鸛鸞鹵鹹鹽麁麈麋麌麒麕麑麝麥麩麸麪麭靡黌黎黏黐黔黜點黝黠黥黨黯"], | ||
["ea80","黴黶黷黹黻黼黽鼇鼈皷鼕鼡鼬鼾齊齒齔齣齟齠齡齦齧齬齪齷齲齶龕龜龠堯槇遙瑤凜熙"] | ||
["ea80","黴黶黷黹黻黼黽鼇鼈皷鼕鼡鼬鼾齊齒齔齣齟齠齡齦齧齬齪齷齲齶龕龜龠堯槇遙瑤凜熙"], | ||
["ed40","纊褜鍈銈蓜俉炻昱棈鋹曻彅丨仡仼伀伃伹佖侒侊侚侔俍偀倢俿倞偆偰偂傔僴僘兊兤冝冾凬刕劜劦勀勛匀匇匤卲厓厲叝﨎咜咊咩哿喆坙坥垬埈埇﨏"], | ||
["ed80","塚增墲夋奓奛奝奣妤妺孖寀甯寘寬尞岦岺峵崧嵓﨑嵂嵭嶸嶹巐弡弴彧德忞恝悅悊惞惕愠惲愑愷愰憘戓抦揵摠撝擎敎昀昕昻昉昮昞昤晥晗晙晴晳暙暠暲暿曺朎朗杦枻桒柀栁桄棏﨓楨﨔榘槢樰橫橆橳橾櫢櫤毖氿汜沆汯泚洄涇浯涖涬淏淸淲淼渹湜渧渼溿澈澵濵瀅瀇瀨炅炫焏焄煜煆煇凞燁燾犱"], | ||
["ee40","犾猤猪獷玽珉珖珣珒琇珵琦琪琩琮瑢璉璟甁畯皂皜皞皛皦益睆劯砡硎硤硺礰礼神祥禔福禛竑竧靖竫箞精絈絜綷綠緖繒罇羡羽茁荢荿菇菶葈蒴蕓蕙"], | ||
["ee80","蕫﨟薰蘒﨡蠇裵訒訷詹誧誾諟諸諶譓譿賰賴贒赶﨣軏﨤逸遧郞都鄕鄧釚釗釞釭釮釤釥鈆鈐鈊鈺鉀鈼鉎鉙鉑鈹鉧銧鉷鉸鋧鋗鋙鋐﨧鋕鋠鋓錥錡鋻﨨錞鋿錝錂鍰鍗鎤鏆鏞鏸鐱鑅鑈閒隆﨩隝隯霳霻靃靍靏靑靕顗顥飯飼餧館馞驎髙髜魵魲鮏鮱鮻鰀鵰鵫鶴鸙黑"], | ||
["eeef","ⅰ",9,"¬¦'""], | ||
["f040","",62], | ||
["f080","",124], | ||
["f140","",62], | ||
["f180","",124], | ||
["f240","",62], | ||
["f280","",124], | ||
["f340","",62], | ||
["f380","",124], | ||
["f440","",62], | ||
["f480","",124], | ||
["f540","",62], | ||
["f580","",124], | ||
["f640","",62], | ||
["f680","",124], | ||
["f740","",62], | ||
["f780","",124], | ||
["f840","",62], | ||
["f880","",124], | ||
["f940",""], | ||
["fa40","ⅰ",9,"Ⅰ",9,"¬¦'"㈱№℡∵纊褜鍈銈蓜俉炻昱棈鋹曻彅丨仡仼伀伃伹佖侒侊侚侔俍偀倢俿倞偆偰偂傔僴僘兊"], | ||
["fa80","兤冝冾凬刕劜劦勀勛匀匇匤卲厓厲叝﨎咜咊咩哿喆坙坥垬埈埇﨏塚增墲夋奓奛奝奣妤妺孖寀甯寘寬尞岦岺峵崧嵓﨑嵂嵭嶸嶹巐弡弴彧德忞恝悅悊惞惕愠惲愑愷愰憘戓抦揵摠撝擎敎昀昕昻昉昮昞昤晥晗晙晴晳暙暠暲暿曺朎朗杦枻桒柀栁桄棏﨓楨﨔榘槢樰橫橆橳橾櫢櫤毖氿汜沆汯泚洄涇浯"], | ||
["fb40","涖涬淏淸淲淼渹湜渧渼溿澈澵濵瀅瀇瀨炅炫焏焄煜煆煇凞燁燾犱犾猤猪獷玽珉珖珣珒琇珵琦琪琩琮瑢璉璟甁畯皂皜皞皛皦益睆劯砡硎硤硺礰礼神"], | ||
["fb80","祥禔福禛竑竧靖竫箞精絈絜綷綠緖繒罇羡羽茁荢荿菇菶葈蒴蕓蕙蕫﨟薰蘒﨡蠇裵訒訷詹誧誾諟諸諶譓譿賰賴贒赶﨣軏﨤逸遧郞都鄕鄧釚釗釞釭釮釤釥鈆鈐鈊鈺鉀鈼鉎鉙鉑鈹鉧銧鉷鉸鋧鋗鋙鋐﨧鋕鋠鋓錥錡鋻﨨錞鋿錝錂鍰鍗鎤鏆鏞鏸鐱鑅鑈閒隆﨩隝隯霳霻靃靍靏靑靕顗顥飯飼餧館馞驎髙"], | ||
["fc40","髜魵魲鮏鮱鮻鰀鵰鵫鶴鸙黑"] | ||
] |
{ | ||
"name": "iconv-lite", | ||
"description": "Convert character encodings in pure javascript.", | ||
"version": "0.4.0-pre3", | ||
"version": "0.4.0", | ||
"license": "MIT", | ||
@@ -22,3 +22,3 @@ | ||
"main": "index.js", | ||
"main": "./lib/index.js", | ||
"homepage": "https://github.com/ashtuchkin/iconv-lite", | ||
@@ -36,2 +36,6 @@ "bugs": "https://github.com/ashtuchkin/iconv-lite/issues", | ||
}, | ||
"browser": { | ||
"./extend-node": false, | ||
"./streams": false | ||
}, | ||
"devDependencies": { | ||
@@ -43,4 +47,4 @@ "mocha": "*", | ||
"async": "*", | ||
"iconv": "2.x" | ||
"iconv": "~2.1.4" | ||
} | ||
} |
141
README.md
@@ -8,3 +8,6 @@ ## Pure JS character encoding conversion | ||
* Faster than [node-iconv](https://github.com/bnoordhuis/node-iconv) (see below for performance comparison). | ||
* Intuitive encode/decode API + streaming API in Node v0.10+ | ||
* Intuitive encode/decode API | ||
* Streaming support for Node v0.10+ | ||
* Can extend Node.js primitives (buffers, streams) to support all iconv-lite encodings. | ||
* In-browser usage via [Browserify](https://github.com/substack/node-browserify) (~180k gzip compressed with Buffer shim included). | ||
* License: MIT. | ||
@@ -15,77 +18,87 @@ | ||
## Usage | ||
### Basic API | ||
```javascript | ||
var iconv = require('iconv-lite'); | ||
var iconv = require('iconv-lite'); | ||
// Convert from an encoded buffer to js string. | ||
str = iconv.decode(buf, 'win1251'); | ||
// Convert from js string to an encoded buffer. | ||
buf = iconv.encode("Sample input string", 'win1251'); | ||
// Convert from an encoded buffer to js string. | ||
str = iconv.decode(new Buffer([0x68, 0x65, 0x6c, 0x6c, 0x6f]), 'win1251'); | ||
// Check if encoding is supported | ||
iconv.encodingExists("us-ascii") | ||
// Convert from js string to an encoded buffer. | ||
buf = iconv.encode("Sample input string", 'win1251'); | ||
// Check if encoding is supported | ||
iconv.encodingExists("us-ascii") | ||
``` | ||
// Decode stream example (from binary stream to js strings) | ||
// Only available in Node v0.10+ | ||
http.createServer(function(req, res) { | ||
var converterStream = iconv.decodeStream('win1251'); | ||
req.pipe(converterStream); | ||
### Streaming API (Node v0.10+) | ||
```javascript | ||
converterStream.on('data', function(str) { | ||
console.log(str); // Do something with decoded strings, chunk-by-chunk. | ||
}); | ||
// Decode stream (from binary stream to js strings) | ||
http.createServer(function(req, res) { | ||
var converterStream = iconv.decodeStream('win1251'); | ||
req.pipe(converterStream); | ||
converterStream.on('data', function(str) { | ||
console.log(str); // Do something with decoded strings, chunk-by-chunk. | ||
}); | ||
}); | ||
// Convert encoding streaming example | ||
fs.createReadStream('file-in-win1251.txt') | ||
.pipe(iconv.decodeStream('win1251')) | ||
.pipe(iconv.encodeStream('ucs2')) | ||
.pipe(fs.createWriteStream('file-in-ucs2.txt')); | ||
// Convert encoding streaming example | ||
fs.createReadStream('file-in-win1251.txt') | ||
.pipe(iconv.decodeStream('win1251')) | ||
.pipe(iconv.encodeStream('ucs2')) | ||
.pipe(fs.createWriteStream('file-in-ucs2.txt')); | ||
// Sugar: all encode/decode streams have .collect(cb) method to accumulate data. | ||
http.createServer(function(req, res) { | ||
req.pipe(iconv.decodeStream('win1251')).collect(function(err, body) { | ||
assert(typeof body == 'string'); | ||
console.log(body); // full request body string | ||
}); | ||
// Sugar: all encode/decode streams have .collect(cb) method to accumulate data. | ||
http.createServer(function(req, res) { | ||
req.pipe(iconv.decodeStream('win1251')).collect(function(err, body) { | ||
assert(typeof body == 'string'); | ||
console.log(body); // full request body string | ||
}); | ||
}); | ||
``` | ||
// For the brave/lazy: make Node basic primitives understand all iconv encodings. | ||
require('iconv-lite').extendNodeEncodings(); | ||
### Extend Node.js own encodings | ||
```javascript | ||
// After this call all Node basic primitives will understand iconv-lite encodings. | ||
iconv.extendNodeEncodings(); | ||
buf = new Buffer(str, 'win1251'); | ||
buf.write(str, 'gbk'); | ||
str = buf.toString('latin1'); | ||
assert(Buffer.isEncoding('iso-8859-15')); | ||
Buffer.byteLength(str, 'us-ascii'); | ||
// Examples: | ||
buf = new Buffer(str, 'win1251'); | ||
buf.write(str, 'gbk'); | ||
str = buf.toString('latin1'); | ||
assert(Buffer.isEncoding('iso-8859-15')); | ||
Buffer.byteLength(str, 'us-ascii'); | ||
http.createServer(function(req, res) { | ||
req.setEncoding('big5'); | ||
req.collect(function(err, body) { | ||
console.log(body); | ||
}); | ||
http.createServer(function(req, res) { | ||
req.setEncoding('big5'); | ||
req.collect(function(err, body) { | ||
console.log(body); | ||
}); | ||
}); | ||
fs.createReadStream("file.txt", "shift_jis"); | ||
fs.createReadStream("file.txt", "shift_jis"); | ||
// External modules are also supported (if they use Node primitives). | ||
request = require('request'); | ||
request({ | ||
url: "http://github.com/", | ||
encoding: "cp932" | ||
}); | ||
// External modules are also supported (if they use Node primitives, which they probably do). | ||
request = require('request'); | ||
request({ | ||
url: "http://github.com/", | ||
encoding: "cp932" | ||
}); | ||
// To remove extensions | ||
iconv.undoExtendNodeEncodings(); | ||
``` | ||
## Supported encodings | ||
* All node.js native encodings: 'utf8', 'ucs2', 'ascii', 'binary', 'base64' | ||
* All node.js native encodings: utf8, ucs2, ascii, binary, base64, hex. | ||
* All widespread singlebyte encodings: Windows 125x family, ISO-8859 family, | ||
IBM/DOS codepages, Macintosh family, KOI8 family, all others supported by iconv library. | ||
Aliases like 'latin1' and 'us-ascii' are also supported. | ||
* Multibyte encodings: CP932, CP936, CP949, CP950, GBK, GB2313, Big5, Shift_JIS. | ||
* All widespread multibyte encodings: CP932, CP936, CP949, CP950, GB2312, GBK, GB18030, Big5, Shift_JIS, EUC-JP. | ||
Most singlebyte encodings are generated automatically from [node-iconv](https://github.com/bnoordhuis/node-iconv). Thank you Ben Noordhuis and iconv authors! | ||
Most singlebyte encodings are generated automatically from [node-iconv](https://github.com/bnoordhuis/node-iconv). Thank you Ben Noordhuis and libiconv authors! | ||
Not supported yet: GB18030, EUC family, ISO2022 family. | ||
Multibyte encodings are generated from [Unicode.org mappings](http://www.unicode.org/Public/MAPPINGS/) and [WHATWG Encoding Standard mappings](http://encoding.spec.whatwg.org/). Thank you, respective authors! | ||
@@ -98,6 +111,6 @@ | ||
operation iconv@2.0.7 iconv-lite@0.4.0 | ||
operation iconv@2.1.4 iconv-lite@0.4.0 | ||
---------------------------------------------------------- | ||
encode('win1251') ~115 Mb/s ~340 Mb/s | ||
decode('win1251') ~110 Mb/s ~180 Mb/s | ||
encode('win1251') ~130 Mb/s ~380 Mb/s | ||
decode('win1251') ~127 Mb/s ~210 Mb/s | ||
@@ -107,3 +120,3 @@ | ||
When decoding, a 'binary'-encoded string can be used as a source buffer. | ||
When decoding, be sure to supply a Buffer to the decode() method; otherwise [bad things usually happen](https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding). | ||
Untranslatable characters are set to � or ?. No transliteration is currently supported. | ||
@@ -113,9 +126,11 @@ | ||
git clone git@github.com:ashtuchkin/iconv-lite.git | ||
cd iconv-lite | ||
npm install | ||
npm test | ||
```bash | ||
$ git clone git@github.com:ashtuchkin/iconv-lite.git | ||
$ cd iconv-lite | ||
$ npm install | ||
$ npm test | ||
# To view performance: | ||
node test/performance.js | ||
$ # To view performance: | ||
$ node test/performance.js | ||
``` | ||
@@ -122,0 +137,0 @@ ## Adoption |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Dynamic require
Supply chain risk: Dynamic require can indicate the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package
308170
25
3396
135
0