istextorbinary
Advanced tools
Comparing version 5.15.0-next.1627595674.345b3aa16bba43ba618239c19c8def5cf66eda10 to 5.15.0-next.1627747584.40036cea671e4dee4fd2c4a154d3f69017ec7422
@@ -91,3 +91,10 @@ /* eslint no-use-before-define:0 */ | ||
// Extract | ||
const chunkEnd = Math.min(buffer.length, chunkBegin + chunkLength); | ||
chunkBegin = getChunkBegin(buffer, chunkBegin); | ||
if (chunkBegin === -1) { | ||
return binaryEncoding; | ||
} | ||
const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength)); | ||
if (chunkEnd > buffer.length) { | ||
return binaryEncoding; | ||
} | ||
const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd); | ||
@@ -108,1 +115,86 @@ // Detect encoding | ||
} | ||
// The functions below are created to handle multibyte utf8 characters. | ||
// To understand how the encoding works, | ||
// check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
/**
 * Move a chunk's starting offset back onto the first byte of a UTF-8
 * character when it would otherwise land on a continuation byte.
 *
 * @param buf bytes being inspected
 * @param chunkBegin proposed starting offset of the chunk
 * @returns `chunkBegin` itself when it is 0 or does not point at a
 *   continuation byte; otherwise the offset of a qualifying lead byte found
 *   one to three bytes earlier, or -1 when there is none
 */
function getChunkBegin(buf, chunkBegin) {
    // Offset 0 cannot sit in the middle of a character.
    if (chunkBegin === 0) {
        return 0;
    }
    // Only a 10xxxxxx byte needs the start to be moved.
    const startsInsideChar = isLaterByteOfUtf8(buf[chunkBegin]);
    if (!startsInsideChar) {
        return chunkBegin;
    }
    // Probe three, then two, then one byte back. A lead byte qualifies only
    // when its character is long enough to reach chunkBegin: three back needs
    // a 4-byte lead, two back a 3- or 4-byte lead, one back any 2-, 3- or
    // 4-byte lead.
    for (let distance = 3; distance >= 1; distance--) {
        const candidate = chunkBegin - distance;
        if (candidate >= 0) {
            const lead = buf[candidate];
            const reachesChunkBegin = isFirstByteOf4ByteChar(lead) ||
                (distance <= 2 && isFirstByteOf3ByteChar(lead)) ||
                (distance === 1 && isFirstByteOf2ByteChar(lead));
            if (reachesChunkBegin) {
                return candidate;
            }
        }
    }
    // No qualifying lead byte within three bytes.
    return -1;
}
/**
 * Extend a chunk's exclusive end offset so that a multibyte UTF-8 character
 * started in the last three bytes of the chunk is included in full.
 *
 * @param buf bytes being inspected
 * @param chunkEnd proposed exclusive end offset of the chunk
 * @returns `chunkEnd` when it equals `buf.length` or no character is cut;
 *   otherwise `chunkEnd` plus the number of bytes still missing. The result
 *   is not clamped, so it can exceed `buf.length`.
 */
function getChunkEnd(buf, chunkEnd) {
    // Nothing lies beyond the end of the buffer.
    if (chunkEnd === buf.length) {
        return chunkEnd;
    }
    // Inspect the bytes three, two and one position before the end.
    for (let back = 3; back >= 1; back--) {
        const pos = chunkEnd - back;
        if (pos >= 0) {
            const lead = buf[pos];
            // Total length announced by a lead byte; 0 for any other byte.
            let width = 0;
            if (isFirstByteOf4ByteChar(lead)) {
                width = 4;
            }
            else if (isFirstByteOf3ByteChar(lead)) {
                width = 3;
            }
            else if (isFirstByteOf2ByteChar(lead)) {
                width = 2;
            }
            // `back` bytes of the character are already inside the chunk.
            const missing = width - back;
            if (missing > 0) {
                return chunkEnd + missing;
            }
        }
    }
    return chunkEnd;
}
/**
 * Test for the bit pattern 11110xxx, the first byte of a 4-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low three equal 11110
 */
function isFirstByteOf4ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 3;
    return prefix === 0b11110;
}
/**
 * Test for the bit pattern 1110xxxx, the first byte of a 3-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low four equal 1110
 */
function isFirstByteOf3ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 4;
    return prefix === 0b1110;
}
/**
 * Test for the bit pattern 110xxxxx, the first byte of a 2-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low five equal 110
 */
function isFirstByteOf2ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 5;
    return prefix === 0b110;
}
/**
 * Test for the bit pattern 10xxxxxx, a non-first (continuation) byte of a
 * multibyte UTF-8 character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low six equal 10
 */
function isLaterByteOfUtf8(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 6;
    return prefix === 0b10;
}
@@ -92,3 +92,10 @@ /* eslint no-use-before-define:0 */ | ||
// Extract | ||
const chunkEnd = Math.min(buffer.length, chunkBegin + chunkLength); | ||
chunkBegin = getChunkBegin(buffer, chunkBegin); | ||
if (chunkBegin === -1) { | ||
return binaryEncoding; | ||
} | ||
const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength)); | ||
if (chunkEnd > buffer.length) { | ||
return binaryEncoding; | ||
} | ||
const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd); | ||
@@ -109,1 +116,86 @@ // Detect encoding | ||
} | ||
// The functions below are created to handle multibyte utf8 characters. | ||
// To understand how the encoding works, | ||
// check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
/**
 * Move a chunk's starting offset back onto the first byte of a UTF-8
 * character when it would otherwise land on a continuation byte.
 *
 * @param buf bytes being inspected
 * @param chunkBegin proposed starting offset of the chunk
 * @returns `chunkBegin` itself when it is 0 or does not point at a
 *   continuation byte; otherwise the offset of a qualifying lead byte found
 *   one to three bytes earlier, or -1 when there is none
 */
function getChunkBegin(buf, chunkBegin) {
    // Offset 0 cannot sit in the middle of a character.
    if (chunkBegin === 0) {
        return 0;
    }
    // Only a 10xxxxxx byte needs the start to be moved.
    const startsInsideChar = isLaterByteOfUtf8(buf[chunkBegin]);
    if (!startsInsideChar) {
        return chunkBegin;
    }
    // Probe three, then two, then one byte back. A lead byte qualifies only
    // when its character is long enough to reach chunkBegin: three back needs
    // a 4-byte lead, two back a 3- or 4-byte lead, one back any 2-, 3- or
    // 4-byte lead.
    for (let distance = 3; distance >= 1; distance--) {
        const candidate = chunkBegin - distance;
        if (candidate >= 0) {
            const lead = buf[candidate];
            const reachesChunkBegin = isFirstByteOf4ByteChar(lead) ||
                (distance <= 2 && isFirstByteOf3ByteChar(lead)) ||
                (distance === 1 && isFirstByteOf2ByteChar(lead));
            if (reachesChunkBegin) {
                return candidate;
            }
        }
    }
    // No qualifying lead byte within three bytes.
    return -1;
}
/**
 * Extend a chunk's exclusive end offset so that a multibyte UTF-8 character
 * started in the last three bytes of the chunk is included in full.
 *
 * @param buf bytes being inspected
 * @param chunkEnd proposed exclusive end offset of the chunk
 * @returns `chunkEnd` when it equals `buf.length` or no character is cut;
 *   otherwise `chunkEnd` plus the number of bytes still missing. The result
 *   is not clamped, so it can exceed `buf.length`.
 */
function getChunkEnd(buf, chunkEnd) {
    // Nothing lies beyond the end of the buffer.
    if (chunkEnd === buf.length) {
        return chunkEnd;
    }
    // Inspect the bytes three, two and one position before the end.
    for (let back = 3; back >= 1; back--) {
        const pos = chunkEnd - back;
        if (pos >= 0) {
            const lead = buf[pos];
            // Total length announced by a lead byte; 0 for any other byte.
            let width = 0;
            if (isFirstByteOf4ByteChar(lead)) {
                width = 4;
            }
            else if (isFirstByteOf3ByteChar(lead)) {
                width = 3;
            }
            else if (isFirstByteOf2ByteChar(lead)) {
                width = 2;
            }
            // `back` bytes of the character are already inside the chunk.
            const missing = width - back;
            if (missing > 0) {
                return chunkEnd + missing;
            }
        }
    }
    return chunkEnd;
}
/**
 * Test for the bit pattern 11110xxx, the first byte of a 4-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low three equal 11110
 */
function isFirstByteOf4ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 3;
    return prefix === 0b11110;
}
/**
 * Test for the bit pattern 1110xxxx, the first byte of a 3-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low four equal 1110
 */
function isFirstByteOf3ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 4;
    return prefix === 0b1110;
}
/**
 * Test for the bit pattern 110xxxxx, the first byte of a 2-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low five equal 110
 */
function isFirstByteOf2ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 5;
    return prefix === 0b110;
}
/**
 * Test for the bit pattern 10xxxxxx, a non-first (continuation) byte of a
 * multibyte UTF-8 character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low six equal 10
 */
function isLaterByteOfUtf8(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 6;
    return prefix === 0b10;
}
@@ -119,3 +119,10 @@ "use strict"; | ||
// Extract | ||
const chunkEnd = Math.min(buffer.length, chunkBegin + chunkLength); | ||
chunkBegin = getChunkBegin(buffer, chunkBegin); | ||
if (chunkBegin === -1) { | ||
return binaryEncoding; | ||
} | ||
const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength)); | ||
if (chunkEnd > buffer.length) { | ||
return binaryEncoding; | ||
} | ||
const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd); | ||
@@ -137,1 +144,86 @@ // Detect encoding | ||
exports.getEncoding = getEncoding; | ||
// The functions below are created to handle multibyte utf8 characters. | ||
// To understand how the encoding works, | ||
// check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
/**
 * Move a chunk's starting offset back onto the first byte of a UTF-8
 * character when it would otherwise land on a continuation byte.
 *
 * @param buf bytes being inspected
 * @param chunkBegin proposed starting offset of the chunk
 * @returns `chunkBegin` itself when it is 0 or does not point at a
 *   continuation byte; otherwise the offset of a qualifying lead byte found
 *   one to three bytes earlier, or -1 when there is none
 */
function getChunkBegin(buf, chunkBegin) {
    // Offset 0 cannot sit in the middle of a character.
    if (chunkBegin === 0) {
        return 0;
    }
    // Only a 10xxxxxx byte needs the start to be moved.
    const startsInsideChar = isLaterByteOfUtf8(buf[chunkBegin]);
    if (!startsInsideChar) {
        return chunkBegin;
    }
    // Probe three, then two, then one byte back. A lead byte qualifies only
    // when its character is long enough to reach chunkBegin: three back needs
    // a 4-byte lead, two back a 3- or 4-byte lead, one back any 2-, 3- or
    // 4-byte lead.
    for (let distance = 3; distance >= 1; distance--) {
        const candidate = chunkBegin - distance;
        if (candidate >= 0) {
            const lead = buf[candidate];
            const reachesChunkBegin = isFirstByteOf4ByteChar(lead) ||
                (distance <= 2 && isFirstByteOf3ByteChar(lead)) ||
                (distance === 1 && isFirstByteOf2ByteChar(lead));
            if (reachesChunkBegin) {
                return candidate;
            }
        }
    }
    // No qualifying lead byte within three bytes.
    return -1;
}
/**
 * Extend a chunk's exclusive end offset so that a multibyte UTF-8 character
 * started in the last three bytes of the chunk is included in full.
 *
 * @param buf bytes being inspected
 * @param chunkEnd proposed exclusive end offset of the chunk
 * @returns `chunkEnd` when it equals `buf.length` or no character is cut;
 *   otherwise `chunkEnd` plus the number of bytes still missing. The result
 *   is not clamped, so it can exceed `buf.length`.
 */
function getChunkEnd(buf, chunkEnd) {
    // Nothing lies beyond the end of the buffer.
    if (chunkEnd === buf.length) {
        return chunkEnd;
    }
    // Inspect the bytes three, two and one position before the end.
    for (let back = 3; back >= 1; back--) {
        const pos = chunkEnd - back;
        if (pos >= 0) {
            const lead = buf[pos];
            // Total length announced by a lead byte; 0 for any other byte.
            let width = 0;
            if (isFirstByteOf4ByteChar(lead)) {
                width = 4;
            }
            else if (isFirstByteOf3ByteChar(lead)) {
                width = 3;
            }
            else if (isFirstByteOf2ByteChar(lead)) {
                width = 2;
            }
            // `back` bytes of the character are already inside the chunk.
            const missing = width - back;
            if (missing > 0) {
                return chunkEnd + missing;
            }
        }
    }
    return chunkEnd;
}
/**
 * Test for the bit pattern 11110xxx, the first byte of a 4-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low three equal 11110
 */
function isFirstByteOf4ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 3;
    return prefix === 0b11110;
}
/**
 * Test for the bit pattern 1110xxxx, the first byte of a 3-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low four equal 1110
 */
function isFirstByteOf3ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 4;
    return prefix === 0b1110;
}
/**
 * Test for the bit pattern 110xxxxx, the first byte of a 2-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low five equal 110
 */
function isFirstByteOf2ByteChar(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 5;
    return prefix === 0b110;
}
/**
 * Test for the bit pattern 10xxxxxx, a non-first (continuation) byte of a
 * multibyte UTF-8 character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low six equal 10
 */
function isLaterByteOfUtf8(byte) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 6;
    return prefix === 0b10;
}
{ | ||
"title": "Is Text or Binary?", | ||
"name": "istextorbinary", | ||
"version": "5.15.0-next.1627595674.345b3aa16bba43ba618239c19c8def5cf66eda10", | ||
"version": "5.15.0-next.1627747584.40036cea671e4dee4fd2c4a154d3f69017ec7422", | ||
"description": "Determine if a filename and/or buffer is text or binary. Smarter detection than the other solutions.", | ||
@@ -6,0 +6,0 @@ "homepage": "https://github.com/bevry/istextorbinary", |
@@ -113,3 +113,16 @@ /* eslint no-use-before-define:0 */ | ||
// Extract | ||
const chunkEnd = Math.min(buffer.length, chunkBegin + chunkLength) | ||
chunkBegin = getChunkBegin(buffer, chunkBegin) | ||
if (chunkBegin === -1) { | ||
return binaryEncoding | ||
} | ||
const chunkEnd = getChunkEnd( | ||
buffer, | ||
Math.min(buffer.length, chunkBegin + chunkLength) | ||
) | ||
if (chunkEnd > buffer.length) { | ||
return binaryEncoding | ||
} | ||
const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd) | ||
@@ -132,1 +145,115 @@ | ||
} | ||
// The functions below are created to handle multibyte utf8 characters. | ||
// To understand how the encoding works, | ||
// check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
/**
 * Move a chunk's starting offset back onto the first byte of a UTF-8
 * character when it would otherwise land on a continuation byte.
 *
 * @param buf bytes being inspected
 * @param chunkBegin proposed starting offset of the chunk
 * @returns `chunkBegin` itself when it is 0 or does not point at a
 *   continuation byte; otherwise the offset of a qualifying lead byte found
 *   one to three bytes earlier, or -1 when there is none
 */
function getChunkBegin(buf: Buffer, chunkBegin: number) {
    // Offset 0 cannot sit in the middle of a character.
    if (chunkBegin === 0) {
        return 0
    }
    // Only a 10xxxxxx byte needs the start to be moved.
    const startsInsideChar = isLaterByteOfUtf8(buf[chunkBegin])
    if (!startsInsideChar) {
        return chunkBegin
    }
    // Probe three, then two, then one byte back. A lead byte qualifies only
    // when its character is long enough to reach chunkBegin: three back needs
    // a 4-byte lead, two back a 3- or 4-byte lead, one back any 2-, 3- or
    // 4-byte lead.
    for (let distance = 3; distance >= 1; distance--) {
        const candidate = chunkBegin - distance
        if (candidate >= 0) {
            const lead = buf[candidate]
            const reachesChunkBegin =
                isFirstByteOf4ByteChar(lead) ||
                (distance <= 2 && isFirstByteOf3ByteChar(lead)) ||
                (distance === 1 && isFirstByteOf2ByteChar(lead))
            if (reachesChunkBegin) {
                return candidate
            }
        }
    }
    // No qualifying lead byte within three bytes.
    return -1
}
/**
 * Extend a chunk's exclusive end offset so that a multibyte UTF-8 character
 * started in the last three bytes of the chunk is included in full.
 *
 * @param buf bytes being inspected
 * @param chunkEnd proposed exclusive end offset of the chunk
 * @returns `chunkEnd` when it equals `buf.length` or no character is cut;
 *   otherwise `chunkEnd` plus the number of bytes still missing. The result
 *   is not clamped, so it can exceed `buf.length`.
 */
function getChunkEnd(buf: Buffer, chunkEnd: number) {
    // Nothing lies beyond the end of the buffer.
    if (chunkEnd === buf.length) {
        return chunkEnd
    }
    // Inspect the bytes three, two and one position before the end.
    for (let back = 3; back >= 1; back--) {
        const pos = chunkEnd - back
        if (pos >= 0) {
            const lead = buf[pos]
            // Total length announced by a lead byte; 0 for any other byte.
            let width = 0
            if (isFirstByteOf4ByteChar(lead)) {
                width = 4
            } else if (isFirstByteOf3ByteChar(lead)) {
                width = 3
            } else if (isFirstByteOf2ByteChar(lead)) {
                width = 2
            }
            // `back` bytes of the character are already inside the chunk.
            const missing = width - back
            if (missing > 0) {
                return chunkEnd + missing
            }
        }
    }
    return chunkEnd
}
/**
 * Test for the bit pattern 11110xxx, the first byte of a 4-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low three equal 11110
 */
function isFirstByteOf4ByteChar(byte: number) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 3
    return prefix === 0b11110
}
/**
 * Test for the bit pattern 1110xxxx, the first byte of a 3-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low four equal 1110
 */
function isFirstByteOf3ByteChar(byte: number) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 4
    return prefix === 0b1110
}
/**
 * Test for the bit pattern 110xxxxx, the first byte of a 2-byte UTF-8
 * character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low five equal 110
 */
function isFirstByteOf2ByteChar(byte: number) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 5
    return prefix === 0b110
}
/**
 * Test for the bit pattern 10xxxxxx, a non-first (continuation) byte of a
 * multibyte UTF-8 character.
 *
 * @param byte byte value to test
 * @returns true when the bits above the low six equal 10
 */
function isLaterByteOfUtf8(byte: number) {
    // eslint-disable-next-line no-bitwise
    const prefix = byte >> 6
    return prefix === 0b10
}
Sorry, the diff of this file is not supported yet
70163
887