istextorbinary
Advanced tools
Comparing version 5.15.0 to 6.0.0-next.1627748439.0ee4d89a92f81b1d8280e8d0dd12cf2fb9976ed0
@@ -91,3 +91,10 @@ /* eslint no-use-before-define:0 */ | ||
// Extract | ||
const chunkEnd = Math.min(buffer.length, chunkBegin + chunkLength); | ||
chunkBegin = getChunkBegin(buffer, chunkBegin); | ||
if (chunkBegin === -1) { | ||
return binaryEncoding; | ||
} | ||
const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength)); | ||
if (chunkEnd > buffer.length) { | ||
return binaryEncoding; | ||
} | ||
const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd); | ||
@@ -108,1 +115,87 @@ // Detect encoding | ||
} | ||
// ==================================== | ||
// The functions below are created to handle multibyte utf8 characters. | ||
// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
// @todo add documentation for these | ||
/**
 * Move a chunk's start index back to the first byte of the UTF-8 character it falls inside.
 *
 * @param buf the bytes being inspected
 * @param chunkBegin the requested start index of the chunk
 * @returns `chunkBegin` itself when it is `0` or does not point at a continuation byte,
 *   otherwise the index of the lead byte of the character that covers `chunkBegin`,
 *   or `-1` when no such lead byte is found within the previous three bytes
 */
function getChunkBegin(buf, chunkBegin) {
	// Index 0 can never be in the middle of a character.
	if (chunkBegin === 0) {
		return 0;
	}
	// Anything other than a continuation byte (10xxxxxx) already sits on a character boundary.
	if (!isLaterByteOfUtf8(buf[chunkBegin])) {
		return chunkBegin;
	}
	// Walk back to the nearest non-continuation byte. A character has at most three
	// continuation bytes, so looking further back than three bytes is pointless.
	for (let distance = 1; distance <= 3; distance++) {
		const index = chunkBegin - distance;
		if (index < 0) {
			break;
		}
		const byte = buf[index];
		if (isLaterByteOfUtf8(byte)) {
			continue;
		}
		// Only a lead byte whose character is long enough to reach chunkBegin counts;
		// anything else means the continuation byte at chunkBegin is stray.
		return getUtf8CharLength(byte) > distance ? index : -1;
	}
	return -1;
}
/**
 * Move a chunk's exclusive end index forward so it does not cut a UTF-8 character in half.
 *
 * @param buf the bytes being inspected
 * @param chunkEnd the requested exclusive end index of the chunk
 * @returns `chunkEnd` when the chunk already ends on a character boundary (or at the end
 *   of the buffer), otherwise `chunkEnd` plus the number of bytes the last character still
 *   needs; the result can exceed `buf.length` when the buffer itself ends mid-character
 */
function getChunkEnd(buf, chunkEnd) {
	// Ending exactly at the end of the buffer leaves nothing to extend into.
	if (chunkEnd === buf.length) {
		return chunkEnd;
	}
	// Find the nearest non-continuation byte among the last three bytes of the chunk.
	for (let distance = 1; distance <= 3; distance++) {
		const index = chunkEnd - distance;
		if (index < 0) {
			break;
		}
		const byte = buf[index];
		if (isLaterByteOfUtf8(byte)) {
			continue;
		}
		// `distance` bytes of this character are inside the chunk; add whatever is missing.
		const missing = getUtf8CharLength(byte) - distance;
		return missing > 0 ? chunkEnd + missing : chunkEnd;
	}
	return chunkEnd;
}
/**
 * Number of bytes in the UTF-8 character announced by `byte`.
 *
 * @param byte the byte to classify
 * @returns 4, 3 or 2 for the matching lead-byte patterns, and 1 for every other byte
 */
function getUtf8CharLength(byte) {
	if (isFirstByteOf4ByteChar(byte)) {
		return 4;
	}
	if (isFirstByteOf3ByteChar(byte)) {
		return 3;
	}
	if (isFirstByteOf2ByteChar(byte)) {
		return 2;
	}
	return 1;
}
/** Does the byte match 11110xxx, the lead byte pattern of a 4-byte character? */
function isFirstByteOf4ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 3 === 30; // 11110xxx?
}
/** Does the byte match 1110xxxx, the lead byte pattern of a 3-byte character? */
function isFirstByteOf3ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 4 === 14; // 1110xxxx?
}
/** Does the byte match 110xxxxx, the lead byte pattern of a 2-byte character? */
function isFirstByteOf2ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 5 === 6; // 110xxxxx?
}
/** Does the byte match 10xxxxxx, the pattern of a continuation (non-first) byte? */
function isLaterByteOfUtf8(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 6 === 2; // 10xxxxxx?
}
@@ -92,3 +92,10 @@ /* eslint no-use-before-define:0 */ | ||
// Extract | ||
const chunkEnd = Math.min(buffer.length, chunkBegin + chunkLength); | ||
chunkBegin = getChunkBegin(buffer, chunkBegin); | ||
if (chunkBegin === -1) { | ||
return binaryEncoding; | ||
} | ||
const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength)); | ||
if (chunkEnd > buffer.length) { | ||
return binaryEncoding; | ||
} | ||
const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd); | ||
@@ -109,1 +116,87 @@ // Detect encoding | ||
} | ||
// ==================================== | ||
// The functions below are created to handle multibyte utf8 characters. | ||
// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
// @todo add documentation for these | ||
/**
 * Move a chunk's start index back to the first byte of the UTF-8 character it falls inside.
 *
 * @param buf the bytes being inspected
 * @param chunkBegin the requested start index of the chunk
 * @returns `chunkBegin` itself when it is `0` or does not point at a continuation byte,
 *   otherwise the index of the lead byte of the character that covers `chunkBegin`,
 *   or `-1` when no such lead byte is found within the previous three bytes
 */
function getChunkBegin(buf, chunkBegin) {
	// Index 0 can never be in the middle of a character.
	if (chunkBegin === 0) {
		return 0;
	}
	// Anything other than a continuation byte (10xxxxxx) already sits on a character boundary.
	if (!isLaterByteOfUtf8(buf[chunkBegin])) {
		return chunkBegin;
	}
	// Walk back to the nearest non-continuation byte. A character has at most three
	// continuation bytes, so looking further back than three bytes is pointless.
	for (let distance = 1; distance <= 3; distance++) {
		const index = chunkBegin - distance;
		if (index < 0) {
			break;
		}
		const byte = buf[index];
		if (isLaterByteOfUtf8(byte)) {
			continue;
		}
		// Only a lead byte whose character is long enough to reach chunkBegin counts;
		// anything else means the continuation byte at chunkBegin is stray.
		return getUtf8CharLength(byte) > distance ? index : -1;
	}
	return -1;
}
/**
 * Move a chunk's exclusive end index forward so it does not cut a UTF-8 character in half.
 *
 * @param buf the bytes being inspected
 * @param chunkEnd the requested exclusive end index of the chunk
 * @returns `chunkEnd` when the chunk already ends on a character boundary (or at the end
 *   of the buffer), otherwise `chunkEnd` plus the number of bytes the last character still
 *   needs; the result can exceed `buf.length` when the buffer itself ends mid-character
 */
function getChunkEnd(buf, chunkEnd) {
	// Ending exactly at the end of the buffer leaves nothing to extend into.
	if (chunkEnd === buf.length) {
		return chunkEnd;
	}
	// Find the nearest non-continuation byte among the last three bytes of the chunk.
	for (let distance = 1; distance <= 3; distance++) {
		const index = chunkEnd - distance;
		if (index < 0) {
			break;
		}
		const byte = buf[index];
		if (isLaterByteOfUtf8(byte)) {
			continue;
		}
		// `distance` bytes of this character are inside the chunk; add whatever is missing.
		const missing = getUtf8CharLength(byte) - distance;
		return missing > 0 ? chunkEnd + missing : chunkEnd;
	}
	return chunkEnd;
}
/**
 * Number of bytes in the UTF-8 character announced by `byte`.
 *
 * @param byte the byte to classify
 * @returns 4, 3 or 2 for the matching lead-byte patterns, and 1 for every other byte
 */
function getUtf8CharLength(byte) {
	if (isFirstByteOf4ByteChar(byte)) {
		return 4;
	}
	if (isFirstByteOf3ByteChar(byte)) {
		return 3;
	}
	if (isFirstByteOf2ByteChar(byte)) {
		return 2;
	}
	return 1;
}
/** Does the byte match 11110xxx, the lead byte pattern of a 4-byte character? */
function isFirstByteOf4ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 3 === 30; // 11110xxx?
}
/** Does the byte match 1110xxxx, the lead byte pattern of a 3-byte character? */
function isFirstByteOf3ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 4 === 14; // 1110xxxx?
}
/** Does the byte match 110xxxxx, the lead byte pattern of a 2-byte character? */
function isFirstByteOf2ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 5 === 6; // 110xxxxx?
}
/** Does the byte match 10xxxxxx, the pattern of a continuation (non-first) byte? */
function isLaterByteOfUtf8(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 6 === 2; // 10xxxxxx?
}
@@ -119,3 +119,10 @@ "use strict"; | ||
// Extract | ||
const chunkEnd = Math.min(buffer.length, chunkBegin + chunkLength); | ||
chunkBegin = getChunkBegin(buffer, chunkBegin); | ||
if (chunkBegin === -1) { | ||
return binaryEncoding; | ||
} | ||
const chunkEnd = getChunkEnd(buffer, Math.min(buffer.length, chunkBegin + chunkLength)); | ||
if (chunkEnd > buffer.length) { | ||
return binaryEncoding; | ||
} | ||
const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd); | ||
@@ -137,1 +144,87 @@ // Detect encoding | ||
exports.getEncoding = getEncoding; | ||
// ==================================== | ||
// The functions below are created to handle multibyte utf8 characters. | ||
// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
// @todo add documentation for these | ||
/**
 * Move a chunk's start index back to the first byte of the UTF-8 character it falls inside.
 *
 * @param buf the bytes being inspected
 * @param chunkBegin the requested start index of the chunk
 * @returns `chunkBegin` itself when it is `0` or does not point at a continuation byte,
 *   otherwise the index of the lead byte of the character that covers `chunkBegin`,
 *   or `-1` when no such lead byte is found within the previous three bytes
 */
function getChunkBegin(buf, chunkBegin) {
	// Index 0 can never be in the middle of a character.
	if (chunkBegin === 0) {
		return 0;
	}
	// Anything other than a continuation byte (10xxxxxx) already sits on a character boundary.
	if (!isLaterByteOfUtf8(buf[chunkBegin])) {
		return chunkBegin;
	}
	// Walk back to the nearest non-continuation byte. A character has at most three
	// continuation bytes, so looking further back than three bytes is pointless.
	for (let distance = 1; distance <= 3; distance++) {
		const index = chunkBegin - distance;
		if (index < 0) {
			break;
		}
		const byte = buf[index];
		if (isLaterByteOfUtf8(byte)) {
			continue;
		}
		// Only a lead byte whose character is long enough to reach chunkBegin counts;
		// anything else means the continuation byte at chunkBegin is stray.
		return getUtf8CharLength(byte) > distance ? index : -1;
	}
	return -1;
}
/**
 * Move a chunk's exclusive end index forward so it does not cut a UTF-8 character in half.
 *
 * @param buf the bytes being inspected
 * @param chunkEnd the requested exclusive end index of the chunk
 * @returns `chunkEnd` when the chunk already ends on a character boundary (or at the end
 *   of the buffer), otherwise `chunkEnd` plus the number of bytes the last character still
 *   needs; the result can exceed `buf.length` when the buffer itself ends mid-character
 */
function getChunkEnd(buf, chunkEnd) {
	// Ending exactly at the end of the buffer leaves nothing to extend into.
	if (chunkEnd === buf.length) {
		return chunkEnd;
	}
	// Find the nearest non-continuation byte among the last three bytes of the chunk.
	for (let distance = 1; distance <= 3; distance++) {
		const index = chunkEnd - distance;
		if (index < 0) {
			break;
		}
		const byte = buf[index];
		if (isLaterByteOfUtf8(byte)) {
			continue;
		}
		// `distance` bytes of this character are inside the chunk; add whatever is missing.
		const missing = getUtf8CharLength(byte) - distance;
		return missing > 0 ? chunkEnd + missing : chunkEnd;
	}
	return chunkEnd;
}
/**
 * Number of bytes in the UTF-8 character announced by `byte`.
 *
 * @param byte the byte to classify
 * @returns 4, 3 or 2 for the matching lead-byte patterns, and 1 for every other byte
 */
function getUtf8CharLength(byte) {
	if (isFirstByteOf4ByteChar(byte)) {
		return 4;
	}
	if (isFirstByteOf3ByteChar(byte)) {
		return 3;
	}
	if (isFirstByteOf2ByteChar(byte)) {
		return 2;
	}
	return 1;
}
/** Does the byte match 11110xxx, the lead byte pattern of a 4-byte character? */
function isFirstByteOf4ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 3 === 30; // 11110xxx?
}
/** Does the byte match 1110xxxx, the lead byte pattern of a 3-byte character? */
function isFirstByteOf3ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 4 === 14; // 1110xxxx?
}
/** Does the byte match 110xxxxx, the lead byte pattern of a 2-byte character? */
function isFirstByteOf2ByteChar(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 5 === 6; // 110xxxxx?
}
/** Does the byte match 10xxxxxx, the pattern of a continuation (non-first) byte? */
function isLaterByteOfUtf8(byte) {
	// eslint-disable-next-line no-bitwise
	return byte >> 6 === 2; // 10xxxxxx?
}
# History | ||
## v6.0.0 2021 August 1 | ||
- Thanks to [Kukhyeon Heo](https://github.com/sainthkh) for [pull request #214](https://github.com/bevry/istextorbinary/pull/214) `istextorbinary` can now speak UTF-8 multibyte characters, now understanding that Cyrillic, CJK, Emoji, etc. are not binary. This is a big win. | ||
- Closes [issue #13](https://github.com/bevry/istextorbinary/issues/13) reported by [dlsgusrn7577](https://github.com/dlsgusrn7577) | ||
- Updated dependencies, [base files](https://github.com/bevry/base), and [editions](https://editions.bevry.me) using [boundation](https://github.com/bevry/boundation) | ||
## v5.15.0 2021 July 30 | ||
@@ -4,0 +10,0 @@ |
{ | ||
"title": "Is Text or Binary?", | ||
"name": "istextorbinary", | ||
"version": "5.15.0", | ||
"version": "6.0.0-next.1627748439.0ee4d89a92f81b1d8280e8d0dd12cf2fb9976ed0", | ||
"description": "Determine if a filename and/or buffer is text or binary. Smarter detection than the other solutions.", | ||
@@ -84,2 +84,3 @@ "homepage": "https://github.com/bevry/istextorbinary", | ||
"Ian Sibner <sibnerian@gmail.com> (https://github.com/sibnerian)", | ||
"Kukhyeon Heo <sainthkh@gmail.com> (https://github.com/sainthkh)", | ||
"Michael Mooring <mike@mdm.cc> (https://github.com/mikeumus)", | ||
@@ -173,10 +174,10 @@ "Rob Loach <robloach@gmail.com> (https://github.com/robloach)", | ||
"binaryextensions": "^4.18.0", | ||
"textextensions": "^5.13.0" | ||
"textextensions": "^5.14.0" | ||
}, | ||
"devDependencies": { | ||
"@bevry/update-contributors": "^1.19.0", | ||
"@bevry/update-contributors": "^1.20.0", | ||
"@typescript-eslint/eslint-plugin": "^4.28.5", | ||
"@typescript-eslint/parser": "^4.28.5", | ||
"assert-helpers": "^8.4.0", | ||
"eslint": "^7.31.0", | ||
"eslint": "^7.32.0", | ||
"eslint-config-bevry": "^3.27.0", | ||
@@ -186,11 +187,11 @@ "eslint-config-prettier": "^8.3.0", | ||
"filedirname": "^2.7.0", | ||
"kava": "^5.14.0", | ||
"kava": "^5.15.0", | ||
"make-deno-edition": "^1.3.0", | ||
"prettier": "^2.3.2", | ||
"projectz": "^2.21.0", | ||
"projectz": "^2.22.0", | ||
"surge": "^0.23.0", | ||
"typedoc": "^0.21.4", | ||
"typescript": "4.3.5", | ||
"valid-directory": "^3.7.0", | ||
"valid-module": "^1.16.0" | ||
"valid-directory": "^3.9.0", | ||
"valid-module": "^1.17.0" | ||
}, | ||
@@ -236,2 +237,2 @@ "scripts": { | ||
} | ||
} | ||
} |
@@ -96,3 +96,3 @@ <!-- TITLE/ --> | ||
<script type="module"> | ||
import * as pkg from '//cdn.skypack.dev/istextorbinary@^5.15.0' | ||
import * as pkg from '//cdn.skypack.dev/istextorbinary@^6.0.0' | ||
</script> | ||
@@ -105,3 +105,3 @@ ``` | ||
<script type="module"> | ||
import * as pkg from '//unpkg.com/istextorbinary@^5.15.0' | ||
import * as pkg from '//unpkg.com/istextorbinary@^6.0.0' | ||
</script> | ||
@@ -114,3 +114,3 @@ ``` | ||
<script type="module"> | ||
import * as pkg from '//dev.jspm.io/istextorbinary@5.15.0' | ||
import * as pkg from '//dev.jspm.io/istextorbinary@6.0.0' | ||
</script> | ||
@@ -182,2 +182,3 @@ ``` | ||
<li><a href="https://github.com/sibnerian">Ian Sibner</a> — <a href="https://github.com/bevry/istextorbinary/commits?author=sibnerian" title="View the GitHub contributions of Ian Sibner on repository bevry/istextorbinary">view contributions</a></li> | ||
<li><a href="https://github.com/sainthkh">Kukhyeon Heo</a> — <a href="https://github.com/bevry/istextorbinary/commits?author=sainthkh" title="View the GitHub contributions of Kukhyeon Heo on repository bevry/istextorbinary">view contributions</a></li> | ||
<li><a href="https://github.com/mikeumus">Michael Mooring</a> — <a href="https://github.com/bevry/istextorbinary/commits?author=mikeumus" title="View the GitHub contributions of Michael Mooring on repository bevry/istextorbinary">view contributions</a></li> | ||
@@ -184,0 +185,0 @@ <li><a href="https://github.com/robloach">Rob Loach</a> — <a href="https://github.com/bevry/istextorbinary/commits?author=robloach" title="View the GitHub contributions of Rob Loach on repository bevry/istextorbinary">view contributions</a></li> |
@@ -12,2 +12,3 @@ /* eslint no-use-before-define:0 */ | ||
chunkLength?: number | ||
/** If not provided, will check the start, middle, and end */ | ||
@@ -114,3 +115,16 @@ chunkBegin?: number | ||
// Extract | ||
const chunkEnd = Math.min(buffer.length, chunkBegin + chunkLength) | ||
chunkBegin = getChunkBegin(buffer, chunkBegin) | ||
if (chunkBegin === -1) { | ||
return binaryEncoding | ||
} | ||
const chunkEnd = getChunkEnd( | ||
buffer, | ||
Math.min(buffer.length, chunkBegin + chunkLength) | ||
) | ||
if (chunkEnd > buffer.length) { | ||
return binaryEncoding | ||
} | ||
const contentChunkUTF8 = buffer.toString(textEncoding, chunkBegin, chunkEnd) | ||
@@ -133,1 +147,116 @@ | ||
} | ||
// ==================================== | ||
// The functions below are created to handle multibyte utf8 characters. | ||
// To understand how the encoding works, check this article: https://en.wikipedia.org/wiki/UTF-8#Encoding | ||
// @todo add documentation for these | ||
/**
 * Move a chunk's start index back to the first byte of the UTF-8 character it falls inside.
 *
 * @param buf the bytes being inspected
 * @param chunkBegin the requested start index of the chunk
 * @returns `chunkBegin` itself when it is `0` or does not point at a continuation byte,
 *   otherwise the index of the lead byte of the character that covers `chunkBegin`,
 *   or `-1` when no such lead byte is found within the previous three bytes
 */
function getChunkBegin(buf: Buffer, chunkBegin: number): number {
	// Index 0 can never be in the middle of a character.
	if (chunkBegin === 0) {
		return 0
	}
	// Anything other than a continuation byte (10xxxxxx) already sits on a character boundary.
	if (!isLaterByteOfUtf8(buf[chunkBegin])) {
		return chunkBegin
	}
	// Walk back to the nearest non-continuation byte. A character has at most three
	// continuation bytes, so looking further back than three bytes is pointless.
	for (let distance = 1; distance <= 3; distance++) {
		const index = chunkBegin - distance
		if (index < 0) {
			break
		}
		const byte = buf[index]
		if (isLaterByteOfUtf8(byte)) {
			continue
		}
		// Only a lead byte whose character is long enough to reach chunkBegin counts;
		// anything else means the continuation byte at chunkBegin is stray.
		return getUtf8CharLength(byte) > distance ? index : -1
	}
	return -1
}
/**
 * Move a chunk's exclusive end index forward so it does not cut a UTF-8 character in half.
 *
 * @param buf the bytes being inspected
 * @param chunkEnd the requested exclusive end index of the chunk
 * @returns `chunkEnd` when the chunk already ends on a character boundary (or at the end
 *   of the buffer), otherwise `chunkEnd` plus the number of bytes the last character still
 *   needs; the result can exceed `buf.length` when the buffer itself ends mid-character
 */
function getChunkEnd(buf: Buffer, chunkEnd: number): number {
	// Ending exactly at the end of the buffer leaves nothing to extend into.
	if (chunkEnd === buf.length) {
		return chunkEnd
	}
	// Find the nearest non-continuation byte among the last three bytes of the chunk.
	for (let distance = 1; distance <= 3; distance++) {
		const index = chunkEnd - distance
		if (index < 0) {
			break
		}
		const byte = buf[index]
		if (isLaterByteOfUtf8(byte)) {
			continue
		}
		// `distance` bytes of this character are inside the chunk; add whatever is missing.
		const missing = getUtf8CharLength(byte) - distance
		return missing > 0 ? chunkEnd + missing : chunkEnd
	}
	return chunkEnd
}
/**
 * Number of bytes in the UTF-8 character announced by `byte`.
 *
 * @param byte the byte to classify
 * @returns 4, 3 or 2 for the matching lead-byte patterns, and 1 for every other byte
 */
function getUtf8CharLength(byte: number): number {
	if (isFirstByteOf4ByteChar(byte)) {
		return 4
	}
	if (isFirstByteOf3ByteChar(byte)) {
		return 3
	}
	if (isFirstByteOf2ByteChar(byte)) {
		return 2
	}
	return 1
}
/** Does the byte match 11110xxx, the lead byte pattern of a 4-byte character? */
function isFirstByteOf4ByteChar(byte: number): boolean {
	// eslint-disable-next-line no-bitwise
	return byte >> 3 === 30 // 11110xxx?
}
/** Does the byte match 1110xxxx, the lead byte pattern of a 3-byte character? */
function isFirstByteOf3ByteChar(byte: number): boolean {
	// eslint-disable-next-line no-bitwise
	return byte >> 4 === 14 // 1110xxxx?
}
/** Does the byte match 110xxxxx, the lead byte pattern of a 2-byte character? */
function isFirstByteOf2ByteChar(byte: number): boolean {
	// eslint-disable-next-line no-bitwise
	return byte >> 5 === 6 // 110xxxxx?
}
/** Does the byte match 10xxxxxx, the pattern of a continuation (non-first) byte? */
function isLaterByteOfUtf8(byte: number): boolean {
	// eslint-disable-next-line no-bitwise
	return byte >> 6 === 2 // 10xxxxxx?
}
Sorry, the diff of this file is not supported yet
No v1
QualityPackage is not semver >=1. This means it is not stable and does not support ^ ranges.
Found 1 instance in 1 package
71387
891
204
1
Updatedtextextensions@^5.14.0