functionalscript
Advanced tools
Comparing version 0.3.13 to 0.3.14
{ | ||
"name": "functionalscript", | ||
"version": "0.3.13", | ||
"version": "0.3.14", | ||
"type": "module", | ||
@@ -5,0 +5,0 @@ "files": [ |
import { type List, type Thunk } from '../../types/list/module.f.ts'; | ||
import type { Array1, Array2, Array3 } from '../../types/array/module.f.ts'; | ||
/** | ||
* An unsigned 8-bit integer, represents a single byte. | ||
*/ | ||
export type U8 = number; | ||
/** | ||
* A signed 32-bit integer. | ||
*/ | ||
export type I32 = number; | ||
/** | ||
* Represents an unsigned 8-bit type - U8 or the end-of-file indicator. | ||
* The U8 represents the byte itself, and null indicates that reading does not return anything else. | ||
*/ | ||
export type ByteOrEof = U8 | null; | ||
/** | ||
* Represents the state of a UTF-8 decoding operation that contains at least one byte. | ||
*/ | ||
export type Utf8NonEmptyState = Array1<number> | Array2<number> | Array3<number>; | ||
/** | ||
* Represents the state of a UTF-8 decoding operation, which can be either `null` (no state) | ||
* or a non-empty state containing one or more bytes. | ||
*/ | ||
export type Utf8State = null | Utf8NonEmptyState; | ||
export type U8 = number; | ||
export type I32 = number; | ||
/** | ||
* Maps a list of Unicode code points to a stream of UTF-8 bytes. | ||
* | ||
* @param input - A list of Unicode code points to be converted. | ||
* @returns A thunk that lazily produces a sequence of UTF-8 bytes. | ||
*/ | ||
export declare const fromCodePointList: (input: List<number>) => Thunk<U8>; | ||
/** | ||
* Converts a list of UTF-8 bytes into a list of Unicode code points. | ||
* | ||
* @param input - A list of UTF-8 bytes. | ||
* @returns A list of Unicode code points or error codes. | ||
*/ | ||
export declare const toCodePointList: (input: List<U8>) => List<I32>; |
@@ -1,3 +0,16 @@ | ||
import { flatMap, flat, stateScan } from "../../types/list/module.f.js"; | ||
import { flat, flatMap, stateScan } from "../../types/list/module.f.js"; | ||
/** | ||
* Error mask constant used to represent invalid code points or encoding errors in UTF-8. | ||
*/ | ||
const errorMask = 0b1000_0000_0000_0000_0000_0000_0000_0000; | ||
/** | ||
* Converts a Unicode code point to a sequence of UTF-8 bytes. | ||
* @param input The Unicode code point to be converted. Valid range: | ||
* - 0x0000 to 0x007F for 1-byte sequences. | ||
* - 0x0080 to 0x07FF for 2-byte sequences. | ||
* - 0x0800 to 0xFFFF for 3-byte sequences. | ||
* - 0x10000 to 0x10FFFF for 4-byte sequences. | ||
* @returns A readonly array of UTF-8 bytes representing the input code point. | ||
* - Returns `[errorMask]` if the input does not match valid UTF-8 encoding rules. | ||
*/ | ||
const codePointToUtf8 = (input) => { | ||
@@ -11,24 +24,54 @@ if (input >= 0x0000 && input <= 0x007f) { | ||
if (input >= 0x0800 && input <= 0xffff) { | ||
return [input >> 12 | 0b1110_0000, input >> 6 & 0b0011_1111 | 0b1000_0000, input & 0b0011_1111 | 0b1000_0000]; | ||
return [ | ||
input >> 12 | 0b1110_0000, | ||
input >> 6 & 0b0011_1111 | 0b1000_0000, | ||
input & 0b0011_1111 | 0b1000_0000, | ||
]; | ||
} | ||
if (input >= 0x10000 && input <= 0x10ffff) { | ||
return [input >> 18 | 0b1111_0000, input >> 12 & 0b0011_1111 | 0b1000_0000, input >> 6 & 0b0011_1111 | 0b1000_0000, input & 0b0011_1111 | 0b1000_0000]; | ||
return [ | ||
input >> 18 | 0b1111_0000, | ||
input >> 12 & 0b0011_1111 | 0b1000_0000, | ||
input >> 6 & 0b0011_1111 | 0b1000_0000, | ||
input & 0b0011_1111 | 0b1000_0000, | ||
]; | ||
} | ||
if ((input & errorMask) !== 0) { | ||
if ((input & 0b1000_0000_0000_0000) !== 0) { | ||
return [input >> 12 & 0b0000_0111 | 0b1111_0000, input >> 6 & 0b0011_1111 | 0b1000_0000, input & 0b0011_1111 | 0b1000_0000]; | ||
return [ | ||
input >> 12 & 0b0000_0111 | 0b1111_0000, | ||
input >> 6 & 0b0011_1111 | 0b1000_0000, | ||
input & 0b0011_1111 | 0b1000_0000, | ||
]; | ||
} | ||
if ((input & 0b0000_0100_0000_0000) !== 0) { | ||
return [input >> 6 & 0b0000_1111 | 0b1110_0000, input & 0b0011_1111 | 0b1000_0000]; | ||
return [ | ||
input >> 6 & 0b0000_1111 | 0b1110_0000, | ||
input & 0b0011_1111 | 0b1000_0000, | ||
]; | ||
} | ||
if ((input & 0b0000_0010_0000_0000) !== 0) { | ||
return [input >> 6 & 0b0000_0111 | 0b1111_0000, input & 0b0011_1111 | 0b1000_0000]; | ||
return [ | ||
input >> 6 & 0b0000_0111 | 0b1111_0000, | ||
input & 0b0011_1111 | 0b1000_0000, | ||
]; | ||
} | ||
if ((input & 0b0000_0000_1000_0000) !== 0) { | ||
if ((input & 0b0000_0000_1000_0000) !== 0) | ||
return [input & 0b1111_1111]; | ||
} | ||
} | ||
return [errorMask]; | ||
}; | ||
/** | ||
* Maps a list of Unicode code points to a stream of UTF-8 bytes. | ||
* | ||
* @param input - A list of Unicode code points to be converted. | ||
* @returns A thunk that lazily produces a sequence of UTF-8 bytes. | ||
*/ | ||
export const fromCodePointList = flatMap(codePointToUtf8); | ||
/** | ||
* Converts a non-empty UTF-8 decoding state to an error code. | ||
* | ||
* @param state - A non-empty UTF-8 decoding state. | ||
* @returns An I32 error code derived from the invalid UTF-8 state. | ||
*/ | ||
const utf8StateToError = (state) => { | ||
@@ -45,3 +88,4 @@ let x; | ||
? ((s0 & 0b0000_1111) << 6) + (s1 & 0b0011_1111) + 0b0000_0100_0000_0000 | ||
: ((s0 & 0b0000_0111) << 6) + (s1 & 0b0011_1111) + 0b0000_0010_0000_0000; | ||
: ((s0 & 0b0000_0111) << 6) + (s1 & 0b0011_1111) + | ||
0b0000_0010_0000_0000; | ||
break; | ||
@@ -51,3 +95,4 @@ } | ||
const [s0, s1, s2] = state; | ||
x = ((s0 & 0b0000_0111) << 12) + ((s1 & 0b0011_1111) << 6) + (s2 & 0b0011_1111) + 0b1000_0000_0000_0000; | ||
x = ((s0 & 0b0000_0111) << 12) + ((s1 & 0b0011_1111) << 6) + | ||
(s2 & 0b0011_1111) + 0b1000_0000_0000_0000; | ||
break; | ||
@@ -60,3 +105,12 @@ } | ||
}; | ||
const utf8ByteToCodePointOp = state => byte => { | ||
/** | ||
* Decodes a byte into a Unicode code point, using a given UTF-8 state. | ||
* | ||
* @param state - The current UTF-8 decoding state. | ||
* @param byte - A single byte to decode. | ||
* @returns A tuple containing: | ||
* - A list of decoded Unicode code points or error codes. | ||
* - The updated UTF-8 state. | ||
*/ | ||
const utf8ByteToCodePointOp = (state) => (byte) => { | ||
if (byte < 0x00 || byte > 0xff) { | ||
@@ -66,8 +120,6 @@ return [[errorMask], state]; | ||
if (state === null) { | ||
if (byte < 0b1000_0000) { | ||
if (byte < 0b1000_0000) | ||
return [[byte], null]; | ||
} | ||
if (byte >= 0b1100_0010 && byte <= 0b1111_0100) { | ||
if (byte >= 0b1100_0010 && byte <= 0b1111_0100) | ||
return [[], [byte]]; | ||
} | ||
return [[byte | errorMask], null]; | ||
@@ -82,5 +134,4 @@ } | ||
} | ||
if (s0 < 0b1111_1000) { | ||
if (s0 < 0b1111_1000) | ||
return [[], [s0, byte]]; | ||
} | ||
break; | ||
@@ -91,7 +142,9 @@ } | ||
if (s0 < 0b1111_0000) { | ||
return [[((s0 & 0b0000_1111) << 12) + ((s1 & 0b0011_1111) << 6) + (byte & 0b0011_1111)], null]; | ||
return [[ | ||
((s0 & 0b0000_1111) << 12) + ((s1 & 0b0011_1111) << 6) + | ||
(byte & 0b0011_1111), | ||
], null]; | ||
} | ||
if (s0 < 0b1111_1000) { | ||
if (s0 < 0b1111_1000) | ||
return [[], [s0, s1, byte]]; | ||
} | ||
break; | ||
@@ -101,3 +154,6 @@ } | ||
const [s0, s1, s2] = state; | ||
return [[((s0 & 0b0000_0111) << 18) + ((s1 & 0b0011_1111) << 12) + ((s2 & 0b0011_1111) << 6) + (byte & 0b0011_1111)], null]; | ||
return [[ | ||
((s0 & 0b0000_0111) << 18) + ((s1 & 0b0011_1111) << 12) + | ||
((s2 & 0b0011_1111) << 6) + (byte & 0b0011_1111), | ||
], null]; | ||
} | ||
@@ -107,13 +163,45 @@ } | ||
const error = utf8StateToError(state); | ||
if (byte < 0b1000_0000) { | ||
if (byte < 0b1000_0000) | ||
return [[error, byte], null]; | ||
} | ||
if (byte >= 0b1100_0010 && byte <= 0b1111_0100) { | ||
if (byte >= 0b1100_0010 && byte <= 0b1111_0100) | ||
return [[error], [byte]]; | ||
} | ||
return [[error, byte | errorMask], null]; | ||
}; | ||
const utf8EofToCodePointOp = (state) => [state === null ? null : [utf8StateToError(state)], null]; | ||
const utf8ByteOrEofToCodePointOp = state => input => input === null ? utf8EofToCodePointOp(state) : utf8ByteToCodePointOp(state)(input); | ||
/** | ||
* Handles the end-of-file (EOF) case for UTF-8 decoding. | ||
* | ||
* @param state - The current UTF-8 decoding state. | ||
* @returns A tuple containing: | ||
* - A list of decoded Unicode code points or error codes. | ||
* - The reset UTF-8 state (`null`). | ||
*/ | ||
const utf8EofToCodePointOp = (state) => [ | ||
state === null ? null : [utf8StateToError(state)], | ||
null, | ||
]; | ||
/** | ||
* Combines UTF-8 byte and EOF handling into a single decoding operation. | ||
* | ||
* @param state - The current UTF-8 decoding state. | ||
* @param input - The next byte or EOF indicator. | ||
* @returns A tuple containing: | ||
* - A list of decoded Unicode code points or error codes. | ||
* - The updated UTF-8 state. | ||
*/ | ||
const utf8ByteOrEofToCodePointOp = (state) => (input) => input === null ? utf8EofToCodePointOp(state) : utf8ByteToCodePointOp(state)(input); | ||
/** | ||
* A constant representing the end-of-file (EOF) marker for UTF-8 decoding. | ||
* | ||
* @remarks | ||
* This is used as a sentinel value in decoding operations to signify the | ||
* termination of input. The list contains a single `null` value, which | ||
* represents the EOF condition. | ||
*/ | ||
const eofList = [null]; | ||
export const toCodePointList = input => flat(stateScan(utf8ByteOrEofToCodePointOp)(null)(flat([input, eofList]))); | ||
/** | ||
* Converts a list of UTF-8 bytes into a list of Unicode code points. | ||
* | ||
* @param input - A list of UTF-8 bytes. | ||
* @returns A list of Unicode code points or error codes. | ||
*/ | ||
export const toCodePointList = (input) => flat(stateScan(utf8ByteOrEofToCodePointOp)(null)(flat([input, eofList]))); |
@@ -21,4 +21,4 @@ import { type List, type Thunk } from '../list/module.f.ts'; | ||
* const vec4 = vec(4n) | ||
* const v0 = vec4(5n) // 0x15n | ||
* const v1 = vec4(0x5FEn) // 0x1En | ||
* const v0 = vec4(5n) // 0x15n = 0b1_0101 | ||
* const v1 = vec4(0x5FEn) // 0x1En = 0b1_1110 | ||
* ``` | ||
@@ -25,0 +25,0 @@ */ |
@@ -0,1 +1,13 @@ | ||
/** | ||
* @module | ||
* @description | ||
* ``` | ||
* MSb is most-significant bit first. | ||
* - byte: 0x53 = 0b0101_0011 | ||
* - 0123_4567 | ||
* LSb is least-significant bit first. | ||
* - byte: 0x53 = 0b0101_0011 | ||
* - 7654_3210 | ||
* ``` | ||
*/ | ||
import { log2, mask } from "../bigint/module.f.js"; | ||
@@ -19,4 +31,4 @@ import { flip } from "../function/module.f.js"; | ||
* const vec4 = vec(4n) | ||
* const v0 = vec4(5n) // 0x15n | ||
* const v1 = vec4(0x5FEn) // 0x1En | ||
* const v0 = vec4(5n) // 0x15n = 0b1_0101 | ||
* const v1 = vec4(0x5FEn) // 0x1En = 0b1_1110 | ||
* ``` | ||
@@ -77,3 +89,3 @@ */ | ||
return b => (b << aLen) | (a & m); | ||
} | ||
}, | ||
}; | ||
@@ -100,3 +112,3 @@ /** | ||
}, | ||
concat: flip(lsb.concat) | ||
concat: flip(lsb.concat), | ||
}; | ||
@@ -103,0 +115,0 @@ const appendU8 = ({ concat }) => (u8) => (a) => concat(a)(vec8(BigInt(u8))); |
594360
16807