Comparing hyparquet version 0.9.1 to 0.9.2
{
"name": "hyparquet",
"version": "0.9.1",
"version": "0.9.2",
"description": "parquet file parser for javascript",
@@ -5,0 +5,0 @@ "keywords": [
@@ -9,3 +9,3 @@ # hyparquet
[![dependencies](https://img.shields.io/badge/Dependencies-0-blueviolet)](https://www.npmjs.com/package/hyparquet?activeTab=dependencies)
![coverage](https://img.shields.io/badge/Coverage-94-darkred)
![coverage](https://img.shields.io/badge/Coverage-95-darkred)
@@ -12,0 +12,0 @@ Dependency free since 2023!
@@ -99,3 +99,3 @@ import { assembleObjects } from './assemble.js'
)
dictionary = readDictionaryPage(page, diph, columnMetadata)
dictionary = readDictionaryPage(page, diph, columnMetadata, element.type_length)
} else if (header.type === 'DATA_PAGE_V2') {
@@ -102,0 +102,0 @@ const daph2 = header.data_page_header_v2
@@ -1,2 +0,2 @@
const dayMillis = 86400000000000 // 1 day in milliseconds
const dayMillis = 86400000 // 1 day in milliseconds
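The old constant, 86400000000000, is the number of nanoseconds in a day, so the "milliseconds" comment was wrong and converted DATE values were inflated by a factor of one million. A quick sanity check of the corrected value:

```js
// milliseconds in one day: 24 h × 60 min × 60 s × 1000 ms
console.log(24 * 60 * 60 * 1000) // 86400000
// Parquet DATE is days since the Unix epoch, so day 1 is 1970-01-02
console.log(new Date(1 * 86400000).toISOString()) // 1970-01-02T00:00:00.000Z
```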
@@ -12,3 +12,2 @@ /**
export function convert(data, schemaElement) {
if (!Array.isArray(data)) return data
const ctype = schemaElement.converted_type
@@ -20,16 +19,22 @@ if (ctype === 'UTF8') {
if (ctype === 'DECIMAL') {
const scaleFactor = schemaElement.scale ? Math.pow(10, schemaElement.scale) : 1
const scale = schemaElement.scale || 0
const factor = Math.pow(10, -scale)
if (typeof data[0] === 'number') {
return scaleFactor === 1 ? data : data.map(v => v * scaleFactor)
if (factor === 1) return data
return Array.from(data).map(v => v * factor)
} else if (typeof data[0] === 'bigint') {
return scaleFactor === 1 ? data : data.map(v => v * BigInt(scaleFactor))
if (factor === 1) return data
return Array.from(data).map(v => Number(v) * factor)
} else {
return data.map(v => parseDecimal(v) * scaleFactor)
return Array.from(data).map(v => parseDecimal(v) * factor)
}
}
if (ctype === 'DATE') {
return data.map(v => new Date(v * dayMillis))
return Array.from(data).map(v => new Date(v * dayMillis))
}
if (ctype === undefined && schemaElement.type === 'INT96') {
return Array.from(data).map(parseInt96Date)
}
if (ctype === 'TIME_MILLIS') {
return data.map(v => new Date(v))
return Array.from(data).map(v => new Date(v))
}
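The reworked DECIMAL branch also fixes the sign of the exponent: the old code scaled by 10^scale, while DECIMAL stores an unscaled integer that must be divided by 10^scale, i.e. multiplied by factor = 10^-scale. Note that bigint values are now coerced with Number() first, which is exact only below 2^53. A minimal sketch of the conversion:

```js
// DECIMAL stores an unscaled integer; scale 2 means two fractional digits
const scale = 2
const factor = Math.pow(10, -scale) // 0.01
console.log(12345 * factor) // 123.45
console.log(Number(12345n) * factor) // 123.45 (exact only for |v| < 2^53)
```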
@@ -49,4 +54,2 @@ if (ctype === 'JSON') {
/**
* Parse decimal from byte array.
*
* @param {Uint8Array} bytes
@@ -63,1 +66,12 @@ * @returns {number}
}
/**
* @param {bigint} value
* @returns {Date}
*/
function parseInt96Date(value) {
const days = Number((value >> 64n) - 2440588n)
const nano = Number((value & 0xffffffffffffffffn) / 1000000n)
const millis = days * dayMillis + nano
return new Date(millis)
}
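The new parseInt96Date assumes the combined bigint layout produced by readPlain for INT96: the bits above the low 64 hold a Julian day number (2440588 is 1970-01-01), and the low 64 bits hold nanoseconds within the day. A worked example of the decoding:

```js
// INT96 timestamp: value = (julianDay << 64n) | nanosecondsOfDay
const julianDay = 2440589n // one day after the Unix epoch
const nanos = 43200000000000n // 12:00:00 expressed in nanoseconds
const value = (julianDay << 64n) | nanos
const days = Number((value >> 64n) - 2440588n) // 1
const millis = days * 86400000 + Number((value & 0xffffffffffffffffn) / 1000000n)
console.log(new Date(millis).toISOString()) // 1970-01-02T12:00:00.000Z
```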
@@ -25,6 +25,4 @@ import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
// repetition levels
// repetition and definition levels
const repetitionLevels = readRepetitionLevels(reader, daph, schemaPath)
// definition levels
const { definitionLevels, numNulls } = readDefinitionLevels(reader, daph, schemaPath)
@@ -35,3 +33,4 @@
if (daph.encoding === 'PLAIN') {
dataPage = readPlain(reader, columnMetadata.type, nValues)
const { type_length } = schemaPath[schemaPath.length - 1].element
dataPage = readPlain(reader, columnMetadata.type, nValues, type_length)
} else if (
@@ -43,7 +42,5 @@ daph.encoding === 'PLAIN_DICTIONARY' ||
// bit width is stored as single byte
let bitWidth
let bitWidth = 1
// TODO: RLE encoding uses bitWidth = schemaElement.type_length
if (columnMetadata.type === 'BOOLEAN') {
bitWidth = 1
} else {
if (columnMetadata.type !== 'BOOLEAN') {
bitWidth = view.getUint8(reader.offset)
@@ -69,12 +66,12 @@ reader.offset++
*
* @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader
* @param {Uint8Array} bytes raw page data
* @param {DictionaryPageHeader} diph dictionary page header
* @param {import("./types.d.ts").DictionaryPageHeader} diph dictionary page header
* @param {ColumnMetaData} columnMetadata
* @param {number | undefined} typeLength - type_length from schema
* @returns {ArrayLike<any>} array of values
*/
export function readDictionaryPage(bytes, diph, columnMetadata) {
export function readDictionaryPage(bytes, diph, columnMetadata, typeLength) {
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
const reader = { view, offset: 0 }
return readPlain(reader, columnMetadata.type, diph.num_values)
return readPlain(reader, columnMetadata.type, diph.num_values, typeLength)
}
@@ -81,0 +78,0 @@
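Threading the schema's type_length through readDictionaryPage lets readPlain split a FIXED_LEN_BYTE_ARRAY dictionary page into one fixed-size entry per value. A hypothetical sketch of what that split produces, assuming a 2-entry dictionary with type_length 4:

```js
// hypothetical 2-entry FIXED_LEN_BYTE_ARRAY(4) dictionary page
const bytes = new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8])
const typeLength = 4
const dictionary = []
for (let offset = 0; offset < bytes.length; offset += typeLength) {
  // subarray creates a view into the page buffer rather than a copy
  dictionary.push(bytes.subarray(offset, offset + typeLength))
}
console.log(dictionary) // [ Uint8Array [ 1, 2, 3, 4 ], Uint8Array [ 5, 6, 7, 8 ] ]
```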
@@ -5,3 +5,3 @@ import { decompressPage } from './column.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { readVarInt, readZigZag } from './thrift.js'
import { readVarInt, readZigZagBigInt } from './thrift.js'
@@ -26,4 +26,2 @@ /**
const reader = { view, offset: 0 }
/** @type {any} */
let dataPage = []
@@ -35,38 +33,30 @@ const daph2 = ph.data_page_header_v2
const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schemaPath)
// assert(reader.offset === daph2.repetition_levels_byte_length)
if (reader.offset !== daph2.repetition_levels_byte_length) {
throw new Error(`parquet repetition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length}`)
}
// definition levels
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
const definitionLevels = readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel)
// assert(reader.offset === daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length)
if (reader.offset !== daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) {
throw new Error(`parquet definition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length}`)
const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length
let page = compressedBytes.subarray(reader.offset)
if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)
}
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length
// read values based on encoding
/** @type {import('./types.d.ts').DecodedArray} */
let dataPage = []
const nValues = daph2.num_values - daph2.num_nulls
if (daph2.encoding === 'PLAIN') {
let page = compressedBytes.slice(reader.offset)
if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)
}
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const pageReader = { view: pageView, offset: 0 }
dataPage = readPlain(pageReader, columnMetadata.type, nValues)
const { type_length } = schemaPath[schemaPath.length - 1].element
dataPage = readPlain(pageReader, columnMetadata.type, nValues, type_length)
} else if (daph2.encoding === 'RLE') {
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const bitWidth = 1
if (daph2.num_nulls) {
throw new Error('parquet RLE encoding with nulls not supported')
} else {
const pageReader = { view: pageView, offset: 4 }
dataPage = new Array(nValues)
readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, dataPage)
}
const pageReader = { view: pageView, offset: 4 }
dataPage = new Array(nValues)
readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, dataPage)
} else if (
@@ -76,5 +66,2 @@ daph2.encoding === 'PLAIN_DICTIONARY' ||
) {
compressedBytes = compressedBytes.subarray(reader.offset)
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const bitWidth = pageView.getUint8(0)
@@ -85,5 +72,4 @@ const pageReader = { view: pageView, offset: 1 }
} else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
if (daph2.num_nulls) throw new Error('parquet delta-int not supported')
const codec = daph2.is_compressed ? columnMetadata.codec : 'UNCOMPRESSED'
const page = decompressPage(compressedBytes, uncompressedPageSize, codec, compressors)
const int32 = columnMetadata.type === 'INT32'
dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues)
deltaBinaryUnpack(page, nValues, dataPage)
@@ -142,5 +128,6 @@ } else {
* @param {number} nValues number of values to read
* @param {any[]} values array to write to
* @param {Int32Array | BigInt64Array} values array to write to
*/
function deltaBinaryUnpack(page, nValues, values) {
const int32 = values instanceof Int32Array
const view = new DataView(page.buffer, page.byteOffset, page.byteLength)
@@ -150,37 +137,42 @@ const reader = { view, offset: 0 }
const miniblockPerBlock = readVarInt(reader)
const count = readVarInt(reader)
let value = readZigZag(reader)
let count = readVarInt(reader)
let value = readZigZagBigInt(reader) // first value
let valueIndex = 0
values[valueIndex++] = int32 ? Number(value) : value
const valuesPerMiniblock = blockSize / miniblockPerBlock
for (let valueIndex = 0; valueIndex < nValues;) {
const minDelta = readZigZag(reader)
while (valueIndex < nValues) {
const minDelta = readZigZagBigInt(reader)
const bitWidths = new Uint8Array(miniblockPerBlock)
for (let i = 0; i < miniblockPerBlock; i++, reader.offset++) {
bitWidths[i] = page[reader.offset]
for (let i = 0; i < miniblockPerBlock; i++) {
bitWidths[i] = page[reader.offset++]
}
for (let i = 0; i < miniblockPerBlock; i++) {
const bitWidth = bitWidths[i]
let miniblockCount = Math.min(count, valuesPerMiniblock)
const bitWidth = BigInt(bitWidths[i])
if (bitWidth) {
if (count > 1) {
// no more diffs if on last value, delta read bitpacked
let data = 0
let stop = -bitWidth
// only works for bitWidth < 31
const mask = (1 << bitWidth) - 1
while (count) {
if (stop < 0) {
// fails when data gets too large
data = (data << 8) | view.getUint8(reader.offset++)
stop += 8
} else {
values.push((data >> stop) & mask)
const mask = (1n << bitWidth) - 1n
let bitpackPos = 0n
while (count && miniblockCount) {
let bits = (BigInt(view.getUint8(reader.offset)) >> bitpackPos) & mask // TODO: don't re-read value every time
bitpackPos += bitWidth
while (bitpackPos >= 8) {
bitpackPos -= 8n
reader.offset++
bits |= (BigInt(view.getUint8(reader.offset)) << bitWidth - bitpackPos) & mask
}
const delta = minDelta + bits
value += delta
values[valueIndex++] = int32 ? Number(value) : value
count--
miniblockCount--
}
}
} else {
for (let j = 0; j < valuesPerMiniblock && valueIndex < nValues; j++, valueIndex++) {
values[valueIndex] = value
for (let j = 0; j < valuesPerMiniblock && valueIndex < nValues; j++) {
value += minDelta
values[valueIndex++] = int32 ? Number(value) : value
}
@@ -187,0 +179,0 @@ }
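The BigInt rewrite of deltaBinaryUnpack addresses the failure the old comments admitted ("only works for bitWidth < 31", "fails when data gets too large"): JavaScript's bitwise operators truncate Numbers to 32 bits, so wide bit-packed deltas were silently corrupted. A small demonstration of the difference:

```js
// Number bitwise ops wrap at 32 bits; BigInt ops are exact
console.log(0x7fffffff << 1)   // -2 (silent overflow)
console.log(0x7fffffffn << 1n) // 4294967294n (exact)
// the decoder therefore accumulates deltas as bigint and narrows
// with Number() only when filling an Int32Array output
let value = 0n
value += 5000000000n
console.log(Number(value)) // 5000000000
```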
@@ -10,5 +10,6 @@ /**
* @param {number} count - number of values to read
* @param {number | undefined} fixedLength - length of each fixed length byte array
* @returns {DecodedArray} array of values
*/
export function readPlain(reader, type, count) {
export function readPlain(reader, type, count, fixedLength) {
if (count === 0) return []
@@ -30,3 +31,4 @@ if (type === 'BOOLEAN') {
} else if (type === 'FIXED_LEN_BYTE_ARRAY') {
return readPlainByteArrayFixed(reader, count)
if (!fixedLength) throw new Error('parquet missing fixed length')
return readPlainByteArrayFixed(reader, count, fixedLength)
} else {
@@ -98,3 +100,3 @@ throw new Error(`parquet unhandled type: ${type}`)
const high = reader.view.getInt32(reader.offset + i * 12 + 8, true)
values[i] = (BigInt(high) << BigInt(32)) | low
values[i] = (BigInt(high) << 64n) | low
}
@@ -157,12 +159,14 @@ reader.offset += count * 12
* @param {DataReader} reader
* @param {number} count
* @param {number} fixedLength
* @returns {Uint8Array}
* @returns {Uint8Array[]}
*/
function readPlainByteArrayFixed(reader, fixedLength) {
reader.offset += fixedLength
return new Uint8Array(
reader.view.buffer,
reader.view.byteOffset + reader.offset - fixedLength,
fixedLength
)
function readPlainByteArrayFixed(reader, count, fixedLength) {
// assert(reader.view.byteLength - reader.offset >= count * fixedLength)
const values = new Array(count)
for (let i = 0; i < count; i++) {
values[i] = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, fixedLength)
reader.offset += fixedLength
}
return values
}
@@ -169,0 +173,0 @@
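The old readPlainByteArrayFixed also mismatched its call site: readPlain passed count into the fixedLength parameter, so a single wrongly-sized Uint8Array came back regardless of the number of values. The new version returns one view per value; note these are views over the page buffer, not copies, so they reflect later writes to the page:

```js
// each returned entry aliases the underlying page buffer
const page = new Uint8Array([1, 2, 3, 4])
const first = new Uint8Array(page.buffer, page.byteOffset, 2)
page[0] = 9
console.log(first[0]) // 9 — the view sees the mutation
```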
@@ -132,3 +132,3 @@ // TCompactProtocol types
result |= (byte & 0x7f) << shift
if ((byte & 0x80) === 0) {
if (!(byte & 0x80)) {
return result
@@ -147,11 +147,11 @@ }
function readVarBigInt(reader) {
let result = BigInt(0)
let shift = BigInt(0)
let result = 0n
let shift = 0n
while (true) {
const byte = BigInt(reader.view.getUint8(reader.offset++))
result |= (byte & BigInt(0x7f)) << shift
if ((byte & BigInt(0x80)) === BigInt(0)) {
const byte = reader.view.getUint8(reader.offset++)
result |= BigInt(byte & 0x7f) << shift
if (!(byte & 0x80)) {
return result
}
shift += BigInt(7)
shift += 7n
}
@@ -167,3 +167,3 @@ }
*/
export function readZigZag(reader) {
function readZigZag(reader) {
const zigzag = readVarInt(reader)
@@ -181,3 +181,3 @@ // convert zigzag to int
*/
function readZigZagBigInt(reader) {
export function readZigZagBigInt(reader) {
const zigzag = readVarBigInt(reader)
@@ -184,0 +184,0 @@ // convert zigzag to int
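readZigZagBigInt is now exported for the DELTA_BINARY_PACKED decoder. ZigZag encoding maps signed integers to unsigned ones (0, -1, 1, -2, 2 → 0, 1, 2, 3, 4) so small magnitudes stay short in varint form; a minimal standalone sketch of the decode step:

```js
// zigzag decode: 0→0, 1→-1, 2→1, 3→-2, 4→2, ...
function decodeZigZagBigInt(zigzag) {
  return (zigzag >> 1n) ^ -(zigzag & 1n)
}
console.log(decodeZigZagBigInt(3n)) // -2n
console.log(decodeZigZagBigInt(4n)) // 2n
```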
@@ -14,2 +14,3 @@ /**
if (obj instanceof Uint8Array) return Array.from(obj)
if (obj instanceof Date) return obj.toISOString()
if (obj instanceof Object) {
@@ -16,0 +17,0 @@ /** @type {Record<string, unknown>} */
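With the new DATE/TIME/INT96 conversions producing Date objects, toJson now serializes them as ISO 8601 strings instead of letting them fall through to the generic object branch, which would emit an empty object since a Date has no own enumerable keys:

```js
// Date values now serialize as ISO strings instead of {}
const date = new Date(86400000)
console.log(date.toISOString())       // 1970-01-02T00:00:00.000Z
console.log(Object.keys(date).length) // 0 — the object branch had nothing to copy
```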
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package