Comparing version 0.7.10 to 0.7.11
 {
 "name": "hyparquet",
-"version": "0.7.10",
+"version": "0.7.11",
 "description": "parquet file parser for javascript",
@@ -30,3 +30,3 @@ "keywords": [
 "devDependencies": {
-"@types/node": "20.12.7",
+"@types/node": "20.12.8",
 "@typescript-eslint/eslint-plugin": "7.8.0",
@@ -33,0 +33,0 @@ "@vitest/coverage-v8": "1.5.3",

@@ -59,3 +59,3 @@ import { assembleObjects } from './assemble.js'
 )
-const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schemaPath, columnMetadata)
+const { definitionLevels, repetitionLevels, dataPage } = readDataPage(page, daph, schemaPath, columnMetadata)
 valuesSeen += daph.num_values
@@ -66,3 +66,3 @@
 // construct output values: skip nulls and construct lists
-/** @type {any[]} */
+/** @type {DecodedArray} */
 let values
@@ -114,3 +114,3 @@ if (repetitionLevels.length) {
-const { definitionLevels, repetitionLevels, value: dataPage } = readDataPageV2(
+const { definitionLevels, repetitionLevels, dataPage } = readDataPageV2(
 compressedBytes, header, schemaPath, columnMetadata, compressors
@@ -151,4 +151,5 @@ )
 *
+* @typedef {import('./types.js').DecodedArray} DecodedArray
 * @param {ArrayLike<any> | undefined} dictionary
-* @param {number[]} dataPage
+* @param {DecodedArray} dataPage
 */
@@ -155,0 +156,0 @@ function dereferenceDictionary(dictionary, dataPage) {

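In the column reader (the module importing `assembleObjects` from './assemble.js'), the page readers' result field is renamed from `value` to `dataPage`, and `dereferenceDictionary` now accepts any `DecodedArray` rather than only `number[]`. Its body is not part of this diff; a minimal sketch of a dictionary dereference consistent with the new signature (the `Sketch` suffix marks it as illustrative, not the library's code):

```js
/**
 * Map dictionary indices in a data page to dictionary values.
 * Sketch only — the actual implementation is not shown in this diff.
 *
 * @param {ArrayLike<any> | undefined} dictionary
 * @param {import('./types.js').DecodedArray} dataPage page of dictionary indices
 * @returns {import('./types.js').DecodedArray} dereferenced values
 */
function dereferenceDictionarySketch(dictionary, dataPage) {
  if (!dictionary) return dataPage
  // indexed loop rather than .map: calling .map on a typed array would produce
  // another typed array, which could not hold non-numeric dictionary values
  const output = new Array(dataPage.length)
  for (let i = 0; i < dataPage.length; i++) {
    output[i] = dictionary[dataPage[i]]
  }
  return output
}
```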
@@ -1,2 +0,2 @@
-const dayMillis = 86400000000000 // 1 day in ms
+const dayMillis = 86400000000000 // 1 day in milliseconds
@@ -6,7 +6,9 @@ /**
 *
-* @param {any[]} data series of primitive types
+* @typedef {import('./types.js').DecodedArray} DecodedArray
+* @param {DecodedArray} data series of primitive types
 * @param {import('./types.js').SchemaElement} schemaElement schema element for the data
-* @returns {any[]} series of rich types
+* @returns {DecodedArray} series of rich types
 */
 export function convert(data, schemaElement) {
+if (!Array.isArray(data)) return data
 const ctype = schemaElement.converted_type
@@ -13,0 +15,0 @@ if (ctype === 'UTF8') {

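The new `if (!Array.isArray(data)) return data` guard means typed arrays pass through `convert` untouched; only plain arrays reach the converted-type handling below it. A small sketch of the observable behavior (the schema element here is invented for illustration):

```js
import { convert } from './convert.js'

// hypothetical schema element with no converted_type
const schemaElement = { name: 'values', type: 'INT32' }

const typed = new Int32Array([1, 2, 3])
console.log(convert(typed, schemaElement) === typed) // true — returned as-is

const plain = [1, 2, 3]
convert(plain, schemaElement) // plain arrays continue into the converted-type checks
```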
 import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
 import { readPlain } from './plain.js'
-import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js'
+import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired } from './schema.js'
-const skipNulls = false // TODO
 /**
 * Read a data page from the given Uint8Array.
 *
-* @typedef {{ definitionLevels: number[], numNulls: number }} DefinitionLevels
 * @typedef {import("./types.d.ts").DataPage} DataPage
@@ -15,2 +12,3 @@ * @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData
 * @typedef {import("./types.d.ts").SchemaTree} SchemaTree
+* @typedef {import("./types.d.ts").DecodedArray} DecodedArray
 * @param {Uint8Array} bytes raw page data (should already be decompressed)
@@ -25,4 +23,4 @@ * @param {DataPageHeader} daph data page header
 const reader = { view, offset: 0 }
-/** @type {any[]} */
-let values = []
+/** @type {DecodedArray} */
+let dataPage = []
@@ -33,14 +31,3 @@ // repetition levels
 // definition levels
-let definitionLevels = undefined
-let numNulls = 0
-// let maxDefinitionLevel = -1
-// TODO: move into readDefinitionLevels
-if (skipNulls && !isRequired(schemaPath)) {
-// skip_definition_bytes
-reader.offset += skipDefinitionBytes(daph.num_values)
-} else {
-const dl = readDefinitionLevels(reader, daph, schemaPath)
-definitionLevels = dl.definitionLevels
-numNulls = dl.numNulls
-}
+const { definitionLevels, numNulls } = readDefinitionLevels(reader, daph, schemaPath)
@@ -53,3 +40,3 @@ // read values based on encoding
 const plainObj = readPlain(reader, columnMetadata.type, nValues, utf8)
-values = Array.isArray(plainObj) ? plainObj : Array.from(plainObj)
+dataPage = plainObj
 } else if (
@@ -70,7 +57,7 @@ daph.encoding === 'PLAIN_DICTIONARY' ||
 if (bitWidth) {
-values = new Array(nValues)
-readRleBitPackedHybrid(reader, bitWidth, view.byteLength - reader.offset, values)
+dataPage = new Array(nValues)
+readRleBitPackedHybrid(reader, bitWidth, view.byteLength - reader.offset, dataPage)
 } else {
 // nval zeros
-values = new Array(nValues).fill(0)
+dataPage = new Array(nValues).fill(0)
 }
@@ -81,3 +68,3 @@ } else {
-return { definitionLevels, repetitionLevels, value: values }
+return { definitionLevels, repetitionLevels, dataPage }
 }
@@ -128,3 +115,3 @@
 * @param {SchemaTree[]} schemaPath
-* @returns {DefinitionLevels} definition levels and number of bytes read
+* @returns {{ definitionLevels: number[], numNulls: number }} definition levels
 */
@@ -131,0 +118,0 @@ function readDefinitionLevels(reader, daph, schemaPath) {

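Two simplifications in the data page reader: the dead `skipNulls` branch goes away (along with the `skipDefinitionBytes` import), and the result of `readPlain` is no longer coerced with `Array.from`. That coercion would have copied every typed array back into a plain array; a quick sketch of the cost it avoided (the size is illustrative):

```js
// Before: typed-array output of readPlain was copied element by element.
const decoded = new Float64Array(1_000_000)   // e.g. the result of a PLAIN read
const copied = Array.from(decoded)            // a second million-element plain array
// After: the typed array itself becomes the data page — no copy.
const dataPage = decoded
```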
@@ -26,3 +26,3 @@ import { decompressPage } from './column.js'
 /** @type {any} */
-let values = []
+let dataPage = []
@@ -60,3 +60,3 @@ const daph2 = ph.data_page_header_v2
 const pageReader = { view: pageView, offset: 0 }
-values = readPlain(pageReader, columnMetadata.type, nValues, utf8)
+dataPage = readPlain(pageReader, columnMetadata.type, nValues, utf8)
 } else if (daph2.encoding === 'RLE') {
@@ -70,4 +70,4 @@ const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
 const pageReader = { view: pageView, offset: 4 }
-values = new Array(nValues)
-readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, values)
+dataPage = new Array(nValues)
+readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, dataPage)
 }
@@ -83,4 +83,4 @@ } else if (
 const pageReader = { view: pageView, offset: 1 }
-values = new Array(nValues)
-readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, values)
+dataPage = new Array(nValues)
+readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, dataPage)
 } else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
@@ -90,3 +90,3 @@ if (daph2.num_nulls) throw new Error('parquet delta-int not supported')
 const page = decompressPage(compressedBytes, uncompressedPageSize, codec, compressors)
-deltaBinaryUnpack(page, nValues, values)
+deltaBinaryUnpack(page, nValues, dataPage)
 } else {
@@ -96,3 +96,3 @@ throw new Error(`parquet unsupported encoding: ${daph2.encoding}`)
-return { definitionLevels, repetitionLevels, value: values }
+return { definitionLevels, repetitionLevels, dataPage }
 }
@@ -99,0 +99,0 @@

@@ -19,6 +19,7 @@ import { readVarInt } from './thrift.js'
 * @typedef {import("./types.d.ts").DataReader} DataReader
+* @typedef {number[]} DecodedArray
 * @param {DataReader} reader - buffer to read data from
 * @param {number} width - width of each bit-packed group
 * @param {number} length - length of the encoded data
-* @param {number[]} values - output array
+* @param {DecodedArray} values - output array
 */
@@ -56,3 +57,3 @@ export function readRleBitPackedHybrid(reader, width, length, values) {
 * @param {number} bitWidth - width of each bit-packed group
-* @param {number[]} values - output array
+* @param {DecodedArray} values - output array
 * @param {number} seen - number of values seen so far
@@ -59,0 +60,0 @@ */

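The encoding module's change is JSDoc-only: it declares a local `DecodedArray` alias (`number[]`) and widens the output parameters to it. For context, the `bitWidth` handed to `readRleBitPackedHybrid` by the data page readers comes from `widthFromMaxInt`, i.e. the number of bits needed to represent the maximum level; a hedged sketch of that computation (the real body of `widthFromMaxInt` is not part of this diff):

```js
// Sketch: bits needed to represent maxInt, e.g. 0 -> 0, 1 -> 1, 2..3 -> 2, 4..7 -> 3
function widthFromMaxIntSketch(maxInt) {
  return Math.ceil(Math.log2(maxInt + 1))
}

console.log(widthFromMaxIntSketch(1)) // 1
console.log(widthFromMaxIntSketch(7)) // 3
```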
@@ -26,6 +26,11 @@ /**
 * @param {number} count - number of values to read
-* @returns {number[]} array of int32 values
+* @returns {Int32Array} array of int32 values
 */
 function readPlainInt32(reader, count) {
-const values = new Array(count)
+if ((reader.view.byteOffset + reader.offset) % 4 === 0) {
+const values = new Int32Array(reader.view.buffer, reader.view.byteOffset + reader.offset, count)
+reader.offset += count * 4
+return values
+}
+const values = new Int32Array(count)
 for (let i = 0; i < count; i++) {
@@ -43,6 +48,11 @@ values[i] = reader.view.getInt32(reader.offset + i * 4, true)
 * @param {number} count - number of values to read
-* @returns {bigint[]} array of int64 values
+* @returns {BigInt64Array} array of int64 values
 */
 function readPlainInt64(reader, count) {
-const values = new Array(count)
+if ((reader.view.byteOffset + reader.offset) % 8 === 0) {
+const values = new BigInt64Array(reader.view.buffer, reader.view.byteOffset + reader.offset, count)
+reader.offset += count * 8
+return values
+}
+const values = new BigInt64Array(count)
 for (let i = 0; i < count; i++) {
@@ -78,9 +88,6 @@ values[i] = reader.view.getBigInt64(reader.offset + i * 8, true)
 * @param {number} count - number of values to read
-* @returns {number[]} array of float values
+* @returns {Float32Array} array of float values
 */
 function readPlainFloat(reader, count) {
-const values = new Array(count)
-for (let i = 0; i < count; i++) {
-values[i] = reader.view.getFloat32(reader.offset + i * 4, true)
-}
+const values = new Float32Array(reader.view.buffer, reader.view.byteOffset + reader.offset, count)
+reader.offset += count * 4
@@ -95,9 +102,6 @@ return values
 * @param {number} count - number of values to read
-* @returns {number[]} array of double values
+* @returns {Float64Array} array of double values
 */
 function readPlainDouble(reader, count) {
-const values = new Array(count)
-for (let i = 0; i < count; i++) {
-values[i] = reader.view.getFloat64(reader.offset + i * 8, true)
-}
+const values = new Float64Array(reader.view.buffer, reader.view.byteOffset + reader.offset, count)
+reader.offset += count * 8
@@ -144,2 +148,3 @@ return values
 *
+* @typedef {import("./types.d.ts").DecodedArray} DecodedArray
 * @typedef {import("./types.d.ts").ParquetType} ParquetType
@@ -150,3 +155,3 @@ * @param {DataReader} reader - buffer to read data from
 * @param {boolean} utf8 - whether to decode byte arrays as UTF-8
-* @returns {ArrayLike<any>} array of values
+* @returns {DecodedArray} array of values
 */
@@ -153,0 +158,0 @@ export function readPlain(reader, type, count, utf8) {

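The PLAIN readers now return typed-array views over the page buffer when the read position is aligned, falling back to an element-by-element copy for int32/int64 (the float and double readers in this diff construct the view unconditionally). The alignment check is required because a TypedArray view's byteOffset must be a multiple of its element size; a standalone sketch of the constraint and of the zero-copy behavior:

```js
const buffer = new ArrayBuffer(16)

// A typed-array view is only legal when byteOffset % BYTES_PER_ELEMENT === 0
const aligned = new Int32Array(buffer, 4, 3)   // fine: 4 is a multiple of 4
try {
  new Int32Array(buffer, 2, 3)                 // RangeError: offset must be a multiple of 4
} catch (e) {
  console.log(e instanceof RangeError)         // true
}

// The view aliases the buffer rather than copying it
new DataView(buffer).setInt32(4, 42, true)     // little-endian write, like the readers' getInt32 calls
console.log(aligned[0])                        // 42 on little-endian platforms
```

One trade-off worth noting: the old `getInt32(..., true)` reads were explicitly little-endian, while typed-array views use the platform's byte order; in practice the engines JavaScript runs on are little-endian, so the two agree.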
@@ -97,18 +97,2 @@ /**
-/**
-* Get the number of bytes to skip for definition levels.
-*
-* @param {number} num number of values
-* @returns {number} number of bytes to skip
-*/
-export function skipDefinitionBytes(num) {
-let byteLength = 6
-let n = num >>> 8
-while (n !== 0) {
-byteLength++
-n >>>= 7
-}
-return byteLength
-}
 /**
 * Get the column name as foo.bar and handle list and map like columns.
@@ -115,0 +99,0 @@ *

@@ -245,3 +245,11 @@ type Awaitable<T> = T | Promise<T>
 repetitionLevels: number[]
-value: any[]
+dataPage: DecodedArray
 }
+export type DecodedArray =
+Uint8Array |
+Int32Array |
+BigInt64Array |
+Float32Array |
+Float64Array |
+any[]
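The new `DecodedArray` union is what the page readers now pass around instead of `any[]`. Code that needs plain-array semantics has to branch on the union; a minimal sketch (the helper name is made up):

```js
/**
 * Hypothetical helper: normalize any DecodedArray member to a plain array.
 * @param {import('./types.js').DecodedArray} arr
 * @returns {any[]}
 */
function toPlainArray(arr) {
  return Array.isArray(arr) ? arr : Array.from(arr)
}

toPlainArray(new Float64Array([1.5, 2.5])) // [1.5, 2.5]
toPlainArray(['a', 'b'])                   // ['a', 'b'], returned unchanged
```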
@@ -27,4 +27,6 @@ /**
 * Concatenate two arrays fast.
 *
+* @typedef {import('./types.js').DecodedArray} DecodedArray
 * @param {any[]} aaa first array
-* @param {any[]} bbb second array
+* @param {DecodedArray} bbb second array
 */
@@ -31,0 +33,0 @@ export function concat(aaa, bbb) {
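`concat` keeps a plain array as its destination but now accepts any `DecodedArray` as the source. Its body is not shown here; one way such a concat can handle both plain and typed inputs without hitting argument-count limits on huge pages is to push in chunks — a sketch only, not necessarily the library's implementation:

```js
/**
 * Append all of bbb onto aaa. Sketch; the chunk size is arbitrary.
 * @param {any[]} aaa first (destination) array
 * @param {import('./types.js').DecodedArray} bbb second array
 */
function concatSketch(aaa, bbb) {
  const chunk = 10000
  for (let i = 0; i < bbb.length; i += chunk) {
    // .slice and spread work on both plain arrays and typed arrays;
    // chunking avoids "too many arguments" failures on very large pages
    aaa.push(...bbb.slice(i, i + chunk))
  }
}

const out = []
concatSketch(out, new Int32Array([1, 2, 3]))
concatSketch(out, [4, 5])
console.log(out) // [1, 2, 3, 4, 5]
```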