Comparing version 0.7.8 to 0.7.9
{ | ||
"name": "hyparquet", | ||
"version": "0.7.8", | ||
"version": "0.7.9", | ||
"description": "parquet file parser for javascript", | ||
@@ -31,4 +31,4 @@ "keywords": [ | ||
"@types/node": "20.12.7", | ||
"@typescript-eslint/eslint-plugin": "7.7.1", | ||
"@vitest/coverage-v8": "1.5.2", | ||
"@typescript-eslint/eslint-plugin": "7.8.0", | ||
"@vitest/coverage-v8": "1.5.3", | ||
"eslint": "8.57.0", | ||
@@ -40,4 +40,4 @@ "eslint-plugin-import": "2.29.1", | ||
"typescript": "5.4.5", | ||
"vitest": "1.5.2" | ||
"vitest": "1.5.3" | ||
} | ||
} |
@@ -6,3 +6,3 @@ import { assembleObjects } from './assemble.js' | ||
import { parquetHeader } from './header.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, schemaElement } from './schema.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired } from './schema.js' | ||
import { snappyUncompress } from './snappy.js' | ||
@@ -12,3 +12,3 @@ import { concat } from './utils.js' | ||
/** | ||
* @typedef {import('./types.js').SchemaElement} SchemaElement | ||
* @typedef {import('./types.js').SchemaTree} SchemaTree | ||
* @typedef {import('./types.js').ColumnMetaData} ColumnMetaData | ||
@@ -26,7 +26,7 @@ * @typedef {import('./types.js').Compressors} Compressors | ||
* @param {ColumnMetaData} columnMetadata column metadata | ||
* @param {SchemaElement[]} schema schema for the file | ||
* @param {SchemaTree[]} schemaPath schema path for the column | ||
* @param {Compressors} [compressors] custom decompressors | ||
* @returns {ArrayLike<any>} array of values | ||
*/ | ||
export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, schema, compressors) { | ||
export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, schemaPath, compressors) { | ||
/** @type {ArrayLike<any> | undefined} */ | ||
@@ -38,2 +38,3 @@ let dictionary = undefined | ||
const rowData = [] | ||
const { element } = schemaPath[schemaPath.length - 1] | ||
@@ -62,3 +63,3 @@ while (valuesSeen < rowGroup.num_rows) { | ||
) | ||
const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schema, columnMetadata) | ||
const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schemaPath, columnMetadata) | ||
valuesSeen += daph.num_values | ||
@@ -74,10 +75,12 @@ | ||
// Use repetition levels to construct lists | ||
const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]]) | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) | ||
const isNullable = columnMetadata && !isRequired(schemaPath.slice(0, 2)) | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) | ||
// convert primitive types to rich types | ||
values = convert(dataPage, element) | ||
values = assembleObjects( | ||
definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel, maxRepetitionLevel | ||
definitionLevels, repetitionLevels, values, isNullable, maxDefinitionLevel, maxRepetitionLevel | ||
) | ||
} else if (definitionLevels?.length) { | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) | ||
// Use definition levels to skip nulls | ||
@@ -89,6 +92,6 @@ values = [] | ||
dereferenceDictionary(dictionary, dataPage) | ||
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element) | ||
values = convert(dataPage, element) | ||
} else if (Array.isArray(dataPage)) { | ||
// convert primitive types to rich types | ||
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element) | ||
values = convert(dataPage, element) | ||
} else { | ||
@@ -111,3 +114,3 @@ values = dataPage // TODO: data page shouldn't be a fixed byte array? | ||
) | ||
dictionary = readDictionaryPage(page, diph, schema, columnMetadata) | ||
dictionary = readDictionaryPage(page, diph, columnMetadata) | ||
} else if (header.type === 'DATA_PAGE_V2') { | ||
@@ -118,8 +121,8 @@ const daph2 = header.data_page_header_v2 | ||
const { definitionLevels, repetitionLevels, value: dataPage } = readDataPageV2( | ||
compressedBytes, header, schema, columnMetadata, compressors | ||
compressedBytes, header, schemaPath, columnMetadata, compressors | ||
) | ||
valuesSeen += daph2.num_values | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) | ||
if (repetitionLevels.length) { | ||
@@ -168,3 +171,3 @@ dereferenceDictionary(dictionary, dataPage) | ||
* | ||
* @param {ColumnMetaData} columnMetadata column metadata | ||
* @param {ColumnMetaData} columnMetadata | ||
* @returns {number} byte offset | ||
@@ -171,0 +174,0 @@ */ |
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, schemaElement, skipDefinitionBytes } from './schema.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js' | ||
@@ -13,10 +13,10 @@ const skipNulls = false // TODO | ||
* @typedef {import("./types.d.ts").DataPageHeader} DataPageHeader | ||
* @typedef {import("./types.d.ts").SchemaElement} SchemaElement | ||
* @typedef {import("./types.d.ts").SchemaTree} SchemaTree | ||
* @param {Uint8Array} bytes raw page data (should already be decompressed) | ||
* @param {DataPageHeader} daph data page header | ||
* @param {SchemaElement[]} schema schema for the file | ||
* @param {ColumnMetaData} columnMetadata metadata for the column | ||
* @param {SchemaTree[]} schemaPath | ||
* @param {ColumnMetaData} columnMetadata | ||
* @returns {DataPage} definition levels, repetition levels, and array of values | ||
*/ | ||
export function readDataPage(bytes, daph, schema, columnMetadata) { | ||
export function readDataPage(bytes, daph, schemaPath, columnMetadata) { | ||
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) | ||
@@ -28,5 +28,3 @@ const reader = { view, offset: 0 } | ||
// repetition levels | ||
const repetitionLevels = readRepetitionLevels( | ||
reader, daph, schema, columnMetadata | ||
) | ||
const repetitionLevels = readRepetitionLevels(reader, daph, schemaPath) | ||
@@ -38,7 +36,7 @@ // definition levels | ||
// TODO: move into readDefinitionLevels | ||
if (skipNulls && !isRequired(schema, columnMetadata.path_in_schema)) { | ||
if (skipNulls && !isRequired(schemaPath)) { | ||
// skip_definition_bytes | ||
reader.offset += skipDefinitionBytes(daph.num_values) | ||
} else { | ||
const dl = readDefinitionLevels(reader, daph, schema, columnMetadata.path_in_schema) | ||
const dl = readDefinitionLevels(reader, daph, schemaPath) | ||
definitionLevels = dl.definitionLevels | ||
@@ -51,3 +49,3 @@ numNulls = dl.numNulls | ||
if (daph.encoding === 'PLAIN') { | ||
const { element } = schemaElement(schema, columnMetadata.path_in_schema) | ||
const { element } = schemaPath[schemaPath.length - 1] | ||
const utf8 = element.converted_type === 'UTF8' | ||
@@ -92,7 +90,6 @@ const plainObj = readPlain(reader, columnMetadata.type, nValues, utf8) | ||
* @param {DictionaryPageHeader} diph dictionary page header | ||
* @param {SchemaElement[]} schema schema for the file | ||
* @param {ColumnMetaData} columnMetadata metadata for the column | ||
* @param {ColumnMetaData} columnMetadata | ||
* @returns {ArrayLike<any>} array of values | ||
*/ | ||
export function readDictionaryPage(bytes, diph, schema, columnMetadata) { | ||
export function readDictionaryPage(bytes, diph, columnMetadata) { | ||
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) | ||
@@ -109,9 +106,8 @@ const reader = { view, offset: 0 } | ||
* @param {DataPageHeader} daph data page header | ||
* @param {SchemaElement[]} schema schema for the file | ||
* @param {ColumnMetaData} columnMetadata metadata for the column | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {any[]} repetition levels and number of bytes read | ||
*/ | ||
function readRepetitionLevels(reader, daph, schema, columnMetadata) { | ||
if (columnMetadata.path_in_schema.length > 1) { | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) | ||
function readRepetitionLevels(reader, daph, schemaPath) { | ||
if (schemaPath.length > 1) { | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) | ||
if (maxRepetitionLevel) { | ||
@@ -132,9 +128,8 @@ const bitWidth = widthFromMaxInt(maxRepetitionLevel) | ||
* @param {DataPageHeader} daph data page header | ||
* @param {SchemaElement[]} schema schema for the file | ||
* @param {string[]} path_in_schema path in the schema | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {DefinitionLevels} definition levels and number of bytes read | ||
*/ | ||
function readDefinitionLevels(reader, daph, schema, path_in_schema) { | ||
if (!isRequired(schema, path_in_schema)) { | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schema, path_in_schema) | ||
function readDefinitionLevels(reader, daph, schemaPath) { | ||
if (!isRequired(schemaPath)) { | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) | ||
const bitWidth = widthFromMaxInt(maxDefinitionLevel) | ||
@@ -141,0 +136,0 @@ if (bitWidth) { |
import { decompressPage } from './column.js' | ||
import { readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, schemaElement } from './schema.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js' | ||
import { readVarInt, readZigZag } from './thrift.js' | ||
@@ -13,11 +13,11 @@ | ||
* @typedef {import("./types.d.ts").DataPageHeaderV2} DataPageHeaderV2 | ||
* @typedef {import("./types.d.ts").SchemaElement} SchemaElement | ||
* @typedef {import("./types.d.ts").SchemaTree} SchemaTree | ||
* @param {Uint8Array} compressedBytes raw page data (should already be decompressed) | ||
* @param {import("./types.d.ts").PageHeader} ph page header | ||
* @param {SchemaElement[]} schema schema for the file | ||
* @param {ColumnMetaData} columnMetadata metadata for the column | ||
* @param {SchemaTree[]} schemaPath | ||
* @param {ColumnMetaData} columnMetadata | ||
* @param {Compressors | undefined} compressors | ||
* @returns {DataPage} definition levels, repetition levels, and array of values | ||
*/ | ||
export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, compressors) { | ||
export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, compressors) { | ||
const view = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength) | ||
@@ -32,3 +32,3 @@ const reader = { view, offset: 0 } | ||
// repetition levels | ||
const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schema, columnMetadata) | ||
const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schemaPath) | ||
@@ -40,3 +40,3 @@ if (reader.offset !== daph2.repetition_levels_byte_length) { | ||
// definition levels | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) | ||
const definitionLevels = readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) | ||
@@ -53,3 +53,3 @@ | ||
if (daph2.encoding === 'PLAIN') { | ||
const { element } = schemaElement(schema, columnMetadata.path_in_schema) | ||
const { element } = schemaPath[schemaPath.length - 1] | ||
const utf8 = element.converted_type === 'UTF8' | ||
@@ -106,8 +106,7 @@ let page = compressedBytes.slice(reader.offset) | ||
* @param {DataPageHeaderV2} daph2 data page header | ||
* @param {SchemaElement[]} schema schema for the file | ||
* @param {ColumnMetaData} columnMetadata metadata for the column | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {any[]} repetition levels and number of bytes read | ||
*/ | ||
export function readRepetitionLevelsV2(reader, daph2, schema, columnMetadata) { | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) | ||
export function readRepetitionLevelsV2(reader, daph2, schemaPath) { | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) | ||
if (!maxRepetitionLevel) return [] | ||
@@ -127,3 +126,3 @@ | ||
* @param {DataPageHeaderV2} daph2 data page header v2 | ||
* @param {number} maxDefinitionLevel maximum definition level for this column | ||
* @param {number} maxDefinitionLevel | ||
* @returns {number[] | undefined} definition levels and number of bytes read | ||
@@ -130,0 +129,0 @@ */ |
@@ -12,3 +12,3 @@ import { readVarInt } from './thrift.js' | ||
function readPlainBoolean(reader, count) { | ||
const value = [] | ||
const values = new Array(count) | ||
for (let i = 0; i < count; i++) { | ||
@@ -18,6 +18,6 @@ const byteOffset = reader.offset + Math.floor(i / 8) | ||
const byte = reader.view.getUint8(byteOffset) | ||
value.push((byte & (1 << bitOffset)) !== 0) | ||
values[i] = (byte & (1 << bitOffset)) !== 0 | ||
} | ||
reader.offset += Math.ceil(count / 8) | ||
return value | ||
return values | ||
} | ||
@@ -33,8 +33,8 @@ | ||
function readPlainInt32(reader, count) { | ||
const value = [] | ||
const values = new Array(count) | ||
for (let i = 0; i < count; i++) { | ||
value.push(reader.view.getInt32(reader.offset + i * 4, true)) | ||
values[i] = reader.view.getInt32(reader.offset + i * 4, true) | ||
} | ||
reader.offset += count * 4 | ||
return value | ||
return values | ||
} | ||
@@ -50,8 +50,8 @@ | ||
function readPlainInt64(reader, count) { | ||
const value = [] | ||
const values = new Array(count) | ||
for (let i = 0; i < count; i++) { | ||
value.push(reader.view.getBigInt64(reader.offset + i * 8, true)) | ||
values[i] = reader.view.getBigInt64(reader.offset + i * 8, true) | ||
} | ||
reader.offset += count * 8 | ||
return value | ||
return values | ||
} | ||
@@ -67,10 +67,10 @@ | ||
function readPlainInt96(reader, count) { | ||
const value = [] | ||
const values = new Array(count) | ||
for (let i = 0; i < count; i++) { | ||
const low = reader.view.getBigInt64(reader.offset + i * 12, true) | ||
const high = reader.view.getInt32(reader.offset + i * 12 + 8, true) | ||
value.push((BigInt(high) << BigInt(32)) | low) | ||
values[i] = (BigInt(high) << BigInt(32)) | low | ||
} | ||
reader.offset += count * 12 | ||
return value | ||
return values | ||
} | ||
@@ -86,8 +86,8 @@ | ||
function readPlainFloat(reader, count) { | ||
const value = [] | ||
const values = new Array(count) | ||
for (let i = 0; i < count; i++) { | ||
value.push(reader.view.getFloat32(reader.offset + i * 4, true)) | ||
values[i] = reader.view.getFloat32(reader.offset + i * 4, true) | ||
} | ||
reader.offset += count * 4 | ||
return value | ||
return values | ||
} | ||
@@ -103,8 +103,8 @@ | ||
function readPlainDouble(reader, count) { | ||
const value = [] | ||
const values = new Array(count) | ||
for (let i = 0; i < count; i++) { | ||
value.push(reader.view.getFloat64(reader.offset + i * 8, true)) | ||
values[i] = reader.view.getFloat64(reader.offset + i * 8, true) | ||
} | ||
reader.offset += count * 8 | ||
return value | ||
return values | ||
} | ||
@@ -120,11 +120,10 @@ | ||
function readPlainByteArray(reader, count) { | ||
const value = [] | ||
const values = new Array(count) | ||
for (let i = 0; i < count; i++) { | ||
const length = reader.view.getInt32(reader.offset, true) | ||
reader.offset += 4 | ||
const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length) | ||
value.push(bytes) | ||
values[i] = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length) | ||
reader.offset += length | ||
} | ||
return value | ||
return values | ||
} | ||
@@ -210,3 +209,3 @@ | ||
/** @type {any[]} */ | ||
const value = [] | ||
const values = [] | ||
if (encoding === 'RLE') { | ||
@@ -217,3 +216,3 @@ let seen = 0 | ||
if (!rle.length) break // EOF | ||
concat(value, rle) | ||
concat(values, rle) | ||
seen += rle.length | ||
@@ -224,3 +223,3 @@ } | ||
} | ||
return value | ||
return values | ||
} | ||
@@ -247,5 +246,5 @@ | ||
/** @type {number[]} */ | ||
const value = [] | ||
const values = [] | ||
const startOffset = reader.offset | ||
while (reader.offset - startOffset < length && value.length < numValues) { | ||
while (reader.offset - startOffset < length && values.length < numValues) { | ||
const [header, newOffset] = readVarInt(reader.view, reader.offset) | ||
@@ -256,13 +255,13 @@ reader.offset = newOffset | ||
const rle = readRle(reader, header, width) | ||
concat(value, rle) | ||
concat(values, rle) | ||
} else { | ||
// bit-packed | ||
const bitPacked = readBitPacked( | ||
reader, header, width, numValues - value.length | ||
reader, header, width, numValues - values.length | ||
) | ||
concat(value, bitPacked) | ||
concat(values, bitPacked) | ||
} | ||
} | ||
return value | ||
return values | ||
} | ||
@@ -284,11 +283,11 @@ | ||
const width = (bitWidth + 7) >> 3 | ||
let readValue | ||
let value | ||
if (width === 1) { | ||
readValue = reader.view.getUint8(reader.offset) | ||
value = reader.view.getUint8(reader.offset) | ||
reader.offset++ | ||
} else if (width === 2) { | ||
readValue = reader.view.getUint16(reader.offset, true) | ||
value = reader.view.getUint16(reader.offset, true) | ||
reader.offset += 2 | ||
} else if (width === 4) { | ||
readValue = reader.view.getUint32(reader.offset, true) | ||
value = reader.view.getUint32(reader.offset, true) | ||
reader.offset += 4 | ||
@@ -300,7 +299,7 @@ } else { | ||
// repeat value count times | ||
const value = [] | ||
const values = new Array(count) | ||
for (let i = 0; i < count; i++) { | ||
value.push(readValue) | ||
values[i] = value | ||
} | ||
return value | ||
return values | ||
} | ||
@@ -321,3 +320,4 @@ | ||
let count = (header >> 1) << 3 | ||
const mask = maskForBits(bitWidth) | ||
// mask for bitWidth number of bits | ||
const mask = (1 << bitWidth) - 1 | ||
@@ -335,3 +335,3 @@ // Sometimes it tries to read outside of available memory, but it will be masked out anyway | ||
/** @type {number[]} */ | ||
const value = [] | ||
const values = [] | ||
@@ -353,3 +353,3 @@ // read values | ||
// emit value by shifting off to the right and masking | ||
value.push((data >> right) & mask) | ||
values.push((data >> right) & mask) | ||
remaining-- | ||
@@ -362,13 +362,3 @@ } | ||
return value | ||
return values | ||
} | ||
/** | ||
* Generate a mask for the given number of bits. | ||
* | ||
* @param {number} bits - number of bits for the mask | ||
* @returns {number} a mask for the given number of bits | ||
*/ | ||
function maskForBits(bits) { | ||
return (1 << bits) - 1 | ||
} |
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js' | ||
import { schemaElement } from './schema.js' | ||
import { getSchemaPath } from './schema.js' | ||
import { deserializeTCompactProtocol } from './thrift.js' | ||
@@ -175,3 +175,3 @@ | ||
export function parquetSchema(metadata) { | ||
return schemaElement(metadata.schema, []) | ||
return getSchemaPath(metadata.schema, [])[0] | ||
} | ||
@@ -178,0 +178,0 @@ |
import { getColumnOffset, readColumn } from './column.js' | ||
import { parquetMetadataAsync } from './metadata.js' | ||
import { getColumnName, isMapLike } from './schema.js' | ||
import { getColumnName, getSchemaPath, isMapLike } from './schema.js' | ||
import { concat } from './utils.js' | ||
@@ -78,3 +78,3 @@ | ||
* @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed | ||
* @param {Compressors} [options.compressors] custom decompressors | ||
* @param {Compressors} [options.compressors] | ||
* @param {RowGroup} rowGroup row group to read | ||
@@ -92,5 +92,4 @@ * @param {number} groupStart row index of the first row in the group | ||
if (!columnMetadata) throw new Error('parquet column metadata is undefined') | ||
const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema) | ||
// skip columns that are not requested | ||
if (columns && !columns.includes(columnName)) return | ||
if (columns && !columns.includes(getColumnName(columnMetadata.path_in_schema))) return | ||
@@ -106,5 +105,5 @@ const startByte = getColumnOffset(columnMetadata) | ||
} | ||
// if row group size is less than 128mb, pre-load in one read | ||
// if row group size is less than 32mb, pre-load in one read | ||
let groupBuffer | ||
if (groupEndByte - groupStartByte <= 1 << 27) { | ||
if (groupEndByte - groupStartByte <= 1 << 25) { | ||
// pre-load row group byte data in one big read, | ||
@@ -126,4 +125,3 @@ // otherwise read column data individually | ||
// skip columns that are not requested | ||
const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema) | ||
// skip columns that are not requested | ||
const columnName = getColumnName(columnMetadata.path_in_schema) | ||
if (columns && !columns.includes(columnName)) continue | ||
@@ -157,6 +155,6 @@ | ||
promises.push(buffer.then(arrayBuffer => { | ||
// TODO: extract SchemaElement for this column | ||
const schemaPath = getSchemaPath(metadata.schema, columnMetadata.path_in_schema) | ||
/** @type {ArrayLike<any> | undefined} */ | ||
let columnData = readColumn( | ||
arrayBuffer, bufferOffset, rowGroup, columnMetadata, metadata.schema, compressors | ||
arrayBuffer, bufferOffset, rowGroup, columnMetadata, schemaPath, compressors | ||
) | ||
@@ -167,3 +165,3 @@ if (columnData.length !== Number(rowGroup.num_rows)) { | ||
if (isMapLike(metadata.schema, columnMetadata.path_in_schema)) { | ||
if (isMapLike(schemaPath)) { | ||
const name = columnMetadata.path_in_schema.slice(0, -2).join('.') | ||
@@ -195,6 +193,2 @@ if (!maps.has(name)) { | ||
} | ||
if (keys[i][j] instanceof Uint8Array) { | ||
// decode utf-8 keys | ||
keys[i][j] = new TextDecoder().decode(keys[i][j]) | ||
} | ||
if (!keys[i][j]) continue | ||
@@ -201,0 +195,0 @@ obj[keys[i][j]] = values[i][j] === undefined ? null : values[i][j] |
@@ -31,35 +31,29 @@ /** | ||
/** | ||
* Get the schema element with the given name. | ||
* Get schema elements from the root to the given element name. | ||
* | ||
* @param {SchemaElement[]} schema | ||
* @param {string[]} name path to the element | ||
* @returns {SchemaTree} schema element | ||
* @returns {SchemaTree[]} list of schema elements | ||
*/ | ||
export function schemaElement(schema, name) { | ||
export function getSchemaPath(schema, name) { | ||
let tree = schemaTree(schema, 0) | ||
// traverse the tree to find the element | ||
const path = [tree] | ||
for (const part of name) { | ||
const child = tree.children.find(child => child.element.name === part) | ||
if (!child) throw new Error(`parquet schema element not found: ${name}`) | ||
path.push(child) | ||
tree = child | ||
} | ||
return tree | ||
return path | ||
} | ||
/** | ||
* Check if the schema element with the given name is required. | ||
* An element is required if all of its ancestors are required. | ||
* Check if the schema path and all its ancestors are required. | ||
* | ||
* @param {SchemaElement[]} schema | ||
* @param {string[]} name path to the element | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {boolean} true if the element is required | ||
*/ | ||
export function isRequired(schema, name) { | ||
/** @type {SchemaTree | undefined} */ | ||
let tree = schemaTree(schema, 0) | ||
for (let i = 0; i < name.length; i++) { | ||
// Find schema child with the given name | ||
tree = tree.children.find(child => child.element.name === name[i]) | ||
if (!tree) throw new Error(`parquet schema element not found: ${name}`) | ||
if (tree.element.repetition_type !== 'REQUIRED') { | ||
export function isRequired(schemaPath) { | ||
for (const { element } of schemaPath.slice(1)) { | ||
if (element.repetition_type !== 'REQUIRED') { | ||
return false | ||
@@ -74,14 +68,12 @@ } | ||
* | ||
* @param {SchemaElement[]} schema | ||
* @param {string[]} parts path to the element | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {number} max repetition level | ||
*/ | ||
export function getMaxRepetitionLevel(schema, parts) { | ||
export function getMaxRepetitionLevel(schemaPath) { | ||
let maxLevel = 0 | ||
parts.forEach((part, i) => { | ||
const { element } = schemaElement(schema, parts.slice(0, i + 1)) | ||
for (const { element } of schemaPath) { | ||
if (element.repetition_type === 'REPEATED') { | ||
maxLevel++ | ||
} | ||
}) | ||
} | ||
return maxLevel | ||
@@ -93,14 +85,12 @@ } | ||
* | ||
* @param {SchemaElement[]} schema | ||
* @param {string[]} parts path to the element | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {number} max definition level | ||
*/ | ||
export function getMaxDefinitionLevel(schema, parts) { | ||
export function getMaxDefinitionLevel(schemaPath) { | ||
let maxLevel = 0 | ||
parts.forEach((part, i) => { | ||
const { element } = schemaElement(schema, parts.slice(0, i + 1)) | ||
for (const { element } of schemaPath.slice(1)) { | ||
if (element.repetition_type !== 'REQUIRED') { | ||
maxLevel++ | ||
} | ||
}) | ||
} | ||
return maxLevel | ||
@@ -126,13 +116,12 @@ } | ||
/** | ||
* Get the column name as foo.bar and handle list-like columns. | ||
* @param {SchemaElement[]} schema | ||
* Get the column name as foo.bar and handle list and map like columns. | ||
* | ||
* @param {string[]} path | ||
* @returns {string} column name | ||
*/ | ||
export function getColumnName(schema, path) { | ||
if (isListLike(schema, path) || isMapLike(schema, path)) { | ||
return path.slice(0, -2).join('.') | ||
} else { | ||
return path.join('.') | ||
} | ||
export function getColumnName(path) { | ||
return path.join('.') | ||
.replace(/(\.list\.element)+/g, '') | ||
.replace(/\.key_value\.key/g, '') | ||
.replace(/\.key_value\.value/g, '') | ||
} | ||
@@ -143,9 +132,8 @@ | ||
* | ||
* @param {SchemaElement[]} schemaElements parquet schema elements | ||
* @param {string[]} path column path | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {boolean} true if map-like | ||
*/ | ||
function isListLike(schemaElements, path) { | ||
const schema = schemaElement(schemaElements, path.slice(0, -2)) | ||
if (path.length < 3) return false | ||
export function isListLike(schemaPath) { | ||
const schema = schemaPath.at(-3) | ||
if (!schema) return false | ||
if (schema.element.converted_type !== 'LIST') return false | ||
@@ -167,9 +155,8 @@ if (schema.children.length > 1) return false | ||
* | ||
* @param {SchemaElement[]} schemaElements parquet schema elements | ||
* @param {string[]} path column path | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {boolean} true if map-like | ||
*/ | ||
export function isMapLike(schemaElements, path) { | ||
const schema = schemaElement(schemaElements, path.slice(0, -2)) | ||
if (path.length < 3) return false | ||
export function isMapLike(schemaPath) { | ||
const schema = schemaPath.at(-3) | ||
if (!schema) return false | ||
if (schema.element.converted_type !== 'MAP') return false | ||
@@ -176,0 +163,0 @@ if (schema.children.length > 1) return false |
@@ -87,9 +87,9 @@ // TCompactProtocol types | ||
index = listIndex | ||
const listValues = [] | ||
const values = new Array(listSize) | ||
for (let i = 0; i < listSize; i++) { | ||
let listElem | ||
[listElem, index] = readElement(view, elemType, index) | ||
listValues.push(listElem) | ||
values[i] = listElem | ||
} | ||
return [listValues, index] | ||
return [values, index] | ||
} | ||
@@ -96,0 +96,0 @@ case CompactType.STRUCT: { |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
94837
2468