Comparing version 0.9.8 to 0.9.9
{ | ||
"name": "hyparquet", | ||
"version": "0.9.8", | ||
"version": "0.9.9", | ||
"description": "parquet file parser for javascript", | ||
@@ -30,8 +30,8 @@ "keywords": [ | ||
"devDependencies": { | ||
"@types/node": "20.12.12", | ||
"@typescript-eslint/eslint-plugin": "7.11.0", | ||
"@types/node": "20.14.2", | ||
"@typescript-eslint/eslint-plugin": "7.12.0", | ||
"@vitest/coverage-v8": "1.6.0", | ||
"eslint": "8.57.0", | ||
"eslint-plugin-import": "2.29.1", | ||
"eslint-plugin-jsdoc": "48.2.6", | ||
"eslint-plugin-jsdoc": "48.2.9", | ||
"http-server": "14.1.1", | ||
@@ -38,0 +38,0 @@ "hyparquet-compressors": "0.1.4", |
@@ -11,2 +11,3 @@ import { isListLike, isMapLike } from './schema.js' | ||
* @typedef {import('./types.d.ts').FieldRepetitionType} FieldRepetitionType | ||
* @param {any[]} output | ||
* @param {number[] | undefined} definitionLevels | ||
@@ -17,12 +18,9 @@ * @param {number[]} repetitionLevels | ||
* @param {number} maxDefinitionLevel definition level that corresponds to non-null | ||
* @param {number} maxRepetitionLevel repetition level that corresponds to a new row | ||
* @returns {DecodedArray} array of values | ||
* @returns {any[]} | ||
*/ | ||
export function assembleLists( | ||
definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel | ||
output, definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel | ||
) { | ||
const n = definitionLevels?.length || repetitionLevels.length | ||
let valueIndex = 0 | ||
/** @type {any[]} */ | ||
const output = [] | ||
@@ -36,2 +34,14 @@ // Track state of nested structures | ||
if (repetitionLevels[0]) { | ||
// continue previous row | ||
while (currentDepth < repetitionPath.length - 2 && currentRepLevel < repetitionLevels[0]) { | ||
// go into last list | ||
currentContainer = currentContainer.at(-1) | ||
containerStack.push(currentContainer) | ||
currentDepth++ | ||
if (repetitionPath[currentDepth] !== 'REQUIRED') currentDefLevel++ | ||
if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel++ | ||
} | ||
} | ||
for (let i = 0; i < n; i++) { | ||
@@ -83,6 +93,3 @@ // assert(currentDefLevel === containerStack.length - 1) | ||
// Handle edge cases for empty inputs or single-level data | ||
if (output.length === 0) { | ||
if (values.length > 0 && maxRepetitionLevel === 0) { | ||
return values // flat list | ||
} | ||
if (!output.length) { | ||
// return max definition level of nested lists | ||
@@ -89,0 +96,0 @@ for (let i = 0; i < maxDefinitionLevel; i++) { |
@@ -6,3 +6,3 @@ import { assembleLists } from './assemble.js' | ||
import { parquetHeader } from './header.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js' | ||
import { getMaxDefinitionLevel } from './schema.js' | ||
import { concat } from './utils.js' | ||
@@ -16,3 +16,3 @@ | ||
* @param {import('./types.js').DataReader} reader | ||
* @param {import('./types.js').RowGroup} rowGroup row group metadata | ||
* @param {number} rowLimit maximum number of rows to read | ||
* @param {ColumnMetaData} columnMetadata column metadata | ||
@@ -23,11 +23,10 @@ * @param {import('./types.js').SchemaTree[]} schemaPath schema path for the column | ||
*/ | ||
export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compressors, utf8 }) { | ||
export function readColumn(reader, rowLimit, columnMetadata, schemaPath, { compressors, utf8 }) { | ||
const { element } = schemaPath[schemaPath.length - 1] | ||
/** @type {DecodedArray | undefined} */ | ||
let dictionary = undefined | ||
let seen = 0 | ||
/** @type {any[]} */ | ||
const rowData = [] | ||
while (seen < rowGroup.num_rows) { | ||
while (rowData.length < rowLimit) { | ||
// parse column header | ||
@@ -51,4 +50,3 @@ const header = parquetHeader(reader) | ||
const { definitionLevels, repetitionLevels, dataPage } = readDataPage(page, daph, schemaPath, columnMetadata) | ||
seen += daph.num_values | ||
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length)) | ||
// assert(!daph.statistics?.null_count || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length)) | ||
@@ -59,6 +57,5 @@ // convert types, dereference dictionary, and assemble lists | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) | ||
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type) | ||
values = assembleLists( | ||
definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel | ||
assembleLists( | ||
rowData, definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel | ||
) | ||
@@ -72,5 +69,4 @@ } else { | ||
} | ||
concat(rowData, values) | ||
} | ||
// assert(BigInt(values.length) === rowGroup.num_rows) | ||
concat(rowData, values) | ||
} else if (header.type === 'DATA_PAGE_V2') { | ||
@@ -83,3 +79,2 @@ const daph2 = header.data_page_header_v2 | ||
) | ||
seen += daph2.num_values | ||
@@ -90,9 +85,9 @@ // convert types, dereference dictionary, and assemble lists | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) | ||
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type) | ||
values = assembleLists( | ||
definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel | ||
assembleLists( | ||
rowData, definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel | ||
) | ||
} else { | ||
concat(rowData, values) | ||
} | ||
concat(rowData, values) | ||
} else if (header.type === 'DICTIONARY_PAGE') { | ||
@@ -111,4 +106,4 @@ const diph = header.dictionary_page_header | ||
} | ||
if (rowData.length !== Number(rowGroup.num_rows)) { | ||
throw new Error(`parquet row data length ${rowData.length} does not match row group length ${rowGroup.num_rows}}`) | ||
if (rowData.length < rowLimit) { | ||
throw new Error(`parquet row data length ${rowData.length} does not match row group limit ${rowLimit}}`) | ||
} | ||
@@ -122,10 +117,10 @@ return rowData | ||
* @param {ColumnMetaData} columnMetadata | ||
* @returns {number} byte offset | ||
* @returns {[bigint, bigint]} byte offset range | ||
*/ | ||
export function getColumnOffset({ dictionary_page_offset, data_page_offset }) { | ||
export function getColumnRange({ dictionary_page_offset, data_page_offset, total_compressed_size }) { | ||
let columnOffset = dictionary_page_offset | ||
if (!dictionary_page_offset || data_page_offset < dictionary_page_offset) { | ||
if (!columnOffset || data_page_offset < columnOffset) { | ||
columnOffset = data_page_offset | ||
} | ||
return Number(columnOffset) | ||
return [columnOffset, columnOffset + total_compressed_size] | ||
} |
@@ -28,3 +28,5 @@ import { bitWidth, byteStreamSplit, readRleBitPackedHybrid } from './encoding.js' | ||
const repetitionLevels = readRepetitionLevels(reader, daph, schemaPath) | ||
// assert(!repetitionLevels.length || repetitionLevels.length === daph.num_values) | ||
const { definitionLevels, numNulls } = readDefinitionLevels(reader, daph, schemaPath) | ||
// assert(!definitionLevels.length || definitionLevels.length === daph.num_values) | ||
@@ -31,0 +33,0 @@ // read values based on encoding |
@@ -89,3 +89,3 @@ import type { AsyncBuffer, Compressors, FileMetaData, SchemaTree } from './types.d.ts' | ||
*/ | ||
export function toJson(obj: any): unknown | ||
export function toJson(obj: any): any | ||
@@ -92,0 +92,0 @@ /** |
import { assembleNested } from './assemble.js' | ||
import { getColumnOffset, readColumn } from './column.js' | ||
import { getColumnRange, readColumn } from './column.js' | ||
import { parquetMetadataAsync } from './metadata.js' | ||
@@ -54,3 +54,4 @@ import { getSchemaPath } from './schema.js' | ||
// read row group | ||
const groupData = await readRowGroup(options, rowGroup, groupStart) | ||
const rowLimit = rowEnd && rowEnd - groupStart | ||
const groupData = await readRowGroup(options, rowGroup, groupStart, rowLimit) | ||
if (onComplete) { | ||
@@ -82,7 +83,9 @@ // filter to rows in range | ||
* @param {number} groupStart row index of the first row in the group | ||
* @param {number} [rowLimit] max rows to read from this group | ||
* @returns {Promise<any[][]>} resolves to row data | ||
*/ | ||
async function readRowGroup(options, rowGroup, groupStart) { | ||
export async function readRowGroup(options, rowGroup, groupStart, rowLimit) { | ||
const { file, metadata, columns } = options | ||
if (!metadata) throw new Error('parquet metadata not found') | ||
if (rowLimit === undefined || rowLimit > rowGroup.num_rows) rowLimit = Number(rowGroup.num_rows) | ||
@@ -96,6 +99,5 @@ // loop through metadata to find min/max bytes to read | ||
const startByte = getColumnOffset(columnMetadata) | ||
const endByte = startByte + Number(columnMetadata.total_compressed_size) | ||
groupStartByte = Math.min(groupStartByte, startByte) | ||
groupEndByte = Math.max(groupEndByte, endByte) | ||
const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number) | ||
groupStartByte = Math.min(groupStartByte, columnStartByte) | ||
groupEndByte = Math.max(groupEndByte, columnEndByte) | ||
}) | ||
@@ -130,4 +132,3 @@ if (groupStartByte >= groupEndByte && columns?.length) { | ||
const columnStartByte = getColumnOffset(columnMetadata) | ||
const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size) | ||
const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number) | ||
const columnBytes = columnEndByte - columnStartByte | ||
@@ -160,3 +161,3 @@ | ||
/** @type {any[] | undefined} */ | ||
let columnData = readColumn(reader, rowGroup, columnMetadata, schemaPath, options) | ||
let columnData = readColumn(reader, rowLimit, columnMetadata, schemaPath, options) | ||
// assert(columnData.length === Number(rowGroup.num_rows) | ||
@@ -163,0 +164,0 @@ |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
101954
2662