Comparing version 0.6.0 to 0.7.0
{ | ||
"name": "hyparquet", | ||
"version": "0.6.0", | ||
"version": "0.7.0", | ||
"description": "parquet file parser for javascript", | ||
@@ -30,13 +30,13 @@ "keywords": [ | ||
"devDependencies": { | ||
"@types/node": "20.11.21", | ||
"@typescript-eslint/eslint-plugin": "7.1.0", | ||
"@types/node": "20.11.27", | ||
"@typescript-eslint/eslint-plugin": "7.2.0", | ||
"@vitest/coverage-v8": "1.3.1", | ||
"eslint": "8.57.0", | ||
"eslint-plugin-import": "2.29.1", | ||
"eslint-plugin-jsdoc": "48.2.0", | ||
"eslint-plugin-jsdoc": "48.2.1", | ||
"http-server": "14.1.1", | ||
"hysnappy": "0.3.0", | ||
"typescript": "5.3.3", | ||
"typescript": "5.4.2", | ||
"vitest": "1.3.1" | ||
} | ||
} |
@@ -82,6 +82,6 @@ import { PageType } from './constants.js' | ||
dereferenceDictionary(dictionary, dataPage) | ||
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema)) | ||
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element) | ||
} else if (Array.isArray(dataPage)) { | ||
// convert primitive types to rich types | ||
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema)) | ||
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element) | ||
} else { | ||
@@ -88,0 +88,0 @@ values = dataPage // TODO: data page shouldn't be a fixed byte array? |
@@ -0,1 +1,5 @@ | ||
/** | ||
* @typedef {import('./types.js').ParquetType} ParquetTypeType | ||
* @type {ParquetTypeType[]} | ||
*/ | ||
export const ParquetType = [ | ||
@@ -31,2 +35,6 @@ 'BOOLEAN', | ||
/** | ||
* @typedef {import('./types.js').ConvertedType} ConvertedTypeType | ||
* @type {ConvertedTypeType[]} | ||
*/ | ||
export const ConvertedType = [ | ||
@@ -57,2 +65,24 @@ 'UTF8', | ||
/** | ||
* @typedef {import('./types.js').LogicalTypeType} LogicalTypeType | ||
* @type {LogicalTypeType[]} | ||
*/ | ||
export const logicalTypeType = [ | ||
'NULL', | ||
'STRING', | ||
'MAP', | ||
'LIST', | ||
'ENUM', | ||
'DECIMAL', | ||
'DATE', | ||
'TIME', | ||
'TIMESTAMP', | ||
'INTERVAL', | ||
'INTEGER', | ||
'NULL', | ||
'JSON', | ||
'BSON', | ||
'UUID', | ||
] | ||
export const CompressionCodec = [ | ||
@@ -59,0 +89,0 @@ 'UNCOMPRESSED', |
@@ -64,4 +64,4 @@ import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' | ||
if (daph.encoding === 'PLAIN') { | ||
const se = schemaElement(schema, columnMetadata.path_in_schema) | ||
const utf8 = se.converted_type === 'UTF8' | ||
const { element } = schemaElement(schema, columnMetadata.path_in_schema) | ||
const utf8 = element.converted_type === 'UTF8' | ||
const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8) | ||
@@ -68,0 +68,0 @@ values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value) |
@@ -50,4 +50,4 @@ import { decompressPage } from './column.js' | ||
if (daph2.encoding === 'PLAIN') { | ||
const se = schemaElement(schema, columnMetadata.path_in_schema) | ||
const utf8 = se.converted_type === 'UTF8' | ||
const { element } = schemaElement(schema, columnMetadata.path_in_schema) | ||
const utf8 = element.converted_type === 'UTF8' | ||
let page = compressedBytes.slice(offset) | ||
@@ -54,0 +54,0 @@ if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') { |
@@ -17,3 +17,3 @@ export { AsyncBuffer, FileMetaData, SchemaTree } from './types' | ||
* @param {FileMetaData} [options.metadata] parquet file metadata | ||
* @param {number[]} [options.columns] columns to read, all columns if undefined | ||
* @param {string[]} [options.columns] columns to read, all columns if undefined | ||
* @param {number} [options.rowStart] first requested row index (inclusive) | ||
@@ -96,3 +96,3 @@ * @param {number} [options.rowEnd] last requested row index (exclusive) | ||
metadata?: FileMetaData // parquet metadata, will be parsed if not provided | ||
columns?: number[] // columns to read, all columns if undefined | ||
columns?: string[] // columns to read, all columns if undefined | ||
rowStart?: number // inclusive | ||
@@ -109,6 +109,6 @@ rowEnd?: number // exclusive | ||
export interface ColumnData { | ||
column: number | ||
data: ArrayLike<any> | ||
columnName: string | ||
columnData: ArrayLike<any> | ||
rowStart: number | ||
rowEnd: number | ||
} |
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js' | ||
import { schemaTree } from './schema.js' | ||
import { schemaElement } from './schema.js' | ||
import { deserializeTCompactProtocol } from './thrift.js' | ||
@@ -108,2 +108,3 @@ | ||
field_id: field.field_9, | ||
logical_type: logicalType(field.field_10), | ||
})) | ||
@@ -173,3 +174,32 @@ const num_rows = metadata.field_3 | ||
export function parquetSchema(metadata) { | ||
return schemaTree(metadata.schema, 0) | ||
return schemaElement(metadata.schema, []) | ||
} | ||
/** | ||
* Parse logical type by type. | ||
* | ||
* @typedef {import("./types.d.ts").LogicalType} LogicalType | ||
* @param {any} logicalType | ||
* @returns {LogicalType | undefined} | ||
*/ | ||
function logicalType(logicalType) { | ||
if (logicalType?.field_5) { | ||
return { | ||
logicalType: 'DECIMAL', | ||
scale: logicalType.field_5.field_1, | ||
precision: logicalType.field_5.field_2, | ||
} | ||
} | ||
// TODO: TimestampType | ||
// TODO: TimeType | ||
if (logicalType?.field_10) { | ||
return { | ||
logicalType: 'INTEGER', | ||
bitWidth: logicalType.field_10.field_1, | ||
isSigned: logicalType.field_10.field_2, | ||
} | ||
} | ||
if (logicalType) { | ||
return logicalType | ||
} | ||
} |
import { getColumnOffset, readColumn } from './column.js' | ||
import { parquetMetadataAsync } from './metadata.js' | ||
import { getColumnName, isMapLike } from './schema.js' | ||
@@ -23,3 +24,3 @@ /** | ||
* @param {FileMetaData} [options.metadata] parquet file metadata | ||
* @param {number[]} [options.columns] columns to read, all columns if undefined | ||
* @param {string[]} [options.columns] columns to read, all columns if undefined | ||
* @param {number} [options.rowStart] first requested row index (inclusive) | ||
@@ -73,3 +74,3 @@ * @param {number} [options.rowEnd] last requested row index (exclusive) | ||
* @param {FileMetaData} [options.metadata] parquet file metadata | ||
* @param {number[]} [options.columns] columns to read, all columns if undefined | ||
* @param {string[]} [options.columns] columns to read, all columns if undefined | ||
* @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range. | ||
@@ -87,14 +88,16 @@ * @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed | ||
let [groupStartByte, groupEndByte] = [file.byteLength, 0] | ||
rowGroup.columns.forEach((columnChunk, columnIndex) => { | ||
// skip columns that are not requested or lack metadata | ||
if (columns && !columns.includes(columnIndex)) return | ||
if (!columnChunk.meta_data) return | ||
rowGroup.columns.forEach(({ meta_data: columnMetadata }) => { | ||
if (!columnMetadata) throw new Error('parquet column metadata is undefined') | ||
const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema) | ||
// skip columns that are not requested | ||
if (columns && !columns.includes(columnName)) return | ||
const startByte = getColumnOffset(columnChunk.meta_data) | ||
const endByte = startByte + Number(columnChunk.meta_data.total_compressed_size) | ||
const startByte = getColumnOffset(columnMetadata) | ||
const endByte = startByte + Number(columnMetadata.total_compressed_size) | ||
groupStartByte = Math.min(groupStartByte, startByte) | ||
groupEndByte = Math.max(groupEndByte, endByte) | ||
}) | ||
if (groupStartByte >= groupEndByte) { | ||
throw new Error('parquet missing row group metadata') | ||
if (groupStartByte >= groupEndByte && columns?.length) { | ||
// TODO: should throw if any column is missing | ||
throw new Error(`parquet columns not found: ${columns.join(', ')}`) | ||
} | ||
@@ -112,10 +115,14 @@ // if row group size is less than 128mb, pre-load in one read | ||
const promises = [] | ||
const maps = new Map() | ||
let outputColumnIndex = 0 | ||
// read column data | ||
for (let columnIndex = 0; columnIndex < rowGroup.columns.length; columnIndex++) { | ||
// skip columns that are not requested | ||
if (columns && !columns.includes(columnIndex)) continue | ||
const columnMetadata = rowGroup.columns[columnIndex].meta_data | ||
if (!columnMetadata) throw new Error('parquet column metadata is undefined') | ||
// skip columns that are not requested | ||
const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema) | ||
// skip columns that are not requested | ||
if (columns && !columns.includes(columnName)) continue | ||
const columnStartByte = getColumnOffset(columnMetadata) | ||
@@ -129,2 +136,3 @@ const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size) | ||
console.warn(`parquet skipping huge column "${columnMetadata.path_in_schema}" ${columnBytes.toLocaleString()} bytes`) | ||
// TODO: set column to new Error('parquet column too large') | ||
continue | ||
@@ -146,3 +154,4 @@ } | ||
// TODO: extract SchemaElement for this column | ||
const columnData = readColumn( | ||
/** @type {ArrayLike<any> | undefined} */ | ||
let columnData = readColumn( | ||
arrayBuffer, bufferOffset, rowGroup, columnMetadata, metadata.schema, compressors | ||
@@ -153,6 +162,40 @@ ) | ||
} | ||
if (isMapLike(metadata.schema, columnMetadata.path_in_schema)) { | ||
const name = columnMetadata.path_in_schema.slice(0, -2).join('.') | ||
if (!maps.has(name)) { | ||
maps.set(name, columnData) | ||
columnData = undefined // do not emit column data until both key and value are read | ||
} else { | ||
if (columnMetadata.path_in_schema[0] === 'key') { | ||
throw new Error('parquet map-like column key is not first') // TODO: support value-first | ||
} else { | ||
const values = columnData | ||
const keys = maps.get(name) | ||
const out = [] | ||
if (keys.length !== values.length) { | ||
throw new Error('parquet map-like column key/value length mismatch') | ||
} | ||
// assemble map-like column data | ||
for (let i = 0; i < keys.length; i++) { | ||
/** @type {Record<string, any>} */ | ||
const obj = {} | ||
for (let j = 0; j < keys[i].length; j++) { | ||
obj[keys[i][j]] = values[i][j] | ||
} | ||
out.push(obj) | ||
} | ||
columnData = out | ||
} | ||
maps.delete(name) | ||
} | ||
} | ||
// do not emit column data until structs are fully parsed | ||
if (!columnData) return | ||
// notify caller of column data | ||
if (options.onChunk) options.onChunk({ column: columnIndex, data: columnData, rowStart: 0, rowEnd: columnData.length }) | ||
if (options.onChunk) options.onChunk({ columnName, columnData, rowStart: 0, rowEnd: columnData.length }) | ||
// add column data to group data only if onComplete is defined | ||
if (options.onComplete) addColumn(groupData, columnIndex, columnData) | ||
if (options.onComplete) addColumn(groupData, outputColumnIndex, columnData) | ||
outputColumnIndex++ | ||
})) | ||
@@ -159,0 +202,0 @@ } |
@@ -13,3 +13,3 @@ /** | ||
*/ | ||
export function schemaTree(schema, rootIndex) { | ||
function schemaTree(schema, rootIndex) { | ||
const root = schema[rootIndex] | ||
@@ -36,3 +36,3 @@ const children = [] | ||
* @param {string[]} name path to the element | ||
* @returns {SchemaElement} schema element | ||
* @returns {SchemaTree} schema element | ||
*/ | ||
@@ -47,3 +47,3 @@ export function schemaElement(schema, name) { | ||
} | ||
return tree.element | ||
return tree | ||
} | ||
@@ -83,3 +83,3 @@ | ||
parts.forEach((part, i) => { | ||
const element = schemaElement(schema, parts.slice(0, i + 1)) | ||
const { element } = schemaElement(schema, parts.slice(0, i + 1)) | ||
if (element.repetition_type === 'REPEATED') { | ||
@@ -102,3 +102,3 @@ maxLevel += 1 | ||
parts.forEach((part, i) => { | ||
const element = schemaElement(schema, parts.slice(0, i + 1)) | ||
const { element } = schemaElement(schema, parts.slice(0, i + 1)) | ||
if (element.repetition_type !== 'REQUIRED') { | ||
@@ -126,1 +126,64 @@ maxLevel += 1 | ||
} | ||
/** | ||
* Get the column name as foo.bar and handle list-like columns. | ||
* @param {SchemaElement[]} schema | ||
* @param {string[]} path | ||
* @returns {string} column name | ||
*/ | ||
export function getColumnName(schema, path) { | ||
if (isListLike(schema, path) || isMapLike(schema, path)) { | ||
return path.slice(0, -2).join('.') | ||
} else { | ||
return path.join('.') | ||
} | ||
} | ||
/** | ||
* Check if a column is list-like. | ||
* | ||
* @param {SchemaElement[]} schemaElements parquet schema elements | ||
* @param {string[]} path column path | ||
* @returns {boolean} true if list-like | ||
*/ | ||
function isListLike(schemaElements, path) { | ||
const schema = schemaElement(schemaElements, path.slice(0, -2)) | ||
if (path.length < 3) return false | ||
if (schema.element.converted_type !== 'LIST') return false | ||
if (schema.children.length > 1) return false | ||
const firstChild = schema.children[0] | ||
if (firstChild.children.length > 1) return false | ||
if (firstChild.element.repetition_type !== 'REPEATED') return false | ||
const secondChild = firstChild.children[0] | ||
if (secondChild.element.repetition_type !== 'REQUIRED') return false | ||
return true | ||
} | ||
/** | ||
* Check if a column is map-like. | ||
* | ||
* @param {SchemaElement[]} schemaElements parquet schema elements | ||
* @param {string[]} path column path | ||
* @returns {boolean} true if map-like | ||
*/ | ||
export function isMapLike(schemaElements, path) { | ||
const schema = schemaElement(schemaElements, path.slice(0, -2)) | ||
if (path.length < 3) return false | ||
if (schema.element.converted_type !== 'MAP') return false | ||
if (schema.children.length > 1) return false | ||
const firstChild = schema.children[0] | ||
if (firstChild.children.length !== 2) return false | ||
if (firstChild.element.repetition_type !== 'REPEATED') return false | ||
const keyChild = firstChild.children.find(child => child.element.name === 'key') | ||
if (keyChild?.element.repetition_type !== 'REQUIRED') return false | ||
const valueChild = firstChild.children.find(child => child.element.name === 'value') | ||
if (valueChild?.element.repetition_type === 'REPEATED') return false | ||
return true | ||
} |
@@ -45,2 +45,3 @@ /** | ||
field_id?: number | ||
logicalType?: LogicalType | ||
} | ||
@@ -87,2 +88,36 @@ | ||
type LogicalDecimalType = { | ||
logicalType: 'DECIMAL' | ||
precision: number | ||
scale: number | ||
} | ||
type LogicalIntType = { | ||
logicalType: 'INTEGER' | ||
bitWidth: number | ||
isSigned: boolean | ||
} | ||
export type LogicalType = | ||
{ logicalType: LogicalTypeType } | | ||
LogicalDecimalType | | ||
LogicalIntType | ||
export type LogicalTypeType = | ||
'STRING' | // convertedType UTF8 | ||
'MAP' | // convertedType MAP | ||
'LIST' | // convertedType LIST | ||
'ENUM' | // convertedType ENUM | ||
'DECIMAL' | // convertedType DECIMAL + precision/scale | ||
'DATE' | // convertedType DATE | ||
'TIME' | // convertedType TIME_MILLIS or TIME_MICROS | ||
'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS | ||
'INTEGER' | // convertedType INT or UINT | ||
'INTERVAL' | // convertedType INT or UINT | ||
'NULL' | // no convertedType | ||
'JSON' | // convertedType JSON | ||
'BSON' | // convertedType BSON | ||
'UUID' | // no convertedType | ||
'FLOAT16' // no convertedType | ||
export interface RowGroup { | ||
@@ -89,0 +124,0 @@ columns: ColumnChunk[] |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
93137
2485