Comparing version 1.2.1 to 1.3.0
{ | ||
"name": "hyparquet", | ||
"version": "1.2.1", | ||
"version": "1.3.0", | ||
"description": "parquet file parser for javascript", | ||
@@ -29,4 +29,4 @@ "keywords": [ | ||
"devDependencies": { | ||
"@types/node": "22.3.0", | ||
"@typescript-eslint/eslint-plugin": "8.1.0", | ||
"@types/node": "22.4.1", | ||
"@typescript-eslint/eslint-plugin": "8.2.0", | ||
"@vitest/coverage-v8": "2.0.5", | ||
@@ -33,0 +33,0 @@ "eslint": "8.57.0", |
@@ -1,4 +0,2 @@ | ||
/** | ||
* @type {import('./types.js').ParquetType[]} | ||
*/ | ||
/** @type {import('./types.js').ParquetType[]} */ | ||
export const ParquetType = [ | ||
@@ -34,5 +32,3 @@ 'BOOLEAN', | ||
/** | ||
* @type {import('./types.js').ConvertedType[]} | ||
*/ | ||
/** @type {import('./types.js').ConvertedType[]} */ | ||
export const ConvertedType = [ | ||
@@ -63,5 +59,3 @@ 'UTF8', | ||
/** | ||
* @type {import('./types.js').LogicalTypeType[]} | ||
*/ | ||
/** @type {import('./types.js').LogicalTypeType[]} */ | ||
export const logicalTypeType = [ | ||
@@ -96,5 +90,3 @@ 'NULL', | ||
/** | ||
* @type {import('./types.js').PageType[]} | ||
*/ | ||
/** @type {import('./types.js').PageType[]} */ | ||
export const PageType = [ | ||
@@ -106,1 +98,8 @@ 'DATA_PAGE', | ||
] | ||
/** @type {import('./types.js').BoundaryOrder[]} */ | ||
export const BoundaryOrder = [ | ||
'UNORDERED', | ||
'ASCENDING', | ||
'DESCENDING', | ||
] |
@@ -31,2 +31,16 @@ import type { AsyncBuffer, Compressors, FileMetaData, SchemaTree } from './types.d.ts' | ||
/** | ||
* Read parquet data and return a Promise of object-oriented row data. | ||
* | ||
* @param {object} options read options | ||
* @param {AsyncBuffer} options.file file-like object containing parquet data | ||
* @param {FileMetaData} [options.metadata] parquet file metadata | ||
* @param {string[]} [options.columns] columns to read, all columns if undefined | ||
* @param {number} [options.rowStart] first requested row index (inclusive) | ||
* @param {number} [options.rowEnd] last requested row index (exclusive) | ||
* @param {Compressors} [options.compressor] custom decompressors | ||
* @returns {Promise<Array<Record<string, any>>>} resolves with the requested rows as an array of objects | ||
*/ | ||
export function parquetReadObjects(options: ParquetReadOptions): Promise<Array<Record<string, any>>> | ||
/** | ||
* Read parquet metadata from an async buffer. | ||
@@ -33,0 +47,0 @@ * |
@@ -12,1 +12,15 @@ import { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata.js' | ||
export { asyncBufferFromFile, asyncBufferFromUrl, toJson } | ||
/** | ||
* Promise-returning wrapper around parquetRead: the promise resolves with
* whatever parquetRead passes to its onComplete callback.
*
* rowFormat defaults to 'object', but because options is spread after it, an
* options.rowFormat supplied by the caller takes precedence. Any
* options.onComplete supplied by the caller is replaced by the promise's
* resolve callback, which is listed after the spread.
*
* @param {import('./hyparquet.js').ParquetReadOptions} options | ||
* @returns {Promise<Array<Record<string, any>>>} rejects if the promise returned by parquetRead rejects | ||
*/ | ||
export function parquetReadObjects(options) { | ||
return new Promise((onComplete, reject) => { | ||
parquetRead({ | ||
rowFormat: 'object', | ||
...options, | ||
onComplete, | ||
}).catch(reject) | ||
}) | ||
} |
@@ -140,3 +140,3 @@ import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from './constants.js' | ||
dictionary_page_offset: column.field_3.field_11, | ||
statistics: columnStats(column.field_3.field_12, columnSchema[columnIndex]), | ||
statistics: convertStats(column.field_3.field_12, columnSchema[columnIndex]), | ||
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({ | ||
@@ -256,28 +256,10 @@ page_type: PageType[encodingStat.field_1], | ||
*/ | ||
function columnStats(stats, schema) { | ||
const { type, converted_type, logical_type } = schema | ||
function convert(/** @type {Uint8Array} */ value) { | ||
if (value === undefined) return value | ||
if (type === 'BOOLEAN') return value[0] === 1 | ||
if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value) | ||
const view = new DataView(value.buffer, value.byteOffset, value.byteLength) | ||
if (type === 'FLOAT') return view.getFloat32(0, true) | ||
if (type === 'DOUBLE') return view.getFloat64(0, true) | ||
if (type === 'INT32' && converted_type === 'DATE') return new Date(view.getInt32(0, true) * 86400000) | ||
if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS') return new Date(Number(view.getBigInt64(0, true) / 1000n)) | ||
if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS') return new Date(Number(view.getBigInt64(0, true))) | ||
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true))) | ||
if (type === 'INT32') return view.getInt32(0, true) | ||
if (type === 'INT64') return view.getBigInt64(0, true) | ||
if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0)) | ||
if (logical_type?.type === 'FLOAT16') return parseFloat16(value) | ||
return value | ||
} | ||
function convertStats(stats, schema) { | ||
return stats && { | ||
max: convert(stats.field_1), | ||
min: convert(stats.field_2), | ||
max: convertMetadata(stats.field_1, schema), | ||
min: convertMetadata(stats.field_2, schema), | ||
null_count: stats.field_3, | ||
distinct_count: stats.field_4, | ||
max_value: convert(stats.field_5), | ||
min_value: convert(stats.field_6), | ||
max_value: convertMetadata(stats.field_5, schema), | ||
min_value: convertMetadata(stats.field_6, schema), | ||
is_max_value_exact: stats.field_7, | ||
@@ -287,1 +269,27 @@ is_min_value_exact: stats.field_8, | ||
} | ||
/** | ||
* Decode a raw metadata value (e.g. a statistics min/max) into a JavaScript
* value, choosing the decoding from the schema element's type, converted_type
* and logical_type. The checks run in order and the first match wins; anything
* unmatched is returned as the original bytes.
*
* @param {Uint8Array | undefined} value raw bytes to decode; undefined is passed through | ||
* @param {SchemaElement} schema schema element describing the column | ||
* @returns {import('./types.d.ts').MinMaxType | undefined} | ||
*/ | ||
export function convertMetadata(value, schema) { | ||
const { type, converted_type, logical_type } = schema | ||
if (value === undefined) return value | ||
// true only when the first byte is exactly 1
if (type === 'BOOLEAN') return value[0] === 1 | ||
// every BYTE_ARRAY is decoded as text with TextDecoder's default encoding
if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value) | ||
// view over exactly this value's bytes; the `true` argument below selects little-endian reads
const view = new DataView(value.buffer, value.byteOffset, value.byteLength) | ||
// FLOAT / DOUBLE are decoded only when the byte length matches; otherwise fall through
if (type === 'FLOAT' && view.byteLength === 4) return view.getFloat32(0, true) | ||
if (type === 'DOUBLE' && view.byteLength === 8) return view.getFloat64(0, true) | ||
// DATE: the int32 is multiplied by 86400000 (milliseconds per day) before building the Date
if (type === 'INT32' && converted_type === 'DATE') return new Date(view.getInt32(0, true) * 86400000) | ||
// TIMESTAMP_MICROS: bigint division by 1000n turns the value into milliseconds
if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS') return new Date(Number(view.getBigInt64(0, true) / 1000n)) | ||
if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS') return new Date(Number(view.getBigInt64(0, true))) | ||
// NOTE(review): the value is used directly as milliseconds; no unit is consulted here — confirm whether logical_type carries one
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true))) | ||
// plain integers, again only when the byte length matches
if (type === 'INT32' && view.byteLength === 4) return view.getInt32(0, true) | ||
if (type === 'INT64' && view.byteLength === 8) return view.getBigInt64(0, true) | ||
// NOTE(review): a DECIMAL held in a 4-byte INT32 or 8-byte INT64 has already returned above, unscaled — confirm that is intended
// DECIMAL: parseDecimal result scaled by 10^-scale (scale treated as 0 when missing)
if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0)) | ||
if (logical_type?.type === 'FLOAT16') return parseFloat16(value) | ||
// raw bytes are returned for FIXED_LEN_BYTE_ARRAY; the final return below does the same for everything else
if (type === 'FIXED_LEN_BYTE_ARRAY') return value | ||
// assert(false) | ||
return value | ||
} |
@@ -81,5 +81,6 @@ // TCompactProtocol types | ||
const [elemType, listSize] = readCollectionBegin(reader) | ||
const boolType = elemType === CompactType.TRUE || elemType === CompactType.FALSE | ||
const values = new Array(listSize) | ||
for (let i = 0; i < listSize; i++) { | ||
values[i] = readElement(reader, elemType) | ||
values[i] = boolType ? readElement(reader, CompactType.BYTE) === 1 : readElement(reader, elemType) | ||
} | ||
@@ -207,8 +208,7 @@ return values | ||
let fid // field id | ||
if (delta === 0) { | ||
// not a delta, read zigzag varint field id | ||
fid = readZigZag(reader) | ||
} else { | ||
if (delta) { | ||
// add delta to last field id | ||
fid = lastFid + delta | ||
} else { | ||
throw new Error('non-delta field id not supported') | ||
} | ||
@@ -215,0 +215,0 @@ return [getCompactType(type), fid, fid] |
@@ -214,3 +214,3 @@ export type Awaitable<T> = T | Promise<T> | ||
type MinMaxType = bigint | boolean | number | string | ||
type MinMaxType = bigint | boolean | number | string | Date | Uint8Array | ||
@@ -305,1 +305,24 @@ export interface Statistics { | ||
any[] | ||
export interface OffsetIndex { | ||
page_locations: PageLocation[] | ||
unencoded_byte_array_data_bytes?: bigint[] | ||
} | ||
interface PageLocation { | ||
offset: bigint | ||
compressed_page_size: number | ||
first_row_index: bigint | ||
} | ||
export interface ColumnIndex { | ||
null_pages: boolean[] | ||
min_values: MinMaxType[] | ||
max_values: MinMaxType[] | ||
boundary_order: BoundaryOrder | ||
null_counts?: bigint[] | ||
repetition_level_histograms?: bigint[] | ||
definition_level_histograms?: bigint[] | ||
} | ||
export type BoundaryOrder = 'UNORDERED' | 'ASCENDING' | 'DESCENDING' |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
109715
23
2859