Comparing version 0.8.0 to 0.9.0
{ | ||
"name": "hyparquet", | ||
"version": "0.8.0", | ||
"version": "0.9.0", | ||
"description": "parquet file parser for javascript", | ||
@@ -32,3 +32,3 @@ "keywords": [ | ||
"@typescript-eslint/eslint-plugin": "7.8.0", | ||
"@vitest/coverage-v8": "1.5.3", | ||
"@vitest/coverage-v8": "1.6.0", | ||
"eslint": "8.57.0", | ||
@@ -40,4 +40,4 @@ "eslint-plugin-import": "2.29.1", | ||
"typescript": "5.4.5", | ||
"vitest": "1.5.3" | ||
"vitest": "1.6.0" | ||
} | ||
} |
@@ -99,2 +99,6 @@ import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js' | ||
const metadata = deserializeTCompactProtocol(reader) | ||
const decoder = new TextDecoder() | ||
function decode(/** @type {Uint8Array} */ value) { | ||
return value && decoder.decode(value) | ||
} | ||
@@ -107,3 +111,3 @@ // Parse metadata from thrift data | ||
repetition_type: FieldRepetitionType[field.field_3], | ||
name: field.field_4, | ||
name: decode(field.field_4), | ||
num_children: field.field_5, | ||
@@ -116,6 +120,8 @@ converted_type: ConvertedType[field.field_6], | ||
})) | ||
// @ts-expect-error get types by column index | ||
const columnTypes = schema.map(e => e.type).filter(e => e) | ||
const num_rows = metadata.field_3 | ||
const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({ | ||
columns: rowGroup.field_1.map((/** @type {any} */ column) => ({ | ||
file_path: column.field_1, | ||
columns: rowGroup.field_1.map((/** @type {any} */ column, /** @type {number} */ columnIndex) => ({ | ||
file_path: decode(column.field_1), | ||
file_offset: column.field_2, | ||
@@ -125,3 +131,3 @@ meta_data: column.field_3 && { | ||
encodings: column.field_3.field_2?.map((/** @type {number} */ e) => Encoding[e]), | ||
path_in_schema: column.field_3.field_3, | ||
path_in_schema: column.field_3.field_3.map(decode), | ||
codec: CompressionCodec[column.field_3.field_4], | ||
@@ -135,8 +141,3 @@ num_values: column.field_3.field_5, | ||
dictionary_page_offset: column.field_3.field_11, | ||
statistics: column.field_3.field_12 && { | ||
max: column.field_3.field_12.field_1, | ||
min: column.field_3.field_12.field_2, | ||
null_count: column.field_3.field_12.field_3, | ||
distinct_count: column.field_3.field_12.field_4, | ||
}, | ||
statistics: columnStats(column.field_3.field_12, columnTypes[columnIndex]), | ||
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({ | ||
@@ -147,3 +148,16 @@ page_type: encodingStat.field_1, | ||
})), | ||
bloom_filter_offset: column.field_3.field_14, | ||
bloom_filter_length: column.field_3.field_15, | ||
size_statistics: column.field_3.field_16 && { | ||
unencoded_byte_array_data_bytes: column.field_3.field_16.field_1, | ||
repetition_level_histogram: column.field_3.field_16.field_2, | ||
definition_level_histogram: column.field_3.field_16.field_3, | ||
}, | ||
}, | ||
offset_index_offset: column.field_4, | ||
offset_index_length: column.field_5, | ||
column_index_offset: column.field_6, | ||
column_index_length: column.field_7, | ||
crypto_metadata: column.field_7, | ||
encrypted_column_metadata: column.field_8, | ||
})), | ||
@@ -157,8 +171,11 @@ total_byte_size: rowGroup.field_2, | ||
})), | ||
file_offset: rowGroup.field_5, | ||
total_compressed_size: rowGroup.field_6, | ||
ordinal: rowGroup.field_7, | ||
})) | ||
const key_value_metadata = metadata.field_5?.map((/** @type {any} */ keyValue) => ({ | ||
key: keyValue.field_1, | ||
value: keyValue.field_2, | ||
key: decode(keyValue.field_1), | ||
value: decode(keyValue.field_2), | ||
})) | ||
const created_by = metadata.field_6 | ||
const created_by = decode(metadata.field_6) | ||
@@ -202,3 +219,3 @@ return { | ||
// TODO: TimestampType | ||
// TOFO: TimeType | ||
// TODO: TimeType | ||
if (logicalType?.field_10) { | ||
@@ -215,1 +232,43 @@ return { | ||
} | ||
/**
 * Convert raw thrift column statistics into typed values based on the
 * column's physical type.
 *
 * The min/max entries of the thrift struct are raw bytes. They are converted
 * as follows:
 *  - BOOLEAN: true when the first byte equals 1
 *  - BYTE_ARRAY: decoded as UTF-8 text
 *  - INT32 / INT64 / FLOAT / DOUBLE: read little-endian from the start of the value
 *  - any other type: returned unchanged
 *
 * A numeric value that holds fewer bytes than its type requires is returned
 * unchanged instead of throwing a RangeError, so one malformed statistic does
 * not abort parsing of the surrounding metadata.
 *
 * @param {any} stats raw thrift statistics struct (field_1 .. field_8), may be undefined
 * @param {import("./types.d.ts").ParquetType} type physical type of the column
 * @returns {import("./types.d.ts").Statistics | undefined} converted statistics,
 *   or the falsy `stats` argument itself when no statistics are present
 */
function columnStats(stats, type) {
  // No statistics recorded for this column: pass the falsy value through
  if (!stats) return stats

  // One decoder shared by every BYTE_ARRAY value of this struct
  const utf8 = new TextDecoder()

  /**
   * Number of bytes a numeric type needs, or undefined for non-numeric types.
   *
   * @returns {number | undefined}
   */
  function numericWidth() {
    switch (type) {
    case 'INT32':
    case 'FLOAT':
      return 4
    case 'INT64':
    case 'DOUBLE':
      return 8
    default:
      return undefined
    }
  }

  /**
   * Convert a single raw min/max value according to `type`.
   *
   * @param {Uint8Array | undefined} value
   * @returns {any}
   */
  function convert(value) {
    // Missing optional fields stay missing
    if (value === undefined || value === null) return value
    if (type === 'BOOLEAN') return value[0] === 1
    if (type === 'BYTE_ARRAY') return utf8.decode(value)

    const width = numericWidth()
    // Unknown type, or too few bytes to read safely: hand back the raw bytes
    if (width === undefined || value.byteLength < width) return value

    // Respect byteOffset: value may be a window into a larger buffer
    const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
    switch (type) {
    case 'INT32': return view.getInt32(0, true)
    case 'INT64': return view.getBigInt64(0, true)
    case 'FLOAT': return view.getFloat32(0, true)
    case 'DOUBLE': return view.getFloat64(0, true)
    default: return value
    }
  }

  return {
    max: convert(stats.field_1),
    min: convert(stats.field_2),
    null_count: stats.field_3,
    distinct_count: stats.field_4,
    max_value: convert(stats.field_5),
    min_value: convert(stats.field_6),
    is_max_value_exact: stats.field_7,
    is_min_value_exact: stats.field_8,
  }
}
@@ -73,7 +73,6 @@ // TCompactProtocol types | ||
case CompactType.BINARY: { | ||
// strings are encoded as utf-8, no \0 delimiter | ||
const stringLength = readVarInt(reader) | ||
const strBytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, stringLength) | ||
reader.offset += stringLength | ||
return new TextDecoder().decode(strBytes) | ||
return strBytes | ||
} | ||
@@ -80,0 +79,0 @@ case CompactType.LIST: { |
@@ -20,6 +20,9 @@ type Awaitable<T> = T | Promise<T> | ||
schema: SchemaElement[] | ||
num_rows: number | ||
num_rows: bigint | ||
row_groups: RowGroup[] | ||
key_value_metadata?: KeyValue[] | ||
created_by?: string | ||
// column_orders?: ColumnOrder[] | ||
// encryption_algorithm?: EncryptionAlgorithm | ||
// footer_signing_key_metadata?: Uint8Array | ||
metadata_length: number | ||
@@ -122,5 +125,8 @@ } | ||
columns: ColumnChunk[] | ||
total_byte_size: number | ||
num_rows: number | ||
total_byte_size: bigint | ||
num_rows: bigint | ||
sorting_columns?: SortingColumn[] | ||
file_offset?: bigint | ||
total_compressed_size?: bigint | ||
ordinal?: number | ||
} | ||
@@ -130,4 +136,10 @@ | ||
file_path?: string | ||
file_offset: number | ||
file_offset: bigint | ||
meta_data?: ColumnMetaData | ||
offset_index_offset?: bigint | ||
offset_index_length?: number | ||
column_index_offset?: bigint | ||
column_index_length?: number | ||
crypto_metadata?: ColumnCryptoMetaData | ||
encrypted_column_metadata?: Uint8Array | ||
} | ||
@@ -140,13 +152,18 @@ | ||
codec: CompressionCodec | ||
num_values: number | ||
total_uncompressed_size: number | ||
total_compressed_size: number | ||
num_values: bigint | ||
total_uncompressed_size: bigint | ||
total_compressed_size: bigint | ||
key_value_metadata?: KeyValue[] | ||
data_page_offset: number | ||
index_page_offset?: number | ||
dictionary_page_offset?: number | ||
data_page_offset: bigint | ||
index_page_offset?: bigint | ||
dictionary_page_offset?: bigint | ||
statistics?: Statistics | ||
encoding_stats?: PageEncodingStats[] | ||
bloom_filter_offset?: bigint | ||
bloom_filter_length?: number | ||
size_statistics?: SizeStatistics | ||
} | ||
interface ColumnCryptoMetaData {} | ||
export type Encoding = | ||
@@ -182,9 +199,21 @@ 'PLAIN' | | ||
type MinMaxType = bigint | boolean | number | string | ||
export interface Statistics { | ||
max?: string | ||
min?: string | ||
null_count?: number | ||
distinct_count?: number | ||
max?: MinMaxType | ||
min?: MinMaxType | ||
null_count?: bigint | ||
distinct_count?: bigint | ||
max_value?: string | ||
min_value?: string | ||
is_max_value_exact?: boolean | ||
is_min_value_exact?: boolean | ||
} | ||
interface SizeStatistics { | ||
unencoded_byte_array_data_bytes?: bigint | ||
repetition_level_histogram?: bigint[] | ||
definition_level_histogram?: bigint[] | ||
} | ||
interface PageEncodingStats { | ||
@@ -191,0 +220,0 @@ page_type: PageType |
@@ -13,2 +13,3 @@ /** | ||
if (Array.isArray(obj)) return obj.map(toJson) | ||
if (obj instanceof Uint8Array) return Array.from(obj) | ||
if (obj instanceof Object) { | ||
@@ -15,0 +16,0 @@ /** @type {Record<string, unknown>} */ |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
95289
2467