Comparing version 0.9.6 to 0.9.7
package.json
{
"name": "hyparquet",
"version": "0.9.6",
"version": "0.9.7",
"description": "parquet file parser for javascript",
@@ -31,3 +31,3 @@ "keywords": [
"@types/node": "20.12.12",
"@typescript-eslint/eslint-plugin": "7.10.0",
"@typescript-eslint/eslint-plugin": "7.11.0",
"@vitest/coverage-v8": "1.6.0",
@@ -38,3 +38,3 @@ "eslint": "8.57.0",
"http-server": "14.1.1",
"hyparquet-compressors": "0.1.3",
"hyparquet-compressors": "0.1.4",
"typescript": "5.4.5",
@@ -41,0 +41,0 @@ "vitest": "1.6.0"
column.js
import { assembleLists } from './assemble.js'
import { convert, dereferenceDictionary } from './convert.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { convertWithDictionary } from './convert.js'
import { decompressPage, readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
import { parquetHeader } from './header.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { snappyUncompress } from './snappy.js'
import { concat } from './utils.js'
@@ -52,7 +51,5 @@
// construct output values: skip nulls and construct lists
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
// convert types, dereference dictionary, and assemble lists
values = convertWithDictionary(dataPage, dictionary, element, daph.encoding, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
@@ -83,6 +80,5 @@ const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
// convert types, dereference dictionary, and assemble lists
values = convertWithDictionary(dataPage, dictionary, element, daph2.encoding, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
@@ -128,28 +124,1 @@ const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
}
/**
* @param {Uint8Array} compressedBytes
* @param {number} uncompressed_page_size
* @param {import('./types.js').CompressionCodec} codec
* @param {import('./types.js').Compressors | undefined} compressors
* @returns {Uint8Array}
*/
export function decompressPage(compressedBytes, uncompressed_page_size, codec, compressors) {
/** @type {Uint8Array} */
let page
const customDecompressor = compressors?.[codec]
if (codec === 'UNCOMPRESSED') {
page = compressedBytes
} else if (customDecompressor) {
page = customDecompressor(compressedBytes, uncompressed_page_size)
} else if (codec === 'SNAPPY') {
page = new Uint8Array(uncompressed_page_size)
snappyUncompress(compressedBytes, page)
} else {
throw new Error(`parquet unsupported compression codec: ${codec}`)
}
if (page?.length !== uncompressed_page_size) {
throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
}
return page
}
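The net effect of the column.js changes: the old two-step of dereferencing the dictionary and then converting types collapses into a single call, and page decompression moves out of this file into datapage.js. A minimal before/after sketch of the decode step, using the variable names that appear in the hunks above:

// 0.9.6: two passes over the page values
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)

// 0.9.7: one pass; dictionary indices are only resolved when the page
// encoding ends with '_DICTIONARY', otherwise values are converted directly
values = convertWithDictionary(dataPage, dictionary, element, daph.encoding, utf8)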
convert.js
const dayMillis = 86400000 // 1 day in milliseconds
/**
* Convert known types from primitive to rich.
* Convert known types from primitive to rich, and dereference dictionary.
*
* @typedef {import('./types.js').DecodedArray} DecodedArray
* @typedef {import('./types.js').SchemaElement} SchemaElement
* @param {DecodedArray} data series of primitive types
* @param {import('./types.js').SchemaElement} schemaElement schema element for the data
* @param {DecodedArray | undefined} dictionary
* @param {SchemaElement} schemaElement
* @param {import('./types.js').Encoding} encoding
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
if (dictionary && encoding.endsWith('_DICTIONARY')) {
// convert dictionary
dictionary = convert(dictionary, schemaElement, utf8)
let output = data
if (data instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
// @ts-expect-error upgrade data to match dictionary type with fancy constructor
output = new dictionary.constructor(data.length)
}
for (let i = 0; i < data.length; i++) {
output[i] = dictionary[data[i]]
}
return output
} else {
return convert(data, schemaElement, utf8)
}
}
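For illustration, a hedged usage sketch of the new convertWithDictionary with a dictionary-encoded UTF8 column. The schema element and page contents below are invented, and it assumes convert decodes BYTE_ARRAY/UTF8 values to strings, as the converted_type handling suggests:

// hypothetical dictionary page (raw byte arrays) and data page (dictionary indices)
const dictionary = [Uint8Array.from([102, 111, 111]), Uint8Array.from([98, 97, 114])] // 'foo', 'bar'
const dataPage = new Uint8Array([0, 1, 1, 0])
const element = { name: 'tag', type: 'BYTE_ARRAY', converted_type: 'UTF8' }
// the dictionary is converted once, then indices are mapped; the Uint8Array of
// indices is upgraded to a plain Array so it can hold the converted strings
convertWithDictionary(dataPage, dictionary, element, 'PLAIN_DICTIONARY', true)
// -> ['foo', 'bar', 'bar', 'foo']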
/**
* Convert known types from primitive to rich.
*
* @param {DecodedArray} data series of primitive types
* @param {SchemaElement} schemaElement
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
export function convert(data, schemaElement, utf8 = true) {
@@ -76,10 +105,13 @@ const ctype = schemaElement.converted_type
}
const logicalType = schemaElement.logical_type?.type
if (logicalType === 'FLOAT16') {
if (schemaElement.logical_type?.type === 'FLOAT16') {
return Array.from(data).map(parseFloat16)
}
if (logicalType === 'TIMESTAMP') {
if (schemaElement.logical_type?.type === 'TIMESTAMP') {
const { unit } = schemaElement.logical_type
let factor = 1n
if (unit === 'MICROS') factor = 1000n
if (unit === 'NANOS') factor = 1000000n
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = new Date(Number(data[i]))
arr[i] = new Date(Number(data[i] / factor))
}
@@ -95,3 +127,3 @@ return arr
*/
function parseDecimal(bytes) {
export function parseDecimal(bytes) {
// TODO: handle signed
@@ -130,23 +162,1 @@ let value = 0
}
/**
* Map data to dictionary values in place.
*
* @param {DecodedArray | undefined} dictionary
* @param {DecodedArray} dataPage
* @returns {DecodedArray}
*/
export function dereferenceDictionary(dictionary, dataPage) {
let output = dataPage
if (dictionary) {
if (dataPage instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
// upgrade dataPage to match dictionary type
// @ts-expect-error not my fault typescript doesn't understand constructors
output = new dictionary.constructor(dataPage.length)
}
for (let i = 0; i < dataPage.length; i++) {
output[i] = dictionary[dataPage[i]]
}
}
return output
}
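The new TIMESTAMP branch in convert scales by the logical-type unit before building Dates; dividing as BigInt first keeps nanosecond counts inside Number's safe-integer range. A small worked example with invented values:

// unit 'MICROS': factor = 1000n
new Date(Number(1700000000000000n / 1000n)) // 2023-11-14T22:13:20.000Z
// unit 'NANOS': factor = 1000000n
new Date(Number(1700000000000000000n / 1000000n)) // same instant
// unit 'MILLIS': factor stays 1n, so the value is used as-is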
datapage.js
import { bitWidth, byteStreamSplit, readRleBitPackedHybrid } from './encoding.js'
import { readPlain } from './plain.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { snappyUncompress } from './snappy.js'
@@ -110,1 +111,28 @@ /**
}
/**
* @param {Uint8Array} compressedBytes
* @param {number} uncompressed_page_size
* @param {import('./types.js').CompressionCodec} codec
* @param {import('./types.js').Compressors | undefined} compressors
* @returns {Uint8Array}
*/
export function decompressPage(compressedBytes, uncompressed_page_size, codec, compressors) {
/** @type {Uint8Array} */
let page
const customDecompressor = compressors?.[codec]
if (codec === 'UNCOMPRESSED') {
page = compressedBytes
} else if (customDecompressor) {
page = customDecompressor(compressedBytes, uncompressed_page_size)
} else if (codec === 'SNAPPY') {
page = new Uint8Array(uncompressed_page_size)
snappyUncompress(compressedBytes, page)
} else {
throw new Error(`parquet unsupported compression codec: ${codec}`)
}
if (page?.length !== uncompressed_page_size) {
throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
}
return page
}
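decompressPage handles only UNCOMPRESSED and SNAPPY itself; any other codec must come from the compressors map (which is what the hyparquet-compressors bump in package.json supplies). A hedged sketch of wiring in a custom codec; the names myGunzip, compressedBytes, and header are placeholders, not real imports:

const compressors = {
  // each entry is called with the compressed bytes and the expected output length
  GZIP: (input, outputLength) => myGunzip(input).subarray(0, outputLength),
}
const page = decompressPage(compressedBytes, header.uncompressed_page_size, 'GZIP', compressors)
// throws if the decompressed length does not match header.uncompressed_page_size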
datapageV2.js
@@ -1,2 +0,2 @@
import { decompressPage } from './column.js'
import { decompressPage } from './datapage.js'
import { deltaBinaryUnpack, deltaByteArray, deltaLengthByteArray } from './delta.js'
@@ -3,0 +3,0 @@ import { bitWidth, byteStreamSplit, readRleBitPackedHybrid } from './encoding.js'
metadata.js
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from './constants.js'
import { parseFloat16 } from './convert.js'
import { parseDecimal, parseFloat16 } from './convert.js'
import { getSchemaPath } from './schema.js'
@@ -217,3 +217,3 @@ import { deserializeTCompactProtocol } from './thrift.js'
isAdjustedToUTC: logicalType.field_7.field_1,
unit: logicalType.field_7.field_2,
unit: timeUnit(logicalType.field_7.field_2),
}
@@ -223,3 +223,3 @@ if (logicalType?.field_8) return {
isAdjustedToUTC: logicalType.field_8.field_1,
unit: logicalType.field_8.field_2,
unit: timeUnit(logicalType.field_8.field_2),
}
@@ -240,2 +240,13 @@ if (logicalType?.field_10) return {
/**
* @param {any} unit
* @returns {import("./types.d.ts").TimeUnit}
*/
function timeUnit(unit) {
if (unit.field_1) return 'MILLIS'
if (unit.field_2) return 'MICROS'
if (unit.field_3) return 'NANOS'
throw new Error('parquet time unit required')
}
/**
* Convert column statistics based on column type.
@@ -248,3 +259,3 @@ *
function columnStats(stats, schema) {
const { type, logical_type } = schema
const { type, converted_type, logical_type } = schema
function convert(/** @type {Uint8Array} */ value) {
@@ -254,21 +265,13 @@ if (value === undefined) return value
if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value)
if (type === 'INT32') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getInt32(0, true)
}
if (type === 'INT64') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getBigInt64(0, true)
}
if (type === 'FLOAT') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getFloat32(0, true)
}
if (type === 'DOUBLE') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getFloat64(0, true)
}
if (logical_type?.type === 'FLOAT16') {
return parseFloat16(value)
}
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
if (type === 'FLOAT') return view.getFloat32(0, true)
if (type === 'DOUBLE') return view.getFloat64(0, true)
if (type === 'INT32' && converted_type === 'DATE') return new Date(view.getInt32(0, true) * 86400000)
if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS') return new Date(Number(view.getBigInt64(0, true) / 1000n))
if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS') return new Date(Number(view.getBigInt64(0, true)))
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true)))
if (type === 'INT32') return view.getInt32(0, true)
if (type === 'INT64') return view.getBigInt64(0, true)
if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0))
if (logical_type?.type === 'FLOAT16') return parseFloat16(value)
return value
@@ -275,0 +278,0 @@ }
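Here timeUnit maps the Thrift union fields (field_1/2/3) to 'MILLIS'/'MICROS'/'NANOS', and columnStats now decodes min/max statistics by converted/logical type instead of returning raw bytes or numbers. Two illustrative cases, with invented input bytes and assuming parseDecimal accumulates bytes big-endian:

// DECIMAL with scale 2: bytes 0x30 0x39 encode the unscaled value 12345
parseDecimal(new Uint8Array([0x30, 0x39])) * Math.pow(10, -2) // 123.45
// INT32 DATE stat: days since epoch, e.g. new Date(19700 * 86400000) -> 2023-12-09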
types.d.ts
@@ -95,3 +95,3 @@ export type Awaitable<T> = T | Promise<T>
type TimeUnit = 'MILLIS' | 'MICROS' | 'NANOS'
export type TimeUnit = 'MILLIS' | 'MICROS' | 'NANOS'
@@ -117,3 +117,3 @@ type LogicalTimeType = {
export type LogicalType =
{ type: LogicalTypeType } |
{ type: LogicalTypeSimple } |
LogicalDecimalType |
@@ -124,18 +124,20 @@ LogicalTimeType |
export type LogicalTypeType =
'STRING' | // convertedType UTF8
'MAP' | // convertedType MAP
'LIST' | // convertedType LIST
'ENUM' | // convertedType ENUM
'DECIMAL' | // convertedType DECIMAL + precision/scale
'DATE' | // convertedType DATE
type LogicalTypeSimple =
'STRING' |
'MAP' |
'LIST' |
'ENUM' |
'DECIMAL' |
'DATE' |
'INTERVAL' |
'NULL' |
'JSON' |
'BSON' |
'UUID' |
'FLOAT16'
export type LogicalTypeType = LogicalTypeSimple |
'TIME' | // convertedType TIME_MILLIS or TIME_MICROS
'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS
'INTEGER' | // convertedType INT or UINT
'INTERVAL' | // convertedType INT or UINT
'NULL' | // no convertedType
'JSON' | // convertedType JSON
'BSON' | // convertedType BSON
'UUID' | // no convertedType
'FLOAT16' // no convertedType
'INTEGER' // convertedType INT or UINT
@@ -142,0 +144,0 @@ export interface RowGroup {
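With TimeUnit exported and the simple logical types split into LogicalTypeSimple, a timestamp logical type value is illustratively shaped like this (a JSDoc-typed sketch; the field names follow the objects built by the metadata parser above):

/** @type {import('./types.d.ts').LogicalType} */
const example = { type: 'TIMESTAMP', isAdjustedToUTC: true, unit: 'MICROS' }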
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package