Comparing version 0.2.1 to 0.2.2
 {
   "name": "hyparquet",
-  "version": "0.2.1",
+  "version": "0.2.2",
   "description": "parquet file parser for javascript",
@@ -5,0 +5,0 @@ "keywords": [
@@ -1,5 +1,5 @@
-import { CompressionCodec, Encoding, PageType } from './constants.js'
+import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
 import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
 import { parquetHeader } from './header.js'
-import { getMaxDefinitionLevel, isRequired } from './schema.js'
+import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
 import { snappyUncompress } from './snappy.js'
@@ -14,2 +14,4 @@
+const dayMillis = 86400000 // 1 day in milliseconds
+
 /**
@@ -31,2 +33,3 @@ * Read a column from the file.
   let byteOffset = 0 // byteOffset within the column
+  /** @type {ArrayLike<any> | undefined} */
   let dictionary = undefined
@@ -70,2 +73,4 @@ const rowIndex = [0] // map/list object index
+      const dictionaryEncoding = daph.encoding === Encoding.PLAIN_DICTIONARY || daph.encoding === Encoding.RLE_DICTIONARY
+
       // construct output values: skip nulls and construct lists
@@ -75,4 +80,7 @@ let values
         // Use repetition levels to construct lists
-        if ([Encoding.PLAIN_DICTIONARY, Encoding.RLE_DICTIONARY].includes(daph.encoding)) {
-          // TODO: dereference dictionary values
+        if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
+          // dereference dictionary values
+          for (let i = 0; i < dataPage.length; i++) {
+            dataPage[i] = dictionary[dataPage[i]]
+          }
         }
@@ -83,3 +91,3 @@ const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]])
         values = assembleObjects(definitionLevels, repetitionLevels, dataPage, isNull, nullValue, maxDefinitionLevel, rowIndex[0])
-      } else if (definitionLevels) {
+      } else if (definitionLevels?.length) {
         const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
@@ -113,4 +121,14 @@ // Use definition levels to skip nulls
       } else {
-        // TODO: use dictionary
-        values = dataPage
+        if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
+          // dereference dictionary values
+          values = []
+          for (let i = 0; i < dataPage.length; i++) {
+            values[i] = dictionary[dataPage[i]]
+          }
+        } else if (Array.isArray(dataPage)) {
+          // convert primitive types to rich types
+          values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
+        } else {
+          values = dataPage // TODO: data page shouldn't be a fixed byte array?
+        }
       }
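
Both dictionary branches in this hunk and the one above do the same thing: a dictionary-encoded data page stores integer indexes into the dictionary page read earlier, and decoding replaces each index with its entry. A toy sketch with made-up values, separate from the package code:

// Toy illustration of dictionary decoding; values are made up
const dictionary = ['red', 'green', 'blue'] // from the dictionary page
const dataPage = [2, 0, 0, 1]               // indexes read from the data page
const values = dataPage.map(i => dictionary[i])
console.log(values) // ['blue', 'red', 'red', 'green']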
@@ -153,1 +171,42 @@
 }
+
+/**
+ * Convert known types from primitive to rich.
+ *
+ * @param {any[]} data series of primitive types
+ * @param {SchemaElement} schemaElement schema element for the data
+ * @returns {any[]} series of rich types
+ */
+function convert(data, schemaElement) {
+  const ctype = schemaElement.converted_type
+  if (!ctype) return data
+  if (ctype === ConvertedType.UTF8) {
+    const decoder = new TextDecoder()
+    return data.map(v => decoder.decode(v))
+  }
+  if (ctype === ConvertedType.DECIMAL) {
+    // the unscaled value is multiplied by 10^-scale per the parquet spec
+    const scaleFactor = Math.pow(10, -(schemaElement.scale || 0))
+    if (typeof data[0] === 'number') {
+      return data.map(v => v * scaleFactor)
+    } else {
+      // TODO: parse byte string
+      throw new Error('parquet decimal byte string not supported')
+    }
+  }
+  if (ctype === ConvertedType.DATE) {
+    // DATE is days since the unix epoch
+    return data.map(v => new Date(v * dayMillis))
+  }
+  if (ctype === ConvertedType.TIME_MILLIS) {
+    return data.map(v => new Date(v))
+  }
+  if (ctype === ConvertedType.JSON) {
+    // JSON values are stored as utf-8 encoded byte arrays
+    const decoder = new TextDecoder()
+    return data.map(v => JSON.parse(decoder.decode(v)))
+  }
+  if (ctype === ConvertedType.BSON) {
+    throw new Error('parquet bson not supported')
+  }
+  if (ctype === ConvertedType.INTERVAL) {
+    throw new Error('parquet interval not supported')
+  }
+  return data
+}
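
To make the new convert step concrete, here is a minimal standalone sketch of the UTF8 and DATE cases, assuming (as the code above does) that string values arrive as byte arrays and DATE values are days since the Unix epoch; the inputs are made up:

// UTF8: byte arrays decoded to strings
const decoder = new TextDecoder()
const strings = [new Uint8Array([104, 105])].map(v => decoder.decode(v))
console.log(strings) // ['hi']

// DATE: days since epoch, scaled to milliseconds for the Date constructor
const dayMillis = 86400000 // 1 day in milliseconds
const dates = [19000].map(v => new Date(v * dayMillis))
console.log(dates) // [2022-01-08T00:00:00.000Z]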
@@ -30,2 +30,27 @@ export const ParquetType = {
+export const ConvertedType = {
+  UTF8: 0,
+  MAP: 1,
+  MAP_KEY_VALUE: 2,
+  LIST: 3,
+  ENUM: 4,
+  DECIMAL: 5,
+  DATE: 6,
+  TIME_MILLIS: 7,
+  TIME_MICROS: 8,
+  TIMESTAMP_MILLIS: 9,
+  TIMESTAMP_MICROS: 10,
+  UINT_8: 11,
+  UINT_16: 12,
+  UINT_32: 13,
+  UINT_64: 14,
+  INT_8: 15,
+  INT_16: 16,
+  INT_32: 17,
+  INT_64: 18,
+  JSON: 19,
+  BSON: 20,
+  INTERVAL: 21,
+}
+
 export const CompressionCodec = {
@@ -32,0 +57,0 @@ UNCOMPRESSED: 0,
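
These numeric codes follow the parquet Thrift definition; converted_type arrives in the file metadata as a bare integer, and this map gives those integers names. A hypothetical reverse lookup (not part of the package) can help when debugging unfamiliar columns:

// Hypothetical debugging helper; a subset of ConvertedType for the sketch
const ConvertedType = { UTF8: 0, DECIMAL: 5, DATE: 6, JSON: 19 }
function convertedTypeName(n) {
  return Object.keys(ConvertedType).find(key => ConvertedType[key] === n)
}
console.log(convertedTypeName(5)) // 'DECIMAL'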
@@ -26,3 +26,3 @@ import { Encoding, ParquetType } from './constants.js'
  * @param {ColumnMetaData} columnMetadata metadata for the column
- * @returns {DataPage} array of values
+ * @returns {DataPage} definition levels, repetition levels, and array of values
  */
@@ -146,4 +146,12 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
     )
-    const numNulls = daph.num_values - definitionLevels
-      .filter((/** @type number */ d) => d === maxDefinitionLevel).length
+    // count nulls
+    let numNulls = daph.num_values
+    for (const def of definitionLevels) {
+      if (def === maxDefinitionLevel) numNulls--
+    }
+    if (numNulls === 0) {
+      definitionLevels.length = 0
+    }
     return { byteLength, definitionLevels, numNulls }
@@ -150,0 +158,0 @@ }
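
The rewritten null count makes a single pass over the definition levels instead of allocating a filtered copy: a value is non-null exactly when its definition level equals the maximum. The same logic on made-up levels:

// Made-up definition levels; 1 is the max definition level here
const definitionLevels = [1, 0, 1, 1, 0]
const maxDefinitionLevel = 1
let numNulls = definitionLevels.length
for (const def of definitionLevels) {
  if (def === maxDefinitionLevel) numNulls--
}
console.log(numNulls) // 2

Clearing definitionLevels when numNulls is zero lets later code skip null handling entirely for pages with no nulls.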
@@ -209,3 +209,3 @@ import { ParquetEncoding, ParquetType } from './constants.js'
   while (seen < count) {
-    const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)
+    const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, 1)
     if (!rleValues.length) break // EOF
@@ -224,9 +224,9 @@ value.push(...rleValues)
  * Read values from a run-length encoded/bit-packed hybrid encoding.
- * If length is not specified, then a 32-bit int is read first to grab the
- * length of the encoded data.
+ *
+ * If length is zero, the length is read as an int32 at the start of the encoded data.
  *
  * @param {DataView} dataView - buffer to read data from
  * @param {number} offset - offset to start reading from the DataView
  * @param {number} width - width of each bit-packed group
- * @param {number | undefined} length - length of the encoded data
+ * @param {number} length - length of the encoded data
  * @param {number} numValues - number of values to read
@@ -233,0 +233,0 @@ * @returns {Decoded<number[]>} array of rle/bit-packed values
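
For context on the readRleBitPackedHybrid call above: in parquet's hybrid encoding each run begins with a ULEB128 header whose low bit selects the run type, so the loop can request a single value per call and still consume whole runs until count is reached. A simplified header decoder, independent of the package's implementation and assuming the header fits in one byte:

// Simplified decode of one hybrid run header (assumes a one-byte header)
function runHeader(byte) {
  if (byte & 1) return { type: 'bit-packed', values: (byte >> 1) * 8 } // groups of 8 values
  return { type: 'rle', count: byte >> 1 } // one value repeated count times
}
console.log(runHeader(0b0110)) // { type: 'rle', count: 3 }
console.log(runHeader(0b0011)) // { type: 'bit-packed', values: 8 }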
@@ -59,3 +59,3 @@ export { AsyncBuffer, FileMetaData, SchemaTree } from './types'
  */
-export function parquetSchema(metadata: SchemaElement[]): SchemaTree
+export function parquetSchema(metadata: FileMetaData): SchemaTree
@@ -62,0 +62,0 @@ /**
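
This is a breaking signature change: parquetSchema now takes the whole FileMetaData object rather than its schema element array. A before/after sketch, assuming only the exported function shown above:

import { parquetSchema } from 'hyparquet'

// before (0.2.1), callers passed the schema array: parquetSchema(metadata.schema)
// after (0.2.2), they pass the FileMetaData object itself:
function schemaTreeOf(metadata) {
  return parquetSchema(metadata)
}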
@@ -116,2 +116,3 @@
     const columnBytes = columnEndByte - columnStartByte
+    // skip columns larger than 1gb
@@ -122,2 +123,3 @@ if (columnBytes > 1 << 30) {
     }
+    // use pre-loaded row group byte data if available, else read column data
@@ -132,2 +134,3 @@ let buffer
     }
+    // read column data async
@@ -134,0 +137,0 @@ promises.push(buffer.then(arrayBuffer => {
@@ -84,2 +84,13 @@ /**
   TIMESTAMP_MICROS = 10,
+  UINT_8 = 11,
+  UINT_16 = 12,
+  UINT_32 = 13,
+  UINT_64 = 14,
+  INT_8 = 15,
+  INT_16 = 16,
+  INT_32 = 17,
+  INT_64 = 18,
+  JSON = 19,
+  BSON = 20,
+  INTERVAL = 21,
 }
@@ -86,0 +97,0 @@