Huge News! Announcing our $40M Series B, led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

hyparquet

Package Overview
Dependencies
Maintainers
1
Versions
58
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

hyparquet - npm Package Compare versions

Comparing version 0.1.6 to 0.2.0

src/read.js

2

package.json
{
"name": "hyparquet",
"version": "0.1.6",
"version": "0.2.0",
"description": "parquet file parser for javascript",

@@ -5,0 +5,0 @@ "keywords": [

@@ -19,5 +19,6 @@ # hyparquet

- Designed to work with huge ML datasets (things like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata))
- Loads metadata separately from data
- Can load metadata separately from data
- Data can be filtered by row and column ranges
- Only fetches the data needed
- Written in JavaScript, checked with TypeScript
- Fast data loading for large scale ML applications

@@ -54,3 +55,3 @@ - Bring data visualization closer to the user, in the browser

const buffer = fs.readFileSync('example.parquet')
const arrayBuffer = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
const arrayBuffer = new Uint8Array(buffer).buffer
const metadata = parquetMetadata(arrayBuffer)

@@ -57,0 +58,0 @@ ```

@@ -0,1 +1,2 @@

import { CompressionCodec, Encoding, PageType } from './constants.js'
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'

@@ -5,3 +6,2 @@ import { parquetHeader } from './header.js'

import { snappyUncompress } from './snappy.js'
import { CompressionCodec, Encoding, PageType } from './types.js'

@@ -8,0 +8,0 @@ /**

@@ -29,1 +29,31 @@ export const ParquetType = {

}
// Compression codec identifiers for parquet column chunks.
// Values mirror the CompressionCodec enum in the parquet-format Thrift
// definition — TODO confirm against parquet.thrift before changing.
export const CompressionCodec = {
UNCOMPRESSED: 0,
SNAPPY: 1,
GZIP: 2,
LZO: 3,
BROTLI: 4,
LZ4: 5,
ZSTD: 6,
LZ4_RAW: 7,
}
// Page type identifiers found in parquet page headers.
// Values mirror the PageType enum in the parquet-format Thrift
// definition — TODO confirm against parquet.thrift before changing.
export const PageType = {
DATA_PAGE: 0,
INDEX_PAGE: 1,
DICTIONARY_PAGE: 2,
DATA_PAGE_V2: 3,
}
// Data page encoding identifiers.
// Values mirror the Encoding enum in the parquet-format Thrift definition;
// note the gap at 1 (GROUP_VAR_INT was removed from the spec) —
// TODO confirm against parquet.thrift before changing.
export const Encoding = {
PLAIN: 0,
PLAIN_DICTIONARY: 2,
RLE: 3,
BIT_PACKED: 4, // deprecated
DELTA_BINARY_PACKED: 5,
DELTA_LENGTH_BYTE_ARRAY: 6,
DELTA_BYTE_ARRAY: 7,
RLE_DICTIONARY: 8,
BYTE_STREAM_SPLIT: 9,
}

@@ -0,4 +1,4 @@

import { Encoding, ParquetType } from './constants.js'
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js'
import { Encoding, ParquetType } from './types.js'

@@ -5,0 +5,0 @@ const skipNulls = false // TODO

export { AsyncBuffer, FileMetaData } from './types'
/**
* Read parquet data rows from a file
* Read parquet data rows from a file-like object.
* Reads the minimal number of row groups and columns to satisfy the request.
*
* @param {ArrayBuffer} arrayBuffer parquet file contents
* @returns {any[][]} row data
* Returns a void promise when complete, and to throw errors.
* Data is returned in onComplete, not the return promise, because
* if onComplete is undefined, we parse the data, and emit chunks, but skip
* computing the row view directly. This saves on allocation if the caller
* wants to cache the full chunks, and make their own view of the data from
* the chunks.
*
* @param {object} options read options
* @param {AsyncBuffer} options.file file-like object containing parquet data
* @param {FileMetaData} [options.metadata] parquet file metadata
* @param {number[]} [options.columns] columns to read, all columns if undefined
* @param {number} [options.rowStart] first requested row index (inclusive)
* @param {number} [options.rowEnd] last requested row index (exclusive)
* @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
* @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed
* @returns {Promise<void>} resolves when all requested rows and columns are parsed
*/
export function parquetRead(arrayBuffer: ArrayBuffer): any[][]
export async function parquetRead(options: ParquetReadOptions): Promise<void>

@@ -57,1 +72,24 @@ /**

export function toJson(obj: any): unknown
/**
 * Parquet query options for reading data.
 *
 * Passed to parquetRead to select which rows and columns to parse and to
 * receive parsed data via the onChunk / onComplete callbacks.
 */
export interface ParquetReadOptions {
file: AsyncBuffer // file-like object containing parquet data
metadata?: FileMetaData // parquet metadata, will be parsed if not provided
columns?: number[] // columns to read, all columns if undefined
rowStart?: number // first requested row index, inclusive
rowEnd?: number // last requested row index, exclusive
onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range.
onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
}
/**
 * A run of column data emitted to the onChunk callback.
 *
 * NOTE(review): rowStart/rowEnd presumably describe the row range covered by
 * `data` within the file, and may extend beyond the requested read range —
 * confirm against the reader implementation.
 */
export interface ColumnData {
column: number // column index within the file schema
data: ArrayLike<any> // decoded values for this run
rowStart: number // first row index covered, inclusive — TODO confirm
rowEnd: number // last row index covered, exclusive — TODO confirm
}
import { parquetMetadata, parquetMetadataAsync } from './metadata.js'
export { parquetMetadata, parquetMetadataAsync }
import { parquetRead } from './read.js'
export { parquetRead }
import { snappyUncompress } from './snappy.js'

@@ -9,12 +12,1 @@ export { snappyUncompress }

export { toJson }
/**
 * Read parquet data rows from a buffer.
 *
 * NOTE(review): this is the pre-0.2.0 synchronous stub as shown in the diff;
 * it parses the file metadata and then throws, since row reading was not yet
 * implemented in this version. The surrounding hunk markers suggest some
 * removed lines are not shown here — treat the exact body with caution.
 *
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {any[][]} row data
 */
export function parquetRead(arrayBuffer) {
// Parses (and thereby validates) the footer metadata before bailing out.
const metadata = parquetMetadata(arrayBuffer)
throw new Error('not implemented')
}
Socket · SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc