hyparquet - npm Package Compare versions

Comparing version 0.7.2 to 0.7.3

package.json

		{
		"name": "hyparquet",
		"version": "0.7.2",
		"version": "0.7.3",
		"description": "parquet file parser for javascript",
		@@ -30,13 +30,13 @@ "keywords": [
		"devDependencies": {
		"@types/node": "20.11.30",
		"@typescript-eslint/eslint-plugin": "7.3.1",
		"@types/node": "20.12.5",
		"@typescript-eslint/eslint-plugin": "7.5.0",
		"@vitest/coverage-v8": "1.4.0",
		"eslint": "8.57.0",
		"eslint-plugin-import": "2.29.1",
		"eslint-plugin-jsdoc": "48.2.1",
		"eslint-plugin-jsdoc": "48.2.3",
		"http-server": "14.1.1",
		"hysnappy": "0.3.0",
		"typescript": "5.4.2",
		"typescript": "5.4.4",
		"vitest": "1.4.0"
		}
		}

README.md

		@@ -10,8 +10,18 @@ # hyparquet

		JavaScript parser for [Apache Parquet](https://parquet.apache.org) files.
		Dependency free since 2023!

		Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval.
		## What is hyparquet?

		Dependency free since 2023!
		Hyparquet is a lightweight, pure JavaScript library for parsing [Apache Parquet](https://parquet.apache.org) files. Apache Parquet is a popular columnar storage format that is widely used in data engineering, data science, and machine learning applications for efficiently storing and processing large datasets.

		Hyparquet allows you to read and extract data from Parquet files directly in JavaScript environments, both in Node.js and in the browser. It is designed to be fast, memory-efficient, and easy to use.

		## Why hyparquet?

		1. Performant: Designed to efficiently process large datasets by only loading the required data, making it suitable for big data and machine learning applications.
		2. Browser-native: Built to work seamlessly in the browser, opening up new possibilities for web-based data applications and visualizations.
		3. Dependency-free: Hyparquet has zero dependencies, making it lightweight and easy to install and use in any JavaScript project.
		4. TypeScript support: The library is written in JavaScript with JSDoc type annotations and provides TypeScript type definitions out of the box.
		5. Flexible data access: Hyparquet allows you to read specific subsets of data by specifying row and column ranges, giving fine-grained control over what data is fetched and loaded.

		## Features
		@@ -48,2 +58,8 @@

		Install the hyparquet package from npm:

		```bash
		npm install hyparquet
		```

		If you're in a Node.js environment, you can load a parquet file with the following example:
		@@ -74,2 +90,16 @@

		## Reading Data

		To read the entire contents of a parquet file in a browser environment:

		```js
		const { parquetRead } = await import("https://cdn.jsdelivr.net/npm/hyparquet/src/hyparquet.min.js")
		const res = await fetch(url)
		const arrayBuffer = await res.arrayBuffer()
		await parquetRead({
		file: arrayBuffer,
		onComplete: data => console.log(data)
		})
		```

		## Async
		@@ -82,3 +112,4 @@

		The parquet format supports a number of different compression and encoding types.
		The parquet format is known to be sprawling: it includes options for a wide array of compression schemes, encoding types, and data structures.

		Hyparquet does not support 100% of parquet files.
		@@ -85,0 +116,0 @@ Supporting every possible compression codec available in parquet would blow up the size of the hyparquet library.

src/assemble.js

		/**
		* Dremel-assembly of arrays of values into lists
		*
		* Reconstructs a complex nested structure from flat arrays of definition and repetition levels,
		* according to Dremel encoding. This simplified version focuses on arrays and scalar values,
		* with optional support for null values.
		*
		* @param {number[] | undefined} definitionLevels definition levels, max 3
		@@ -9,50 +13,76 @@ * @param {number[]} repetitionLevels repetition levels, max 1
		* @param {number} maxDefinitionLevel definition level that corresponds to non-null
		* @param {number} maxRepetitionLevel repetition level that corresponds to a new row
		* @returns {any[]} array of values
		*/
		export function assembleObjects(
		definitionLevels, repetitionLevels, values, isNull, maxDefinitionLevel
		definitionLevels, repetitionLevels, values, isNull, maxDefinitionLevel, maxRepetitionLevel
		) {
		let valueIndex = 0
		let started = false
		let haveNull = false
		let outputIndex = 0
		let part = []
		/** @type {any[]} */
		const output = []
		let currentContainer = output

		for (let counter = 0; counter < repetitionLevels.length; counter++) {
		const def = definitionLevels?.length ? definitionLevels[counter] : maxDefinitionLevel
		const rep = repetitionLevels[counter]
		// Trackers for nested structures.
		const containerStack = [output]

		if (!rep) {
		// new row - save what we have
		if (started) {
		output[outputIndex] = haveNull ? undefined : part
		part = []
		outputIndex++
		} else {
		// first time: no row to save yet, unless it's a row continued from previous page
		if (valueIndex > 0) {
		output[outputIndex - 1] = output[outputIndex - 1]?.concat(part) // add items to previous row
		part = []
		// don't increment i since we only filled i-1
		for (let i = 0; i < repetitionLevels.length; i++) {
		const def = definitionLevels?.length ? definitionLevels[i] : maxDefinitionLevel
		const rep = repetitionLevels[i]

		if (rep !== maxRepetitionLevel) {
		// Move back to the parent container
		while (rep < containerStack.length - 1) {
		containerStack.pop()
		}
		// Construct new lists up to max repetition level
		// @ts-expect-error won't be empty
		currentContainer = containerStack.at(-1)
		if (def) {
		for (let j = rep; j < maxRepetitionLevel; j++) {
		/** @type {any[]} */
		const newList = []
		currentContainer.push(newList)
		currentContainer = newList
		containerStack.push(newList)
		}
		started = true
		}
		}

		// Add value or null based on definition level
		if (def === maxDefinitionLevel) {
		// append real value to current item
		part.push(values[valueIndex])
		valueIndex++
		} else if (def > 0) {
		// append null to current item
		part.push(undefined)
		if (!currentContainer) {
		throw new Error('parquet assembleObjects: currentContainer is undefined')
		}
		currentContainer.push(values[valueIndex++])
		} else if (isNull) {
		if (def) {
		// TODO: Go up maxDefinitionLevel - def - 1 levels to add null
		for (let j = def; j < maxDefinitionLevel - 1; j++) {
		containerStack.pop()
		// @ts-expect-error won't be empty
		currentContainer = containerStack.at(-1)
		}
		if (def > 1) {
		currentContainer.push(undefined)
		}
		} else {
		currentContainer.push(undefined)
		}
		}

		haveNull = def === 0 && isNull
		}

		if (started) {
		output[outputIndex] = haveNull ? undefined : part
		// Handle edge cases for empty inputs or single-level data
		if (output.length === 0) {
		if (values.length > 0 && maxRepetitionLevel === 0) {
		// All values belong to the same (root) list
		return [values]
		}
		// return max definition level of nested lists
		/** @type {any[]} */
		for (let i = 0; i < maxDefinitionLevel; i++) {
		/** @type {any[]} */
		const newList = []
		currentContainer.push(newList)
		currentContainer = newList
		}
		}
		@@ -62,1 +92,3 @@
		}

		// TODO: depends on prior def level

src/column.js

		@@ -7,3 +7,3 @@ import { assembleObjects } from './assemble.js'
		import { parquetHeader } from './header.js'
		import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
		import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, schemaElement } from './schema.js'
		import { snappyUncompress } from './snappy.js'
		@@ -34,3 +34,4 @@
		let byteOffset = 0 // byteOffset within the column
		const rowData = []
		/** @type {any[]} */
		let rowData = []

		@@ -72,4 +73,5 @@ while (valuesSeen < rowGroup.num_rows) {
		const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
		const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema)
		values = assembleObjects(
		definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel
		definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel, maxRepetitionLevel
		)
		@@ -97,3 +99,3 @@ } else if (definitionLevels?.length) {

		rowData.push(...values)
		rowData = rowData.concat(values)
		} else if (header.type === PageType.DICTIONARY_PAGE) {
		@@ -117,7 +119,8 @@ const diph = header.dictionary_page_header
		const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
		const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema)
		if (repetitionLevels.length) {
		dereferenceDictionary(dictionary, dataPage)
		// Use repetition levels to construct lists
		rowData.push(...assembleObjects(
		definitionLevels, repetitionLevels, dataPage, true, maxDefinitionLevel
		rowData = rowData.concat(assembleObjects(
		definitionLevels, repetitionLevels, dataPage, true, maxDefinitionLevel, maxRepetitionLevel
		))
		@@ -130,3 +133,3 @@ } else if (daph2.num_nulls) {
		dereferenceDictionary(dictionary, dataPage)
		rowData.push(...dataPage)
		rowData = rowData.concat(dataPage)
		}
		@@ -133,0 +136,0 @@ // TODO: convert?

src/encoding.js

		@@ -214,3 +214,4 @@ import { readVarInt } from './thrift.js'
		export function readData(dataView, encoding, offset, count, bitWidth) {
		const value = []
		/** @type {any[]} */
		let value = []
		let byteLength = 0
		@@ -222,3 +223,3 @@ if (encoding === 'RLE') {
		if (!rle.value.length) break // EOF
		value.push(...rle.value)
		value = value.concat(rle.value)
		seen += rle.value.length
		@@ -252,3 +253,4 @@ byteLength += rle.byteLength
		}
		const value = []
		/** @type {number[]} */
		let value = []
		const startByteLength = byteLength
		@@ -261,3 +263,3 @@ while (byteLength - startByteLength < length && value.length < numValues) {
		const rle = readRle(dataView, offset + byteLength, header, width)
		value.push(...rle.value)
		value = value.concat(rle.value)
		byteLength += rle.byteLength
		@@ -269,3 +271,3 @@ } else {
		)
		value.push(...bitPacked.value)
		value = value.concat(bitPacked.value)
		byteLength += bitPacked.byteLength
		@@ -272,0 +274,0 @@ }

src/metadata.js

		@@ -32,2 +32,4 @@ import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js'
		export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
		if (!asyncBuffer) throw new Error('parquet asyncBuffer is required')

		// fetch last bytes (footer) of the file
		@@ -68,3 +70,3 @@ const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
		/**
		* Read parquet metadata from a buffer
		* Read parquet metadata from a buffer synchronously.
		*
		@@ -75,2 +77,4 @@ * @param {ArrayBuffer} arrayBuffer parquet file contents
		export function parquetMetadata(arrayBuffer) {
		if (!arrayBuffer) throw new Error('parquet arrayBuffer is required')

		// DataView for easier manipulation of the buffer
		@@ -77,0 +81,0 @@ const view = new DataView(arrayBuffer)

src/read.js

		@@ -33,2 +33,4 @@
		export async function parquetRead(options) {
		if (!options.file) throw new Error('parquet file is required')

		// load metadata if not provided
		@@ -39,6 +41,6 @@ options.metadata \|\|= await parquetMetadataAsync(options.file)
		const { metadata, onComplete } = options
		/** @type {any[][]} */
		const rowData = []
		const rowStart = options.rowStart \|\| 0
		const rowEnd = options.rowEnd \|\| Number(metadata.num_rows)
		/** @type {any[][]} */
		let rowData = []

		@@ -58,3 +60,3 @@ // find which row groups to read
		const end = Math.min(rowEnd - groupStart, groupRows)
		rowData.push(...groupData.slice(start, end))
		rowData = rowData.concat(groupData.slice(start, end))
		}
		@@ -178,7 +180,12 @@ }
		// keys will be empty for {} and undefined for null
		if (keys[i] !== undefined) {
		if (keys[i]) {
		/** @type {Record<string, any>} */
		const obj = {}
		for (let j = 0; j < keys[i].length; j++) {
		if (keys[i][j] === undefined) continue
		if (Array.isArray(keys[i][j])) {
		// TODO: key should not be an array, this is an assemble bug
		keys[i][j] = keys[i][j][0]
		values[i][j] = values[i][j][0]
		}
		if (!keys[i][j]) continue
		obj[keys[i][j]] = values[i][j] === undefined ? null : values[i][j]
		@@ -185,0 +192,0 @@ }

hyparquet - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics