Comparing version 0.7.4 to 0.7.5

package.json

 {
   "name": "hyparquet",
-  "version": "0.7.4",
+  "version": "0.7.5",
   "description": "parquet file parser for javascript",
@@ -23,3 +23,3 @@ "keywords": [
   "scripts": {
-    "coverage": "vitest run --coverage",
+    "coverage": "vitest run --coverage --coverage.include=src",
     "demo": "http-server -o",
@@ -31,5 +31,5 @@ "lint": "eslint .",
   "devDependencies": {
-    "@types/node": "20.12.5",
-    "@typescript-eslint/eslint-plugin": "7.5.0",
-    "@vitest/coverage-v8": "1.4.0",
+    "@types/node": "20.12.7",
+    "@typescript-eslint/eslint-plugin": "7.7.0",
+    "@vitest/coverage-v8": "1.5.0",
     "eslint": "8.57.0",
@@ -40,5 +40,5 @@ "eslint-plugin-import": "2.29.1",
     "hysnappy": "0.3.0",
-    "typescript": "5.4.4",
-    "vitest": "1.4.0"
+    "typescript": "5.4.5",
+    "vitest": "1.5.0"
   }
 }
README.md

@@ -23,3 +23,3 @@ # hyparquet
 3. **Dependency-free**: Hyparquet has zero dependencies, making it lightweight and easy to install and use in any JavaScript project.
-4. **TypeScript support**: The library is written in typed js code and provides TypeScript type definitions out of the box.
+4. **TypeScript support**: The library is written in jsdoc-typed JavaScript and provides TypeScript definitions out of the box.
 5. **Flexible data access**: Hyparquet allows you to read specific subsets of data by specifying row and column ranges, giving fine-grained control over what data is fetched and loaded.
@@ -29,3 +29,3 @@
-- Designed to work with huge ML datasets (things like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata))
+- Designed to work with huge ML datasets (like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata))
 - Can load metadata separately from data
@@ -38,3 +38,3 @@ - Data can be filtered by row and column ranges
-Why make a new parquet parser in javascript?
+Why make a new parquet parser?
 First, existing libraries like [parquetjs](https://github.com/ironSource/parquetjs) are officially "inactive".
@@ -52,8 +52,2 @@ Importantly, they do not support the kind of stream processing needed to make a really performant parser in the browser.
-## Installation
-```bash
-npm install hyparquet
-```
 ## Usage
@@ -106,7 +100,53 @@
 ## Filtering
 To read large parquet files, it is recommended that you filter by row and column.
+Hyparquet is designed to load only the minimal amount of data needed to fulfill a query.
+You can filter rows by number, or columns by name:
+```js
+import { parquetRead } from 'hyparquet'
+await parquetRead({
+  file,
+  columns: ['colA', 'colB'], // include columns colA and colB
+  rowStart: 100,
+  rowEnd: 200,
+  onComplete: data => console.log(data),
+})
+```
 ## Async
-Hyparquet supports asynchronous fetching of parquet files, over a network.
+Hyparquet supports asynchronous fetching of parquet files over a network.
 You can provide an `AsyncBuffer`, which is like a js `ArrayBuffer` but where the `slice` method returns `Promise<ArrayBuffer>`.
+```typescript
+interface AsyncBuffer {
+  byteLength: number
+  slice(start: number, end?: number): Promise<ArrayBuffer>
+}
+```
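Reviewer note: any object with those two members satisfies the interface. For data already in memory, a wrapper is tiny — a minimal sketch (the helper name `asyncBufferFrom` is illustrative, not part of the hyparquet API):

```js
// Hypothetical helper: adapt an in-memory ArrayBuffer to the AsyncBuffer interface
function asyncBufferFrom(arrayBuffer) {
  return {
    byteLength: arrayBuffer.byteLength,
    // ArrayBuffer.slice is synchronous; marking it async yields Promise<ArrayBuffer>
    slice: async (start, end) => arrayBuffer.slice(start, end),
  }
}
```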
+You can read parquet files asynchronously using HTTP Range requests so that only the necessary byte ranges from a `url` will be fetched:
+```js
+import { parquetRead } from 'hyparquet'
+const url = 'https://...'
+await parquetRead({
+  file: { // AsyncBuffer
+    byteLength,
+    async slice(start, end) {
+      const headers = new Headers()
+      headers.set('Range', `bytes=${start}-${end - 1}`)
+      const res = await fetch(url, { headers })
+      if (!res.ok || !res.body) throw new Error('fetch failed')
+      return readableStreamToArrayBuffer(res.body)
+    },
+  },
+  onComplete: data => console.log(data),
+})
+```
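Reviewer note: the example calls `readableStreamToArrayBuffer`, which is not defined in this hunk. One possible implementation, assuming the stream yields `Uint8Array` chunks (as `fetch` response bodies do):

```js
// Sketch: concatenate a ReadableStream's Uint8Array chunks into one ArrayBuffer
async function readableStreamToArrayBuffer(stream) {
  const chunks = []
  let total = 0
  const reader = stream.getReader()
  while (true) {
    const { done, value } = await reader.read()
    if (done) break
    chunks.push(value)
    total += value.byteLength
  }
  // copy chunks into a single contiguous buffer
  const out = new Uint8Array(total)
  let offset = 0
  for (const chunk of chunks) {
    out.set(chunk, offset)
    offset += chunk.byteLength
  }
  return out.buffer
}
```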
 ## Supported Parquet Files
@@ -120,13 +160,3 @@
-You can extend support for parquet files with other compression codec using the `compressors` option.
-```js
-import { gunzipSync } from 'zlib'
-parquetRead({ file, compressors: {
-  // add gzip support:
-  GZIP: (input, output) => output.set(gunzipSync(input)),
-}})
-```
-Compression:
+Parquet compression types supported by default:
 - [X] Uncompressed
@@ -141,10 +171,34 @@ - [X] Snappy
 Page Type:
 - [X] Data Page
 - [ ] Index Page
 - [X] Dictionary Page
 - [X] Data Page V2
+You can extend support for other compression codecs using the `compressors` option.
 Contributions are welcome!
+```js
+import { parquetRead } from 'hyparquet'
+import { gunzipSync } from 'zlib'
+parquetRead({ file, compressors: {
+  GZIP: (input, output) => output.set(gunzipSync(input)), // add gzip support
+}})
+```
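Reviewer note: `zlib` is Node-only. In a browser, a pure-js gzip library would be needed; a hedged sketch using `fflate` (an assumption — it is not a dependency of this package), whose `gunzipSync` takes and returns a `Uint8Array`:

```js
import { parquetRead } from 'hyparquet'
import { gunzipSync } from 'fflate' // hypothetical swap-in; works in browsers

parquetRead({ file, compressors: {
  // fflate's gunzipSync decompresses a Uint8Array synchronously
  GZIP: (input, output) => output.set(gunzipSync(input)),
}})
```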
+## Hysnappy
+The most common compression codec used in parquet is snappy.
+Hyparquet includes a built-in snappy decompressor written in javascript.
+We developed [hysnappy](https://github.com/hyparam/hysnappy) to make parquet parsing even faster.
+Hysnappy is a snappy decompression codec written in C, compiled to WASM.
+To use hysnappy for faster parsing of large parquet files, override the `SNAPPY` compressor for hyparquet:
+```js
+import { parquetRead } from 'hyparquet'
+import { snappyUncompressor } from 'hysnappy'
+parquetRead({ file, compressors: {
+  SNAPPY: snappyUncompressor(),
+}})
+```
+Parsing a [420mb wikipedia parquet file](https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet) using hysnappy reduces parsing time by 40% (4.1s to 2.3s).
 ## References
@@ -155,4 +209,6 @@
 - https://github.com/apache/thrift
 - https://github.com/apache/arrow
 - https://github.com/dask/fastparquet
+- https://github.com/google/snappy
 - https://github.com/ironSource/parquetjs
+- https://github.com/zhipeng-jia/snappyjs
src/metadata.js

@@ -32,7 +32,8 @@ import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js'
 export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
-  if (!asyncBuffer) throw new Error('parquet asyncBuffer is required')
+  if (!asyncBuffer) throw new Error('parquet file is required')
+  if (!(asyncBuffer.byteLength >= 0)) throw new Error('parquet file byteLength is required')
   // fetch last bytes (footer) of the file
   const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
-  const footerBuffer = await asyncBuffer.slice(footerOffset)
+  const footerBuffer = await asyncBuffer.slice(footerOffset, asyncBuffer.byteLength)
@@ -76,5 +77,3 @@ // Check for parquet magic number "PAR1"
 export function parquetMetadata(arrayBuffer) {
-  if (!arrayBuffer) throw new Error('parquet arrayBuffer is required')
-  // DataView for easier manipulation of the buffer
+  if (!arrayBuffer) throw new Error('parquet file is required')
   const view = new DataView(arrayBuffer)
@@ -102,3 +101,3 @@
-  // Parse parquet metadata from thrift data
+  // Parse metadata from thrift data
   const version = metadata.field_1
@@ -105,0 +104,0 @@ const schema = metadata.field_2.map((/** @type {any} */ field) => ({
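Reviewer note: the new `byteLength` guard rejects file objects with no length up front, and the footer `slice` now passes an explicit end. For context, a minimal sketch of calling `parquetMetadataAsync` with a file-backed AsyncBuffer (the wrapper is illustrative, not from this diff):

```js
import { parquetMetadataAsync } from 'hyparquet'
import { promises as fs } from 'fs'

// Illustrative wrapper: expose a local file as an AsyncBuffer
const handle = await fs.open('example.parquet', 'r')
const { size: byteLength } = await handle.stat()
const file = {
  byteLength, // required by the new guard in parquetMetadataAsync
  async slice(start, end = byteLength) {
    const length = end - start
    const buffer = Buffer.alloc(length)
    await handle.read(buffer, 0, length, start)
    // return a standalone ArrayBuffer for just this range
    return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + length)
  },
}
const metadata = await parquetMetadataAsync(file) // fetches only the footer
```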
src/read.js

@@ -54,3 +54,3 @@
   // read row group
-  const groupData = await readRowGroup(options, rowGroup)
+  const groupData = await readRowGroup(options, rowGroup, groupStart)
   if (onComplete) {
@@ -82,5 +82,6 @@ // filter to rows in range
  * @param {RowGroup} rowGroup row group to read
+ * @param {number} groupStart row index of the first row in the group
  * @returns {Promise<any[][]>} resolves to row data
  */
-async function readRowGroup(options, rowGroup) {
+async function readRowGroup(options, rowGroup, groupStart) {
   const { file, metadata, columns, compressors } = options
@@ -206,4 +207,9 @@ if (!metadata) throw new Error('parquet metadata not found')
   // notify caller of column data
-  if (options.onChunk) options.onChunk({ columnName, columnData, rowStart: 0, rowEnd: columnData.length })
-  // add column data to group data only if onComplete is defined
+  options.onChunk?.({
+    columnName,
+    columnData,
+    rowStart: groupStart,
+    rowEnd: groupStart + columnData.length,
+  })
+  // add column data to group data only if onComplete is defined
   if (options.onComplete) addColumn(groupData, outputColumnIndex, columnData)
@@ -210,0 +216,0 @@ outputColumnIndex++
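Reviewer note: with `groupStart` threaded through, `onChunk` now reports file-absolute row offsets instead of restarting at 0 for every row group. A sketch of a consumer (assuming `file` is any prepared ArrayBuffer or AsyncBuffer):

```js
import { parquetRead } from 'hyparquet'

await parquetRead({
  file, // assumed: an ArrayBuffer or AsyncBuffer prepared as above
  onChunk({ columnName, columnData, rowStart, rowEnd }) {
    // rowStart/rowEnd are absolute, so chunks from later row groups
    // can be placed without tracking group boundaries by hand
    console.log(`${columnName}: rows ${rowStart}..${rowEnd}`)
  },
})
```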