Comparing version 0.1.2 to 0.1.3
{ | ||
"name": "hyparquet", | ||
"version": "0.1.2", | ||
"version": "0.1.3", | ||
"description": "parquet file parser for javascript", | ||
@@ -30,4 +30,4 @@ "keywords": [ | ||
"devDependencies": { | ||
"@types/node": "20.10.6", | ||
"@typescript-eslint/eslint-plugin": "6.18.0", | ||
"@types/node": "20.10.8", | ||
"@typescript-eslint/eslint-plugin": "6.18.1", | ||
"@vitest/coverage-v8": "1.1.3", | ||
@@ -34,0 +34,0 @@ "eslint": "8.56.0", |
@@ -15,4 +15,13 @@ # hyparquet | ||
## Usage | ||
## Features | ||
- Designed to work with huge ML datasets (things like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata)) | ||
- Loads metadata separately from data | ||
- Data can be filtered by row and column ranges | ||
- Only fetches the data needed | ||
- Fast data loading for large scale ML applications | ||
- Bring data visualization closer to the user, in the browser | ||
## Installation | ||
```bash | ||
@@ -22,8 +31,29 @@ npm install hyparquet | ||
## Usage | ||
In a Node.js environment, you can load a parquet file as shown in the following example: | ||
```js | ||
const { parquetMetadata } = await import('hyparquet') | ||
const fs = await import('fs') | ||
const buffer = fs.readFileSync('example.parquet') | ||
const arrayBuffer = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) | ||
const metadata = parquetMetadata(arrayBuffer) | ||
``` | ||
In a browser environment, you'll probably get parquet file data either from a file that the user drags and drops, or by downloading it from the web. | ||
To load parquet data in the browser from a remote server using `fetch`: | ||
```js | ||
import { parquetMetadata } from 'hyparquet' | ||
const metadata = parquetMetdata(arrayBuffer) | ||
const res = await fetch(url) | ||
const arrayBuffer = await res.arrayBuffer() | ||
const metadata = parquetMetadata(arrayBuffer) | ||
``` | ||
To parse parquet files from a user drag-and-drop action, see the example in [index.html](index.html). | ||
## References | ||
@@ -30,0 +60,0 @@ |
@@ -311,3 +311,4 @@ import { ParquetEncoding, ParquetType } from './constants.js' | ||
function readBitPacked(dataView, offset, header, bitWidth, remaining) { | ||
let count = (header >> 1) * 8 | ||
// extract number of values to read from header | ||
let count = (header >> 1) << 3 | ||
const mask = maskForBits(bitWidth) | ||
@@ -322,3 +323,5 @@ | ||
// read values | ||
while (count) { | ||
// if we have crossed a byte boundary, shift the data | ||
if (right > 8) { | ||
@@ -329,10 +332,12 @@ right -= 8 | ||
} else if (left - right < bitWidth) { | ||
// read next byte | ||
data |= (dataView.getUint8(offset + byteLength) << left) | ||
// if we don't have bitWidth number of bits to read, read next byte | ||
data |= dataView.getUint8(offset + byteLength) << left | ||
byteLength++ | ||
left += 8 | ||
} else { | ||
// don't write more than num rows | ||
// otherwise, read bitWidth number of bits | ||
// don't write more than remaining number of rows | ||
// even if there are still bits to read | ||
if (remaining > 0) { | ||
// emit value | ||
// emit value by shifting off to the right and masking | ||
value.push((data >> right) & mask) | ||
@@ -346,2 +351,3 @@ remaining-- | ||
// return values and number of bytes read | ||
return { value, byteLength } | ||
@@ -348,0 +354,0 @@ } |
@@ -0,1 +1,3 @@ | ||
export { FileMetaData } from './types' | ||
/** | ||
@@ -2,0 +4,0 @@ * Read parquet data rows from a file |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
55382
16
1562
64