Comparing version 0.7.2 to 0.7.3
{ | ||
"name": "hyparquet", | ||
"version": "0.7.2", | ||
"version": "0.7.3", | ||
"description": "parquet file parser for javascript", | ||
@@ -30,13 +30,13 @@ "keywords": [ | ||
"devDependencies": { | ||
"@types/node": "20.11.30", | ||
"@typescript-eslint/eslint-plugin": "7.3.1", | ||
"@types/node": "20.12.5", | ||
"@typescript-eslint/eslint-plugin": "7.5.0", | ||
"@vitest/coverage-v8": "1.4.0", | ||
"eslint": "8.57.0", | ||
"eslint-plugin-import": "2.29.1", | ||
"eslint-plugin-jsdoc": "48.2.1", | ||
"eslint-plugin-jsdoc": "48.2.3", | ||
"http-server": "14.1.1", | ||
"hysnappy": "0.3.0", | ||
"typescript": "5.4.2", | ||
"typescript": "5.4.4", | ||
"vitest": "1.4.0" | ||
} | ||
} |
@@ -10,8 +10,18 @@ # hyparquet | ||
JavaScript parser for [Apache Parquet](https://parquet.apache.org) files. | ||
Dependency free since 2023! | ||
Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. | ||
## What is hyparquet? | ||
Dependency free since 2023! | ||
Hyparquet is a lightweight, pure JavaScript library for parsing [Apache Parquet](https://parquet.apache.org) files. Apache Parquet is a popular columnar storage format that is widely used in data engineering, data science, and machine learning applications for efficiently storing and processing large datasets. | ||
Hyparquet allows you to read and extract data from Parquet files directly in JavaScript environments, both in Node.js and in the browser. It is designed to be fast, memory-efficient, and easy to use. | ||
## Why hyparquet? | ||
1. **Performant**: Designed to efficiently process large datasets by only loading the required data, making it suitable for big data and machine learning applications. | ||
2. **Browser-native**: Built to work seamlessly in the browser, opening up new possibilities for web-based data applications and visualizations. | ||
3. **Dependency-free**: Hyparquet has zero dependencies, making it lightweight and easy to install and use in any JavaScript project. | ||
4. **TypeScript support**: The library is written in JSDoc-typed JavaScript and provides TypeScript type definitions out of the box. | ||
5. **Flexible data access**: Hyparquet allows you to read specific subsets of data by specifying row and column ranges, giving fine-grained control over what data is fetched and loaded. | ||
## Features | ||
@@ -48,2 +58,8 @@ | ||
Install the hyparquet package from npm: | ||
```bash | ||
npm install hyparquet | ||
``` | ||
If you're in a Node.js environment, you can load a parquet file as shown in the following example: | ||
@@ -74,2 +90,16 @@ | ||
## Reading Data | ||
To read the entire contents of a parquet file in a browser environment: | ||
```js | ||
const { parquetRead } = await import("https://cdn.jsdelivr.net/npm/hyparquet/src/hyparquet.min.js") | ||
const res = await fetch(url) | ||
const arrayBuffer = await res.arrayBuffer() | ||
await parquetRead({ | ||
file: arrayBuffer, | ||
onComplete: data => console.log(data) | ||
}) | ||
``` | ||
## Async | ||
@@ -82,3 +112,4 @@ | ||
The parquet format supports a number of different compression and encoding types. | ||
The parquet format is known to be a sprawling format which includes options for a wide array of compression schemes, encoding types, and data structures. | ||
Hyparquet does not support 100% of all parquet files. | ||
@@ -85,0 +116,0 @@ Supporting every possible compression codec available in parquet would blow up the size of the hyparquet library. |
/** | ||
* Dremel-assembly of arrays of values into lists | ||
* | ||
* Reconstructs a complex nested structure from flat arrays of definition and repetition levels, | ||
* according to Dremel encoding. This simplified version focuses on arrays and scalar values, | ||
* with optional support for null values. | ||
* | ||
* @param {number[] | undefined} definitionLevels definition levels, max 3 | ||
@@ -9,50 +13,76 @@ * @param {number[]} repetitionLevels repetition levels, max 1 | ||
* @param {number} maxDefinitionLevel definition level that corresponds to non-null | ||
* @param {number} maxRepetitionLevel repetition level that corresponds to a new row | ||
* @returns {any[]} array of values | ||
*/ | ||
export function assembleObjects( | ||
definitionLevels, repetitionLevels, values, isNull, maxDefinitionLevel | ||
definitionLevels, repetitionLevels, values, isNull, maxDefinitionLevel, maxRepetitionLevel | ||
) { | ||
let valueIndex = 0 | ||
let started = false | ||
let haveNull = false | ||
let outputIndex = 0 | ||
let part = [] | ||
/** @type {any[]} */ | ||
const output = [] | ||
let currentContainer = output | ||
for (let counter = 0; counter < repetitionLevels.length; counter++) { | ||
const def = definitionLevels?.length ? definitionLevels[counter] : maxDefinitionLevel | ||
const rep = repetitionLevels[counter] | ||
// Trackers for nested structures. | ||
const containerStack = [output] | ||
if (!rep) { | ||
// new row - save what we have | ||
if (started) { | ||
output[outputIndex] = haveNull ? undefined : part | ||
part = [] | ||
outputIndex++ | ||
} else { | ||
// first time: no row to save yet, unless it's a row continued from previous page | ||
if (valueIndex > 0) { | ||
output[outputIndex - 1] = output[outputIndex - 1]?.concat(part) // add items to previous row | ||
part = [] | ||
// don't increment i since we only filled i-1 | ||
for (let i = 0; i < repetitionLevels.length; i++) { | ||
const def = definitionLevels?.length ? definitionLevels[i] : maxDefinitionLevel | ||
const rep = repetitionLevels[i] | ||
if (rep !== maxRepetitionLevel) { | ||
// Move back to the parent container | ||
while (rep < containerStack.length - 1) { | ||
containerStack.pop() | ||
} | ||
// Construct new lists up to max repetition level | ||
// @ts-expect-error won't be empty | ||
currentContainer = containerStack.at(-1) | ||
if (def) { | ||
for (let j = rep; j < maxRepetitionLevel; j++) { | ||
/** @type {any[]} */ | ||
const newList = [] | ||
currentContainer.push(newList) | ||
currentContainer = newList | ||
containerStack.push(newList) | ||
} | ||
started = true | ||
} | ||
} | ||
// Add value or null based on definition level | ||
if (def === maxDefinitionLevel) { | ||
// append real value to current item | ||
part.push(values[valueIndex]) | ||
valueIndex++ | ||
} else if (def > 0) { | ||
// append null to current item | ||
part.push(undefined) | ||
if (!currentContainer) { | ||
throw new Error('parquet assembleObjects: currentContainer is undefined') | ||
} | ||
currentContainer.push(values[valueIndex++]) | ||
} else if (isNull) { | ||
if (def) { | ||
// TODO: Go up maxDefinitionLevel - def - 1 levels to add null | ||
for (let j = def; j < maxDefinitionLevel - 1; j++) { | ||
containerStack.pop() | ||
// @ts-expect-error won't be empty | ||
currentContainer = containerStack.at(-1) | ||
} | ||
if (def > 1) { | ||
currentContainer.push(undefined) | ||
} | ||
} else { | ||
currentContainer.push(undefined) | ||
} | ||
} | ||
haveNull = def === 0 && isNull | ||
} | ||
if (started) { | ||
output[outputIndex] = haveNull ? undefined : part | ||
// Handle edge cases for empty inputs or single-level data | ||
if (output.length === 0) { | ||
if (values.length > 0 && maxRepetitionLevel === 0) { | ||
// All values belong to the same (root) list | ||
return [values] | ||
} | ||
// return max definition level of nested lists | ||
/** @type {any[]} */ | ||
for (let i = 0; i < maxDefinitionLevel; i++) { | ||
/** @type {any[]} */ | ||
const newList = [] | ||
currentContainer.push(newList) | ||
currentContainer = newList | ||
} | ||
} | ||
@@ -62,1 +92,3 @@ | ||
} | ||
// TODO: depends on prior def level |
@@ -7,3 +7,3 @@ import { assembleObjects } from './assemble.js' | ||
import { parquetHeader } from './header.js' | ||
import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, schemaElement } from './schema.js' | ||
import { snappyUncompress } from './snappy.js' | ||
@@ -34,3 +34,4 @@ | ||
let byteOffset = 0 // byteOffset within the column | ||
const rowData = [] | ||
/** @type {any[]} */ | ||
let rowData = [] | ||
@@ -72,4 +73,5 @@ while (valuesSeen < rowGroup.num_rows) { | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) | ||
values = assembleObjects( | ||
definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel | ||
definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel, maxRepetitionLevel | ||
) | ||
@@ -97,3 +99,3 @@ } else if (definitionLevels?.length) { | ||
rowData.push(...values) | ||
rowData = rowData.concat(values) | ||
} else if (header.type === PageType.DICTIONARY_PAGE) { | ||
@@ -117,7 +119,8 @@ const diph = header.dictionary_page_header | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) | ||
if (repetitionLevels.length) { | ||
dereferenceDictionary(dictionary, dataPage) | ||
// Use repetition levels to construct lists | ||
rowData.push(...assembleObjects( | ||
definitionLevels, repetitionLevels, dataPage, true, maxDefinitionLevel | ||
rowData = rowData.concat(assembleObjects( | ||
definitionLevels, repetitionLevels, dataPage, true, maxDefinitionLevel, maxRepetitionLevel | ||
)) | ||
@@ -130,3 +133,3 @@ } else if (daph2.num_nulls) { | ||
dereferenceDictionary(dictionary, dataPage) | ||
rowData.push(...dataPage) | ||
rowData = rowData.concat(dataPage) | ||
} | ||
@@ -133,0 +136,0 @@ // TODO: convert? |
@@ -214,3 +214,4 @@ import { readVarInt } from './thrift.js' | ||
export function readData(dataView, encoding, offset, count, bitWidth) { | ||
const value = [] | ||
/** @type {any[]} */ | ||
let value = [] | ||
let byteLength = 0 | ||
@@ -222,3 +223,3 @@ if (encoding === 'RLE') { | ||
if (!rle.value.length) break // EOF | ||
value.push(...rle.value) | ||
value = value.concat(rle.value) | ||
seen += rle.value.length | ||
@@ -252,3 +253,4 @@ byteLength += rle.byteLength | ||
} | ||
const value = [] | ||
/** @type {number[]} */ | ||
let value = [] | ||
const startByteLength = byteLength | ||
@@ -261,3 +263,3 @@ while (byteLength - startByteLength < length && value.length < numValues) { | ||
const rle = readRle(dataView, offset + byteLength, header, width) | ||
value.push(...rle.value) | ||
value = value.concat(rle.value) | ||
byteLength += rle.byteLength | ||
@@ -269,3 +271,3 @@ } else { | ||
) | ||
value.push(...bitPacked.value) | ||
value = value.concat(bitPacked.value) | ||
byteLength += bitPacked.byteLength | ||
@@ -272,0 +274,0 @@ } |
@@ -32,2 +32,4 @@ import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js' | ||
export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) { | ||
if (!asyncBuffer) throw new Error('parquet asyncBuffer is required') | ||
// fetch last bytes (footer) of the file | ||
@@ -68,3 +70,3 @@ const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize) | ||
/** | ||
* Read parquet metadata from a buffer | ||
* Read parquet metadata from a buffer synchronously. | ||
* | ||
@@ -75,2 +77,4 @@ * @param {ArrayBuffer} arrayBuffer parquet file contents | ||
export function parquetMetadata(arrayBuffer) { | ||
if (!arrayBuffer) throw new Error('parquet arrayBuffer is required') | ||
// DataView for easier manipulation of the buffer | ||
@@ -77,0 +81,0 @@ const view = new DataView(arrayBuffer) |
@@ -33,2 +33,4 @@ | ||
export async function parquetRead(options) { | ||
if (!options.file) throw new Error('parquet file is required') | ||
// load metadata if not provided | ||
@@ -39,6 +41,6 @@ options.metadata ||= await parquetMetadataAsync(options.file) | ||
const { metadata, onComplete } = options | ||
/** @type {any[][]} */ | ||
const rowData = [] | ||
const rowStart = options.rowStart || 0 | ||
const rowEnd = options.rowEnd || Number(metadata.num_rows) | ||
/** @type {any[][]} */ | ||
let rowData = [] | ||
@@ -58,3 +60,3 @@ // find which row groups to read | ||
const end = Math.min(rowEnd - groupStart, groupRows) | ||
rowData.push(...groupData.slice(start, end)) | ||
rowData = rowData.concat(groupData.slice(start, end)) | ||
} | ||
@@ -178,7 +180,12 @@ } | ||
// keys will be empty for {} and undefined for null | ||
if (keys[i] !== undefined) { | ||
if (keys[i]) { | ||
/** @type {Record<string, any>} */ | ||
const obj = {} | ||
for (let j = 0; j < keys[i].length; j++) { | ||
if (keys[i][j] === undefined) continue | ||
if (Array.isArray(keys[i][j])) { | ||
// TODO: key should not be an array, this is an assemble bug | ||
keys[i][j] = keys[i][j][0] | ||
values[i][j] = values[i][j][0] | ||
} | ||
if (!keys[i][j]) continue | ||
obj[keys[i][j]] = values[i][j] === undefined ? null : values[i][j] | ||
@@ -185,0 +192,0 @@ } |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
96868
2530
151