Comparing version 0.1.6 to 0.2.0
{ | ||
"name": "hyparquet", | ||
"version": "0.1.6", | ||
"version": "0.2.0", | ||
"description": "parquet file parser for javascript", | ||
@@ -5,0 +5,0 @@ "keywords": [ |
@@ -19,5 +19,6 @@ # hyparquet | ||
- Designed to work with huge ML datasets (things like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata)) | ||
- Loads metadata separately from data | ||
- Can load metadata separately from data | ||
- Data can be filtered by row and column ranges | ||
- Only fetches the data needed | ||
- Written in JavaScript, checked with TypeScript | ||
- Fast data loading for large scale ML applications | ||
@@ -54,3 +55,3 @@ - Bring data visualization closer to the user, in the browser | ||
const buffer = fs.readFileSync('example.parquet') | ||
const arrayBuffer = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) | ||
const arrayBuffer = new Uint8Array(buffer).buffer | ||
const metadata = parquetMetadata(arrayBuffer) | ||
@@ -57,0 +58,0 @@ ``` |
@@ -0,1 +1,2 @@ | ||
import { CompressionCodec, Encoding, PageType } from './constants.js' | ||
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js' | ||
@@ -5,3 +6,2 @@ import { parquetHeader } from './header.js' | ||
import { snappyUncompress } from './snappy.js' | ||
import { CompressionCodec, Encoding, PageType } from './types.js' | ||
@@ -8,0 +8,0 @@ /** |
@@ -29,1 +29,31 @@ export const ParquetType = { | ||
} | ||
export const CompressionCodec = { | ||
UNCOMPRESSED: 0, | ||
SNAPPY: 1, | ||
GZIP: 2, | ||
LZO: 3, | ||
BROTLI: 4, | ||
LZ4: 5, | ||
ZSTD: 6, | ||
LZ4_RAW: 7, | ||
} | ||
export const PageType = { | ||
DATA_PAGE: 0, | ||
INDEX_PAGE: 1, | ||
DICTIONARY_PAGE: 2, | ||
DATA_PAGE_V2: 3, | ||
} | ||
export const Encoding = { | ||
PLAIN: 0, | ||
PLAIN_DICTIONARY: 2, | ||
RLE: 3, | ||
BIT_PACKED: 4, // deprecated | ||
DELTA_BINARY_PACKED: 5, | ||
DELTA_LENGTH_BYTE_ARRAY: 6, | ||
DELTA_BYTE_ARRAY: 7, | ||
RLE_DICTIONARY: 8, | ||
BYTE_STREAM_SPLIT: 9, | ||
} |
@@ -0,4 +1,4 @@ | ||
import { Encoding, ParquetType } from './constants.js' | ||
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js' | ||
import { Encoding, ParquetType } from './types.js' | ||
@@ -5,0 +5,0 @@ const skipNulls = false // TODO |
export { AsyncBuffer, FileMetaData } from './types' | ||
/** | ||
* Read parquet data rows from a file | ||
* Read parquet data rows from a file-like object. | ||
* Reads the minimal number of row groups and columns to satisfy the request. | ||
* | ||
* @param {ArrayBuffer} arrayBuffer parquet file contents | ||
* @returns {any[][]} row data | ||
* Returns a void promise that resolves when complete, and rejects on error. | ||
* Data is returned in onComplete, not the return promise, because | ||
* if onComplete is undefined, we parse the data, and emit chunks, but skip | ||
* computing the row view directly. This saves on allocation if the caller | ||
* wants to cache the full chunks, and make their own view of the data from | ||
* the chunks. | ||
* | ||
* @param {object} options read options | ||
* @param {AsyncBuffer} options.file file-like object containing parquet data | ||
* @param {FileMetaData} [options.metadata] parquet file metadata | ||
* @param {number[]} [options.columns] columns to read, all columns if undefined | ||
* @param {number} [options.rowStart] first requested row index (inclusive) | ||
* @param {number} [options.rowEnd] last requested row index (exclusive) | ||
* @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range. | ||
* @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed | ||
* @returns {Promise<void>} resolves when all requested rows and columns are parsed | ||
*/ | ||
export function parquetRead(arrayBuffer: ArrayBuffer): any[][] | ||
export async function parquetRead(options: ParquetReadOptions): Promise<void> | ||
@@ -57,1 +72,24 @@ /** | ||
export function toJson(obj: any): unknown | ||
/** | ||
* Parquet query options for reading data | ||
*/ | ||
export interface ParquetReadOptions { | ||
file: AsyncBuffer // file-like object containing parquet data | ||
metadata?: FileMetaData // parquet metadata, will be parsed if not provided | ||
columns?: number[] // columns to read, all columns if undefined | ||
rowStart?: number // inclusive | ||
rowEnd?: number // exclusive | ||
onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range. | ||
onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed | ||
} | ||
/** | ||
* A run of column data | ||
*/ | ||
export interface ColumnData { | ||
column: number | ||
data: ArrayLike<any> | ||
rowStart: number | ||
rowEnd: number | ||
} |
import { parquetMetadata, parquetMetadataAsync } from './metadata.js' | ||
export { parquetMetadata, parquetMetadataAsync } | ||
import { parquetRead } from './read.js' | ||
export { parquetRead } | ||
import { snappyUncompress } from './snappy.js' | ||
@@ -9,12 +12,1 @@ export { snappyUncompress } | ||
export { toJson } | ||
/** | ||
* Read parquet data rows from a buffer. | ||
* | ||
* @param {ArrayBuffer} arrayBuffer parquet file contents | ||
* @returns {any[][]} row data | ||
*/ | ||
export function parquetRead(arrayBuffer) { | ||
const metadata = parquetMetadata(arrayBuffer) | ||
throw new Error('not implemented') | ||
} |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
70558
18
1894
79