Huge News! Announcing our $40M Series B, led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

hyparquet

Package Overview
Dependencies
Maintainers
1
Versions
58
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

hyparquet - npm Package Compare versions

Comparing version 0.1.6 to 0.2.0

src/read.js

2

package.json
{
"name": "hyparquet",
"version": "0.1.6",
"version": "0.2.0",
"description": "parquet file parser for javascript",

@@ -5,0 +5,0 @@ "keywords": [

@@ -19,5 +19,6 @@ # hyparquet

- Designed to work with huge ML datasets (things like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata))
- Loads metadata separately from data
- Can load metadata separately from data
- Data can be filtered by row and column ranges
- Only fetches the data needed
- Written in JavaScript, checked with TypeScript
- Fast data loading for large scale ML applications

@@ -54,3 +55,3 @@ - Bring data visualization closer to the user, in the browser

const buffer = fs.readFileSync('example.parquet')
const arrayBuffer = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
const arrayBuffer = new Uint8Array(buffer).buffer
const metadata = parquetMetadata(arrayBuffer)

@@ -57,0 +58,0 @@ ```

@@ -0,1 +1,2 @@

import { CompressionCodec, Encoding, PageType } from './constants.js'
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'

@@ -5,3 +6,2 @@ import { parquetHeader } from './header.js'

import { snappyUncompress } from './snappy.js'
import { CompressionCodec, Encoding, PageType } from './types.js'

@@ -8,0 +8,0 @@ /**

@@ -29,1 +29,31 @@ export const ParquetType = {

}
// Compression codec identifiers for parquet column chunks.
// Values mirror the CompressionCodec enum in the parquet-format Thrift
// definition — TODO confirm against parquet.thrift before changing.
export const CompressionCodec = {
UNCOMPRESSED: 0,
SNAPPY: 1,
GZIP: 2,
LZO: 3,
BROTLI: 4,
LZ4: 5,
ZSTD: 6,
LZ4_RAW: 7,
}
// Page type identifiers found in parquet page headers.
// Values mirror the PageType enum in the parquet-format Thrift
// definition — TODO confirm against parquet.thrift before changing.
export const PageType = {
DATA_PAGE: 0,
INDEX_PAGE: 1,
DICTIONARY_PAGE: 2,
DATA_PAGE_V2: 3,
}
// Data page encoding identifiers.
// Values mirror the Encoding enum in the parquet-format Thrift definition;
// note the gap at 1 (GROUP_VAR_INT was removed from the spec) —
// TODO confirm against parquet.thrift before changing.
export const Encoding = {
PLAIN: 0,
PLAIN_DICTIONARY: 2,
RLE: 3,
BIT_PACKED: 4, // deprecated
DELTA_BINARY_PACKED: 5,
DELTA_LENGTH_BYTE_ARRAY: 6,
DELTA_BYTE_ARRAY: 7,
RLE_DICTIONARY: 8,
BYTE_STREAM_SPLIT: 9,
}

@@ -0,4 +1,4 @@

import { Encoding, ParquetType } from './constants.js'
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js'
import { Encoding, ParquetType } from './types.js'

@@ -5,0 +5,0 @@ const skipNulls = false // TODO

export { AsyncBuffer, FileMetaData } from './types'
/**
* Read parquet data rows from a file
* Read parquet data rows from a file-like object.
* Reads the minimal number of row groups and columns to satisfy the request.
*
* @param {ArrayBuffer} arrayBuffer parquet file contents
* @returns {any[][]} row data
* Returns a void promise when complete, and to throw errors.
* Data is returned in onComplete, not the return promise, because
* if onComplete is undefined, we parse the data, and emit chunks, but skip
* computing the row view directly. This saves on allocation if the caller
* wants to cache the full chunks, and make their own view of the data from
* the chunks.
*
* @param {object} options read options
* @param {AsyncBuffer} options.file file-like object containing parquet data
* @param {FileMetaData} [options.metadata] parquet file metadata
* @param {number[]} [options.columns] columns to read, all columns if undefined
* @param {number} [options.rowStart] first requested row index (inclusive)
* @param {number} [options.rowEnd] last requested row index (exclusive)
* @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
* @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed
* @returns {Promise<void>} resolves when all requested rows and columns are parsed
*/
export function parquetRead(arrayBuffer: ArrayBuffer): any[][]
export async function parquetRead(options: ParquetReadOptions): Promise<void>

@@ -57,1 +72,24 @@ /**

export function toJson(obj: any): unknown
/**
 * Parquet query options for reading data.
 *
 * Passed to parquetRead to select which rows and columns to parse and to
 * receive parsed data via the onChunk / onComplete callbacks.
 */
export interface ParquetReadOptions {
file: AsyncBuffer // file-like object containing parquet data
metadata?: FileMetaData // parquet metadata, will be parsed if not provided
columns?: number[] // columns to read, all columns if undefined
rowStart?: number // first requested row index, inclusive
rowEnd?: number // last requested row index, exclusive
onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range.
onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
}
/**
 * A run of column data emitted to the onChunk callback.
 *
 * NOTE(review): rowStart/rowEnd presumably describe the row range covered by
 * `data` within the file, and may extend beyond the requested read range —
 * confirm against the reader implementation.
 */
export interface ColumnData {
column: number // column index within the file schema
data: ArrayLike<any> // decoded values for this run
rowStart: number // first row index covered, inclusive — TODO confirm
rowEnd: number // last row index covered, exclusive — TODO confirm
}
import { parquetMetadata, parquetMetadataAsync } from './metadata.js'
export { parquetMetadata, parquetMetadataAsync }
import { parquetRead } from './read.js'
export { parquetRead }
import { snappyUncompress } from './snappy.js'

@@ -9,12 +12,1 @@ export { snappyUncompress }

export { toJson }
/**
 * Read parquet data rows from a buffer.
 *
 * NOTE(review): this is the pre-0.2.0 synchronous stub as shown in the diff;
 * it parses the file metadata and then throws, since row reading was not yet
 * implemented in this version. The surrounding hunk markers suggest some
 * removed lines are not shown here — treat the exact body with caution.
 *
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {any[][]} row data
 */
export function parquetRead(arrayBuffer) {
// Parses (and thereby validates) the footer metadata before bailing out.
const metadata = parquetMetadata(arrayBuffer)
throw new Error('not implemented')
}
Socket · SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc