
hyparquet - npm Package Compare versions

Comparing version 0.6.0 to 0.7.0


package.json
 {
   "name": "hyparquet",
-  "version": "0.6.0",
+  "version": "0.7.0",
   "description": "parquet file parser for javascript",

@@ -30,13 +30,13 @@ "keywords": [

   "devDependencies": {
-    "@types/node": "20.11.21",
-    "@typescript-eslint/eslint-plugin": "7.1.0",
+    "@types/node": "20.11.27",
+    "@typescript-eslint/eslint-plugin": "7.2.0",
     "@vitest/coverage-v8": "1.3.1",
     "eslint": "8.57.0",
     "eslint-plugin-import": "2.29.1",
-    "eslint-plugin-jsdoc": "48.2.0",
+    "eslint-plugin-jsdoc": "48.2.1",
     "http-server": "14.1.1",
     "hysnappy": "0.3.0",
-    "typescript": "5.3.3",
+    "typescript": "5.4.2",
     "vitest": "1.3.1"
   }
 }

@@ -82,6 +82,6 @@ import { PageType } from './constants.js'

   dereferenceDictionary(dictionary, dataPage)
-  values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
+  values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element)
 } else if (Array.isArray(dataPage)) {
   // convert primitive types to rich types
-  values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
+  values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element)
 } else {

@@ -88,0 +88,0 @@ values = dataPage // TODO: data page shouldn't be a fixed byte array?

@@ -0,1 +1,5 @@

+/**
+ * @typedef {import('./types.js').ParquetType} ParquetTypeType
+ * @type {ParquetTypeType[]}
+ */
 export const ParquetType = [

@@ -31,2 +35,6 @@ 'BOOLEAN',

+/**
+ * @typedef {import('./types.js').ConvertedType} ConvertedTypeType
+ * @type {ConvertedTypeType[]}
+ */
 export const ConvertedType = [

@@ -57,2 +65,24 @@ 'UTF8',

+/**
+ * @typedef {import('./types.js').LogicalTypeType} LogicalTypeType
+ * @type {LogicalTypeType[]}
+ */
+export const logicalTypeType = [
+  'NULL',
+  'STRING',
+  'MAP',
+  'LIST',
+  'ENUM',
+  'DECIMAL',
+  'DATE',
+  'TIME',
+  'TIMESTAMP',
+  'INTERVAL',
+  'INTEGER',
+  'NULL',
+  'JSON',
+  'BSON',
+  'UUID',
+]
 export const CompressionCodec = [

@@ -59,0 +89,0 @@ 'UNCOMPRESSED',
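The new logicalTypeType array appears to be positional: the index matches the field id of the parquet thrift LogicalType union (index 0 an unused placeholder, index 11 the thrift UNKNOWN type), which would explain the duplicate 'NULL' entries. A sketch using the array defined above:

logicalTypeType[5]  // 'DECIMAL' (thrift LogicalType field 5)
logicalTypeType[11] // 'NULL' (thrift field 11, UNKNOWN)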

@@ -64,4 +64,4 @@ import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'

 if (daph.encoding === 'PLAIN') {
-  const se = schemaElement(schema, columnMetadata.path_in_schema)
-  const utf8 = se.converted_type === 'UTF8'
+  const { element } = schemaElement(schema, columnMetadata.path_in_schema)
+  const utf8 = element.converted_type === 'UTF8'
   const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8)

@@ -68,0 +68,0 @@ values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value)

@@ -50,4 +50,4 @@ import { decompressPage } from './column.js'

 if (daph2.encoding === 'PLAIN') {
-  const se = schemaElement(schema, columnMetadata.path_in_schema)
-  const utf8 = se.converted_type === 'UTF8'
+  const { element } = schemaElement(schema, columnMetadata.path_in_schema)
+  const utf8 = element.converted_type === 'UTF8'
   let page = compressedBytes.slice(offset)

@@ -54,0 +54,0 @@ if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {

@@ -17,3 +17,3 @@ export { AsyncBuffer, FileMetaData, SchemaTree } from './types'

  * @param {FileMetaData} [options.metadata] parquet file metadata
- * @param {number[]} [options.columns] columns to read, all columns if undefined
+ * @param {string[]} [options.columns] columns to read, all columns if undefined
  * @param {number} [options.rowStart] first requested row index (inclusive)

@@ -96,3 +96,3 @@ * @param {number} [options.rowEnd] last requested row index (exclusive)

   metadata?: FileMetaData // parquet metadata, will be parsed if not provided
-  columns?: number[] // columns to read, all columns if undefined
+  columns?: string[] // columns to read, all columns if undefined
   rowStart?: number // inclusive

@@ -109,6 +109,6 @@ rowEnd?: number // exclusive

 export interface ColumnData {
-  column: number
-  data: ArrayLike<any>
+  columnName: string
+  columnData: ArrayLike<any>
   rowStart: number
   rowEnd: number
 }
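The two hunks above are the visible API change in 0.7.0: columns are selected by name (string[]) instead of by index (number[]), and onChunk identifies each chunk by columnName rather than a numeric column field. A minimal usage sketch, assuming the package's parquetRead entry point; the file URL and column names are hypothetical:

import { parquetRead } from 'hyparquet'

// adapt an in-memory ArrayBuffer to the AsyncBuffer shape that parquetRead expects
const asyncBuffer = arrayBuffer => ({
  byteLength: arrayBuffer.byteLength,
  slice: (start, end) => arrayBuffer.slice(start, end),
})

const res = await fetch('example.parquet') // hypothetical file
const file = asyncBuffer(await res.arrayBuffer())

await parquetRead({
  file,
  columns: ['name', 'age'], // 0.7.0: column names, where 0.6.0 took indexes
  onChunk({ columnName, columnData, rowStart, rowEnd }) {
    console.log(`chunk ${columnName}: ${columnData.length} values, rows ${rowStart}..${rowEnd}`)
  },
  onComplete(rows) {
    console.log(rows) // one array per row, holding the requested column values
  },
})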
 import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js'
-import { schemaTree } from './schema.js'
+import { schemaElement } from './schema.js'
 import { deserializeTCompactProtocol } from './thrift.js'

@@ -108,2 +108,3 @@

   field_id: field.field_9,
+  logical_type: logicalType(field.field_10),
 }))

@@ -173,3 +174,32 @@ const num_rows = metadata.field_3

 export function parquetSchema(metadata) {
-  return schemaTree(metadata.schema, 0)
+  return schemaElement(metadata.schema, [])
 }
+/**
+ * Parse logical type by type.
+ *
+ * @typedef {import("./types.d.ts").LogicalType} LogicalType
+ * @param {any} logicalType
+ * @returns {LogicalType | undefined}
+ */
+function logicalType(logicalType) {
+  if (logicalType?.field_5) {
+    return {
+      logicalType: 'DECIMAL',
+      scale: logicalType.field_5.field_1,
+      precision: logicalType.field_5.field_2,
+    }
+  }
+  // TODO: TimestampType
+  // TODO: TimeType
+  if (logicalType?.field_10) {
+    return {
+      logicalType: 'INTEGER',
+      bitWidth: logicalType.field_10.field_1,
+      isSigned: logicalType.field_10.field_2,
+    }
+  }
+  if (logicalType) {
+    return logicalType
+  }
+}
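The thrift field numbers above follow the parquet-format LogicalType union: field_5 is DecimalType (field_1 = scale, field_2 = precision) and field_10 is IntType (field_1 = bitWidth, field_2 = isSigned). An illustrative sketch with hypothetical deserialized input:

logicalType({ field_5: { field_1: 2, field_2: 9 } })
// => { logicalType: 'DECIMAL', scale: 2, precision: 9 }
logicalType({ field_10: { field_1: 32, field_2: true } })
// => { logicalType: 'INTEGER', bitWidth: 32, isSigned: true }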
 import { getColumnOffset, readColumn } from './column.js'
 import { parquetMetadataAsync } from './metadata.js'
+import { getColumnName, isMapLike } from './schema.js'

@@ -23,3 +24,3 @@ /**

  * @param {FileMetaData} [options.metadata] parquet file metadata
- * @param {number[]} [options.columns] columns to read, all columns if undefined
+ * @param {string[]} [options.columns] columns to read, all columns if undefined
  * @param {number} [options.rowStart] first requested row index (inclusive)

@@ -73,3 +74,3 @@ * @param {number} [options.rowEnd] last requested row index (exclusive)

  * @param {FileMetaData} [options.metadata] parquet file metadata
- * @param {number[]} [options.columns] columns to read, all columns if undefined
+ * @param {string[]} [options.columns] columns to read, all columns if undefined
  * @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.

@@ -87,14 +88,16 @@ * @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed

 let [groupStartByte, groupEndByte] = [file.byteLength, 0]
-rowGroup.columns.forEach((columnChunk, columnIndex) => {
-  // skip columns that are not requested or lack metadata
-  if (columns && !columns.includes(columnIndex)) return
-  if (!columnChunk.meta_data) return
-  const startByte = getColumnOffset(columnChunk.meta_data)
-  const endByte = startByte + Number(columnChunk.meta_data.total_compressed_size)
+rowGroup.columns.forEach(({ meta_data: columnMetadata }) => {
+  if (!columnMetadata) throw new Error('parquet column metadata is undefined')
+  const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema)
+  // skip columns that are not requested
+  if (columns && !columns.includes(columnName)) return
+  const startByte = getColumnOffset(columnMetadata)
+  const endByte = startByte + Number(columnMetadata.total_compressed_size)
   groupStartByte = Math.min(groupStartByte, startByte)
   groupEndByte = Math.max(groupEndByte, endByte)
 })
-if (groupStartByte >= groupEndByte) {
-  throw new Error('parquet missing row group metadata')
+if (groupStartByte >= groupEndByte && columns?.length) {
+  // TODO: should throw if any column is missing
+  throw new Error(`parquet columns not found: ${columns.join(', ')}`)
 }

@@ -112,10 +115,14 @@ // if row group size is less than 128mb, pre-load in one read

 const promises = []
+const maps = new Map()
+let outputColumnIndex = 0
 // read column data
 for (let columnIndex = 0; columnIndex < rowGroup.columns.length; columnIndex++) {
-  // skip columns that are not requested
-  if (columns && !columns.includes(columnIndex)) continue
   const columnMetadata = rowGroup.columns[columnIndex].meta_data
   if (!columnMetadata) throw new Error('parquet column metadata is undefined')
+  const columnName = getColumnName(metadata.schema, columnMetadata.path_in_schema)
+  // skip columns that are not requested
+  if (columns && !columns.includes(columnName)) continue
   const columnStartByte = getColumnOffset(columnMetadata)

@@ -129,2 +136,3 @@ const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size)

   console.warn(`parquet skipping huge column "${columnMetadata.path_in_schema}" ${columnBytes.toLocaleString()} bytes`)
+  // TODO: set column to new Error('parquet column too large')
   continue

@@ -146,3 +154,4 @@ }

-  // TODO: extract SchemaElement for this column
-  const columnData = readColumn(
+  /** @type {ArrayLike<any> | undefined} */
+  let columnData = readColumn(
     arrayBuffer, bufferOffset, rowGroup, columnMetadata, metadata.schema, compressors

@@ -153,6 +162,40 @@ )

   }
+  if (isMapLike(metadata.schema, columnMetadata.path_in_schema)) {
+    const name = columnMetadata.path_in_schema.slice(0, -2).join('.')
+    if (!maps.has(name)) {
+      maps.set(name, columnData)
+      columnData = undefined // do not emit column data until both key and value are read
+    } else {
+      if (columnMetadata.path_in_schema[0] === 'key') {
+        throw new Error('parquet map-like column key is not first') // TODO: support value-first
+      } else {
+        const values = columnData
+        const keys = maps.get(name)
+        const out = []
+        if (keys.length !== values.length) {
+          throw new Error('parquet map-like column key/value length mismatch')
+        }
+        // assemble map-like column data
+        for (let i = 0; i < keys.length; i++) {
+          /** @type {Record<string, any>} */
+          const obj = {}
+          for (let j = 0; j < keys[i].length; j++) {
+            obj[keys[i][j]] = values[i][j]
+          }
+          out.push(obj)
+        }
+        columnData = out
+      }
+      maps.delete(name)
+    }
+  }
+  // do not emit column data until structs are fully parsed
+  if (!columnData) return
   // notify caller of column data
-  if (options.onChunk) options.onChunk({ column: columnIndex, data: columnData, rowStart: 0, rowEnd: columnData.length })
+  if (options.onChunk) options.onChunk({ columnName, columnData, rowStart: 0, rowEnd: columnData.length })
   // add column data to group data only if onComplete is defined
-  if (options.onComplete) addColumn(groupData, columnIndex, columnData)
+  if (options.onComplete) addColumn(groupData, outputColumnIndex, columnData)
+  outputColumnIndex++
 }))

@@ -159,0 +202,0 @@ }
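To illustrate the map assembly above: the key chunk is parked in maps until the matching value chunk arrives, then the parallel per-row arrays are zipped into one plain object per row. A standalone sketch of the same zip with hypothetical decoded data:

const keys = [['a', 'b'], ['c']]  // decoded 'key' chunk, one array per row
const values = [[1, 2], [3]]      // decoded 'value' chunk, one array per row
const out = keys.map((rowKeys, i) =>
  Object.fromEntries(rowKeys.map((k, j) => [k, values[i][j]])))
console.log(out) // [{ a: 1, b: 2 }, { c: 3 }]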

@@ -13,3 +13,3 @@ /**

  */
-export function schemaTree(schema, rootIndex) {
+function schemaTree(schema, rootIndex) {
   const root = schema[rootIndex]

@@ -36,3 +36,3 @@ const children = []

  * @param {string[]} name path to the element
- * @returns {SchemaElement} schema element
+ * @returns {SchemaTree} schema element
  */

@@ -47,3 +47,3 @@ export function schemaElement(schema, name) {

   }
-  return tree.element
+  return tree
 }

@@ -83,3 +83,3 @@

 parts.forEach((part, i) => {
-  const element = schemaElement(schema, parts.slice(0, i + 1))
+  const { element } = schemaElement(schema, parts.slice(0, i + 1))
   if (element.repetition_type === 'REPEATED') {

@@ -102,3 +102,3 @@ maxLevel += 1

 parts.forEach((part, i) => {
-  const element = schemaElement(schema, parts.slice(0, i + 1))
+  const { element } = schemaElement(schema, parts.slice(0, i + 1))
   if (element.repetition_type !== 'REQUIRED') {

@@ -126,1 +126,64 @@ maxLevel += 1

 }
+/**
+ * Get the column name as foo.bar and handle list-like columns.
+ * @param {SchemaElement[]} schema
+ * @param {string[]} path
+ * @returns {string} column name
+ */
+export function getColumnName(schema, path) {
+  if (isListLike(schema, path) || isMapLike(schema, path)) {
+    return path.slice(0, -2).join('.')
+  } else {
+    return path.join('.')
+  }
+}
+/**
+ * Check if a column is list-like.
+ *
+ * @param {SchemaElement[]} schemaElements parquet schema elements
+ * @param {string[]} path column path
+ * @returns {boolean} true if list-like
+ */
+function isListLike(schemaElements, path) {
+  const schema = schemaElement(schemaElements, path.slice(0, -2))
+  if (path.length < 3) return false
+  if (schema.element.converted_type !== 'LIST') return false
+  if (schema.children.length > 1) return false
+  const firstChild = schema.children[0]
+  if (firstChild.children.length > 1) return false
+  if (firstChild.element.repetition_type !== 'REPEATED') return false
+  const secondChild = firstChild.children[0]
+  if (secondChild.element.repetition_type !== 'REQUIRED') return false
+  return true
+}
+/**
+ * Check if a column is map-like.
+ *
+ * @param {SchemaElement[]} schemaElements parquet schema elements
+ * @param {string[]} path column path
+ * @returns {boolean} true if map-like
+ */
+export function isMapLike(schemaElements, path) {
+  const schema = schemaElement(schemaElements, path.slice(0, -2))
+  if (path.length < 3) return false
+  if (schema.element.converted_type !== 'MAP') return false
+  if (schema.children.length > 1) return false
+  const firstChild = schema.children[0]
+  if (firstChild.children.length !== 2) return false
+  if (firstChild.element.repetition_type !== 'REPEATED') return false
+  const keyChild = firstChild.children.find(child => child.element.name === 'key')
+  if (keyChild?.element.repetition_type !== 'REQUIRED') return false
+  const valueChild = firstChild.children.find(child => child.element.name === 'value')
+  if (valueChild?.element.repetition_type === 'REPEATED') return false
+  return true
+}
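For example, a map-like column m is laid out in the schema as m (converted_type MAP) -> key_value (REPEATED) -> key/value, so both leaf paths collapse to the single output column name 'm'. A sketch, assuming schema holds the file's SchemaElement list and the shape checks above pass:

getColumnName(schema, ['m', 'key_value', 'key'])   // => 'm'
getColumnName(schema, ['m', 'key_value', 'value']) // => 'm'
getColumnName(schema, ['name'])                    // => 'name' (plain column)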

@@ -45,2 +45,3 @@ /**

   field_id?: number
+  logicalType?: LogicalType
 }

@@ -87,2 +88,36 @@

+type LogicalDecimalType = {
+  logicalType: 'DECIMAL'
+  precision: number
+  scale: number
+}
+type LogicalIntType = {
+  logicalType: 'INTEGER'
+  bitWidth: number
+  isSigned: boolean
+}
+export type LogicalType =
+  { logicalType: LogicalTypeType } |
+  LogicalDecimalType |
+  LogicalIntType
+export type LogicalTypeType =
+  'STRING' | // convertedType UTF8
+  'MAP' | // convertedType MAP
+  'LIST' | // convertedType LIST
+  'ENUM' | // convertedType ENUM
+  'DECIMAL' | // convertedType DECIMAL + precision/scale
+  'DATE' | // convertedType DATE
+  'TIME' | // convertedType TIME_MILLIS or TIME_MICROS
+  'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS
+  'INTEGER' | // convertedType INT or UINT
+  'INTERVAL' | // convertedType INTERVAL
+  'NULL' | // no convertedType
+  'JSON' | // convertedType JSON
+  'BSON' | // convertedType BSON
+  'UUID' | // no convertedType
+  'FLOAT16' // no convertedType
 export interface RowGroup {

@@ -89,0 +124,0 @@ columns: ColumnChunk[]
