Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

hyparquet

Package Overview
Dependencies
Maintainers
1
Versions
58
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

hyparquet - npm Package Compare versions

Comparing version 0.9.8 to 0.9.9

8

package.json
{
"name": "hyparquet",
"version": "0.9.8",
"version": "0.9.9",
"description": "parquet file parser for javascript",

@@ -30,8 +30,8 @@ "keywords": [

"devDependencies": {
"@types/node": "20.12.12",
"@typescript-eslint/eslint-plugin": "7.11.0",
"@types/node": "20.14.2",
"@typescript-eslint/eslint-plugin": "7.12.0",
"@vitest/coverage-v8": "1.6.0",
"eslint": "8.57.0",
"eslint-plugin-import": "2.29.1",
"eslint-plugin-jsdoc": "48.2.6",
"eslint-plugin-jsdoc": "48.2.9",
"http-server": "14.1.1",

@@ -38,0 +38,0 @@ "hyparquet-compressors": "0.1.4",

@@ -11,2 +11,3 @@ import { isListLike, isMapLike } from './schema.js'

* @typedef {import('./types.d.ts').FieldRepetitionType} FieldRepetitionType
* @param {any[]} output
* @param {number[] | undefined} definitionLevels

@@ -17,12 +18,9 @@ * @param {number[]} repetitionLevels

* @param {number} maxDefinitionLevel definition level that corresponds to non-null
* @param {number} maxRepetitionLevel repetition level that corresponds to a new row
* @returns {DecodedArray} array of values
* @returns {any[]}
*/
export function assembleLists(
definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel
output, definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel
) {
const n = definitionLevels?.length || repetitionLevels.length
let valueIndex = 0
/** @type {any[]} */
const output = []

@@ -36,2 +34,14 @@ // Track state of nested structures

if (repetitionLevels[0]) {
// continue previous row
while (currentDepth < repetitionPath.length - 2 && currentRepLevel < repetitionLevels[0]) {
// go into last list
currentContainer = currentContainer.at(-1)
containerStack.push(currentContainer)
currentDepth++
if (repetitionPath[currentDepth] !== 'REQUIRED') currentDefLevel++
if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel++
}
}
for (let i = 0; i < n; i++) {

@@ -83,6 +93,3 @@ // assert(currentDefLevel === containerStack.length - 1)

// Handle edge cases for empty inputs or single-level data
if (output.length === 0) {
if (values.length > 0 && maxRepetitionLevel === 0) {
return values // flat list
}
if (!output.length) {
// return max definition level of nested lists

@@ -89,0 +96,0 @@ for (let i = 0; i < maxDefinitionLevel; i++) {

@@ -6,3 +6,3 @@ import { assembleLists } from './assemble.js'

import { parquetHeader } from './header.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { getMaxDefinitionLevel } from './schema.js'
import { concat } from './utils.js'

@@ -16,3 +16,3 @@

* @param {import('./types.js').DataReader} reader
* @param {import('./types.js').RowGroup} rowGroup row group metadata
* @param {number} rowLimit maximum number of rows to read
* @param {ColumnMetaData} columnMetadata column metadata

@@ -23,11 +23,10 @@ * @param {import('./types.js').SchemaTree[]} schemaPath schema path for the column

*/
export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compressors, utf8 }) {
export function readColumn(reader, rowLimit, columnMetadata, schemaPath, { compressors, utf8 }) {
const { element } = schemaPath[schemaPath.length - 1]
/** @type {DecodedArray | undefined} */
let dictionary = undefined
let seen = 0
/** @type {any[]} */
const rowData = []
while (seen < rowGroup.num_rows) {
while (rowData.length < rowLimit) {
// parse column header

@@ -51,4 +50,3 @@ const header = parquetHeader(reader)

const { definitionLevels, repetitionLevels, dataPage } = readDataPage(page, daph, schemaPath, columnMetadata)
seen += daph.num_values
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
// assert(!daph.statistics?.null_count || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))

@@ -59,6 +57,5 @@ // convert types, dereference dictionary, and assemble lists

const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
values = assembleLists(
definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel
assembleLists(
rowData, definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel
)

@@ -72,5 +69,4 @@ } else {

}
concat(rowData, values)
}
// assert(BigInt(values.length) === rowGroup.num_rows)
concat(rowData, values)
} else if (header.type === 'DATA_PAGE_V2') {

@@ -83,3 +79,2 @@ const daph2 = header.data_page_header_v2

)
seen += daph2.num_values

@@ -90,9 +85,9 @@ // convert types, dereference dictionary, and assemble lists

const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
values = assembleLists(
definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel
assembleLists(
rowData, definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel
)
} else {
concat(rowData, values)
}
concat(rowData, values)
} else if (header.type === 'DICTIONARY_PAGE') {

@@ -111,4 +106,4 @@ const diph = header.dictionary_page_header

}
if (rowData.length !== Number(rowGroup.num_rows)) {
throw new Error(`parquet row data length ${rowData.length} does not match row group length ${rowGroup.num_rows}}`)
if (rowData.length < rowLimit) {
throw new Error(`parquet row data length ${rowData.length} does not match row group limit ${rowLimit}}`)
}

@@ -122,10 +117,10 @@ return rowData

* @param {ColumnMetaData} columnMetadata
* @returns {number} byte offset
* @returns {[bigint, bigint]} byte offset range
*/
export function getColumnOffset({ dictionary_page_offset, data_page_offset }) {
export function getColumnRange({ dictionary_page_offset, data_page_offset, total_compressed_size }) {
let columnOffset = dictionary_page_offset
if (!dictionary_page_offset || data_page_offset < dictionary_page_offset) {
if (!columnOffset || data_page_offset < columnOffset) {
columnOffset = data_page_offset
}
return Number(columnOffset)
return [columnOffset, columnOffset + total_compressed_size]
}

@@ -28,3 +28,5 @@ import { bitWidth, byteStreamSplit, readRleBitPackedHybrid } from './encoding.js'

const repetitionLevels = readRepetitionLevels(reader, daph, schemaPath)
// assert(!repetitionLevels.length || repetitionLevels.length === daph.num_values)
const { definitionLevels, numNulls } = readDefinitionLevels(reader, daph, schemaPath)
// assert(!definitionLevels.length || definitionLevels.length === daph.num_values)

@@ -31,0 +33,0 @@ // read values based on encoding

@@ -89,3 +89,3 @@ import type { AsyncBuffer, Compressors, FileMetaData, SchemaTree } from './types.d.ts'

*/
export function toJson(obj: any): unknown
export function toJson(obj: any): any

@@ -92,0 +92,0 @@ /**

import { assembleNested } from './assemble.js'
import { getColumnOffset, readColumn } from './column.js'
import { getColumnRange, readColumn } from './column.js'
import { parquetMetadataAsync } from './metadata.js'

@@ -54,3 +54,4 @@ import { getSchemaPath } from './schema.js'

// read row group
const groupData = await readRowGroup(options, rowGroup, groupStart)
const rowLimit = rowEnd && rowEnd - groupStart
const groupData = await readRowGroup(options, rowGroup, groupStart, rowLimit)
if (onComplete) {

@@ -82,7 +83,9 @@ // filter to rows in range

* @param {number} groupStart row index of the first row in the group
* @param {number} [rowLimit] max rows to read from this group
* @returns {Promise<any[][]>} resolves to row data
*/
async function readRowGroup(options, rowGroup, groupStart) {
export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
const { file, metadata, columns } = options
if (!metadata) throw new Error('parquet metadata not found')
if (rowLimit === undefined || rowLimit > rowGroup.num_rows) rowLimit = Number(rowGroup.num_rows)

@@ -96,6 +99,5 @@ // loop through metadata to find min/max bytes to read

const startByte = getColumnOffset(columnMetadata)
const endByte = startByte + Number(columnMetadata.total_compressed_size)
groupStartByte = Math.min(groupStartByte, startByte)
groupEndByte = Math.max(groupEndByte, endByte)
const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
groupStartByte = Math.min(groupStartByte, columnStartByte)
groupEndByte = Math.max(groupEndByte, columnEndByte)
})

@@ -130,4 +132,3 @@ if (groupStartByte >= groupEndByte && columns?.length) {

const columnStartByte = getColumnOffset(columnMetadata)
const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size)
const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
const columnBytes = columnEndByte - columnStartByte

@@ -160,3 +161,3 @@

/** @type {any[] | undefined} */
let columnData = readColumn(reader, rowGroup, columnMetadata, schemaPath, options)
let columnData = readColumn(reader, rowLimit, columnMetadata, schemaPath, options)
// assert(columnData.length === Number(rowGroup.num_rows)

@@ -163,0 +164,0 @@

Socket — SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc