
hyparquet - npm Package Compare versions

Comparing version 0.9.1 to 0.9.2


package.json
 {
   "name": "hyparquet",
-  "version": "0.9.1",
+  "version": "0.9.2",
   "description": "parquet file parser for javascript",

@@ -5,0 +5,0 @@ "keywords": [

README.md

@@ -9,3 +9,3 @@ # hyparquet

 [![dependencies](https://img.shields.io/badge/Dependencies-0-blueviolet)](https://www.npmjs.com/package/hyparquet?activeTab=dependencies)
-![coverage](https://img.shields.io/badge/Coverage-94-darkred)
+![coverage](https://img.shields.io/badge/Coverage-95-darkred)

@@ -12,0 +12,0 @@ Dependency free since 2023!

@@ -99,3 +99,3 @@ import { assembleObjects } from './assemble.js'

       )
-      dictionary = readDictionaryPage(page, diph, columnMetadata)
+      dictionary = readDictionaryPage(page, diph, columnMetadata, element.type_length)
     } else if (header.type === 'DATA_PAGE_V2') {

@@ -102,0 +102,0 @@ const daph2 = header.data_page_header_v2

@@ -1,2 +0,2 @@

-const dayMillis = 86400000000000 // 1 day in milliseconds
+const dayMillis = 86400000 // 1 day in milliseconds
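The old constant was actually nanoseconds per day (8.64e13) mislabeled as milliseconds, so every converted DATE landed about a million times too far in the future. A quick sanity check of the corrected value:

```js
// Parquet DATE values count whole days since the Unix epoch,
// and JavaScript Date takes milliseconds: 24 * 60 * 60 * 1000 = 86_400_000
new Date(0 * 86400000) // 1970-01-01T00:00:00.000Z
new Date(1 * 86400000) // 1970-01-02T00:00:00.000Z
```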

@@ -12,3 +12,2 @@ /**

 export function convert(data, schemaElement) {
-  if (!Array.isArray(data)) return data
   const ctype = schemaElement.converted_type

@@ -20,16 +19,22 @@ if (ctype === 'UTF8') {

   if (ctype === 'DECIMAL') {
-    const scaleFactor = schemaElement.scale ? Math.pow(10, schemaElement.scale) : 1
+    const scale = schemaElement.scale || 0
+    const factor = Math.pow(10, -scale)
     if (typeof data[0] === 'number') {
-      return scaleFactor === 1 ? data : data.map(v => v * scaleFactor)
+      if (factor === 1) return data
+      return Array.from(data).map(v => v * factor)
     } else if (typeof data[0] === 'bigint') {
-      return scaleFactor === 1 ? data : data.map(v => v * BigInt(scaleFactor))
+      if (factor === 1) return data
+      return Array.from(data).map(v => Number(v) * factor)
     } else {
-      return data.map(v => parseDecimal(v) * scaleFactor)
+      return Array.from(data).map(v => parseDecimal(v) * factor)
     }
   }
   if (ctype === 'DATE') {
-    return data.map(v => new Date(v * dayMillis))
+    return Array.from(data).map(v => new Date(v * dayMillis))
   }
+  if (ctype === undefined && schemaElement.type === 'INT96') {
+    return Array.from(data).map(parseInt96Date)
+  }
   if (ctype === 'TIME_MILLIS') {
-    return data.map(v => new Date(v))
+    return Array.from(data).map(v => new Date(v))
   }
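Two fixes are folded into the DECIMAL branch. A Parquet DECIMAL stores an unscaled integer whose logical value is unscaled × 10^(-scale), so the old 10^scale multiplier scaled in the wrong direction. The switch to Array.from also matters: calling .map directly on an Int32Array or BigInt64Array returns another typed array, which truncates the fractional result. A worked example of the corrected math:

```js
// DECIMAL with scale = 2: stored value 12345 means 123.45
const factor = Math.pow(10, -2) // 0.01
12345 * factor // 123.45

// Why Array.from: typed-array map stays typed and truncates
new Int32Array([12345]).map(v => v * factor)             // Int32Array [123]
Array.from(new Int32Array([12345])).map(v => v * factor) // [123.45]
```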

@@ -49,4 +54,2 @@ if (ctype === 'JSON') {

 /**
- * Parse decimal from byte array.
- *
  * @param {Uint8Array} bytes

@@ -63,1 +66,12 @@ * @returns {number}

 }
+/**
+ * @param {bigint} value
+ * @returns {Date}
+ */
+function parseInt96Date(value) {
+  const days = Number((value >> 64n) - 2440588n)
+  const nano = Number((value & 0xffffffffffffffffn) / 1000000n)
+  const millis = days * dayMillis + nano
+  return new Date(millis)
+}
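The new helper decodes legacy INT96 timestamps, which pack a Julian day number in the upper 32 bits and nanoseconds within that day in the lower 64 bits; 2440588 is the Julian day of the Unix epoch. A small sanity check under that layout:

```js
// Julian day 2440589 is one day after the Unix epoch, zero nanos into the day:
parseInt96Date(2440589n << 64n) // 1970-01-02T00:00:00.000Z

// Add twelve hours (43_200_000_000_000 nanoseconds):
parseInt96Date((2440589n << 64n) | 43200000000000n) // 1970-01-02T12:00:00.000Z
```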

@@ -25,6 +25,4 @@ import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'

-  // repetition levels
+  // repetition and definition levels
   const repetitionLevels = readRepetitionLevels(reader, daph, schemaPath)
-  // definition levels
   const { definitionLevels, numNulls } = readDefinitionLevels(reader, daph, schemaPath)

@@ -35,3 +33,4 @@

   if (daph.encoding === 'PLAIN') {
-    dataPage = readPlain(reader, columnMetadata.type, nValues)
+    const { type_length } = schemaPath[schemaPath.length - 1].element
+    dataPage = readPlain(reader, columnMetadata.type, nValues, type_length)
   } else if (

@@ -43,7 +42,5 @@ daph.encoding === 'PLAIN_DICTIONARY' ||

   // bit width is stored as single byte
-  let bitWidth
-  if (columnMetadata.type === 'BOOLEAN') {
-    bitWidth = 1
-  } else {
+  let bitWidth = 1
+  // TODO: RLE encoding uses bitWidth = schemaElement.type_length
+  if (columnMetadata.type !== 'BOOLEAN') {
     bitWidth = view.getUint8(reader.offset)

@@ -69,12 +66,12 @@ reader.offset++

  *
- * @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader
  * @param {Uint8Array} bytes raw page data
- * @param {DictionaryPageHeader} diph dictionary page header
+ * @param {import("./types.d.ts").DictionaryPageHeader} diph dictionary page header
  * @param {ColumnMetaData} columnMetadata
+ * @param {number | undefined} typeLength - type_length from schema
  * @returns {ArrayLike<any>} array of values
  */
-export function readDictionaryPage(bytes, diph, columnMetadata) {
+export function readDictionaryPage(bytes, diph, columnMetadata, typeLength) {
   const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
   const reader = { view, offset: 0 }
-  return readPlain(reader, columnMetadata.type, diph.num_values)
+  return readPlain(reader, columnMetadata.type, diph.num_values, typeLength)
 }
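For background: dictionary-encoded columns store each distinct value once in a dictionary page, and data pages then hold only small integer indices into it. The new typeLength parameter matters when the dictionary itself holds FIXED_LEN_BYTE_ARRAY values, which carry no per-value length prefix. An illustrative sketch (not hyparquet's exact code) of the final index lookup:

```js
// dictionary: values decoded from the dictionary page
// indexes: RLE/bit-packed indices decoded from a data page
const dictionary = ['red', 'green', 'blue']
const indexes = [0, 2, 2, 1]
const column = indexes.map(i => dictionary[i]) // ['red', 'blue', 'blue', 'green']
```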

@@ -81,0 +78,0 @@

@@ -5,3 +5,3 @@ import { decompressPage } from './column.js'

 import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
-import { readVarInt, readZigZag } from './thrift.js'
+import { readVarInt, readZigZagBigInt } from './thrift.js'

@@ -26,4 +26,2 @@ /**

   const reader = { view, offset: 0 }
-  /** @type {any} */
-  let dataPage = []

@@ -35,38 +33,30 @@ const daph2 = ph.data_page_header_v2

   const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schemaPath)
-  // assert(reader.offset === daph2.repetition_levels_byte_length)
+  if (reader.offset !== daph2.repetition_levels_byte_length) {
+    throw new Error(`parquet repetition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length}`)
+  }
   // definition levels
   const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
   const definitionLevels = readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel)
-  // assert(reader.offset === daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length)
+  if (reader.offset !== daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) {
+    throw new Error(`parquet definition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length}`)
+  }
+  const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length
+  let page = compressedBytes.subarray(reader.offset)
+  if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
+    page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)
+  }
+  const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
-  const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length
   // read values based on encoding
+  /** @type {import('./types.d.ts').DecodedArray} */
+  let dataPage = []
   const nValues = daph2.num_values - daph2.num_nulls
   if (daph2.encoding === 'PLAIN') {
-    let page = compressedBytes.slice(reader.offset)
-    if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
-      page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)
-    }
-    const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
     const pageReader = { view: pageView, offset: 0 }
-    dataPage = readPlain(pageReader, columnMetadata.type, nValues)
+    const { type_length } = schemaPath[schemaPath.length - 1].element
+    dataPage = readPlain(pageReader, columnMetadata.type, nValues, type_length)
   } else if (daph2.encoding === 'RLE') {
-    const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
-    const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
     const bitWidth = 1
-    if (daph2.num_nulls) {
-      throw new Error('parquet RLE encoding with nulls not supported')
-    } else {
-      const pageReader = { view: pageView, offset: 4 }
-      dataPage = new Array(nValues)
-      readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, dataPage)
-    }
+    const pageReader = { view: pageView, offset: 4 }
+    dataPage = new Array(nValues)
+    readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, dataPage)
   } else if (
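This restructure leans on the v2 page layout: repetition and definition levels are always stored uncompressed ahead of the values, and is_compressed applies only to the values section, so the page can be sliced and decompressed once instead of per encoding branch. Using subarray instead of slice also avoids copying the compressed bytes. A sketch of the layout assumed here (per the Parquet format spec):

```js
// DataPageV2 bytes, immediately after the page header:
//
//   [ repetition levels | definition levels | values ]
//
// The values section starts at
//   repetition_levels_byte_length + definition_levels_byte_length
// and, when is_compressed is set, inflates to
//   uncompressed_page_size
//     - repetition_levels_byte_length
//     - definition_levels_byte_length
```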

@@ -76,5 +66,2 @@ daph2.encoding === 'PLAIN_DICTIONARY' ||

   ) {
-    compressedBytes = compressedBytes.subarray(reader.offset)
-    const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
-    const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
     const bitWidth = pageView.getUint8(0)

@@ -85,5 +72,4 @@ const pageReader = { view: pageView, offset: 1 }

   } else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
-    if (daph2.num_nulls) throw new Error('parquet delta-int not supported')
-    const codec = daph2.is_compressed ? columnMetadata.codec : 'UNCOMPRESSED'
-    const page = decompressPage(compressedBytes, uncompressedPageSize, codec, compressors)
+    const int32 = columnMetadata.type === 'INT32'
+    dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues)
     deltaBinaryUnpack(page, nValues, dataPage)

@@ -142,5 +128,6 @@ } else {

  * @param {number} nValues number of values to read
- * @param {any[]} values array to write to
+ * @param {Int32Array | BigInt64Array} values array to write to
  */
 function deltaBinaryUnpack(page, nValues, values) {
+  const int32 = values instanceof Int32Array
   const view = new DataView(page.buffer, page.byteOffset, page.byteLength)

@@ -150,37 +137,42 @@ const reader = { view, offset: 0 }

   const miniblockPerBlock = readVarInt(reader)
-  const count = readVarInt(reader)
-  let value = readZigZag(reader)
+  let count = readVarInt(reader)
+  let value = readZigZagBigInt(reader) // first value
+  let valueIndex = 0
+  values[valueIndex++] = int32 ? Number(value) : value
   const valuesPerMiniblock = blockSize / miniblockPerBlock
-  for (let valueIndex = 0; valueIndex < nValues;) {
-    const minDelta = readZigZag(reader)
+  while (valueIndex < nValues) {
+    const minDelta = readZigZagBigInt(reader)
     const bitWidths = new Uint8Array(miniblockPerBlock)
-    for (let i = 0; i < miniblockPerBlock; i++, reader.offset++) {
-      bitWidths[i] = page[reader.offset]
+    for (let i = 0; i < miniblockPerBlock; i++) {
+      bitWidths[i] = page[reader.offset++]
     }
     for (let i = 0; i < miniblockPerBlock; i++) {
-      const bitWidth = bitWidths[i]
+      let miniblockCount = Math.min(count, valuesPerMiniblock)
+      const bitWidth = BigInt(bitWidths[i])
       if (bitWidth) {
-        if (count > 1) {
-          // no more diffs if on last value, delta read bitpacked
-          let data = 0
-          let stop = -bitWidth
-          // only works for bitWidth < 31
-          const mask = (1 << bitWidth) - 1
-          while (count) {
-            if (stop < 0) {
-              // fails when data gets too large
-              data = (data << 8) | view.getUint8(reader.offset++)
-              stop += 8
-            } else {
-              values.push((data >> stop) & mask)
-            }
-          }
-        }
+        const mask = (1n << bitWidth) - 1n
+        let bitpackPos = 0n
+        while (count && miniblockCount) {
+          let bits = (BigInt(view.getUint8(reader.offset)) >> bitpackPos) & mask // TODO: don't re-read value every time
+          bitpackPos += bitWidth
+          while (bitpackPos >= 8) {
+            bitpackPos -= 8n
+            reader.offset++
+            bits |= (BigInt(view.getUint8(reader.offset)) << bitWidth - bitpackPos) & mask
+          }
+          const delta = minDelta + bits
+          value += delta
+          values[valueIndex++] = int32 ? Number(value) : value
+          count--
+          miniblockCount--
+        }
       } else {
-        for (let j = 0; j < valuesPerMiniblock && valueIndex < nValues; j++, valueIndex++) {
-          values[valueIndex] = value
+        for (let j = 0; j < valuesPerMiniblock && valueIndex < nValues; j++) {
+          value += minDelta
+          values[valueIndex++] = int32 ? Number(value) : value
         }
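For reference, a DELTA_BINARY_PACKED page opens with a header of four varints (block size, miniblocks per block, total value count, first value as zigzag), then each block stores a zigzag min-delta, one bit width per miniblock, and deltas bit-packed relative to that min-delta. A small worked example of the decoding rule:

```js
// Encoding the column [7, 5, 3]:
//   first value = 7        (stored as zigzag varint 14)
//   deltas      = [-2, -2]
//   minDelta    = -2       (stored as zigzag varint 3)
//   adjusted    = [0, 0]   -> miniblock bit width 0, no packed bytes
// Decoding replays it:
//   value = 7
//   value = 7 + (-2 + 0) = 5
//   value = 5 + (-2 + 0) = 3
```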

@@ -187,0 +179,0 @@ }

@@ -10,5 +10,6 @@ /**

  * @param {number} count - number of values to read
+ * @param {number | undefined} fixedLength - length of each fixed length byte array
  * @returns {DecodedArray} array of values
  */
-export function readPlain(reader, type, count) {
+export function readPlain(reader, type, count, fixedLength) {
   if (count === 0) return []

@@ -30,3 +31,4 @@ if (type === 'BOOLEAN') {

   } else if (type === 'FIXED_LEN_BYTE_ARRAY') {
-    return readPlainByteArrayFixed(reader, count)
+    if (!fixedLength) throw new Error('parquet missing fixed length')
+    return readPlainByteArrayFixed(reader, count, fixedLength)
   } else {

@@ -98,3 +100,3 @@ throw new Error(`parquet unhandled type: ${type}`)

     const high = reader.view.getInt32(reader.offset + i * 12 + 8, true)
-    values[i] = (BigInt(high) << BigInt(32)) | low
+    values[i] = (BigInt(high) << 64n) | low
   }

@@ -157,12 +159,14 @@ reader.offset += count * 12

  * @param {DataReader} reader
+ * @param {number} count
  * @param {number} fixedLength
- * @returns {Uint8Array}
+ * @returns {Uint8Array[]}
  */
-function readPlainByteArrayFixed(reader, fixedLength) {
-  reader.offset += fixedLength
-  return new Uint8Array(
-    reader.view.buffer,
-    reader.view.byteOffset + reader.offset - fixedLength,
-    fixedLength
-  )
+function readPlainByteArrayFixed(reader, count, fixedLength) {
+  // assert(reader.view.byteLength - reader.offset >= count * fixedLength)
+  const values = new Array(count)
+  for (let i = 0; i < count; i++) {
+    values[i] = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, fixedLength)
+    reader.offset += fixedLength
+  }
+  return values
 }
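The old implementation read a single fixed-length byte array per call; the rewrite returns one Uint8Array per value, each a zero-copy view into the page buffer. A small illustration with hypothetical page bytes:

```js
// Two FIXED_LEN_BYTE_ARRAY(3) values laid out back to back:
const page = new Uint8Array([1, 2, 3, 4, 5, 6])
const view = new DataView(page.buffer, page.byteOffset, page.byteLength)
readPlainByteArrayFixed({ view, offset: 0 }, 2, 3)
// => [ Uint8Array [1, 2, 3], Uint8Array [4, 5, 6] ]
// Each element is a view over `page`, not a copy.
```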

@@ -169,0 +173,0 @@

@@ -132,3 +132,3 @@ // TCompactProtocol types

     result |= (byte & 0x7f) << shift
-    if ((byte & 0x80) === 0) {
+    if (!(byte & 0x80)) {
       return result

@@ -147,11 +147,11 @@ }

 function readVarBigInt(reader) {
-  let result = BigInt(0)
-  let shift = BigInt(0)
+  let result = 0n
+  let shift = 0n
   while (true) {
-    const byte = BigInt(reader.view.getUint8(reader.offset++))
-    result |= (byte & BigInt(0x7f)) << shift
-    if ((byte & BigInt(0x80)) === BigInt(0)) {
+    const byte = reader.view.getUint8(reader.offset++)
+    result |= BigInt(byte & 0x7f) << shift
+    if (!(byte & 0x80)) {
       return result
     }
-    shift += BigInt(7)
+    shift += 7n
   }
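Both readers decode ULEB128 varints: each byte contributes 7 payload bits, and a set high bit means another byte follows. A worked example with a hypothetical reader over two bytes:

```js
// 300 = 0b1_0010_1100 encodes as [0xac, 0x02]:
//   0xac = 1_0101100 -> low 7 bits = 44, continuation bit set
//   0x02 = 0_0000010 -> next 7 bits = 2, continuation bit clear
//   44 | (2 << 7) = 300
const bytes = new Uint8Array([0xac, 0x02])
readVarInt({ view: new DataView(bytes.buffer), offset: 0 }) // 300
```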

@@ -167,3 +167,3 @@ }

  */
-export function readZigZag(reader) {
+function readZigZag(reader) {
   const zigzag = readVarInt(reader)

@@ -181,3 +181,3 @@ // convert zigzag to int

  */
-function readZigZagBigInt(reader) {
+export function readZigZagBigInt(reader) {
   const zigzag = readVarBigInt(reader)
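Zigzag encoding interleaves signed integers so small magnitudes stay small as varints: 0, -1, 1, -2, 2 map to 0, 1, 2, 3, 4. A sketch of the standard decode transform these functions apply after reading the varint:

```js
// (z >> 1n) recovers the magnitude; -(z & 1n) restores the sign
function zigzagDecode(z) {
  return (z >> 1n) ^ -(z & 1n)
}
zigzagDecode(0n)  // 0n
zigzagDecode(3n)  // -2n
zigzagDecode(14n) // 7n
```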

@@ -184,0 +184,0 @@ // convert zigzag to int

@@ -14,2 +14,3 @@ /**

   if (obj instanceof Uint8Array) return Array.from(obj)
+  if (obj instanceof Date) return obj.toISOString()
   if (obj instanceof Object) {

@@ -16,0 +17,0 @@ /** @type {Record<string, unknown>} */
