Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

hyparquet

Package Overview
Dependencies
Maintainers
1
Versions
58
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

hyparquet - npm Package Compare versions

Comparing version 0.7.5 to 0.7.6

2

package.json
{
"name": "hyparquet",
"version": "0.7.5",
"version": "0.7.6",
"description": "parquet file parser for javascript",

@@ -5,0 +5,0 @@ "keywords": [

import { assembleObjects } from './assemble.js'
import { PageType } from './constants.js'
import { convert } from './convert.js'

@@ -52,3 +51,3 @@ import { readDataPage, readDictionaryPage } from './datapage.js'

// parse page data by type
if (header.type === PageType.DATA_PAGE) {
if (header.type === 'DATA_PAGE') {
const daph = header.data_page_header

@@ -99,3 +98,3 @@ if (!daph) throw new Error('parquet data page header is undefined')

concat(rowData, values)
} else if (header.type === PageType.DICTIONARY_PAGE) {
} else if (header.type === 'DICTIONARY_PAGE') {
const diph = header.dictionary_page_header

@@ -108,3 +107,3 @@ if (!diph) throw new Error('parquet dictionary page header is undefined')

dictionary = readDictionaryPage(page, diph, schema, columnMetadata)
} else if (header.type === PageType.DATA_PAGE_V2) {
} else if (header.type === 'DATA_PAGE_V2') {
const daph2 = header.data_page_header_v2

@@ -206,2 +205,3 @@ if (!daph2) throw new Error('parquet data page header v2 is undefined')

* Expand data page list with nulls and convert to utf8.
*
* @param {number[]} definitionLevels

@@ -208,0 +208,0 @@ * @param {number} maxDefinitionLevel

@@ -97,7 +97,11 @@ /**

/**
 * Parquet page types, indexed by the thrift enum value
 * (so `PageType[header.field_1]` maps the numeric enum to its name).
 *
 * Frozen to prevent accidental mutation of a shared constant.
 *
 * @typedef {import('./types.js').PageType} PageType
 * @type {PageType[]}
 */
export const PageType = Object.freeze([
  'DATA_PAGE',
  'INDEX_PAGE',
  'DICTIONARY_PAGE',
  'DATA_PAGE_V2',
])
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import {
getMaxDefinitionLevel,
getMaxRepetitionLevel,
isRequired,
schemaElement,
skipDefinitionBytes,
} from './schema.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, schemaElement, skipDefinitionBytes } from './schema.js'

@@ -13,17 +7,9 @@ const skipNulls = false // TODO

/**
* @typedef {{ byteLength: number, definitionLevels: number[], numNulls: number }} DefinitionLevels
* Read a data page from the given Uint8Array.
*
* @typedef {{ definitionLevels: number[], numNulls: number }} DefinitionLevels
* @typedef {import("./types.d.ts").DataPage} DataPage
* @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData
* @typedef {import("./types.d.ts").DataPageHeader} DataPageHeader
* @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader
* @typedef {import("./types.d.ts").SchemaElement} SchemaElement
*/
/**
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
* @template T
*/
/**
* Read a data page from the given Uint8Array.
*
* @param {Uint8Array} bytes raw page data (should already be decompressed)

@@ -36,4 +22,4 @@ * @param {DataPageHeader} daph data page header

export function readDataPage(bytes, daph, schema, columnMetadata) {
const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
let offset = 0
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
const reader = { view, offset: 0 }
/** @type {any[]} */

@@ -43,6 +29,5 @@ let values = []

// repetition levels
const { value: repetitionLevels, byteLength } = readRepetitionLevels(
dataView, offset, daph, schema, columnMetadata
const repetitionLevels = readRepetitionLevels(
reader, daph, schema, columnMetadata
)
offset += byteLength

@@ -56,8 +41,7 @@ // definition levels

// skip_definition_bytes
offset += skipDefinitionBytes(daph.num_values)
reader.offset += skipDefinitionBytes(daph.num_values)
} else {
const dl = readDefinitionLevels(dataView, offset, daph, schema, columnMetadata.path_in_schema)
const dl = readDefinitionLevels(reader, daph, schema, columnMetadata.path_in_schema)
definitionLevels = dl.definitionLevels
numNulls = dl.numNulls
offset += dl.byteLength
}

@@ -70,5 +54,4 @@

const utf8 = element.converted_type === 'UTF8'
const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8)
values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value)
offset += plainObj.byteLength
const plainObj = readPlain(reader, columnMetadata.type, nValues, utf8)
values = Array.isArray(plainObj) ? plainObj : Array.from(plainObj)
} else if (

@@ -85,10 +68,9 @@ daph.encoding === 'PLAIN_DICTIONARY' ||

} else {
bitWidth = dataView.getUint8(offset)
offset += 1
bitWidth = view.getUint8(reader.offset)
reader.offset++
}
if (bitWidth) {
const { value, byteLength } = readRleBitPackedHybrid(
dataView, offset, bitWidth, dataView.byteLength - offset, nValues
const value = readRleBitPackedHybrid(
reader, bitWidth, view.byteLength - reader.offset, nValues
)
offset += byteLength
values = Array.isArray(value) ? value : Array.from(value)

@@ -109,2 +91,3 @@ } else {

*
* @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader
* @param {Uint8Array} bytes raw page data

@@ -117,4 +100,5 @@ * @param {DictionaryPageHeader} diph dictionary page header

/**
 * Read a dictionary page from the given Uint8Array.
 *
 * @param {Uint8Array} bytes raw page data (should already be decompressed)
 * @param {DictionaryPageHeader} diph dictionary page header
 * @param {SchemaElement[]} schema schema for the file (unused here, kept for a uniform signature)
 * @param {ColumnMetaData} columnMetadata metadata for the column
 * @returns {DecodedArray} array of dictionary values
 */
export function readDictionaryPage(bytes, diph, schema, columnMetadata) {
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
  const reader = { view, offset: 0 }
  // dictionary values are read with PLAIN encoding; utf8 decoding is deferred to the caller
  return readPlain(reader, columnMetadata.type, diph.num_values, false)
}

@@ -125,10 +109,10 @@

*
* @param {DataView} dataView data view for the page
* @param {number} offset offset to start reading from
* @typedef {import("./types.d.ts").DataReader} DataReader
* @param {DataReader} reader data view for the page
* @param {DataPageHeader} daph data page header
* @param {SchemaElement[]} schema schema for the file
* @param {ColumnMetaData} columnMetadata metadata for the column
* @returns {Decoded<any[]>} repetition levels and number of bytes read
* @returns {any[]} repetition levels and number of bytes read
*/
function readRepetitionLevels(dataView, offset, daph, schema, columnMetadata) {
function readRepetitionLevels(reader, daph, schema, columnMetadata) {
if (columnMetadata.path_in_schema.length > 1) {

@@ -139,7 +123,7 @@ const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema)

return readData(
dataView, daph.repetition_level_encoding, offset, daph.num_values, bitWidth
reader, daph.repetition_level_encoding, daph.num_values, bitWidth
)
}
}
return { value: [], byteLength: 0 }
return []
}

@@ -150,4 +134,3 @@

*
* @param {DataView} dataView data view for the page
* @param {number} offset offset to start reading from
* @param {DataReader} reader data view for the page
* @param {DataPageHeader} daph data page header

@@ -158,3 +141,3 @@ * @param {SchemaElement[]} schema schema for the file

*/
function readDefinitionLevels(dataView, offset, daph, schema, path_in_schema) {
function readDefinitionLevels(reader, daph, schema, path_in_schema) {
if (!isRequired(schema, path_in_schema)) {

@@ -165,4 +148,4 @@ const maxDefinitionLevel = getMaxDefinitionLevel(schema, path_in_schema)

// num_values is index 1 for either type of page header
const { value: definitionLevels, byteLength } = readData(
dataView, daph.definition_level_encoding, offset, daph.num_values, bitWidth
const definitionLevels = readData(
reader, daph.definition_level_encoding, daph.num_values, bitWidth
)

@@ -179,6 +162,6 @@

return { byteLength, definitionLevels, numNulls }
return { definitionLevels, numNulls }
}
}
return { byteLength: 0, definitionLevels: [], numNulls: 0 }
return { definitionLevels: [], numNulls: 0 }
}

@@ -7,7 +7,2 @@ import { decompressPage } from './column.js'

/**
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
* @template T
*/
/**
* Read a data page from the given Uint8Array.

@@ -29,4 +24,4 @@ *

export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, compressors) {
const dataView = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength)
let offset = 0
const view = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength)
const reader = { view, offset: 0 }
/** @type {any} */

@@ -39,10 +34,16 @@ let values = []

// repetition levels
const repetitionLevels = readRepetitionLevelsV2(dataView, offset, daph2, schema, columnMetadata)
const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schema, columnMetadata)
if (reader.offset !== daph2.repetition_levels_byte_length) {
throw new Error(`parquet repetition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length}`)
}
// definition levels
offset += daph2.repetition_levels_byte_length
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
const definitionLevels = readDefinitionLevelsV2(dataView, offset, daph2, maxDefinitionLevel)
offset += daph2.definition_levels_byte_length
const definitionLevels = readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel)
if (reader.offset !== daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) {
throw new Error(`parquet definition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length}`)
}
const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length

@@ -55,3 +56,3 @@

const utf8 = element.converted_type === 'UTF8'
let page = compressedBytes.slice(offset)
let page = compressedBytes.slice(reader.offset)
if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {

@@ -61,4 +62,4 @@ page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)

const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const plainObj = readPlain(pageView, columnMetadata.type, nValues, 0, utf8)
values = plainObj.value
const pageReader = { view: pageView, offset: 0 }
values = readPlain(pageReader, columnMetadata.type, nValues, utf8)
} else if (daph2.encoding === 'RLE') {

@@ -71,5 +72,6 @@ const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)

} else {
const pageReader = { view: pageView, offset: 4 }
values = readRleBitPackedHybrid(
pageView, 4, bitWidth, uncompressedPageSize, nValues
).value
pageReader, bitWidth, uncompressedPageSize, nValues
)
}

@@ -80,9 +82,9 @@ } else if (

) {
compressedBytes = compressedBytes.subarray(offset)
compressedBytes = compressedBytes.subarray(reader.offset)
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
const bitWidth = pageView.getUint8(0)
const { value } = readRleBitPackedHybrid(
pageView, 1, bitWidth, uncompressedPageSize, nValues
const pageReader = { view: pageView, offset: 1 }
const value = readRleBitPackedHybrid(
pageReader, bitWidth, uncompressedPageSize, nValues
)

@@ -105,4 +107,4 @@ values = value

*
* @param {DataView} dataView data view for the page
* @param {number} offset offset to start reading from
* @typedef {import("./types.d.ts").DataReader} DataReader
* @param {DataReader} reader data view for the page
* @param {DataPageHeaderV2} daph2 data page header

@@ -113,3 +115,3 @@ * @param {SchemaElement[]} schema schema for the file

*/
export function readRepetitionLevelsV2(dataView, offset, daph2, schema, columnMetadata) {
export function readRepetitionLevelsV2(reader, daph2, schema, columnMetadata) {
const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema)

@@ -121,4 +123,4 @@ if (!maxRepetitionLevel) return []

return readRleBitPackedHybrid(
dataView, offset, bitWidth, daph2.repetition_levels_byte_length, daph2.num_values
).value
reader, bitWidth, daph2.repetition_levels_byte_length, daph2.num_values
)
}

@@ -129,4 +131,3 @@

*
* @param {DataView} dataView data view for the page
* @param {number} offset offset to start reading from
* @param {DataReader} reader data view for the page
* @param {DataPageHeaderV2} daph2 data page header v2

@@ -136,3 +137,3 @@ * @param {number} maxDefinitionLevel maximum definition level for this column

*/
function readDefinitionLevelsV2(dataView, offset, daph2, maxDefinitionLevel) {
function readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) {
if (maxDefinitionLevel) {

@@ -142,4 +143,4 @@ // not the same as V1, because we know the length

return readRleBitPackedHybrid(
dataView, offset, bitWidth, daph2.definition_levels_byte_length, daph2.num_values
).value
reader, bitWidth, daph2.definition_levels_byte_length, daph2.num_values
)
}

@@ -146,0 +147,0 @@ }

@@ -5,26 +5,18 @@ import { readVarInt } from './thrift.js'

/**
* Return type with bytes read.
* This is useful to advance an offset through a buffer.
*
* @typedef {import("./types.d.ts").Decoded<T>} Decoded
* @template T
*/
/**
 * Read `count` boolean values.
 *
 * Booleans are bit-packed, least-significant bit first.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {boolean[]} array of boolean values
 */
function readPlainBoolean(reader, count) {
  const value = []
  for (let i = 0; i < count; i++) {
    const byteOffset = reader.offset + Math.floor(i / 8)
    const bitOffset = i % 8
    const byte = reader.view.getUint8(byteOffset)
    value.push((byte & (1 << bitOffset)) !== 0)
  }
  // advance past the packed bits, rounded up to whole bytes
  reader.offset += Math.ceil(count / 8)
  return value
}

@@ -35,13 +27,13 @@

/**
 * Read `count` int32 values (little-endian).
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {number[]} array of int32 values
 */
function readPlainInt32(reader, count) {
  const value = []
  for (let i = 0; i < count; i++) {
    value.push(reader.view.getInt32(reader.offset + i * 4, true))
  }
  reader.offset += count * 4
  return value
}

@@ -52,13 +44,13 @@

/**
 * Read `count` int64 values (little-endian) as BigInt.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {bigint[]} array of int64 values
 */
function readPlainInt64(reader, count) {
  const value = []
  for (let i = 0; i < count; i++) {
    value.push(reader.view.getBigInt64(reader.offset + i * 8, true))
  }
  reader.offset += count * 8
  return value
}

@@ -69,15 +61,15 @@

/**
 * Read `count` int96 values (little-endian) as BigInt.
 *
 * Each value is stored as a 64-bit low part followed by a 32-bit high part,
 * combined as (high << 32) | low.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {bigint[]} array of int96 values
 */
function readPlainInt96(reader, count) {
  const value = []
  for (let i = 0; i < count; i++) {
    const low = reader.view.getBigInt64(reader.offset + i * 12, true)
    const high = reader.view.getInt32(reader.offset + i * 12 + 8, true)
    value.push((BigInt(high) << BigInt(32)) | low)
  }
  reader.offset += count * 12
  return value
}

@@ -88,13 +80,13 @@

/**
 * Read `count` 32-bit float values (little-endian).
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {number[]} array of float values
 */
function readPlainFloat(reader, count) {
  const value = []
  for (let i = 0; i < count; i++) {
    value.push(reader.view.getFloat32(reader.offset + i * 4, true))
  }
  reader.offset += count * 4
  return value
}

@@ -105,13 +97,13 @@

/**
 * Read `count` 64-bit double values (little-endian).
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {number[]} array of double values
 */
function readPlainDouble(reader, count) {
  const value = []
  for (let i = 0; i < count; i++) {
    value.push(reader.view.getFloat64(reader.offset + i * 8, true))
  }
  reader.offset += count * 8
  return value
}

@@ -122,18 +114,16 @@

/**
 * Read `count` length-prefixed byte arrays.
 *
 * Each array is prefixed with a little-endian int32 byte length. The returned
 * Uint8Arrays are views over the underlying buffer, not copies.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @returns {Uint8Array[]} array of byte arrays
 */
function readPlainByteArray(reader, count) {
  const value = []
  for (let i = 0; i < count; i++) {
    const length = reader.view.getInt32(reader.offset, true)
    reader.offset += 4
    const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length)
    value.push(bytes)
    reader.offset += length
  }
  return value
}

@@ -144,52 +134,49 @@

/**
 * Read a fixed-length byte array.
 *
 * The returned Uint8Array is a view over the underlying buffer, not a copy.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} fixedLength - length of the fixed length byte array
 * @returns {Uint8Array} fixed length byte array
 */
function readPlainByteArrayFixed(reader, fixedLength) {
  reader.offset += fixedLength
  return new Uint8Array(
    reader.view.buffer,
    reader.view.byteOffset + reader.offset - fixedLength,
    fixedLength
  )
}
/**
 * Read `count` values of the given type from the reader.view.
 *
 * @typedef {import("./types.d.ts").DecodedArray} DecodedArray
 * @typedef {import("./types.d.ts").ParquetType} ParquetType
 * @param {DataReader} reader - buffer to read data from
 * @param {ParquetType} type - parquet type of the data
 * @param {number} count - number of values to read
 * @param {boolean} utf8 - whether to decode byte arrays as UTF-8
 * @returns {DecodedArray} array of values
 * @throws {Error} if the parquet type is not handled
 */
export function readPlain(reader, type, count, utf8) {
  if (count === 0) return []
  if (type === 'BOOLEAN') {
    return readPlainBoolean(reader, count)
  } else if (type === 'INT32') {
    return readPlainInt32(reader, count)
  } else if (type === 'INT64') {
    return readPlainInt64(reader, count)
  } else if (type === 'INT96') {
    return readPlainInt96(reader, count)
  } else if (type === 'FLOAT') {
    return readPlainFloat(reader, count)
  } else if (type === 'DOUBLE') {
    return readPlainDouble(reader, count)
  } else if (type === 'BYTE_ARRAY') {
    const byteArray = readPlainByteArray(reader, count)
    if (utf8) {
      const decoder = new TextDecoder()
      return byteArray.map(bytes => decoder.decode(bytes))
    }
    return byteArray
  } else if (type === 'FIXED_LEN_BYTE_ARRAY') {
    // `count` is passed through as the fixed byte length here
    return readPlainByteArrayFixed(reader, count)
  } else {
    throw new Error(`parquet unhandled type: ${type}`)
  }
}

@@ -215,21 +202,18 @@ throw new Error(`parquet unhandled type: ${type}`)

* @typedef {import("./types.d.ts").Encoding} Encoding
* @param {DataView} dataView - buffer to read data from
* @param {DataReader} reader - buffer to read data from
* @param {Encoding} encoding - encoding type
* @param {number} offset - offset to start reading from the DataView
* @param {number} count - number of values to read
* @param {number} bitWidth - width of each bit-packed group
* @returns {Decoded<any>} array of values
* @returns {any[]} array of values
*/
export function readData(dataView, encoding, offset, count, bitWidth) {
export function readData(reader, encoding, count, bitWidth) {
/** @type {any[]} */
const value = []
let byteLength = 0
if (encoding === 'RLE') {
let seen = 0
while (seen < count) {
const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)
if (!rle.value.length) break // EOF
concat(value, rle.value)
seen += rle.value.length
byteLength += rle.byteLength
const rle = readRleBitPackedHybrid(reader, bitWidth, 0, count)
if (!rle.length) break // EOF
concat(value, rle)
seen += rle.length
}

@@ -239,3 +223,3 @@ } else {

}
return { value, byteLength }
return value
}

@@ -248,38 +232,35 @@

/**
 * Read values from a run-length encoded / bit-packed hybrid encoding.
 *
 * If `length` is zero, the encoded length is read as a little-endian int32
 * prefix at the current reader offset.
 *
 * @typedef {import("./types.d.ts").DataReader} DataReader
 * @param {DataReader} reader - buffer to read data from
 * @param {number} width - width of each bit-packed group
 * @param {number} length - length of the encoded data
 * @param {number} numValues - number of values to read
 * @returns {number[]} array of rle/bit-packed values
 */
export function readRleBitPackedHybrid(reader, width, length, numValues) {
  if (!length) {
    length = reader.view.getInt32(reader.offset, true)
    reader.offset += 4
    if (length < 0) throw new Error(`parquet invalid rle/bitpack length ${length}`)
  }
  /** @type {number[]} */
  const value = []
  const startOffset = reader.offset
  while (reader.offset - startOffset < length && value.length < numValues) {
    // each group starts with a varint header; the low bit selects rle vs bit-packed
    const [header, newOffset] = readVarInt(reader.view, reader.offset)
    reader.offset = newOffset
    if ((header & 1) === 0) {
      // rle
      const rle = readRle(reader, header, width)
      concat(value, rle)
    } else {
      // bit-packed
      const bitPacked = readBitPacked(
        reader, header, width, numValues - value.length
      )
      concat(value, bitPacked)
    }
  }
  return value
}

@@ -293,22 +274,20 @@

*
* @param {DataView} dataView - buffer to read data from
* @param {number} offset - offset to start reading from the DataView
* @param {DataReader} reader - buffer to read data from
* @param {number} header - header information
* @param {number} bitWidth - width of each bit-packed group
* @returns {Decoded<number[]>} array of rle values
* @returns {number[]} array of rle values
*/
function readRle(dataView, offset, header, bitWidth) {
function readRle(reader, header, bitWidth) {
const count = header >>> 1
const width = (bitWidth + 7) >> 3
let byteLength = 0
let readValue
if (width === 1) {
readValue = dataView.getUint8(offset)
byteLength += 1
readValue = reader.view.getUint8(reader.offset)
reader.offset++
} else if (width === 2) {
readValue = dataView.getUint16(offset, true)
byteLength += 2
readValue = reader.view.getUint16(reader.offset, true)
reader.offset += 2
} else if (width === 4) {
readValue = dataView.getUint32(offset, true)
byteLength += 4
readValue = reader.view.getUint32(reader.offset, true)
reader.offset += 4
} else {

@@ -323,3 +302,3 @@ throw new Error(`parquet invalid rle width ${width}`)

}
return { value, byteLength }
return value
}

@@ -331,10 +310,9 @@

*
* @param {DataView} dataView - buffer to read data from
* @param {number} offset - offset to start reading from the DataView
* @param {DataReader} reader - buffer to read data from
* @param {number} header - header information
* @param {number} bitWidth - width of each bit-packed group
* @param {number} remaining - number of values remaining to be read
* @returns {Decoded<number[]>} array of bit-packed values
* @returns {number[]} array of bit-packed values
*/
function readBitPacked(dataView, offset, header, bitWidth, remaining) {
function readBitPacked(reader, header, bitWidth, remaining) {
// extract number of values to read from header

@@ -346,8 +324,8 @@ let count = (header >> 1) << 3

let data = 0
if (offset < dataView.byteLength) {
data = dataView.getUint8(offset)
if (reader.offset < reader.view.byteLength) {
data = reader.view.getUint8(reader.offset)
reader.offset++
} else if (mask) {
throw new Error(`parquet bitpack offset ${offset} out of range`)
throw new Error(`parquet bitpack offset ${reader.offset} out of range`)
}
let byteLength = 1
let left = 8

@@ -367,9 +345,6 @@ let right = 0

// if we don't have bitWidth number of bits to read, read next byte
data |= dataView.getUint8(offset + byteLength) << left
byteLength++
data |= reader.view.getUint8(reader.offset) << left
reader.offset++
left += 8
} else {
// otherwise, read bitWidth number of bits
// don't write more than remaining number of rows
// even if there are still bits to read
if (remaining > 0) {

@@ -385,4 +360,3 @@ // emit value by shifting off to the right and masking

// return values and number of bytes read
return { value, byteLength }
return value
}

@@ -389,0 +363,0 @@

@@ -1,8 +0,5 @@

import { Encoding } from './constants.js'
import { Encoding, PageType } from './constants.js'
import { deserializeTCompactProtocol } from './thrift.js'
/**
* Return type with bytes read.
* This is useful to advance an offset through a buffer.
*
* @typedef {import("./types.d.ts").Decoded<T>} Decoded

@@ -24,3 +21,3 @@ * @template T

// Parse parquet header from thrift data
const type = header.field_1
const type = PageType[header.field_1]
const uncompressed_page_size = header.field_2

@@ -56,3 +53,3 @@ const compressed_page_size = header.field_3

repetition_levels_byte_length: header.field_8.field_6,
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default to true
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default true
statistics: header.field_8.field_8,

@@ -59,0 +56,0 @@ }

@@ -70,3 +70,2 @@

* Read a row group from a file-like object.
* Reads the minimal number of columns to satisfy the request.
*

@@ -73,0 +72,0 @@ * @typedef {import('./types.js').RowGroup} RowGroup

@@ -82,3 +82,3 @@ /**

if (element.repetition_type === 'REPEATED') {
maxLevel += 1
maxLevel++
}

@@ -101,3 +101,3 @@ })

if (element.repetition_type !== 'REQUIRED') {
maxLevel += 1
maxLevel++
}

@@ -118,3 +118,3 @@ })

while (n !== 0) {
byteLength += 1
byteLength++
n >>>= 7

@@ -121,0 +121,0 @@ }

@@ -57,3 +57,3 @@ /**

const c = input[pos]
pos += 1
pos++
if (c < 128) {

@@ -70,3 +70,3 @@ break

let len = 0
pos += 1
pos++

@@ -108,3 +108,3 @@ if (pos >= inputLength) {

offset = input[pos] + ((c >>> 5) << 8)
pos += 1
pos++
break

@@ -111,0 +111,0 @@ case 2:

@@ -18,2 +18,7 @@ /**

// Mutable read cursor over a DataView: reader functions advance `offset`
// as they consume bytes, replacing the old (view, offset) parameter pairs.
export interface DataReader {
  view: DataView // underlying buffer view to read from
  offset: number // current byte position; mutated by readers as data is consumed
}
// Parquet file metadata types

@@ -194,8 +199,7 @@ export interface FileMetaData {

// Parquet page type name; order matches the thrift enum values
// (DATA_PAGE = 0 ... DATA_PAGE_V2 = 3) used by the PageType lookup array.
export type PageType =
  'DATA_PAGE' |
  'INDEX_PAGE' |
  'DICTIONARY_PAGE' |
  'DATA_PAGE_V2'

@@ -202,0 +206,0 @@ interface SortingColumn {

Socket · SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc