Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in · Demo · Install
Socket

hyparquet

Package Overview
Dependencies
Maintainers
1
Versions
58
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

hyparquet - npm Package Compare versions

Comparing version 0.2.1 to 0.2.2

2

package.json
{
"name": "hyparquet",
"version": "0.2.1",
"version": "0.2.2",
"description": "parquet file parser for javascript",

@@ -5,0 +5,0 @@ "keywords": [

@@ -1,5 +0,5 @@

import { CompressionCodec, Encoding, PageType } from './constants.js'
import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
import { parquetHeader } from './header.js'
import { getMaxDefinitionLevel, isRequired } from './schema.js'
import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
import { snappyUncompress } from './snappy.js'

@@ -14,2 +14,4 @@

const dayMillis = 86400000000000 // 1 day in milliseconds
/**

@@ -31,2 +33,3 @@ * Read a column from the file.

let byteOffset = 0 // byteOffset within the column
/** @type {ArrayLike<any> | undefined} */
let dictionary = undefined

@@ -70,2 +73,4 @@ const rowIndex = [0] // map/list object index

const dictionaryEncoding = daph.encoding === Encoding.PLAIN_DICTIONARY || daph.encoding === Encoding.RLE_DICTIONARY
// construct output values: skip nulls and construct lists

@@ -75,4 +80,7 @@ let values

// Use repetition levels to construct lists
if ([Encoding.PLAIN_DICTIONARY, Encoding.RLE_DICTIONARY].includes(daph.encoding)) {
// TODO: dereference dictionary values
if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
// dereference dictionary values
for (let i = 0; i < dataPage.length; i++) {
dataPage[i] = dictionary[dataPage[i]]
}
}

@@ -83,3 +91,3 @@ const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]])

values = assembleObjects(definitionLevels, repetitionLevels, dataPage, isNull, nullValue, maxDefinitionLevel, rowIndex[0])
} else if (definitionLevels) {
} else if (definitionLevels?.length) {
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)

@@ -113,4 +121,14 @@ // Use definition levels to skip nulls

} else {
// TODO: use dictionary
values = dataPage
if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
// dereference dictionary values
values = []
for (let i = 0; i < dataPage.length; i++) {
values[i] = dictionary[dataPage[i]]
}
} else if (Array.isArray(dataPage)) {
// convert primitive types to rich types
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
} else {
values = dataPage // TODO: data page shouldn't be a fixed byte array?
}
}

@@ -153,1 +171,42 @@

}
/**
* Convert known types from primitive to rich.
*
* @param {any[]} data series of primitive types
* @param {SchemaElement} schemaElement schema element for the data
* @returns {any[]} series of rich types
*/
/**
 * Convert known types from primitive to rich.
 *
 * @param {any[]} data series of primitive types
 * @param {SchemaElement} schemaElement schema element for the data
 * @returns {any[]} series of rich types
 */
function convert(data, schemaElement) {
  const ctype = schemaElement.converted_type
  // ConvertedType.UTF8 is 0, so a falsy check (!ctype) would wrongly skip
  // conversion for UTF8 columns — compare against undefined explicitly.
  if (ctype === undefined) return data
  if (ctype === ConvertedType.UTF8) {
    // values arrive as byte arrays; decode to js strings
    const decoder = new TextDecoder()
    return data.map(v => decoder.decode(v))
  }
  if (ctype === ConvertedType.DECIMAL) {
    // parquet stores unscaled integers; apply 10^scale
    const scaleFactor = Math.pow(10, schemaElement.scale || 0)
    if (typeof data[0] === 'number') {
      return data.map(v => v * scaleFactor)
    } else {
      // TODO: parse byte string
      throw new Error('parquet decimal byte string not supported')
    }
  }
  if (ctype === ConvertedType.DATE) {
    // DATE is days since the unix epoch; convert to milliseconds for Date
    const millisPerDay = 86400000 // 24 * 60 * 60 * 1000
    return data.map(v => new Date(v * millisPerDay))
  }
  if (ctype === ConvertedType.TIME_MILLIS) {
    // milliseconds since midnight; represented as a Date for convenience
    return data.map(v => new Date(v))
  }
  if (ctype === ConvertedType.JSON) {
    // values arrive as byte arrays; decode to text before parsing
    const decoder = new TextDecoder()
    return data.map(v => JSON.parse(decoder.decode(v)))
  }
  if (ctype === ConvertedType.BSON) {
    throw new Error('parquet bson not supported')
  }
  if (ctype === ConvertedType.INTERVAL) {
    throw new Error('parquet interval not supported')
  }
  return data
}

@@ -30,2 +30,27 @@ export const ParquetType = {

/**
 * Parquet ConvertedType codes (legacy logical type annotations).
 * Values mirror the ConvertedType enum in the parquet thrift definition,
 * so the numeric values must not be changed or reordered.
 * NOTE: UTF8 is 0 (falsy) — callers must compare against undefined,
 * not use truthiness, when checking for a converted type.
 */
export const ConvertedType = {
  UTF8: 0,
  MAP: 1,
  MAP_KEY_VALUE: 2,
  LIST: 3,
  ENUM: 4,
  DECIMAL: 5,
  DATE: 6, // days since unix epoch
  TIME_MILLIS: 7, // milliseconds since midnight
  TIME_MICROS: 8,
  TIMESTAMP_MILLIS: 9,
  TIMESTAMP_MICROS: 10,
  UINT_8: 11,
  UINT_16: 12,
  UINT_32: 13,
  UINT_64: 14,
  INT_8: 15,
  INT_16: 16,
  INT_32: 17,
  INT_64: 18,
  JSON: 19,
  BSON: 20,
  INTERVAL: 21,
}
export const CompressionCodec = {

@@ -32,0 +57,0 @@ UNCOMPRESSED: 0,

@@ -26,3 +26,3 @@ import { Encoding, ParquetType } from './constants.js'

* @param {ColumnMetaData} columnMetadata metadata for the column
* @returns {DataPage} array of values
* @returns {DataPage} definition levels, repetition levels, and array of values
*/

@@ -146,4 +146,12 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {

)
const numNulls = daph.num_values - definitionLevels
.filter((/** @type number */ d) => d === maxDefinitionLevel).length
// count nulls
let numNulls = daph.num_values
for (const def of definitionLevels) {
if (def === maxDefinitionLevel) numNulls--
}
if (numNulls === 0) {
definitionLevels.length = 0
}
return { byteLength, definitionLevels, numNulls }

@@ -150,0 +158,0 @@ }

@@ -209,3 +209,3 @@ import { ParquetEncoding, ParquetType } from './constants.js'

while (seen < count) {
const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)
const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, 1)
if (!rleValues.length) break // EOF

@@ -224,9 +224,9 @@ value.push(...rleValues)

* Read values from a run-length encoded/bit-packed hybrid encoding.
* If length is not specified, then a 32-bit int is read first to grab the
* length of the encoded data.
*
* If length is zero, then read as int32 at the start of the encoded data.
*
* @param {DataView} dataView - buffer to read data from
* @param {number} offset - offset to start reading from the DataView
* @param {number} width - width of each bit-packed group
* @param {number | undefined} length - length of the encoded data
* @param {number} length - length of the encoded data
* @param {number} numValues - number of values to read

@@ -233,0 +233,0 @@ * @returns {Decoded<number[]>} array of rle/bit-packed values

@@ -59,3 +59,3 @@ export { AsyncBuffer, FileMetaData, SchemaTree } from './types'

*/
export function parquetSchema(metadata: SchemaElement[]): SchemaTree
export function parquetSchema(metadata: FileMetaData): SchemaTree

@@ -62,0 +62,0 @@ /**

@@ -116,2 +116,3 @@

const columnBytes = columnEndByte - columnStartByte
// skip columns larger than 1gb

@@ -122,2 +123,3 @@ if (columnBytes > 1 << 30) {

}
// use pre-loaded row group byte data if available, else read column data

@@ -132,2 +134,3 @@ let buffer

}
// read column data async

@@ -134,0 +137,0 @@ promises.push(buffer.then(arrayBuffer => {

@@ -84,2 +84,13 @@ /**

TIMESTAMP_MICROS = 10,
UINT_8 = 11,
UINT_16 = 12,
UINT_32 = 13,
UINT_64 = 14,
INT_8 = 15,
INT_16 = 16,
INT_32 = 17,
INT_64 = 18,
JSON = 19,
BSON = 20,
INTERVAL = 21,
}

@@ -86,0 +97,0 @@

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc