Huge News! Announcing our $40M Series B led by Abstract Ventures. Learn More
Socket
Sign in | Demo | Install
Socket

hyparquet

Package Overview
Dependencies
Maintainers
1
Versions
58
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

hyparquet - npm Package Compare versions

Comparing version 0.9.6 to 0.9.7

6

package.json
{
"name": "hyparquet",
"version": "0.9.6",
"version": "0.9.7",
"description": "parquet file parser for javascript",

@@ -31,3 +31,3 @@ "keywords": [

"@types/node": "20.12.12",
"@typescript-eslint/eslint-plugin": "7.10.0",
"@typescript-eslint/eslint-plugin": "7.11.0",
"@vitest/coverage-v8": "1.6.0",

@@ -38,3 +38,3 @@ "eslint": "8.57.0",

"http-server": "14.1.1",
"hyparquet-compressors": "0.1.3",
"hyparquet-compressors": "0.1.4",
"typescript": "5.4.5",

@@ -41,0 +41,0 @@ "vitest": "1.6.0"

import { assembleLists } from './assemble.js'
import { convert, dereferenceDictionary } from './convert.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { convertWithDictionary } from './convert.js'
import { decompressPage, readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
import { parquetHeader } from './header.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { snappyUncompress } from './snappy.js'
import { concat } from './utils.js'

@@ -52,7 +51,5 @@

// construct output values: skip nulls and construct lists
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
// convert types, dereference dictionary, and assemble lists
values = convertWithDictionary(dataPage, dictionary, element, daph.encoding, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)

@@ -83,6 +80,5 @@ const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)

values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
// convert types, dereference dictionary, and assemble lists
values = convertWithDictionary(dataPage, dictionary, element, daph2.encoding, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)

@@ -128,28 +124,1 @@ const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)

}
/**
 * Produce the decompressed bytes of a page for the given compression codec.
 *
 * 'UNCOMPRESSED' pages are passed through without copying. For any other
 * codec, a decompressor found in `compressors` under that codec is used
 * first; otherwise 'SNAPPY' falls back to snappyUncompress.
 *
 * @param {Uint8Array} compressedBytes page bytes as stored
 * @param {number} uncompressed_page_size byte length the result must have
 * @param {import('./types.js').CompressionCodec} codec
 * @param {import('./types.js').Compressors | undefined} compressors custom decompressors keyed by codec
 * @returns {Uint8Array}
 * @throws {Error} when the codec is unsupported, or the result length differs from uncompressed_page_size
 */
export function decompressPage(compressedBytes, uncompressed_page_size, codec, compressors) {
  // looked up unconditionally, but only consulted when the codec is not 'UNCOMPRESSED'
  const custom = compressors?.[codec]

  /** @type {Uint8Array} */
  let output
  if (codec === 'UNCOMPRESSED') {
    output = compressedBytes
  } else if (custom) {
    output = custom(compressedBytes, uncompressed_page_size)
  } else if (codec !== 'SNAPPY') {
    throw new Error(`parquet unsupported compression codec: ${codec}`)
  } else {
    // snappyUncompress writes into a pre-sized destination buffer
    output = new Uint8Array(uncompressed_page_size)
    snappyUncompress(compressedBytes, output)
  }

  // a custom decompressor may return nothing, hence the optional access
  const actualLength = output?.length
  if (actualLength === uncompressed_page_size) return output
  throw new Error(`parquet decompressed page length ${actualLength} does not match header ${uncompressed_page_size}`)
}
const dayMillis = 86400000 // 1 day in milliseconds
/**
 * Convert known types from primitive to rich, and dereference dictionary.
 *
 * When a dictionary is given and the encoding name ends in '_DICTIONARY',
 * the dictionary is converted once and each entry of data is used as an index
 * into it. Otherwise data itself is converted.
 *
 * @typedef {import('./types.js').DecodedArray} DecodedArray
 * @typedef {import('./types.js').SchemaElement} SchemaElement
 * @param {DecodedArray} data series of primitive types
 * @param {DecodedArray | undefined} dictionary
 * @param {SchemaElement} schemaElement schema element for the data
 * @param {import('./types.js').Encoding} encoding
 * @param {boolean | undefined} utf8 decode bytes as utf8?
 * @returns {DecodedArray} series of rich types
 */
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
  // no dictionary lookup needed: convert the values directly
  if (!dictionary || !encoding.endsWith('_DICTIONARY')) {
    return convert(data, schemaElement, utf8)
  }

  // convert the dictionary entries once rather than every looked-up value
  const richDictionary = convert(dictionary, schemaElement, utf8)

  // results are written back into data, unless data is a Uint8Array and the
  // converted dictionary is not: then allocate an array of the dictionary's type
  const needsUpgrade = data instanceof Uint8Array && !(richDictionary instanceof Uint8Array)
  // @ts-expect-error upgrade data to match dictionary type with fancy constructor
  const output = needsUpgrade ? new richDictionary.constructor(data.length) : data

  for (let index = 0; index < data.length; index += 1) {
    output[index] = richDictionary[data[index]]
  }
  return output
}
/**
* Convert known types from primitive to rich.
*
* @param {DecodedArray} data series of primitive types
* @param {SchemaElement} schemaElement
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
export function convert(data, schemaElement, utf8 = true) {

@@ -76,10 +105,13 @@ const ctype = schemaElement.converted_type

}
const logicalType = schemaElement.logical_type?.type
if (logicalType === 'FLOAT16') {
if (schemaElement.logical_type?.type === 'FLOAT16') {
return Array.from(data).map(parseFloat16)
}
if (logicalType === 'TIMESTAMP') {
if (schemaElement.logical_type?.type === 'TIMESTAMP') {
const { unit } = schemaElement.logical_type
let factor = 1n
if (unit === 'MICROS') factor = 1000n
if (unit === 'NANOS') factor = 1000000n
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {
arr[i] = new Date(Number(data[i]))
arr[i] = new Date(Number(data[i] / factor))
}

@@ -95,3 +127,3 @@ return arr

*/
function parseDecimal(bytes) {
export function parseDecimal(bytes) {
// TODO: handle signed

@@ -130,23 +162,1 @@ let value = 0

}
/**
 * Replace every index in a data page with the dictionary entry it points to.
 *
 * Without a dictionary the page is returned untouched. With one, results are
 * written back into dataPage itself, except when dataPage is a Uint8Array and
 * the dictionary is not: then a new array is created with the dictionary's
 * constructor and dataPage is left unmodified.
 *
 * @param {DecodedArray | undefined} dictionary
 * @param {DecodedArray} dataPage
 * @returns {DecodedArray}
 */
export function dereferenceDictionary(dictionary, dataPage) {
  if (!dictionary) return dataPage

  const mustUpgrade = dataPage instanceof Uint8Array && !(dictionary instanceof Uint8Array)
  // @ts-expect-error not my fault typescript doesn't understand constructors
  const target = mustUpgrade ? new dictionary.constructor(dataPage.length) : dataPage

  const count = dataPage.length
  let position = 0
  while (position < count) {
    target[position] = dictionary[dataPage[position]]
    position += 1
  }
  return target
}
import { bitWidth, byteStreamSplit, readRleBitPackedHybrid } from './encoding.js'
import { readPlain } from './plain.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { snappyUncompress } from './snappy.js'

@@ -110,1 +111,28 @@ /**

}
/**
 * Decompress one page according to its compression codec.
 *
 * Resolution order: 'UNCOMPRESSED' returns the input bytes themselves, then a
 * decompressor registered in `compressors` for the codec, then snappyUncompress
 * for 'SNAPPY'. Any other codec is rejected.
 *
 * @param {Uint8Array} compressedBytes page bytes as stored
 * @param {number} uncompressed_page_size byte length the result must have
 * @param {import('./types.js').CompressionCodec} codec
 * @param {import('./types.js').Compressors | undefined} compressors custom decompressors keyed by codec
 * @returns {Uint8Array}
 * @throws {Error} when the codec is unsupported, or the result length differs from uncompressed_page_size
 */
export function decompressPage(compressedBytes, uncompressed_page_size, codec, compressors) {
  const registered = compressors?.[codec]

  /** @returns {Uint8Array} */
  const decode = () => {
    if (codec === 'UNCOMPRESSED') return compressedBytes
    if (registered) return registered(compressedBytes, uncompressed_page_size)
    if (codec === 'SNAPPY') {
      // snappyUncompress fills a destination buffer sized up front
      const buffer = new Uint8Array(uncompressed_page_size)
      snappyUncompress(compressedBytes, buffer)
      return buffer
    }
    throw new Error(`parquet unsupported compression codec: ${codec}`)
  }

  const page = decode()
  // optional access: a registered decompressor may return nothing
  if (page?.length !== uncompressed_page_size) {
    throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
  }
  return page
}

@@ -1,2 +0,2 @@

import { decompressPage } from './column.js'
import { decompressPage } from './datapage.js'
import { deltaBinaryUnpack, deltaByteArray, deltaLengthByteArray } from './delta.js'

@@ -3,0 +3,0 @@ import { bitWidth, byteStreamSplit, readRleBitPackedHybrid } from './encoding.js'

import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from './constants.js'
import { parseFloat16 } from './convert.js'
import { parseDecimal, parseFloat16 } from './convert.js'
import { getSchemaPath } from './schema.js'

@@ -217,3 +217,3 @@ import { deserializeTCompactProtocol } from './thrift.js'

isAdjustedToUTC: logicalType.field_7.field_1,
unit: logicalType.field_7.field_2,
unit: timeUnit(logicalType.field_7.field_2),
}

@@ -223,3 +223,3 @@ if (logicalType?.field_8) return {

isAdjustedToUTC: logicalType.field_8.field_1,
unit: logicalType.field_8.field_2,
unit: timeUnit(logicalType.field_8.field_2),
}

@@ -240,2 +240,13 @@ if (logicalType?.field_10) return {

/**
 * Map a decoded time unit struct to its name, based on which field is set:
 * field_1 is 'MILLIS', field_2 is 'MICROS', field_3 is 'NANOS'.
 *
 * @param {any} unit
 * @returns {import("./types.d.ts").TimeUnit}
 * @throws {Error} if unit is missing or none of the three fields is set
 */
function timeUnit(unit) {
  // optional chaining so a missing unit raises the parquet error below
  // instead of a TypeError on property access
  if (unit?.field_1) return 'MILLIS'
  if (unit?.field_2) return 'MICROS'
  if (unit?.field_3) return 'NANOS'
  throw new Error('parquet time unit required')
}
/**
* Convert column statistics based on column type.

@@ -248,3 +259,3 @@ *

function columnStats(stats, schema) {
const { type, logical_type } = schema
const { type, converted_type, logical_type } = schema
function convert(/** @type {Uint8Array} */ value) {

@@ -254,21 +265,13 @@ if (value === undefined) return value

if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value)
if (type === 'INT32') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getInt32(0, true)
}
if (type === 'INT64') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getBigInt64(0, true)
}
if (type === 'FLOAT') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getFloat32(0, true)
}
if (type === 'DOUBLE') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getFloat64(0, true)
}
if (logical_type?.type === 'FLOAT16') {
return parseFloat16(value)
}
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
if (type === 'FLOAT') return view.getFloat32(0, true)
if (type === 'DOUBLE') return view.getFloat64(0, true)
if (type === 'INT32' && converted_type === 'DATE') return new Date(view.getInt32(0, true) * 86400000)
if (type === 'INT64' && converted_type === 'TIMESTAMP_MICROS') return new Date(Number(view.getBigInt64(0, true) / 1000n))
if (type === 'INT64' && converted_type === 'TIMESTAMP_MILLIS') return new Date(Number(view.getBigInt64(0, true)))
if (type === 'INT64' && logical_type?.type === 'TIMESTAMP') return new Date(Number(view.getBigInt64(0, true)))
if (type === 'INT32') return view.getInt32(0, true)
if (type === 'INT64') return view.getBigInt64(0, true)
if (converted_type === 'DECIMAL') return parseDecimal(value) * Math.pow(10, -(schema.scale || 0))
if (logical_type?.type === 'FLOAT16') return parseFloat16(value)
return value

@@ -275,0 +278,0 @@ }

@@ -95,3 +95,3 @@ export type Awaitable<T> = T | Promise<T>

type TimeUnit = 'MILLIS' | 'MICROS' | 'NANOS'
export type TimeUnit = 'MILLIS' | 'MICROS' | 'NANOS'

@@ -117,3 +117,3 @@ type LogicalTimeType = {

export type LogicalType =
{ type: LogicalTypeType } |
{ type: LogicalTypeSimple } |
LogicalDecimalType |

@@ -124,18 +124,20 @@ LogicalTimeType |

export type LogicalTypeType =
'STRING' | // convertedType UTF8
'MAP' | // convertedType MAP
'LIST' | // convertedType LIST
'ENUM' | // convertedType ENUM
'DECIMAL' | // convertedType DECIMAL + precision/scale
'DATE' | // convertedType DATE
type LogicalTypeSimple =
'STRING' |
'MAP' |
'LIST' |
'ENUM' |
'DECIMAL' |
'DATE' |
'INTERVAL' |
'NULL' |
'JSON' |
'BSON' |
'UUID' |
'FLOAT16'
export type LogicalTypeType = LogicalTypeSimple |
'TIME' | // convertedType TIME_MILLIS or TIME_MICROS
'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS
'INTEGER' | // convertedType INT or UINT
'INTERVAL' | // convertedType INT or UINT
'NULL' | // no convertedType
'JSON' | // convertedType JSON
'BSON' | // convertedType BSON
'UUID' | // no convertedType
'FLOAT16' // no convertedType
'INTEGER' // convertedType INT or UINT

@@ -142,0 +144,0 @@ export interface RowGroup {

SocketSocket SOC 2 Logo

Product

  • Package Alerts
  • Integrations
  • Docs
  • Pricing
  • FAQ
  • Roadmap
  • Changelog

Packages

npm

Stay in touch

Get open source security insights delivered straight into your inbox.


  • Terms
  • Privacy
  • Security

Made with ⚡️ by Socket Inc