Comparing version 0.2.6 to 0.3.0
 {
   "name": "hyparquet",
-  "version": "0.2.6",
+  "version": "0.3.0",
   "description": "parquet file parser for javascript",
@@ -5,0 +5,0 @@ "keywords": [
@@ -1,2 +0,2 @@
-import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
+import { Encoding, PageType } from './constants.js'
 import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
@@ -52,8 +52,9 @@ import { parquetHeader } from './header.js'
 const { codec } = columnMetadata
-if (codec === CompressionCodec.SNAPPY) {
+if (codec === 'UNCOMPRESSED') {
+  page = compressedBytes
+} else if (codec === 'SNAPPY') {
   page = new Uint8Array(uncompressed_page_size)
   snappyUncompress(compressedBytes, page)
 } else {
-  const compressor = Object.entries(CompressionCodec).find(([, value]) => value === codec)
-  throw new Error(`parquet unsupported compression codec: ${codec} ${compressor?.[0]}`)
+  throw new Error(`parquet unsupported compression codec: ${codec}`)
 }
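
Worth noting for readers of this hunk: in 0.3.0 the `codec` value arrives as a string name rather than a thrift integer, because the metadata parsing further down in this diff maps the raw code through the `CompressionCodec` array from './constants.js'. That is why the old `Object.entries` reverse lookup in the error path could be dropped. A small illustrative sketch (the 'GZIP' value is a made-up example of a codec this path does not handle):

    // Illustrative only: why the reverse lookup is no longer needed.
    // 0.2.6: codec was a number, so the error message had to look up its name:
    //   Object.entries(CompressionCodec).find(([, value]) => value === codec)?.[0]
    // 0.3.0: codec is already a readable string:
    const codec = 'GZIP' // hypothetical codec not supported by this branch
    if (codec !== 'UNCOMPRESSED' && codec !== 'SNAPPY') {
      throw new Error(`parquet unsupported compression codec: ${codec}`)
    }
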
@@ -142,2 +143,4 @@ if (page?.length !== uncompressed_page_size) {
   dictionary = readDictionaryPage(page, diph, schema, columnMetadata)
+} else if (header.type === PageType.DATA_PAGE_V2) {
+  throw new Error('parquet data page v2 not supported')
 } else {
@@ -179,7 +182,7 @@ throw new Error(`parquet unsupported page type: ${header.type}`)
 if (ctype === undefined) return data
-if (ctype === ConvertedType.UTF8) {
+if (ctype === 'UTF8') {
   const decoder = new TextDecoder()
   return data.map(v => decoder.decode(v))
 }
-if (ctype === ConvertedType.DECIMAL) {
+if (ctype === 'DECIMAL') {
   const scaleFactor = Math.pow(10, schemaElement.scale || 0)
@@ -193,15 +196,15 @@ if (typeof data[0] === 'number') {
 }
-if (ctype === ConvertedType.DATE) {
+if (ctype === 'DATE') {
   return data.map(v => new Date(v * dayMillis))
 }
-if (ctype === ConvertedType.TIME_MILLIS) {
+if (ctype === 'TIME_MILLIS') {
   return data.map(v => new Date(v))
 }
-if (ctype === ConvertedType.JSON) {
+if (ctype === 'JSON') {
   return data.map(v => JSON.parse(v))
 }
-if (ctype === ConvertedType.BSON) {
+if (ctype === 'BSON') {
   throw new Error('parquet bson not supported')
 }
-if (ctype === ConvertedType.INTERVAL) {
+if (ctype === 'INTERVAL') {
   throw new Error('parquet interval not supported')
@@ -208,0 +211,0 @@ }
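
As a quick worked example for the 'DATE' branch above: parquet stores DATE values as days since the Unix epoch, so the conversion multiplies by milliseconds per day. The dayMillis constant itself is not shown in this diff; 86,400,000 is assumed here.

    // Assumed constant (not shown in the diff): milliseconds in one day.
    const dayMillis = 86400000
    // DATE values are stored as days since 1970-01-01.
    const data = [19000]
    const dates = data.map(v => new Date(v * dayMillis))
    console.log(dates[0].toISOString()) // 2022-01-08T00:00:00.000Z
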
@@ -12,3 +12,3 @@ export const ParquetType = {
-export const ParquetEncoding = {
+export const Encoding = {
   PLAIN: 0,
@@ -25,43 +25,43 @@ PLAIN_DICTIONARY: 2,
-export const FieldRepetitionType = {
-  REQUIRED: 0,
-  OPTIONAL: 1,
-  REPEATED: 2,
-}
+export const FieldRepetitionType = [
+  'REQUIRED',
+  'OPTIONAL',
+  'REPEATED',
+]

-export const ConvertedType = {
-  UTF8: 0,
-  MAP: 1,
-  MAP_KEY_VALUE: 2,
-  LIST: 3,
-  ENUM: 4,
-  DECIMAL: 5,
-  DATE: 6,
-  TIME_MILLIS: 7,
-  TIME_MICROS: 8,
-  TIMESTAMP_MILLIS: 9,
-  TIMESTAMP_MICROS: 10,
-  UINT_8: 11,
-  UINT_16: 12,
-  UINT_32: 13,
-  UINT_64: 14,
-  INT_8: 15,
-  INT_16: 16,
-  INT_32: 17,
-  INT_64: 18,
-  JSON: 19,
-  BSON: 20,
-  INTERVAL: 21,
-}
+export const ConvertedType = [
+  'UTF8',
+  'MAP',
+  'MAP_KEY_VALUE',
+  'LIST',
+  'ENUM',
+  'DECIMAL',
+  'DATE',
+  'TIME_MILLIS',
+  'TIME_MICROS',
+  'TIMESTAMP_MILLIS',
+  'TIMESTAMP_MICROS',
+  'UINT_8',
+  'UINT_16',
+  'UINT_32',
+  'UINT_64',
+  'INT_8',
+  'INT_16',
+  'INT_32',
+  'INT_64',
+  'JSON',
+  'BSON',
+  'INTERVAL',
+]

-export const CompressionCodec = {
-  UNCOMPRESSED: 0,
-  SNAPPY: 1,
-  GZIP: 2,
-  LZO: 3,
-  BROTLI: 4,
-  LZ4: 5,
-  ZSTD: 6,
-  LZ4_RAW: 7,
-}
+export const CompressionCodec = [
+  'UNCOMPRESSED',
+  'SNAPPY',
+  'GZIP',
+  'LZO',
+  'BROTLI',
+  'LZ4',
+  'ZSTD',
+  'LZ4_RAW',
+]
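
The point of switching these constants from objects to arrays: thrift metadata encodes each of these enums as a small integer, and the arrays are laid out in thrift-enum order, so indexing converts a raw code straight into a readable name. (Encoding keeps the object form, presumably because its values are not contiguous: PLAIN is 0 but PLAIN_DICTIONARY is 2.) A minimal sketch:

    import { CompressionCodec } from './constants.js'

    // The thrift enum value doubles as an array index:
    console.log(CompressionCodec[1]) // 'SNAPPY'

    // Reverse lookup (name -> code), if ever needed, is just indexOf:
    console.log(CompressionCodec.indexOf('ZSTD')) // 6
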
@@ -74,13 +74,1 @@ export const PageType = {
 }
-
-export const Encoding = {
-  PLAIN: 0,
-  PLAIN_DICTIONARY: 2,
-  RLE: 3,
-  BIT_PACKED: 4, // deprecated
-  DELTA_BINARY_PACKED: 5,
-  DELTA_LENGTH_BYTE_ARRAY: 6,
-  DELTA_BYTE_ARRAY: 7,
-  RLE_DICTIONARY: 8,
-  BYTE_STREAM_SPLIT: 9,
-}
@@ -35,3 +35,5 @@ import { Encoding, ParquetType } from './constants.js'
 // repetition levels
-const { value: repetitionLevels, byteLength } = readRepetitionLevels(dataView, offset, daph, schema, columnMetadata)
+const { value: repetitionLevels, byteLength } = readRepetitionLevels(
+  dataView, offset, daph, schema, columnMetadata
+)
 offset += byteLength
@@ -56,5 +58,10 @@
 if (daph.encoding === Encoding.PLAIN) {
-  const plainObj = readPlain(dataView, columnMetadata.type, daph.num_values - numNulls, offset)
+  const plainObj = readPlain(dataView, columnMetadata.type, nval, offset)
   values = plainObj.value
   offset += plainObj.byteLength
+} else if (daph.encoding === Encoding.PLAIN_DICTIONARY) {
+  const plainObj = readPlain(dataView, columnMetadata.type, nval, offset)
+  values = plainObj.value
+  offset += plainObj.byteLength
+  // TODO: dictionary decoding
 } else if (daph.encoding === Encoding.RLE_DICTIONARY) {
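
The new PLAIN_DICTIONARY branch reads the raw values and leaves the TODO open. For context, dictionary decoding in parquet means the data page holds small integer indexes into the dictionary page that was read earlier, so resolving them is a lookup. A hypothetical sketch of what that step would do (the names `dictionary` and `indexes` are illustrative, not this library's API):

    // Illustrative only: the data page stores indexes, the dictionary page stores the values.
    const dictionary = ['red', 'green', 'blue'] // e.g. from the dictionary page
    const indexes = [0, 2, 2, 1, 0]             // e.g. from the data page
    const decoded = indexes.map(i => dictionary[i])
    console.log(decoded) // ['red', 'blue', 'blue', 'green', 'red']
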
@@ -71,3 +78,5 @@ // bit width is stored as single byte
 if (bitWidth) {
-  const { value, byteLength } = readRleBitPackedHybrid(dataView, offset, bitWidth, dataView.byteLength - offset, daph.num_values - numNulls)
+  const { value, byteLength } = readRleBitPackedHybrid(
+    dataView, offset, bitWidth, dataView.byteLength - offset, nval
+  )
   offset += byteLength
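
Both call sites above replace the inline `daph.num_values - numNulls` with `nval`. Judging by the substitution, `nval` is presumably that same count of non-null values, computed once earlier in the function, along the lines of:

    const nval = daph.num_values - numNulls // non-null values on this page (assumed definition, not shown in the diff)
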
@@ -130,3 +139,2 @@ values = value
  * Read the definition levels from this page, if any.
- * Other implementations read the definition levels and num nulls, but we don't need em.
  *
@@ -133,0 +141,0 @@ * @param {DataView} dataView data view for the page
@@ -1,2 +0,2 @@
-import { ParquetEncoding, ParquetType } from './constants.js'
+import { Encoding, ParquetType } from './constants.js'
 import { readVarInt } from './thrift.js'
@@ -206,10 +206,10 @@
 let byteLength = 0
-if (encoding === ParquetEncoding.RLE) {
+if (encoding === Encoding.RLE) {
   let seen = 0
   while (seen < count) {
-    const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, 1)
-    if (!rleValues.length) break // EOF
-    value.push(...rleValues)
-    seen += rleValues.length
-    byteLength += rleByteLength
+    const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)
+    if (!rle.value.length) break // EOF
+    value.push(...rle.value)
+    seen += rle.value.length
+    byteLength += rle.byteLength
   }
@@ -248,10 +248,12 @@ } else {
   // rle
-  const { value: rleValues, byteLength: rleByteLength } = readRle(dataView, offset + byteLength, header, width)
-  value.push(...rleValues)
-  byteLength += rleByteLength
+  const rle = readRle(dataView, offset + byteLength, header, width)
+  value.push(...rle.value)
+  byteLength += rle.byteLength
 } else {
   // bit-packed
-  const { value: bitPackedValues, byteLength: bitPackedByteLength } = readBitPacked(dataView, offset + byteLength, header, width, numValues-value.length)
-  value.push(...bitPackedValues)
-  byteLength += bitPackedByteLength
+  const bitPacked = readBitPacked(
+    dataView, offset + byteLength, header, width, numValues - value.length
+  )
+  value.push(...bitPacked.value)
+  byteLength += bitPacked.byteLength
 }
@@ -258,0 +260,0 @@ }
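
For readers unfamiliar with the rle vs bit-packed branch above: in parquet's RLE/bit-packed hybrid encoding, each run starts with a varint header whose low bit selects the form. An even header means an RLE run (the upper bits are the repeat count, followed by one value); an odd header means bit-packed data (the upper bits count groups of 8 packed values). A rough standalone sketch of that dispatch, not this file's exact code:

    // Rough sketch of the hybrid run header rule (illustrative only).
    function describeRun(header) {
      if ((header & 1) === 0) {
        // RLE run: upper bits are the repeat count of a single value
        return { kind: 'rle', count: header >>> 1 }
      } else {
        // Bit-packed run: upper bits count groups of 8 packed values
        return { kind: 'bit-packed', values: (header >>> 1) * 8 }
      }
    }
    console.log(describeRun(6)) // { kind: 'rle', count: 3 }
    console.log(describeRun(5)) // { kind: 'bit-packed', values: 16 }
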
@@ -0,1 +1,2 @@
+import { CompressionCodec, ConvertedType, FieldRepetitionType } from './constants.js'
 import { schemaTree } from './schema.js'
@@ -100,6 +101,6 @@ import { deserializeTCompactProtocol } from './thrift.js'
   type_length: field.field_2,
-  repetition_type: field.field_3,
+  repetition_type: FieldRepetitionType[field.field_3],
   name: field.field_4,
   num_children: field.field_5,
-  converted_type: field.field_6,
+  converted_type: ConvertedType[field.field_6],
   scale: field.field_7,
@@ -118,3 +119,3 @@ precision: field.field_8,
   path_in_schema: column.field_3.field_3,
-  codec: column.field_3.field_4,
+  codec: CompressionCodec[column.field_3.field_4],
   num_values: column.field_3.field_5,
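
The net effect on the parsed metadata: schema elements and column metadata now carry readable names instead of raw thrift integers. A hedged sketch with made-up field values (the field_N names mirror the thrift compact-protocol field ids used above):

    import { ConvertedType, FieldRepetitionType } from './constants.js'

    // Hypothetical raw thrift struct with illustrative values:
    const field = { field_3: 1, field_4: 'name', field_6: 0 }

    const element = {
      repetition_type: FieldRepetitionType[field.field_3], // 1 -> 'OPTIONAL'
      name: field.field_4,
      converted_type: ConvertedType[field.field_6],        // 0 -> 'UTF8'
    }
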
@@ -1,3 +0,1 @@
-import { FieldRepetitionType } from './constants.js'
 /**
@@ -60,3 +58,3 @@ * @typedef {import('./types.js').SchemaElement} SchemaElement
 export function isRequired(schema, name) {
-  return schemaElement(schema, name).repetition_type === FieldRepetitionType.REQUIRED
+  return schemaElement(schema, name).repetition_type === 'REQUIRED'
 }
@@ -75,3 +73,3 @@
 const element = schemaElement(schema, parts.slice(0, i + 1))
-if (element.repetition_type === FieldRepetitionType.REPEATED) {
+if (element.repetition_type === 'REPEATED') {
   maxLevel += 1
@@ -94,3 +92,3 @@ }
 const element = schemaElement(schema, parts.slice(0, i + 1))
-if (element.repetition_type !== FieldRepetitionType.REQUIRED) {
+if (element.repetition_type !== 'REQUIRED') {
   maxLevel += 1
@@ -97,0 +95,0 @@ }
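
To make the two level computations above concrete: for a column path ['a', 'b'] where 'a' is REPEATED and 'b' is OPTIONAL, the max repetition level counts the REPEATED ancestors (1) and the max definition level counts every non-REQUIRED ancestor (2). A standalone sketch of that counting, not the library's exact function signatures:

    // Illustrative only: counting levels for a two-element column path.
    const repetitionTypes = ['REPEATED', 'OPTIONAL'] // for path ['a', 'b']

    let maxRepetitionLevel = 0
    let maxDefinitionLevel = 0
    for (const repetitionType of repetitionTypes) {
      if (repetitionType === 'REPEATED') maxRepetitionLevel += 1
      if (repetitionType !== 'REQUIRED') maxDefinitionLevel += 1
    }
    console.log(maxRepetitionLevel, maxDefinitionLevel) // 1 2
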
@@ -66,32 +66,30 @@ /**
-export enum FieldRepetitionType {
-  REQUIRED = 0,
-  OPTIONAL = 1,
-  REPEATED = 2,
-}
+export type FieldRepetitionType =
+  'REQUIRED' |
+  'OPTIONAL' |
+  'REPEATED'

-export enum ConvertedType {
-  UTF8 = 0,
-  MAP = 1,
-  MAP_KEY_VALUE = 2,
-  LIST = 3,
-  ENUM = 4,
-  DECIMAL = 5,
-  DATE = 6,
-  TIME_MILLIS = 7,
-  TIME_MICROS = 8,
-  TIMESTAMP_MILLIS = 9,
-  TIMESTAMP_MICROS = 10,
-  UINT_8 = 11,
-  UINT_16 = 12,
-  UINT_32 = 13,
-  UINT_64 = 14,
-  INT_8 = 15,
-  INT_16 = 16,
-  INT_32 = 17,
-  INT_64 = 18,
-  JSON = 19,
-  BSON = 20,
-  INTERVAL = 21,
-}
+export type ConvertedType =
+  'UTF8' |
+  'MAP' |
+  'MAP_KEY_VALUE' |
+  'LIST' |
+  'ENUM' |
+  'DECIMAL' |
+  'DATE' |
+  'TIME_MILLIS' |
+  'TIME_MICROS' |
+  'TIMESTAMP_MILLIS' |
+  'TIMESTAMP_MICROS' |
+  'UINT_8' |
+  'UINT_16' |
+  'UINT_32' |
+  'UINT_64' |
+  'INT_8' |
+  'INT_16' |
+  'INT_32' |
+  'INT_64' |
+  'JSON' |
+  'BSON' |
+  'INTERVAL'
@@ -139,12 +137,11 @@ export interface RowGroup {
-export enum CompressionCodec {
-  UNCOMPRESSED = 0,
-  SNAPPY = 1,
-  GZIP = 2,
-  LZO = 3,
-  BROTLI = 4,
-  LZ4 = 5,
-  ZSTD = 6,
-  LZ4_RAW = 7,
-}
+export type CompressionCodec =
+  'UNCOMPRESSED' |
+  'SNAPPY' |
+  'GZIP' |
+  'LZO' |
+  'BROTLI' |
+  'LZ4' |
+  'ZSTD' |
+  'LZ4_RAW'
@@ -151,0 +148,0 @@ interface KeyValue {
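
For consumers of the type declarations, the switch from enums to string unions means values are plain string literals that can be written or compared without importing anything at runtime. A small hedged usage sketch in the library's JSDoc style (the './types.js' import path follows the @typedef shown in the schema hunk above):

    /** @type {import('./types.js').FieldRepetitionType} */
    const repetition = 'OPTIONAL' // plain string literal, no enum member access

    /** @type {import('./types.js').CompressionCodec} */
    const codec = 'SNAPPY'

    // Ordinary string comparison narrows the union:
    const compressed = codec !== 'UNCOMPRESSED' // true
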