Comparing version 0.2.1 to 0.2.2
 {
   "name": "hyparquet",
-  "version": "0.2.1",
+  "version": "0.2.2",
   "description": "parquet file parser for javascript",
@@ -5,0 +5,0 @@ "keywords": [
@@ -1,5 +1,5 @@
-import { CompressionCodec, Encoding, PageType } from './constants.js'
+import { CompressionCodec, ConvertedType, Encoding, PageType } from './constants.js'
 import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
 import { parquetHeader } from './header.js'
-import { getMaxDefinitionLevel, isRequired } from './schema.js'
+import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
 import { snappyUncompress } from './snappy.js'
@@ -14,2 +14,4 @@
+const dayMillis = 86400000 // 1 day in milliseconds
+
 /**
@@ -31,2 +33,3 @@ * Read a column from the file.
   let byteOffset = 0 // byteOffset within the column
+  /** @type {ArrayLike<any> | undefined} */
   let dictionary = undefined
@@ -70,2 +73,4 @@ const rowIndex = [0] // map/list object index
+      const dictionaryEncoding = daph.encoding === Encoding.PLAIN_DICTIONARY || daph.encoding === Encoding.RLE_DICTIONARY
+
       // construct output values: skip nulls and construct lists
@@ -75,4 +80,7 @@ let values
         // Use repetition levels to construct lists
-        if ([Encoding.PLAIN_DICTIONARY, Encoding.RLE_DICTIONARY].includes(daph.encoding)) {
-          // TODO: dereference dictionary values
+        if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
+          // dereference dictionary values
+          for (let i = 0; i < dataPage.length; i++) {
+            dataPage[i] = dictionary[dataPage[i]]
+          }
         }
@@ -83,3 +91,3 @@ const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]])
         values = assembleObjects(definitionLevels, repetitionLevels, dataPage, isNull, nullValue, maxDefinitionLevel, rowIndex[0])
-      } else if (definitionLevels) {
+      } else if (definitionLevels?.length) {
         const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
@@ -113,4 +121,14 @@ // Use definition levels to skip nulls
       } else {
-        // TODO: use dictionary
-        values = dataPage
+        if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) {
+          // dereference dictionary values
+          values = []
+          for (let i = 0; i < dataPage.length; i++) {
+            values[i] = dictionary[dataPage[i]]
+          }
+        } else if (Array.isArray(dataPage)) {
+          // convert primitive types to rich types
+          values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
+        } else {
+          values = dataPage // TODO: data page shouldn't be a fixed byte array?
+        }
       }
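
Both dictionary branches in this hunk and the one above do the same thing: a dictionary-encoded data page stores integer indexes into the dictionary page read earlier, and decoding replaces each index with its entry. A toy sketch with made-up values, separate from the package code:

// Toy illustration of dictionary decoding; values are made up
const dictionary = ['red', 'green', 'blue'] // from the dictionary page
const dataPage = [2, 0, 0, 1]               // indexes read from the data page
const values = dataPage.map(i => dictionary[i])
console.log(values) // ['blue', 'red', 'red', 'green']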
@@ -153,1 +171,42 @@
 }
+
+/**
+ * Convert known types from primitive to rich.
+ *
+ * @param {any[]} data series of primitive types
+ * @param {SchemaElement} schemaElement schema element for the data
+ * @returns {any[]} series of rich types
+ */
+function convert(data, schemaElement) {
+  const ctype = schemaElement.converted_type
+  if (!ctype) return data
+  if (ctype === ConvertedType.UTF8) {
+    const decoder = new TextDecoder()
+    return data.map(v => decoder.decode(v))
+  }
+  if (ctype === ConvertedType.DECIMAL) {
+    // the unscaled value is multiplied by 10^-scale per the parquet spec
+    const scaleFactor = Math.pow(10, -(schemaElement.scale || 0))
+    if (typeof data[0] === 'number') {
+      return data.map(v => v * scaleFactor)
+    } else {
+      // TODO: parse byte string
+      throw new Error('parquet decimal byte string not supported')
+    }
+  }
+  if (ctype === ConvertedType.DATE) {
+    // DATE is days since the unix epoch
+    return data.map(v => new Date(v * dayMillis))
+  }
+  if (ctype === ConvertedType.TIME_MILLIS) {
+    return data.map(v => new Date(v))
+  }
+  if (ctype === ConvertedType.JSON) {
+    // JSON values are stored as utf-8 encoded byte arrays
+    const decoder = new TextDecoder()
+    return data.map(v => JSON.parse(decoder.decode(v)))
+  }
+  if (ctype === ConvertedType.BSON) {
+    throw new Error('parquet bson not supported')
+  }
+  if (ctype === ConvertedType.INTERVAL) {
+    throw new Error('parquet interval not supported')
+  }
+  return data
+}
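
To make the new convert step concrete, here is a minimal standalone sketch of the UTF8 and DATE cases, assuming (as the code above does) that string values arrive as byte arrays and DATE values are days since the Unix epoch; the inputs are made up:

// UTF8: byte arrays decoded to strings
const decoder = new TextDecoder()
const strings = [new Uint8Array([104, 105])].map(v => decoder.decode(v))
console.log(strings) // ['hi']

// DATE: days since epoch, scaled to milliseconds for the Date constructor
const dayMillis = 86400000 // 1 day in milliseconds
const dates = [19000].map(v => new Date(v * dayMillis))
console.log(dates) // [2022-01-08T00:00:00.000Z]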
@@ -30,2 +30,27 @@ export const ParquetType = {
+export const ConvertedType = {
+  UTF8: 0,
+  MAP: 1,
+  MAP_KEY_VALUE: 2,
+  LIST: 3,
+  ENUM: 4,
+  DECIMAL: 5,
+  DATE: 6,
+  TIME_MILLIS: 7,
+  TIME_MICROS: 8,
+  TIMESTAMP_MILLIS: 9,
+  TIMESTAMP_MICROS: 10,
+  UINT_8: 11,
+  UINT_16: 12,
+  UINT_32: 13,
+  UINT_64: 14,
+  INT_8: 15,
+  INT_16: 16,
+  INT_32: 17,
+  INT_64: 18,
+  JSON: 19,
+  BSON: 20,
+  INTERVAL: 21,
+}
+
 export const CompressionCodec = {
@@ -32,0 +57,0 @@ UNCOMPRESSED: 0,
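
These numeric codes follow the parquet Thrift definition; converted_type arrives in the file metadata as a bare integer, and this map gives those integers names. A hypothetical reverse lookup (not part of the package) can help when debugging unfamiliar columns:

// Hypothetical debugging helper; a subset of ConvertedType for the sketch
const ConvertedType = { UTF8: 0, DECIMAL: 5, DATE: 6, JSON: 19 }
function convertedTypeName(n) {
  return Object.keys(ConvertedType).find(key => ConvertedType[key] === n)
}
console.log(convertedTypeName(5)) // 'DECIMAL'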
@@ -26,3 +26,3 @@ import { Encoding, ParquetType } from './constants.js'
  * @param {ColumnMetaData} columnMetadata metadata for the column
- * @returns {DataPage} array of values
+ * @returns {DataPage} definition levels, repetition levels, and array of values
  */
@@ -146,4 +146,12 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
     )
-    const numNulls = daph.num_values - definitionLevels
-      .filter((/** @type number */ d) => d === maxDefinitionLevel).length
+    // count nulls
+    let numNulls = daph.num_values
+    for (const def of definitionLevels) {
+      if (def === maxDefinitionLevel) numNulls--
+    }
+    if (numNulls === 0) {
+      definitionLevels.length = 0
+    }
     return { byteLength, definitionLevels, numNulls }
@@ -150,0 +158,0 @@ }
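
The rewritten null count makes a single pass over the definition levels instead of allocating a filtered copy: a value is non-null exactly when its definition level equals the maximum. The same logic on made-up levels:

// Made-up definition levels; 1 is the max definition level here
const definitionLevels = [1, 0, 1, 1, 0]
const maxDefinitionLevel = 1
let numNulls = definitionLevels.length
for (const def of definitionLevels) {
  if (def === maxDefinitionLevel) numNulls--
}
console.log(numNulls) // 2

Clearing definitionLevels when numNulls is zero lets later code skip null handling entirely for pages with no nulls.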
@@ -209,3 +209,3 @@ import { ParquetEncoding, ParquetType } from './constants.js'
   while (seen < count) {
-    const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count)
+    const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, 1)
     if (!rleValues.length) break // EOF
@@ -224,9 +224,9 @@ value.push(...rleValues)
  * Read values from a run-length encoded/bit-packed hybrid encoding.
- * If length is not specified, then a 32-bit int is read first to grab the
- * length of the encoded data.
+ *
+ * If length is zero, the length is read as an int32 at the start of the encoded data.
  *
  * @param {DataView} dataView - buffer to read data from
  * @param {number} offset - offset to start reading from the DataView
  * @param {number} width - width of each bit-packed group
- * @param {number | undefined} length - length of the encoded data
+ * @param {number} length - length of the encoded data
  * @param {number} numValues - number of values to read
@@ -233,0 +233,0 @@ * @returns {Decoded<number[]>} array of rle/bit-packed values
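
For context on the readRleBitPackedHybrid call above: in parquet's hybrid encoding each run begins with a ULEB128 header whose low bit selects the run type, so the loop can request a single value per call and still consume whole runs until count is reached. A simplified header decoder, independent of the package's implementation and assuming the header fits in one byte:

// Simplified decode of one hybrid run header (assumes a one-byte header)
function runHeader(byte) {
  if (byte & 1) return { type: 'bit-packed', values: (byte >> 1) * 8 } // groups of 8 values
  return { type: 'rle', count: byte >> 1 } // one value repeated count times
}
console.log(runHeader(0b0110)) // { type: 'rle', count: 3 }
console.log(runHeader(0b0011)) // { type: 'bit-packed', values: 8 }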
@@ -59,3 +59,3 @@ export { AsyncBuffer, FileMetaData, SchemaTree } from './types'
  */
-export function parquetSchema(metadata: SchemaElement[]): SchemaTree
+export function parquetSchema(metadata: FileMetaData): SchemaTree
@@ -62,0 +62,0 @@ /**
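
This is a breaking signature change: parquetSchema now takes the whole FileMetaData object rather than its schema element array. A before/after sketch, assuming only the exported function shown above:

import { parquetSchema } from 'hyparquet'

// before (0.2.1), callers passed the schema array: parquetSchema(metadata.schema)
// after (0.2.2), they pass the FileMetaData object itself:
function schemaTreeOf(metadata) {
  return parquetSchema(metadata)
}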
@@ -116,2 +116,3 @@
     const columnBytes = columnEndByte - columnStartByte
+    // skip columns larger than 1gb
@@ -122,2 +123,3 @@ if (columnBytes > 1 << 30) {
     }
+    // use pre-loaded row group byte data if available, else read column data
@@ -132,2 +134,3 @@ let buffer
     }
+    // read column data async
@@ -134,0 +137,0 @@ promises.push(buffer.then(arrayBuffer => {
@@ -84,2 +84,13 @@ /**
   TIMESTAMP_MICROS = 10,
+  UINT_8 = 11,
+  UINT_16 = 12,
+  UINT_32 = 13,
+  UINT_64 = 14,
+  INT_8 = 15,
+  INT_16 = 16,
+  INT_32 = 17,
+  INT_64 = 18,
+  JSON = 19,
+  BSON = 20,
+  INTERVAL = 21,
 }
@@ -86,0 +97,0 @@