Comparing version 0.7.5 to 0.7.6
{ | ||
"name": "hyparquet", | ||
"version": "0.7.5", | ||
"version": "0.7.6", | ||
"description": "parquet file parser for javascript", | ||
@@ -5,0 +5,0 @@ "keywords": [ |
import { assembleObjects } from './assemble.js' | ||
import { PageType } from './constants.js' | ||
import { convert } from './convert.js' | ||
@@ -52,3 +51,3 @@ import { readDataPage, readDictionaryPage } from './datapage.js' | ||
// parse page data by type | ||
if (header.type === PageType.DATA_PAGE) { | ||
if (header.type === 'DATA_PAGE') { | ||
const daph = header.data_page_header | ||
@@ -99,3 +98,3 @@ if (!daph) throw new Error('parquet data page header is undefined') | ||
concat(rowData, values) | ||
} else if (header.type === PageType.DICTIONARY_PAGE) { | ||
} else if (header.type === 'DICTIONARY_PAGE') { | ||
const diph = header.dictionary_page_header | ||
@@ -108,3 +107,3 @@ if (!diph) throw new Error('parquet dictionary page header is undefined') | ||
dictionary = readDictionaryPage(page, diph, schema, columnMetadata) | ||
} else if (header.type === PageType.DATA_PAGE_V2) { | ||
} else if (header.type === 'DATA_PAGE_V2') { | ||
const daph2 = header.data_page_header_v2 | ||
@@ -206,2 +205,3 @@ if (!daph2) throw new Error('parquet data page header v2 is undefined') | ||
* Expand data page list with nulls and convert to utf8. | ||
* | ||
* @param {number[]} definitionLevels | ||
@@ -208,0 +208,0 @@ * @param {number} maxDefinitionLevel |
@@ -97,7 +97,11 @@ /** | ||
export const PageType = { | ||
DATA_PAGE: 0, | ||
INDEX_PAGE: 1, | ||
DICTIONARY_PAGE: 2, | ||
DATA_PAGE_V2: 3, | ||
} | ||
/** | ||
* @typedef {import('./types.js').PageType} PageType | ||
* @type {PageType[]} | ||
*/ | ||
export const PageType = [ | ||
'DATA_PAGE', | ||
'INDEX_PAGE', | ||
'DICTIONARY_PAGE', | ||
'DATA_PAGE_V2', | ||
] |
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' | ||
import { | ||
getMaxDefinitionLevel, | ||
getMaxRepetitionLevel, | ||
isRequired, | ||
schemaElement, | ||
skipDefinitionBytes, | ||
} from './schema.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, schemaElement, skipDefinitionBytes } from './schema.js' | ||
@@ -13,17 +7,9 @@ const skipNulls = false // TODO | ||
/** | ||
* @typedef {{ byteLength: number, definitionLevels: number[], numNulls: number }} DefinitionLevels | ||
* Read a data page from the given Uint8Array. | ||
* | ||
* @typedef {{ definitionLevels: number[], numNulls: number }} DefinitionLevels | ||
* @typedef {import("./types.d.ts").DataPage} DataPage | ||
* @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData | ||
* @typedef {import("./types.d.ts").DataPageHeader} DataPageHeader | ||
* @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader | ||
* @typedef {import("./types.d.ts").SchemaElement} SchemaElement | ||
*/ | ||
/** | ||
* @typedef {import("./types.d.ts").Decoded<T>} Decoded | ||
* @template T | ||
*/ | ||
/** | ||
* Read a data page from the given Uint8Array. | ||
* | ||
* @param {Uint8Array} bytes raw page data (should already be decompressed) | ||
@@ -36,4 +22,4 @@ * @param {DataPageHeader} daph data page header | ||
export function readDataPage(bytes, daph, schema, columnMetadata) { | ||
const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) | ||
let offset = 0 | ||
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) | ||
const reader = { view, offset: 0 } | ||
/** @type {any[]} */ | ||
@@ -43,6 +29,5 @@ let values = [] | ||
// repetition levels | ||
const { value: repetitionLevels, byteLength } = readRepetitionLevels( | ||
dataView, offset, daph, schema, columnMetadata | ||
const repetitionLevels = readRepetitionLevels( | ||
reader, daph, schema, columnMetadata | ||
) | ||
offset += byteLength | ||
@@ -56,8 +41,7 @@ // definition levels | ||
// skip_definition_bytes | ||
offset += skipDefinitionBytes(daph.num_values) | ||
reader.offset += skipDefinitionBytes(daph.num_values) | ||
} else { | ||
const dl = readDefinitionLevels(dataView, offset, daph, schema, columnMetadata.path_in_schema) | ||
const dl = readDefinitionLevels(reader, daph, schema, columnMetadata.path_in_schema) | ||
definitionLevels = dl.definitionLevels | ||
numNulls = dl.numNulls | ||
offset += dl.byteLength | ||
} | ||
@@ -70,5 +54,4 @@ | ||
const utf8 = element.converted_type === 'UTF8' | ||
const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8) | ||
values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value) | ||
offset += plainObj.byteLength | ||
const plainObj = readPlain(reader, columnMetadata.type, nValues, utf8) | ||
values = Array.isArray(plainObj) ? plainObj : Array.from(plainObj) | ||
} else if ( | ||
@@ -85,10 +68,9 @@ daph.encoding === 'PLAIN_DICTIONARY' || | ||
} else { | ||
bitWidth = dataView.getUint8(offset) | ||
offset += 1 | ||
bitWidth = view.getUint8(reader.offset) | ||
reader.offset++ | ||
} | ||
if (bitWidth) { | ||
const { value, byteLength } = readRleBitPackedHybrid( | ||
dataView, offset, bitWidth, dataView.byteLength - offset, nValues | ||
const value = readRleBitPackedHybrid( | ||
reader, bitWidth, view.byteLength - reader.offset, nValues | ||
) | ||
offset += byteLength | ||
values = Array.isArray(value) ? value : Array.from(value) | ||
@@ -109,2 +91,3 @@ } else { | ||
* | ||
* @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader | ||
* @param {Uint8Array} bytes raw page data | ||
@@ -117,4 +100,5 @@ * @param {DictionaryPageHeader} diph dictionary page header | ||
export function readDictionaryPage(bytes, diph, schema, columnMetadata) { | ||
const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) | ||
return readPlain(dataView, columnMetadata.type, diph.num_values, 0, false).value | ||
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) | ||
const reader = { view, offset: 0 } | ||
return readPlain(reader, columnMetadata.type, diph.num_values, false) | ||
} | ||
@@ -125,10 +109,10 @@ | ||
* | ||
* @param {DataView} dataView data view for the page | ||
* @param {number} offset offset to start reading from | ||
* @typedef {import("./types.d.ts").DataReader} DataReader | ||
* @param {DataReader} reader data view for the page | ||
* @param {DataPageHeader} daph data page header | ||
* @param {SchemaElement[]} schema schema for the file | ||
* @param {ColumnMetaData} columnMetadata metadata for the column | ||
* @returns {Decoded<any[]>} repetition levels and number of bytes read | ||
* @returns {any[]} repetition levels and number of bytes read | ||
*/ | ||
function readRepetitionLevels(dataView, offset, daph, schema, columnMetadata) { | ||
function readRepetitionLevels(reader, daph, schema, columnMetadata) { | ||
if (columnMetadata.path_in_schema.length > 1) { | ||
@@ -139,7 +123,7 @@ const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) | ||
return readData( | ||
dataView, daph.repetition_level_encoding, offset, daph.num_values, bitWidth | ||
reader, daph.repetition_level_encoding, daph.num_values, bitWidth | ||
) | ||
} | ||
} | ||
return { value: [], byteLength: 0 } | ||
return [] | ||
} | ||
@@ -150,4 +134,3 @@ | ||
* | ||
* @param {DataView} dataView data view for the page | ||
* @param {number} offset offset to start reading from | ||
* @param {DataReader} reader data view for the page | ||
* @param {DataPageHeader} daph data page header | ||
@@ -158,3 +141,3 @@ * @param {SchemaElement[]} schema schema for the file | ||
*/ | ||
function readDefinitionLevels(dataView, offset, daph, schema, path_in_schema) { | ||
function readDefinitionLevels(reader, daph, schema, path_in_schema) { | ||
if (!isRequired(schema, path_in_schema)) { | ||
@@ -165,4 +148,4 @@ const maxDefinitionLevel = getMaxDefinitionLevel(schema, path_in_schema) | ||
// num_values is index 1 for either type of page header | ||
const { value: definitionLevels, byteLength } = readData( | ||
dataView, daph.definition_level_encoding, offset, daph.num_values, bitWidth | ||
const definitionLevels = readData( | ||
reader, daph.definition_level_encoding, daph.num_values, bitWidth | ||
) | ||
@@ -179,6 +162,6 @@ | ||
return { byteLength, definitionLevels, numNulls } | ||
return { definitionLevels, numNulls } | ||
} | ||
} | ||
return { byteLength: 0, definitionLevels: [], numNulls: 0 } | ||
return { definitionLevels: [], numNulls: 0 } | ||
} |
@@ -7,7 +7,2 @@ import { decompressPage } from './column.js' | ||
/** | ||
* @typedef {import("./types.d.ts").Decoded<T>} Decoded | ||
* @template T | ||
*/ | ||
/** | ||
* Read a data page from the given Uint8Array. | ||
@@ -29,4 +24,4 @@ * | ||
export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, compressors) { | ||
const dataView = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength) | ||
let offset = 0 | ||
const view = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength) | ||
const reader = { view, offset: 0 } | ||
/** @type {any} */ | ||
@@ -39,10 +34,16 @@ let values = [] | ||
// repetition levels | ||
const repetitionLevels = readRepetitionLevelsV2(dataView, offset, daph2, schema, columnMetadata) | ||
const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schema, columnMetadata) | ||
if (reader.offset !== daph2.repetition_levels_byte_length) { | ||
throw new Error(`parquet repetition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length}`) | ||
} | ||
// definition levels | ||
offset += daph2.repetition_levels_byte_length | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) | ||
const definitionLevels = readDefinitionLevelsV2(dataView, offset, daph2, maxDefinitionLevel) | ||
offset += daph2.definition_levels_byte_length | ||
const definitionLevels = readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) | ||
if (reader.offset !== daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) { | ||
throw new Error(`parquet definition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length}`) | ||
} | ||
const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length | ||
@@ -55,3 +56,3 @@ | ||
const utf8 = element.converted_type === 'UTF8' | ||
let page = compressedBytes.slice(offset) | ||
let page = compressedBytes.slice(reader.offset) | ||
if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') { | ||
@@ -61,4 +62,4 @@ page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors) | ||
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) | ||
const plainObj = readPlain(pageView, columnMetadata.type, nValues, 0, utf8) | ||
values = plainObj.value | ||
const pageReader = { view: pageView, offset: 0 } | ||
values = readPlain(pageReader, columnMetadata.type, nValues, utf8) | ||
} else if (daph2.encoding === 'RLE') { | ||
@@ -71,5 +72,6 @@ const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors) | ||
} else { | ||
const pageReader = { view: pageView, offset: 4 } | ||
values = readRleBitPackedHybrid( | ||
pageView, 4, bitWidth, uncompressedPageSize, nValues | ||
).value | ||
pageReader, bitWidth, uncompressedPageSize, nValues | ||
) | ||
} | ||
@@ -80,9 +82,9 @@ } else if ( | ||
) { | ||
compressedBytes = compressedBytes.subarray(offset) | ||
compressedBytes = compressedBytes.subarray(reader.offset) | ||
const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors) | ||
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) | ||
const bitWidth = pageView.getUint8(0) | ||
const { value } = readRleBitPackedHybrid( | ||
pageView, 1, bitWidth, uncompressedPageSize, nValues | ||
const pageReader = { view: pageView, offset: 1 } | ||
const value = readRleBitPackedHybrid( | ||
pageReader, bitWidth, uncompressedPageSize, nValues | ||
) | ||
@@ -105,4 +107,4 @@ values = value | ||
* | ||
* @param {DataView} dataView data view for the page | ||
* @param {number} offset offset to start reading from | ||
* @typedef {import("./types.d.ts").DataReader} DataReader | ||
* @param {DataReader} reader data view for the page | ||
* @param {DataPageHeaderV2} daph2 data page header | ||
@@ -113,3 +115,3 @@ * @param {SchemaElement[]} schema schema for the file | ||
*/ | ||
export function readRepetitionLevelsV2(dataView, offset, daph2, schema, columnMetadata) { | ||
export function readRepetitionLevelsV2(reader, daph2, schema, columnMetadata) { | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) | ||
@@ -121,4 +123,4 @@ if (!maxRepetitionLevel) return [] | ||
return readRleBitPackedHybrid( | ||
dataView, offset, bitWidth, daph2.repetition_levels_byte_length, daph2.num_values | ||
).value | ||
reader, bitWidth, daph2.repetition_levels_byte_length, daph2.num_values | ||
) | ||
} | ||
@@ -129,4 +131,3 @@ | ||
* | ||
* @param {DataView} dataView data view for the page | ||
* @param {number} offset offset to start reading from | ||
* @param {DataReader} reader data view for the page | ||
* @param {DataPageHeaderV2} daph2 data page header v2 | ||
@@ -136,3 +137,3 @@ * @param {number} maxDefinitionLevel maximum definition level for this column | ||
*/ | ||
function readDefinitionLevelsV2(dataView, offset, daph2, maxDefinitionLevel) { | ||
function readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) { | ||
if (maxDefinitionLevel) { | ||
@@ -142,4 +143,4 @@ // not the same as V1, because we know the length | ||
return readRleBitPackedHybrid( | ||
dataView, offset, bitWidth, daph2.definition_levels_byte_length, daph2.num_values | ||
).value | ||
reader, bitWidth, daph2.definition_levels_byte_length, daph2.num_values | ||
) | ||
} | ||
@@ -146,0 +147,0 @@ } |
@@ -5,26 +5,18 @@ import { readVarInt } from './thrift.js' | ||
/** | ||
* Return type with bytes read. | ||
* This is useful to advance an offset through a buffer. | ||
* | ||
* @typedef {import("./types.d.ts").Decoded<T>} Decoded | ||
* @template T | ||
*/ | ||
/** | ||
* Read `count` boolean values. | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} count - number of values to read | ||
* @returns {Decoded<boolean[]>} array of boolean values | ||
* @returns {boolean[]} array of boolean values | ||
*/ | ||
function readPlainBoolean(dataView, offset, count) { | ||
function readPlainBoolean(reader, count) { | ||
const value = [] | ||
for (let i = 0; i < count; i++) { | ||
const byteOffset = offset + Math.floor(i / 8) | ||
const byteOffset = reader.offset + Math.floor(i / 8) | ||
const bitOffset = i % 8 | ||
const byte = dataView.getUint8(byteOffset) | ||
const byte = reader.view.getUint8(byteOffset) | ||
value.push((byte & (1 << bitOffset)) !== 0) | ||
} | ||
return { value, byteLength: Math.ceil(count / 8) } | ||
reader.offset += Math.ceil(count / 8) | ||
return value | ||
} | ||
@@ -35,13 +27,13 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} count - number of values to read | ||
* @returns {Decoded<number[]>} array of int32 values | ||
* @returns {number[]} array of int32 values | ||
*/ | ||
function readPlainInt32(dataView, offset, count) { | ||
function readPlainInt32(reader, count) { | ||
const value = [] | ||
for (let i = 0; i < count; i++) { | ||
value.push(dataView.getInt32(offset + i * 4, true)) | ||
value.push(reader.view.getInt32(reader.offset + i * 4, true)) | ||
} | ||
return { value, byteLength: count * 4 } | ||
reader.offset += count * 4 | ||
return value | ||
} | ||
@@ -52,13 +44,13 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} count - number of values to read | ||
* @returns {Decoded<bigint[]>} array of int64 values | ||
* @returns {bigint[]} array of int64 values | ||
*/ | ||
function readPlainInt64(dataView, offset, count) { | ||
function readPlainInt64(reader, count) { | ||
const value = [] | ||
for (let i = 0; i < count; i++) { | ||
value.push(dataView.getBigInt64(offset + i * 8, true)) | ||
value.push(reader.view.getBigInt64(reader.offset + i * 8, true)) | ||
} | ||
return { value, byteLength: count * 8 } | ||
reader.offset += count * 8 | ||
return value | ||
} | ||
@@ -69,15 +61,15 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} count - number of values to read | ||
* @returns {Decoded<bigint[]>} array of int96 values | ||
* @returns {bigint[]} array of int96 values | ||
*/ | ||
function readPlainInt96(dataView, offset, count) { | ||
function readPlainInt96(reader, count) { | ||
const value = [] | ||
for (let i = 0; i < count; i++) { | ||
const low = dataView.getBigInt64(offset + i * 12, true) | ||
const high = dataView.getInt32(offset + i * 12 + 8, true) | ||
const low = reader.view.getBigInt64(reader.offset + i * 12, true) | ||
const high = reader.view.getInt32(reader.offset + i * 12 + 8, true) | ||
value.push((BigInt(high) << BigInt(32)) | low) | ||
} | ||
return { value, byteLength: count * 12 } | ||
reader.offset += count * 12 | ||
return value | ||
} | ||
@@ -88,13 +80,13 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} count - number of values to read | ||
* @returns {Decoded<number[]>} array of float values | ||
* @returns {number[]} array of float values | ||
*/ | ||
function readPlainFloat(dataView, offset, count) { | ||
function readPlainFloat(reader, count) { | ||
const value = [] | ||
for (let i = 0; i < count; i++) { | ||
value.push(dataView.getFloat32(offset + i * 4, true)) | ||
value.push(reader.view.getFloat32(reader.offset + i * 4, true)) | ||
} | ||
return { value, byteLength: count * 4 } | ||
reader.offset += count * 4 | ||
return value | ||
} | ||
@@ -105,13 +97,13 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} count - number of values to read | ||
* @returns {Decoded<number[]>} array of double values | ||
* @returns {number[]} array of double values | ||
*/ | ||
function readPlainDouble(dataView, offset, count) { | ||
function readPlainDouble(reader, count) { | ||
const value = [] | ||
for (let i = 0; i < count; i++) { | ||
value.push(dataView.getFloat64(offset + i * 8, true)) | ||
value.push(reader.view.getFloat64(reader.offset + i * 8, true)) | ||
} | ||
return { value, byteLength: count * 8 } | ||
reader.offset += count * 8 | ||
return value | ||
} | ||
@@ -122,18 +114,16 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} count - number of values to read | ||
* @returns {Decoded<Uint8Array[]>} array of byte arrays | ||
* @returns {Uint8Array[]} array of byte arrays | ||
*/ | ||
function readPlainByteArray(dataView, offset, count) { | ||
function readPlainByteArray(reader, count) { | ||
const value = [] | ||
let byteLength = 0 // byte length of all data read | ||
for (let i = 0; i < count; i++) { | ||
const length = dataView.getInt32(offset + byteLength, true) | ||
byteLength += 4 | ||
const bytes = new Uint8Array(dataView.buffer, dataView.byteOffset + offset + byteLength, length) | ||
const length = reader.view.getInt32(reader.offset, true) | ||
reader.offset += 4 | ||
const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length) | ||
value.push(bytes) | ||
byteLength += length | ||
reader.offset += length | ||
} | ||
return { value, byteLength } | ||
return value | ||
} | ||
@@ -144,52 +134,49 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} fixedLength - length of each fixed length byte array | ||
* @returns {Decoded<Uint8Array>} array of fixed length byte arrays | ||
* @returns {Uint8Array} array of fixed length byte arrays | ||
*/ | ||
function readPlainByteArrayFixed(dataView, offset, fixedLength) { | ||
return { | ||
value: new Uint8Array(dataView.buffer, dataView.byteOffset + offset, fixedLength), | ||
byteLength: fixedLength, | ||
} | ||
function readPlainByteArrayFixed(reader, fixedLength) { | ||
reader.offset += fixedLength | ||
return new Uint8Array( | ||
reader.view.buffer, | ||
reader.view.byteOffset + reader.offset - fixedLength, | ||
fixedLength | ||
) | ||
} | ||
/** | ||
* Read `count` values of the given type from the dataView. | ||
* Read `count` values of the given type from the reader.view. | ||
* | ||
* @typedef {import("./types.d.ts").DecodedArray} DecodedArray | ||
* @typedef {import("./types.d.ts").ParquetType} ParquetType | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {ParquetType} type - parquet type of the data | ||
* @param {number} count - number of values to read | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {boolean} utf8 - whether to decode byte arrays as UTF-8 | ||
* @returns {Decoded<DecodedArray>} array of values | ||
* @returns {DecodedArray} array of values | ||
*/ | ||
export function readPlain(dataView, type, count, offset, utf8) { | ||
if (count === 0) return { value: [], byteLength: 0 } | ||
export function readPlain(reader, type, count, utf8) { | ||
if (count === 0) return [] | ||
if (type === 'BOOLEAN') { | ||
return readPlainBoolean(dataView, offset, count) | ||
return readPlainBoolean(reader, count) | ||
} else if (type === 'INT32') { | ||
return readPlainInt32(dataView, offset, count) | ||
return readPlainInt32(reader, count) | ||
} else if (type === 'INT64') { | ||
return readPlainInt64(dataView, offset, count) | ||
return readPlainInt64(reader, count) | ||
} else if (type === 'INT96') { | ||
return readPlainInt96(dataView, offset, count) | ||
return readPlainInt96(reader, count) | ||
} else if (type === 'FLOAT') { | ||
return readPlainFloat(dataView, offset, count) | ||
return readPlainFloat(reader, count) | ||
} else if (type === 'DOUBLE') { | ||
return readPlainDouble(dataView, offset, count) | ||
return readPlainDouble(reader, count) | ||
} else if (type === 'BYTE_ARRAY') { | ||
const byteArray = readPlainByteArray(dataView, offset, count) | ||
const byteArray = readPlainByteArray(reader, count) | ||
if (utf8) { | ||
const decoder = new TextDecoder() | ||
return { | ||
value: byteArray.value.map(bytes => decoder.decode(bytes)), | ||
byteLength: byteArray.byteLength, | ||
} | ||
return byteArray.map(bytes => decoder.decode(bytes)) | ||
} | ||
return byteArray | ||
} else if (type === 'FIXED_LEN_BYTE_ARRAY') { | ||
return readPlainByteArrayFixed(dataView, offset, count) | ||
return readPlainByteArrayFixed(reader, count) | ||
} else { | ||
@@ -215,21 +202,18 @@ throw new Error(`parquet unhandled type: ${type}`) | ||
* @typedef {import("./types.d.ts").Encoding} Encoding | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {Encoding} encoding - encoding type | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {number} count - number of values to read | ||
* @param {number} bitWidth - width of each bit-packed group | ||
* @returns {Decoded<any>} array of values | ||
* @returns {any[]} array of values | ||
*/ | ||
export function readData(dataView, encoding, offset, count, bitWidth) { | ||
export function readData(reader, encoding, count, bitWidth) { | ||
/** @type {any[]} */ | ||
const value = [] | ||
let byteLength = 0 | ||
if (encoding === 'RLE') { | ||
let seen = 0 | ||
while (seen < count) { | ||
const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count) | ||
if (!rle.value.length) break // EOF | ||
concat(value, rle.value) | ||
seen += rle.value.length | ||
byteLength += rle.byteLength | ||
const rle = readRleBitPackedHybrid(reader, bitWidth, 0, count) | ||
if (!rle.length) break // EOF | ||
concat(value, rle) | ||
seen += rle.length | ||
} | ||
@@ -239,3 +223,3 @@ } else { | ||
} | ||
return { value, byteLength } | ||
return value | ||
} | ||
@@ -248,38 +232,35 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @typedef {import("./types.d.ts").DataReader} DataReader | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} width - width of each bit-packed group | ||
* @param {number} length - length of the encoded data | ||
* @param {number} numValues - number of values to read | ||
* @returns {Decoded<number[]>} array of rle/bit-packed values | ||
* @returns {number[]} array of rle/bit-packed values | ||
*/ | ||
export function readRleBitPackedHybrid(dataView, offset, width, length, numValues) { | ||
let byteLength = 0 | ||
export function readRleBitPackedHybrid(reader, width, length, numValues) { | ||
if (!length) { | ||
length = dataView.getInt32(offset, true) | ||
length = reader.view.getInt32(reader.offset, true) | ||
reader.offset += 4 | ||
if (length < 0) throw new Error(`parquet invalid rle/bitpack length ${length}`) | ||
byteLength += 4 | ||
} | ||
/** @type {number[]} */ | ||
const value = [] | ||
const startByteLength = byteLength | ||
while (byteLength - startByteLength < length && value.length < numValues) { | ||
const [header, newOffset] = readVarInt(dataView, offset + byteLength) | ||
byteLength = newOffset - offset | ||
const startOffset = reader.offset | ||
while (reader.offset - startOffset < length && value.length < numValues) { | ||
const [header, newOffset] = readVarInt(reader.view, reader.offset) | ||
reader.offset = newOffset | ||
if ((header & 1) === 0) { | ||
// rle | ||
const rle = readRle(dataView, offset + byteLength, header, width) | ||
concat(value, rle.value) | ||
byteLength += rle.byteLength | ||
const rle = readRle(reader, header, width) | ||
concat(value, rle) | ||
} else { | ||
// bit-packed | ||
const bitPacked = readBitPacked( | ||
dataView, offset + byteLength, header, width, numValues - value.length | ||
reader, header, width, numValues - value.length | ||
) | ||
concat(value, bitPacked.value) | ||
byteLength += bitPacked.byteLength | ||
concat(value, bitPacked) | ||
} | ||
} | ||
return { value, byteLength } | ||
return value | ||
} | ||
@@ -293,22 +274,20 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} header - header information | ||
* @param {number} bitWidth - width of each bit-packed group | ||
* @returns {Decoded<number[]>} array of rle values | ||
* @returns {number[]} array of rle values | ||
*/ | ||
function readRle(dataView, offset, header, bitWidth) { | ||
function readRle(reader, header, bitWidth) { | ||
const count = header >>> 1 | ||
const width = (bitWidth + 7) >> 3 | ||
let byteLength = 0 | ||
let readValue | ||
if (width === 1) { | ||
readValue = dataView.getUint8(offset) | ||
byteLength += 1 | ||
readValue = reader.view.getUint8(reader.offset) | ||
reader.offset++ | ||
} else if (width === 2) { | ||
readValue = dataView.getUint16(offset, true) | ||
byteLength += 2 | ||
readValue = reader.view.getUint16(reader.offset, true) | ||
reader.offset += 2 | ||
} else if (width === 4) { | ||
readValue = dataView.getUint32(offset, true) | ||
byteLength += 4 | ||
readValue = reader.view.getUint32(reader.offset, true) | ||
reader.offset += 4 | ||
} else { | ||
@@ -323,3 +302,3 @@ throw new Error(`parquet invalid rle width ${width}`) | ||
} | ||
return { value, byteLength } | ||
return value | ||
} | ||
@@ -331,10 +310,9 @@ | ||
* | ||
* @param {DataView} dataView - buffer to read data from | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {DataReader} reader - buffer to read data from | ||
* @param {number} header - header information | ||
* @param {number} bitWidth - width of each bit-packed group | ||
* @param {number} remaining - number of values remaining to be read | ||
* @returns {Decoded<number[]>} array of bit-packed values | ||
* @returns {number[]} array of bit-packed values | ||
*/ | ||
function readBitPacked(dataView, offset, header, bitWidth, remaining) { | ||
function readBitPacked(reader, header, bitWidth, remaining) { | ||
// extract number of values to read from header | ||
@@ -346,8 +324,8 @@ let count = (header >> 1) << 3 | ||
let data = 0 | ||
if (offset < dataView.byteLength) { | ||
data = dataView.getUint8(offset) | ||
if (reader.offset < reader.view.byteLength) { | ||
data = reader.view.getUint8(reader.offset) | ||
reader.offset++ | ||
} else if (mask) { | ||
throw new Error(`parquet bitpack offset ${offset} out of range`) | ||
throw new Error(`parquet bitpack offset ${reader.offset} out of range`) | ||
} | ||
let byteLength = 1 | ||
let left = 8 | ||
@@ -367,9 +345,6 @@ let right = 0 | ||
// if we don't have bitWidth number of bits to read, read next byte | ||
data |= dataView.getUint8(offset + byteLength) << left | ||
byteLength++ | ||
data |= reader.view.getUint8(reader.offset) << left | ||
reader.offset++ | ||
left += 8 | ||
} else { | ||
// otherwise, read bitWidth number of bits | ||
// don't write more than remaining number of rows | ||
// even if there are still bits to read | ||
if (remaining > 0) { | ||
@@ -385,4 +360,3 @@ // emit value by shifting off to the right and masking | ||
// return values and number of bytes read | ||
return { value, byteLength } | ||
return value | ||
} | ||
@@ -389,0 +363,0 @@ |
@@ -1,8 +0,5 @@ | ||
import { Encoding } from './constants.js' | ||
import { Encoding, PageType } from './constants.js' | ||
import { deserializeTCompactProtocol } from './thrift.js' | ||
/** | ||
* Return type with bytes read. | ||
* This is useful to advance an offset through a buffer. | ||
* | ||
* @typedef {import("./types.d.ts").Decoded<T>} Decoded | ||
@@ -24,3 +21,3 @@ * @template T | ||
// Parse parquet header from thrift data | ||
const type = header.field_1 | ||
const type = PageType[header.field_1] | ||
const uncompressed_page_size = header.field_2 | ||
@@ -56,3 +53,3 @@ const compressed_page_size = header.field_3 | ||
repetition_levels_byte_length: header.field_8.field_6, | ||
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default to true | ||
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default true | ||
statistics: header.field_8.field_8, | ||
@@ -59,0 +56,0 @@ } |
@@ -70,3 +70,2 @@ | ||
* Read a row group from a file-like object. | ||
* Reads the minimal number of columns to satisfy the request. | ||
* | ||
@@ -73,0 +72,0 @@ * @typedef {import('./types.js').RowGroup} RowGroup |
@@ -82,3 +82,3 @@ /** | ||
if (element.repetition_type === 'REPEATED') { | ||
maxLevel += 1 | ||
maxLevel++ | ||
} | ||
@@ -101,3 +101,3 @@ }) | ||
if (element.repetition_type !== 'REQUIRED') { | ||
maxLevel += 1 | ||
maxLevel++ | ||
} | ||
@@ -118,3 +118,3 @@ }) | ||
while (n !== 0) { | ||
byteLength += 1 | ||
byteLength++ | ||
n >>>= 7 | ||
@@ -121,0 +121,0 @@ } |
@@ -57,3 +57,3 @@ /** | ||
const c = input[pos] | ||
pos += 1 | ||
pos++ | ||
if (c < 128) { | ||
@@ -70,3 +70,3 @@ break | ||
let len = 0 | ||
pos += 1 | ||
pos++ | ||
@@ -108,3 +108,3 @@ if (pos >= inputLength) { | ||
offset = input[pos] + ((c >>> 5) << 8) | ||
pos += 1 | ||
pos++ | ||
break | ||
@@ -111,0 +111,0 @@ case 2: |
@@ -18,2 +18,7 @@ /** | ||
export interface DataReader { | ||
view: DataView | ||
offset: number | ||
} | ||
// Parquet file metadata types | ||
@@ -194,8 +199,7 @@ export interface FileMetaData { | ||
export enum PageType { | ||
DATA_PAGE = 0, | ||
INDEX_PAGE = 1, | ||
DICTIONARY_PAGE = 2, | ||
DATA_PAGE_V2 = 3, | ||
} | ||
export type PageType = | ||
'DATA_PAGE' | | ||
'INDEX_PAGE' | | ||
'DICTIONARY_PAGE' | | ||
'DATA_PAGE_V2' | ||
@@ -202,0 +206,0 @@ interface SortingColumn { |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
97286
2513