Comparing hyparquet version 1.6.2 to 1.6.3
{ | ||
"name": "hyparquet", | ||
"version": "1.6.2", | ||
"version": "1.6.3", | ||
"description": "parquet file parser for javascript", | ||
@@ -18,9 +18,12 @@ "keywords": [ | ||
"files": [ | ||
"types", | ||
"src" | ||
], | ||
"type": "module", | ||
"types": "src/hyparquet.d.ts", | ||
"types": "types/hyparquet.d.ts", | ||
"scripts": { | ||
"build:types": "tsc -p ./tsconfig.build.json", | ||
"coverage": "vitest run --coverage --coverage.include=src", | ||
"lint": "eslint .", | ||
"prepare": "npm run build:types", | ||
"test": "vitest run" | ||
@@ -27,0 +30,0 @@ }, |
@@ -9,4 +9,3 @@ import { isListLike, isMapLike } from './schema.js' | ||
* | ||
* @typedef {import('./types.d.ts').DecodedArray} DecodedArray | ||
* @typedef {import('./types.d.ts').FieldRepetitionType} FieldRepetitionType | ||
* @import {DecodedArray, FieldRepetitionType} from '../src/types.d.ts' | ||
* @param {any[]} output | ||
@@ -108,3 +107,3 @@ * @param {number[] | undefined} definitionLevels | ||
* | ||
* @typedef {import('./types.d.ts').SchemaTree} SchemaTree | ||
* @import {SchemaTree} from '../src/types.d.ts' | ||
* @param {Map<string, any[]>} subcolumnData | ||
@@ -111,0 +110,0 @@ * @param {SchemaTree} schema top-level schema element |
import { assembleLists } from './assemble.js' | ||
import { Encoding, PageType } from './constants.js' | ||
import { convertWithDictionary } from './convert.js' | ||
import { decompressPage, readDataPage, readDictionaryPage } from './datapage.js' | ||
import { readDataPageV2 } from './datapageV2.js' | ||
import { parquetHeader } from './header.js' | ||
import { decompressPage, readDataPage, readDataPageV2, readDictionaryPage } from './datapage.js' | ||
import { getMaxDefinitionLevel } from './schema.js' | ||
import { deserializeTCompactProtocol } from './thrift.js' | ||
import { concat } from './utils.js' | ||
@@ -12,9 +12,7 @@ | ||
* | ||
* @typedef {import('./types.js').ColumnMetaData} ColumnMetaData | ||
* @typedef {import('./types.js').DecodedArray} DecodedArray | ||
* @param {import('./types.js').DataReader} reader | ||
* @param {DataReader} reader | ||
* @param {number} rowLimit maximum number of rows to read | ||
* @param {ColumnMetaData} columnMetadata column metadata | ||
* @param {import('./types.js').SchemaTree[]} schemaPath schema path for the column | ||
* @param {import('./hyparquet.js').ParquetReadOptions} options read options | ||
* @param {SchemaTree[]} schemaPath schema path for the column | ||
* @param {ParquetReadOptions} options read options | ||
* @returns {any[]} array of values | ||
@@ -121,1 +119,59 @@ */ | ||
} | ||
/** | ||
* Read a parquet page header from the reader, using the thrift compact protocol.
* | ||
* @import {ColumnMetaData, DecodedArray, DataReader, PageHeader, ParquetReadOptions, SchemaTree} from '../src/types.d.ts' | ||
* @param {DataReader} reader - parquet file reader | ||
* @returns {PageHeader} parsed page header; reader.offset is left just past the header
*/ | ||
function parquetHeader(reader) { | ||
const header = deserializeTCompactProtocol(reader) | ||
// Parse parquet header from thrift data | ||
const type = PageType[header.field_1] | ||
const uncompressed_page_size = header.field_2 | ||
const compressed_page_size = header.field_3 | ||
const crc = header.field_4 | ||
const data_page_header = header.field_5 && { | ||
num_values: header.field_5.field_1, | ||
encoding: Encoding[header.field_5.field_2], | ||
definition_level_encoding: Encoding[header.field_5.field_3], | ||
repetition_level_encoding: Encoding[header.field_5.field_4], | ||
statistics: header.field_5.field_5 && { | ||
max: header.field_5.field_5.field_1, | ||
min: header.field_5.field_5.field_2, | ||
null_count: header.field_5.field_5.field_3, | ||
distinct_count: header.field_5.field_5.field_4, | ||
max_value: header.field_5.field_5.field_5, | ||
min_value: header.field_5.field_5.field_6, | ||
}, | ||
} | ||
const index_page_header = header.field_6 | ||
const dictionary_page_header = header.field_7 && { | ||
num_values: header.field_7.field_1, | ||
encoding: Encoding[header.field_7.field_2], | ||
is_sorted: header.field_7.field_3, | ||
} | ||
const data_page_header_v2 = header.field_8 && { | ||
num_values: header.field_8.field_1, | ||
num_nulls: header.field_8.field_2, | ||
num_rows: header.field_8.field_3, | ||
encoding: Encoding[header.field_8.field_4], | ||
definition_levels_byte_length: header.field_8.field_5, | ||
repetition_levels_byte_length: header.field_8.field_6, | ||
is_compressed: header.field_8.field_7 === undefined ? true : header.field_8.field_7, // default true | ||
statistics: header.field_8.field_8, | ||
} | ||
return { | ||
type, | ||
uncompressed_page_size, | ||
compressed_page_size, | ||
crc, | ||
data_page_header, | ||
index_page_header, | ||
dictionary_page_header, | ||
data_page_header_v2, | ||
} | ||
} |
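parquetHeader decodes one thrift-compact PageHeader and leaves reader.offset pointing at the page bytes that follow, which is what lets the column reader walk a chunk page by page. A minimal sketch of that loop, assuming arrayBuffer already holds a single column chunk; parquetHeader is module-internal in this file, so the direct call is illustrative only:

// illustrative only: parquetHeader is not exported, and arrayBuffer is an assumed input
const view = new DataView(arrayBuffer)
const reader = { view, offset: 0 }
while (reader.offset < view.byteLength) {
  const header = parquetHeader(reader) // advances reader.offset past the thrift header
  const pageBytes = new Uint8Array(arrayBuffer, reader.offset, header.compressed_page_size)
  // decode pageBytes according to header.type (DATA_PAGE, DICTIONARY_PAGE, DATA_PAGE_V2, ...)
  reader.offset += header.compressed_page_size
}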
@@ -1,2 +0,2 @@ | ||
/** @type {import('./types.js').ParquetType[]} */ | ||
/** @type {import('../src/types.d.ts').ParquetType[]} */ | ||
export const ParquetType = [ | ||
@@ -32,3 +32,3 @@ 'BOOLEAN', | ||
/** @type {import('./types.js').ConvertedType[]} */ | ||
/** @type {import('../src/types.d.ts').ConvertedType[]} */ | ||
export const ConvertedType = [ | ||
@@ -59,3 +59,3 @@ 'UTF8', | ||
/** @type {import('./types.js').LogicalTypeType[]} */ | ||
/** @type {import('../src/types.d.ts').LogicalTypeType[]} */ | ||
export const logicalTypeType = [ | ||
@@ -90,3 +90,3 @@ 'NULL', | ||
/** @type {import('./types.js').PageType[]} */ | ||
/** @type {import('../src/types.d.ts').PageType[]} */ | ||
export const PageType = [ | ||
@@ -99,3 +99,3 @@ 'DATA_PAGE', | ||
/** @type {import('./types.js').BoundaryOrder[]} */ | ||
/** @type {import('../src/types.d.ts').BoundaryOrder[]} */ | ||
export const BoundaryOrder = [ | ||
@@ -102,0 +102,0 @@ 'UNORDERED', |
@@ -6,8 +6,7 @@ const dayMillis = 86400000 // 1 day in milliseconds | ||
* | ||
* @typedef {import('./types.js').DecodedArray} DecodedArray | ||
* @typedef {import('./types.js').SchemaElement} SchemaElement | ||
* @import {DecodedArray, Encoding, SchemaElement} from '../src/types.d.ts' | ||
* @param {DecodedArray} data series of primitive types | ||
* @param {DecodedArray | undefined} dictionary | ||
* @param {SchemaElement} schemaElement | ||
* @param {import('./types.js').Encoding} encoding | ||
* @param {Encoding} encoding | ||
* @param {boolean | undefined} utf8 decode bytes as utf8? | ||
@@ -14,0 +13,0 @@ * @returns {DecodedArray} series of rich types |
@@ -0,1 +1,2 @@ | ||
import { deltaBinaryUnpack, deltaByteArray, deltaLengthByteArray } from './delta.js' | ||
import { bitWidth, byteStreamSplit, readRleBitPackedHybrid } from './encoding.js' | ||
@@ -9,7 +10,2 @@ import { readPlain } from './plain.js' | ||
* | ||
* @typedef {import("./types.d.ts").DataPage} DataPage | ||
* @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData | ||
* @typedef {import("./types.d.ts").DataPageHeader} DataPageHeader | ||
* @typedef {import("./types.d.ts").SchemaTree} SchemaTree | ||
* @typedef {import("./types.d.ts").DecodedArray} DecodedArray | ||
* @param {Uint8Array} bytes raw page data (should already be decompressed) | ||
@@ -62,3 +58,3 @@ * @param {DataPageHeader} daph data page header | ||
* @param {Uint8Array} bytes raw page data | ||
* @param {import("./types.d.ts").DictionaryPageHeader} diph dictionary page header | ||
* @param {DictionaryPageHeader} diph dictionary page header | ||
* @param {ColumnMetaData} columnMetadata | ||
@@ -75,3 +71,3 @@ * @param {number | undefined} typeLength - type_length from schema | ||
/** | ||
* @typedef {import("./types.d.ts").DataReader} DataReader | ||
* @import {ColumnMetaData, CompressionCodec, Compressors, DataPage, DataPageHeader, DataPageHeaderV2, DataReader, DecodedArray, DictionaryPageHeader, PageHeader, SchemaTree} from '../src/types.d.ts' | ||
* @param {DataReader} reader data view for the page | ||
@@ -120,4 +116,4 @@ * @param {DataPageHeader} daph data page header | ||
* @param {number} uncompressed_page_size | ||
* @param {import('./types.js').CompressionCodec} codec | ||
* @param {import('./types.js').Compressors | undefined} compressors | ||
* @param {CompressionCodec} codec | ||
* @param {Compressors | undefined} compressors | ||
* @returns {Uint8Array} | ||
@@ -144,1 +140,108 @@ */ | ||
} | ||
/** | ||
* Read a data page from the given Uint8Array. | ||
* | ||
* @param {Uint8Array} compressedBytes raw page data | ||
* @param {PageHeader} ph page header | ||
* @param {SchemaTree[]} schemaPath | ||
* @param {ColumnMetaData} columnMetadata | ||
* @param {Compressors | undefined} compressors | ||
* @returns {DataPage} definition levels, repetition levels, and array of values | ||
*/ | ||
export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, compressors) { | ||
const view = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength) | ||
const reader = { view, offset: 0 } | ||
const { codec, type } = columnMetadata | ||
const daph2 = ph.data_page_header_v2 | ||
if (!daph2) throw new Error('parquet data page header v2 is undefined') | ||
// repetition levels | ||
const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schemaPath) | ||
reader.offset = daph2.repetition_levels_byte_length // position the reader just past the repetition levels, even when none were decoded | ||
// definition levels | ||
const definitionLevels = readDefinitionLevelsV2(reader, daph2, schemaPath) | ||
// assert(reader.offset === daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) | ||
const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length | ||
let page = compressedBytes.subarray(reader.offset) | ||
if (daph2.is_compressed !== false) { | ||
page = decompressPage(page, uncompressedPageSize, codec, compressors) | ||
} | ||
const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) | ||
const pageReader = { view: pageView, offset: 0 } | ||
// read values based on encoding | ||
/** @type {DecodedArray} */ | ||
let dataPage | ||
const nValues = daph2.num_values - daph2.num_nulls | ||
if (daph2.encoding === 'PLAIN') { | ||
const { type_length } = schemaPath[schemaPath.length - 1].element | ||
dataPage = readPlain(pageReader, type, nValues, type_length) | ||
} else if (daph2.encoding === 'RLE') { | ||
// assert(columnMetadata.type === 'BOOLEAN') | ||
dataPage = new Array(nValues) | ||
readRleBitPackedHybrid(pageReader, 1, 0, dataPage) | ||
dataPage = dataPage.map(x => !!x) | ||
} else if ( | ||
daph2.encoding === 'PLAIN_DICTIONARY' || | ||
daph2.encoding === 'RLE_DICTIONARY' | ||
) { | ||
const bitWidth = pageView.getUint8(pageReader.offset++) | ||
dataPage = new Array(nValues) | ||
readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize - 1, dataPage) | ||
} else if (daph2.encoding === 'DELTA_BINARY_PACKED') { | ||
const int32 = type === 'INT32' | ||
dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues) | ||
deltaBinaryUnpack(pageReader, nValues, dataPage) | ||
} else if (daph2.encoding === 'DELTA_LENGTH_BYTE_ARRAY') { | ||
dataPage = new Array(nValues) | ||
deltaLengthByteArray(pageReader, nValues, dataPage) | ||
} else if (daph2.encoding === 'DELTA_BYTE_ARRAY') { | ||
dataPage = new Array(nValues) | ||
deltaByteArray(pageReader, nValues, dataPage) | ||
} else if (daph2.encoding === 'BYTE_STREAM_SPLIT') { | ||
const { type_length } = schemaPath[schemaPath.length - 1].element | ||
dataPage = byteStreamSplit(pageReader, nValues, type, type_length) // read from the decompressed page bytes, like the other encodings | ||
} else { | ||
throw new Error(`parquet unsupported encoding: ${daph2.encoding}`) | ||
} | ||
return { definitionLevels, repetitionLevels, dataPage } | ||
} | ||
/** | ||
* @param {DataReader} reader | ||
* @param {DataPageHeaderV2} daph2 data page header v2 | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {any[]} repetition levels | ||
*/ | ||
function readRepetitionLevelsV2(reader, daph2, schemaPath) { | ||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) | ||
if (!maxRepetitionLevel) return [] | ||
const values = new Array(daph2.num_values) | ||
readRleBitPackedHybrid( | ||
reader, bitWidth(maxRepetitionLevel), daph2.repetition_levels_byte_length, values | ||
) | ||
return values | ||
} | ||
/** | ||
* @param {DataReader} reader | ||
* @param {DataPageHeaderV2} daph2 data page header v2 | ||
* @param {SchemaTree[]} schemaPath | ||
* @returns {number[] | undefined} definition levels | ||
*/ | ||
function readDefinitionLevelsV2(reader, daph2, schemaPath) { | ||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) | ||
if (maxDefinitionLevel) { | ||
// data page v2 stores definition_levels_byte_length up front, so the length is known | ||
const values = new Array(daph2.num_values) | ||
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), daph2.definition_levels_byte_length, values) | ||
return values | ||
} | ||
} |
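readDataPageV2 expects the page bytes exactly as stored in the file: the repetition and definition levels sit uncompressed at the front, so only the value section is decompressed inside (unless is_compressed is false). A hedged sketch of a call site, where ph, compressedBytes, schemaPath, columnMetadata and compressors are assumed to come from the surrounding column-reading code:

import { readDataPageV2 } from './datapage.js'

// assumed inputs: ph (PageHeader), compressedBytes (Uint8Array of ph.compressed_page_size bytes),
// schemaPath, columnMetadata, compressors
if (ph.type === 'DATA_PAGE_V2') {
  const { definitionLevels, repetitionLevels, dataPage } =
    readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, compressors)
  // dataPage holds num_values - num_nulls decoded primitives; the levels drive nested assembly
  console.log(dataPage.length, repetitionLevels.length, definitionLevels?.length)
}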
import { readVarInt, readZigZagBigInt } from './thrift.js' | ||
/** | ||
* @typedef {import('./types.d.ts').DataReader} DataReader | ||
* @import {DataReader} from '../src/types.d.ts' | ||
* @param {DataReader} reader | ||
@@ -6,0 +6,0 @@ * @param {number} count number of values to read |
@@ -18,4 +18,2 @@ import { readVarInt } from './thrift.js' | ||
* | ||
* @typedef {import("./types.d.ts").DataReader} DataReader | ||
* @typedef {import("./types.d.ts").DecodedArray} DecodedArray | ||
* @param {DataReader} reader | ||
@@ -121,3 +119,2 @@ * @param {number} width - width of each bit-packed group | ||
/** | ||
* @typedef {import("./types.d.ts").ParquetType} ParquetType | ||
* @param {DataReader} reader | ||
@@ -154,2 +151,3 @@ * @param {number} count | ||
/** | ||
* @import {DataReader, DecodedArray, ParquetType} from '../src/types.d.ts' | ||
* @param {ParquetType} type | ||
@@ -156,0 +154,0 @@ * @param {number | undefined} typeLength |
@@ -10,10 +10,8 @@ export { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata.js' | ||
export { asyncBufferFromFile, asyncBufferFromUrl, byteLengthFromUrl, toJson } from './utils.js' | ||
export { asyncBufferFromFile, asyncBufferFromUrl, byteLengthFromUrl, cachedAsyncBuffer, toJson } from './utils.js' | ||
export { cachedAsyncBuffer } from './asyncBuffer.js' | ||
/** | ||
* @param {import('./hyparquet.js').ParquetReadOptions} options | ||
* @returns {Promise<Array<Record<string, any>>>} | ||
*/ | ||
* @param {ParquetReadOptions} options | ||
* @returns {Promise<Record<string, any>[]>} resolves when all requested rows and columns are parsed | ||
*/ | ||
export function parquetReadObjects(options) { | ||
@@ -28,1 +26,39 @@ return new Promise((onComplete, reject) => { | ||
} | ||
/** | ||
 * Explicitly export types so that downstream TypeScript projects can use them, | ||
 * for example via `import { ParquetReadOptions } from 'hyparquet'`. | ||
* | ||
* @template {any} T | ||
* @typedef {import('../src/types.d.ts').Awaitable<T>} Awaitable<T> | ||
*/ | ||
/** | ||
* @typedef {import('../src/types.d.ts').AsyncBuffer} AsyncBuffer | ||
* @typedef {import('../src/types.d.ts').DataReader} DataReader | ||
* @typedef {import('../src/types.d.ts').FileMetaData} FileMetaData | ||
* @typedef {import('../src/types.d.ts').SchemaTree} SchemaTree | ||
* @typedef {import('../src/types.d.ts').SchemaElement} SchemaElement | ||
* @typedef {import('../src/types.d.ts').ParquetType} ParquetType | ||
* @typedef {import('../src/types.d.ts').FieldRepetitionType} FieldRepetitionType | ||
* @typedef {import('../src/types.d.ts').ConvertedType} ConvertedType | ||
* @typedef {import('../src/types.d.ts').TimeUnit} TimeUnit | ||
* @typedef {import('../src/types.d.ts').LogicalType} LogicalType | ||
* @typedef {import('../src/types.d.ts').LogicalTypeType} LogicalTypeType | ||
* @typedef {import('../src/types.d.ts').RowGroup} RowGroup | ||
* @typedef {import('../src/types.d.ts').ColumnChunk} ColumnChunk | ||
* @typedef {import('../src/types.d.ts').ColumnMetaData} ColumnMetaData | ||
* @typedef {import('../src/types.d.ts').Encoding} Encoding | ||
* @typedef {import('../src/types.d.ts').CompressionCodec} CompressionCodec | ||
* @typedef {import('../src/types.d.ts').Compressors} Compressors | ||
* @typedef {import('../src/types.d.ts').Statistics} Statistics | ||
* @typedef {import('../src/types.d.ts').PageType} PageType | ||
* @typedef {import('../src/types.d.ts').PageHeader} PageHeader | ||
* @typedef {import('../src/types.d.ts').DataPageHeader} DataPageHeader | ||
* @typedef {import('../src/types.d.ts').DictionaryPageHeader} DictionaryPageHeader | ||
* @typedef {import('../src/types.d.ts').DecodedArray} DecodedArray | ||
* @typedef {import('../src/types.d.ts').OffsetIndex} OffsetIndex | ||
* @typedef {import('../src/types.d.ts').ColumnIndex} ColumnIndex | ||
* @typedef {import('../src/types.d.ts').BoundaryOrder} BoundaryOrder | ||
* @typedef {import('../src/types.d.ts').ColumnData} ColumnData | ||
* @typedef {import('../src/types.d.ts').ParquetReadOptions} ParquetReadOptions | ||
*/ |
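These typedefs mirror what the generated types/hyparquet.d.ts exposes, so downstream projects can import the same names from the package root. A small consumer-side sketch (the file path and column names are placeholders):

import { asyncBufferFromFile, parquetReadObjects } from 'hyparquet'

/**
 * @param {import('hyparquet').ParquetReadOptions} options
 * @returns {Promise<Record<string, any>[]>}
 */
function readAll(options) {
  return parquetReadObjects(options)
}

const file = await asyncBufferFromFile('data.parquet') // placeholder path
const rows = await readAll({ file, columns: ['id', 'name'] }) // placeholder columns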
@@ -6,6 +6,5 @@ import { BoundaryOrder } from './constants.js' | ||
/** | ||
* @typedef {import('./types.d.ts').DataReader} DataReader | ||
* @param {DataReader} reader | ||
* @param {import('./types.d.ts').SchemaElement} schema | ||
* @returns {import('./types.d.ts').ColumnIndex} | ||
* @param {SchemaElement} schema | ||
* @returns {ColumnIndex} | ||
*/ | ||
@@ -27,3 +26,3 @@ export function readColumnIndex(reader, schema) { | ||
* @param {DataReader} reader | ||
* @returns {import('./types.d.ts').OffsetIndex} | ||
* @returns {OffsetIndex} | ||
*/ | ||
@@ -39,4 +38,5 @@ export function readOffsetIndex(reader) { | ||
/** | ||
* @import {ColumnIndex, DataReader, OffsetIndex, PageLocation, SchemaElement} from '../src/types.d.ts' | ||
* @param {any} loc | ||
* @returns {import('./types.d.ts').PageLocation} | ||
* @returns {PageLocation} | ||
*/ | ||
@@ -43,0 +43,0 @@ function pageLocation(loc) { |
@@ -26,7 +26,4 @@ import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from './constants.js' | ||
* | ||
* @typedef {import("./types.d.ts").AsyncBuffer} AsyncBuffer | ||
* @typedef {import("./types.d.ts").FileMetaData} FileMetaData | ||
* @typedef {import("./types.d.ts").SchemaElement} SchemaElement | ||
* @param {AsyncBuffer} asyncBuffer parquet file contents | ||
* @param {number} initialFetchSize initial fetch size in bytes | ||
* @param {number} initialFetchSize initial fetch size in bytes (default 512kb) | ||
* @returns {Promise<FileMetaData>} parquet metadata object | ||
@@ -194,3 +191,3 @@ */ | ||
* @param {FileMetaData} metadata parquet metadata object | ||
* @returns {import("./types.d.ts").SchemaTree} tree of schema elements | ||
* @returns {SchemaTree} tree of schema elements | ||
*/ | ||
@@ -203,3 +200,3 @@ export function parquetSchema(metadata) { | ||
* @param {any} logicalType | ||
* @returns {import("./types.d.ts").LogicalType | undefined} | ||
* @returns {LogicalType | undefined} | ||
*/ | ||
@@ -242,3 +239,3 @@ function logicalType(logicalType) { | ||
* @param {any} unit | ||
* @returns {import("./types.d.ts").TimeUnit} | ||
* @returns {TimeUnit} | ||
*/ | ||
@@ -255,5 +252,6 @@ function timeUnit(unit) { | ||
* | ||
* @import {AsyncBuffer, FileMetaData, LogicalType, MinMaxType, SchemaElement, SchemaTree, Statistics, TimeUnit} from '../src/types.d.ts' | ||
* @param {any} stats | ||
* @param {SchemaElement} schema | ||
* @returns {import("./types.d.ts").Statistics} | ||
* @returns {Statistics} | ||
*/ | ||
@@ -276,3 +274,3 @@ function convertStats(stats, schema) { | ||
* @param {SchemaElement} schema | ||
* @returns {import('./types.d.ts').MinMaxType | undefined} | ||
* @returns {MinMaxType | undefined} | ||
*/ | ||
@@ -279,0 +277,0 @@ export function convertMetadata(value, schema) { |
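The metadata hunks above only touch JSDoc, but they describe the public flow: parquetMetadataAsync fetches the footer (512kb initial fetch by default) and parquetSchema turns the flat schema elements into a SchemaTree. A hedged end-to-end sketch (the URL is a placeholder):

import { asyncBufferFromUrl, parquetMetadataAsync, parquetSchema } from 'hyparquet'

const file = await asyncBufferFromUrl({ url: 'https://example.com/data.parquet' }) // placeholder URL
const metadata = await parquetMetadataAsync(file) // reads only the footer bytes
const schemaTree = parquetSchema(metadata)
console.log(Number(metadata.num_rows), 'rows,', schemaTree.children.length, 'top-level columns')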
/** | ||
* Read `count` values of the given type from the reader.view. | ||
* | ||
* @typedef {import("./types.d.ts").DataReader} DataReader | ||
* @typedef {import("./types.d.ts").DecodedArray} DecodedArray | ||
* @typedef {import("./types.d.ts").ParquetType} ParquetType | ||
* @import {DataReader, DecodedArray, ParquetType} from '../src/types.d.ts' | ||
* @param {DataReader} reader - buffer to read data from | ||
@@ -8,0 +6,0 @@ * @param {ParquetType} type - parquet type of the data |
@@ -6,8 +6,8 @@ import { parquetReadObjects } from './hyparquet.js' | ||
* Wraps parquetRead with orderBy support. | ||
* This is a parquet-aware query engine that can read a subset of rows, | ||
* with an optional orderBy clause. | ||
* This is a parquet-aware query engine that can read a subset of rows and columns. | ||
* Accepts an optional orderBy column name to sort the results. | ||
* Note that using orderBy may SIGNIFICANTLY increase the query time. | ||
* | ||
* @typedef {import('./hyparquet.js').ParquetReadOptions} ParquetReadOptions | ||
* @param {ParquetReadOptions & { orderBy?: string }} options | ||
* @returns {Promise<Record<string, any>[]>} | ||
* @returns {Promise<Record<string, any>[]>} resolves when all requested rows and columns are parsed | ||
*/ | ||
@@ -40,2 +40,3 @@ export async function parquetQuery(options) { | ||
* Returns a sparse array of rows. | ||
* @import {ParquetReadOptions} from '../src/types.d.ts' | ||
* @param {ParquetReadOptions & { rows: number[] }} options | ||
@@ -42,0 +43,0 @@ * @returns {Promise<Record<string, any>[]>} |
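parquetQuery layers row selection and sorting on top of the plain read path; with orderBy it generally has to read every row before sorting, which is what the warning above refers to. A usage sketch, assuming parquetQuery is re-exported from the package entry (the file path and orderBy column are placeholders):

import { asyncBufferFromFile, parquetQuery } from 'hyparquet'

const file = await asyncBufferFromFile('data.parquet') // placeholder path
// first 10 rows after sorting by a placeholder 'timestamp' column
const top = await parquetQuery({ file, orderBy: 'timestamp', rowStart: 0, rowEnd: 10 })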
@@ -18,6 +18,2 @@ import { assembleNested } from './assemble.js' | ||
* | ||
* @typedef {import('./hyparquet.js').ColumnData} ColumnData | ||
* @typedef {import('./types.js').Compressors} Compressors | ||
* @typedef {import('./types.js').AsyncBuffer} AsyncBuffer | ||
* @typedef {import('./types.js').FileMetaData} FileMetaData | ||
* @param {object} options read options | ||
@@ -27,3 +23,3 @@ * @param {AsyncBuffer} options.file file-like object containing parquet data | ||
* @param {string[]} [options.columns] columns to read, all columns if undefined | ||
* @param {string} [options.rowFormat] format of each row passed to the onComplete function | ||
* @param {string} [options.rowFormat] desired format of each row passed to the onComplete function | ||
* @param {number} [options.rowStart] first requested row index (inclusive) | ||
@@ -74,3 +70,2 @@ * @param {number} [options.rowEnd] last requested row index (exclusive) | ||
* | ||
* @typedef {import('./types.js').RowGroup} RowGroup | ||
* @param {object} options read options | ||
@@ -223,3 +218,4 @@ * @param {AsyncBuffer} options.file file-like object containing parquet data | ||
* | ||
* @param {import('./types.js').SchemaTree} schema | ||
* @import {AsyncBuffer, ColumnData, Compressors, FileMetaData, RowGroup, SchemaTree} from '../src/types.d.ts' | ||
* @param {SchemaTree} schema | ||
* @param {string[]} output | ||
@@ -226,0 +222,0 @@ * @returns {string[]} |
/** | ||
* Build a tree from the schema elements. | ||
* | ||
* @typedef {import('./types.js').SchemaElement} SchemaElement | ||
* @typedef {import('./types.js').SchemaTree} SchemaTree | ||
* @import {SchemaElement, SchemaTree} from '../src/types.d.ts' | ||
* @param {SchemaElement[]} schema | ||
@@ -7,0 +6,0 @@ * @param {number} rootIndex index of the root element |
@@ -22,3 +22,3 @@ // TCompactProtocol types | ||
* | ||
* @typedef {import("./types.d.ts").DataReader} DataReader | ||
* @import {DataReader} from '../src/types.d.ts' | ||
* @param {DataReader} reader | ||
@@ -25,0 +25,0 @@ * @returns {Record<string, any>} |
@@ -327,1 +327,27 @@ export type Awaitable<T> = T | Promise<T> | ||
export type BoundaryOrder = 'UNORDERED' | 'ASCENDING' | 'DESCENDING' | ||
/** | ||
* A run of column data | ||
*/ | ||
export interface ColumnData { | ||
columnName: string | ||
columnData: ArrayLike<any> | ||
rowStart: number | ||
rowEnd: number | ||
} | ||
/** | ||
* Parquet query options for reading data | ||
*/ | ||
export interface ParquetReadOptions { | ||
file: AsyncBuffer // file-like object containing parquet data | ||
metadata?: FileMetaData // parquet metadata, will be parsed if not provided | ||
columns?: string[] // columns to read, all columns if undefined | ||
rowFormat?: string // format of each row passed to the onComplete function | ||
rowStart?: number // first requested row index (inclusive) | ||
rowEnd?: number // last requested row index (exclusive) | ||
onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range. | ||
onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed | ||
compressors?: Compressors // custom decompressors | ||
utf8?: boolean // decode byte arrays as utf8 strings (default true) | ||
} |
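ParquetReadOptions is the options bag shared by parquetRead and parquetReadObjects; onChunk receives ColumnData runs as columns are decoded, independently of onComplete. A hedged sketch exercising those fields (the file path and column names are placeholders):

import { asyncBufferFromFile, parquetRead } from 'hyparquet'

const file = await asyncBufferFromFile('data.parquet') // placeholder path
await parquetRead({
  file,
  columns: ['id', 'score'], // placeholder columns
  rowStart: 0,
  rowEnd: 1000,
  onChunk: ({ columnName, columnData, rowStart, rowEnd }) => {
    // a contiguous run of decoded values for one column; may extend past the requested rows
    console.log(columnName, rowEnd - rowStart, 'values')
  },
  onComplete: rows => {
    // rows are arrays by default; pass rowFormat: 'object' for keyed objects
    console.log('read', rows.length, 'rows')
  },
})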
/** | ||
* Replace bigint, date, etc with legal JSON types. | ||
* When parsing parquet files, bigints are used to represent 64-bit integers. | ||
* However, JSON does not support bigints, so it's helpful to convert to numbers. | ||
* | ||
@@ -28,3 +30,2 @@ * @param {any} obj object to convert | ||
* | ||
* @typedef {import('./types.js').DecodedArray} DecodedArray | ||
* @param {any[]} aaa first array | ||
@@ -42,2 +43,3 @@ * @param {DecodedArray} bbb second array | ||
* Get the byte length of a URL using a HEAD request. | ||
* If requestInit is provided, it will be passed to fetch. | ||
* | ||
@@ -60,4 +62,5 @@ * @param {string} url | ||
* Construct an AsyncBuffer for a URL. | ||
* If byteLength is not provided, will make a HEAD request to get the file size. | ||
* If requestInit is provided, it will be passed to fetch. | ||
* | ||
* @typedef {import('./types.js').AsyncBuffer} AsyncBuffer | ||
* @param {object} options | ||
@@ -125,1 +128,55 @@ * @param {string} options.url | ||
} | ||
/** | ||
* Returns a cached layer on top of an AsyncBuffer. For caching slices of a file | ||
* that are read multiple times, possibly over a network. | ||
* | ||
* @param {AsyncBuffer} file file-like object to cache | ||
* @returns {AsyncBuffer} cached file-like object | ||
*/ | ||
export function cachedAsyncBuffer({ byteLength, slice }) { | ||
const cache = new Map() | ||
return { | ||
byteLength, | ||
/** | ||
* @param {number} start | ||
* @param {number} [end] | ||
* @returns {Awaitable<ArrayBuffer>} | ||
*/ | ||
slice(start, end) { | ||
const key = cacheKey(start, end, byteLength) | ||
const cached = cache.get(key) | ||
if (cached) return cached | ||
// cache miss, read from file | ||
const promise = slice(start, end) | ||
cache.set(key, promise) | ||
return promise | ||
}, | ||
} | ||
} | ||
/** | ||
* Returns canonical cache key for a byte range 'start,end'. | ||
 * Normalizes suffix ranges (negative start) and open-ended ranges to absolute ranges when the file size is known, so equivalent requests share a key. | ||
* | ||
* @import {AsyncBuffer, Awaitable, DecodedArray} from '../src/types.d.ts' | ||
* @param {number} start start byte of range | ||
* @param {number} [end] end byte of range, or undefined for suffix range | ||
* @param {number} [size] size of file, or undefined for suffix range | ||
* @returns {string} | ||
*/ | ||
function cacheKey(start, end, size) { | ||
if (start < 0) { | ||
if (end !== undefined) throw new Error(`invalid suffix range [${start}, ${end}]`) | ||
if (size === undefined) return `${start},` | ||
return `${size + start},${size}` | ||
} else if (end !== undefined) { | ||
if (start > end) throw new Error(`invalid empty range [${start}, ${end}]`) | ||
return `${start},${end}` | ||
} else if (size === undefined) { | ||
return `${start},` | ||
} else { | ||
return `${start},${size}` | ||
} | ||
} |
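cachedAsyncBuffer memoizes each slice() promise under its normalized byte-range key, so repeated reads of the same ranges (typically the footer and hot row groups) are served from memory instead of the network. A sketch composing it with asyncBufferFromUrl (the URL is a placeholder):

import { asyncBufferFromUrl, cachedAsyncBuffer, parquetReadObjects } from 'hyparquet'

const remote = await asyncBufferFromUrl({ url: 'https://example.com/data.parquet' }) // placeholder URL
const file = cachedAsyncBuffer(remote)

// the second read reuses the cached footer and row-group slices
const first = await parquetReadObjects({ file, rowStart: 0, rowEnd: 10 })
const again = await parquetReadObjects({ file, rowStart: 0, rowEnd: 10 })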