Comparing version 0.9.2 to 0.9.3
{ | ||
"name": "hyparquet", | ||
"version": "0.9.2", | ||
"version": "0.9.3", | ||
"description": "parquet file parser for javascript", | ||
@@ -30,8 +30,8 @@ "keywords": [ | ||
"devDependencies": { | ||
"@types/node": "20.12.11", | ||
"@typescript-eslint/eslint-plugin": "7.8.0", | ||
"@types/node": "20.12.12", | ||
"@typescript-eslint/eslint-plugin": "7.9.0", | ||
"@vitest/coverage-v8": "1.6.0", | ||
"eslint": "8.57.0", | ||
"eslint-plugin-import": "2.29.1", | ||
"eslint-plugin-jsdoc": "48.2.4", | ||
"eslint-plugin-jsdoc": "48.2.5", | ||
"http-server": "14.1.1", | ||
@@ -38,0 +38,0 @@ "hysnappy": "0.3.1", |
@@ -26,3 +26,3 @@ import { assembleObjects } from './assemble.js' | ||
* @param {Compressors} [compressors] custom decompressors | ||
* @returns {ArrayLike<any>} array of values | ||
* @returns {any[]} array of values | ||
*/ | ||
@@ -47,5 +47,4 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, schemaPath, compressors) { | ||
// read compressed_page_size bytes starting at offset | ||
const compressedBytes = new Uint8Array(arrayBuffer).subarray( | ||
columnOffset + reader.offset, | ||
columnOffset + reader.offset + header.compressed_page_size | ||
const compressedBytes = new Uint8Array( | ||
arrayBuffer, columnOffset + reader.offset, header.compressed_page_size | ||
) | ||
@@ -65,2 +64,3 @@ | ||
valuesSeen += daph.num_values | ||
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length)) | ||
@@ -89,7 +89,4 @@ // construct output values: skip nulls and construct lists | ||
} | ||
// assert(BigInt(values.length) === rowGroup.num_rows) | ||
// TODO: check that we are at the end of the page | ||
// values.length !== daph.num_values isn't right. In cases like arrays, | ||
// you need the total number of children, not the number of top-level values. | ||
concat(rowData, values) | ||
@@ -96,0 +93,0 @@ } else if (header.type === 'DICTIONARY_PAGE') { |
@@ -15,3 +15,7 @@ const dayMillis = 86400000 // 1 day in milliseconds | ||
const decoder = new TextDecoder() | ||
return data.map(v => v && decoder.decode(v)) | ||
const arr = new Array(data.length) | ||
for (let i = 0; i < arr.length; i++) { | ||
arr[i] = data[i] && decoder.decode(data[i]) | ||
} | ||
return arr | ||
} | ||
@@ -21,20 +25,21 @@ if (ctype === 'DECIMAL') { | ||
const factor = Math.pow(10, -scale) | ||
if (typeof data[0] === 'number') { | ||
if (factor === 1) return data | ||
return Array.from(data).map(v => v * factor) | ||
} else if (typeof data[0] === 'bigint') { | ||
if (factor === 1) return data | ||
return Array.from(data).map(v => Number(v) * factor) | ||
} else { | ||
return Array.from(data).map(v => parseDecimal(v) * factor) | ||
const arr = new Array(data.length) | ||
for (let i = 0; i < arr.length; i++) { | ||
if (data[0] instanceof Uint8Array) { | ||
arr[i] = parseDecimal(data[i]) * factor | ||
} else { | ||
arr[i] = Number(data[i]) * factor | ||
} | ||
} | ||
return arr | ||
} | ||
if (ctype === 'DATE') { | ||
return Array.from(data).map(v => new Date(v * dayMillis)) | ||
} | ||
if (ctype === undefined && schemaElement.type === 'INT96') { | ||
return Array.from(data).map(parseInt96Date) | ||
} | ||
if (ctype === 'TIME_MILLIS') { | ||
return Array.from(data).map(v => new Date(v)) | ||
if (ctype === 'DATE') { | ||
const arr = new Array(data.length) | ||
for (let i = 0; i < arr.length; i++) { | ||
arr[i] = new Date(data[i] * dayMillis) | ||
} | ||
return arr | ||
} | ||
@@ -50,2 +55,8 @@ if (ctype === 'JSON') { | ||
} | ||
// TODO: ctype UINT | ||
const logicalType = schemaElement.logical_type?.type | ||
if (logicalType === 'FLOAT16') { | ||
return Array.from(data).map(parseFloat16) | ||
} | ||
// TODO: logical types | ||
return data | ||
@@ -77,1 +88,16 @@ } | ||
} | ||
/**
 * Decode a 16-bit half-precision float (1 sign bit, 5 exponent bits,
 * 10 fraction bits) from two bytes in little-endian order.
 *
 * @param {Uint8Array | undefined} bytes low byte at index 0, high byte at index 1
 * @returns {number | undefined} decoded number, or undefined when bytes is missing
 */
export function parseFloat16(bytes) {
  if (!bytes) return undefined
  // assemble the 16-bit pattern: bytes[1] holds the high-order bits
  const int16 = (bytes[1] << 8) | bytes[0]
  const sign = int16 >> 15 ? -1 : 1
  const exp = (int16 >> 10) & 0x1f // 5-bit biased exponent
  const frac = int16 & 0x3ff // 10-bit fraction
  // exponent 0: no implicit leading 1, fixed scale of 2^-14 (also yields signed zero)
  if (exp === 0) return sign * Math.pow(2, -14) * (frac / 1024) // subnormals
  // exponent all ones: non-zero fraction is NaN, otherwise signed infinity
  if (exp === 0x1f) return frac ? NaN : sign * Infinity
  // normal numbers: implicit leading 1 and an exponent bias of 15
  return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
}
@@ -61,3 +61,3 @@ import { readVarInt } from './thrift.js' | ||
const width = (bitWidth + 7) >> 3 | ||
let value | ||
let value = 0 | ||
if (width === 1) { | ||
@@ -69,3 +69,3 @@ value = reader.view.getUint8(reader.offset) | ||
value = reader.view.getUint32(reader.offset, true) | ||
} else { | ||
} else if (width) { | ||
throw new Error(`parquet invalid rle width ${width}`) | ||
@@ -72,0 +72,0 @@ } |
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js' | ||
import { parseFloat16 } from './convert.js' | ||
import { getSchemaPath } from './schema.js' | ||
@@ -27,2 +28,3 @@ import { deserializeTCompactProtocol } from './thrift.js' | ||
* @typedef {import("./types.d.ts").FileMetaData} FileMetaData | ||
* @typedef {import("./types.d.ts").SchemaElement} SchemaElement | ||
* @param {AsyncBuffer} asyncBuffer parquet file contents | ||
@@ -107,2 +109,3 @@ * @param {number} initialFetchSize initial fetch size in bytes | ||
const version = metadata.field_1 | ||
/** @type {SchemaElement[]} */ | ||
const schema = metadata.field_2.map((/** @type {any} */ field) => ({ | ||
@@ -120,4 +123,4 @@ type: ParquetType[field.field_1], | ||
})) | ||
// @ts-expect-error get types by column index | ||
const columnTypes = schema.map(e => e.type).filter(e => e) | ||
// schema element per column index | ||
const columnSchema = schema.filter(e => e.type) | ||
const num_rows = metadata.field_3 | ||
@@ -140,3 +143,3 @@ const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({ | ||
dictionary_page_offset: column.field_3.field_11, | ||
statistics: columnStats(column.field_3.field_12, columnTypes[columnIndex]), | ||
statistics: columnStats(column.field_3.field_12, columnSchema[columnIndex]), | ||
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({ | ||
@@ -242,6 +245,7 @@ page_type: encodingStat.field_1, | ||
* @param {any} stats | ||
* @param {import("./types.d.ts").ParquetType} type | ||
* @param {SchemaElement} schema | ||
* @returns {import("./types.d.ts").Statistics} | ||
*/ | ||
function columnStats(stats, type) { | ||
function columnStats(stats, schema) { | ||
const { type, logical_type } = schema | ||
function convert(/** @type {Uint8Array} */ value) { | ||
@@ -267,2 +271,5 @@ if (value === undefined) return value | ||
} | ||
if (logical_type?.type === 'FLOAT16') { | ||
return parseFloat16(value) | ||
} | ||
return value | ||
@@ -269,0 +276,0 @@ } |
@@ -112,6 +112,5 @@ | ||
/** @type {any[][]} */ | ||
const groupData = [] | ||
const groupColumnData = [] | ||
const promises = [] | ||
const maps = new Map() | ||
let outputColumnIndex = 0 | ||
// read column data | ||
@@ -153,3 +152,3 @@ for (let columnIndex = 0; columnIndex < rowGroup.columns.length; columnIndex++) { | ||
const schemaPath = getSchemaPath(metadata.schema, columnMetadata.path_in_schema) | ||
/** @type {ArrayLike<any> | undefined} */ | ||
/** @type {any[] | undefined} */ | ||
let columnData = readColumn( | ||
@@ -212,23 +211,12 @@ arrayBuffer, bufferOffset, rowGroup, columnMetadata, schemaPath, compressors | ||
}) | ||
// add colum data to group data only if onComplete is defined | ||
if (options.onComplete) addColumn(groupData, outputColumnIndex, columnData) | ||
outputColumnIndex++ | ||
// save column data only if onComplete is defined | ||
if (options.onComplete) groupColumnData.push(columnData) | ||
})) | ||
} | ||
await Promise.all(promises) | ||
return groupData | ||
} | ||
/** | ||
* Add a column to rows. | ||
* | ||
* @param {any[][]} rows rows to add column data to | ||
* @param {number} columnIndex column index to add | ||
* @param {ArrayLike<any>} columnData column data to add | ||
*/ | ||
function addColumn(rows, columnIndex, columnData) { | ||
for (let i = 0; i < columnData.length; i++) { | ||
if (!rows[i]) rows[i] = [] | ||
rows[i][columnIndex] = columnData[i] | ||
if (options.onComplete) { | ||
// transpose columns into rows | ||
return groupColumnData[0].map((_, row) => groupColumnData.map(col => col[row])) | ||
} | ||
return [] | ||
} |
@@ -1,2 +0,2 @@ | ||
type Awaitable<T> = T | Promise<T> | ||
export type Awaitable<T> = T | Promise<T> | ||
@@ -47,3 +47,3 @@ /** | ||
field_id?: number | ||
logicalType?: LogicalType | ||
logical_type?: LogicalType | ||
} | ||
@@ -50,0 +50,0 @@ |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
95447
2484