Comparing version 0.3.2 to 0.3.3
{ | ||
"name": "hyparquet", | ||
"version": "0.3.2", | ||
"version": "0.3.3", | ||
"description": "parquet file parser for javascript", | ||
@@ -30,12 +30,12 @@ "keywords": [ | ||
"devDependencies": { | ||
"@types/node": "20.11.19", | ||
"@typescript-eslint/eslint-plugin": "7.0.1", | ||
"@vitest/coverage-v8": "1.3.0", | ||
"eslint": "8.56.0", | ||
"@types/node": "20.11.20", | ||
"@typescript-eslint/eslint-plugin": "7.0.2", | ||
"@vitest/coverage-v8": "1.3.1", | ||
"eslint": "8.57.0", | ||
"eslint-plugin-import": "2.29.1", | ||
"eslint-plugin-jsdoc": "48.1.0", | ||
"eslint-plugin-jsdoc": "48.2.0", | ||
"http-server": "14.1.1", | ||
"typescript": "5.3.3", | ||
"vitest": "1.3.0" | ||
"vitest": "1.3.1" | ||
} | ||
} |
# hyparquet | ||
![hyparquet](hyparquet.jpg) | ||
![hyparquet parakeet](hyparquet.jpg) | ||
@@ -5,0 +5,0 @@ [![npm](https://img.shields.io/npm/v/hyparquet)](https://www.npmjs.com/package/hyparquet) |
@@ -17,3 +17,3 @@ import { Encoding, PageType } from './constants.js' | ||
/** | ||
* Read a column from the file. | ||
* Parse column data from a buffer. | ||
* | ||
@@ -28,9 +28,9 @@ * @param {ArrayBuffer} arrayBuffer parquet file contents | ||
export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, schema) { | ||
// parse column data | ||
/** @type {ArrayLike<any> | undefined} */ | ||
let dictionary = undefined | ||
let valuesSeen = 0 | ||
let byteOffset = 0 // byteOffset within the column | ||
/** @type {ArrayLike<any> | undefined} */ | ||
let dictionary = undefined | ||
const rowIndex = [0] // map/list object index | ||
const rowData = [] | ||
while (valuesSeen < rowGroup.num_rows) { | ||
@@ -49,18 +49,2 @@ // parse column header | ||
) | ||
// decompress bytes | ||
/** @type {Uint8Array | undefined} */ | ||
let page | ||
const uncompressed_page_size = Number(header.uncompressed_page_size) | ||
const { codec } = columnMetadata | ||
if (codec === 'UNCOMPRESSED') { | ||
page = compressedBytes | ||
} else if (codec === 'SNAPPY') { | ||
page = new Uint8Array(uncompressed_page_size) | ||
snappyUncompress(compressedBytes, page) | ||
} else { | ||
throw new Error(`parquet unsupported compression codec: ${codec}`) | ||
} | ||
if (page?.length !== uncompressed_page_size) { | ||
throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`) | ||
} | ||
@@ -72,2 +56,3 @@ // parse page data by type | ||
const page = decompressPage(compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec) | ||
const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schema, columnMetadata) | ||
@@ -104,2 +89,3 @@ valuesSeen += daph.num_values | ||
let v = dataPage[index++] | ||
// map to dictionary value | ||
@@ -146,2 +132,3 @@ if (dictionary) { | ||
const page = decompressPage(compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec) | ||
dictionary = readDictionaryPage(page, diph, schema, columnMetadata) | ||
@@ -232,1 +219,26 @@ } else if (header.type === PageType.DATA_PAGE_V2) { | ||
} | ||
/** | ||
* @typedef {import('./types.js').PageHeader} PageHeader | ||
* @typedef {import('./types.js').CompressionCodec} CompressionCodec | ||
* @param {Uint8Array} compressedBytes | ||
* @param {number} uncompressed_page_size | ||
* @param {CompressionCodec} codec | ||
* @returns {Uint8Array} | ||
*/ | ||
function decompressPage(compressedBytes, uncompressed_page_size, codec) { | ||
/** @type {Uint8Array | undefined} */ | ||
let page | ||
if (codec === 'UNCOMPRESSED') { | ||
page = compressedBytes | ||
} else if (codec === 'SNAPPY') { | ||
page = new Uint8Array(uncompressed_page_size) | ||
snappyUncompress(compressedBytes, page) | ||
} else { | ||
throw new Error(`parquet unsupported compression codec: ${codec}`) | ||
} | ||
if (page?.length !== uncompressed_page_size) { | ||
throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`) | ||
} | ||
return page | ||
} |
@@ -135,3 +135,3 @@ import { Encoding, ParquetType } from './constants.js' | ||
/** | ||
* Read `count` fixed length byte array values. | ||
* Read a fixed length byte array. | ||
* | ||
@@ -138,0 +138,0 @@ * @param {DataView} dataView - buffer to read data from |
@@ -20,4 +20,4 @@ export { AsyncBuffer, FileMetaData, SchemaTree } from './types' | ||
* @param {number} [options.rowEnd] last requested row index (exclusive) | ||
* @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range. | ||
* @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed | ||
* @param {Function} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range. | ||
* @param {Function} [options.onComplete] called when all requested rows and columns are parsed | ||
* @returns {Promise<void>} resolves when all requested rows and columns are parsed | ||
@@ -73,7 +73,7 @@ */ | ||
* | ||
* @param {Uint8Array} inputArray compressed data | ||
* @param {Uint8Array} outputArray output buffer | ||
* @param {Uint8Array} input compressed data | ||
* @param {Uint8Array} output output buffer | ||
* @returns {boolean} true if successful | ||
*/ | ||
export function snappyUncompress(inputArray: Uint8Array, outputArray: Uint8Array): boolean | ||
export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean | ||
@@ -80,0 +80,0 @@ /** |
@@ -112,2 +112,3 @@ | ||
if (!columnMetadata) throw new Error('parquet column metadata is undefined') | ||
const columnStartByte = getColumnOffset(columnMetadata) | ||
@@ -118,2 +119,3 @@ const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size) | ||
// skip columns larger than 1gb | ||
// TODO: stream process the data, returning only the requested rows | ||
if (columnBytes > 1 << 30) { | ||
@@ -120,0 +122,0 @@ console.warn(`parquet skipping huge column "${columnMetadata.path_in_schema}" ${columnBytes.toLocaleString()} bytes`) |
@@ -44,9 +44,9 @@ /** | ||
* | ||
* @param {Uint8Array} inputArray compressed data | ||
* @param {Uint8Array} outputArray output buffer | ||
* @returns {boolean} true if successful | ||
* @param {Uint8Array} input compressed data | ||
* @param {Uint8Array} output output buffer | ||
* @returns {void} | ||
*/ | ||
export function snappyUncompress(inputArray, outputArray) { | ||
const inputLength = inputArray.byteLength | ||
export function snappyUncompress(input, output) { | ||
const inputLength = input.byteLength | ||
const outputLength = output.byteLength | ||
let pos = 0 | ||
@@ -56,19 +56,22 @@ let outPos = 0 | ||
// skip preamble (contains uncompressed length as varint) | ||
let uncompressedLength = 0 | ||
let shift = 0 | ||
while (pos < inputLength) { | ||
const c = inputArray[pos] | ||
const c = input[pos] | ||
pos += 1 | ||
uncompressedLength |= (c & 0x7f) << shift | ||
if (c < 128) { | ||
break | ||
} | ||
shift += 7 | ||
} | ||
if (outputLength && pos >= inputLength) { | ||
throw new Error('invalid snappy length header') | ||
} | ||
while (pos < inputLength) { | ||
const c = inputArray[pos] | ||
const c = input[pos] | ||
let len = 0 | ||
pos += 1 | ||
if (pos >= inputLength) { | ||
throw new Error('missing eof marker') | ||
} | ||
// There are two types of elements, literals and copies (back references) | ||
@@ -81,10 +84,9 @@ if ((c & 0x3) === 0) { | ||
if (pos + 3 >= inputLength) { | ||
console.warn('snappy error literal pos + 3 >= inputLength') | ||
return false | ||
throw new Error('snappy error literal pos + 3 >= inputLength') | ||
} | ||
const lengthSize = len - 60 // length bytes - 1 | ||
len = inputArray[pos] | ||
+ (inputArray[pos + 1] << 8) | ||
+ (inputArray[pos + 2] << 16) | ||
+ (inputArray[pos + 3] << 24) | ||
len = input[pos] | ||
+ (input[pos + 1] << 8) | ||
+ (input[pos + 2] << 16) | ||
+ (input[pos + 3] << 24) | ||
len = (len & WORD_MASK[lengthSize]) + 1 | ||
@@ -94,5 +96,5 @@ pos += lengthSize | ||
if (pos + len > inputLength) { | ||
return false // literal exceeds input length | ||
throw new Error('snappy error literal exceeds input length') | ||
} | ||
copyBytes(inputArray, pos, outputArray, outPos, len) | ||
copyBytes(input, pos, output, outPos, len) | ||
pos += len | ||
@@ -107,3 +109,3 @@ outPos += len | ||
len = ((c >>> 2) & 0x7) + 4 | ||
offset = inputArray[pos] + ((c >>> 5) << 8) | ||
offset = input[pos] + ((c >>> 5) << 8) | ||
pos += 1 | ||
@@ -114,6 +116,6 @@ break | ||
if (inputLength <= pos + 1) { | ||
return false // end of input | ||
throw new Error('snappy error end of input') | ||
} | ||
len = (c >>> 2) + 1 | ||
offset = inputArray[pos] + (inputArray[pos + 1] << 8) | ||
offset = input[pos] + (input[pos + 1] << 8) | ||
pos += 2 | ||
@@ -124,9 +126,9 @@ break | ||
if (inputLength <= pos + 3) { | ||
return false // end of input | ||
throw new Error('snappy error end of input') | ||
} | ||
len = (c >>> 2) + 1 | ||
offset = inputArray[pos] | ||
+ (inputArray[pos + 1] << 8) | ||
+ (inputArray[pos + 2] << 16) | ||
+ (inputArray[pos + 3] << 24) | ||
offset = input[pos] | ||
+ (input[pos + 1] << 8) | ||
+ (input[pos + 2] << 16) | ||
+ (input[pos + 3] << 24) | ||
pos += 4 | ||
@@ -138,12 +140,15 @@ break | ||
if (offset === 0 || isNaN(offset)) { | ||
return false // invalid offset | ||
throw new Error(`invalid offset ${offset} pos ${pos} inputLength ${inputLength}`) | ||
} | ||
if (offset > outPos) { | ||
return false // cannot copy from before start of buffer | ||
throw new Error('cannot copy from before start of buffer') | ||
} | ||
selfCopyBytes(outputArray, outPos, offset, len) | ||
selfCopyBytes(output, outPos, offset, len) | ||
outPos += len | ||
} | ||
} | ||
return true | ||
if (outPos !== outputLength) { | ||
throw new Error('premature end of input') | ||
} | ||
} |
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
76319
2068