hyparquet - npm Package Compare versions

Comparing version 0.3.2 to 0.3.3

package.json

		{
		"name": "hyparquet",
		"version": "0.3.2",
		"version": "0.3.3",
		"description": "parquet file parser for javascript",
		@@ -30,12 +30,12 @@ "keywords": [
		"devDependencies": {
		"@types/node": "20.11.19",
		"@typescript-eslint/eslint-plugin": "7.0.1",
		"@vitest/coverage-v8": "1.3.0",
		"eslint": "8.56.0",
		"@types/node": "20.11.20",
		"@typescript-eslint/eslint-plugin": "7.0.2",
		"@vitest/coverage-v8": "1.3.1",
		"eslint": "8.57.0",
		"eslint-plugin-import": "2.29.1",
		"eslint-plugin-jsdoc": "48.1.0",
		"eslint-plugin-jsdoc": "48.2.0",
		"http-server": "14.1.1",
		"typescript": "5.3.3",
		"vitest": "1.3.0"
		"vitest": "1.3.1"
		}
		}

README.md

		# hyparquet

		![hyparquet](hyparquet.jpg)
		![hyparquet parakeet](hyparquet.jpg)

		@@ -5,0 +5,0 @@ [![npm](https://img.shields.io/npm/v/hyparquet)](https://www.npmjs.com/package/hyparquet)

src/column.js

		@@ -17,3 +17,3 @@ import { Encoding, PageType } from './constants.js'
		/**
		* Read a column from the file.
		* Parse column data from a buffer.
		*
		@@ -28,9 +28,9 @@ * @param {ArrayBuffer} arrayBuffer parquet file contents
		export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, schema) {
		// parse column data
		/** @type {ArrayLike<any> \| undefined} */
		let dictionary = undefined
		let valuesSeen = 0
		let byteOffset = 0 // byteOffset within the column
		/** @type {ArrayLike<any> \| undefined} */
		let dictionary = undefined
		const rowIndex = [0] // map/list object index
		const rowData = []

		while (valuesSeen < rowGroup.num_rows) {
		@@ -49,18 +49,2 @@ // parse column header
		)
		// decompress bytes
		/** @type {Uint8Array \| undefined} */
		let page
		const uncompressed_page_size = Number(header.uncompressed_page_size)
		const { codec } = columnMetadata
		if (codec === 'UNCOMPRESSED') {
		page = compressedBytes
		} else if (codec === 'SNAPPY') {
		page = new Uint8Array(uncompressed_page_size)
		snappyUncompress(compressedBytes, page)
		} else {
		throw new Error(`parquet unsupported compression codec: ${codec}`)
		}
		if (page?.length !== uncompressed_page_size) {
		throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
		}

		@@ -72,2 +56,3 @@ // parse page data by type

		const page = decompressPage(compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec)
		const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schema, columnMetadata)
		@@ -104,2 +89,3 @@ valuesSeen += daph.num_values
		let v = dataPage[index++]

		// map to dictionary value
		@@ -146,2 +132,3 @@ if (dictionary) {

		const page = decompressPage(compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec)
		dictionary = readDictionaryPage(page, diph, schema, columnMetadata)
		@@ -232,1 +219,26 @@ } else if (header.type === PageType.DATA_PAGE_V2) {
		}

		/**
		* @typedef {import('./types.js').PageHeader} PageHeader
		* @typedef {import('./types.js').CompressionCodec} CompressionCodec
		* @param {Uint8Array} compressedBytes
		* @param {number} uncompressed_page_size
		* @param {CompressionCodec} codec
		* @returns {Uint8Array}
		*/
		function decompressPage(compressedBytes, uncompressed_page_size, codec) {
		/** @type {Uint8Array \| undefined} */
		let page
		if (codec === 'UNCOMPRESSED') {
		page = compressedBytes
		} else if (codec === 'SNAPPY') {
		page = new Uint8Array(uncompressed_page_size)
		snappyUncompress(compressedBytes, page)
		} else {
		throw new Error(`parquet unsupported compression codec: ${codec}`)
		}
		if (page?.length !== uncompressed_page_size) {
		throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
		}
		return page
		}

src/encoding.js

		@@ -135,3 +135,3 @@ import { Encoding, ParquetType } from './constants.js'
		/**
		* Read `count` fixed length byte array values.
		* Read a fixed length byte array.
		*
		@@ -138,0 +138,0 @@ * @param {DataView} dataView - buffer to read data from

src/hyparquet.d.ts

		@@ -20,4 +20,4 @@ export { AsyncBuffer, FileMetaData, SchemaTree } from './types'
		* @param {number} [options.rowEnd] last requested row index (exclusive)
		* @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
		* @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed
		* @param {Function} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
		* @param {Function} [options.onComplete] called when all requested rows and columns are parsed
		* @returns {Promise<void>} resolves when all requested rows and columns are parsed
		@@ -73,7 +73,7 @@ */
		*
		* @param {Uint8Array} inputArray compressed data
		* @param {Uint8Array} outputArray output buffer
		* @param {Uint8Array} input compressed data
		* @param {Uint8Array} output output buffer
		* @returns {boolean} true if successful
		*/
		export function snappyUncompress(inputArray: Uint8Array, outputArray: Uint8Array): boolean
		export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean

		@@ -80,0 +80,0 @@ /**

src/read.js

		@@ -112,2 +112,3 @@
		if (!columnMetadata) throw new Error('parquet column metadata is undefined')

		const columnStartByte = getColumnOffset(columnMetadata)
		@@ -118,2 +119,3 @@ const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size)
		// skip columns larger than 1gb
		// TODO: stream process the data, returning only the requested rows
		if (columnBytes > 1 << 30) {
		@@ -120,0 +122,0 @@ console.warn(`parquet skipping huge column "${columnMetadata.path_in_schema}" ${columnBytes.toLocaleString()} bytes`)

src/snappy.js

		@@ -44,9 +44,9 @@ /**
		*
		* @param {Uint8Array} inputArray compressed data
		* @param {Uint8Array} outputArray output buffer
		* @returns {boolean} true if successful
		* @param {Uint8Array} input compressed data
		* @param {Uint8Array} output output buffer
		* @returns {void}
		*/
		export function snappyUncompress(inputArray, outputArray) {
		const inputLength = inputArray.byteLength

		export function snappyUncompress(input, output) {
		const inputLength = input.byteLength
		const outputLength = output.byteLength
		let pos = 0
		@@ -56,19 +56,22 @@ let outPos = 0
		// skip preamble (contains uncompressed length as varint)
		let uncompressedLength = 0
		let shift = 0
		while (pos < inputLength) {
		const c = inputArray[pos]
		const c = input[pos]
		pos += 1
		uncompressedLength \|= (c & 0x7f) << shift
		if (c < 128) {
		break
		}
		shift += 7
		}
		if (outputLength && pos >= inputLength) {
		throw new Error('invalid snappy length header')
		}

		while (pos < inputLength) {
		const c = inputArray[pos]
		const c = input[pos]
		let len = 0
		pos += 1

		if (pos >= inputLength) {
		throw new Error('missing eof marker')
		}

		// There are two types of elements, literals and copies (back references)
		@@ -81,10 +84,9 @@ if ((c & 0x3) === 0) {
		if (pos + 3 >= inputLength) {
		console.warn('snappy error literal pos + 3 >= inputLength')
		return false
		throw new Error('snappy error literal pos + 3 >= inputLength')
		}
		const lengthSize = len - 60 // length bytes - 1
		len = inputArray[pos]
		+ (inputArray[pos + 1] << 8)
		+ (inputArray[pos + 2] << 16)
		+ (inputArray[pos + 3] << 24)
		len = input[pos]
		+ (input[pos + 1] << 8)
		+ (input[pos + 2] << 16)
		+ (input[pos + 3] << 24)
		len = (len & WORD_MASK[lengthSize]) + 1
		@@ -94,5 +96,5 @@ pos += lengthSize
		if (pos + len > inputLength) {
		return false // literal exceeds input length
		throw new Error('snappy error literal exceeds input length')
		}
		copyBytes(inputArray, pos, outputArray, outPos, len)
		copyBytes(input, pos, output, outPos, len)
		pos += len
		@@ -107,3 +109,3 @@ outPos += len
		len = ((c >>> 2) & 0x7) + 4
		offset = inputArray[pos] + ((c >>> 5) << 8)
		offset = input[pos] + ((c >>> 5) << 8)
		pos += 1
		@@ -114,6 +116,6 @@ break
		if (inputLength <= pos + 1) {
		return false // end of input
		throw new Error('snappy error end of input')
		}
		len = (c >>> 2) + 1
		offset = inputArray[pos] + (inputArray[pos + 1] << 8)
		offset = input[pos] + (input[pos + 1] << 8)
		pos += 2
		@@ -124,9 +126,9 @@ break
		if (inputLength <= pos + 3) {
		return false // end of input
		throw new Error('snappy error end of input')
		}
		len = (c >>> 2) + 1
		offset = inputArray[pos]
		+ (inputArray[pos + 1] << 8)
		+ (inputArray[pos + 2] << 16)
		+ (inputArray[pos + 3] << 24)
		offset = input[pos]
		+ (input[pos + 1] << 8)
		+ (input[pos + 2] << 16)
		+ (input[pos + 3] << 24)
		pos += 4
		@@ -138,12 +140,15 @@ break
		if (offset === 0 \|\| isNaN(offset)) {
		return false // invalid offset
		throw new Error(`invalid offset ${offset} pos ${pos} inputLength ${inputLength}`)
		}
		if (offset > outPos) {
		return false // cannot copy from before start of buffer
		throw new Error('cannot copy from before start of buffer')
		}
		selfCopyBytes(outputArray, outPos, offset, len)
		selfCopyBytes(output, outPos, offset, len)
		outPos += len
		}
		}
		return true

		if (outPos !== outputLength) {
		throw new Error('premature end of input')
		}
		}

hyparquet - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics