Comparing version 0.3.1 to 0.3.2
{ | ||
"name": "hyparquet", | ||
"version": "0.3.1", | ||
"version": "0.3.2", | ||
"description": "parquet file parser for javascript", | ||
@@ -30,12 +30,12 @@ "keywords": [ | ||
"devDependencies": { | ||
"@types/node": "20.11.17", | ||
"@typescript-eslint/eslint-plugin": "6.21.0", | ||
"@vitest/coverage-v8": "1.2.2", | ||
"@types/node": "20.11.19", | ||
"@typescript-eslint/eslint-plugin": "7.0.1", | ||
"@vitest/coverage-v8": "1.3.0", | ||
"eslint": "8.56.0", | ||
"eslint-plugin-import": "2.29.1", | ||
"eslint-plugin-jsdoc": "48.0.6", | ||
"eslint-plugin-jsdoc": "48.1.0", | ||
"http-server": "14.1.1", | ||
"typescript": "5.3.3", | ||
"vitest": "1.2.2" | ||
"vitest": "1.3.0" | ||
} | ||
} |
@@ -72,8 +72,32 @@ # hyparquet | ||
## Supported Parquet Files | ||
The parquet format supports a number of different compression and encoding types. | ||
Hyparquet does not support 100% of all parquet files, and probably never will: supporting every possible compression type would increase the size of the library, and many of them are rarely used in practice. | ||
Compression: | ||
- [X] Uncompressed | ||
- [X] Snappy | ||
- [ ] GZip | ||
- [ ] LZO | ||
- [ ] Brotli | ||
- [ ] LZ4 | ||
- [ ] ZSTD | ||
- [ ] LZ4_RAW | ||
Page Type: | ||
- [X] Data Page | ||
- [ ] Index Page | ||
- [X] Dictionary Page | ||
- [ ] Data Page V2 | ||
Contributions are welcome! | ||
## References | ||
- https://github.com/apache/parquet-format | ||
- https://github.com/apache/parquet-testing | ||
- https://github.com/apache/thrift | ||
- https://github.com/dask/fastparquet | ||
- https://github.com/apache/thrift | ||
- https://github.com/google/snappy | ||
- https://github.com/zhipeng-jia/snappyjs |
import { Encoding, ParquetType } from './constants.js' | ||
import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' | ||
import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js' | ||
import { | ||
getMaxDefinitionLevel, | ||
getMaxRepetitionLevel, | ||
isRequired, | ||
schemaElement, | ||
skipDefinitionBytes, | ||
} from './schema.js' | ||
@@ -57,3 +63,5 @@ const skipNulls = false // TODO | ||
if (daph.encoding === Encoding.PLAIN) { | ||
const plainObj = readPlain(dataView, columnMetadata.type, nval, offset) | ||
const se = schemaElement(schema, columnMetadata.path_in_schema) | ||
const utf8 = se.converted_type === 'UTF8' | ||
const plainObj = readPlain(dataView, columnMetadata.type, nval, offset, utf8) | ||
values = plainObj.value | ||
@@ -104,3 +112,3 @@ offset += plainObj.byteLength | ||
// read values based on encoding | ||
const { value } = readPlain(dataView, columnMetadata.type, diph.num_values) | ||
const { value } = readPlain(dataView, columnMetadata.type, diph.num_values, 0, false) | ||
return value | ||
@@ -107,0 +115,0 @@ } |
@@ -156,5 +156,6 @@ import { Encoding, ParquetType } from './constants.js' | ||
* @param {number} offset - offset to start reading from the DataView | ||
* @param {boolean} utf8 - whether to decode byte arrays as UTF-8 | ||
* @returns {Decoded<ArrayLike<any>>} array of values | ||
*/ | ||
export function readPlain(dataView, type, count, offset = 0) { | ||
export function readPlain(dataView, type, count, offset, utf8) { | ||
if (count === 0) return { value: [], byteLength: 0 } | ||
@@ -174,3 +175,11 @@ if (type === ParquetType.BOOLEAN) { | ||
} else if (type === ParquetType.BYTE_ARRAY) { | ||
return readPlainByteArray(dataView, offset, count) | ||
const byteArray = readPlainByteArray(dataView, offset, count) | ||
if (utf8) { | ||
const decoder = new TextDecoder() | ||
return { | ||
value: byteArray.value.map(bytes => decoder.decode(bytes)), | ||
byteLength: byteArray.byteLength, | ||
} | ||
} | ||
return byteArray | ||
} else if (type === ParquetType.FIXED_LEN_BYTE_ARRAY) { | ||
@@ -177,0 +186,0 @@ return readPlainByteArrayFixed(dataView, offset, count) |
@@ -42,5 +42,3 @@ /** | ||
const child = tree.children.find(child => child.element.name === part) | ||
if (!child) { | ||
throw new Error(`parquet schema element not found: ${name}`) | ||
} | ||
if (!child) throw new Error(`parquet schema element not found: ${name}`) | ||
tree = child | ||
@@ -53,2 +51,3 @@ } | ||
* Check if the schema element with the given name is required. | ||
* An element is required if all of its ancestors are required. | ||
* | ||
@@ -60,3 +59,13 @@ * @param {SchemaElement[]} schema | ||
export function isRequired(schema, name) { | ||
return schemaElement(schema, name).repetition_type === 'REQUIRED' | ||
/** @type {SchemaTree | undefined} */ | ||
let tree = schemaTree(schema, 0) | ||
for (let i = 0; i < name.length; i++) { | ||
// Find schema child with the given name | ||
tree = tree.children.find(child => child.element.name === name[i]) | ||
if (!tree) throw new Error(`parquet schema element not found: ${name}`) | ||
if (tree.element.repetition_type !== 'REQUIRED') { | ||
return false | ||
} | ||
} | ||
return true | ||
} | ||
@@ -63,0 +72,0 @@ |
@@ -10,16 +10,15 @@ /** | ||
export function toJson(obj) { | ||
if (typeof obj === 'bigint') { | ||
return Number(obj) | ||
} else if (Array.isArray(obj)) { | ||
return obj.map(toJson) | ||
} else if (obj instanceof Object) { | ||
if (obj === undefined) return null | ||
if (typeof obj === 'bigint') return Number(obj) | ||
if (Array.isArray(obj)) return obj.map(toJson) | ||
if (obj instanceof Object) { | ||
/** @type {Record<string, unknown>} */ | ||
const newObj = {} | ||
for (const key of Object.keys(obj)) { | ||
if (obj[key] === undefined) continue | ||
newObj[key] = toJson(obj[key]) | ||
} | ||
return newObj | ||
} else { | ||
return obj | ||
} | ||
return obj | ||
} |
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
75751
2054
103