@dsnp/parquetjs
Comparing version 0.0.0-5db9db to 0.0.0-6396d2
@@ -1,1 +0,1 @@ | ||
export * from "../parquet"; | ||
export * from '../parquet'; |
@@ -1,1 +0,1 @@ | ||
export * from "../parquet"; | ||
export * from '../parquet'; |
// | ||
// Autogenerated by Thrift Compiler (0.16.0) | ||
// Autogenerated by Thrift Compiler (0.18.1) | ||
// | ||
@@ -4,0 +4,0 @@ // DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING |
@@ -1,4 +0,3 @@ | ||
/// <reference types="node" /> | ||
import Long from 'long'; | ||
import { Block } from "../declare"; | ||
import { Block } from '../declare'; | ||
/** | ||
@@ -52,3 +51,3 @@ * @class SplitBlockBloomFilter | ||
*/ | ||
static from(buffer: Buffer, rowCount?: number): SplitBlockBloomFilter; | ||
static from(buffer: Buffer, _rowCount?: number): SplitBlockBloomFilter; | ||
/** | ||
@@ -119,3 +118,3 @@ * @function getBlockIndex: get a block index to insert a hash value for | ||
getNumFilterBlocks(): number; | ||
getFilter(): Array<Block>; | ||
getFilter(): Block[]; | ||
/** | ||
@@ -122,0 +121,0 @@ * @function optNumFilterBytes |
@@ -38,10 +38,3 @@ "use strict"; | ||
static salt = [ | ||
0x47b6137b, | ||
0x44974d91, | ||
0x8824ad5b, | ||
0xa2b7289d, | ||
0x705495c7, | ||
0x2df1424b, | ||
0x9efc4947, | ||
0x5c6bfb31 | ||
0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31, | ||
]; | ||
@@ -58,3 +51,3 @@ // How many bits are in a single block: | ||
// Currently this is 1024 | ||
static LOWER_BOUND_BYTES = SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK / 8; | ||
static LOWER_BOUND_BYTES = (SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK) / 8; | ||
// The upper bound of SBBF size, set to default row group size in bytes. | ||
@@ -79,9 +72,9 @@ // Note that the subsequent requirements for an effective bloom filter on a row group this size would mean this | ||
*/ | ||
static from(buffer, rowCount) { | ||
static from(buffer, _rowCount) { | ||
if (buffer.length === 0) { | ||
throw new Error("buffer is empty"); | ||
throw new Error('buffer is empty'); | ||
} | ||
const chunkSize = SplitBlockBloomFilter.WORDS_PER_BLOCK; | ||
const uint32sFromBuf = new Uint32Array(buffer.buffer); | ||
let result = []; | ||
const result = []; | ||
const length = uint32sFromBuf.length; | ||
@@ -91,3 +84,3 @@ for (let index = 0; index < length; index += chunkSize) { | ||
} | ||
let sb = new SplitBlockBloomFilter(); | ||
const sb = new SplitBlockBloomFilter(); | ||
sb.splitBlockFilter = result; | ||
@@ -100,3 +93,2 @@ sb.numBlocks = result.length; | ||
} | ||
; | ||
/** | ||
@@ -128,4 +120,4 @@ * @function getBlockIndex: get a block index to insert a hash value for | ||
static optimalNumOfBlocks(numDistinct, falsePositiveRate) { | ||
let m = -8 * numDistinct / Math.log(1 - Math.pow(falsePositiveRate, 1.0 / 8)); | ||
m = (m + SplitBlockBloomFilter.NUMBER_OF_BLOCKS - 1) & (~SplitBlockBloomFilter.NUMBER_OF_BLOCKS); | ||
let m = (-8 * numDistinct) / Math.log(1 - Math.pow(falsePositiveRate, 1.0 / 8)); | ||
m = (m + SplitBlockBloomFilter.NUMBER_OF_BLOCKS - 1) & ~SplitBlockBloomFilter.NUMBER_OF_BLOCKS; | ||
// Handle overflow: | ||
@@ -153,3 +145,3 @@ const upperShiftL3 = SplitBlockBloomFilter.UPPER_BOUND_BYTES << 3; | ||
static mask(hashValue) { | ||
let result = SplitBlockBloomFilter.initBlock(); | ||
const result = SplitBlockBloomFilter.initBlock(); | ||
for (let i = 0; i < result.length; i++) { | ||
@@ -215,7 +207,17 @@ const y = hashValue.getLowBitsUnsigned() * SplitBlockBloomFilter.salt[i]; | ||
hasher = new xxhasher_1.default(); | ||
isInitialized() { return this.splitBlockFilter.length > 0; } | ||
getFalsePositiveRate() { return this.desiredFalsePositiveRate; } | ||
getNumDistinct() { return this.numDistinctValues; } | ||
getNumFilterBlocks() { return this.splitBlockFilter.length; } | ||
getFilter() { return this.splitBlockFilter; } | ||
isInitialized() { | ||
return this.splitBlockFilter.length > 0; | ||
} | ||
getFalsePositiveRate() { | ||
return this.desiredFalsePositiveRate; | ||
} | ||
getNumDistinct() { | ||
return this.numDistinctValues; | ||
} | ||
getNumFilterBlocks() { | ||
return this.splitBlockFilter.length; | ||
} | ||
getFilter() { | ||
return this.splitBlockFilter; | ||
} | ||
/** | ||
@@ -228,3 +230,3 @@ * @function optNumFilterBytes | ||
getNumFilterBytes() { | ||
return this.numBlocks * SplitBlockBloomFilter.BITS_PER_BLOCK >>> 3; | ||
return (this.numBlocks * SplitBlockBloomFilter.BITS_PER_BLOCK) >>> 3; | ||
} | ||
@@ -240,7 +242,7 @@ /** | ||
if (this.isInitialized()) { | ||
console.error("filter already initialized. options may no longer be changed."); | ||
console.error('filter already initialized. options may no longer be changed.'); | ||
return this; | ||
} | ||
if (proportion <= 0.0 || proportion >= 1.0) { | ||
console.error("falsePositiveProbability. Must be < 1.0 and > 0.0"); | ||
console.error('falsePositiveProbability. Must be < 1.0 and > 0.0'); | ||
return this; | ||
@@ -261,3 +263,3 @@ } | ||
if (this.isInitialized()) { | ||
console.error("filter already initialized. options may no longer be changed."); | ||
console.error('filter already initialized. options may no longer be changed.'); | ||
return this; | ||
@@ -298,3 +300,3 @@ } | ||
if (this.isInitialized()) { | ||
console.error("filter already initialized. options may no longer be changed."); | ||
console.error('filter already initialized. options may no longer be changed.'); | ||
return this; | ||
@@ -307,3 +309,3 @@ } | ||
// numBlocks = Bytes * 8b/Byte * 1Block/256b | ||
this.numBlocks = SplitBlockBloomFilter.nextPwr2(numBytes) * 8 / SplitBlockBloomFilter.BITS_PER_BLOCK; | ||
this.numBlocks = (SplitBlockBloomFilter.nextPwr2(numBytes) * 8) / SplitBlockBloomFilter.BITS_PER_BLOCK; | ||
return this; | ||
@@ -324,10 +326,11 @@ } | ||
if (this.isInitialized()) { | ||
console.error("filter already initialized."); | ||
console.error('filter already initialized.'); | ||
return this; | ||
} | ||
if (!this.hashStrategy.hasOwnProperty("XXHASH")) { | ||
throw new Error("unsupported hash strategy"); | ||
if (!Object.prototype.hasOwnProperty.call(this.hashStrategy, 'XXHASH')) { | ||
throw new Error('unsupported hash strategy'); | ||
} | ||
if (this.numBlocks === 0) { | ||
this.numBlocks = SplitBlockBloomFilter.optimalNumOfBlocks(this.numDistinctValues, this.desiredFalsePositiveRate) >>> 3; | ||
this.numBlocks = | ||
SplitBlockBloomFilter.optimalNumOfBlocks(this.numDistinctValues, this.desiredFalsePositiveRate) >>> 3; | ||
} | ||
@@ -338,4 +341,4 @@ this.splitBlockFilter = Array(this.numBlocks).fill(SplitBlockBloomFilter.initBlock()); | ||
async hash(value) { | ||
if (!this.hashStrategy.hasOwnProperty("XXHASH")) { | ||
throw new Error("unsupported hash strategy"); | ||
if (!Object.prototype.hasOwnProperty.call(this.hashStrategy, 'XXHASH')) { | ||
throw new Error('unsupported hash strategy'); | ||
} | ||
@@ -347,5 +350,5 @@ const hashed = await this.hasher.hash64(value); | ||
if (!hashValue.unsigned) | ||
throw new Error("hashValue must be an unsigned Long"); | ||
throw new Error('hashValue must be an unsigned Long'); | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized. call init() first"); | ||
throw new Error('filter has not been initialized. call init() first'); | ||
const i = SplitBlockBloomFilter.getBlockIndex(hashValue, this.splitBlockFilter.length); | ||
@@ -362,3 +365,3 @@ SplitBlockBloomFilter.blockInsert(this.splitBlockFilter[i], hashValue); | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized. call init() first"); | ||
throw new Error('filter has not been initialized. call init() first'); | ||
this.insertHash(await this.hash(value)); | ||
@@ -368,5 +371,5 @@ } | ||
if (!hashValue.unsigned) | ||
throw new Error("hashValue must be an unsigned Long"); | ||
throw new Error('hashValue must be an unsigned Long'); | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized"); | ||
throw new Error('filter has not been initialized'); | ||
const i = SplitBlockBloomFilter.getBlockIndex(hashValue, this.splitBlockFilter.length); | ||
@@ -384,3 +387,3 @@ return SplitBlockBloomFilter.blockCheck(this.splitBlockFilter[i], hashValue); | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized"); | ||
throw new Error('filter has not been initialized'); | ||
return this.checkHash(await this.hash(value)); | ||
@@ -387,0 +390,0 @@ } |
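For orientation, a minimal usage sketch of the SplitBlockBloomFilter class being reformatted above. Only init(), hash(), insertHash(), and checkHash() appear verbatim in these hunks; the fluent setters and insert()/check() wrappers, as well as the import path, are assumed from the upstream class.

```ts
// Minimal sketch, not part of the diff. Names marked "assumed" come from the
// upstream SplitBlockBloomFilter class rather than the hunks above.
import SplitBlockBloomFilter from '@dsnp/parquetjs/dist/lib/bloom/sbbf'; // path assumed

async function bloomDemo(): Promise<void> {
  const filter = new SplitBlockBloomFilter()
    .setOptionFalsePositiveRate(0.001) // assumed setter; must be > 0.0 and < 1.0 per the guard above
    .setOptionNumDistinct(10_000);     // assumed setter; sizes the filter via optimalNumOfBlocks()
  filter.init(); // allocates the 256-bit blocks; options can no longer be changed afterwards

  await filter.insert('alice');             // assumed wrapper around hash() + insertHash()
  console.log(await filter.check('alice')); // true
  console.log(await filter.check('bob'));   // false, up to the configured false-positive rate
}

bloomDemo().catch(console.error);
```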
@@ -19,3 +19,3 @@ "use strict"; | ||
class XxHasher { | ||
static h64 = (0, xxhash_wasm_1.default)().then(x => x.h64ToString); | ||
static h64 = (0, xxhash_wasm_1.default)().then((x) => x.h64ToString); | ||
async hashIt(value) { | ||
@@ -42,5 +42,5 @@ return (await XxHasher.h64)(value); | ||
} | ||
throw new Error("unsupported type: " + value); | ||
throw new Error('unsupported type: ' + value); | ||
} | ||
} | ||
exports.default = XxHasher; |
@@ -1,12 +0,12 @@ | ||
import sbbf from "../bloom/sbbf"; | ||
import { ParquetEnvelopeReader } from "../reader"; | ||
import { ColumnChunkData } from "../declare"; | ||
type bloomFilterOffsetData = { | ||
import sbbf from '../bloom/sbbf'; | ||
import { ParquetEnvelopeReader } from '../reader'; | ||
import { ColumnChunkData } from '../declare'; | ||
interface bloomFilterOffsetData { | ||
columnName: string; | ||
offsetBytes: number; | ||
rowGroupIndex: number; | ||
}; | ||
export declare const parseBloomFilterOffsets: (ColumnChunkDataCollection: Array<ColumnChunkData>) => Array<bloomFilterOffsetData>; | ||
export declare const siftAllByteOffsets: (columnChunkDataCollection: Array<ColumnChunkData>) => Array<bloomFilterOffsetData>; | ||
export declare const getBloomFiltersFor: (paths: Array<string>, envelopeReader: InstanceType<typeof ParquetEnvelopeReader>) => Promise<{ | ||
} | ||
export declare const parseBloomFilterOffsets: (ColumnChunkDataCollection: ColumnChunkData[]) => bloomFilterOffsetData[]; | ||
export declare const siftAllByteOffsets: (columnChunkDataCollection: ColumnChunkData[]) => bloomFilterOffsetData[]; | ||
export declare const getBloomFiltersFor: (paths: string[], envelopeReader: InstanceType<typeof ParquetEnvelopeReader>) => Promise<{ | ||
sbbf: sbbf; | ||
@@ -13,0 +13,0 @@ columnName: string; |
@@ -39,5 +39,5 @@ "use strict"; | ||
const toInteger = (buffer) => { | ||
const integer = parseInt(buffer.toString("hex"), 16); | ||
const integer = parseInt(buffer.toString('hex'), 16); | ||
if (integer >= Number.MAX_VALUE) { | ||
throw Error("Number exceeds Number.MAX_VALUE: Godspeed"); | ||
throw Error('Number exceeds Number.MAX_VALUE: Godspeed'); | ||
} | ||
@@ -48,6 +48,6 @@ return integer; | ||
return ColumnChunkDataCollection.map(({ rowGroupIndex, column }) => { | ||
const { bloom_filter_offset: bloomOffset, path_in_schema: pathInSchema, } = column.meta_data || {}; | ||
const { bloom_filter_offset: bloomOffset, path_in_schema: pathInSchema } = column.meta_data || {}; | ||
return { | ||
offsetBytes: toInteger(bloomOffset.buffer), | ||
columnName: pathInSchema.join(","), | ||
columnName: pathInSchema.join(','), | ||
rowGroupIndex, | ||
@@ -78,3 +78,3 @@ }; | ||
const readFilterData = async (offsetBytes, envelopeReader) => { | ||
const { bloomFilterHeader, sizeOfBloomFilterHeader, } = await getBloomFilterHeader(offsetBytes, envelopeReader); | ||
const { bloomFilterHeader, sizeOfBloomFilterHeader } = await getBloomFilterHeader(offsetBytes, envelopeReader); | ||
const { numBytes: filterByteSize } = bloomFilterHeader; | ||
@@ -81,0 +81,0 @@ try { |
@@ -1,7 +0,6 @@ | ||
/// <reference types="node" /> | ||
import parquet_thrift from "../../gen-nodejs/parquet_types"; | ||
import SplitBlockBloomFilter from "../bloom/sbbf"; | ||
import { Block } from "../declare"; | ||
import parquet_thrift from '../../gen-nodejs/parquet_types'; | ||
import SplitBlockBloomFilter from '../bloom/sbbf'; | ||
import { Block } from '../declare'; | ||
import Int64 from 'node-int64'; | ||
export type createSBBFParams = { | ||
export interface createSBBFParams { | ||
numFilterBytes?: number; | ||
@@ -11,9 +10,9 @@ falsePositiveRate?: number; | ||
column?: any; | ||
}; | ||
} | ||
export declare const createSBBF: (params: createSBBFParams) => SplitBlockBloomFilter; | ||
export declare const serializeFilterHeaders: (numberOfBytes: number) => Buffer; | ||
type serializeFilterDataParams = { | ||
filterBlocks: Array<Block>; | ||
interface serializeFilterDataParams { | ||
filterBlocks: Block[]; | ||
filterByteSize: number; | ||
}; | ||
} | ||
export declare const serializeFilterData: (params: serializeFilterDataParams) => Buffer; | ||
@@ -20,0 +19,0 @@ export declare const setFilterOffset: (column: parquet_thrift.ColumnChunk, offset: Int64) => void; |
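The writer-side declarations above pair with the SplitBlockBloomFilter accessors from the earlier hunks. A hedged sketch; the import path is illustrative and the final byte layout is an assumption:

```ts
// Sketch only: createSBBF/serializeFilterHeaders/serializeFilterData are declared above.
import {
  createSBBF,
  serializeFilterHeaders,
  serializeFilterData,
} from '@dsnp/parquetjs/dist/lib/bloomFilterIO/bloomFilterWriter'; // path assumed

const sbbf = createSBBF({ falsePositiveRate: 0.001 }); // returns a SplitBlockBloomFilter (assumed initialized)

const body = serializeFilterData({
  filterBlocks: sbbf.getFilter(),           // Block[] (the Uint32Array blocks)
  filterByteSize: sbbf.getNumFilterBytes(),
});
const header = serializeFilterHeaders(sbbf.getNumFilterBytes());
const filterBytes = Buffer.concat([header, body]); // roughly what gets written at bloom_filter_offset
```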
@@ -5,13 +5,13 @@ 'use strict'; | ||
const PARQUET_COMPRESSION_METHODS = { | ||
'UNCOMPRESSED': { | ||
UNCOMPRESSED: { | ||
deflate: deflate_identity, | ||
inflate: inflate_identity | ||
inflate: inflate_identity, | ||
}, | ||
'GZIP': { | ||
GZIP: { | ||
deflate: deflate_gzip, | ||
inflate: inflate_gzip | ||
inflate: inflate_gzip, | ||
}, | ||
'SNAPPY': { | ||
SNAPPY: { | ||
deflate: deflate_snappy, | ||
inflate: inflate_snappy | ||
inflate: inflate_snappy, | ||
}, | ||
@@ -18,0 +18,0 @@ }; |
@@ -1,5 +0,4 @@ | ||
/// <reference types="node" /> | ||
import { Statistics } from "../gen-nodejs/parquet_types"; | ||
import { ParquetEnvelopeReader } from "./reader"; | ||
import { FileMetaDataExt } from "./declare"; | ||
import { Statistics } from '../gen-nodejs/parquet_types'; | ||
import { ParquetEnvelopeReader } from './reader'; | ||
import { FileMetaDataExt } from './declare'; | ||
export interface BufferReaderOptions { | ||
@@ -24,3 +23,3 @@ maxSpan?: number; | ||
scheduled?: boolean; | ||
queue: Array<BufferReaderQueueRow>; | ||
queue: BufferReaderQueueRow[]; | ||
envelopeReader: ParquetEnvelopeReader; | ||
@@ -27,0 +26,0 @@ constructor(envelopeReader: ParquetEnvelopeReader, options: BufferReaderOptions); |
@@ -37,3 +37,3 @@ "use strict"; | ||
queue.sort((a, b) => a.offset - b.offset); | ||
var subqueue = []; | ||
let subqueue = []; | ||
const readSubqueue = async () => { | ||
@@ -55,5 +55,5 @@ if (!subqueue.length) { | ||
const prev = queue[i - 1]; | ||
if (!prev || (d.offset - (prev.offset + prev.length)) < this.maxSpan) { | ||
if (!prev || d.offset - (prev.offset + prev.length) < this.maxSpan) { | ||
subqueue.push(d); | ||
if ((d.offset + d.length) - subqueue[0].offset > this.maxLength) { | ||
if (d.offset + d.length - subqueue[0].offset > this.maxLength) { | ||
readSubqueue(); | ||
@@ -71,2 +71,1 @@ } | ||
exports.default = BufferReader; | ||
; |
@@ -1,6 +0,5 @@ | ||
/// <reference types="node" /> | ||
import { Cursor, Options } from "./types"; | ||
type ValidValueTypes = "BOOLEAN" | "INT32" | "INT64" | "INT96" | "FLOAT" | "DOUBLE" | "BYTE_ARRAY" | "FIXED_LEN_BYTE_ARRAY"; | ||
export declare const encodeValues: (type: ValidValueTypes | string, values: Array<unknown>, opts: Options) => Buffer; | ||
import { Cursor, Options } from './types'; | ||
type ValidValueTypes = 'BOOLEAN' | 'INT32' | 'INT64' | 'INT96' | 'FLOAT' | 'DOUBLE' | 'BYTE_ARRAY' | 'FIXED_LEN_BYTE_ARRAY'; | ||
export declare const encodeValues: (type: ValidValueTypes | string, values: unknown[], opts: Options) => Buffer; | ||
export declare const decodeValues: (type: ValidValueTypes | string, cursor: Cursor, count: number, opts: Options) => number[] | boolean[] | bigint[] | Buffer[]; | ||
export {}; |
@@ -9,3 +9,3 @@ "use strict"; | ||
function encodeValues_BOOLEAN(values) { | ||
let buf = Buffer.alloc(Math.ceil(values.length / 8)); | ||
const buf = Buffer.alloc(Math.ceil(values.length / 8)); | ||
buf.fill(0); | ||
@@ -20,5 +20,5 @@ for (let i = 0; i < values.length; ++i) { | ||
function decodeValues_BOOLEAN(cursor, count) { | ||
let values = []; | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
let b = cursor.buffer[cursor.offset + Math.floor(i / 8)]; | ||
const b = cursor.buffer[cursor.offset + Math.floor(i / 8)]; | ||
values.push((b & (1 << i % 8)) > 0); | ||
@@ -32,3 +32,3 @@ } | ||
const scale = opts?.scale || 0; | ||
let buf = Buffer.alloc(4 * values.length); | ||
const buf = Buffer.alloc(4 * values.length); | ||
for (let i = 0; i < values.length; i++) { | ||
@@ -67,3 +67,3 @@ if (isDecimal) { | ||
const scale = opts?.scale || 0; | ||
let buf = Buffer.alloc(8 * values.length); | ||
const buf = Buffer.alloc(8 * values.length); | ||
for (let i = 0; i < values.length; i++) { | ||
@@ -84,3 +84,3 @@ if (isDecimal) { | ||
if (opts.originalType === 'DECIMAL' || opts.column?.originalType === 'DECIMAL') { | ||
let columnOptions = opts.column?.originalType ? opts.column : opts; | ||
const columnOptions = opts.column?.originalType ? opts.column : opts; | ||
values = decodeValues_DECIMAL(cursor, count, columnOptions); | ||
@@ -109,3 +109,3 @@ } | ||
} | ||
let values = []; | ||
const values = []; | ||
// by default we prepare the offset and bufferFunction to work with 32bit integers | ||
@@ -131,3 +131,3 @@ let offset = 4; | ||
function encodeValues_INT96(values) { | ||
let buf = Buffer.alloc(12 * values.length); | ||
const buf = Buffer.alloc(12 * values.length); | ||
for (let i = 0; i < values.length; i++) { | ||
@@ -146,3 +146,3 @@ if (values[i] >= 0) { | ||
function decodeValues_INT96(cursor, count) { | ||
let values = []; | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
@@ -162,3 +162,3 @@ const low = int53_1.default.readInt64LE(cursor.buffer, cursor.offset); | ||
function encodeValues_FLOAT(values) { | ||
let buf = Buffer.alloc(4 * values.length); | ||
const buf = Buffer.alloc(4 * values.length); | ||
for (let i = 0; i < values.length; i++) { | ||
@@ -170,3 +170,3 @@ buf.writeFloatLE(values[i], i * 4); | ||
function decodeValues_FLOAT(cursor, count) { | ||
let values = []; | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
@@ -179,3 +179,3 @@ values.push(cursor.buffer.readFloatLE(cursor.offset)); | ||
function encodeValues_DOUBLE(values) { | ||
let buf = Buffer.alloc(8 * values.length); | ||
const buf = Buffer.alloc(8 * values.length); | ||
for (let i = 0; i < values.length; i++) { | ||
@@ -187,3 +187,3 @@ buf.writeDoubleLE(values[i], i * 8); | ||
function decodeValues_DOUBLE(cursor, count) { | ||
let values = []; | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
@@ -202,3 +202,3 @@ values.push(cursor.buffer.readDoubleLE(cursor.offset)); | ||
} | ||
let buf = Buffer.alloc(buf_len); | ||
const buf = Buffer.alloc(buf_len); | ||
let buf_pos = 0; | ||
@@ -213,5 +213,5 @@ for (let i = 0; i < returnedValues.length; i++) { | ||
function decodeValues_BYTE_ARRAY(cursor, count) { | ||
let values = []; | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
let len = cursor.buffer.readUInt32LE(cursor.offset); | ||
const len = cursor.buffer.readUInt32LE(cursor.offset); | ||
cursor.offset += 4; | ||
@@ -225,3 +225,3 @@ values.push(cursor.buffer.subarray(cursor.offset, cursor.offset + len)); | ||
if (!opts.typeLength) { | ||
throw "missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)"; | ||
throw 'missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)'; | ||
} | ||
@@ -232,3 +232,3 @@ const returnedValues = []; | ||
if (returnedValues[i].length !== opts.typeLength) { | ||
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + returnedValues[i]; | ||
throw 'invalid value for FIXED_LEN_BYTE_ARRAY: ' + returnedValues[i]; | ||
} | ||
@@ -239,6 +239,6 @@ } | ||
function decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts) { | ||
let values = []; | ||
const values = []; | ||
const typeLength = opts.typeLength ?? (opts.column ? opts.column.typeLength : undefined); | ||
if (!typeLength) { | ||
throw "missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)"; | ||
throw 'missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)'; | ||
} | ||
@@ -253,20 +253,20 @@ for (let i = 0; i < count; ++i) { | ||
switch (type) { | ||
case "BOOLEAN": | ||
case 'BOOLEAN': | ||
return encodeValues_BOOLEAN(values); | ||
case "INT32": | ||
case 'INT32': | ||
return encodeValues_INT32(values, opts); | ||
case "INT64": | ||
case 'INT64': | ||
return encodeValues_INT64(values, opts); | ||
case "INT96": | ||
case 'INT96': | ||
return encodeValues_INT96(values); | ||
case "FLOAT": | ||
case 'FLOAT': | ||
return encodeValues_FLOAT(values); | ||
case "DOUBLE": | ||
case 'DOUBLE': | ||
return encodeValues_DOUBLE(values); | ||
case "BYTE_ARRAY": | ||
case 'BYTE_ARRAY': | ||
return encodeValues_BYTE_ARRAY(values); | ||
case "FIXED_LEN_BYTE_ARRAY": | ||
case 'FIXED_LEN_BYTE_ARRAY': | ||
return encodeValues_FIXED_LEN_BYTE_ARRAY(values, opts); | ||
default: | ||
throw "unsupported type: " + type; | ||
throw 'unsupported type: ' + type; | ||
} | ||
@@ -277,22 +277,22 @@ }; | ||
switch (type) { | ||
case "BOOLEAN": | ||
case 'BOOLEAN': | ||
return decodeValues_BOOLEAN(cursor, count); | ||
case "INT32": | ||
case 'INT32': | ||
return decodeValues_INT32(cursor, count, opts); | ||
case "INT64": | ||
case 'INT64': | ||
return decodeValues_INT64(cursor, count, opts); | ||
case "INT96": | ||
case 'INT96': | ||
return decodeValues_INT96(cursor, count); | ||
case "FLOAT": | ||
case 'FLOAT': | ||
return decodeValues_FLOAT(cursor, count); | ||
case "DOUBLE": | ||
case 'DOUBLE': | ||
return decodeValues_DOUBLE(cursor, count); | ||
case "BYTE_ARRAY": | ||
case 'BYTE_ARRAY': | ||
return decodeValues_BYTE_ARRAY(cursor, count); | ||
case "FIXED_LEN_BYTE_ARRAY": | ||
case 'FIXED_LEN_BYTE_ARRAY': | ||
return decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts); | ||
default: | ||
throw "unsupported type: " + type; | ||
throw 'unsupported type: ' + type; | ||
} | ||
}; | ||
exports.decodeValues = decodeValues; |
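As context for the PLAIN codec hunks above, a small round trip. The cursor shape matches how reader.js constructs cursors later in this diff; the import path is illustrative.

```ts
// Sketch of encodeValues/decodeValues from the PLAIN codec; opts fields such as
// typeLength/scale are only needed for DECIMAL and FIXED_LEN_BYTE_ARRAY types.
import { encodeValues, decodeValues } from '@dsnp/parquetjs/dist/lib/codec/plain'; // path assumed

const buf = encodeValues('INT32', [1, 2, 3], {} as any); // 4 little-endian bytes per value
const cursor = { buffer: buf, offset: 0, size: buf.length };
const values = decodeValues('INT32', cursor, 3, {} as any); // [1, 2, 3]; cursor.offset advances to 12
console.log(values);
```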
@@ -1,4 +0,3 @@ | ||
/// <reference types="node" /> | ||
import { Cursor } from './types'; | ||
export declare const encodeValues: (type: string, values: Array<number>, opts: { | ||
export declare const encodeValues: (type: string, values: number[], opts: { | ||
bitWidth: number; | ||
@@ -5,0 +4,0 @@ disableEnvelope?: boolean; |
@@ -15,15 +15,12 @@ "use strict"; | ||
} | ||
let buf = Buffer.alloc(Math.ceil(opts.bitWidth * (values.length / 8))); | ||
const buf = Buffer.alloc(Math.ceil(opts.bitWidth * (values.length / 8))); | ||
for (let b = 0; b < opts.bitWidth * values.length; ++b) { | ||
if ((values[Math.floor(b / opts.bitWidth)] & (1 << b % opts.bitWidth)) > 0) { | ||
buf[Math.floor(b / 8)] |= (1 << (b % 8)); | ||
buf[Math.floor(b / 8)] |= 1 << b % 8; | ||
} | ||
} | ||
return Buffer.concat([ | ||
Buffer.from(varint_1.default.encode(((values.length / 8) << 1) | 1)), | ||
buf | ||
]); | ||
return Buffer.concat([Buffer.from(varint_1.default.encode(((values.length / 8) << 1) | 1)), buf]); | ||
} | ||
function encodeRunRepeated(value, count, opts) { | ||
let buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8)); | ||
const buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8)); | ||
let remainingValue = value; | ||
@@ -36,6 +33,3 @@ // This is encoded LSB to MSB, so we pick off the least | ||
} | ||
return Buffer.concat([ | ||
Buffer.from(varint_1.default.encode(count << 1)), | ||
buf | ||
]); | ||
return Buffer.concat([Buffer.from(varint_1.default.encode(count << 1)), buf]); | ||
} | ||
@@ -98,3 +92,3 @@ function unknownToParsedInt(value) { | ||
} | ||
let envelope = Buffer.alloc(buf.length + 4); | ||
const envelope = Buffer.alloc(buf.length + 4); | ||
envelope.writeUInt32LE(buf.length); | ||
@@ -109,6 +103,6 @@ buf.copy(envelope, 4); | ||
} | ||
let values = new Array(count).fill(0); | ||
const values = new Array(count).fill(0); | ||
for (let b = 0; b < opts.bitWidth * count; ++b) { | ||
if (cursor.buffer[cursor.offset + Math.floor(b / 8)] & (1 << (b % 8))) { | ||
values[Math.floor(b / opts.bitWidth)] |= (1 << b % opts.bitWidth); | ||
if (cursor.buffer[cursor.offset + Math.floor(b / 8)] & (1 << b % 8)) { | ||
values[Math.floor(b / opts.bitWidth)] |= 1 << b % opts.bitWidth; | ||
} | ||
@@ -120,3 +114,3 @@ } | ||
function decodeRunRepeated(cursor, count, opts) { | ||
var bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8); | ||
const bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8); | ||
let value = 0; | ||
@@ -156,3 +150,3 @@ for (let i = 0; i < bytesNeededForFixedBitWidth; ++i) { | ||
if (values.length !== count) { | ||
throw "invalid RLE encoding"; | ||
throw 'invalid RLE encoding'; | ||
} | ||
@@ -159,0 +153,0 @@ return values; |
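To make the bit-packing arithmetic above concrete: a single group of eight 3-bit values produces a 1-byte varint header of ((8 / 8) << 1) | 1 = 3, followed by ceil(3 × 8 / 8) = 3 packed bytes. A hedged sketch with an illustrative import path:

```ts
// Sketch of the RLE codec's bit-packed path; disableEnvelope skips the 4-byte
// length prefix that encodeValues otherwise writes (see the envelope hunk above).
import { encodeValues } from '@dsnp/parquetjs/dist/lib/codec/rle'; // path assumed

const buf = encodeValues('INT32', [0, 1, 2, 3, 4, 5, 6, 7], {
  bitWidth: 3,
  disableEnvelope: true,
});
console.log(buf.length); // expected: 4 (1 varint header byte + 3 bytes of packed values, LSB first)
```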
@@ -1,5 +0,4 @@ | ||
/// <reference types="node" /> | ||
import { PrimitiveType } from "../declare"; | ||
import { ParquetCodec, OriginalType, ParquetField } from "../declare"; | ||
import { Statistics } from "../../gen-nodejs/parquet_types"; | ||
import { PrimitiveType } from '../declare'; | ||
import { ParquetCodec, OriginalType, ParquetField } from '../declare'; | ||
import { Statistics } from '../../gen-nodejs/parquet_types'; | ||
export interface Options { | ||
@@ -16,3 +15,3 @@ typeLength: number; | ||
cache?: unknown; | ||
dictionary?: Array<number>; | ||
dictionary?: number[]; | ||
num_values?: number; | ||
@@ -19,0 +18,0 @@ rLevelMax?: number; |
@@ -1,8 +0,5 @@ | ||
/// <reference types="node" /> | ||
interface PARQUET_COMPRESSION_METHODS { | ||
[key: string]: { | ||
deflate: (value: any) => Buffer | Promise<Buffer>; | ||
inflate: (value: any) => Buffer | Promise<Buffer>; | ||
}; | ||
} | ||
type PARQUET_COMPRESSION_METHODS = Record<string, { | ||
deflate: (value: any) => Buffer | Promise<Buffer>; | ||
inflate: (value: any) => Buffer | Promise<Buffer>; | ||
}>; | ||
export declare const PARQUET_COMPRESSION_METHODS: PARQUET_COMPRESSION_METHODS; | ||
@@ -9,0 +6,0 @@ /** |
@@ -6,3 +6,5 @@ "use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.inflate = exports.deflate = exports.PARQUET_COMPRESSION_METHODS = void 0; | ||
exports.PARQUET_COMPRESSION_METHODS = void 0; | ||
exports.deflate = deflate; | ||
exports.inflate = inflate; | ||
const zlib_1 = __importDefault(require("zlib")); | ||
@@ -13,18 +15,18 @@ const snappyjs_1 = __importDefault(require("snappyjs")); | ||
exports.PARQUET_COMPRESSION_METHODS = { | ||
'UNCOMPRESSED': { | ||
UNCOMPRESSED: { | ||
deflate: deflate_identity, | ||
inflate: inflate_identity | ||
inflate: inflate_identity, | ||
}, | ||
'GZIP': { | ||
GZIP: { | ||
deflate: deflate_gzip, | ||
inflate: inflate_gzip | ||
inflate: inflate_gzip, | ||
}, | ||
'SNAPPY': { | ||
SNAPPY: { | ||
deflate: deflate_snappy, | ||
inflate: inflate_snappy | ||
inflate: inflate_snappy, | ||
}, | ||
'BROTLI': { | ||
BROTLI: { | ||
deflate: deflate_brotli, | ||
inflate: inflate_brotli | ||
} | ||
inflate: inflate_brotli, | ||
}, | ||
}; | ||
@@ -40,3 +42,2 @@ /** | ||
} | ||
exports.deflate = deflate; | ||
function deflate_identity(value) { | ||
@@ -54,7 +55,7 @@ return buffer_from_result(value); | ||
const compressedContent = await (0, brotli_wasm_1.compress)(value /*, { | ||
mode: 0, | ||
quality: 8, | ||
lgwin: 22 | ||
} | ||
*/); | ||
mode: 0, | ||
quality: 8, | ||
lgwin: 22 | ||
} | ||
*/); | ||
return Buffer.from(compressedContent); | ||
@@ -71,3 +72,2 @@ } | ||
} | ||
exports.inflate = inflate; | ||
async function inflate_identity(value) { | ||
@@ -74,0 +74,0 @@ return buffer_from_result(value); |
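The reader hunks later in this diff call parquet_compression.inflate(opts.compression, buffer); deflate is assumed to take the same (method, value) shape. A sketch:

```ts
// Sketch of the compression helpers; the import path is illustrative and the
// deflate(method, value) signature is assumed to mirror inflate as used by the reader.
import { deflate, inflate, PARQUET_COMPRESSION_METHODS } from '@dsnp/parquetjs/dist/lib/compression';

async function compressionDemo(): Promise<void> {
  console.log(Object.keys(PARQUET_COMPRESSION_METHODS)); // ['UNCOMPRESSED', 'GZIP', 'SNAPPY', 'BROTLI']
  const original = Buffer.from('hello parquet');
  const compressed = await deflate('GZIP', original);
  const restored = await inflate('GZIP', compressed);
  console.log(restored.equals(original)); // true
}

compressionDemo().catch(console.error);
```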
@@ -1,6 +0,5 @@ | ||
/// <reference types="node" /> | ||
import parquet_thrift from "../gen-nodejs/parquet_types"; | ||
import { Statistics, OffsetIndex, ColumnIndex, PageType, DataPageHeader, DataPageHeaderV2, DictionaryPageHeader, IndexPageHeader } from "../gen-nodejs/parquet_types"; | ||
import SplitBlockBloomFilter from "./bloom/sbbf"; | ||
import { createSBBFParams } from "./bloomFilterIO/bloomFilterWriter"; | ||
import parquet_thrift from '../gen-nodejs/parquet_types'; | ||
import { Statistics, OffsetIndex, ColumnIndex, PageType, DataPageHeader, DataPageHeaderV2, DictionaryPageHeader, IndexPageHeader } from '../gen-nodejs/parquet_types'; | ||
import SplitBlockBloomFilter from './bloom/sbbf'; | ||
import { createSBBFParams } from './bloomFilterIO/bloomFilterWriter'; | ||
import Int64 from 'node-int64'; | ||
@@ -13,5 +12,3 @@ export type ParquetCodec = 'PLAIN' | 'RLE'; | ||
export type OriginalType = 'UTF8' | 'MAP' | 'LIST' | 'ENUM' | 'DECIMAL' | 'DATE' | 'TIME_MILLIS' | 'TIME_MICROS' | 'TIMESTAMP_MILLIS' | 'TIMESTAMP_MICROS' | 'UINT_8' | 'UINT_16' | 'UINT_32' | 'UINT_64' | 'INT_8' | 'INT_16' | 'INT_32' | 'INT_64' | 'JSON' | 'BSON' | 'INTERVAL'; | ||
export interface SchemaDefinition { | ||
[string: string]: FieldDefinition; | ||
} | ||
export type SchemaDefinition = Record<string, FieldDefinition>; | ||
export interface FieldDefinition { | ||
@@ -62,5 +59,3 @@ type?: ParquetType; | ||
} | ||
export interface ParquetRecord { | ||
[key: string]: any; | ||
} | ||
export type ParquetRecord = Record<string, any>; | ||
export interface ColumnChunkData { | ||
@@ -104,3 +99,3 @@ rowGroupIndex: number; | ||
count?: number; | ||
dictionary?: Array<unknown>; | ||
dictionary?: unknown[]; | ||
column?: parquet_thrift.ColumnChunk; | ||
@@ -151,5 +146,4 @@ useDictionary?: boolean; | ||
headerSize?: number; | ||
constructor(); | ||
} | ||
export type WriterOptions = { | ||
export interface WriterOptions { | ||
pageIndex?: boolean; | ||
@@ -169,4 +163,4 @@ pageSize?: number; | ||
highWaterMark?: number; | ||
}; | ||
export type Page = { | ||
} | ||
export interface Page { | ||
page: Buffer; | ||
@@ -178,3 +172,3 @@ statistics: parquet_thrift.Statistics; | ||
count?: number; | ||
}; | ||
} | ||
export {}; |
@@ -9,10 +9,6 @@ "use strict"; | ||
const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types")); | ||
; | ||
class NewPageHeader extends parquet_types_1.default.PageHeader { | ||
offset; | ||
headerSize; | ||
constructor() { | ||
super(); | ||
} | ||
} | ||
exports.NewPageHeader = NewPageHeader; |
@@ -1,2 +0,2 @@ | ||
import { FieldDefinition, ParquetType, SchemaDefinition } from "./declare"; | ||
import { FieldDefinition, ParquetType, SchemaDefinition } from './declare'; | ||
export declare function createStringField(optional?: boolean, fieldOptions?: FieldDefinition): FieldDefinition; | ||
@@ -3,0 +3,0 @@ export declare function createBooleanField(optional?: boolean, fieldOptions?: FieldDefinition): FieldDefinition; |
"use strict"; | ||
// Helper functions for creating fields | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.createListField = exports.createStructListField = exports.createStructField = exports.createTimestampField = exports.createDecimalField = exports.createDoubleField = exports.createFloatField = exports.createIntField = exports.createBooleanField = exports.createStringField = void 0; | ||
exports.createStringField = createStringField; | ||
exports.createBooleanField = createBooleanField; | ||
exports.createIntField = createIntField; | ||
exports.createFloatField = createFloatField; | ||
exports.createDoubleField = createDoubleField; | ||
exports.createDecimalField = createDecimalField; | ||
exports.createTimestampField = createTimestampField; | ||
exports.createStructField = createStructField; | ||
exports.createStructListField = createStructListField; | ||
exports.createListField = createListField; | ||
function createStringField(optional = true, fieldOptions = {}) { | ||
return { ...fieldOptions, optional, type: 'UTF8' }; | ||
} | ||
exports.createStringField = createStringField; | ||
function createBooleanField(optional = true, fieldOptions = {}) { | ||
return { ...fieldOptions, optional, type: 'BOOLEAN' }; | ||
} | ||
exports.createBooleanField = createBooleanField; | ||
function createIntField(size, optional = true, fieldOptions = {}) { | ||
return { ...fieldOptions, optional, type: `INT${size}` }; | ||
} | ||
exports.createIntField = createIntField; | ||
function createFloatField(optional = true, fieldOptions = {}) { | ||
return { ...fieldOptions, optional, type: 'FLOAT' }; | ||
} | ||
exports.createFloatField = createFloatField; | ||
function createDoubleField(optional = true, fieldOptions = {}) { | ||
return { ...fieldOptions, optional, type: 'DOUBLE' }; | ||
} | ||
exports.createDoubleField = createDoubleField; | ||
function createDecimalField(precision, optional = true, fieldOptions = {}) { | ||
return { ...fieldOptions, precision, optional, type: 'FLOAT' }; | ||
} | ||
exports.createDecimalField = createDecimalField; | ||
function createTimestampField(optional = true, fieldOptions = {}) { | ||
return { ...fieldOptions, optional, type: 'TIMESTAMP_MILLIS' }; | ||
} | ||
exports.createTimestampField = createTimestampField; | ||
function createStructField(fields, optional = true) { | ||
@@ -39,3 +41,2 @@ return { | ||
} | ||
exports.createStructField = createStructField; | ||
function createStructListField(fields, optional = true) { | ||
@@ -52,3 +53,3 @@ return { | ||
}, | ||
} | ||
}, | ||
}, | ||
@@ -58,3 +59,2 @@ }, | ||
} | ||
exports.createStructListField = createStructListField; | ||
function createListField(type, optional = true, elementOptions = { optional: true }) { | ||
@@ -78,2 +78,1 @@ return { | ||
} | ||
exports.createListField = createListField; |
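The field helpers above (now exported directly rather than re-assigned through exports.*) compose into the SchemaDefinition record from declare.d.ts. A hedged sketch; the import paths are illustrative:

```ts
// Sketch combining the helper functions above into a schema; paths assumed.
import { ParquetSchema } from '@dsnp/parquetjs';
import * as fields from '@dsnp/parquetjs/dist/lib/fields';

const schema = new ParquetSchema({
  name: fields.createStringField(),            // optional UTF8 by default
  active: fields.createBooleanField(false),    // pass false to make the field required
  score: fields.createDoubleField(),
  createdAt: fields.createTimestampField(),    // TIMESTAMP_MILLIS
  tags: fields.createListField('UTF8'),
  address: fields.createStructField({
    street: fields.createStringField(),
    zip: fields.createIntField(32),            // INT32 via the `INT${size}` template above
  }),
});
```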
@@ -34,11 +34,11 @@ "use strict"; | ||
const unsupportedFields = [ | ||
"$ref", | ||
"multipleOf", | ||
"allOf", | ||
"anyOf", | ||
"oneOf", | ||
"not", | ||
"additionalItems", | ||
"enum", | ||
"extends", | ||
'$ref', | ||
'multipleOf', | ||
'allOf', | ||
'anyOf', | ||
'oneOf', | ||
'not', | ||
'additionalItems', | ||
'enum', | ||
'extends', | ||
]; | ||
@@ -68,3 +68,4 @@ for (const field in unsupportedFields) { | ||
switch (jsonSchema.required) { | ||
case true: return true; | ||
case true: | ||
return true; | ||
case undefined: | ||
@@ -81,3 +82,3 @@ case false: | ||
if (!fieldValue.items || !fieldValue.items.type) { | ||
throw new UnsupportedJsonSchemaError("Array field with no values found."); | ||
throw new UnsupportedJsonSchemaError('Array field with no values found.'); | ||
} | ||
@@ -135,3 +136,3 @@ switch (fieldValue.items.type) { | ||
if (!isJsonSchemaSupported(jsonSchema)) { | ||
throw new UnsupportedJsonSchemaError("Unsupported fields found"); | ||
throw new UnsupportedJsonSchemaError('Unsupported fields found'); | ||
} | ||
@@ -138,0 +139,0 @@ const schema = {}; |
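For context, the module above backs JSON-schema conversion; ParquetSchema.fromJsonSchema is assumed to be the public wrapper, and it rejects the unsupported keywords listed at the top of the hunk ($ref, oneOf, enum, and so on). A sketch:

```ts
// Sketch only: fromJsonSchema is assumed to be the entry point for this module.
import { ParquetSchema } from '@dsnp/parquetjs';

const schema = ParquetSchema.fromJsonSchema({
  type: 'object',
  properties: {
    name: { type: 'string' },
    tags: { type: 'array', items: { type: 'string' } }, // arrays must carry items.type, per the guard above
  },
});
```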
@@ -1,2 +0,1 @@ | ||
/// <reference types="node" /> | ||
import Int64 from 'node-int64'; | ||
@@ -9,3 +8,3 @@ import parquet_thrift from '../gen-nodejs/parquet_types'; | ||
import { Options } from './codec/types'; | ||
import { S3Client } from "@aws-sdk/client-s3"; | ||
import { S3Client } from '@aws-sdk/client-s3'; | ||
/** | ||
@@ -18,4 +17,4 @@ * A parquet cursor is used to retrieve rows from a parquet file in order | ||
schema: parquet_schema.ParquetSchema; | ||
columnList: Array<Array<unknown>>; | ||
rowGroup: Array<unknown>; | ||
columnList: unknown[][]; | ||
rowGroup: unknown[]; | ||
rowGroupIndex: number; | ||
@@ -29,3 +28,3 @@ cursorIndex: number; | ||
*/ | ||
constructor(metadata: FileMetaDataExt, envelopeReader: ParquetEnvelopeReader, schema: parquet_schema.ParquetSchema, columnList: Array<Array<unknown>>); | ||
constructor(metadata: FileMetaDataExt, envelopeReader: ParquetEnvelopeReader, schema: parquet_schema.ParquetSchema, columnList: unknown[][]); | ||
/** | ||
@@ -102,3 +101,3 @@ * Retrieve the next row from the cursor. Returns a row or NULL if the end | ||
*/ | ||
getCursor(columnList?: Array<Array<unknown>>): ParquetCursor; | ||
getCursor(columnList?: unknown[][]): ParquetCursor; | ||
getBloomFiltersFor(columnNames: string[]): Promise<Record<string, { | ||
@@ -134,3 +133,3 @@ sbbf: import("./bloom/sbbf").default; | ||
id: number; | ||
fileSize: Function | number; | ||
fileSize: number | (() => Promise<number>); | ||
default_dictionary_size: number; | ||
@@ -145,7 +144,7 @@ metadata?: FileMetaDataExt; | ||
static openUrl(url: Parameter | URL | string, options?: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
constructor(readFn: (offset: number, length: number, file?: string) => Promise<Buffer>, closeFn: () => unknown, fileSize: Function | number, options?: BufferReaderOptions, metadata?: FileMetaDataExt); | ||
constructor(readFn: (offset: number, length: number, file?: string) => Promise<Buffer>, closeFn: () => unknown, fileSize: number | (() => Promise<number>), options?: BufferReaderOptions, metadata?: FileMetaDataExt); | ||
read(offset: number, length: number, file?: string): Promise<Buffer>; | ||
readHeader(): Promise<void>; | ||
getColumn(path: string | parquet_thrift.ColumnChunk, row_group: RowGroupExt | number | string | null): ColumnChunkExt; | ||
getAllColumnChunkDataFor(paths: Array<string>, row_groups?: Array<RowGroupExt>): { | ||
getAllColumnChunkDataFor(paths: string[], row_groups?: RowGroupExt[]): { | ||
rowGroupIndex: number; | ||
@@ -156,4 +155,4 @@ column: ColumnChunkExt; | ||
readColumnIndex(path: string | ColumnChunkExt, row_group: RowGroupExt | number, opts: Options): Promise<parquet_thrift.ColumnIndex>; | ||
readPage(column: ColumnChunkExt, page: parquet_thrift.PageLocation | number, records: Array<Record<string, unknown>>, opts: Options): Promise<Record<string, unknown>[]>; | ||
readRowGroup(schema: parquet_schema.ParquetSchema, rowGroup: RowGroupExt, columnList: Array<Array<unknown>>): Promise<parquet_shredder.RecordBuffer>; | ||
readPage(column: ColumnChunkExt, page: parquet_thrift.PageLocation | number, records: Record<string, unknown>[], opts: Options): Promise<Record<string, unknown>[]>; | ||
readRowGroup(schema: parquet_schema.ParquetSchema, rowGroup: RowGroupExt, columnList: unknown[][]): Promise<parquet_shredder.RecordBuffer>; | ||
readColumnChunk(schema: parquet_schema.ParquetSchema, colChunk: ColumnChunkExt, opts?: Options): Promise<PageData>; | ||
@@ -160,0 +159,0 @@ readFooter(): Promise<parquet_thrift.FileMetaData>; |
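A short usage sketch of the reader surface declared above. Only openFile/openBuffer/openS3/openUrl, getCursor, getBloomFiltersFor, and the cursor's next() appear in this diff; close() is assumed from the upstream class.

```ts
// Sketch of reading rows with ParquetReader; not part of the diff.
import { ParquetReader } from '@dsnp/parquetjs';

async function readAll(path: string): Promise<void> {
  const reader = await ParquetReader.openFile(path); // openBuffer/openS3/openUrl follow the same pattern
  const cursor = reader.getCursor();                 // or getCursor([['name'], ['address', 'zip']]) to project columns
  let record: unknown;
  while ((record = await cursor.next())) {           // next() resolves to null once every row group is exhausted
    console.log(record);
  }
  await reader.close();                              // assumed; not shown in the hunks above
}

readAll('example.parquet').catch(console.error);
```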
@@ -43,3 +43,3 @@ "use strict"; | ||
const client_s3_1 = require("@aws-sdk/client-s3"); | ||
const { getBloomFiltersFor, } = bloomFilterReader; | ||
const { getBloomFiltersFor } = bloomFilterReader; | ||
/** | ||
@@ -93,3 +93,3 @@ * Parquet File Magic String | ||
} | ||
let rowBuffer = await this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList); | ||
const rowBuffer = await this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList); | ||
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer); | ||
@@ -110,3 +110,2 @@ this.rowGroupIndex++; | ||
} | ||
; | ||
/** | ||
@@ -128,7 +127,7 @@ * A parquet reader allows retrieving the rows from a parquet file in order. | ||
static async openFile(filePath, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openFile(filePath, options); | ||
const envelopeReader = await ParquetEnvelopeReader.openFile(filePath, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
static async openBuffer(buffer, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openBuffer(buffer, options); | ||
const envelopeReader = await ParquetEnvelopeReader.openBuffer(buffer, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
@@ -146,5 +145,5 @@ } | ||
try { | ||
let envelopeReader = 'function' === typeof client['headObject'] ? | ||
await ParquetEnvelopeReader.openS3(client, params, options) : // S3 client v2 | ||
await ParquetEnvelopeReader.openS3v3(client, params, options); // S3 client v3 | ||
const envelopeReader = 'function' === typeof client['headObject'] | ||
? await ParquetEnvelopeReader.openS3(client, params, options) // S3 client v2 | ||
: await ParquetEnvelopeReader.openS3v3(client, params, options); // S3 client v3 | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
@@ -163,3 +162,3 @@ } | ||
static async openUrl(params, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openUrl(params, options); | ||
const envelopeReader = await ParquetEnvelopeReader.openUrl(params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
@@ -173,3 +172,3 @@ } | ||
await envelopeReader.readHeader(); | ||
let metadata = await envelopeReader.readFooter(); | ||
const metadata = await envelopeReader.readFooter(); | ||
return new ParquetReader(metadata, envelopeReader, opts); | ||
@@ -197,3 +196,3 @@ } | ||
if (o && typeof o === 'object') { | ||
Object.keys(o).forEach(key => o[key] = convert(o[key])); | ||
Object.keys(o).forEach((key) => (o[key] = convert(o[key]))); | ||
if (o.parquetType === 'CTIME') { | ||
@@ -209,10 +208,10 @@ return new Date(o.value); | ||
// Go through all PageLocation objects and set the proper prototype | ||
metadata.row_groups.forEach(rowGroup => { | ||
rowGroup.columns.forEach(column => { | ||
metadata.row_groups.forEach((rowGroup) => { | ||
rowGroup.columns.forEach((column) => { | ||
if (column.offsetIndex) { | ||
Promise.resolve(column.offsetIndex).then(offset => (offset.page_locations.forEach(d => { | ||
Promise.resolve(column.offsetIndex).then((offset) => offset.page_locations.forEach((d) => { | ||
if (Array.isArray(d)) { | ||
Object.setPrototypeOf(d, parquet_types_1.default.PageLocation.prototype); | ||
} | ||
}))); | ||
})); | ||
} | ||
@@ -228,3 +227,3 @@ }); | ||
if (this.metadata.row_groups && !this.metadata.json && !opts.rawStatistics) { | ||
this.metadata.row_groups.forEach(row => row.columns.forEach(col => { | ||
this.metadata.row_groups.forEach((row) => row.columns.forEach((col) => { | ||
const stats = col.meta_data.statistics; | ||
@@ -254,3 +253,3 @@ if (stats) { | ||
let record = null; | ||
while (record = await cursor.next()) { | ||
while ((record = await cursor.next())) { | ||
yield record; | ||
@@ -272,3 +271,3 @@ } | ||
} | ||
columnList = columnList.map((x) => x.constructor === Array ? x : [x]); | ||
columnList = columnList.map((x) => (x.constructor === Array ? x : [x])); | ||
return new ParquetCursor(this.metadata, this.envelopeReader, this.schema, columnList); | ||
@@ -303,4 +302,4 @@ } | ||
getMetadata() { | ||
let md = {}; | ||
for (let kv of this.metadata.key_value_metadata) { | ||
const md = {}; | ||
for (const kv of this.metadata.key_value_metadata) { | ||
md[kv.key] = kv.value; | ||
@@ -316,7 +315,7 @@ } | ||
if (typeof value === 'object') { | ||
for (let k in value) { | ||
for (const k in value) { | ||
if (value[k] instanceof Date) { | ||
value[k].toJSON = () => ({ | ||
value[k].toJSON = () => JSON.stringify({ | ||
parquetType: 'CTIME', | ||
value: value[k].valueOf() | ||
value: value[k].valueOf(), | ||
}); | ||
@@ -336,3 +335,3 @@ } | ||
parquetType: 'INT64', | ||
value: [...value.buffer] | ||
value: [...value.buffer], | ||
}; | ||
@@ -390,5 +389,5 @@ } | ||
static async openFile(filePath, options) { | ||
let fileStat = await parquet_util.fstat(filePath); | ||
let fileDescriptor = await parquet_util.fopen(filePath); | ||
let readFn = (offset, length, file) => { | ||
const fileStat = await parquet_util.fstat(filePath); | ||
const fileDescriptor = await parquet_util.fopen(filePath); | ||
const readFn = (offset, length, file) => { | ||
if (file) { | ||
@@ -399,7 +398,7 @@ return Promise.reject('external references are not supported'); | ||
}; | ||
let closeFn = parquet_util.fclose.bind(undefined, fileDescriptor); | ||
const closeFn = parquet_util.fclose.bind(undefined, fileDescriptor); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size, options); | ||
} | ||
static async openBuffer(buffer, options) { | ||
let readFn = (offset, length, file) => { | ||
const readFn = (offset, length, file) => { | ||
if (file) { | ||
@@ -410,16 +409,19 @@ return Promise.reject('external references are not supported'); | ||
}; | ||
let closeFn = () => ({}); | ||
const closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, buffer.length, options); | ||
} | ||
static async openS3(client, params, options) { | ||
let fileStat = async () => client.headObject(params).promise().then((d) => d.ContentLength); | ||
let readFn = async (offset, length, file) => { | ||
const fileStat = async () => client | ||
.headObject(params) | ||
.promise() | ||
.then((d) => d.ContentLength); | ||
const readFn = async (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
let Range = `bytes=${offset}-${offset + length - 1}`; | ||
let res = await client.getObject(Object.assign({ Range }, params)).promise(); | ||
const Range = `bytes=${offset}-${offset + length - 1}`; | ||
const res = await client.getObject(Object.assign({ Range }, params)).promise(); | ||
return Promise.resolve(res.Body); | ||
}; | ||
let closeFn = () => ({}); | ||
const closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options); | ||
@@ -430,3 +432,6 @@ } | ||
try { | ||
let headObjectCommand = await client.send(new client_s3_1.HeadObjectCommand(params)); | ||
const headObjectCommand = await client.send(new client_s3_1.HeadObjectCommand(params)); | ||
if (headObjectCommand.ContentLength === undefined) { | ||
throw new Error('Content Length is undefined!'); | ||
} | ||
return Promise.resolve(headObjectCommand.ContentLength); | ||
@@ -436,3 +441,3 @@ } | ||
// having params match command names makes e.message clear to user | ||
return Promise.reject("rejected headObjectCommand: " + e.message); | ||
return Promise.reject('rejected headObjectCommand: ' + e.message); | ||
} | ||
@@ -442,3 +447,3 @@ }; | ||
if (file) { | ||
return Promise.reject("external references are not supported"); | ||
return Promise.reject('external references are not supported'); | ||
} | ||
@@ -454,3 +459,3 @@ const Range = `bytes=${offset}-${offset + length - 1}`; | ||
}; | ||
let closeFn = () => ({}); | ||
const closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options); | ||
@@ -469,5 +474,5 @@ } | ||
const chunks = []; | ||
readable.on("data", (chunk) => chunks.push(chunk)); | ||
readable.on("error", reject); | ||
readable.on("end", () => resolve(Buffer.concat(chunks))); | ||
readable.on('data', (chunk) => chunks.push(chunk)); | ||
readable.on('error', reject); | ||
readable.on('end', () => resolve(Buffer.concat(chunks))); | ||
}); | ||
@@ -487,11 +492,11 @@ } | ||
const base = baseArr.slice(0, baseArr.length - 1).join('/') + '/'; | ||
let defaultHeaders = params.headers || {}; | ||
let filesize = async () => { | ||
const defaultHeaders = params.headers || {}; | ||
const filesize = async () => { | ||
const { headers } = await (0, cross_fetch_1.default)(params.url); | ||
return headers.get('Content-Length'); | ||
return Number(headers.get('Content-Length')) || 0; | ||
}; | ||
let readFn = async (offset, length, file) => { | ||
let url = file ? base + file : params.url; | ||
let range = `bytes=${offset}-${offset + length - 1}`; | ||
let headers = Object.assign({}, defaultHeaders, { range }); | ||
const readFn = async (offset, length, file) => { | ||
const url = file ? base + file : params.url; | ||
const range = `bytes=${offset}-${offset + length - 1}`; | ||
const headers = Object.assign({}, defaultHeaders, { range }); | ||
const response = await (0, cross_fetch_1.default)(url, { headers }); | ||
@@ -502,3 +507,3 @@ const arrayBuffer = await response.arrayBuffer(); | ||
}; | ||
let closeFn = () => ({}); | ||
const closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, filesize, options); | ||
@@ -543,3 +548,3 @@ } | ||
} | ||
column = parsedRowGroup.columns.find(d => d.meta_data.path_in_schema.join(',') === path); | ||
column = parsedRowGroup.columns.find((d) => d.meta_data.path_in_schema.join(',') === path); | ||
if (!column) { | ||
@@ -558,9 +563,9 @@ throw `Column ${path} Not Found`; | ||
} | ||
return row_groups.flatMap((rowGroup, index) => paths.map(columnName => ({ | ||
return row_groups.flatMap((rowGroup, index) => paths.map((columnName) => ({ | ||
rowGroupIndex: index, | ||
column: this.getColumn(columnName, rowGroup) | ||
column: this.getColumn(columnName, rowGroup), | ||
}))); | ||
} | ||
readOffsetIndex(path, row_group, opts) { | ||
let column = this.getColumn(path, row_group); | ||
const column = this.getColumn(path, row_group); | ||
if (column.offsetIndex) { | ||
@@ -573,3 +578,3 @@ return Promise.resolve(column.offsetIndex); | ||
const data = this.read(+column.offset_index_offset, column.offset_index_length).then((data) => { | ||
let offset_index = new parquet_types_1.default.OffsetIndex(); | ||
const offset_index = new parquet_types_1.default.OffsetIndex(); | ||
parquet_util.decodeThrift(offset_index, data); | ||
@@ -585,3 +590,3 @@ Object.defineProperty(offset_index, 'column', { value: column, enumerable: false }); | ||
readColumnIndex(path, row_group, opts) { | ||
let column = this.getColumn(path, row_group); | ||
const column = this.getColumn(path, row_group); | ||
if (column.columnIndex) { | ||
@@ -594,3 +599,3 @@ return Promise.resolve(column.columnIndex); | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then((buf) => { | ||
let column_index = new parquet_types_1.default.ColumnIndex(); | ||
const column_index = new parquet_types_1.default.ColumnIndex(); | ||
parquet_util.decodeThrift(column_index, buf); | ||
@@ -601,6 +606,6 @@ Object.defineProperty(column_index, 'column', { value: column }); | ||
if (column_index.max_values) { | ||
column_index.max_values = column_index.max_values.map(max_value => decodeStatisticsValue(max_value, field)); | ||
column_index.max_values = column_index.max_values.map((max_value) => decodeStatisticsValue(max_value, field)); | ||
} | ||
if (column_index.min_values) { | ||
column_index.min_values = column_index.min_values.map(min_value => decodeStatisticsValue(min_value, field)); | ||
column_index.min_values = column_index.min_values.map((min_value) => decodeStatisticsValue(min_value, field)); | ||
} | ||
@@ -631,4 +636,4 @@ return column_index; | ||
Object.defineProperty(chunk, 'column', { value: column }); | ||
let data = { | ||
columnData: { [chunk.column.meta_data.path_in_schema.join(',')]: chunk } | ||
const data = { | ||
columnData: { [chunk.column.meta_data.path_in_schema.join(',')]: chunk }, | ||
}; | ||
@@ -638,9 +643,9 @@ return parquet_shredder.materializeRecords(this.schema, data, records); | ||
async readRowGroup(schema, rowGroup, columnList) { | ||
var buffer = { | ||
const buffer = { | ||
rowCount: +rowGroup.num_rows, | ||
columnData: {}, | ||
pageRowCount: 0, | ||
pages: {} | ||
pages: {}, | ||
}; | ||
for (let colChunk of rowGroup.columns) { | ||
for (const colChunk of rowGroup.columns) { | ||
const colMetadata = colChunk.meta_data; | ||
@@ -656,7 +661,7 @@ const colKey = colMetadata.path_in_schema; | ||
async readColumnChunk(schema, colChunk, opts) { | ||
let metadata = colChunk.meta_data; | ||
let field = schema.findField(metadata.path_in_schema); | ||
let type = parquet_util.getThriftEnum(parquet_types_1.default.Type, metadata.type); | ||
let compression = parquet_util.getThriftEnum(parquet_types_1.default.CompressionCodec, metadata.codec); | ||
let pagesOffset = +metadata.data_page_offset; | ||
const metadata = colChunk.meta_data; | ||
const field = schema.findField(metadata.path_in_schema); | ||
const type = parquet_util.getThriftEnum(parquet_types_1.default.Type, metadata.type); | ||
const compression = parquet_util.getThriftEnum(parquet_types_1.default.CompressionCodec, metadata.codec); | ||
const pagesOffset = +metadata.data_page_offset; | ||
let pagesSize = +metadata.total_compressed_size; | ||
@@ -672,3 +677,3 @@ if (!colChunk.file_path) { | ||
column: field, | ||
num_values: metadata.num_values | ||
num_values: metadata.num_values, | ||
}); | ||
@@ -680,3 +685,3 @@ // If this exists and is greater than zero then we need to have an offset | ||
await this.read(offset, size, colChunk.file_path).then(async (buffer) => { | ||
await decodePage({ offset: 0, buffer, size: buffer.length }, opts).then(dict => { | ||
await decodePage({ offset: 0, buffer, size: buffer.length }, opts).then((dict) => { | ||
opts.dictionary = opts.dictionary || dict.dictionary; | ||
@@ -692,15 +697,15 @@ }); | ||
} | ||
let trailerLen = PARQUET_MAGIC.length + 4; | ||
let offset = this.fileSize - trailerLen; | ||
let trailerBuf = await this.read(offset, trailerLen); | ||
const trailerLen = PARQUET_MAGIC.length + 4; | ||
const offset = this.fileSize - trailerLen; | ||
const trailerBuf = await this.read(offset, trailerLen); | ||
if (trailerBuf.subarray(4).toString() != PARQUET_MAGIC) { | ||
throw 'not a valid parquet file'; | ||
} | ||
let metadataSize = trailerBuf.readUInt32LE(0); | ||
let metadataOffset = this.fileSize - metadataSize - trailerLen; | ||
const metadataSize = trailerBuf.readUInt32LE(0); | ||
const metadataOffset = this.fileSize - metadataSize - trailerLen; | ||
if (metadataOffset < PARQUET_MAGIC.length) { | ||
throw 'invalid metadata size'; | ||
} | ||
let metadataBuf = await this.read(metadataOffset, metadataSize); | ||
let metadata = new parquet_types_1.default.FileMetaData(); | ||
const metadataBuf = await this.read(metadataOffset, metadataSize); | ||
const metadata = new parquet_types_1.default.FileMetaData(); | ||
parquet_util.decodeThrift(metadata, metadataBuf); | ||
@@ -770,5 +775,4 @@ return metadata; | ||
case 'DICTIONARY_PAGE': | ||
const dict = await decodeDictionaryPage(cursor, pageHeader, opts); | ||
page = { | ||
dictionary: dict | ||
dictionary: await decodeDictionaryPage(cursor, pageHeader, opts), | ||
}; | ||
@@ -786,8 +790,8 @@ break; | ||
opts = opts || {}; | ||
let cursor = { | ||
const cursor = { | ||
buffer: buffer, | ||
offset: 0, | ||
size: buffer.length | ||
size: buffer.length, | ||
}; | ||
let data = { | ||
const data = { | ||
rlevels: [], | ||
@@ -797,3 +801,3 @@ dlevels: [], | ||
pageHeaders: [], | ||
count: 0 | ||
count: 0, | ||
}; | ||
@@ -810,9 +814,9 @@ while (cursor.offset < cursor.size && (!opts.num_values || data.dlevels.length < opts.num_values)) { | ||
if (opts.dictionary && pageData.useDictionary) { | ||
pageData.values = pageData.values.map(d => opts.dictionary[d]); | ||
pageData.values = pageData.values.map((d) => opts.dictionary[d]); | ||
} | ||
let length = pageData.rlevels != undefined ? pageData.rlevels.length : 0; | ||
const length = pageData.rlevels != undefined ? pageData.rlevels.length : 0; | ||
for (let i = 0; i < length; i++) { | ||
data.rlevels.push(pageData.rlevels[i]); | ||
data.dlevels.push(pageData.dlevels[i]); | ||
let value = pageData.values[i]; | ||
const value = pageData.values[i]; | ||
if (value !== undefined) { | ||
@@ -832,15 +836,14 @@ data.values.push(value); | ||
buffer: cursor.buffer.subarray(cursor.offset, cursorEnd), | ||
size: cursorEnd - cursor.offset | ||
size: cursorEnd - cursor.offset, | ||
}; | ||
cursor.offset = cursorEnd; | ||
if (opts.compression && opts.compression !== 'UNCOMPRESSED') { | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, dictCursor.buffer.subarray(dictCursor.offset, cursorEnd)); | ||
const valuesBuf = await parquet_compression.inflate(opts.compression, dictCursor.buffer.subarray(dictCursor.offset, cursorEnd)); | ||
dictCursor = { | ||
buffer: valuesBuf, | ||
offset: 0, | ||
size: valuesBuf.length | ||
size: valuesBuf.length, | ||
}; | ||
} | ||
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, (header.dictionary_page_header).num_values, opts) | ||
.map((d) => d.toString()); | ||
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, header.dictionary_page_header.num_values, opts).map((d) => d.toString()); | ||
} | ||
@@ -850,18 +853,20 @@ async function decodeDataPage(cursor, header, opts) { | ||
const dataPageHeader = header.data_page_header; | ||
let valueCount = dataPageHeader.num_values; | ||
let valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.encoding); | ||
const valueCount = dataPageHeader.num_values; | ||
const valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.encoding); | ||
let valuesBufCursor = cursor; | ||
if (opts.compression && opts.compression !== 'UNCOMPRESSED') { | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.subarray(cursor.offset, cursorEnd)); | ||
const valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.subarray(cursor.offset, cursorEnd)); | ||
valuesBufCursor = { | ||
buffer: valuesBuf, | ||
offset: 0, | ||
size: valuesBuf.length | ||
size: valuesBuf.length, | ||
}; | ||
} | ||
/* read repetition levels */ | ||
let rLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.repetition_level_encoding); | ||
const rLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.repetition_level_encoding); | ||
let rLevels = new Array(valueCount); | ||
if (opts.rLevelMax > 0) { | ||
rLevels = decodeValues(PARQUET_RDLVL_TYPE, rLevelEncoding, valuesBufCursor, valueCount, { bitWidth: parquet_util.getBitWidth(opts.rLevelMax) }); | ||
rLevels = decodeValues(PARQUET_RDLVL_TYPE, rLevelEncoding, valuesBufCursor, valueCount, { | ||
bitWidth: parquet_util.getBitWidth(opts.rLevelMax), | ||
}); | ||
} | ||
@@ -872,6 +877,8 @@ else { | ||
/* read definition levels */ | ||
let dLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.definition_level_encoding); | ||
const dLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.definition_level_encoding); | ||
let dLevels = new Array(valueCount); | ||
if (opts.dLevelMax > 0) { | ||
dLevels = decodeValues(PARQUET_RDLVL_TYPE, dLevelEncoding, valuesBufCursor, valueCount, { bitWidth: parquet_util.getBitWidth(opts.dLevelMax) }); | ||
dLevels = decodeValues(PARQUET_RDLVL_TYPE, dLevelEncoding, valuesBufCursor, valueCount, { | ||
bitWidth: parquet_util.getBitWidth(opts.dLevelMax), | ||
}); | ||
} | ||
@@ -883,3 +890,3 @@ else { | ||
let valueCountNonNull = 0; | ||
for (let dlvl of dLevels) { | ||
for (const dlvl of dLevels) { | ||
if (dlvl === opts.dLevelMax) { | ||
@@ -889,3 +896,3 @@ ++valueCountNonNull; | ||
} | ||
let values = decodeValues(opts.type, valueEncoding, valuesBufCursor, valueCountNonNull, { | ||
const values = decodeValues(opts.type, valueEncoding, valuesBufCursor, valueCountNonNull, { | ||
typeLength: opts.column.typeLength, | ||
@@ -897,3 +904,3 @@ bitWidth: opts.column.typeLength, | ||
scale: opts.column.scale, | ||
name: opts.column.name | ||
name: opts.column.name, | ||
}); | ||
@@ -906,3 +913,3 @@ cursor.offset = cursorEnd; | ||
count: valueCount, | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY', | ||
}; | ||
@@ -921,3 +928,3 @@ } | ||
bitWidth: parquet_util.getBitWidth(opts.rLevelMax), | ||
disableEnvelope: true | ||
disableEnvelope: true, | ||
}); | ||
@@ -933,3 +940,3 @@ } | ||
bitWidth: parquet_util.getBitWidth(opts.dLevelMax), | ||
disableEnvelope: true | ||
disableEnvelope: true, | ||
}); | ||
@@ -943,13 +950,13 @@ } | ||
if (dataPageHeaderV2.is_compressed) { | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.subarray(cursor.offset, cursorEnd)); | ||
const valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.subarray(cursor.offset, cursorEnd)); | ||
valuesBufCursor = { | ||
buffer: valuesBuf, | ||
offset: 0, | ||
size: valuesBuf.length | ||
size: valuesBuf.length, | ||
}; | ||
cursor.offset = cursorEnd; | ||
} | ||
let values = decodeValues(opts.type, valueEncoding, valuesBufCursor, valueCountNonNull, { | ||
const values = decodeValues(opts.type, valueEncoding, valuesBufCursor, valueCountNonNull, { | ||
bitWidth: opts.column.typeLength, | ||
...opts.column | ||
...opts.column, | ||
}); | ||
@@ -961,3 +968,3 @@ return { | ||
count: valueCount, | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY', | ||
}; | ||
@@ -967,4 +974,4 @@ } | ||
let schema = {}; | ||
schemaElements.forEach(schemaElement => { | ||
let repetitionType = parquet_util.getThriftEnum(parquet_types_1.default.FieldRepetitionType, schemaElement.repetition_type); | ||
schemaElements.forEach((schemaElement) => { | ||
const repetitionType = parquet_util.getThriftEnum(parquet_types_1.default.FieldRepetitionType, schemaElement.repetition_type); | ||
let optional = false; | ||
@@ -982,3 +989,2 @@ let repeated = false; | ||
} | ||
; | ||
if (schemaElement.num_children != undefined && schemaElement.num_children > 0) { | ||
@@ -992,9 +998,9 @@ schema[schemaElement.name] = { | ||
value: schema, | ||
enumerable: false | ||
enumerable: false, | ||
}, | ||
num_children: { | ||
value: schemaElement.num_children, | ||
enumerable: false | ||
} | ||
}) | ||
enumerable: false, | ||
}, | ||
}), | ||
}; | ||
@@ -1015,3 +1021,3 @@ /* move the schema pointer to the children */ | ||
scale: schemaElement.scale, | ||
precision: schemaElement.precision | ||
precision: schemaElement.precision, | ||
}; | ||
@@ -1018,0 +1024,0 @@ } |
@@ -9,3 +9,3 @@ import { SchemaDefinition, ParquetField } from './declare'; | ||
fields: Record<string, ParquetField>; | ||
fieldList: Array<ParquetField>; | ||
fieldList: ParquetField[]; | ||
/** | ||
@@ -22,7 +22,7 @@ * Create a new schema from JSON Schema (json-schema.org) | ||
*/ | ||
findField(path: string | Array<string>): ParquetField; | ||
findField(path: string | string[]): ParquetField; | ||
/** | ||
* Retrieve a field definition and all the field's ancestors | ||
*/ | ||
findFieldBranch(path: string | Array<string>): ParquetField[]; | ||
findFieldBranch(path: string | string[]): ParquetField[]; | ||
} |
@@ -31,3 +31,2 @@ "use strict"; | ||
const jsonSchema_1 = require("./jsonSchema"); | ||
const PARQUET_COLUMN_KEY_SEPARATOR = '.'; | ||
/** | ||
@@ -60,3 +59,3 @@ * A parquet file schema | ||
if (typeof path === 'string') { | ||
path = path.split(","); | ||
path = path.split(','); | ||
} | ||
@@ -68,3 +67,3 @@ else { | ||
for (; path.length > 1; path.shift()) { | ||
let fields = n[path[0]]?.fields; | ||
const fields = n[path[0]]?.fields; | ||
if (isDefined(fields)) { | ||
@@ -81,9 +80,9 @@ n = fields; | ||
if (typeof path === 'string') { | ||
path = path.split(","); | ||
path = path.split(','); | ||
} | ||
let branch = []; | ||
const branch = []; | ||
let n = this.fields; | ||
for (; path.length > 0; path.shift()) { | ||
branch.push(n[path[0]]); | ||
let fields = n[path[0]].fields; | ||
const fields = n[path[0]].fields; | ||
if (path.length > 1 && isDefined(fields)) { | ||
@@ -97,3 +96,2 @@ n = fields; | ||
exports.ParquetSchema = ParquetSchema; | ||
; | ||
function buildFields(schema, rLevelParentMax, dLevelParentMax, path) { | ||
@@ -109,5 +107,5 @@ if (!rLevelParentMax) { | ||
} | ||
let fieldList = {}; | ||
const fieldList = {}; | ||
let fieldErrors = []; | ||
for (let name in schema) { | ||
for (const name in schema) { | ||
const opts = schema[name]; | ||
@@ -142,3 +140,3 @@ /* field repetition type */ | ||
fieldCount: Object.keys(opts.fields).length, | ||
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name)) | ||
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name)), | ||
}; | ||
@@ -149,3 +147,3 @@ if (opts.type == 'LIST' || opts.type == 'MAP') | ||
} | ||
let nameWithPath = (`${name}` || 'missing name'); | ||
let nameWithPath = `${name}` || 'missing name'; | ||
if (path && path.length > 0) { | ||
@@ -156,3 +154,3 @@ nameWithPath = `${path}.${nameWithPath}`; | ||
if (!typeDef) { | ||
fieldErrors.push(`Invalid parquet type: ${(opts.type || "missing type")}, for Column: ${nameWithPath}`); | ||
fieldErrors.push(`Invalid parquet type: ${opts.type || 'missing type'}, for Column: ${nameWithPath}`); | ||
continue; | ||
@@ -175,3 +173,3 @@ } | ||
// Default scale to 0 per https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal | ||
if (typeof opts.scale === "undefined") | ||
if (typeof opts.scale === 'undefined') | ||
opts.scale = 0; | ||
@@ -194,3 +192,3 @@ fieldErrors = fieldErrors.concat(errorsForDecimalOpts(typeDef.originalType, typeDef.primitiveType, opts, nameWithPath)); | ||
rLevelMax: rLevelMax, | ||
dLevelMax: dLevelMax | ||
dLevelMax: dLevelMax, | ||
}; | ||
@@ -205,3 +203,3 @@ } | ||
let list = []; | ||
for (let k in fields) { | ||
for (const k in fields) { | ||
list.push(fields[k]); | ||
@@ -226,6 +224,6 @@ const nestedFields = fields[k].fields; | ||
} | ||
else if (primitiveType === "INT64" && opts.precision > 18) { | ||
else if (primitiveType === 'INT64' && opts.precision > 18) { | ||
fieldErrors.push(`invalid schema for type: ${type} and primitive type: ${primitiveType} for Column: ${columnName}, can not handle precision over 18`); | ||
} | ||
if (typeof opts.scale === "undefined" || opts.scale < 0) { | ||
if (typeof opts.scale === 'undefined' || opts.scale < 0) { | ||
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, scale is required to be 0 or greater`); | ||
@@ -232,0 +230,0 @@ } |
@@ -53,2 +53,2 @@ import { ParquetSchema } from './schema'; | ||
*/ | ||
export declare const materializeRecords: (schema: ParquetSchema, buffer: RecordBuffer, records?: Array<Record<string, unknown>>) => Record<string, unknown>[]; | ||
export declare const materializeRecords: (schema: ParquetSchema, buffer: RecordBuffer, records?: Record<string, unknown>[]) => Record<string, unknown>[]; |
@@ -30,4 +30,4 @@ "use strict"; | ||
/* shred the record, this may raise an exception */ | ||
var recordShredded = {}; | ||
for (let field of schema.fieldList) { | ||
const recordShredded = {}; | ||
for (const field of schema.fieldList) { | ||
recordShredded[field.path.join(',')] = { | ||
@@ -38,3 +38,3 @@ dlevels: [], | ||
distinct_values: new Set(), | ||
count: 0 | ||
count: 0, | ||
}; | ||
@@ -49,4 +49,4 @@ } | ||
buffer.pages = {}; | ||
for (let field of schema.fieldList) { | ||
let path = field.path.join(','); | ||
for (const field of schema.fieldList) { | ||
const path = field.path.join(','); | ||
buffer.columnData[path] = { | ||
@@ -57,3 +57,3 @@ dlevels: [], | ||
distinct_values: new Set(), | ||
count: 0 | ||
count: 0, | ||
}; | ||
@@ -65,6 +65,6 @@ buffer.pages[path] = []; | ||
buffer.pageRowCount += 1; | ||
for (let field of schema.fieldList) { | ||
let path = field.path.join(','); | ||
let record = recordShredded[path]; | ||
let column = buffer.columnData[path]; | ||
for (const field of schema.fieldList) { | ||
const path = field.path.join(','); | ||
const record = recordShredded[path]; | ||
const column = buffer.columnData[path]; | ||
for (let i = 0; i < record.rlevels.length; i++) { | ||
@@ -77,3 +77,3 @@ column.rlevels.push(record.rlevels[i]); | ||
} | ||
[...recordShredded[path].distinct_values].forEach(value => buffer.columnData[path].distinct_values.add(value)); | ||
[...recordShredded[path].distinct_values].forEach((value) => buffer.columnData[path].distinct_values.add(value)); | ||
buffer.columnData[path].count += recordShredded[path].count; | ||
@@ -84,3 +84,3 @@ } | ||
function shredRecordInternal(fields, record, data, rlvl, dlvl) { | ||
for (let fieldName in fields) { | ||
for (const fieldName in fields) { | ||
const field = fields[fieldName]; | ||
@@ -91,7 +91,8 @@ const fieldType = field.originalType || field.primitiveType; | ||
let values = []; | ||
if (record && (fieldName in record) && record[fieldName] !== undefined && record[fieldName] !== null) { | ||
if (record && fieldName in record && record[fieldName] !== undefined && record[fieldName] !== null) { | ||
if (Array.isArray(record[fieldName])) { | ||
values = record[fieldName]; | ||
} | ||
else if (ArrayBuffer.isView(record[fieldName])) { // checks if any typed array | ||
else if (ArrayBuffer.isView(record[fieldName])) { | ||
// checks if any typed array | ||
if (record[fieldName] instanceof Uint8Array) { | ||
@@ -168,7 +169,7 @@ // wrap in a buffer, since not supported by parquet_thrift | ||
} | ||
for (let k in buffer.columnData) { | ||
for (const k in buffer.columnData) { | ||
const field = schema.findField(k); | ||
const fieldBranch = schema.findFieldBranch(k); | ||
let values = buffer.columnData[k].values[Symbol.iterator](); | ||
let rLevels = new Array(field.rLevelMax + 1); | ||
const values = buffer.columnData[k].values[Symbol.iterator](); | ||
const rLevels = new Array(field.rLevelMax + 1); | ||
rLevels.fill(0); | ||
@@ -199,3 +200,3 @@ for (let i = 0; i < buffer.columnData[k].count; ++i) { | ||
if (branch.length > 1) { | ||
if (node.repetitionType === "REPEATED") { | ||
if (node.repetitionType === 'REPEATED') { | ||
if (!(node.name in record)) { | ||
@@ -217,3 +218,3 @@ record[node.name] = []; | ||
else { | ||
if (node.repetitionType === "REPEATED") { | ||
if (node.repetitionType === 'REPEATED') { | ||
if (!(node.name in record)) { | ||
@@ -220,0 +221,0 @@ record[node.name] = []; |
@@ -1,10 +0,10 @@ | ||
import { PrimitiveType, OriginalType, ParquetType, FieldDefinition, ParquetField } from "./declare"; | ||
import { Options } from "./codec/types"; | ||
type ParquetTypeDataObject = { | ||
import { PrimitiveType, OriginalType, ParquetType, FieldDefinition, ParquetField } from './declare'; | ||
import { Options } from './codec/types'; | ||
interface ParquetTypeDataObject { | ||
primitiveType?: PrimitiveType; | ||
toPrimitive: Function; | ||
fromPrimitive?: Function; | ||
toPrimitive: (x: any) => any; | ||
fromPrimitive?: (x: any) => any; | ||
originalType?: OriginalType; | ||
typeLength?: number; | ||
}; | ||
} | ||
export declare function getParquetTypeDataObject(type: ParquetType, field?: ParquetField | Options | FieldDefinition): ParquetTypeDataObject; | ||
@@ -11,0 +11,0 @@ /** |
'use strict'; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.fromPrimitive = exports.toPrimitive = exports.getParquetTypeDataObject = void 0; | ||
// Thanks to https://github.com/kbajalc/parquets for some of the code. | ||
const BSON = __importStar(require("bson")); | ||
exports.getParquetTypeDataObject = getParquetTypeDataObject; | ||
exports.toPrimitive = toPrimitive; | ||
exports.fromPrimitive = fromPrimitive; | ||
// BSON uses top level awaits, so use require for now | ||
const bsonSerialize = require('bson').serialize; | ||
const bsonDeserialize = require('bson').deserialize; | ||
function getParquetTypeDataObject(type, field) { | ||
@@ -36,3 +16,3 @@ if (type === 'DECIMAL') { | ||
typeLength: field.typeLength, | ||
toPrimitive: toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL | ||
toPrimitive: toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL, | ||
}; | ||
@@ -45,3 +25,3 @@ } | ||
typeLength: field.typeLength, | ||
toPrimitive: toPrimitive_BYTE_ARRAY_DECIMAL | ||
toPrimitive: toPrimitive_BYTE_ARRAY_DECIMAL, | ||
}; | ||
@@ -53,3 +33,3 @@ } | ||
originalType: 'DECIMAL', | ||
toPrimitive: toPrimitive_INT64 | ||
toPrimitive: toPrimitive_INT64, | ||
}; | ||
@@ -62,3 +42,2 @@ } | ||
} | ||
exports.getParquetTypeDataObject = getParquetTypeDataObject; | ||
const PARQUET_LOGICAL_TYPES = new Set([ | ||
@@ -93,131 +72,131 @@ 'BOOLEAN', | ||
'MAP', | ||
'LIST' | ||
'LIST', | ||
]); | ||
const PARQUET_LOGICAL_TYPE_DATA = { | ||
'BOOLEAN': { | ||
BOOLEAN: { | ||
primitiveType: 'BOOLEAN', | ||
toPrimitive: toPrimitive_BOOLEAN, | ||
fromPrimitive: fromPrimitive_BOOLEAN | ||
fromPrimitive: fromPrimitive_BOOLEAN, | ||
}, | ||
'INT32': { | ||
INT32: { | ||
primitiveType: 'INT32', | ||
toPrimitive: toPrimitive_INT32 | ||
toPrimitive: toPrimitive_INT32, | ||
}, | ||
'INT64': { | ||
INT64: { | ||
primitiveType: 'INT64', | ||
toPrimitive: toPrimitive_INT64 | ||
toPrimitive: toPrimitive_INT64, | ||
}, | ||
'INT96': { | ||
INT96: { | ||
primitiveType: 'INT96', | ||
toPrimitive: toPrimitive_INT96 | ||
toPrimitive: toPrimitive_INT96, | ||
}, | ||
'FLOAT': { | ||
FLOAT: { | ||
primitiveType: 'FLOAT', | ||
toPrimitive: toPrimitive_FLOAT | ||
toPrimitive: toPrimitive_FLOAT, | ||
}, | ||
'DOUBLE': { | ||
DOUBLE: { | ||
primitiveType: 'DOUBLE', | ||
toPrimitive: toPrimitive_DOUBLE | ||
toPrimitive: toPrimitive_DOUBLE, | ||
}, | ||
'BYTE_ARRAY': { | ||
BYTE_ARRAY: { | ||
primitiveType: 'BYTE_ARRAY', | ||
toPrimitive: toPrimitive_BYTE_ARRAY | ||
toPrimitive: toPrimitive_BYTE_ARRAY, | ||
}, | ||
'FIXED_LEN_BYTE_ARRAY': { | ||
FIXED_LEN_BYTE_ARRAY: { | ||
primitiveType: 'FIXED_LEN_BYTE_ARRAY', | ||
toPrimitive: toPrimitive_BYTE_ARRAY | ||
toPrimitive: toPrimitive_BYTE_ARRAY, | ||
}, | ||
'UTF8': { | ||
UTF8: { | ||
primitiveType: 'BYTE_ARRAY', | ||
originalType: 'UTF8', | ||
toPrimitive: toPrimitive_UTF8, | ||
fromPrimitive: fromPrimitive_UTF8 | ||
fromPrimitive: fromPrimitive_UTF8, | ||
}, | ||
'ENUM': { | ||
ENUM: { | ||
primitiveType: 'BYTE_ARRAY', | ||
originalType: 'UTF8', | ||
toPrimitive: toPrimitive_UTF8, | ||
fromPrimitive: fromPrimitive_UTF8 | ||
fromPrimitive: fromPrimitive_UTF8, | ||
}, | ||
'TIME_MILLIS': { | ||
TIME_MILLIS: { | ||
primitiveType: 'INT32', | ||
originalType: 'TIME_MILLIS', | ||
toPrimitive: toPrimitive_TIME_MILLIS | ||
toPrimitive: toPrimitive_TIME_MILLIS, | ||
}, | ||
'TIME_MICROS': { | ||
TIME_MICROS: { | ||
primitiveType: 'INT64', | ||
originalType: 'TIME_MICROS', | ||
toPrimitive: toPrimitive_TIME_MICROS | ||
toPrimitive: toPrimitive_TIME_MICROS, | ||
}, | ||
'DATE': { | ||
DATE: { | ||
primitiveType: 'INT32', | ||
originalType: 'DATE', | ||
toPrimitive: toPrimitive_DATE, | ||
fromPrimitive: fromPrimitive_DATE | ||
fromPrimitive: fromPrimitive_DATE, | ||
}, | ||
'TIMESTAMP_MILLIS': { | ||
TIMESTAMP_MILLIS: { | ||
primitiveType: 'INT64', | ||
originalType: 'TIMESTAMP_MILLIS', | ||
toPrimitive: toPrimitive_TIMESTAMP_MILLIS, | ||
fromPrimitive: fromPrimitive_TIMESTAMP_MILLIS | ||
fromPrimitive: fromPrimitive_TIMESTAMP_MILLIS, | ||
}, | ||
'TIMESTAMP_MICROS': { | ||
TIMESTAMP_MICROS: { | ||
primitiveType: 'INT64', | ||
originalType: 'TIMESTAMP_MICROS', | ||
toPrimitive: toPrimitive_TIMESTAMP_MICROS, | ||
fromPrimitive: fromPrimitive_TIMESTAMP_MICROS | ||
fromPrimitive: fromPrimitive_TIMESTAMP_MICROS, | ||
}, | ||
'UINT_8': { | ||
UINT_8: { | ||
primitiveType: 'INT32', | ||
originalType: 'UINT_8', | ||
toPrimitive: toPrimitive_UINT8 | ||
toPrimitive: toPrimitive_UINT8, | ||
}, | ||
'UINT_16': { | ||
UINT_16: { | ||
primitiveType: 'INT32', | ||
originalType: 'UINT_16', | ||
toPrimitive: toPrimitive_UINT16 | ||
toPrimitive: toPrimitive_UINT16, | ||
}, | ||
'UINT_32': { | ||
UINT_32: { | ||
primitiveType: 'INT32', | ||
originalType: 'UINT_32', | ||
toPrimitive: toPrimitive_UINT32 | ||
toPrimitive: toPrimitive_UINT32, | ||
}, | ||
'UINT_64': { | ||
UINT_64: { | ||
primitiveType: 'INT64', | ||
originalType: 'UINT_64', | ||
toPrimitive: toPrimitive_UINT64 | ||
toPrimitive: toPrimitive_UINT64, | ||
}, | ||
'INT_8': { | ||
INT_8: { | ||
primitiveType: 'INT32', | ||
originalType: 'INT_8', | ||
toPrimitive: toPrimitive_INT8 | ||
toPrimitive: toPrimitive_INT8, | ||
}, | ||
'INT_16': { | ||
INT_16: { | ||
primitiveType: 'INT32', | ||
originalType: 'INT_16', | ||
toPrimitive: toPrimitive_INT16 | ||
toPrimitive: toPrimitive_INT16, | ||
}, | ||
'INT_32': { | ||
INT_32: { | ||
primitiveType: 'INT32', | ||
originalType: 'INT_32', | ||
toPrimitive: toPrimitive_INT32 | ||
toPrimitive: toPrimitive_INT32, | ||
}, | ||
'INT_64': { | ||
INT_64: { | ||
primitiveType: 'INT64', | ||
originalType: 'INT_64', | ||
toPrimitive: toPrimitive_INT64 | ||
toPrimitive: toPrimitive_INT64, | ||
}, | ||
'JSON': { | ||
JSON: { | ||
primitiveType: 'BYTE_ARRAY', | ||
originalType: 'JSON', | ||
toPrimitive: toPrimitive_JSON, | ||
fromPrimitive: fromPrimitive_JSON | ||
fromPrimitive: fromPrimitive_JSON, | ||
}, | ||
'BSON': { | ||
BSON: { | ||
primitiveType: 'BYTE_ARRAY', | ||
originalType: 'BSON', | ||
toPrimitive: toPrimitive_BSON, | ||
fromPrimitive: fromPrimitive_BSON | ||
fromPrimitive: fromPrimitive_BSON, | ||
}, | ||
'INTERVAL': { | ||
INTERVAL: { | ||
primitiveType: 'FIXED_LEN_BYTE_ARRAY', | ||
@@ -227,3 +206,3 @@ originalType: 'INTERVAL', | ||
toPrimitive: toPrimitive_INTERVAL, | ||
fromPrimitive: fromPrimitive_INTERVAL | ||
fromPrimitive: fromPrimitive_INTERVAL, | ||
}, | ||
@@ -237,3 +216,3 @@ MAP: { | ||
toPrimitive: toPrimitive_LIST, | ||
} | ||
}, | ||
}; | ||
@@ -254,7 +233,6 @@ /** | ||
if (!isParquetType(type)) { | ||
throw 'invalid type: ' + type || "undefined"; | ||
throw 'invalid type: ' + type || 'undefined'; | ||
} | ||
return getParquetTypeDataObject(type, field).toPrimitive(value); | ||
} | ||
exports.toPrimitive = toPrimitive; | ||
/** | ||
@@ -266,3 +244,3 @@ * Convert a value from it's internal/underlying primitive representation to | ||
if (!isParquetType(type)) { | ||
throw 'invalid type: ' + type || "undefined"; | ||
throw 'invalid type: ' + type || 'undefined'; | ||
} | ||
@@ -277,3 +255,2 @@ const typeFromPrimitive = getParquetTypeDataObject(type, field).fromPrimitive; | ||
} | ||
exports.fromPrimitive = fromPrimitive; | ||
function toPrimitive_BOOLEAN(value) { | ||
@@ -437,3 +414,3 @@ return !!value; | ||
function fromPrimitive_UTF8(value) { | ||
return (value !== undefined && value !== null) ? value.toString() : value; | ||
return value !== undefined && value !== null ? value.toString() : value; | ||
} | ||
@@ -447,6 +424,6 @@ function toPrimitive_JSON(value) { | ||
function toPrimitive_BSON(value) { | ||
return Buffer.from(BSON.serialize(value)); | ||
return Buffer.from(bsonSerialize(value)); | ||
} | ||
function fromPrimitive_BSON(value) { | ||
return BSON.deserialize(value); | ||
return bsonDeserialize(value); | ||
} | ||
@@ -456,6 +433,6 @@ function toNumberInternal(typeName, value) { | ||
switch (typeof value) { | ||
case "string": | ||
case 'string': | ||
numberValue = parseInt(value, 10); | ||
break; | ||
case "number": | ||
case 'number': | ||
numberValue = value; | ||
@@ -473,3 +450,3 @@ break; | ||
function toPrimitive_TIME_MILLIS(value) { | ||
return toNumberInternal("TIME_MILLIS", value); | ||
return toNumberInternal('TIME_MILLIS', value); | ||
} | ||
@@ -489,3 +466,3 @@ function toPrimitive_TIME_MICROS(value) { | ||
} | ||
return toNumberInternal("DATE", value); | ||
return toNumberInternal('DATE', value); | ||
} | ||
@@ -500,3 +477,3 @@ function fromPrimitive_DATE(value) { | ||
} | ||
return toNumberInternal("TIMESTAMP_MILLIS", value); | ||
return toNumberInternal('TIMESTAMP_MILLIS', value); | ||
} | ||
@@ -531,5 +508,5 @@ function fromPrimitive_TIMESTAMP_MILLIS(value) { | ||
if (!value.months || !value.days || !value.milliseconds) { | ||
throw "value for INTERVAL must be object { months: ..., days: ..., milliseconds: ... }"; | ||
throw 'value for INTERVAL must be object { months: ..., days: ..., milliseconds: ... }'; | ||
} | ||
let buf = Buffer.alloc(12); | ||
const buf = Buffer.alloc(12); | ||
buf.writeUInt32LE(value.months, 0); | ||
@@ -549,4 +526,4 @@ buf.writeUInt32LE(value.days, 4); | ||
if (v < lowerRange || v > upperRange) { | ||
throw "invalid value"; | ||
throw 'invalid value'; | ||
} | ||
} |
@@ -1,10 +0,7 @@ | ||
/// <reference types="node" /> | ||
/// <reference types="node" /> | ||
/// <reference types="node-int64" /> | ||
import thrift from "thrift"; | ||
import thrift from 'thrift'; | ||
import fs, { WriteStream } from 'fs'; | ||
import * as parquet_thrift from '../gen-nodejs/parquet_types'; | ||
import { FileMetaDataExt, WriterOptions } from './declare'; | ||
import { Int64 } from "thrift"; | ||
export type WriteStreamMinimal = Pick<WriteStream, "write" | "end">; | ||
import { Int64 } from 'thrift'; | ||
export type WriteStreamMinimal = Pick<WriteStream, 'write' | 'end'>; | ||
type Enums = typeof parquet_thrift.Encoding | typeof parquet_thrift.FieldRepetitionType | typeof parquet_thrift.Type | typeof parquet_thrift.CompressionCodec | typeof parquet_thrift.PageType | typeof parquet_thrift.ConvertedType; | ||
@@ -32,4 +29,4 @@ type ThriftObject = FileMetaDataExt | parquet_thrift.PageHeader | parquet_thrift.ColumnMetaData | parquet_thrift.BloomFilterHeader | parquet_thrift.OffsetIndex | parquet_thrift.ColumnIndex | FileMetaDataExt; | ||
export declare const osopen: (path: string | Buffer | URL, opts?: WriterOptions) => Promise<WriteStream>; | ||
export declare const fieldIndexOf: (arr: Array<Array<unknown>>, elem: Array<unknown>) => number; | ||
export declare const fieldIndexOf: (arr: unknown[][], elem: unknown[]) => number; | ||
export declare const cloneInteger: (int: Int64) => thrift.Int64; | ||
export {}; |
@@ -49,9 +49,13 @@ "use strict"; | ||
/** Patch PageLocation to be three element array that has getters/setters | ||
* for each of the properties (offset, compressed_page_size, first_row_index) | ||
* This saves space considerably as we do not need to store the full variable | ||
* names for every PageLocation | ||
*/ | ||
* for each of the properties (offset, compressed_page_size, first_row_index) | ||
* This saves space considerably as we do not need to store the full variable | ||
* names for every PageLocation | ||
*/ | ||
const getterSetter = (index) => ({ | ||
get: function () { return this[index]; }, | ||
set: function (value) { return this[index] = value; } | ||
get: function () { | ||
return this[index]; | ||
}, | ||
set: function (value) { | ||
return (this[index] = value); | ||
}, | ||
}); | ||
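A minimal standalone sketch of this pattern (illustrative only; `PageLocationLike` below is a stand-in, not the library's generated `PageLocation` class): the named properties are backed by fixed array slots, so each page location is stored as three numbers rather than an object carrying three property names.

```js
// Illustrative sketch: back named properties by array indices to save space.
const getterSetter = (index) => ({
  get() {
    return this[index];
  },
  set(value) {
    return (this[index] = value);
  },
});

class PageLocationLike extends Array {}
Object.defineProperty(PageLocationLike.prototype, 'offset', getterSetter(0));
Object.defineProperty(PageLocationLike.prototype, 'compressed_page_size', getterSetter(1));
Object.defineProperty(PageLocationLike.prototype, 'first_row_index', getterSetter(2));

const loc = new PageLocationLike();
loc.offset = 4096; // stored at loc[0]
loc.compressed_page_size = 512; // stored at loc[1]
loc.first_row_index = 0; // stored at loc[2]
console.log(Array.from(loc)); // [ 4096, 512, 0 ]
```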
@@ -65,9 +69,9 @@ Object.defineProperty(parquet_thrift.PageLocation.prototype, 'offset', getterSetter(0)); | ||
const serializeThrift = function (obj) { | ||
let output = []; | ||
const output = []; | ||
const callBack = function (buf) { | ||
output.push(buf); | ||
}; | ||
let transport = new thrift_1.default.TBufferedTransport(undefined, callBack); | ||
let protocol = new thrift_1.default.TCompactProtocol(transport); | ||
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872 | ||
const transport = new thrift_1.default.TBufferedTransport(undefined, callBack); | ||
const protocol = new thrift_1.default.TCompactProtocol(transport); | ||
//@ts-expect-error, https://issues.apache.org/jira/browse/THRIFT-3872 | ||
obj.write(protocol); | ||
@@ -82,6 +86,6 @@ transport.flush(); | ||
} | ||
var transport = new fixedTFramedTransport(buf); | ||
const transport = new fixedTFramedTransport(buf); | ||
transport.readPos = offset; | ||
var protocol = new thrift_1.default.TCompactProtocol(transport); | ||
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872 | ||
const protocol = new thrift_1.default.TCompactProtocol(transport); | ||
//@ts-expect-error, https://issues.apache.org/jira/browse/THRIFT-3872 | ||
obj.read(protocol); | ||
@@ -107,3 +111,3 @@ return transport.readPos - offset; | ||
const getThriftEnum = function (klass, value) { | ||
for (let k in klass) { | ||
for (const k in klass) { | ||
if (klass[k] === value) { | ||
@@ -143,3 +147,3 @@ return k; | ||
const fread = function (fd, position, length) { | ||
let buffer = Buffer.alloc(length); | ||
const buffer = Buffer.alloc(length); | ||
return new Promise((resolve, reject) => { | ||
@@ -198,4 +202,4 @@ fs_1.default.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => { | ||
return new Promise((resolve, reject) => { | ||
let outputStream = fs_1.default.createWriteStream(path, opts); | ||
outputStream.on('open', function (fd) { | ||
const outputStream = fs_1.default.createWriteStream(path, opts); | ||
outputStream.on('open', function (_fd) { | ||
resolve(outputStream); | ||
@@ -202,0 +206,0 @@ }); |
@@ -1,3 +0,1 @@ | ||
/// <reference types="node" /> | ||
/// <reference types="node" /> | ||
import stream from 'stream'; | ||
@@ -47,3 +45,3 @@ import * as parquet_shredder from './shred'; | ||
*/ | ||
close(callback?: Function): Promise<void>; | ||
close(callback?: () => void): Promise<void>; | ||
/** | ||
@@ -74,4 +72,4 @@ * Add key<>value metadata to the file | ||
schema: ParquetSchema; | ||
write: Function; | ||
close: Function; | ||
write: (buf: Buffer) => void; | ||
close: () => void; | ||
offset: Int64; | ||
@@ -88,8 +86,8 @@ rowCount: Int64; | ||
static openStream(schema: ParquetSchema, outputStream: parquet_util.WriteStreamMinimal, opts: WriterOptions): Promise<ParquetEnvelopeWriter>; | ||
constructor(schema: ParquetSchema, writeFn: Function, closeFn: Function, fileOffset: Int64, opts: WriterOptions); | ||
writeSection(buf: Buffer): any; | ||
constructor(schema: ParquetSchema, writeFn: (buf: Buffer) => void, closeFn: () => void, fileOffset: Int64, opts: WriterOptions); | ||
writeSection(buf: Buffer): void; | ||
/** | ||
* Encode the parquet file header | ||
*/ | ||
writeHeader(): any; | ||
writeHeader(): void; | ||
/** | ||
@@ -99,3 +97,3 @@ * Encode a parquet row group. The records object should be created using the | ||
*/ | ||
writeRowGroup(records: parquet_shredder.RecordBuffer): Promise<any>; | ||
writeRowGroup(records: parquet_shredder.RecordBuffer): Promise<void>; | ||
writeBloomFilters(): void; | ||
@@ -109,3 +107,3 @@ /** | ||
*/ | ||
writeFooter(userMetadata: Record<string, string>): any; | ||
writeFooter(userMetadata: Record<string, string>): void; | ||
/** | ||
@@ -123,4 +121,4 @@ * Set the parquet data page size. The data page size controls the maximum | ||
constructor(schema: ParquetSchema, opts?: {}); | ||
_transform(row: Record<string, unknown>, _encoding: string, callback: Function): void; | ||
_transform(row: Record<string, unknown>, _encoding: string, callback: (err?: Error | null, data?: any) => void): void; | ||
_flush(callback: (foo: any, bar?: any) => any): void; | ||
} |
@@ -74,3 +74,3 @@ "use strict"; | ||
static async openFile(schema, path, opts) { | ||
let outputStream = await parquet_util.osopen(path, opts); | ||
const outputStream = await parquet_util.osopen(path, opts); | ||
return ParquetWriter.openStream(schema, outputStream, opts); | ||
@@ -86,3 +86,3 @@ } | ||
} | ||
let envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts); | ||
const envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts); | ||
return new ParquetWriter(schema, envelopeWriter, opts); | ||
@@ -119,3 +119,3 @@ } | ||
useDataPageV2: this.envelopeWriter.useDataPageV2, | ||
bloomFilters: this.envelopeWriter.bloomFilters | ||
bloomFilters: this.envelopeWriter.bloomFilters, | ||
}; | ||
@@ -144,3 +144,6 @@ if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) { | ||
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
await encodePages(this.schema, this.rowBuffer, { | ||
useDataPageV2: this.envelopeWriter.useDataPageV2, | ||
bloomFilters: this.envelopeWriter.bloomFilters, | ||
}); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
@@ -204,4 +207,4 @@ this.rowBuffer = {}; | ||
static async openStream(schema, outputStream, opts) { | ||
let writeFn = parquet_util.oswrite.bind(undefined, outputStream); | ||
let closeFn = parquet_util.osend.bind(undefined, outputStream); | ||
const writeFn = parquet_util.oswrite.bind(undefined, outputStream); | ||
const closeFn = parquet_util.osend.bind(undefined, outputStream); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, new node_int64_1.default(0), opts); | ||
@@ -217,6 +220,6 @@ } | ||
this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE; | ||
this.useDataPageV2 = ("useDataPageV2" in opts) ? opts.useDataPageV2 : true; | ||
this.useDataPageV2 = 'useDataPageV2' in opts ? opts.useDataPageV2 : true; | ||
this.pageIndex = opts.pageIndex; | ||
this.bloomFilters = {}; | ||
(opts.bloomFilters || []).forEach(bloomOption => { | ||
(opts.bloomFilters || []).forEach((bloomOption) => { | ||
this.bloomFilters[bloomOption.column] = bloomFilterWriter.createSBBF(bloomOption); | ||
@@ -240,7 +243,7 @@ }); | ||
async writeRowGroup(records) { | ||
let rgroup = await encodeRowGroup(this.schema, records, { | ||
const rgroup = await encodeRowGroup(this.schema, records, { | ||
baseOffset: this.offset, | ||
pageSize: this.pageSize, | ||
useDataPageV2: this.useDataPageV2, | ||
pageIndex: this.pageIndex | ||
pageIndex: this.pageIndex, | ||
}); | ||
@@ -252,4 +255,4 @@ this.rowCount.setValue(this.rowCount.valueOf() + records.rowCount); | ||
writeBloomFilters() { | ||
this.rowGroups.forEach(group => { | ||
group.columns.forEach(column => { | ||
this.rowGroups.forEach((group) => { | ||
group.columns.forEach((column) => { | ||
if (!column.meta_data?.path_in_schema.length) { | ||
@@ -273,8 +276,8 @@ return; | ||
this.schema.fieldList.forEach((c, i) => { | ||
this.rowGroups.forEach(group => { | ||
let column = group.columns[i]; | ||
this.rowGroups.forEach((group) => { | ||
const column = group.columns[i]; | ||
if (!column) | ||
return; | ||
if (column.meta_data?.columnIndex) { | ||
let columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex); | ||
const columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex); | ||
delete column.meta_data.columnIndex; | ||
@@ -286,3 +289,3 @@ column.column_index_offset = parquet_util.cloneInteger(this.offset); | ||
if (column.meta_data?.offsetIndex) { | ||
let offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex); | ||
const offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex); | ||
delete column.meta_data.offsetIndex; | ||
@@ -308,3 +311,2 @@ column.offset_index_offset = parquet_util.cloneInteger(this.offset); | ||
} | ||
; | ||
/** | ||
@@ -326,3 +328,3 @@ * Set the parquet data page size. The data page size controls the maximum | ||
super({ objectMode: true }); | ||
let writeProxy = (function (t) { | ||
const writeProxy = (function (t) { | ||
return function (b) { | ||
@@ -332,3 +334,4 @@ t.push(b); | ||
})(this); | ||
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { | ||
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, () => { | ||
/* void */ | ||
}, new node_int64_1.default(0), opts), opts); | ||
@@ -338,3 +341,3 @@ } | ||
if (row) { | ||
this.writer.appendRow(row).then(data => callback(null, data), err => { | ||
this.writer.appendRow(row).then((data) => callback(null, data), (err) => { | ||
const fullErr = new Error(`Error transforming to parquet: ${err.toString()} row:${row}`); | ||
@@ -350,4 +353,3 @@ fullErr.message = err; | ||
_flush(callback) { | ||
this.writer.close() | ||
.then(d => callback(null, d), callback); | ||
this.writer.close().then((d) => callback(null, d), callback); | ||
} | ||
@@ -379,4 +381,6 @@ } | ||
statistics = Object.assign({}, statistics); | ||
statistics.min_value = statistics.min_value === undefined ? null : encodeStatisticsValue(statistics.min_value, column); | ||
statistics.max_value = statistics.max_value === undefined ? null : encodeStatisticsValue(statistics.max_value, column); | ||
statistics.min_value = | ||
statistics.min_value === undefined ? null : encodeStatisticsValue(statistics.min_value, column); | ||
statistics.max_value = | ||
statistics.max_value === undefined ? null : encodeStatisticsValue(statistics.max_value, column); | ||
statistics.max = statistics.max_value; | ||
@@ -387,6 +391,7 @@ statistics.min = statistics.min_value; | ||
async function encodePages(schema, rowBuffer, opts) { | ||
// generic | ||
if (!rowBuffer.pageRowCount) { | ||
return; | ||
} | ||
for (let field of schema.fieldList) { | ||
for (const field of schema.fieldList) { | ||
if (field.isNested) { | ||
@@ -398,5 +403,5 @@ continue; | ||
const values = rowBuffer.columnData[columnPath]; | ||
if (opts.bloomFilters && (columnPath in opts.bloomFilters)) { | ||
if (opts.bloomFilters && columnPath in opts.bloomFilters) { | ||
const splitBlockBloomFilter = opts.bloomFilters[columnPath]; | ||
values.values.forEach(v => splitBlockBloomFilter.insert(v)); | ||
values.values.forEach((v) => splitBlockBloomFilter.insert(v)); | ||
} | ||
@@ -423,5 +428,5 @@ let statistics = {}; | ||
} | ||
let pages = rowBuffer.pages[field.path.join(',')]; | ||
let lastPage = pages[pages.length - 1]; | ||
let first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0; | ||
const pages = rowBuffer.pages[field.path.join(',')]; | ||
const lastPage = pages[pages.length - 1]; | ||
const first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0; | ||
pages.push({ | ||
@@ -432,3 +437,3 @@ page, | ||
distinct_values: values.distinct_values, | ||
num_values: values.dlevels.length | ||
num_values: values.dlevels.length, | ||
}); | ||
@@ -448,5 +453,5 @@ values.distinct_values = new Set(); | ||
/* encode values */ | ||
let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
const valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
bitWidth: column.typeLength, | ||
...column | ||
...column, | ||
}); | ||
@@ -456,7 +461,11 @@ /* encode repetition and definition levels */ | ||
if (column.rLevelMax > 0) { | ||
rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, rlevels, { bitWidth: parquet_util.getBitWidth(column.rLevelMax) }); | ||
rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, rlevels, { | ||
bitWidth: parquet_util.getBitWidth(column.rLevelMax), | ||
}); | ||
} | ||
let dLevelsBuf = Buffer.alloc(0); | ||
if (column.dLevelMax > 0) { | ||
dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, dlevels, { bitWidth: parquet_util.getBitWidth(column.dLevelMax) }); | ||
dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, dlevels, { | ||
bitWidth: parquet_util.getBitWidth(column.dLevelMax), | ||
}); | ||
} | ||
@@ -466,3 +475,3 @@ /* build page header */ | ||
pageBody = await parquet_compression.deflate(column.compression, pageBody); | ||
let pageHeader = new parquet_types_1.default.PageHeader(); | ||
const pageHeader = new parquet_types_1.default.PageHeader(); | ||
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE']; | ||
@@ -477,6 +486,4 @@ pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; | ||
pageHeader.data_page_header.encoding = parquet_types_1.default.Encoding[column.encoding]; | ||
pageHeader.data_page_header.definition_level_encoding = | ||
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
pageHeader.data_page_header.repetition_level_encoding = | ||
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
pageHeader.data_page_header.definition_level_encoding = parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
pageHeader.data_page_header.repetition_level_encoding = parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
/* concat page header, repetition and definition levels and values */ | ||
@@ -490,7 +497,7 @@ return Buffer.concat([parquet_util.serializeThrift(pageHeader), pageBody]); | ||
/* encode values */ | ||
let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
const valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
bitWidth: column.typeLength, | ||
...column, | ||
}); | ||
let valuesBufCompressed = await parquet_compression.deflate(column.compression, valuesBuf); | ||
const valuesBufCompressed = await parquet_compression.deflate(column.compression, valuesBuf); | ||
/* encode repetition and definition levels */ | ||
@@ -501,3 +508,3 @@ let rLevelsBuf = Buffer.alloc(0); | ||
bitWidth: parquet_util.getBitWidth(column.rLevelMax), | ||
disableEnvelope: true | ||
disableEnvelope: true, | ||
}); | ||
@@ -509,7 +516,7 @@ } | ||
bitWidth: parquet_util.getBitWidth(column.dLevelMax), | ||
disableEnvelope: true | ||
disableEnvelope: true, | ||
}); | ||
} | ||
/* build page header */ | ||
let pageHeader = new parquet_types_1.default.PageHeader(); | ||
const pageHeader = new parquet_types_1.default.PageHeader(); | ||
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE_V2']; | ||
@@ -523,18 +530,10 @@ pageHeader.data_page_header_v2 = new parquet_types_1.default.DataPageHeaderV2(); | ||
} | ||
pageHeader.uncompressed_page_size = | ||
rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; | ||
pageHeader.compressed_page_size = | ||
rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length; | ||
pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; | ||
pageHeader.compressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length; | ||
pageHeader.data_page_header_v2.encoding = parquet_types_1.default.Encoding[column.encoding]; | ||
pageHeader.data_page_header_v2.definition_levels_byte_length = dLevelsBuf.length; | ||
pageHeader.data_page_header_v2.repetition_levels_byte_length = rLevelsBuf.length; | ||
pageHeader.data_page_header_v2.is_compressed = | ||
column.compression !== 'UNCOMPRESSED'; | ||
pageHeader.data_page_header_v2.is_compressed = column.compression !== 'UNCOMPRESSED'; | ||
/* concat page header, repetition and definition levels and values */ | ||
return Buffer.concat([ | ||
parquet_util.serializeThrift(pageHeader), | ||
rLevelsBuf, | ||
dLevelsBuf, | ||
valuesBufCompressed | ||
]); | ||
return Buffer.concat([parquet_util.serializeThrift(pageHeader), rLevelsBuf, dLevelsBuf, valuesBufCompressed]); | ||
} | ||
@@ -545,7 +544,7 @@ /** | ||
async function encodeColumnChunk(pages, opts) { | ||
let pagesBuf = Buffer.concat(pages.map(d => d.page)); | ||
let num_values = pages.reduce((p, d) => p + d.num_values, 0); | ||
const pagesBuf = Buffer.concat(pages.map((d) => d.page)); | ||
const num_values = pages.reduce((p, d) => p + d.num_values, 0); | ||
let offset = opts.baseOffset; | ||
/* prepare metadata header */ | ||
let metadata = new parquet_types_1.default.ColumnMetaData(); | ||
const metadata = new parquet_types_1.default.ColumnMetaData(); | ||
metadata.path_in_schema = opts.column.path; | ||
@@ -560,3 +559,3 @@ metadata.num_values = new node_int64_1.default(num_values); | ||
/* compile statistics ColumnIndex and OffsetIndex*/ | ||
let columnIndex = new parquet_types_1.default.ColumnIndex(); | ||
const columnIndex = new parquet_types_1.default.ColumnIndex(); | ||
columnIndex.null_pages = []; | ||
@@ -567,7 +566,7 @@ columnIndex.max_values = []; | ||
columnIndex.boundary_order = 0; | ||
let offsetIndex = new parquet_types_1.default.OffsetIndex(); | ||
const offsetIndex = new parquet_types_1.default.OffsetIndex(); | ||
offsetIndex.page_locations = []; | ||
/* prepare statistics */ | ||
let statistics = {}; | ||
let distinct_values = new Set(); | ||
const statistics = {}; | ||
const distinct_values = new Set(); | ||
statistics.null_count = new node_int64_1.default(0); | ||
@@ -592,3 +591,3 @@ statistics.distinct_count = new node_int64_1.default(0); | ||
} | ||
let pageLocation = new parquet_types_1.default.PageLocation(); | ||
const pageLocation = new parquet_types_1.default.PageLocation(); | ||
pageLocation.offset = new node_int64_1.default(offset); | ||
@@ -614,4 +613,4 @@ offset += page.page.length; | ||
/* concat metadata header and data pages */ | ||
let metadataOffset = opts.baseOffset + pagesBuf.length; | ||
let body = Buffer.concat([pagesBuf, parquet_util.serializeThrift(metadata)]); | ||
const metadataOffset = opts.baseOffset + pagesBuf.length; | ||
const body = Buffer.concat([pagesBuf, parquet_util.serializeThrift(metadata)]); | ||
return { body, metadata, metadataOffset }; | ||
@@ -623,3 +622,3 @@ } | ||
async function encodeRowGroup(schema, data, opts) { | ||
let metadata = new parquet_types_1.default.RowGroup(); | ||
const metadata = new parquet_types_1.default.RowGroup(); | ||
metadata.num_rows = new node_int64_1.default(data.rowCount); | ||
@@ -629,7 +628,7 @@ metadata.columns = []; | ||
let body = Buffer.alloc(0); | ||
for (let field of schema.fieldList) { | ||
for (const field of schema.fieldList) { | ||
if (field.isNested) { | ||
continue; | ||
} | ||
let cchunkData = await encodeColumnChunk(data.pages[field.path.join(',')], { | ||
const cchunkData = await encodeColumnChunk(data.pages[field.path.join(',')], { | ||
column: field, | ||
@@ -640,9 +639,9 @@ baseOffset: opts.baseOffset.valueOf() + body.length, | ||
useDataPageV2: opts.useDataPageV2 ?? true, | ||
pageIndex: opts.pageIndex ?? true | ||
pageIndex: opts.pageIndex ?? true, | ||
}); | ||
let cchunk = new parquet_types_1.default.ColumnChunk(); | ||
const cchunk = new parquet_types_1.default.ColumnChunk(); | ||
cchunk.file_offset = new node_int64_1.default(cchunkData.metadataOffset); | ||
cchunk.meta_data = cchunkData.metadata; | ||
metadata.columns.push(cchunk); | ||
metadata.total_byte_size = new node_int64_1.default(metadata.total_byte_size.valueOf() + (cchunkData.body.length)); | ||
metadata.total_byte_size = new node_int64_1.default(metadata.total_byte_size.valueOf() + cchunkData.body.length); | ||
body = Buffer.concat([body, cchunkData.body]); | ||
@@ -656,3 +655,3 @@ } | ||
function encodeFooter(schema, rowCount, rowGroups, userMetadata) { | ||
let metadata = new parquet_types_1.default.FileMetaData(); | ||
const metadata = new parquet_types_1.default.FileMetaData(); | ||
metadata.version = PARQUET_VERSION; | ||
@@ -664,4 +663,4 @@ metadata.created_by = '@dsnp/parquetjs'; | ||
metadata.key_value_metadata = []; | ||
for (let k in userMetadata) { | ||
let kv = new parquet_types_1.default.KeyValue(); | ||
for (const k in userMetadata) { | ||
const kv = new parquet_types_1.default.KeyValue(); | ||
kv.key = k; | ||
@@ -672,3 +671,3 @@ kv.value = userMetadata[k]; | ||
{ | ||
let schemaRoot = new parquet_types_1.default.SchemaElement(); | ||
const schemaRoot = new parquet_types_1.default.SchemaElement(); | ||
schemaRoot.name = 'root'; | ||
@@ -678,4 +677,4 @@ schemaRoot.num_children = Object.keys(schema.fields).length; | ||
} | ||
for (let field of schema.fieldList) { | ||
let schemaElem = new parquet_types_1.default.SchemaElement(); | ||
for (const field of schema.fieldList) { | ||
const schemaElem = new parquet_types_1.default.SchemaElement(); | ||
schemaElem.name = field.name; | ||
@@ -694,3 +693,3 @@ schemaElem.repetition_type = parquet_types_1.default.FieldRepetitionType[field.repetitionType]; | ||
switch (schemaElem.converted_type) { | ||
case (parquet_types_1.ConvertedType.DECIMAL): | ||
case parquet_types_1.ConvertedType.DECIMAL: | ||
schemaElem.precision = field.precision; | ||
@@ -703,4 +702,4 @@ schemaElem.scale = field.scale || 0; | ||
} | ||
let metadataEncoded = parquet_util.serializeThrift(metadata); | ||
let footerEncoded = Buffer.alloc(metadataEncoded.length + 8); | ||
const metadataEncoded = parquet_util.serializeThrift(metadata); | ||
const footerEncoded = Buffer.alloc(metadataEncoded.length + 8); | ||
metadataEncoded.copy(footerEncoded); | ||
@@ -707,0 +706,0 @@ footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length); |
@@ -6,3 +6,3 @@ { | ||
"types": "dist/parquet.d.ts", | ||
"version": "0.0.0-5db9db", | ||
"version": "0.0.0-6396d2", | ||
"homepage": "https://github.com/LibertyDSNP/parquetjs", | ||
@@ -19,14 +19,14 @@ "license": "MIT", | ||
"dependencies": { | ||
"@aws-sdk/client-s3": "^3.489.0", | ||
"@aws-sdk/client-s3": "^3.575.0", | ||
"@types/long": "^4.0.2", | ||
"@types/node-int64": "^0.4.29", | ||
"@types/thrift": "^0.10.11", | ||
"brotli-wasm": "^2.0.1", | ||
"@types/node-int64": "^0.4.32", | ||
"@types/thrift": "^0.10.17", | ||
"brotli-wasm": "^3.0.0", | ||
"browserify-zlib": "^0.2.0", | ||
"bson": "4.6.3", | ||
"cross-fetch": "^3.1.4", | ||
"int53": "^0.2.4", | ||
"long": "^4.0.0", | ||
"snappyjs": "^0.6.1", | ||
"thrift": "0.16.0", | ||
"bson": "6.7.0", | ||
"cross-fetch": "^4.0.0", | ||
"int53": "^1.0.0", | ||
"long": "^5.2.3", | ||
"snappyjs": "^0.7.0", | ||
"thrift": "0.20.0", | ||
"varint": "^6.0.0", | ||
@@ -36,27 +36,34 @@ "xxhash-wasm": "^1.0.2" | ||
"devDependencies": { | ||
"@smithy/util-stream": "^2.0.24", | ||
"@types/chai": "^4.3.5", | ||
"@types/json-schema": "^7.0.11", | ||
"@types/mocha": "^10.0.1", | ||
"@types/node": "^18.18.2", | ||
"@types/sinon": "^10.0.15", | ||
"@types/varint": "^6.0.1", | ||
"assert": "^2.0.0", | ||
"aws-sdk-client-mock": "^3.0.1", | ||
"@eslint/js": "^9.6.0", | ||
"@smithy/util-stream": "^3.0.0", | ||
"@types/chai": "^4.3.16", | ||
"@types/eslint__js": "^8.42.3", | ||
"@types/json-schema": "^7.0.15", | ||
"@types/mocha": "^10.0.6", | ||
"@types/node": "^20.12.12", | ||
"@types/sinon": "^17.0.3", | ||
"@types/varint": "^6.0.3", | ||
"assert": "^2.1.0", | ||
"aws-sdk-client-mock": "^4.0.0", | ||
"browserfs": "^1.4.3", | ||
"buffer": "^6.0.3", | ||
"chai": "4.3.6", | ||
"core-js": "^3.22.5", | ||
"esbuild": "^0.19.2", | ||
"mocha": "^10.2.0", | ||
"msw": "^1.2.1", | ||
"chai": "4.4.1", | ||
"core-js": "^3.37.1", | ||
"esbuild": "^0.21.2", | ||
"eslint": "^8.57.0", | ||
"eslint-plugin-mocha": "^10.4.3", | ||
"events": "^3.3.0", | ||
"mocha": "^10.4.0", | ||
"msw": "^2.3.0", | ||
"object-stream": "^0.0.1", | ||
"prettier": "3.3.2", | ||
"process": "^0.11.10", | ||
"regenerator-runtime": "^0.13.11", | ||
"sinon": "^15.1.0", | ||
"regenerator-runtime": "^0.14.1", | ||
"sinon": "^17.0.2", | ||
"sinon-chai": "^3.7.0", | ||
"sinon-chai-in-order": "^0.1.0", | ||
"stream-browserify": "^3.0.0", | ||
"ts-node": "^10.9.1", | ||
"typescript": "^5.0.4" | ||
"tsx": "^4.10.2", | ||
"typescript": "^5.5.2", | ||
"typescript-eslint": "^7.14.1" | ||
}, | ||
@@ -69,5 +76,7 @@ "scripts": { | ||
"type": "tsc --noEmit", | ||
"lint": "echo 'Linting, it is on the TODO list...'", | ||
"test": "mocha -r ts-node/register 'test/{,!(browser)/**}/*.{js,ts}'", | ||
"test:only": "mocha -r ts-node/register", | ||
"lint": "eslint . && npx prettier . --check", | ||
"lint:fix": "eslint --fix .", | ||
"format": "npx prettier . --write", | ||
"test": "mocha 'test/{,!(browser)/**}/*.{js,ts}'", | ||
"test:only": "mocha", | ||
"clean": "rm -Rf ./dist", | ||
@@ -74,0 +83,0 @@ "prepublishOnly": "npm run clean && npm run build:node && npm run build:types && npm run build:browser", |
189
README.md
@@ -20,2 +20,3 @@ # parquet.js | ||
This is a forked repository with code from various sources: | ||
- Primary source [ironSource](https://github.com/ironSource/parquetjs) [npm: parquetjs](https://www.npmjs.com/package/parquetjs) | ||
@@ -25,2 +26,3 @@ - Secondary source [ZJONSSON](https://github.com/ZJONSSON/parquetjs) [npm: parquetjs-lite](https://www.npmjs.com/package/parquetjs-lite) | ||
## Installation | ||
_parquet.js requires node.js >= 18.18.2_ | ||
@@ -33,24 +35,31 @@ | ||
### NodeJS | ||
To use with nodejs: | ||
```javascript | ||
import parquetjs from "@dsnp/parquetjs" | ||
import parquetjs from '@dsnp/parquetjs'; | ||
``` | ||
### Browser with Bundler | ||
To use in a browser with a bundler, depending on your needs, write the appropriate plugin or resolver to point to either the Common JS or ES Module version: | ||
```javascript | ||
// Common JS | ||
"node_modules/@dsnp/parquetjs/dist/browser/parquetjs.cjs" | ||
'node_modules/@dsnp/parquetjs/dist/browser/parquetjs.cjs'; | ||
// ES Modules | ||
"node_modules/@dsnp/parquetjs/dist/browser/parquetjs.esm" | ||
'node_modules/@dsnp/parquetjs/dist/browser/parquetjs.esm'; | ||
``` | ||
or: | ||
```javascript | ||
// Common JS | ||
import parquetjs from "@dsnp/parquetjs/dist/browser/parquetjs.cjs" | ||
import parquetjs from '@dsnp/parquetjs/dist/browser/parquetjs.cjs'; | ||
// ES Modules | ||
import parquetjs from "@dsnp/parquetjs/dist/browser/parquetjs.esm" | ||
import parquetjs from '@dsnp/parquetjs/dist/browser/parquetjs.esm'; | ||
``` | ||
### Browser Direct: ES Modules | ||
To use directly in the browser without a bundler using ES Modules: | ||
@@ -61,10 +70,11 @@ | ||
3. Use it in your html or other ES Modules: | ||
```html | ||
<script type="module"> | ||
import parquetjs from '../parquet.esm.js'; | ||
// Use parquetjs | ||
</script> | ||
``` | ||
```html | ||
<script type="module"> | ||
import parquetjs from '../parquet.esm.js'; | ||
// Use parquetjs | ||
</script> | ||
``` | ||
### Browser Direct: Plain Ol' JavaScript | ||
To use directly in the browser without a bundler or ES Modules: | ||
@@ -74,8 +84,8 @@ | ||
2. Copy `dist/browser/parquetjs.js` to the server | ||
2. Use the global `parquetjs` variable to access parquetjs functions | ||
3. Use the global `parquetjs` variable to access parquetjs functions | ||
```html | ||
<script> | ||
// console.log(parquetjs) | ||
</script> | ||
``` | ||
// console.log(parquetjs) | ||
</script> | ||
``` | ||
@@ -87,3 +97,3 @@ ## Usage: Writing files | ||
``` js | ||
```js | ||
var parquet = require('@dsnp/parquetjs'); | ||
@@ -98,3 +108,3 @@ ``` | ||
``` js | ||
```js | ||
// declare a schema for the `fruits` table | ||
@@ -106,3 +116,3 @@ var schema = new parquet.ParquetSchema({ | ||
date: { type: 'TIMESTAMP_MILLIS' }, | ||
in_stock: { type: 'BOOLEAN' } | ||
in_stock: { type: 'BOOLEAN' }, | ||
}); | ||
@@ -119,3 +129,3 @@ ``` | ||
date: parquet.ParquetFieldBuilder.createTimestampField(), | ||
in_stock: parquet.ParquetFieldBuilder.createBooleanField() | ||
in_stock: parquet.ParquetFieldBuilder.createBooleanField(), | ||
}); | ||
@@ -126,29 +136,28 @@ ``` | ||
``` js | ||
```js | ||
// declare a schema for the `fruits` JSON Schema | ||
var schema = new parquet.ParquetSchema.fromJsonSchema({ | ||
"type": "object", | ||
"properties": { | ||
"name": { | ||
"type": "string" | ||
type: 'object', | ||
properties: { | ||
name: { | ||
type: 'string', | ||
}, | ||
"quantity": { | ||
"type": "integer" | ||
quantity: { | ||
type: 'integer', | ||
}, | ||
"price": { | ||
"type": "number" | ||
price: { | ||
type: 'number', | ||
}, | ||
"date": { | ||
"type": "string", | ||
"format": "date-time" | ||
date: { | ||
type: 'string', | ||
format: 'date-time', | ||
}, | ||
"in_stock": { | ||
"type": "boolean" | ||
} | ||
in_stock: { | ||
type: 'boolean', | ||
}, | ||
}, | ||
"required": ["name", "quantity", "price", "date", "in_stock"] | ||
required: ['name', 'quantity', 'price', 'date', 'in_stock'], | ||
}); | ||
``` | ||
Note that the Parquet schema supports nesting, so you can store complex, arbitrarily | ||
@@ -162,3 +171,3 @@ nested records into a single row (more on that later) while still maintaining good | ||
``` js | ||
```js | ||
// create new ParquetWriter that writes to 'fruits.parquet` | ||
@@ -168,4 +177,4 @@ var writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet'); | ||
// append a few rows to the file | ||
await writer.appendRow({name: 'apples', quantity: 10, price: 2.5, date: new Date(), in_stock: true}); | ||
await writer.appendRow({name: 'oranges', quantity: 10, price: 2.5, date: new Date(), in_stock: true}); | ||
await writer.appendRow({ name: 'apples', quantity: 10, price: 2.5, date: new Date(), in_stock: true }); | ||
await writer.appendRow({ name: 'oranges', quantity: 10, price: 2.5, date: new Date(), in_stock: true }); | ||
``` | ||
@@ -180,15 +189,15 @@ | ||
``` js | ||
const options = { | ||
bloomFilters: [ | ||
{ | ||
column: "name", | ||
numFilterBytes: 1024, | ||
}, | ||
{ | ||
column: "quantity", | ||
numFilterBytes: 1024, | ||
}, | ||
] | ||
}; | ||
```js | ||
const options = { | ||
bloomFilters: [ | ||
{ | ||
column: 'name', | ||
numFilterBytes: 1024, | ||
}, | ||
{ | ||
column: 'quantity', | ||
numFilterBytes: 1024, | ||
}, | ||
], | ||
}; | ||
@@ -198,11 +207,11 @@ var writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet', options); | ||
By default, when no additional options are passed, the optimal number of blocks is calculated from the default number of distinct values (128*1024) and the default false positive probability (0.001), which gives a filter byte size of 29,920. | ||
By default, when no additional options are passed, the optimal number of blocks is calculated from the default number of distinct values (128\*1024) and the default false positive probability (0.001), which gives a filter byte size of 29,920. | ||
The following options are provided to adjust the split-block bloom filter settings. | ||
`numFilterBytes` - sets the desired size of the bloom filter in bytes. Defaults to 128 * 1024 * 1024 bits. | ||
`numFilterBytes` - sets the desired size of the bloom filter in bytes. Defaults to 128 _ 1024 _ 1024 bits. | ||
`falsePositiveRate` - sets the desired false positive probability for the bloom filter. Defaults to 0.001. | ||
`numDistinct` - sets the expected number of distinct values. Defaults to 128 * 1024. | ||
`numDistinct` - sets the expected number of distinct values. Defaults to 128 \* 1024. | ||
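As an alternative to `numFilterBytes`, a minimal sketch (assuming the same `schema` and the `parquet.ParquetWriter.openFile` call shown above) of sizing the filter from an expected number of distinct values and a false positive probability instead of a byte budget:

```js
// Sketch: size the split-block bloom filter via numDistinct + falsePositiveRate.
// These are ignored if numFilterBytes is also provided for the same column.
const options = {
  bloomFilters: [
    {
      column: 'name',
      numDistinct: 100000, // expected distinct values for this column (example value)
      falsePositiveRate: 0.01, // acceptable false positive probability (example value)
    },
  ],
};
const writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet', options);
```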
@@ -221,3 +230,3 @@ Note that if numFilterBytes is provided then falsePositiveRate and numDistinct options are ignored. | ||
``` js | ||
```js | ||
// create new ParquetReader that reads from 'fruits.parquet` | ||
@@ -231,3 +240,3 @@ let reader = await parquet.ParquetReader.openFile('fruits.parquet'); | ||
let record = null; | ||
while (record = await cursor.next()) { | ||
while ((record = await cursor.next())) { | ||
console.log(record); | ||
@@ -240,3 +249,3 @@ } | ||
``` js | ||
```js | ||
// create a new cursor that will only return the `name` and `price` columns | ||
@@ -249,3 +258,3 @@ let cursor = reader.getCursor(['name', 'price']); | ||
``` js | ||
```js | ||
await reader.close(); | ||
@@ -258,3 +267,4 @@ ``` | ||
and calling `getBloomFiltersFor`. | ||
``` js | ||
```js | ||
// create new ParquetReader that reads from 'fruits.parquet' | ||
@@ -277,2 +287,3 @@ let reader = await parquet.ParquetReader.openFile('fruits.parquet'); | ||
``` | ||
Calling `getBloomFiltersFor` on the reader returns an object with the keys being a column name and value being an array of length equal to the number of row groups that the column spans. | ||
@@ -282,3 +293,3 @@ | ||
``` js | ||
```js | ||
const sbbf = bloomFilters.name[0].sbbf; | ||
@@ -289,3 +300,2 @@ | ||
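As a usage sketch (assuming the extracted filter instance exposes an async `check` method; that name is an assumption, so verify it against your version), the filter can answer membership queries for a column value:

```js
// Sketch, continuing from the sbbf extracted above; check() is assumed here.
// check() resolves to false when the value is definitely absent from the row
// group, and to true when it may be present (bloom filters allow false positives).
const maybePresent = await sbbf.check('apples');
if (!maybePresent) {
  console.log("'apples' is definitely not in this row group");
}
```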
### Reading data from a url | ||
@@ -297,5 +307,5 @@ | ||
``` js | ||
```js | ||
const request = require('request'); | ||
let reader = await parquet.ParquetReader.openUrl(request,'https://domain/fruits.parquet'); | ||
let reader = await parquet.ParquetReader.openUrl(request, 'https://domain/fruits.parquet'); | ||
``` | ||
@@ -309,7 +319,7 @@ | ||
``` js | ||
```js | ||
const AWS = require('aws-sdk'); | ||
const client = new AWS.S3({ | ||
accessKeyId: 'xxxxxxxxxxx', | ||
secretAccessKey: 'xxxxxxxxxxx' | ||
secretAccessKey: 'xxxxxxxxxxx', | ||
}); | ||
@@ -319,6 +329,6 @@ | ||
Bucket: 'xxxxxxxxxxx', | ||
Key: 'xxxxxxxxxxx' | ||
Key: 'xxxxxxxxxxx', | ||
}; | ||
let reader = await parquet.ParquetReader.openS3(client,params); | ||
let reader = await parquet.ParquetReader.openS3(client, params); | ||
``` | ||
@@ -330,3 +340,3 @@ | ||
``` js | ||
```js | ||
const file = fs.readFileSync('fruits.parquet'); | ||
@@ -347,3 +357,3 @@ let reader = await parquet.ParquetReader.openBuffer(file); | ||
``` js | ||
```js | ||
var schema = new parquet.ParquetSchema({ | ||
@@ -362,3 +372,3 @@ name: { type: 'UTF8', encoding: 'PLAIN' }, | ||
``` js | ||
```js | ||
var schema = new parquet.ParquetSchema({ | ||
@@ -369,3 +379,2 @@ age: { type: 'UINT_32', encoding: 'RLE', typeLength: 7 }, | ||
### Optional Fields | ||
@@ -376,3 +385,3 @@ | ||
``` js | ||
```js | ||
var schema = new parquet.ParquetSchema({ | ||
@@ -384,7 +393,6 @@ name: { type: 'UTF8' }, | ||
var writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet'); | ||
await writer.appendRow({name: 'apples', quantity: 10 }); | ||
await writer.appendRow({name: 'banana' }); // not in stock | ||
await writer.appendRow({ name: 'apples', quantity: 10 }); | ||
await writer.appendRow({ name: 'banana' }); // not in stock | ||
``` | ||
### Nested Rows & Arrays | ||
@@ -400,3 +408,3 @@ | ||
``` js | ||
```js | ||
// advanced fruits table | ||
@@ -411,4 +419,4 @@ var schema = new parquet.ParquetSchema({ | ||
quantity: { type: 'INT64' }, | ||
} | ||
} | ||
}, | ||
}, | ||
}); | ||
@@ -424,4 +432,4 @@ | ||
{ price: 2.45, quantity: 16 }, | ||
{ price: 2.60, quantity: 420 } | ||
] | ||
{ price: 2.6, quantity: 420 }, | ||
], | ||
}); | ||
@@ -433,5 +441,5 @@ | ||
stock: [ | ||
{ price: 1.20, quantity: 42 }, | ||
{ price: 1.30, quantity: 230 } | ||
] | ||
{ price: 1.2, quantity: 42 }, | ||
{ price: 1.3, quantity: 230 }, | ||
], | ||
}); | ||
@@ -446,3 +454,3 @@ | ||
let record = null; | ||
while (record = await cursor.next()) { | ||
while ((record = await cursor.next())) { | ||
console.log(record); | ||
@@ -455,3 +463,3 @@ } | ||
It might not be obvious why one would want to implement or use such a feature when | ||
the same can - in principle - be achieved by serializing the record using JSON | ||
the same can - in principle - be achieved by serializing the record using JSON | ||
(or a similar scheme) and then storing it into a UTF8 field: | ||
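For contrast, here is a minimal sketch (not taken from the README; the file name `fruits_json.parquet` is made up) of that JSON-in-a-UTF8-field approach, using only the writer API shown earlier:

```js
// Sketch: serialize the nested stock entries as a JSON string and store them
// in a plain UTF8 column. This works, but the entries become opaque blobs:
// they cannot be projected or filtered as columns, unlike the nested schema.
var schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },
  stock: { type: 'UTF8' }, // JSON blob instead of a nested group
});

var writer = await parquet.ParquetWriter.openFile(schema, 'fruits_json.parquet');
await writer.appendRow({
  name: 'apples',
  stock: JSON.stringify([
    { price: 2.45, quantity: 16 },
    { price: 2.6, quantity: 420 },
  ]),
});
await writer.close();
```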
@@ -465,8 +473,6 @@ | ||
### Nested Lists for Hive / Athena | ||
Lists have to be annotated to be queryable with AWS Athena. See [parquet-format](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists) for more detail and a full working example with comments in the test directory ([`test/list.js`](test/list.js)). | ||
Lists have to be annotated to be queryable with AWS Athena. See [parquet-format](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists) for more detail and a full working example with comments in the test directory ([`test/list.js`](test/list.js)). | ||
### List of Supported Types & Encodings | ||
@@ -504,3 +510,2 @@ | ||
## Buffering & Row Group Size | ||
@@ -516,3 +521,3 @@ | ||
``` js | ||
```js | ||
var writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet'); | ||
@@ -522,3 +527,2 @@ writer.setRowGroupSize(8192); | ||
## Dependencies | ||
@@ -529,3 +533,2 @@ | ||
## Notes | ||
@@ -532,0 +535,0 @@ |
Sorry, the diff of this file is too big to display
Uses eval
Supply chain risk: Package uses dynamic code execution (e.g., eval()), which is a dangerous practice. This can prevent the code from running in certain environments and increases the risk that the code may contain exploits or malicious behavior.
Found 1 instance in 1 package
+ Added brotli-wasm@3.0.1 (transitive)
+ Added bson@6.7.0 (transitive)
+ Added cross-fetch@4.1.0 (transitive)
+ Added int53@1.0.0 (transitive)
+ Added long@5.3.0 (transitive)
+ Added snappyjs@0.7.0 (transitive)
+ Added thrift@0.20.0 (transitive)
- Removed base64-js@1.5.1 (transitive)
- Removed brotli-wasm@2.0.1 (transitive)
- Removed bson@4.6.3 (transitive)
- Removed buffer@5.7.1 (transitive)
- Removed cross-fetch@3.2.0 (transitive)
- Removed ieee754@1.2.1 (transitive)
- Removed int53@0.2.4 (transitive)
- Removed long@4.0.0 (transitive)
- Removed snappyjs@0.6.1 (transitive)
- Removed thrift@0.16.0 (transitive)
Updated @aws-sdk/client-s3@^3.575.0
Updated @types/node-int64@^0.4.32
Updated @types/thrift@^0.10.17
Updated brotli-wasm@^3.0.0
Updated bson@6.7.0
Updated cross-fetch@^4.0.0
Updated int53@^1.0.0
Updated long@^5.2.3
Updated snappyjs@^0.7.0
Updated thrift@0.20.0