@dsnp/parquetjs
Comparing version 0.0.0-33c80f to 0.0.0-43732c
/// <reference types="node" /> | ||
import Long = require('long'); | ||
import { Block } from "../types/types"; | ||
import Long from 'long'; | ||
import { Block } from "../declare"; | ||
/** | ||
@@ -5,0 +5,0 @@ * @class SplitBlockBloomFilter |
@@ -7,3 +7,3 @@ "use strict"; | ||
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types")); | ||
const Long = require("long"); | ||
const long_1 = __importDefault(require("long")); | ||
const xxhasher_1 = __importDefault(require("./xxhasher")); | ||
@@ -106,4 +106,4 @@ /** | ||
static getBlockIndex(h, z) { | ||
const zLong = Long.fromNumber(z, true); | ||
const hTopBits = Long.fromNumber(h.getHighBitsUnsigned(), true); | ||
const zLong = long_1.default.fromNumber(z, true); | ||
const hTopBits = long_1.default.fromNumber(h.getHighBitsUnsigned(), true); | ||
return hTopBits.mul(zLong).shiftRightUnsigned(32).getLowBitsUnsigned(); | ||
@@ -330,3 +330,3 @@ } | ||
const hashed = await this.hasher.hash64(value); | ||
return Long.fromString(hashed, true, 16); | ||
return long_1.default.fromString(hashed, true, 16); | ||
} | ||
@@ -333,0 +333,0 @@ insertHash(hashValue) { |
@@ -13,10 +13,11 @@ /** | ||
private static h64; | ||
private hashit; | ||
private hashIt; | ||
/** | ||
* @function hash64 | ||
* @description creates a hash for certain data types. | ||
* @return the 64 big XXHash as a string | ||
* @param value one of n, throw an error. | ||
* @description creates a hash for certain data types. All data is converted using toString() | ||
* prior to hashing. | ||
* @return the 64 big XXHash as a hex-encoded string. | ||
* @param value, must be of type string, Buffer, Uint8Array, Long, boolean, number, or bigint | ||
*/ | ||
hash64(value: any): Promise<string>; | ||
} |
@@ -19,4 +19,4 @@ "use strict"; | ||
class XxHasher { | ||
static h64 = (0, xxhash_wasm_1.default)().then(x => x.h64); | ||
async hashit(value) { | ||
static h64 = (0, xxhash_wasm_1.default)().then(x => x.h64ToString); | ||
async hashIt(value) { | ||
return (await XxHasher.h64)(value); | ||
@@ -26,9 +26,10 @@ } | ||
* @function hash64 | ||
* @description creates a hash for certain data types. | ||
* @return the 64 big XXHash as a string | ||
* @param value one of n, throw an error. | ||
* @description creates a hash for certain data types. All data is converted using toString() | ||
* prior to hashing. | ||
* @return the 64 big XXHash as a hex-encoded string. | ||
* @param value, must be of type string, Buffer, Uint8Array, Long, boolean, number, or bigint | ||
*/ | ||
async hash64(value) { | ||
if (typeof value === 'string') | ||
return this.hashit(value); | ||
return this.hashIt(value); | ||
if (value instanceof Buffer || | ||
@@ -40,3 +41,3 @@ value instanceof Uint8Array || | ||
typeof value === 'bigint') { | ||
return this.hashit(value.toString()); | ||
return this.hashIt(value.toString()); | ||
} | ||
@@ -43,0 +44,0 @@ throw new Error("unsupported type: " + value); |
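The updated hasher above stringifies every supported input (string, Buffer, Uint8Array, Long, boolean, number, bigint) before hashing and returns the 64-bit XXHash as a hex-encoded string. A minimal sketch of that contract, assuming the compiled module can be deep-imported from the published package:

import XxHasher from '@dsnp/parquetjs/dist/lib/bloom/xxhasher'; // import path is an assumption

async function demo(): Promise<boolean> {
  const hasher = new XxHasher();
  // Because non-string inputs go through toString() first, the number 1234
  // and the string '1234' hash to the same hex-encoded digest.
  const fromNumber = await hasher.hash64(1234);
  const fromString = await hasher.hash64('1234');
  return fromNumber === fromString; // true
}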
import sbbf from "../bloom/sbbf"; | ||
import { ParquetEnvelopeReader } from "../reader"; | ||
import { ColumnChunkData } from "../types/types"; | ||
declare type bloomFilterOffsetData = { | ||
import { ColumnChunkData } from "../declare"; | ||
type bloomFilterOffsetData = { | ||
columnName: string; | ||
@@ -6,0 +6,0 @@ offsetBytes: number; |
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -31,4 +35,3 @@ if (k2 === undefined) k2 = k; | ||
return columnChunkDataCollection.filter((columnChunk) => { | ||
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, }, }, } = columnChunk; | ||
return bloomFilterOffsetBuffer; | ||
return columnChunk.column.meta_data?.bloom_filter_offset; | ||
}); | ||
@@ -44,6 +47,6 @@ }; | ||
const parseBloomFilterOffsets = (ColumnChunkDataCollection) => { | ||
return ColumnChunkDataCollection.map((columnChunkData) => { | ||
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, path_in_schema: pathInSchema, }, }, rowGroupIndex, } = columnChunkData; | ||
return ColumnChunkDataCollection.map(({ rowGroupIndex, column }) => { | ||
const { bloom_filter_offset: bloomOffset, path_in_schema: pathInSchema, } = column.meta_data || {}; | ||
return { | ||
offsetBytes: toInteger(bloomFilterOffsetBuffer), | ||
offsetBytes: toInteger(bloomOffset.buffer), | ||
columnName: pathInSchema.join(","), | ||
@@ -50,0 +53,0 @@ rowGroupIndex, |
/// <reference types="node" /> | ||
import parquet_thrift from "../../gen-nodejs/parquet_types"; | ||
import SplitBlockBloomFilter from "../bloom/sbbf"; | ||
import { ColumnData, Offset, Block } from "../types/types"; | ||
declare type createSBBFParams = { | ||
import { Block } from "../declare"; | ||
import Int64 from 'node-int64'; | ||
export type createSBBFParams = { | ||
numFilterBytes?: number; | ||
falsePositiveRate?: number; | ||
numDistinct?: number; | ||
column?: any; | ||
}; | ||
export declare const createSBBF: (params: createSBBFParams) => SplitBlockBloomFilter; | ||
export declare const serializeFilterHeaders: (numberOfBytes: number) => Buffer; | ||
declare type serializeFilterDataParams = { | ||
type serializeFilterDataParams = { | ||
filterBlocks: Array<Block>; | ||
@@ -16,4 +19,4 @@ filterByteSize: number; | ||
export declare const serializeFilterData: (params: serializeFilterDataParams) => Buffer; | ||
export declare const setFilterOffset: (column: ColumnData, offset: Offset) => void; | ||
export declare const setFilterOffset: (column: parquet_thrift.ColumnChunk, offset: Int64) => void; | ||
export declare const getSerializedBloomFilterData: (splitBlockBloomFilter: InstanceType<typeof SplitBlockBloomFilter>) => Buffer; | ||
export {}; |
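createSBBF is now exported with the parameter shape shown above (numFilterBytes, falsePositiveRate, numDistinct, column). A hedged sketch of building a standalone split-block bloom filter; the deep import path and the insert/check methods on SplitBlockBloomFilter are assumptions, not part of this diff:

import { createSBBF } from '@dsnp/parquetjs/dist/lib/bloomFilterIO/bloomFilterWriter';

async function buildFilter(): Promise<boolean> {
  // Either size the filter directly (numFilterBytes) or let it be derived
  // from a target false-positive rate and an expected distinct-value count.
  const filter = createSBBF({ falsePositiveRate: 0.001, numDistinct: 100_000 });
  await filter.insert('apples');   // assumed SplitBlockBloomFilter API
  return filter.check('apples');   // resolves true: bloom filters have no false negatives
}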
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -65,3 +69,3 @@ if (k2 === undefined) k2 = k; | ||
const setFilterOffset = (column, offset) => { | ||
column.meta_data.bloom_filter_offset = offset; | ||
column.meta_data.bloom_filter_offset = parquet_util.cloneInteger(offset); | ||
}; | ||
@@ -68,0 +72,0 @@ exports.setFilterOffset = setFilterOffset; |
@@ -23,8 +23,8 @@ export namespace PARQUET_COMPRESSION_METHODS { | ||
export function inflate(method: any, value: any): Promise<any>; | ||
declare function deflate_identity(value: any): any; | ||
declare function inflate_identity(value: any): any; | ||
declare function deflate_identity(value: any): Buffer; | ||
declare function inflate_identity(value: any): Buffer; | ||
declare function deflate_gzip(value: any): Buffer; | ||
declare function inflate_gzip(value: any): Buffer; | ||
declare function deflate_snappy(value: any): ArrayBuffer | Uint8Array | Buffer; | ||
declare function inflate_snappy(value: any): ArrayBuffer | Uint8Array | Buffer; | ||
declare function deflate_snappy(value: any): Buffer; | ||
declare function inflate_snappy(value: any): Buffer; | ||
export {}; |
@@ -28,3 +28,3 @@ 'use strict'; | ||
function deflate_identity(value) { | ||
return value; | ||
return buffer_from_result(value); | ||
} | ||
@@ -35,3 +35,4 @@ function deflate_gzip(value) { | ||
function deflate_snappy(value) { | ||
return snappy.compress(value); | ||
const compressedValue = snappy.compress(value); | ||
return buffer_from_result(compressedValue); | ||
} | ||
@@ -48,3 +49,3 @@ /** | ||
function inflate_identity(value) { | ||
return value; | ||
return buffer_from_result(value); | ||
} | ||
@@ -55,6 +56,15 @@ function inflate_gzip(value) { | ||
function inflate_snappy(value) { | ||
return snappy.uncompress(value); | ||
const uncompressedValue = snappy.uncompress(value); | ||
return buffer_from_result(uncompressedValue); | ||
} | ||
function buffer_from_result(result) { | ||
if (Buffer.isBuffer(result)) { | ||
return result; | ||
} | ||
else { | ||
return Buffer.from(result); | ||
} | ||
} | ||
exports.PARQUET_COMPRESSION_METHODS = PARQUET_COMPRESSION_METHODS; | ||
exports.deflate = deflate; | ||
exports.inflate = inflate; |
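With buffer_from_result in place, both directions of every codec resolve to a Buffer even when the underlying library (snappy here) hands back an ArrayBuffer or Uint8Array. A small round-trip sketch, assuming the compression module can be deep-imported:

import { deflate, inflate } from '@dsnp/parquetjs/dist/lib/compression'; // import path is an assumption

async function roundTrip(): Promise<boolean> {
  const original = Buffer.from('hello parquet');
  const compressed = await deflate('SNAPPY', original);  // Promise<Buffer>
  const restored = await inflate('SNAPPY', compressed);  // Promise<Buffer>
  return Buffer.isBuffer(restored) && restored.equals(original); // true
}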
/// <reference types="node" /> | ||
import { Statistics } from "gen-nodejs/parquet_types"; | ||
import { Statistics } from "../gen-nodejs/parquet_types"; | ||
import { ParquetEnvelopeReader } from "./reader"; | ||
import { NewFileMetaData } from "./types/types"; | ||
import { FileMetaDataExt } from "./declare"; | ||
export interface BufferReaderOptions { | ||
@@ -10,3 +10,3 @@ maxSpan?: number; | ||
default_dictionary_size?: number; | ||
metadata?: NewFileMetaData; | ||
metadata?: FileMetaDataExt; | ||
rawStatistics?: Statistics; | ||
@@ -13,0 +13,0 @@ } |
@@ -19,3 +19,3 @@ "use strict"; | ||
} | ||
async read(offset, length) { | ||
read(offset, length) { | ||
if (!this.scheduled) { | ||
@@ -22,0 +22,0 @@ this.scheduled = true; |
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -6,0 +10,0 @@ if (k2 === undefined) k2 = k; |
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -25,6 +29,6 @@ if (k2 === undefined) k2 = k; | ||
const decodeValues = function (type, cursor, count, opts) { | ||
opts.bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0); | ||
const bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0); | ||
cursor.offset += 1; | ||
return rle.decodeValues(type, cursor, count, Object.assign({}, opts, { disableEnvelope: true })); | ||
return rle.decodeValues(type, cursor, count, Object.assign({}, opts, { disableEnvelope: true, bitWidth })); | ||
}; | ||
exports.decodeValues = decodeValues; |
/// <reference types="node" /> | ||
import { Cursor, Options } from "./types"; | ||
declare type ValidValueTypes = "BOOLEAN" | "INT32" | "INT64" | "INT96" | "FLOAT" | "DOUBLE" | "BYTE_ARRAY" | "FIXED_LEN_BYTE_ARRAY"; | ||
type ValidValueTypes = "BOOLEAN" | "INT32" | "INT64" | "INT96" | "FLOAT" | "DOUBLE" | "BYTE_ARRAY" | "FIXED_LEN_BYTE_ARRAY"; | ||
export declare const encodeValues: (type: ValidValueTypes | string, values: Array<unknown>, opts: Options) => Buffer; | ||
export declare const decodeValues: (type: ValidValueTypes | string, cursor: Cursor, count: number, opts: Options) => number[] | boolean[] | bigint[] | Buffer[]; | ||
export {}; |
@@ -27,29 +27,96 @@ "use strict"; | ||
} | ||
function encodeValues_INT32(values) { | ||
function encodeValues_INT32(values, opts) { | ||
const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL'; | ||
const scale = opts?.scale || 0; | ||
let buf = Buffer.alloc(4 * values.length); | ||
for (let i = 0; i < values.length; i++) { | ||
buf.writeInt32LE(values[i], i * 4); | ||
if (isDecimal) { | ||
buf.writeInt32LE(values[i] * Math.pow(10, scale), i * 4); | ||
} | ||
else { | ||
buf.writeInt32LE(values[i], i * 4); | ||
} | ||
} | ||
return buf; | ||
} | ||
function decodeValues_INT32(cursor, count) { | ||
function decodeValues_INT32(cursor, count, opts) { | ||
let values = []; | ||
for (let i = 0; i < count; ++i) { | ||
values.push(cursor.buffer.readInt32LE(cursor.offset)); | ||
cursor.offset += 4; | ||
const name = opts.name || opts.column?.name || undefined; | ||
try { | ||
if (opts.originalType === 'DECIMAL') { | ||
values = decodeValues_DECIMAL(cursor, count, opts); | ||
} | ||
else { | ||
for (let i = 0; i < count; ++i) { | ||
values.push(cursor.buffer.readInt32LE(cursor.offset)); | ||
cursor.offset += 4; | ||
} | ||
} | ||
} | ||
catch (e) { | ||
console.log(`Error thrown for column: ${name}`); | ||
throw e; | ||
} | ||
return values; | ||
} | ||
function encodeValues_INT64(values) { | ||
function encodeValues_INT64(values, opts) { | ||
const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL'; | ||
const scale = opts?.scale || 0; | ||
let buf = Buffer.alloc(8 * values.length); | ||
for (let i = 0; i < values.length; i++) { | ||
buf.writeBigInt64LE(BigInt(values[i]), i * 8); | ||
if (isDecimal) { | ||
buf.writeBigInt64LE(BigInt(Math.floor(values[i] * Math.pow(10, scale))), i * 8); | ||
} | ||
else { | ||
buf.writeBigInt64LE(BigInt(values[i]), i * 8); | ||
} | ||
} | ||
return buf; | ||
} | ||
function decodeValues_INT64(cursor, count) { | ||
function decodeValues_INT64(cursor, count, opts) { | ||
let values = []; | ||
const name = opts.name || opts.column?.name || undefined; | ||
try { | ||
if (opts.originalType === 'DECIMAL' || opts.column?.originalType === 'DECIMAL') { | ||
let columnOptions = opts.column?.originalType ? opts.column : opts; | ||
values = decodeValues_DECIMAL(cursor, count, columnOptions); | ||
} | ||
else { | ||
for (let i = 0; i < count; ++i) { | ||
values.push(cursor.buffer.readBigInt64LE(cursor.offset)); | ||
cursor.offset += 8; | ||
} | ||
} | ||
} | ||
catch (e) { | ||
console.log(`Error thrown for column: ${name}`); | ||
throw e; | ||
} | ||
return values; | ||
} | ||
function decodeValues_DECIMAL(cursor, count, opts) { | ||
const precision = opts.precision; | ||
// Default scale to 0 per spec | ||
const scale = opts.scale || 0; | ||
const name = opts.name || undefined; | ||
if (!precision) { | ||
throw `missing option: precision (required for DECIMAL) for column: ${name}`; | ||
} | ||
let values = []; | ||
// by default we prepare the offset and bufferFunction to work with 32bit integers | ||
let offset = 4; | ||
let bufferFunction = (offset) => cursor.buffer.readInt32LE(offset); | ||
if (precision > 9) { | ||
// if the precision is over 9 digits, then we are dealing with a 64bit integer | ||
offset = 8; | ||
bufferFunction = (offset) => cursor.buffer.readBigInt64LE(offset); | ||
} | ||
for (let i = 0; i < count; ++i) { | ||
values.push(cursor.buffer.readBigInt64LE(cursor.offset)); | ||
cursor.offset += 8; | ||
const bufferSize = cursor.size || 0; | ||
if (bufferSize === 0 || cursor.offset < bufferSize) { | ||
const fullValue = bufferFunction(cursor.offset); | ||
const valueWithDecimalApplied = Number(fullValue) / Math.pow(10, scale); | ||
values.push(valueWithDecimalApplied); | ||
cursor.offset += offset; | ||
} | ||
} | ||
@@ -173,5 +240,5 @@ return values; | ||
case "INT32": | ||
return encodeValues_INT32(values); | ||
return encodeValues_INT32(values, opts); | ||
case "INT64": | ||
return encodeValues_INT64(values); | ||
return encodeValues_INT64(values, opts); | ||
case "INT96": | ||
@@ -197,5 +264,5 @@ return encodeValues_INT96(values); | ||
case "INT32": | ||
return decodeValues_INT32(cursor, count); | ||
return decodeValues_INT32(cursor, count, opts); | ||
case "INT64": | ||
return decodeValues_INT64(cursor, count); | ||
return decodeValues_INT64(cursor, count, opts); | ||
case "INT96": | ||
@@ -202,0 +269,0 @@ return decodeValues_INT96(cursor, count); |
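The DECIMAL paths above store values as unscaled integers: encoding multiplies by 10^scale, decoding divides by it, and precision decides whether the unscaled value is read as a 32-bit (precision <= 9) or 64-bit (precision <= 18) integer. A worked example with numbers chosen to be exact in floating point:

// { precision: 9, scale: 2 } — the logical value 1.25 travels as the integer 125.
const scale = 2;
const logical = 1.25;
const stored = logical * Math.pow(10, scale);   // 125, written via writeInt32LE when precision <= 9
const restored = stored / Math.pow(10, scale);  // 1.25, mirroring decodeValues_DECIMAL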
/// <reference types="node" /> | ||
import { Cursor, Options } from './types'; | ||
export declare const encodeValues: (type: string, values: Array<number>, opts: Options) => Buffer; | ||
export declare const decodeValues: (_: string, cursor: Cursor, count: number, opts: Options) => any[]; | ||
import { Cursor } from './types'; | ||
export declare const encodeValues: (type: string, values: Array<number>, opts: { | ||
bitWidth: number; | ||
disableEnvelope?: boolean; | ||
}) => Buffer; | ||
export declare const decodeValues: (_: string, cursor: Cursor, count: number, opts: { | ||
bitWidth: number; | ||
disableEnvelope?: boolean; | ||
}) => any[]; |
"use strict"; | ||
// For questions about RLE encoding, see the spec: | ||
// | ||
// https://github.com/apache/parquet-format/blob/master/Encodings.md | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
@@ -25,5 +28,8 @@ return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
let buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8)); | ||
let remainingValue = value; | ||
// This is encoded LSB to MSB, so we pick off the least | ||
// significant byte and shift to get the next one. | ||
for (let i = 0; i < buf.length; ++i) { | ||
buf.writeUInt8(value & 0xff, i); | ||
value >> 8; | ||
buf.writeUInt8(remainingValue & 0xff, i); | ||
remainingValue = remainingValue >> 8; | ||
} | ||
@@ -111,6 +117,9 @@ return Buffer.concat([ | ||
function decodeRunRepeated(cursor, count, opts) { | ||
var bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8); | ||
let value = 0; | ||
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); ++i) { | ||
value << 8; | ||
value += cursor.buffer[cursor.offset]; | ||
for (let i = 0; i < bytesNeededForFixedBitWidth; ++i) { | ||
const byte = cursor.buffer[cursor.offset]; | ||
// Bytes are stored LSB to MSB, so we need to shift | ||
// each new byte appropriately. | ||
value += byte << (i * 8); | ||
cursor.offset += 1; | ||
@@ -117,0 +126,0 @@ } |
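The corrected run-length paths above keep repeated values in least-significant-byte-first order on both the encode and decode side. A worked example of the byte packing for bitWidth 16:

// Encoding the repeated value 0x0203 emits the bytes [0x03, 0x02];
// decodeRunRepeated rebuilds it as 0x03 + (0x02 << 8) = 0x0203.
const bitWidth = 16;
const value = 0x0203;
const bytes: number[] = [];
let remaining = value;
for (let i = 0; i < Math.ceil(bitWidth / 8); ++i) {
  bytes.push(remaining & 0xff); // least significant byte first
  remaining = remaining >> 8;
}
// bytes => [0x03, 0x02]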
/// <reference types="node" /> | ||
import { PrimitiveType } from "lib/types/types"; | ||
import { ParquetCodec, OriginalType, ParquetField } from "lib/types/types"; | ||
import { Statistics } from "gen-nodejs/parquet_types"; | ||
import { PrimitiveType } from "../declare"; | ||
import { ParquetCodec, OriginalType, ParquetField } from "../declare"; | ||
import { Statistics } from "../../gen-nodejs/parquet_types"; | ||
export interface Options { | ||
typeLength: number; | ||
bitWidth: number; | ||
disableEnvelope: boolean; | ||
disableEnvelope?: boolean; | ||
primitiveType?: PrimitiveType; | ||
@@ -21,2 +21,5 @@ originalType?: OriginalType; | ||
type?: string; | ||
name?: string; | ||
precision?: number; | ||
scale?: number; | ||
} | ||
@@ -23,0 +26,0 @@ export interface Cursor { |
@@ -0,5 +1,6 @@ | ||
/// <reference types="node" /> | ||
interface PARQUET_COMPRESSION_METHODS { | ||
[key: string]: { | ||
deflate: Function; | ||
inflate: Function; | ||
deflate: (value: any) => Buffer | Promise<Buffer>; | ||
inflate: (value: any) => Buffer | Promise<Buffer>; | ||
}; | ||
@@ -11,7 +12,7 @@ } | ||
*/ | ||
export declare function deflate(method: string, value: unknown): Promise<any>; | ||
export declare function deflate(method: string, value: unknown): Promise<Buffer>; | ||
/** | ||
* Inflate a value using compression method `method` | ||
*/ | ||
export declare function inflate(method: string, value: unknown): Promise<any>; | ||
export declare function inflate(method: string, value: unknown): Promise<Buffer>; | ||
export {}; |
@@ -40,3 +40,3 @@ "use strict"; | ||
function deflate_identity(value) { | ||
return value; | ||
return buffer_from_result(value); | ||
} | ||
@@ -47,3 +47,4 @@ function deflate_gzip(value) { | ||
function deflate_snappy(value) { | ||
return snappyjs_1.default.compress(value); | ||
const compressedValue = snappyjs_1.default.compress(value); | ||
return buffer_from_result(compressedValue); | ||
} | ||
@@ -69,10 +70,11 @@ async function deflate_brotli(value) { | ||
exports.inflate = inflate; | ||
function inflate_identity(value) { | ||
return value; | ||
async function inflate_identity(value) { | ||
return buffer_from_result(value); | ||
} | ||
function inflate_gzip(value) { | ||
async function inflate_gzip(value) { | ||
return zlib_1.default.gunzipSync(value); | ||
} | ||
function inflate_snappy(value) { | ||
return snappyjs_1.default.uncompress(value); | ||
const uncompressedValue = snappyjs_1.default.uncompress(value); | ||
return buffer_from_result(uncompressedValue); | ||
} | ||
@@ -83,1 +85,9 @@ async function inflate_brotli(value) { | ||
} | ||
function buffer_from_result(result) { | ||
if (Buffer.isBuffer(result)) { | ||
return result; | ||
} | ||
else { | ||
return Buffer.from(result); | ||
} | ||
} |
@@ -7,3 +7,3 @@ /// <reference types="node" /> | ||
import { BufferReaderOptions } from './bufferReader'; | ||
import { Parameter, ColumnData, RowGroup, PageData, ClientS3, ClientParameters, NewFileMetaData } from './types/types'; | ||
import { Parameter, PageData, ClientS3, ClientParameters, FileMetaDataExt, RowGroupExt, ColumnChunkExt } from './declare'; | ||
import { Options } from './codec/types'; | ||
@@ -14,3 +14,3 @@ /** | ||
declare class ParquetCursor { | ||
metadata: NewFileMetaData; | ||
metadata: FileMetaDataExt; | ||
envelopeReader: ParquetEnvelopeReader; | ||
@@ -21,2 +21,3 @@ schema: parquet_schema.ParquetSchema; | ||
rowGroupIndex: number; | ||
cursorIndex: number; | ||
/** | ||
@@ -28,3 +29,3 @@ * Create a new parquet reader from the file metadata and an envelope reader. | ||
*/ | ||
constructor(metadata: NewFileMetaData, envelopeReader: ParquetEnvelopeReader, schema: parquet_schema.ParquetSchema, columnList: Array<Array<unknown>>); | ||
constructor(metadata: FileMetaDataExt, envelopeReader: ParquetEnvelopeReader, schema: parquet_schema.ParquetSchema, columnList: Array<Array<unknown>>); | ||
/** | ||
@@ -36,3 +37,3 @@ * Retrieve the next row from the cursor. Returns a row or NULL if the end | ||
/** | ||
* Rewind the cursor the the beginning of the file | ||
* Rewind the cursor to the beginning of the file | ||
*/ | ||
@@ -50,3 +51,3 @@ rewind(): void; | ||
envelopeReader: ParquetEnvelopeReader | null; | ||
metadata: NewFileMetaData | null; | ||
metadata: FileMetaDataExt | null; | ||
schema: parquet_schema.ParquetSchema; | ||
@@ -57,4 +58,4 @@ /** | ||
*/ | ||
static openFile(filePath: string | Buffer | URL, options: BufferReaderOptions): Promise<ParquetReader>; | ||
static openBuffer(buffer: Buffer, options: BufferReaderOptions): Promise<ParquetReader>; | ||
static openFile(filePath: string | Buffer | URL, options?: BufferReaderOptions): Promise<ParquetReader>; | ||
static openBuffer(buffer: Buffer, options?: BufferReaderOptions): Promise<ParquetReader>; | ||
/** | ||
@@ -65,3 +66,3 @@ * Open the parquet file from S3 using the supplied aws client and params | ||
*/ | ||
static openS3(client: ClientS3, params: ClientParameters, options: BufferReaderOptions): Promise<ParquetReader>; | ||
static openS3(client: ClientS3, params: ClientParameters, options?: BufferReaderOptions): Promise<ParquetReader>; | ||
/** | ||
@@ -73,4 +74,4 @@ * Open the parquet file from a url using the supplied request module | ||
*/ | ||
static openUrl(params: Parameter, options: BufferReaderOptions): Promise<ParquetReader>; | ||
static openEnvelopeReader(envelopeReader: ParquetEnvelopeReader, opts: BufferReaderOptions): Promise<ParquetReader>; | ||
static openUrl(params: Parameter | URL | string, options?: BufferReaderOptions): Promise<ParquetReader>; | ||
static openEnvelopeReader(envelopeReader: ParquetEnvelopeReader, opts?: BufferReaderOptions): Promise<ParquetReader>; | ||
/** | ||
@@ -82,3 +83,3 @@ * Create a new parquet reader from the file metadata and an envelope reader. | ||
*/ | ||
constructor(metadata: NewFileMetaData, envelopeReader: ParquetEnvelopeReader, opts: BufferReaderOptions); | ||
constructor(metadata: FileMetaDataExt, envelopeReader: ParquetEnvelopeReader, opts?: BufferReaderOptions); | ||
/** | ||
@@ -94,3 +95,3 @@ * Support `for await` iterators on the reader object | ||
*/ | ||
[Symbol.asyncIterator](): AsyncGenerator<unknown, void, unknown>; | ||
[Symbol.asyncIterator](): AsyncGenerator<{}, void, unknown>; | ||
/** | ||
@@ -106,3 +107,7 @@ * Return a cursor to the file. You may open more than one cursor and use | ||
getCursor(columnList?: Array<Array<unknown>>): ParquetCursor; | ||
getBloomFiltersFor(columnNames: string[]): Promise<Record<string, unknown[]>>; | ||
getBloomFiltersFor(columnNames: string[]): Promise<Record<string, { | ||
sbbf: import("./bloom/sbbf").default; | ||
columnName: string; | ||
rowGroupIndex: number; | ||
}[]>>; | ||
/** | ||
@@ -121,3 +126,3 @@ * Return the number of rows in this file. Note that the number of rows is | ||
getMetadata(): Record<string, unknown>; | ||
exportMetadata(indent: string | number | undefined): string; | ||
exportMetadata(indent: string | number | undefined): Promise<string>; | ||
/** | ||
@@ -136,23 +141,23 @@ * Close this parquet reader. You MUST call this method once you're finished | ||
default_dictionary_size: number; | ||
metadata?: NewFileMetaData; | ||
metadata?: FileMetaDataExt; | ||
schema?: parquet_schema.ParquetSchema; | ||
static openFile(filePath: string | Buffer | URL, options: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
static openBuffer(buffer: Buffer, options: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
static openS3(client: ClientS3, params: ClientParameters, options: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
static openUrl(params: Parameter, options: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
constructor(readFn: (offset: number, length: number, file?: string) => Promise<Buffer>, closeFn: () => unknown, fileSize: Function | number, options: BufferReaderOptions, metadata?: NewFileMetaData); | ||
static openFile(filePath: string | Buffer | URL, options?: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
static openBuffer(buffer: Buffer, options?: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
static openS3(client: ClientS3, params: ClientParameters, options?: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
static openUrl(url: Parameter | URL | string, options?: BufferReaderOptions): Promise<ParquetEnvelopeReader>; | ||
constructor(readFn: (offset: number, length: number, file?: string) => Promise<Buffer>, closeFn: () => unknown, fileSize: Function | number, options?: BufferReaderOptions, metadata?: FileMetaDataExt); | ||
read(offset: number, length: number, file?: string): Promise<Buffer>; | ||
readHeader(): Promise<void>; | ||
getColumn(path: string | ColumnData, row_group: RowGroup | number | null): ColumnData; | ||
getAllColumnChunkDataFor(paths: Array<string>, row_groups?: Array<RowGroup>): { | ||
getColumn(path: string | parquet_thrift.ColumnChunk, row_group: RowGroupExt | number | string | null): ColumnChunkExt; | ||
getAllColumnChunkDataFor(paths: Array<string>, row_groups?: Array<RowGroupExt>): { | ||
rowGroupIndex: number; | ||
column: ColumnData; | ||
column: ColumnChunkExt; | ||
}[]; | ||
readOffsetIndex(path: string | ColumnData, row_group: RowGroup | number | null, opts: Options): Promise<parquet_thrift.OffsetIndex>; | ||
readColumnIndex(path: string | ColumnData, row_group: RowGroup | number, opts: Options): Promise<parquet_thrift.ColumnIndex>; | ||
readPage(column: ColumnData, page: parquet_thrift.PageLocation | number, records: Array<Record<string, unknown>>, opts: Options): Promise<Record<string, unknown>[]>; | ||
readRowGroup(schema: parquet_schema.ParquetSchema, rowGroup: RowGroup, columnList: Array<Array<unknown>>): Promise<parquet_shredder.RecordBuffer>; | ||
readColumnChunk(schema: parquet_schema.ParquetSchema, colChunk: ColumnData, opts?: Options): Promise<PageData>; | ||
readFooter(): Promise<NewFileMetaData>; | ||
readOffsetIndex(path: string | ColumnChunkExt, row_group: RowGroupExt | number | null, opts: Options): Promise<parquet_thrift.OffsetIndex>; | ||
readColumnIndex(path: string | ColumnChunkExt, row_group: RowGroupExt | number, opts: Options): Promise<parquet_thrift.ColumnIndex>; | ||
readPage(column: ColumnChunkExt, page: parquet_thrift.PageLocation | number, records: Array<Record<string, unknown>>, opts: Options): Promise<Record<string, unknown>[]>; | ||
readRowGroup(schema: parquet_schema.ParquetSchema, rowGroup: RowGroupExt, columnList: Array<Array<unknown>>): Promise<parquet_shredder.RecordBuffer>; | ||
readColumnChunk(schema: parquet_schema.ParquetSchema, colChunk: ColumnChunkExt, opts?: Options): Promise<PageData>; | ||
readFooter(): Promise<parquet_thrift.FileMetaData>; | ||
} | ||
export {}; |
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -37,3 +41,3 @@ if (k2 === undefined) k2 = k; | ||
const cross_fetch_1 = __importDefault(require("cross-fetch")); | ||
const types_1 = require("./types/types"); | ||
const declare_1 = require("./declare"); | ||
const { getBloomFiltersFor, } = bloomFilterReader; | ||
@@ -45,5 +49,5 @@ /** | ||
/** | ||
* Parquet File Format Version | ||
* Supported Parquet File Format Version for reading | ||
*/ | ||
const PARQUET_VERSION = 1; | ||
const PARQUET_VERSIONS = [1, 2]; | ||
/** | ||
@@ -64,2 +68,3 @@ * Internal type used for repetition/definition levels | ||
rowGroupIndex; | ||
cursorIndex; | ||
/** | ||
@@ -78,2 +83,3 @@ * Create a new parquet reader from the file metadata and an envelope reader. | ||
this.rowGroupIndex = 0; | ||
this.cursorIndex = 0; | ||
} | ||
@@ -85,3 +91,3 @@ /** | ||
async next() { | ||
if (this.rowGroup.length === 0) { | ||
if (this.cursorIndex >= this.rowGroup.length) { | ||
if (this.rowGroupIndex >= this.metadata.row_groups.length) { | ||
@@ -93,7 +99,8 @@ return null; | ||
this.rowGroupIndex++; | ||
this.cursorIndex = 0; | ||
} | ||
return this.rowGroup.shift(); | ||
return this.rowGroup[this.cursorIndex++]; | ||
} | ||
/** | ||
* Rewind the cursor the the beginning of the file | ||
* Rewind the cursor to the beginning of the file | ||
*/ | ||
@@ -103,2 +110,3 @@ rewind() { | ||
this.rowGroupIndex = 0; | ||
this.cursorIndex = 0; | ||
} | ||
@@ -150,3 +158,3 @@ } | ||
static async openEnvelopeReader(envelopeReader, opts) { | ||
if (opts && opts.metadata) { | ||
if (opts?.metadata) { | ||
return new ParquetReader(opts.metadata, envelopeReader, opts); | ||
@@ -172,3 +180,3 @@ } | ||
opts = opts || {}; | ||
if (metadata.version != PARQUET_VERSION) { | ||
if (!PARQUET_VERSIONS.includes(metadata.version)) { | ||
throw 'invalid parquet version'; | ||
@@ -194,7 +202,7 @@ } | ||
if (column.offsetIndex) { | ||
column.offsetIndex.page_locations.forEach(d => { | ||
Promise.resolve(column.offsetIndex).then(offset => (offset.page_locations.forEach(d => { | ||
if (Array.isArray(d)) { | ||
Object.setPrototypeOf(d, parquet_types_1.default.PageLocation.prototype); | ||
} | ||
}); | ||
}))); | ||
} | ||
@@ -288,3 +296,3 @@ }); | ||
} | ||
exportMetadata(indent) { | ||
async exportMetadata(indent) { | ||
function replacer(_key, value) { | ||
@@ -323,2 +331,14 @@ if (value instanceof parquet_types_1.default.PageLocation) { | ||
const metadata = Object.assign({}, this.metadata, { json: true }); | ||
for (let i = 0; i < metadata.row_groups.length; i++) { | ||
const rowGroup = metadata.row_groups[i]; | ||
for (let j = 0; j < rowGroup.columns.length; j++) { | ||
const column = rowGroup.columns[j]; | ||
if (column.offsetIndex instanceof Promise) { | ||
column.offsetIndex = await column.offsetIndex; | ||
} | ||
if (column.columnIndex instanceof Promise) { | ||
column.columnIndex = await column.columnIndex; | ||
} | ||
} | ||
} | ||
return JSON.stringify(metadata, replacer, indent); | ||
@@ -390,5 +410,10 @@ } | ||
} | ||
static async openUrl(params, options) { | ||
if (typeof params === 'string') | ||
params = { url: params }; | ||
static async openUrl(url, options) { | ||
let params; | ||
if (typeof url === 'string') | ||
params = { url }; | ||
else if (url instanceof URL) | ||
params = { url: url.toString() }; | ||
else | ||
params = url; | ||
if (!params.url) | ||
@@ -441,10 +466,14 @@ throw new Error('URL missing'); | ||
let column; | ||
if (!isNaN(row_group)) { | ||
row_group = this.metadata.row_groups[row_group]; | ||
let parsedRowGroup; | ||
if (!isNaN(Number(row_group))) { | ||
parsedRowGroup = this.metadata?.row_groups[Number(row_group)]; | ||
} | ||
else if (row_group instanceof parquet_types_1.default.RowGroup) { | ||
parsedRowGroup = row_group; | ||
} | ||
if (typeof path === 'string') { | ||
if (!row_group) { | ||
if (!parsedRowGroup) { | ||
throw `Missing RowGroup ${row_group}`; | ||
} | ||
column = row_group.columns.find(d => d.meta_data.path_in_schema.join(',') === path); | ||
column = parsedRowGroup.columns.find(d => d.meta_data.path_in_schema.join(',') === path); | ||
if (!column) { | ||
@@ -480,9 +509,5 @@ throw `Column ${path} Not Found`; | ||
Object.defineProperty(offset_index, 'column', { value: column, enumerable: false }); | ||
if (opts && opts.cache) { | ||
column.offsetIndex = offset_index; | ||
} | ||
return offset_index; | ||
}); | ||
if (opts && opts.cache) { | ||
//@ts-ignore | ||
if (opts?.cache) { | ||
column.offsetIndex = data; | ||
@@ -500,5 +525,5 @@ } | ||
} | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then((data) => { | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then((buf) => { | ||
let column_index = new parquet_types_1.default.ColumnIndex(); | ||
parquet_util.decodeThrift(column_index, data); | ||
parquet_util.decodeThrift(column_index, buf); | ||
Object.defineProperty(column_index, 'column', { value: column }); | ||
@@ -513,9 +538,5 @@ // decode the statistics values | ||
} | ||
if (opts && opts.cache) { | ||
column.columnIndex = column_index; | ||
} | ||
return column_index; | ||
}); | ||
if (opts && opts.cache) { | ||
//@ts-ignore | ||
if (opts?.cache) { | ||
column.columnIndex = data; | ||
@@ -532,9 +553,9 @@ } | ||
} | ||
column.meta_data.data_page_offset = page.offset; | ||
column.meta_data.total_compressed_size = page.compressed_page_size; | ||
column.meta_data.data_page_offset = parquet_util.cloneInteger(page.offset); | ||
column.meta_data.total_compressed_size = new node_int64_1.default(page.compressed_page_size); | ||
} | ||
else { | ||
const offsetIndex = column.offsetIndex || await this.readOffsetIndex(column, null, opts); | ||
column.meta_data.data_page_offset = offsetIndex.page_locations[page].offset; | ||
column.meta_data.total_compressed_size = offsetIndex.page_locations[page].compressed_page_size; | ||
const offsetIndex = await this.readOffsetIndex(column, null, opts); | ||
column.meta_data.data_page_offset = parquet_util.cloneInteger(offsetIndex.page_locations[page].offset); | ||
column.meta_data.total_compressed_size = new node_int64_1.default(offsetIndex.page_locations[page].compressed_page_size); | ||
} | ||
@@ -609,3 +630,3 @@ const chunk = await this.readColumnChunk(this.schema, column); | ||
let metadataBuf = await this.read(metadataOffset, metadataSize); | ||
let metadata = new types_1.NewFileMetaData(); | ||
let metadata = new parquet_types_1.default.FileMetaData(); | ||
parquet_util.decodeThrift(metadata, metadataBuf); | ||
@@ -656,3 +677,3 @@ return metadata; | ||
let page; | ||
const pageHeader = new types_1.NewPageHeader(); | ||
const pageHeader = new declare_1.NewPageHeader(); | ||
const headerOffset = cursor.offset; | ||
@@ -709,3 +730,6 @@ const headerSize = parquet_util.decodeThrift(pageHeader, cursor.buffer.slice(cursor.offset)); | ||
} | ||
if (opts.dictionary) { | ||
// It's possible to have a column chunk where some pages should use | ||
// the dictionary (PLAIN_DICTIONARY for example) and others should | ||
// not (PLAIN for example). | ||
if (opts.dictionary && pageData.useDictionary) { | ||
pageData.values = pageData.values.map(d => opts.dictionary[d]); | ||
@@ -788,3 +812,7 @@ } | ||
bitWidth: opts.column.typeLength, | ||
disableEnvelope: opts.column.disableEnvelope | ||
disableEnvelope: opts.column.disableEnvelope, | ||
originalType: opts.column.originalType, | ||
precision: opts.column.precision, | ||
scale: opts.column.scale, | ||
name: opts.column.name | ||
}); | ||
@@ -796,3 +824,4 @@ cursor.offset = cursorEnd; | ||
values: values, | ||
count: valueCount | ||
count: valueCount, | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' | ||
}; | ||
@@ -840,4 +869,4 @@ } | ||
let values = decodeValues(opts.type, valueEncoding, valuesBufCursor, valueCountNonNull, { | ||
typeLength: opts.column.typeLength, | ||
bitWidth: opts.column.typeLength | ||
bitWidth: opts.column.typeLength, | ||
...opts.column | ||
}); | ||
@@ -848,3 +877,4 @@ return { | ||
values: values, | ||
count: valueCount | ||
count: valueCount, | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' | ||
}; | ||
@@ -897,3 +927,5 @@ } | ||
optional: optional, | ||
repeated: repeated | ||
repeated: repeated, | ||
scale: schemaElement.scale, | ||
precision: schemaElement.precision | ||
}; | ||
@@ -908,5 +940,1 @@ } | ||
} | ||
module.exports = { | ||
ParquetEnvelopeReader, | ||
ParquetReader, | ||
}; |
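Taken together, the reader changes make the options argument optional, let openUrl accept a plain string or URL, and give getBloomFiltersFor a concrete return shape keyed by column name. A usage sketch, assuming the package's top-level exports and placeholder file and column names:

import { ParquetReader } from '@dsnp/parquetjs';

async function readRemote() {
  const reader = await ParquetReader.openUrl('https://example.com/fruits.parquet'); // string or URL now accepted
  for await (const record of reader) {
    // each record is one materialized row
  }
  // Filters come back grouped by column, one entry per row group that carries one.
  const filters = await reader.getBloomFiltersFor(['name']);
  // e.g. await filters['name']?.[0].sbbf.check('apples') — check() is assumed SBBF API
  await reader.close();
}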
@@ -1,2 +0,3 @@ | ||
import { SchemaDefinition, ParquetField } from './types/types'; | ||
import { SchemaDefinition, ParquetField } from './declare'; | ||
import { JSONSchema4 } from 'json-schema'; | ||
/** | ||
@@ -10,2 +11,6 @@ * A parquet file schema | ||
/** | ||
* Create a new schema from JSON Schema (json-schema.org) | ||
*/ | ||
static fromJsonSchema(jsonSchema: JSONSchema4): ParquetSchema; | ||
/** | ||
* Create a new schema from a JSON schema definition | ||
@@ -12,0 +17,0 @@ */ |
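fromJsonSchema accepts a draft-04 style JSON Schema and converts it into a ParquetSchema. A minimal sketch; how each JSON type maps onto a Parquet type is handled by lib/jsonSchema and is not shown in this diff:

import { ParquetSchema } from '@dsnp/parquetjs';

const schema = ParquetSchema.fromJsonSchema({
  type: 'object',
  properties: {
    name: { type: 'string' },
    quantity: { type: 'integer' },
  },
  required: ['name'], // properties left out of `required` are expected to become optional columns
});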
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -26,2 +30,3 @@ if (k2 === undefined) k2 = k; | ||
const parquet_types = __importStar(require("./types")); | ||
const jsonSchema_1 = require("./jsonSchema"); | ||
const PARQUET_COLUMN_KEY_SEPARATOR = '.'; | ||
@@ -36,2 +41,9 @@ /** | ||
/** | ||
* Create a new schema from JSON Schema (json-schema.org) | ||
*/ | ||
static fromJsonSchema(jsonSchema) { | ||
const schema = (0, jsonSchema_1.fromJsonSchema)(jsonSchema); | ||
return new ParquetSchema(schema); | ||
} | ||
/** | ||
* Create a new schema from a JSON schema definition | ||
@@ -95,2 +107,3 @@ */ | ||
let fieldList = {}; | ||
let fieldErrors = []; | ||
for (let name in schema) { | ||
@@ -132,5 +145,10 @@ const opts = schema[name]; | ||
} | ||
let nameWithPath = (`${name}` || 'missing name'); | ||
if (path && path.length > 0) { | ||
nameWithPath = `${path}.${nameWithPath}`; | ||
} | ||
const typeDef = opts.type ? parquet_types.PARQUET_LOGICAL_TYPES[opts.type] : undefined; | ||
if (!typeDef) { | ||
throw 'invalid parquet type: ' + (opts.type || "missing type"); | ||
fieldErrors.push(`Invalid parquet type: ${(opts.type || "missing type")}, for Column: ${nameWithPath}`); | ||
continue; | ||
} | ||
@@ -142,3 +160,3 @@ /* field encoding */ | ||
if (!(opts.encoding in parquet_codec)) { | ||
throw 'unsupported parquet encoding: ' + opts.encoding; | ||
fieldErrors.push(`Unsupported parquet encoding: ${opts.encoding}, for Column: ${nameWithPath}`); | ||
} | ||
@@ -149,4 +167,10 @@ if (!opts.compression) { | ||
if (!(opts.compression in parquet_compression.PARQUET_COMPRESSION_METHODS)) { | ||
throw 'unsupported compression method: ' + opts.compression; | ||
fieldErrors.push(`Unsupported compression method: ${opts.compression}, for Column: ${nameWithPath}`); | ||
} | ||
if (typeDef.originalType === 'DECIMAL') { | ||
// Default scale to 0 per https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal | ||
if (typeof opts.scale === "undefined") | ||
opts.scale = 0; | ||
fieldErrors = fieldErrors.concat(errorsForDecimalOpts(typeDef.originalType, opts, nameWithPath)); | ||
} | ||
/* add to schema */ | ||
@@ -162,2 +186,4 @@ fieldList[name] = { | ||
compression: opts.compression, | ||
precision: opts.precision, | ||
scale: opts.scale, | ||
typeLength: opts.typeLength || typeDef.typeLength, | ||
@@ -168,2 +194,5 @@ rLevelMax: rLevelMax, | ||
} | ||
if (fieldErrors.length > 0) { | ||
throw fieldErrors.reduce((accumulator, currentVal) => accumulator + '\n' + currentVal); | ||
} | ||
return fieldList; | ||
@@ -185,1 +214,24 @@ } | ||
} | ||
function errorsForDecimalOpts(type, opts, columnName) { | ||
const fieldErrors = []; | ||
if (opts.precision === undefined || opts.precision < 1) { | ||
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, precision is required and must be be greater than 0`); | ||
} | ||
else if (!Number.isInteger(opts.precision)) { | ||
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, precision must be an integer`); | ||
} | ||
else if (opts.precision > 18) { | ||
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, can not handle precision over 18`); | ||
} | ||
if (typeof opts.scale === "undefined" || opts.scale < 0) { | ||
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, scale is required to be 0 or greater`); | ||
} | ||
else if (!Number.isInteger(opts.scale)) { | ||
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, scale must be an integer`); | ||
} | ||
// Default precision to 18 if it is undefined as that is a different error | ||
else if (opts.scale > (opts.precision || 18)) { | ||
fieldErrors.push(`invalid schema or precision for type: ${type}, for Column: ${columnName}, precision must be greater than or equal to scale`); | ||
} | ||
return fieldErrors; | ||
} |
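Schema construction now collects field errors instead of throwing on the first one, and validates DECIMAL options as a group: precision is required and must be an integer between 1 and 18, while scale (defaulting to 0) must be a non-negative integer no larger than precision. A sketch of definitions that pass, with a failing variant left as a comment:

import { ParquetSchema } from '@dsnp/parquetjs';

const schema = new ParquetSchema({
  price: { type: 'DECIMAL', precision: 9, scale: 2 }, // valid: 0 <= scale <= precision <= 18
  tax: { type: 'DECIMAL', precision: 5 },             // valid: scale defaults to 0 per the spec
  // total: { type: 'DECIMAL', precision: 19 },       // would be reported via fieldErrors: precision over 18
});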
import { ParquetSchema } from './schema'; | ||
import { PageData } from './types/types'; | ||
import { Page, PageData } from './declare'; | ||
/** | ||
@@ -27,6 +27,6 @@ * 'Shred' a record into a list of <value, repetition_level, definition_level> | ||
export interface RecordBuffer { | ||
columnData: Record<string, PageData>; | ||
columnData?: Record<string, PageData>; | ||
rowCount?: number; | ||
pageRowCount?: number; | ||
pages?: Record<string, object>; | ||
pages?: Record<string, Page[]>; | ||
} | ||
@@ -54,2 +54,2 @@ export declare const shredRecord: (schema: ParquetSchema, record: Record<string, unknown>, buffer: RecordBuffer) => void; | ||
*/ | ||
export declare const materializeRecords: (schema: ParquetSchema, buffer: RecordBuffer, records?: Record<string, unknown>[] | undefined) => Record<string, unknown>[]; | ||
export declare const materializeRecords: (schema: ParquetSchema, buffer: RecordBuffer, records?: Array<Record<string, unknown>>) => Record<string, unknown>[]; |
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -6,0 +10,0 @@ if (k2 === undefined) k2 = k; |
@@ -1,5 +0,5 @@ | ||
import { PrimitiveType, OriginalType } from "./types/types"; | ||
interface PARQUET_LOGICAL_TYPES { | ||
[key: string]: { | ||
primitiveType: PrimitiveType; | ||
import { PrimitiveType, OriginalType, ParquetType } from "./declare"; | ||
type ParquetTypeData = { | ||
[Property in ParquetType]: { | ||
primitiveType?: PrimitiveType; | ||
toPrimitive: Function; | ||
@@ -10,4 +10,4 @@ fromPrimitive?: Function; | ||
}; | ||
} | ||
export declare const PARQUET_LOGICAL_TYPES: PARQUET_LOGICAL_TYPES; | ||
}; | ||
export declare const PARQUET_LOGICAL_TYPES: ParquetTypeData; | ||
/** | ||
@@ -14,0 +14,0 @@ * Convert a value from it's native representation to the internal/underlying |
'use strict'; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -23,2 +27,3 @@ if (k2 === undefined) k2 = k; | ||
exports.fromPrimitive = exports.toPrimitive = exports.PARQUET_LOGICAL_TYPES = void 0; | ||
// Thanks to https://github.com/kbajalc/parquets for some of the code. | ||
const BSON = __importStar(require("bson")); | ||
@@ -139,2 +144,7 @@ exports.PARQUET_LOGICAL_TYPES = { | ||
}, | ||
'DECIMAL': { | ||
primitiveType: 'INT64', | ||
originalType: 'DECIMAL', | ||
toPrimitive: toPrimitive_INT64 | ||
}, | ||
'JSON': { | ||
@@ -158,5 +168,21 @@ primitiveType: 'BYTE_ARRAY', | ||
fromPrimitive: fromPrimitive_INTERVAL | ||
}, | ||
MAP: { | ||
originalType: 'MAP', | ||
toPrimitive: toPrimitive_MAP, | ||
}, | ||
LIST: { | ||
originalType: 'LIST', | ||
toPrimitive: toPrimitive_LIST, | ||
} | ||
}; | ||
/** | ||
* Test if something is a valid Parquet Type | ||
* @param type the string of the type | ||
* @returns if type is a valid Parquet Type | ||
*/ | ||
function isParquetType(type) { | ||
return type !== undefined && (type in exports.PARQUET_LOGICAL_TYPES); | ||
} | ||
/** | ||
* Convert a value from it's native representation to the internal/underlying | ||
@@ -166,3 +192,3 @@ * primitive type | ||
function toPrimitive(type, value) { | ||
if (type === undefined || !(type in exports.PARQUET_LOGICAL_TYPES)) { | ||
if (!isParquetType(type)) { | ||
throw 'invalid type: ' + type || "undefined"; | ||
@@ -178,3 +204,3 @@ } | ||
function fromPrimitive(type, value) { | ||
if (type === undefined || !(type in exports.PARQUET_LOGICAL_TYPES)) { | ||
if (!isParquetType(type)) { | ||
throw 'invalid type: ' + type || "undefined"; | ||
@@ -289,2 +315,4 @@ } | ||
} | ||
const MIN_64 = BigInt('0x8000000000000000') * -1n; | ||
const MAX_64 = BigInt('0x7fffffffffffffff'); | ||
function toPrimitive_INT64(value) { | ||
@@ -295,3 +323,3 @@ try { | ||
v = BigInt(value); | ||
checkValidValue(-0x8000000000000000, 0x7fffffffffffffff, v); | ||
checkValidValue(MIN_64, MAX_64, v); | ||
return v; | ||
@@ -303,2 +331,3 @@ } | ||
} | ||
const MAX_U64 = BigInt('0xffffffffffffffff'); | ||
function toPrimitive_UINT64(value) { | ||
@@ -309,3 +338,3 @@ try { | ||
v = BigInt(value); | ||
checkValidValue(0, 0xffffffffffffffff, v); | ||
checkValidValue(0, MAX_U64, v); | ||
return v; | ||
@@ -317,2 +346,4 @@ } | ||
} | ||
const MIN_96 = BigInt('0x800000000000000000000000') * -1n; | ||
const MAX_96 = BigInt('0x7fffffffffffffffffffffff'); | ||
function toPrimitive_INT96(value) { | ||
@@ -323,3 +354,3 @@ try { | ||
v = BigInt(value); | ||
checkValidValue(-0x800000000000000000000000, 0x7fffffffffffffffffffffff, v); | ||
checkValidValue(MIN_96, MAX_96, v); | ||
return v; | ||
@@ -331,2 +362,8 @@ } | ||
} | ||
function toPrimitive_MAP(value) { | ||
return value; | ||
} | ||
function toPrimitive_LIST(value) { | ||
return value; | ||
} | ||
function toPrimitive_BYTE_ARRAY(value) { | ||
@@ -353,16 +390,27 @@ return Buffer.from(value); | ||
} | ||
function toPrimitive_TIME_MILLIS(value) { | ||
let v = value; | ||
if (typeof value === `string`) { | ||
v = parseInt(value, 10); | ||
function toNumberInternal(typeName, value) { | ||
let numberValue = 0; | ||
switch (typeof value) { | ||
case "string": | ||
numberValue = parseInt(value, 10); | ||
break; | ||
case "number": | ||
numberValue = value; | ||
break; | ||
default: | ||
throw `${typeName} has an invalid type: ${typeof value}`; | ||
} | ||
if (v < 0 || v > 0xffffffffffffffff || typeof v !== 'number') { | ||
throw 'invalid value for TIME_MILLIS: ' + value; | ||
// Year 2255 bug. Should eventually switch to bigint | ||
if (numberValue < 0 || numberValue >= Number.MAX_SAFE_INTEGER) { | ||
throw `${typeName} value is out of bounds: ${numberValue}`; | ||
} | ||
return v; | ||
return numberValue; | ||
} | ||
function toPrimitive_TIME_MILLIS(value) { | ||
return toNumberInternal("TIME_MILLIS", value); | ||
} | ||
function toPrimitive_TIME_MICROS(value) { | ||
const v = BigInt(value); | ||
if (v < 0n) { | ||
throw 'invalid value for TIME_MICROS: ' + value; | ||
throw 'TIME_MICROS value is out of bounds: ' + value; | ||
} | ||
@@ -377,11 +425,3 @@ return v; | ||
} | ||
/* convert from integer */ | ||
let v = value; | ||
if (typeof value === 'string') { | ||
v = parseInt(value, 10); | ||
} | ||
if (v < 0 || typeof v !== 'number') { | ||
throw 'invalid value for DATE: ' + value; | ||
} | ||
return v; | ||
return toNumberInternal("DATE", value); | ||
} | ||
@@ -396,11 +436,3 @@ function fromPrimitive_DATE(value) { | ||
} | ||
/* convert from integer */ | ||
let v = value; | ||
if (typeof value === 'string') { | ||
v = parseInt(value, 10); | ||
} | ||
if (v < 0 || typeof v !== 'number') { | ||
throw 'invalid value for TIMESTAMP_MILLIS: ' + value; | ||
} | ||
return v; | ||
return toNumberInternal("TIMESTAMP_MILLIS", value); | ||
} | ||
@@ -416,12 +448,18 @@ function fromPrimitive_TIMESTAMP_MILLIS(value) { | ||
/* convert from integer */ | ||
{ | ||
try { | ||
// Will throw if NaN | ||
const v = BigInt(value); | ||
if (v < 0n /*|| isNaN(v)*/) { | ||
throw 'invalid value for TIMESTAMP_MICROS: ' + value; | ||
if (v < 0n) { | ||
throw 'out of bounds'; | ||
} | ||
return v; | ||
} | ||
catch (e) { | ||
throw 'TIMESTAMP_MICROS value is out of bounds: ' + value; | ||
} | ||
} | ||
function fromPrimitive_TIMESTAMP_MICROS(value) { | ||
return typeof value === 'bigint' ? new Date(Number(value / 1000n)) : new Date(value / 1000); | ||
if (typeof value === 'bigint') | ||
return new Date(Number(value / 1000n)); | ||
return new Date(value / 1000); | ||
} | ||
@@ -428,0 +466,0 @@ function toPrimitive_INTERVAL(value) { |
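The bounds above moved from numeric literals to BigInt('0x…') strings; a plain 64-bit hex literal is not representable as a double and silently rounds before BigInt ever sees it, which the string form avoids. A quick check of that difference:

const MAX_64 = BigInt('0x7fffffffffffffff');  // 9223372036854775807n, exact
const rounded = BigInt(0x7fffffffffffffff);   // 9223372036854775808n: the number literal rounds up first
console.log(MAX_64 === rounded);              // false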
/// <reference types="node" /> | ||
/// <reference types="node" /> | ||
/// <reference types="node-int64" /> | ||
import thrift from "thrift"; | ||
import fs, { WriteStream } from 'fs'; | ||
import * as parquet_thrift from '../gen-nodejs/parquet_types'; | ||
import { NewFileMetaData } from './types/types'; | ||
declare type Enums = typeof parquet_thrift.Encoding | typeof parquet_thrift.FieldRepetitionType | typeof parquet_thrift.Type | typeof parquet_thrift.CompressionCodec | typeof parquet_thrift.PageType | typeof parquet_thrift.ConvertedType; | ||
declare type ThriftObject = NewFileMetaData | parquet_thrift.PageHeader | parquet_thrift.BloomFilterHeader | parquet_thrift.OffsetIndex | parquet_thrift.ColumnIndex | NewFileMetaData; /** Patch PageLocation to be three element array that has getters/setters | ||
* for each of the properties (offset, compressed_page_size, first_row_index) | ||
* This saves space considerably as we do not need to store the full variable | ||
* names for every PageLocation | ||
*/ | ||
import { FileMetaDataExt, WriterOptions } from './declare'; | ||
import { Int64 } from "thrift"; | ||
export type WriteStreamMinimal = Pick<WriteStream, "write" | "end">; | ||
type Enums = typeof parquet_thrift.Encoding | typeof parquet_thrift.FieldRepetitionType | typeof parquet_thrift.Type | typeof parquet_thrift.CompressionCodec | typeof parquet_thrift.PageType | typeof parquet_thrift.ConvertedType; | ||
type ThriftObject = FileMetaDataExt | parquet_thrift.PageHeader | parquet_thrift.ColumnMetaData | parquet_thrift.BloomFilterHeader | parquet_thrift.OffsetIndex | parquet_thrift.ColumnIndex | FileMetaDataExt; | ||
/** | ||
* Helper function that serializes a thrift object into a buffer | ||
*/ | ||
export declare const serializeThrift: (obj: parquet_thrift.BloomFilterHeader) => Buffer; | ||
export declare const decodeThrift: (obj: ThriftObject, buf: Buffer, offset?: number | undefined) => number; | ||
export declare const serializeThrift: (obj: ThriftObject) => Buffer; | ||
export declare const decodeThrift: (obj: ThriftObject, buf: Buffer, offset?: number) => number; | ||
/** | ||
@@ -28,6 +29,7 @@ * Get the number of bits required to store a given value | ||
export declare const fclose: (fd: number) => Promise<unknown>; | ||
export declare const oswrite: (os: WriteStream, buf: Buffer) => Promise<unknown>; | ||
export declare const osend: (os: WriteStream) => Promise<unknown>; | ||
export declare const osopen: (path: string | Buffer | URL, opts: string) => Promise<unknown>; | ||
export declare const oswrite: (os: WriteStreamMinimal, buf: Buffer) => Promise<unknown>; | ||
export declare const osend: (os: WriteStreamMinimal) => Promise<unknown>; | ||
export declare const osopen: (path: string | Buffer | URL, opts?: WriterOptions) => Promise<WriteStream>; | ||
export declare const fieldIndexOf: (arr: Array<Array<unknown>>, elem: Array<unknown>) => number; | ||
export declare const cloneInteger: (int: Int64) => thrift.Int64; | ||
export {}; |
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -25,6 +29,7 @@ if (k2 === undefined) k2 = k; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.fieldIndexOf = exports.osopen = exports.osend = exports.oswrite = exports.fclose = exports.fread = exports.fstat = exports.fopen = exports.getThriftEnum = exports.getBitWidth = exports.decodeThrift = exports.serializeThrift = void 0; | ||
exports.cloneInteger = exports.fieldIndexOf = exports.osopen = exports.osend = exports.oswrite = exports.fclose = exports.fread = exports.fstat = exports.fopen = exports.getThriftEnum = exports.getBitWidth = exports.decodeThrift = exports.serializeThrift = void 0; | ||
const thrift_1 = __importDefault(require("thrift")); | ||
const fs_1 = __importDefault(require("fs")); | ||
const parquet_thrift = __importStar(require("../gen-nodejs/parquet_types")); | ||
const thrift_2 = require("thrift"); | ||
/** | ||
@@ -44,2 +49,7 @@ * We need to patch Thrift's TFramedTransport class bc the TS type definitions | ||
} | ||
/** Patch PageLocation to be three element array that has getters/setters | ||
* for each of the properties (offset, compressed_page_size, first_row_index) | ||
* This saves space considerably as we do not need to store the full variable | ||
* names for every PageLocation | ||
*/ | ||
const getterSetter = (index) => ({ | ||
@@ -214,1 +224,5 @@ get: function () { return this[index]; }, | ||
exports.fieldIndexOf = fieldIndexOf; | ||
const cloneInteger = (int) => { | ||
return new thrift_2.Int64(int.valueOf()); | ||
}; | ||
exports.cloneInteger = cloneInteger; |
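cloneInteger returns a fresh thrift Int64 carrying the same value, so callers such as setFilterOffset and readPage can assign offsets into metadata without aliasing the source object. A small sketch; the deep import path for the util module is an assumption:

import { Int64 } from 'thrift';
import { cloneInteger } from '@dsnp/parquetjs/dist/lib/util';

const original = new Int64(42);
const copy = cloneInteger(original);
console.log(copy.valueOf() === original.valueOf()); // true: same numeric value
console.log(copy === original);                     // false: distinct object, safe to mutate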
@@ -0,49 +1,11 @@ | ||
/// <reference types="node" /> | ||
/// <reference types="node" /> | ||
import stream from 'stream'; | ||
import * as parquet_shredder from './shred'; | ||
import * as parquet_util from './util'; | ||
import { WriterOptions, RowGroupExt } from './declare'; | ||
import { ParquetSchema } from './schema'; | ||
import Int64 from 'node-int64'; | ||
import SplitBlockBloomFilter from './bloom/sbbf'; | ||
/** | ||
* Create a parquet file from a schema and a number of row groups. This class | ||
* performs direct, unbuffered writes to the underlying output stream and is | ||
* intendend for advanced and internal users; the writeXXX methods must be | ||
* called in the correct order to produce a valid file. | ||
*/ | ||
export class ParquetEnvelopeWriter { | ||
/** | ||
* Create a new parquet envelope writer that writes to the specified stream | ||
*/ | ||
static openStream(schema: any, outputStream: any, opts: any): Promise<ParquetEnvelopeWriter>; | ||
constructor(schema: any, writeFn: any, closeFn: any, fileOffset: any, opts: any); | ||
schema: any; | ||
write: any; | ||
close: any; | ||
offset: any; | ||
rowCount: number; | ||
rowGroups: any[]; | ||
pageSize: any; | ||
useDataPageV2: any; | ||
pageIndex: any; | ||
bloomFilters: {}; | ||
writeSection(buf: any): any; | ||
/** | ||
* Encode the parquet file header | ||
*/ | ||
writeHeader(): any; | ||
/** | ||
* Encode a parquet row group. The records object should be created using the | ||
* shredRecord method | ||
*/ | ||
writeRowGroup(records: any): Promise<any>; | ||
writeBloomFilters(_rowGroups: any): void; | ||
/** | ||
* Write the columnIndices and offsetIndices | ||
*/ | ||
writeIndex(_rowGroups: any): void; | ||
/** | ||
* Write the parquet file footer | ||
*/ | ||
writeFooter(userMetadata: any, schema: any, rowCount: any, rowGroups: any): any; | ||
/** | ||
* Set the parquet data page size. The data page size controls the maximum | ||
* number of column values that are written to disk as a consecutive array | ||
*/ | ||
setPageSize(cnt: any): void; | ||
} | ||
/** | ||
* Write a parquet file to an output stream. The ParquetWriter will perform | ||
@@ -53,3 +15,9 @@ * buffering/batching for performance, so close() must be called after all rows | ||
*/ | ||
export class ParquetWriter { | ||
export declare class ParquetWriter { | ||
schema: ParquetSchema; | ||
envelopeWriter: ParquetEnvelopeWriter | null; | ||
rowBuffer: parquet_shredder.RecordBuffer; | ||
rowGroupSize: number; | ||
closed: boolean; | ||
userMetadata: Record<string, string>; | ||
/** | ||
@@ -59,3 +27,3 @@ * Convenience method to create a new buffered parquet writer that writes to | ||
*/ | ||
static openFile(schema: any, path: any, opts: any): Promise<ParquetWriter>; | ||
static openFile(schema: ParquetSchema, path: string | Buffer | URL, opts?: WriterOptions): Promise<ParquetWriter>; | ||
/** | ||
@@ -65,13 +33,7 @@ * Convenience method to create a new buffered parquet writer that writes to | ||
*/ | ||
static openStream(schema: any, outputStream: any, opts: any): Promise<ParquetWriter>; | ||
static openStream(schema: ParquetSchema, outputStream: parquet_util.WriteStreamMinimal, opts?: WriterOptions): Promise<ParquetWriter>; | ||
/** | ||
* Create a new buffered parquet writer for a given envelope writer | ||
*/ | ||
constructor(schema: any, envelopeWriter: any, opts: any); | ||
schema: any; | ||
envelopeWriter: any; | ||
rowBuffer: {}; | ||
rowGroupSize: any; | ||
closed: boolean; | ||
userMetadata: {}; | ||
constructor(schema: ParquetSchema, envelopeWriter: ParquetEnvelopeWriter, opts?: WriterOptions); | ||
/** | ||
@@ -81,3 +43,3 @@ * Append a single row to the parquet file. Rows are buffered in memory until | ||
*/ | ||
appendRow(row: any): Promise<void>; | ||
appendRow(row: Record<string, unknown>): Promise<void>; | ||
/** | ||
@@ -89,7 +51,7 @@ * Finish writing the parquet file and commit the footer to disk. This method | ||
*/ | ||
close(callback: any): Promise<void>; | ||
close(callback?: Function): Promise<void>; | ||
/** | ||
* Add key<>value metadata to the file | ||
*/ | ||
setMetadata(key: any, value: any): void; | ||
setMetadata(key: string, value: string): void; | ||
/** | ||
@@ -101,3 +63,3 @@ * Set the parquet row group size. This values controls the maximum number | ||
*/ | ||
setRowGroupSize(cnt: any): void; | ||
setRowGroupSize(cnt: number): void; | ||
/** | ||
@@ -107,11 +69,59 @@ * Set the parquet data page size. The data page size controls the maximum | ||
*/ | ||
setPageSize(cnt: any): void; | ||
setPageSize(cnt: number): void; | ||
} | ||
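Given the typed signatures now declared for ParquetWriter, a minimal usage sketch might look like the following; the file name, schema fields, and row values are illustrative and not taken from the package:

```js
const parquet = require('@dsnp/parquetjs');

const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },
  quantity: { type: 'INT64' },
});

async function writeExample() {
  // openFile(schema, path, opts?) per the declaration above
  const writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet');
  writer.setMetadata('source', 'example'); // key and value are both strings
  await writer.appendRow({ name: 'apple', quantity: 10 });
  await writer.close(); // flushes buffered rows and writes the footer
}

writeExample().catch(console.error);
```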
/** | ||
* Create a parquet file from a schema and a number of row groups. This class | ||
* performs direct, unbuffered writes to the underlying output stream and is | ||
* intended for advanced and internal users; the writeXXX methods must be | ||
* called in the correct order to produce a valid file. | ||
*/ | ||
export declare class ParquetEnvelopeWriter { | ||
schema: ParquetSchema; | ||
write: Function; | ||
close: Function; | ||
offset: Int64; | ||
rowCount: Int64; | ||
rowGroups: RowGroupExt[]; | ||
pageSize: number; | ||
useDataPageV2: boolean; | ||
pageIndex: boolean; | ||
bloomFilters: Record<string, SplitBlockBloomFilter>; | ||
/** | ||
* Create a new parquet envelope writer that writes to the specified stream | ||
*/ | ||
static openStream(schema: ParquetSchema, outputStream: parquet_util.WriteStreamMinimal, opts: WriterOptions): Promise<ParquetEnvelopeWriter>; | ||
constructor(schema: ParquetSchema, writeFn: Function, closeFn: Function, fileOffset: Int64, opts: WriterOptions); | ||
writeSection(buf: Buffer): any; | ||
/** | ||
* Encode the parquet file header | ||
*/ | ||
writeHeader(): any; | ||
/** | ||
* Encode a parquet row group. The records object should be created using the | ||
* shredRecord method | ||
*/ | ||
writeRowGroup(records: parquet_shredder.RecordBuffer): Promise<any>; | ||
writeBloomFilters(): void; | ||
/** | ||
* Write the columnIndices and offsetIndices | ||
*/ | ||
writeIndex(): void; | ||
/** | ||
* Write the parquet file footer | ||
*/ | ||
writeFooter(userMetadata: Record<string, string>): any; | ||
/** | ||
* Set the parquet data page size. The data page size controls the maximum | ||
* number of column values that are written to disk as a consecutive array | ||
*/ | ||
setPageSize(cnt: number): void; | ||
} | ||
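The class comment says the writeXXX methods must be called in the correct order; based on the declared methods (and on how ParquetWriter.close drives the envelope writer later in this diff), a hedged sketch of that order follows. The recordBuffers argument is assumed to hold already-shredded parquet_shredder.RecordBuffer objects:

```js
const parquet = require('@dsnp/parquetjs');

// Sketch of the call order only; shredding and encoding details are omitted.
async function writeEnvelope(schema, outputStream, recordBuffers, userMetadata = {}) {
  const envelope = await parquet.ParquetEnvelopeWriter.openStream(schema, outputStream, {});
  await envelope.writeHeader();              // file magic and header first
  for (const buffer of recordBuffers) {
    await envelope.writeRowGroup(buffer);    // one call per row group
  }
  envelope.writeBloomFilters();              // after all row groups are written
  envelope.writeIndex();                     // column and offset indices
  await envelope.writeFooter(userMetadata);  // footer goes last
  await envelope.close();
}
```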
/** | ||
* Create a parquet transform stream | ||
*/ | ||
export class ParquetTransformer extends stream.Transform { | ||
constructor(schema: any, opts?: {}); | ||
export declare class ParquetTransformer extends stream.Transform { | ||
writer: ParquetWriter; | ||
constructor(schema: ParquetSchema, opts?: {}); | ||
_transform(row: Record<string, unknown>, _encoding: string, callback: Function): void; | ||
_flush(callback: (foo: any, bar?: any) => any): void; | ||
} | ||
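ParquetTransformer is declared as a stream.Transform (and, later in this diff, constructed in object mode), so a hedged piping sketch could look like this; the schema and rows are illustrative:

```js
const { Readable, pipeline } = require('stream');
const fs = require('fs');
const parquet = require('@dsnp/parquetjs');

const schema = new parquet.ParquetSchema({ name: { type: 'UTF8' } });
const rows = Readable.from([{ name: 'apple' }, { name: 'pear' }]);

pipeline(
  rows,                                    // object-mode source of row objects
  new parquet.ParquetTransformer(schema),  // emits encoded parquet bytes
  fs.createWriteStream('fruits.parquet'),
  (err) => { if (err) console.error(err); },
);
```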
import stream = require("stream"); |
@@ -1,11 +0,39 @@ | ||
'use strict'; | ||
const stream = require('stream'); | ||
const parquet_thrift = require('../gen-nodejs/parquet_types'); | ||
const parquet_shredder = require('./shred'); | ||
const parquet_util = require('./util'); | ||
const parquet_codec = require('./codec'); | ||
const parquet_compression = require('./compression'); | ||
const parquet_types = require('./types'); | ||
const bloomFilterWriter = require("./bloomFilterIO/bloomFilterWriter"); | ||
const Long = require('long'); | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetTransformer = exports.ParquetEnvelopeWriter = exports.ParquetWriter = void 0; | ||
const stream_1 = __importDefault(require("stream")); | ||
const parquet_types_1 = __importStar(require("../gen-nodejs/parquet_types")); | ||
const parquet_shredder = __importStar(require("./shred")); | ||
const parquet_util = __importStar(require("./util")); | ||
const parquet_codec = __importStar(require("./codec")); | ||
const parquet_compression = __importStar(require("./compression")); | ||
const parquet_types = __importStar(require("./types")); | ||
const bloomFilterWriter = __importStar(require("./bloomFilterIO/bloomFilterWriter")); | ||
const node_int64_1 = __importDefault(require("node-int64")); | ||
/** | ||
@@ -35,2 +63,8 @@ * Parquet File Magic String | ||
class ParquetWriter { | ||
schema; | ||
envelopeWriter; | ||
rowBuffer; | ||
rowGroupSize; | ||
closed; | ||
userMetadata; | ||
/** | ||
@@ -78,3 +112,3 @@ * Convenience method to create a new buffered parquet writer that writes to | ||
async appendRow(row) { | ||
if (this.closed) { | ||
if (this.closed || this.envelopeWriter === null) { | ||
throw 'writer was closed'; | ||
@@ -107,12 +141,14 @@ } | ||
this.closed = true; | ||
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
if (this.envelopeWriter) { | ||
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
await this.envelopeWriter.writeBloomFilters(); | ||
await this.envelopeWriter.writeIndex(); | ||
await this.envelopeWriter.writeFooter(this.userMetadata); | ||
await this.envelopeWriter.close(); | ||
this.envelopeWriter = null; | ||
} | ||
await this.envelopeWriter.writeBloomFilters(); | ||
await this.envelopeWriter.writeIndex(); | ||
await this.envelopeWriter.writeFooter(this.userMetadata); | ||
await this.envelopeWriter.close(); | ||
this.envelopeWriter = null; | ||
if (callback) { | ||
@@ -142,12 +178,23 @@ callback(); | ||
setPageSize(cnt) { | ||
this.writer.setPageSize(cnt); | ||
this.envelopeWriter.setPageSize(cnt); | ||
} | ||
} | ||
exports.ParquetWriter = ParquetWriter; | ||
/** | ||
* Create a parquet file from a schema and a number of row groups. This class | ||
* performs direct, unbuffered writes to the underlying output stream and is | ||
* intendend for advanced and internal users; the writeXXX methods must be | ||
* intended for advanced and internal users; the writeXXX methods must be | ||
* called in the correct order to produce a valid file. | ||
*/ | ||
class ParquetEnvelopeWriter { | ||
schema; | ||
write; | ||
close; | ||
offset; | ||
rowCount; | ||
rowGroups; | ||
pageSize; | ||
useDataPageV2; | ||
pageIndex; | ||
bloomFilters; // TODO: OR filterCollection | ||
/** | ||
@@ -159,3 +206,3 @@ * Create a new parquet envelope writer that writes to the specified stream | ||
let closeFn = parquet_util.osend.bind(undefined, outputStream); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, new node_int64_1.default(0), opts); | ||
} | ||
@@ -167,3 +214,3 @@ constructor(schema, writeFn, closeFn, fileOffset, opts) { | ||
this.offset = fileOffset; | ||
this.rowCount = 0; | ||
this.rowCount = new node_int64_1.default(0); | ||
this.rowGroups = []; | ||
@@ -179,3 +226,3 @@ this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE; | ||
writeSection(buf) { | ||
this.offset += buf.length; | ||
this.offset.setValue(this.offset.valueOf() + buf.length); | ||
return this.write(buf); | ||
@@ -200,12 +247,11 @@ } | ||
}); | ||
this.rowCount += records.rowCount; | ||
this.rowCount.setValue(this.rowCount.valueOf() + records.rowCount); | ||
this.rowGroups.push(rgroup.metadata); | ||
return this.writeSection(rgroup.body); | ||
} | ||
writeBloomFilters(_rowGroups) { | ||
let rowGroups = _rowGroups || this.rowGroups; | ||
rowGroups.forEach(group => { | ||
writeBloomFilters() { | ||
this.rowGroups.forEach(group => { | ||
group.columns.forEach(column => { | ||
const columnName = column.meta_data.path_in_schema[0]; | ||
if (columnName in this.bloomFilters === false) | ||
const columnName = column.meta_data?.path_in_schema[0]; | ||
if (!columnName || columnName in this.bloomFilters === false) | ||
return; | ||
@@ -221,20 +267,19 @@ const serializedBloomFilterData = bloomFilterWriter.getSerializedBloomFilterData(this.bloomFilters[columnName]); | ||
*/ | ||
writeIndex(_rowGroups) { | ||
let rowGroups = _rowGroups || this.rowGroups; | ||
writeIndex() { | ||
this.schema.fieldList.forEach((c, i) => { | ||
rowGroups.forEach(group => { | ||
this.rowGroups.forEach(group => { | ||
let column = group.columns[i]; | ||
if (!column) | ||
return; | ||
if (column.meta_data.columnIndex) { | ||
if (column.meta_data?.columnIndex) { | ||
let columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex); | ||
delete column.meta_data.columnIndex; | ||
column.column_index_offset = this.offset; | ||
column.column_index_offset = parquet_util.cloneInteger(this.offset); | ||
column.column_index_length = columnBody.length; | ||
this.writeSection(columnBody); | ||
} | ||
if (column.meta_data.offsetIndex) { | ||
if (column.meta_data?.offsetIndex) { | ||
let offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex); | ||
delete column.meta_data.offsetIndex; | ||
column.offset_index_offset = this.offset; | ||
column.offset_index_offset = parquet_util.cloneInteger(this.offset); | ||
column.offset_index_length = offsetBody.length; | ||
@@ -249,3 +294,3 @@ this.writeSection(offsetBody); | ||
*/ | ||
writeFooter(userMetadata, schema, rowCount, rowGroups) { | ||
writeFooter(userMetadata) { | ||
if (!userMetadata) { | ||
@@ -268,6 +313,8 @@ userMetadata = {}; | ||
} | ||
exports.ParquetEnvelopeWriter = ParquetEnvelopeWriter; | ||
/** | ||
* Create a parquet transform stream | ||
*/ | ||
class ParquetTransformer extends stream.Transform { | ||
class ParquetTransformer extends stream_1.default.Transform { | ||
writer; | ||
constructor(schema, opts = {}) { | ||
@@ -280,9 +327,9 @@ super({ objectMode: true }); | ||
})(this); | ||
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, 0, opts), opts); | ||
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, new node_int64_1.default(0), opts), opts); | ||
} | ||
_transform(row, encoding, callback) { | ||
_transform(row, _encoding, callback) { | ||
if (row) { | ||
this.writer.appendRow(row).then(data => callback(null, data), err => { | ||
const fullErr = new Error(`Error transforming to parquet: ${err.toString()} row:${row}`); | ||
fullErr.origErr = err; | ||
fullErr.message = err; | ||
callback(fullErr); | ||
@@ -300,2 +347,3 @@ }); | ||
} | ||
exports.ParquetTransformer = ParquetTransformer; | ||
/** | ||
@@ -328,3 +376,3 @@ * Encode a consecutive array of data using one of the parquet encodings | ||
statistics.min = statistics.min_value; | ||
return new parquet_thrift.Statistics(statistics); | ||
return new parquet_types_1.default.Statistics(statistics); | ||
} | ||
@@ -340,3 +388,3 @@ async function encodePages(schema, rowBuffer, opts) { | ||
let page; | ||
const values = rowBuffer.columnData[field.path]; | ||
const values = rowBuffer.columnData[field.path.join(',')]; | ||
if (opts.bloomFilters && (field.name in opts.bloomFilters)) { | ||
@@ -346,3 +394,3 @@ const splitBlockBloomFilter = opts.bloomFilters[field.name]; | ||
} | ||
let statistics; | ||
let statistics = {}; | ||
if (field.statistics !== false) { | ||
@@ -358,4 +406,4 @@ statistics = {}; | ||
}); | ||
statistics.null_count = values.dlevels.length - values.values.length; | ||
statistics.distinct_count = values.distinct_values.size; | ||
statistics.null_count = new node_int64_1.default(values.dlevels.length - values.values.length); | ||
statistics.distinct_count = new node_int64_1.default(values.distinct_values.size); | ||
} | ||
@@ -366,5 +414,5 @@ if (opts.useDataPageV2) { | ||
else { | ||
page = await encodeDataPage(field, values.values, values.rlevels, values.dlevels, statistics); | ||
page = await encodeDataPage(field, values.values || [], values.rlevels || [], values.dlevels || [], statistics); | ||
} | ||
let pages = rowBuffer.pages[field.path]; | ||
let pages = rowBuffer.pages[field.path.join(',')]; | ||
let lastPage = pages[pages.length - 1]; | ||
@@ -393,4 +441,4 @@ let first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0; | ||
let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
typeLength: column.typeLength, | ||
bitWidth: column.typeLength | ||
bitWidth: column.typeLength, | ||
...column | ||
}); | ||
@@ -409,7 +457,7 @@ /* encode repetition and definition levels */ | ||
pageBody = await parquet_compression.deflate(column.compression, pageBody); | ||
let pageHeader = new parquet_thrift.PageHeader(); | ||
pageHeader.type = parquet_thrift.PageType['DATA_PAGE']; | ||
let pageHeader = new parquet_types_1.default.PageHeader(); | ||
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE']; | ||
pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; | ||
pageHeader.compressed_page_size = pageBody.length; | ||
pageHeader.data_page_header = new parquet_thrift.DataPageHeader(); | ||
pageHeader.data_page_header = new parquet_types_1.default.DataPageHeader(); | ||
pageHeader.data_page_header.num_values = dlevels.length; | ||
@@ -419,7 +467,7 @@ if (column.statistics !== false) { | ||
} | ||
pageHeader.data_page_header.encoding = parquet_thrift.Encoding[column.encoding]; | ||
pageHeader.data_page_header.encoding = parquet_types_1.default.Encoding[column.encoding]; | ||
pageHeader.data_page_header.definition_level_encoding = | ||
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING]; | ||
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
pageHeader.data_page_header.repetition_level_encoding = | ||
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING]; | ||
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
/* concat page header, repetition and definition levels and values */ | ||
@@ -434,4 +482,4 @@ return Buffer.concat([parquet_util.serializeThrift(pageHeader), pageBody]); | ||
let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
typeLength: column.typeLength, | ||
bitWidth: column.typeLength | ||
bitWidth: column.typeLength, | ||
...column, | ||
}); | ||
@@ -455,5 +503,5 @@ let valuesBufCompressed = await parquet_compression.deflate(column.compression, valuesBuf); | ||
/* build page header */ | ||
let pageHeader = new parquet_thrift.PageHeader(); | ||
pageHeader.type = parquet_thrift.PageType['DATA_PAGE_V2']; | ||
pageHeader.data_page_header_v2 = new parquet_thrift.DataPageHeaderV2(); | ||
let pageHeader = new parquet_types_1.default.PageHeader(); | ||
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE_V2']; | ||
pageHeader.data_page_header_v2 = new parquet_types_1.default.DataPageHeaderV2(); | ||
pageHeader.data_page_header_v2.num_values = dlevels.length; | ||
@@ -469,3 +517,3 @@ pageHeader.data_page_header_v2.num_nulls = dlevels.length - values.length; | ||
rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length; | ||
pageHeader.data_page_header_v2.encoding = parquet_thrift.Encoding[column.encoding]; | ||
pageHeader.data_page_header_v2.encoding = parquet_types_1.default.Encoding[column.encoding]; | ||
pageHeader.data_page_header_v2.definition_levels_byte_length = dLevelsBuf.length; | ||
@@ -491,16 +539,19 @@ pageHeader.data_page_header_v2.repetition_levels_byte_length = rLevelsBuf.length; | ||
/* prepare metadata header */ | ||
let metadata = new parquet_thrift.ColumnMetaData(); | ||
let metadata = new parquet_types_1.default.ColumnMetaData(); | ||
metadata.path_in_schema = opts.column.path; | ||
metadata.num_values = num_values; | ||
metadata.data_page_offset = opts.baseOffset; | ||
metadata.num_values = new node_int64_1.default(num_values); | ||
metadata.data_page_offset = new node_int64_1.default(opts.baseOffset); | ||
metadata.encodings = []; | ||
metadata.total_uncompressed_size = pagesBuf.length; | ||
metadata.total_compressed_size = pagesBuf.length; | ||
metadata.type = parquet_thrift.Type[opts.column.primitiveType]; | ||
metadata.codec = await parquet_thrift.CompressionCodec[opts.column.compression]; | ||
metadata.total_uncompressed_size = new node_int64_1.default(pagesBuf.length); | ||
metadata.total_compressed_size = new node_int64_1.default(pagesBuf.length); | ||
metadata.type = parquet_types_1.default.Type[opts.column.primitiveType]; | ||
metadata.codec = await parquet_types_1.default.CompressionCodec[opts.column.compression]; | ||
/* compile statistics ColumnIndex and OffsetIndex*/ | ||
let columnIndex = new parquet_thrift.ColumnIndex(); | ||
let columnIndex = new parquet_types_1.default.ColumnIndex(); | ||
columnIndex.null_pages = []; | ||
columnIndex.max_values = []; | ||
columnIndex.min_values = []; | ||
let offsetIndex = new parquet_thrift.OffsetIndex(); | ||
// Default to unordered | ||
columnIndex.boundary_order = 0; | ||
let offsetIndex = new parquet_types_1.default.OffsetIndex(); | ||
offsetIndex.page_locations = []; | ||
@@ -510,7 +561,7 @@ /* prepare statistics */ | ||
let distinct_values = new Set(); | ||
statistics.null_count = 0; | ||
statistics.distinct_count = 0; | ||
statistics.null_count = new node_int64_1.default(0); | ||
statistics.distinct_count = new node_int64_1.default(0); | ||
/* loop through pages and update indices and statistics */ | ||
for (let i = 0; i < pages.length; i++) { | ||
let page = pages[i]; | ||
const page = pages[i]; | ||
if (opts.column.statistics !== false) { | ||
@@ -523,12 +574,14 @@ if (page.statistics.max_value > statistics.max_value || i == 0) { | ||
} | ||
statistics.null_count += page.statistics.null_count; | ||
page.distinct_values.forEach(value => distinct_values.add(value)); | ||
statistics.null_count.setValue(statistics.null_count.valueOf() + (page.statistics.null_count?.valueOf() || 0)); | ||
page.distinct_values.forEach((value) => distinct_values.add(value)); | ||
// If the number of values and the count of nulls are the same, this is a null page | ||
columnIndex.null_pages.push(page.num_values === statistics.null_count.valueOf()); | ||
columnIndex.max_values.push(encodeStatisticsValue(page.statistics.max_value, opts.column)); | ||
columnIndex.min_values.push(encodeStatisticsValue(page.statistics.min_value, opts.column)); | ||
} | ||
let pageLocation = new parquet_thrift.PageLocation(); | ||
pageLocation.offset = offset; | ||
let pageLocation = new parquet_types_1.default.PageLocation(); | ||
pageLocation.offset = new node_int64_1.default(offset); | ||
offset += page.page.length; | ||
pageLocation.compressed_page_size = page.page.length; | ||
pageLocation.first_row_index = page.first_row_index; | ||
pageLocation.first_row_index = new node_int64_1.default(page.first_row_index); | ||
offsetIndex.page_locations.push(pageLocation); | ||
@@ -540,3 +593,3 @@ } | ||
if (opts.column.statistics !== false) { | ||
statistics.distinct_count = distinct_values.size; | ||
statistics.distinct_count = new node_int64_1.default(distinct_values.size); | ||
metadata.statistics = encodeStatistics(statistics, opts.column); | ||
@@ -548,8 +601,4 @@ if (opts.pageIndex !== false) { | ||
/* list encodings */ | ||
let encodingsSet = {}; | ||
encodingsSet[PARQUET_RDLVL_ENCODING] = true; | ||
encodingsSet[opts.column.encoding] = true; | ||
for (let k in encodingsSet) { | ||
metadata.encodings.push(parquet_thrift.Encoding[k]); | ||
} | ||
metadata.encodings.push(parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]); | ||
metadata.encodings.push(parquet_types_1.default.Encoding[opts.column.encoding]); | ||
/* concat metadata header and data pages */ | ||
@@ -564,6 +613,6 @@ let metadataOffset = opts.baseOffset + pagesBuf.length; | ||
async function encodeRowGroup(schema, data, opts) { | ||
let metadata = new parquet_thrift.RowGroup(); | ||
metadata.num_rows = data.rowCount; | ||
let metadata = new parquet_types_1.default.RowGroup(); | ||
metadata.num_rows = new node_int64_1.default(data.rowCount); | ||
metadata.columns = []; | ||
metadata.total_byte_size = 0; | ||
metadata.total_byte_size = new node_int64_1.default(0); | ||
let body = Buffer.alloc(0); | ||
@@ -574,16 +623,15 @@ for (let field of schema.fieldList) { | ||
} | ||
let cchunkData = await encodeColumnChunk(data.pages[field.path], { | ||
let cchunkData = await encodeColumnChunk(data.pages[field.path.join(',')], { | ||
column: field, | ||
baseOffset: opts.baseOffset + body.length, | ||
pageSize: opts.pageSize, | ||
encoding: field.encoding, | ||
rowCount: data.rowCount, | ||
useDataPageV2: opts.useDataPageV2, | ||
pageIndex: opts.pageIndex | ||
baseOffset: opts.baseOffset.valueOf() + body.length, | ||
pageSize: opts.pageSize || 0, | ||
rowCount: data.rowCount || 0, | ||
useDataPageV2: opts.useDataPageV2 ?? true, | ||
pageIndex: opts.pageIndex ?? true | ||
}); | ||
let cchunk = new parquet_thrift.ColumnChunk(); | ||
cchunk.file_offset = cchunkData.metadataOffset; | ||
let cchunk = new parquet_types_1.default.ColumnChunk(); | ||
cchunk.file_offset = new node_int64_1.default(cchunkData.metadataOffset); | ||
cchunk.meta_data = cchunkData.metadata; | ||
metadata.columns.push(cchunk); | ||
metadata.total_byte_size += cchunkData.body.length; | ||
metadata.total_byte_size = new node_int64_1.default(metadata.total_byte_size.valueOf() + (cchunkData.body.length)); | ||
body = Buffer.concat([body, cchunkData.body]); | ||
@@ -597,3 +645,3 @@ } | ||
function encodeFooter(schema, rowCount, rowGroups, userMetadata) { | ||
let metadata = new parquet_thrift.FileMetaData(); | ||
let metadata = new parquet_types_1.default.FileMetaData(); | ||
metadata.version = PARQUET_VERSION; | ||
@@ -606,3 +654,3 @@ metadata.created_by = '@dsnp/parquetjs'; | ||
for (let k in userMetadata) { | ||
let kv = new parquet_thrift.KeyValue(); | ||
let kv = new parquet_types_1.default.KeyValue(); | ||
kv.key = k; | ||
@@ -613,3 +661,3 @@ kv.value = userMetadata[k]; | ||
{ | ||
let schemaRoot = new parquet_thrift.SchemaElement(); | ||
let schemaRoot = new parquet_types_1.default.SchemaElement(); | ||
schemaRoot.name = 'root'; | ||
@@ -620,5 +668,5 @@ schemaRoot.num_children = Object.keys(schema.fields).length; | ||
for (let field of schema.fieldList) { | ||
let schemaElem = new parquet_thrift.SchemaElement(); | ||
let schemaElem = new parquet_types_1.default.SchemaElement(); | ||
schemaElem.name = field.name; | ||
schemaElem.repetition_type = parquet_thrift.FieldRepetitionType[field.repetitionType]; | ||
schemaElem.repetition_type = parquet_types_1.default.FieldRepetitionType[field.repetitionType]; | ||
if (field.isNested) { | ||
@@ -628,7 +676,14 @@ schemaElem.num_children = field.fieldCount; | ||
else { | ||
schemaElem.type = parquet_thrift.Type[field.primitiveType]; | ||
schemaElem.type = parquet_types_1.default.Type[field.primitiveType]; | ||
} | ||
if (field.originalType) { | ||
schemaElem.converted_type = parquet_thrift.ConvertedType[field.originalType]; | ||
schemaElem.converted_type = parquet_types_1.default.ConvertedType[field.originalType]; | ||
} | ||
// Support Decimal | ||
switch (schemaElem.converted_type) { | ||
case (parquet_types_1.ConvertedType.DECIMAL): | ||
schemaElem.precision = field.precision; | ||
schemaElem.scale = field.scale || 0; | ||
break; | ||
} | ||
schemaElem.type_length = field.typeLength; | ||
@@ -644,6 +699,1 @@ metadata.schema.push(schemaElem); | ||
} | ||
module.exports = { | ||
ParquetEnvelopeWriter, | ||
ParquetWriter, | ||
ParquetTransformer | ||
}; |
@@ -5,2 +5,9 @@ import * as reader from './lib/reader'; | ||
import * as shredder from './lib/shred'; | ||
import * as fields from './lib/fields'; | ||
export type ParquetEnvelopeReader = reader.ParquetEnvelopeReader; | ||
export type ParquetReader = reader.ParquetReader; | ||
export type ParquetEnvelopeWriter = writer.ParquetEnvelopeWriter; | ||
export type ParquetWriter = writer.ParquetWriter; | ||
export type ParquetTransformer = writer.ParquetTransformer; | ||
export type ParquetSchema = schema.ParquetSchema; | ||
export declare const ParquetEnvelopeReader: typeof reader.ParquetEnvelopeReader; | ||
@@ -13,1 +20,13 @@ export declare const ParquetReader: typeof reader.ParquetReader; | ||
export declare const ParquetShredder: typeof shredder; | ||
export declare const ParquetFieldBuilder: typeof fields; | ||
declare const _default: { | ||
ParquetEnvelopeReader: typeof reader.ParquetEnvelopeReader; | ||
ParquetReader: typeof reader.ParquetReader; | ||
ParquetEnvelopeWriter: typeof writer.ParquetEnvelopeWriter; | ||
ParquetWriter: typeof writer.ParquetWriter; | ||
ParquetTransformer: typeof writer.ParquetTransformer; | ||
ParquetSchema: typeof schema.ParquetSchema; | ||
ParquetShredder: typeof shredder; | ||
ParquetFieldBuilder: typeof fields; | ||
}; | ||
export default _default; |
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
var desc = Object.getOwnPropertyDescriptor(m, k); | ||
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { | ||
desc = { enumerable: true, get: function() { return m[k]; } }; | ||
} | ||
Object.defineProperty(o, k2, desc); | ||
}) : (function(o, m, k, k2) { | ||
@@ -22,3 +26,3 @@ if (k2 === undefined) k2 = k; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetShredder = exports.ParquetSchema = exports.ParquetTransformer = exports.ParquetWriter = exports.ParquetEnvelopeWriter = exports.ParquetReader = exports.ParquetEnvelopeReader = void 0; | ||
exports.ParquetFieldBuilder = exports.ParquetShredder = exports.ParquetSchema = exports.ParquetTransformer = exports.ParquetWriter = exports.ParquetEnvelopeWriter = exports.ParquetReader = exports.ParquetEnvelopeReader = void 0; | ||
const reader = __importStar(require("./lib/reader")); | ||
@@ -28,2 +32,3 @@ const writer = __importStar(require("./lib/writer")); | ||
const shredder = __importStar(require("./lib/shred")); | ||
const fields = __importStar(require("./lib/fields")); | ||
exports.ParquetEnvelopeReader = reader.ParquetEnvelopeReader; | ||
@@ -36,1 +41,12 @@ exports.ParquetReader = reader.ParquetReader; | ||
exports.ParquetShredder = shredder; | ||
exports.ParquetFieldBuilder = fields; | ||
exports.default = { | ||
ParquetEnvelopeReader: exports.ParquetEnvelopeReader, | ||
ParquetReader: exports.ParquetReader, | ||
ParquetEnvelopeWriter: exports.ParquetEnvelopeWriter, | ||
ParquetWriter: exports.ParquetWriter, | ||
ParquetTransformer: exports.ParquetTransformer, | ||
ParquetSchema: exports.ParquetSchema, | ||
ParquetShredder: exports.ParquetShredder, | ||
ParquetFieldBuilder: exports.ParquetFieldBuilder, | ||
}; |
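With the default export added above, both import styles should resolve to the same classes; a sketch (exact behavior depends on your bundler or TypeScript interop settings):

```js
import parquetjs from '@dsnp/parquetjs';
import { ParquetSchema, ParquetWriter } from '@dsnp/parquetjs';

console.log(parquetjs.ParquetWriter === ParquetWriter); // expected: true
const schema = new ParquetSchema({ name: { type: 'UTF8' } });
console.log(schema instanceof parquetjs.ParquetSchema); // expected: true
```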
@@ -6,3 +6,3 @@ { | ||
"types": "dist/parquet.d.ts", | ||
"version": "0.0.0-33c80f", | ||
"version": "0.0.0-43732c", | ||
"homepage": "https://github.com/LibertyDSNP/parquetjs", | ||
@@ -19,46 +19,41 @@ "license": "MIT", | ||
"dependencies": { | ||
"@types/varint": "^6.0.0", | ||
"@types/long": "^4.0.2", | ||
"@types/node-int64": "^0.4.29", | ||
"@types/thrift": "^0.10.11", | ||
"browserify-zlib": "^0.2.0", | ||
"bson": "4.4.0", | ||
"bson": "4.6.3", | ||
"cross-fetch": "^3.1.4", | ||
"int53": "^0.2.4", | ||
"long": "^4.0.0", | ||
"snappyjs": "^0.6.0", | ||
"thrift": "0.14.1", | ||
"varint": "^5.0.0", | ||
"snappyjs": "^0.6.1", | ||
"thrift": "0.16.0", | ||
"varint": "^6.0.0", | ||
"wasm-brotli": "^2.0.2", | ||
"xxhash-wasm": "^0.4.1" | ||
"xxhash-wasm": "^1.0.2" | ||
}, | ||
"devDependencies": { | ||
"@babel/core": "^7.14.6", | ||
"@babel/preset-env": "^7.14.7", | ||
"@babel/preset-typescript": "^7.14.5", | ||
"@types/bson": "^4.0.3", | ||
"@types/chai": "^4.2.16", | ||
"@types/long": "^4.0.1", | ||
"@types/mocha": "^8.2.2", | ||
"@types/node": "^14.14.35", | ||
"@types/sinon": "^10.0.0", | ||
"@types/thrift": "^0.10.10", | ||
"@types/bson": "^4.2.0", | ||
"@types/chai": "^4.3.5", | ||
"@types/json-schema": "^7.0.11", | ||
"@types/mocha": "^10.0.1", | ||
"@types/node": "^16.18.32", | ||
"@types/sinon": "^10.0.15", | ||
"@types/varint": "^6.0.1", | ||
"assert": "^2.0.0", | ||
"babel-loader": "^8.2.2", | ||
"babel-plugin-add-module-exports": "^1.0.4", | ||
"browserfs": "^1.4.3", | ||
"buffer": "^6.0.3", | ||
"chai": "4.3.4", | ||
"core-js": "^3.15.1", | ||
"esbuild": "^0.14.38", | ||
"mocha": "9.2.2", | ||
"msw": "^0.39.2", | ||
"object-stream": "0.0.1", | ||
"chai": "4.3.6", | ||
"core-js": "^3.22.5", | ||
"esbuild": "^0.14.47", | ||
"mocha": "^10.2.0", | ||
"msw": "^1.2.1", | ||
"object-stream": "^0.0.1", | ||
"process": "^0.11.10", | ||
"regenerator-runtime": "^0.13.7", | ||
"sinon": "^10.0.0", | ||
"regenerator-runtime": "^0.13.11", | ||
"sinon": "^15.1.0", | ||
"sinon-chai": "^3.7.0", | ||
"sinon-chai-in-order": "^0.1.0", | ||
"source-map-loader": "^3.0.0", | ||
"stream-browserify": "^3.0.0", | ||
"ts-loader": "^9.2.3", | ||
"ts-node": "^9.1.1", | ||
"typescript": "^4.5.2" | ||
"ts-node": "^10.9.1", | ||
"typescript": "^5.0.4" | ||
}, | ||
@@ -72,3 +67,4 @@ "scripts": { | ||
"lint": "echo 'Linting, it is on the TODO list...'", | ||
"test": "mocha -r ts-node/register 'test/**/*.{js,ts}'", | ||
"test": "mocha -r ts-node/register 'test/{,!(browser)/**}/*.{js,ts}'", | ||
"test:only": "mocha -r ts-node/register", | ||
"clean": "rm -Rf ./dist", | ||
@@ -90,3 +86,3 @@ "prepublishOnly": "npm run clean && npm run build:node && npm run build:types && npm run build:browser", | ||
"engines": { | ||
"node": ">=14.16.0" | ||
"node": ">=16.15.1" | ||
}, | ||
@@ -93,0 +89,0 @@ "files": [ |
@@ -30,4 +30,4 @@ # parquet.js | ||
### NodeJS | ||
To use with nodejs: | ||
### NodeJS | ||
To use with nodejs: | ||
```javascript | ||
@@ -37,13 +37,43 @@ import parquetjs from "@dsnp/parquetjs" | ||
### Browser | ||
To use in a browser, in your bundler, depending on your needs, write the appropriate plugin or resolver to point to: | ||
### Browser with Bundler | ||
To use in a browser with a bundler, depending on your needs, write the appropriate plugin or resolver to point to either the Common JS or ES Module version: | ||
```javascript | ||
"node_modules/@dsnp/parquetjs/browser/parquetjs" | ||
// Common JS | ||
"node_modules/@dsnp/parquetjs/dist/browser/parquetjs.cjs" | ||
// ES Modules | ||
"node_modules/@dsnp/parquetjs/dist/browser/parquetjs.esm" | ||
``` | ||
or: | ||
```javascript | ||
import parquetjs from "@dsnp/parquetjs/browser/parquetjs" | ||
// Common JS | ||
import parquetjs from "@dsnp/parquetjs/dist/browser/parquetjs.cjs" | ||
// ES Modules | ||
import parquetjs from "@dsnp/parquetjs/dist/browser/parquetjs.esm" | ||
``` | ||
### Browser Direct: ES Modules | ||
To use directly in the browser without a bundler using ES Modules: | ||
1. Build the package: `npm install && npm run build:browser` | ||
2. Copy `dist/browser/parquetjs.esm.js` to the server | ||
3. Use it in your html or other ES Modules: | ||
```html | ||
<script type="module"> | ||
import parquetjs from '../parquetjs.esm.js'; | ||
// Use parquetjs | ||
</script> | ||
``` | ||
### Browser Direct: Plain Ol' JavaScript | ||
To use directly in the browser without a bundler or ES Modules: | ||
1. Build the package: `npm install && npm run build:browser` | ||
2. Copy `dist/browser/parquetjs.js` to the server | ||
3. Use the global `parquetjs` variable to access parquetjs functions | ||
```html | ||
<script> | ||
// console.log(parquetjs) | ||
</script> | ||
``` | ||
## Usage: Writing files | ||
@@ -62,2 +92,4 @@ | ||
### Native Schema Definition | ||
``` js | ||
@@ -74,2 +106,42 @@ // declare a schema for the `fruits` table | ||
### Helper Functions | ||
```js | ||
var schema = new parquet.ParquetSchema({ | ||
name: parquet.ParquetFieldBuilder.createStringField(), | ||
quantity: parquet.ParquetFieldBuilder.createIntField(64), | ||
price: parquet.ParquetFieldBuilder.createDoubleField(), | ||
date: parquet.ParquetFieldBuilder.createTimestampField(), | ||
in_stock: parquet.ParquetFieldBuilder.createBooleanField() | ||
}); | ||
``` | ||
### JSON Schema | ||
``` js | ||
// declare a schema for the `fruits` JSON Schema | ||
var schema = parquet.ParquetSchema.fromJsonSchema({ | ||
"type": "object", | ||
"properties": { | ||
"name": { | ||
"type": "string" | ||
}, | ||
"quantity": { | ||
"type": "integer" | ||
}, | ||
"price": { | ||
"type": "number" | ||
}, | ||
"date": { | ||
"type": "string" | ||
}, | ||
"in_stock": { | ||
"type": "boolean" | ||
} | ||
}, | ||
"required": ["name", "quantity", "price", "date", "in_stock"] | ||
}); | ||
``` | ||
Note that the Parquet schema supports nesting, so you can store complex, arbitrarily | ||
@@ -81,3 +153,3 @@ nested records into a single row (more on that later) while still maintaining good | ||
take input rows as JSON objects, convert them to the Parquet format and store | ||
them on disk. | ||
them on disk. | ||
@@ -96,3 +168,3 @@ ``` js | ||
### Adding bloom filters | ||
### Adding bloom filters | ||
@@ -216,3 +288,3 @@ Bloom filters can be added to multiple columns as demonstrated below: | ||
Parquet files can be read from an S3 object without having to download the whole file. | ||
You will have to supply the aws-sdk client as first argument and the bucket/key information | ||
You will have to supply the aws-sdk client as first argument and the bucket/key information | ||
as second argument to the function `parquetReader.openS3`. | ||
@@ -301,3 +373,3 @@ | ||
Consider this example, which allows us to store a more advanced "fruits" table | ||
where each row contains a name, a list of colours and a list of "stock" objects. | ||
where each row contains a name, a list of colours and a list of "stock" objects. | ||
@@ -304,0 +376,0 @@ ``` js |
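The README's own nested example is elided from this diff, but a hedged sketch of such a schema, assuming illustrative field names and types, could be:

```js
var parquet = require('@dsnp/parquetjs');

var schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },
  colours: { type: 'UTF8', repeated: true },
  stock: {
    repeated: true,
    fields: {
      quantity: { type: 'INT64' },
      warehouse: { type: 'UTF8' },
    },
  },
});
```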
Sorry, the diffs of 3 files are too big to display
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package

Uses eval
Supply chain risk: Package uses dynamic code execution (e.g., eval()), which is a dangerous practice. This can prevent the code from running in certain environments and increases the risk that the code may contain exploits or malicious behavior.
Found 2 instances in 1 package

Dynamic require
Supply chain risk: Dynamic require can indicate the package is performing dangerous or unsafe dynamic code execution.
Found 1 instance in 1 package

Environment variable access
Supply chain risk: Package accesses environment variables, which may be a sign of credential stuffing or data theft.
Found 2 instances in 1 package

Minified code
Quality: This package contains minified code. This may be harmless in some cases where minified code is included in packaged libraries, however packages on npm should not minify code.
Found 1 instance in 1 package

License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
+ Added @types/long@^4.0.2
+ Added @types/node-int64@^0.4.29
+ Added @types/thrift@^0.10.11
+ Added @types/long@4.0.2 (transitive)
+ Added @types/node-int64@0.4.32 (transitive)
+ Added @types/q@1.5.8 (transitive)
+ Added @types/thrift@0.10.17 (transitive)
+ Added bson@4.6.3 (transitive)
+ Added thrift@0.16.0 (transitive)
+ Added varint@6.0.0 (transitive)
+ Added xxhash-wasm@1.1.0 (transitive)
- Removed @types/varint@^6.0.0
- Removed @types/varint@6.0.3 (transitive)
- Removed bson@4.4.0 (transitive)
- Removed thrift@0.14.1 (transitive)
- Removed varint@5.0.2 (transitive)
- Removed xxhash-wasm@0.4.2 (transitive)
Updated bson@4.6.3
Updated snappyjs@^0.6.1
Updated thrift@0.16.0
Updated varint@^6.0.0
Updated xxhash-wasm@^1.0.2