@dsnp/parquetjs - npm package: compare versions

Comparing version 0.0.0-81e38e to 0.0.0-8d34ac

dist/browser/parquet.cjs.d.ts


dist/lib/bloom/sbbf.js

@@ -7,3 +7,3 @@ "use strict";

const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types"));
const Long = require("long");
const long_1 = __importDefault(require("long"));
const xxhasher_1 = __importDefault(require("./xxhasher"));

@@ -106,4 +106,4 @@ /**

static getBlockIndex(h, z) {
const zLong = Long.fromNumber(z, true);
const hTopBits = Long.fromNumber(h.getHighBitsUnsigned(), true);
const zLong = long_1.default.fromNumber(z, true);
const hTopBits = long_1.default.fromNumber(h.getHighBitsUnsigned(), true);
return hTopBits.mul(zLong).shiftRightUnsigned(32).getLowBitsUnsigned();

@@ -330,3 +330,3 @@ }

const hashed = await this.hasher.hash64(value);
return Long.fromString(hashed, true, 16);
return long_1.default.fromString(hashed, true, 16);
}

@@ -374,2 +374,1 @@ insertHash(hashValue) {

exports.default = SplitBlockBloomFilter;
//# sourceMappingURL=sbbf.js.map
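For context on the Long changes above, here is a minimal standalone sketch (not part of the diff) of what getBlockIndex computes with the long package: the top 32 bits of the 64-bit hash are scaled into the range [0, z) via a 64-bit multiply and an unsigned 32-bit right shift.

const Long = require("long");

// Standalone re-statement of getBlockIndex from sbbf.js, using long's documented API.
function blockIndex(hash, z) {
  const zLong = Long.fromNumber(z, true);
  const hTopBits = Long.fromNumber(hash.getHighBitsUnsigned(), true);
  return hTopBits.mul(zLong).shiftRightUnsigned(32).getLowBitsUnsigned();
}

// A hash whose top 32 bits are 0x80000000 lands in the middle block of 64:
console.log(blockIndex(Long.fromString("8000000000000000", true, 16), 64)); // 32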

@@ -19,4 +19,4 @@ "use strict";

class XxHasher {
static h64 = xxhash_wasm_1.default().then(x => x.h64);
async hashit(value) {
static h64 = (0, xxhash_wasm_1.default)().then(x => x.h64ToString);
async hashIt(value) {
return (await XxHasher.h64)(value);

@@ -26,9 +26,10 @@ }

* @function hash64
* @description creates a hash for certain data types.
* @return the 64 big XXHash as a string
* @param value one of n, throw an error.
* @description creates a hash for certain data types. All data is converted using toString()
* prior to hashing.
* @return the 64 big XXHash as a hex-encoded string.
* @param value, must be of type string, Buffer, Uint8Array, Long, boolean, number, or bigint
*/
async hash64(value) {
if (typeof value === 'string')
return this.hashit(value);
return this.hashIt(value);
if (value instanceof Buffer ||

@@ -40,3 +41,3 @@ value instanceof Uint8Array ||

typeof value === 'bigint') {
return this.hashit(value.toString());
return this.hashIt(value.toString());
}

@@ -47,2 +48,1 @@ throw new Error("unsupported type: " + value);

exports.default = XxHasher;
//# sourceMappingURL=xxhasher.js.map
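A hypothetical usage sketch of the reworked hasher: h64ToString (referenced above) yields a hex digest, which sbbf.js then parses back into a Long. The CJS/ESM interop line mirrors the compiled __importDefault wrapper; the input value is a placeholder.

const Long = require("long");
const xxhashWasm = require("xxhash-wasm");
const init = xxhashWasm.default || xxhashWasm; // interop, as in the compiled output

(async () => {
  const { h64ToString } = await init();
  // hash64 stringifies non-string inputs first, so 42 hashes like "42".
  const hex = h64ToString((42).toString()); // hex-encoded 64-bit digest
  const asLong = Long.fromString(hex, true, 16); // how SplitBlockBloomFilter consumes it
  console.log(hex, asLong.toString(16));
})();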
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {

@@ -7,3 +30,3 @@ return (mod && mod.__esModule) ? mod : { "default": mod };

exports.getBloomFiltersFor = exports.siftAllByteOffsets = exports.parseBloomFilterOffsets = void 0;
const util_1 = __importDefault(require("../util"));
const parquet_util = __importStar(require("../util"));
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types"));

@@ -13,4 +36,3 @@ const sbbf_1 = __importDefault(require("../bloom/sbbf"));

return columnChunkDataCollection.filter((columnChunk) => {
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, }, }, } = columnChunk;
return bloomFilterOffsetBuffer;
return columnChunk.column.meta_data?.bloom_filter_offset;
});

@@ -26,6 +48,6 @@ };

const parseBloomFilterOffsets = (ColumnChunkDataCollection) => {
return ColumnChunkDataCollection.map((columnChunkData) => {
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, path_in_schema: pathInSchema, }, }, rowGroupIndex, } = columnChunkData;
return ColumnChunkDataCollection.map(({ rowGroupIndex, column }) => {
const { bloom_filter_offset: bloomOffset, path_in_schema: pathInSchema, } = column.meta_data || {};
return {
offsetBytes: toInteger(bloomFilterOffsetBuffer),
offsetBytes: toInteger(bloomOffset.buffer),
columnName: pathInSchema.join(","),

@@ -44,6 +66,9 @@ rowGroupIndex,

catch (e) {
throw new Error(e);
if (typeof e === 'string')
throw new Error(e);
else
throw e;
}
const bloomFilterHeader = new parquet_types_1.default.BloomFilterHeader();
const sizeOfBloomFilterHeader = util_1.default.decodeThrift(bloomFilterHeader, bloomFilterHeaderData);
const sizeOfBloomFilterHeader = parquet_util.decodeThrift(bloomFilterHeader, bloomFilterHeaderData);
return {

@@ -63,3 +88,6 @@ bloomFilterHeader,

catch (e) {
throw new Error(e);
if (typeof e === 'string')
throw new Error(e);
else
throw e;
}

@@ -71,8 +99,8 @@ };

const siftAllByteOffsets = (columnChunkDataCollection) => {
return exports.parseBloomFilterOffsets(filterColumnChunksWithBloomFilters(columnChunkDataCollection));
return (0, exports.parseBloomFilterOffsets)(filterColumnChunksWithBloomFilters(columnChunkDataCollection));
};
exports.siftAllByteOffsets = siftAllByteOffsets;
const getBloomFiltersFor = async (columnNames, envelopeReader) => {
const columnChunkDataCollection = envelopeReader.getAllColumnChunkDataFor(columnNames);
const bloomFilterOffsetData = exports.siftAllByteOffsets(columnChunkDataCollection);
const getBloomFiltersFor = async (paths, envelopeReader) => {
const columnChunkDataCollection = envelopeReader.getAllColumnChunkDataFor(paths);
const bloomFilterOffsetData = (0, exports.siftAllByteOffsets)(columnChunkDataCollection);
const offsetByteValues = bloomFilterOffsetData.map(({ offsetBytes }) => offsetBytes);

@@ -90,2 +118,1 @@ const filterBlocksBuffers = await readFilterDataFrom(offsetByteValues, envelopeReader);

exports.getBloomFiltersFor = getBloomFiltersFor;
//# sourceMappingURL=bloomFilterReader.js.map
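A minimal sketch (with fabricated column-chunk shapes, for illustration only) of why the optional chaining above matters: a column chunk without a bloom filter offset no longer throws during destructuring, it is simply filtered out.

const chunks = [
  { column: { meta_data: { bloom_filter_offset: { buffer: Buffer.from([0x10]) } } } },
  { column: { meta_data: {} } }, // no bloom filter on this column
];
// Same predicate as filterColumnChunksWithBloomFilters above.
const withFilters = chunks.filter((c) => c.column.meta_data?.bloom_filter_offset);
console.log(withFilters.length); // 1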
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {

@@ -7,3 +30,3 @@ return (mod && mod.__esModule) ? mod : { "default": mod };

exports.getSerializedBloomFilterData = exports.setFilterOffset = exports.serializeFilterData = exports.serializeFilterHeaders = exports.createSBBF = void 0;
const util_1 = __importDefault(require("../util"));
const parquet_util = __importStar(require("../util"));
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types"));

@@ -37,3 +60,3 @@ const sbbf_1 = __importDefault(require("../bloom/sbbf"));

const bloomFilterHeader = buildFilterHeader(numberOfBytes);
return util_1.default.serializeThrift(bloomFilterHeader);
return parquet_util.serializeThrift(bloomFilterHeader);
};

@@ -43,3 +66,3 @@ exports.serializeFilterHeaders = serializeFilterHeaders;

const serializedFilterBlocks = serializeFilterBlocks(params.filterBlocks);
const serializedFilterHeaders = exports.serializeFilterHeaders(params.filterByteSize);
const serializedFilterHeaders = (0, exports.serializeFilterHeaders)(params.filterByteSize);
return Buffer.concat([serializedFilterHeaders, serializedFilterBlocks]);

@@ -49,3 +72,3 @@ };

const setFilterOffset = (column, offset) => {
column.meta_data.bloom_filter_offset = offset;
column.meta_data.bloom_filter_offset = parquet_util.cloneInteger(offset);
};

@@ -56,5 +79,4 @@ exports.setFilterOffset = setFilterOffset;

const filterByteSize = splitBlockBloomFilter.getNumFilterBytes();
return exports.serializeFilterData({ filterBlocks, filterByteSize });
return (0, exports.serializeFilterData)({ filterBlocks, filterByteSize });
};
exports.getSerializedBloomFilterData = getSerializedBloomFilterData;
//# sourceMappingURL=bloomFilterWriter.js.map

@@ -28,3 +28,3 @@ 'use strict';

function deflate_identity(value) {
return value;
return buffer_from_result(value);
}

@@ -35,3 +35,4 @@ function deflate_gzip(value) {

function deflate_snappy(value) {
return snappy.compress(value);
const compressedValue = snappy.compress(value);
return buffer_from_result(compressedValue);
}

@@ -48,3 +49,3 @@ /**

function inflate_identity(value) {
return value;
return buffer_from_result(value);
}

@@ -55,7 +56,15 @@ function inflate_gzip(value) {

function inflate_snappy(value) {
return snappy.uncompress(value);
const uncompressedValue = snappy.uncompress(value);
return buffer_from_result(uncompressedValue);
}
function buffer_from_result(result) {
if (Buffer.isBuffer(result)) {
return result;
}
else {
return Buffer.from(result);
}
}
exports.PARQUET_COMPRESSION_METHODS = PARQUET_COMPRESSION_METHODS;
exports.deflate = deflate;
exports.inflate = inflate;
//# sourceMappingURL=compression.js.map
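The buffer_from_result helper added above exists because snappyjs may return a Uint8Array while callers expect Buffer methods. A tiny standalone illustration of the same normalization:

const raw = new Uint8Array([1, 2, 3]); // stand-in for a snappyjs result
const normalized = Buffer.isBuffer(raw) ? raw : Buffer.from(raw);
console.log(Buffer.isBuffer(normalized)); // true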
"use strict";
module.exports = class BufferReader {
Object.defineProperty(exports, "__esModule", { value: true });
class BufferReader {
maxSpan;
maxLength;
queueWait;
scheduled;
queue;
envelopeReader;
constructor(envelopeReader, options) {

@@ -12,3 +19,3 @@ options = options || {};

}
async read(offset, length) {
read(offset, length) {
if (!this.scheduled) {

@@ -59,5 +66,6 @@ this.scheduled = true;

});
readSubqueue(subqueue);
readSubqueue();
}
};
//# sourceMappingURL=bufferReader.js.map
}
exports.default = BufferReader;
;
"use strict";
module.exports.PLAIN = require('./plain');
module.exports.RLE = require('./rle');
module.exports.PLAIN_DICTIONARY = require('./plain_dictionary');
//# sourceMappingURL=index.js.map
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.RLE_DICTIONARY = exports.PLAIN_DICTIONARY = exports.RLE = exports.PLAIN = void 0;
exports.PLAIN = __importStar(require("./plain"));
exports.RLE = __importStar(require("./rle"));
exports.PLAIN_DICTIONARY = __importStar(require("./plain_dictionary"));
exports.RLE_DICTIONARY = __importStar(require("./plain_dictionary"));
"use strict";
const rle = require('./rle');
exports.decodeValues = function (type, cursor, count, opts) {
opts.bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0);
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.decodeValues = void 0;
const rle = __importStar(require("./rle"));
const decodeValues = function (type, cursor, count, opts) {
const bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0);
cursor.offset += 1;
return rle.decodeValues(type, cursor, count, Object.assign({}, opts, { disableEnvelope: true }));
return rle.decodeValues(type, cursor, count, Object.assign({}, opts, { disableEnvelope: true, bitWidth }));
};
//# sourceMappingURL=plain_dictionary.js.map
exports.decodeValues = decodeValues;
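A small sketch (using the cursor shape seen throughout these codecs) of the change above: the first byte of a PLAIN_DICTIONARY page is the bit width, which is now threaded through to rle.decodeValues as opts.bitWidth instead of being mutated onto the shared opts object.

const cursor = { buffer: Buffer.from([0x01, 0x03, 0x06]), offset: 0 };
const bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0); // 1
cursor.offset += 1;
// The remaining bytes ([0x03, 0x06]) are an RLE/bit-packed run of dictionary
// indices, decoded via rle.decodeValues(type, cursor, count,
//   { ...opts, disableEnvelope: true, bitWidth }).
console.log(bitWidth); // 1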

@@ -1,3 +0,8 @@

'use strict';
const INT53 = require('int53');
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.decodeValues = exports.encodeValues = void 0;
const int53_1 = __importDefault(require("int53"));
function encodeValues_BOOLEAN(values) {

@@ -8,3 +13,3 @@ let buf = Buffer.alloc(Math.ceil(values.length / 8));

if (values[i]) {
buf[Math.floor(i / 8)] |= (1 << (i % 8));
buf[Math.floor(i / 8)] |= 1 << i % 8;
}

@@ -18,3 +23,3 @@ }

let b = cursor.buffer[cursor.offset + Math.floor(i / 8)];
values.push((b & (1 << (i % 8))) > 0);
values.push((b & (1 << i % 8)) > 0);
}

@@ -24,29 +29,96 @@ cursor.offset += Math.ceil(count / 8);

}
function encodeValues_INT32(values) {
function encodeValues_INT32(values, opts) {
const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL';
const scale = opts?.scale || 0;
let buf = Buffer.alloc(4 * values.length);
for (let i = 0; i < values.length; i++) {
buf.writeInt32LE(values[i], i * 4);
if (isDecimal) {
buf.writeInt32LE(values[i] * Math.pow(10, scale), i * 4);
}
else {
buf.writeInt32LE(values[i], i * 4);
}
}
return buf;
}
function decodeValues_INT32(cursor, count) {
function decodeValues_INT32(cursor, count, opts) {
let values = [];
for (let i = 0; i < count; ++i) {
values.push(cursor.buffer.readInt32LE(cursor.offset));
cursor.offset += 4;
const name = opts.name || opts.column?.name || undefined;
try {
if (opts.originalType === 'DECIMAL') {
values = decodeValues_DECIMAL(cursor, count, opts);
}
else {
for (let i = 0; i < count; ++i) {
values.push(cursor.buffer.readInt32LE(cursor.offset));
cursor.offset += 4;
}
}
}
catch (e) {
console.log(`Error thrown for column: ${name}`);
throw e;
}
return values;
}
function encodeValues_INT64(values) {
function encodeValues_INT64(values, opts) {
const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL';
const scale = opts?.scale || 0;
let buf = Buffer.alloc(8 * values.length);
for (let i = 0; i < values.length; i++) {
buf.writeBigInt64LE(BigInt(values[i]), i * 8);
if (isDecimal) {
buf.writeBigInt64LE(BigInt(Math.floor(values[i] * Math.pow(10, scale))), i * 8);
}
else {
buf.writeBigInt64LE(BigInt(values[i]), i * 8);
}
}
return buf;
}
function decodeValues_INT64(cursor, count) {
function decodeValues_INT64(cursor, count, opts) {
let values = [];
const name = opts.name || opts.column?.name || undefined;
try {
if (opts.originalType === 'DECIMAL' || opts.column?.originalType === 'DECIMAL') {
let columnOptions = opts.column?.originalType ? opts.column : opts;
values = decodeValues_DECIMAL(cursor, count, columnOptions);
}
else {
for (let i = 0; i < count; ++i) {
values.push(cursor.buffer.readBigInt64LE(cursor.offset));
cursor.offset += 8;
}
}
}
catch (e) {
console.log(`Error thrown for column: ${name}`);
throw e;
}
return values;
}
function decodeValues_DECIMAL(cursor, count, opts) {
const precision = opts.precision;
// Default scale to 0 per spec
const scale = opts.scale || 0;
const name = opts.name || undefined;
if (!precision) {
throw `missing option: precision (required for DECIMAL) for column: ${name}`;
}
let values = [];
// by default we prepare the offset and bufferFunction to work with 32bit integers
let offset = 4;
let bufferFunction = (offset) => cursor.buffer.readInt32LE(offset);
if (precision > 9) {
// if the precision is over 9 digits, then we are dealing with a 64bit integer
offset = 8;
bufferFunction = (offset) => cursor.buffer.readBigInt64LE(offset);
}
for (let i = 0; i < count; ++i) {
values.push(cursor.buffer.readBigInt64LE(cursor.offset));
cursor.offset += 8;
const bufferSize = cursor.size || 0;
if (bufferSize === 0 || cursor.offset < bufferSize) {
const fullValue = bufferFunction(cursor.offset);
const valueWithDecimalApplied = Number(fullValue) / Math.pow(10, scale);
values.push(valueWithDecimalApplied);
cursor.offset += offset;
}
}

@@ -59,7 +131,7 @@ return values;

if (values[i] >= 0) {
INT53.writeInt64LE(values[i], buf, i * 12);
int53_1.default.writeInt64LE(values[i], buf, i * 12);
buf.writeUInt32LE(0, i * 12 + 8); // truncate to 64 actual precision
}
else {
INT53.writeInt64LE((~-values[i]) + 1, buf, i * 12);
int53_1.default.writeInt64LE(~-values[i] + 1, buf, i * 12);
buf.writeUInt32LE(0xffffffff, i * 12 + 8); // truncate to 64 actual precision

@@ -73,6 +145,6 @@ }

for (let i = 0; i < count; ++i) {
const low = INT53.readInt64LE(cursor.buffer, cursor.offset);
const low = int53_1.default.readInt64LE(cursor.buffer, cursor.offset);
const high = cursor.buffer.readUInt32LE(cursor.offset + 8);
if (high === 0xffffffff) {
values.push((~-low) + 1); // truncate to 64 actual precision
values.push(~-low + 1); // truncate to 64 actual precision
}

@@ -116,14 +188,16 @@ else {

}
// Waylands reminder to check again
function encodeValues_BYTE_ARRAY(values) {
let buf_len = 0;
const returnedValues = [];
for (let i = 0; i < values.length; i++) {
values[i] = Buffer.from(values[i]);
buf_len += 4 + values[i].length;
returnedValues[i] = Buffer.from(values[i]);
buf_len += 4 + returnedValues[i].length;
}
let buf = Buffer.alloc(buf_len);
let buf_pos = 0;
for (let i = 0; i < values.length; i++) {
buf.writeUInt32LE(values[i].length, buf_pos);
values[i].copy(buf, buf_pos + 4);
buf_pos += 4 + values[i].length;
for (let i = 0; i < returnedValues.length; i++) {
buf.writeUInt32LE(returnedValues[i].length, buf_pos);
returnedValues[i].copy(buf, buf_pos + 4);
buf_pos += 4 + returnedValues[i].length;
}

@@ -146,10 +220,10 @@ return buf;

}
let buf_len = 0;
const returnedValues = [];
for (let i = 0; i < values.length; i++) {
values[i] = Buffer.from(values[i]);
if (values[i].length !== opts.typeLength) {
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + values[i];
returnedValues[i] = Buffer.from(values[i]);
if (returnedValues[i].length !== opts.typeLength) {
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + returnedValues[i];
}
}
return Buffer.concat(values);
return Buffer.concat(returnedValues);
}

@@ -167,46 +241,47 @@ function decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts) {

}
exports.encodeValues = function (type, values, opts) {
const encodeValues = function (type, values, opts) {
switch (type) {
case 'BOOLEAN':
case "BOOLEAN":
return encodeValues_BOOLEAN(values);
case 'INT32':
return encodeValues_INT32(values);
case 'INT64':
return encodeValues_INT64(values);
case 'INT96':
case "INT32":
return encodeValues_INT32(values, opts);
case "INT64":
return encodeValues_INT64(values, opts);
case "INT96":
return encodeValues_INT96(values);
case 'FLOAT':
case "FLOAT":
return encodeValues_FLOAT(values);
case 'DOUBLE':
case "DOUBLE":
return encodeValues_DOUBLE(values);
case 'BYTE_ARRAY':
case "BYTE_ARRAY":
return encodeValues_BYTE_ARRAY(values);
case 'FIXED_LEN_BYTE_ARRAY':
case "FIXED_LEN_BYTE_ARRAY":
return encodeValues_FIXED_LEN_BYTE_ARRAY(values, opts);
default:
throw 'unsupported type: ' + type;
throw "unsupported type: " + type;
}
};
exports.decodeValues = function (type, cursor, count, opts) {
exports.encodeValues = encodeValues;
const decodeValues = function (type, cursor, count, opts) {
switch (type) {
case 'BOOLEAN':
case "BOOLEAN":
return decodeValues_BOOLEAN(cursor, count);
case 'INT32':
return decodeValues_INT32(cursor, count);
case 'INT64':
return decodeValues_INT64(cursor, count);
case 'INT96':
case "INT32":
return decodeValues_INT32(cursor, count, opts);
case "INT64":
return decodeValues_INT64(cursor, count, opts);
case "INT96":
return decodeValues_INT96(cursor, count);
case 'FLOAT':
case "FLOAT":
return decodeValues_FLOAT(cursor, count);
case 'DOUBLE':
case "DOUBLE":
return decodeValues_DOUBLE(cursor, count);
case 'BYTE_ARRAY':
case "BYTE_ARRAY":
return decodeValues_BYTE_ARRAY(cursor, count);
case 'FIXED_LEN_BYTE_ARRAY':
case "FIXED_LEN_BYTE_ARRAY":
return decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts);
default:
throw 'unsupported type: ' + type;
throw "unsupported type: " + type;
}
};
//# sourceMappingURL=plain.js.map
exports.decodeValues = decodeValues;
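A worked standalone example (not from the package) of the DECIMAL path added above: for precision of 9 or less the raw value is read as a little-endian INT32 and divided by 10^scale, so a stored 12345 with scale 2 materializes as 123.45.

const cursor = { buffer: Buffer.alloc(4), offset: 0, size: 4 };
cursor.buffer.writeInt32LE(12345, 0);
const scale = 2; // scale defaults to 0 per the Parquet spec when omitted
const raw = cursor.buffer.readInt32LE(cursor.offset); // 12345
console.log(Number(raw) / Math.pow(10, scale)); // 123.45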
"use strict";
const varint = require('varint');
// For questions about RLE encoding, see the spec:
//
// https://github.com/apache/parquet-format/blob/master/Encodings.md
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.decodeValues = exports.encodeValues = void 0;
const varint_1 = __importDefault(require("varint"));
function encodeRunBitpacked(values, opts) {

@@ -14,3 +22,3 @@ for (let i = 0; i < values.length % 8; i++) {

return Buffer.concat([
Buffer.from(varint.encode(((values.length / 8) << 1) | 1)),
Buffer.from(varint_1.default.encode(((values.length / 8) << 1) | 1)),
buf

@@ -21,12 +29,23 @@ ]);

let buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8));
let remainingValue = value;
// This is encoded LSB to MSB, so we pick off the least
// significant byte and shift to get the next one.
for (let i = 0; i < buf.length; ++i) {
buf.writeUInt8(value & 0xff, i);
value >> 8;
buf.writeUInt8(remainingValue & 0xff, i);
remainingValue = remainingValue >> 8;
}
return Buffer.concat([
Buffer.from(varint.encode(count << 1)),
Buffer.from(varint_1.default.encode(count << 1)),
buf
]);
}
exports.encodeValues = function (type, values, opts) {
function unknownToParsedInt(value) {
if (typeof value === 'string') {
return parseInt(value, 10);
}
else {
return value;
}
}
const encodeValues = function (type, values, opts) {
if (!('bitWidth' in opts)) {

@@ -39,3 +58,3 @@ throw 'bitWidth is required';

case 'INT64':
values = values.map((x) => parseInt(x, 10));
values = values.map((x) => unknownToParsedInt(x));
break;

@@ -85,2 +104,3 @@ default:

};
exports.encodeValues = encodeValues;
function decodeRunBitpacked(cursor, count, opts) {

@@ -100,6 +120,9 @@ if (count % 8 !== 0) {

function decodeRunRepeated(cursor, count, opts) {
var bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8);
let value = 0;
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); ++i) {
value << 8;
value += cursor.buffer[cursor.offset];
for (let i = 0; i < bytesNeededForFixedBitWidth; ++i) {
const byte = cursor.buffer[cursor.offset];
// Bytes are stored LSB to MSB, so we need to shift
// each new byte appropriately.
value += byte << (i * 8);
cursor.offset += 1;

@@ -109,3 +132,3 @@ }

}
exports.decodeValues = function (type, cursor, count, opts) {
const decodeValues = function (_, cursor, count, opts) {
if (!('bitWidth' in opts)) {

@@ -120,4 +143,4 @@ throw 'bitWidth is required';

while (values.length < count) {
const header = varint.decode(cursor.buffer, cursor.offset);
cursor.offset += varint.encodingLength(header);
const header = varint_1.default.decode(cursor.buffer, cursor.offset);
cursor.offset += varint_1.default.encodingLength(header);
if (header & 1) {

@@ -139,2 +162,2 @@ res = decodeRunBitpacked(cursor, (header >> 1) * 8, opts);

};
//# sourceMappingURL=rle.js.map
exports.decodeValues = decodeValues;
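A worked example of the repeated-run fix above: the repeated value is stored as ceil(bitWidth / 8) little-endian bytes, so each byte must be shifted into place (the old loop discarded the shift and effectively just summed the bytes).

const bytes = Buffer.from([0x01, 0x02]); // 2 bytes, e.g. for a bitWidth of 9..16
let value = 0;
for (let i = 0; i < bytes.length; ++i) {
  value += bytes[i] << (i * 8); // LSB first: 0x01 + (0x02 << 8)
}
console.log(value); // 513; the old loop would have yielded 3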

@@ -1,8 +0,12 @@

'use strict';
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const zlib = require('zlib');
const snappy = require('snappyjs');
exports.inflate = exports.deflate = exports.PARQUET_COMPRESSION_METHODS = void 0;
const zlib_1 = __importDefault(require("zlib"));
const snappyjs_1 = __importDefault(require("snappyjs"));
const wasm_brotli_1 = require("wasm-brotli");
// LZO compression is disabled. See: https://github.com/LibertyDSNP/parquetjs/issues/18
const PARQUET_COMPRESSION_METHODS = {
exports.PARQUET_COMPRESSION_METHODS = {
'UNCOMPRESSED': {

@@ -29,22 +33,25 @@ deflate: deflate_identity,

async function deflate(method, value) {
if (!(method in PARQUET_COMPRESSION_METHODS)) {
if (!(method in exports.PARQUET_COMPRESSION_METHODS)) {
throw 'invalid compression method: ' + method;
}
return PARQUET_COMPRESSION_METHODS[method].deflate(value);
return exports.PARQUET_COMPRESSION_METHODS[method].deflate(value);
}
exports.deflate = deflate;
function deflate_identity(value) {
return value;
return buffer_from_result(value);
}
function deflate_gzip(value) {
return zlib.gzipSync(value);
return zlib_1.default.gzipSync(value);
}
function deflate_snappy(value) {
return snappy.compress(value);
const compressedValue = snappyjs_1.default.compress(value);
return buffer_from_result(compressedValue);
}
async function deflate_brotli(value) {
const compressedContent = await wasm_brotli_1.compress(value, {
mode: 0,
quality: 8,
lgwin: 22
});
const compressedContent = await (0, wasm_brotli_1.compress)(value /*, {
mode: 0,
quality: 8,
lgwin: 22
}
*/);
return Buffer.from(compressedContent);

@@ -56,21 +63,29 @@ }

async function inflate(method, value) {
if (!(method in PARQUET_COMPRESSION_METHODS)) {
if (!(method in exports.PARQUET_COMPRESSION_METHODS)) {
throw 'invalid compression method: ' + method;
}
return await PARQUET_COMPRESSION_METHODS[method].inflate(value);
return await exports.PARQUET_COMPRESSION_METHODS[method].inflate(value);
}
function inflate_identity(value) {
return value;
exports.inflate = inflate;
async function inflate_identity(value) {
return buffer_from_result(value);
}
function inflate_gzip(value) {
return zlib.gunzipSync(value);
async function inflate_gzip(value) {
return zlib_1.default.gunzipSync(value);
}
function inflate_snappy(value) {
return snappy.uncompress(value);
const uncompressedValue = snappyjs_1.default.uncompress(value);
return buffer_from_result(uncompressedValue);
}
async function inflate_brotli(value) {
const uncompressedContent = await wasm_brotli_1.decompress(value);
const uncompressedContent = await (0, wasm_brotli_1.decompress)(value);
return Buffer.from(uncompressedContent);
}
module.exports = { PARQUET_COMPRESSION_METHODS, deflate, inflate };
//# sourceMappingURL=compression.js.map
function buffer_from_result(result) {
if (Buffer.isBuffer(result)) {
return result;
}
else {
return Buffer.from(result);
}
}

@@ -1,16 +0,42 @@

'use strict';
const fs = require('fs');
const thrift = require('thrift');
const Int64 = require('node-int64');
const parquet_thrift = require('../gen-nodejs/parquet_types');
const parquet_shredder = require('./shred');
const parquet_util = require('./util');
const parquet_schema = require('./schema');
const parquet_codec = require('./codec');
const parquet_compression = require('./compression');
const parquet_types = require('./types');
const BufferReader = require('./bufferReader');
const bloomFilterReader = require('./bloomFilterIO/bloomFilterReader');
const groupBy = require("lodash/groupBy");
const fetch = require('cross-fetch');
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetEnvelopeReader = exports.ParquetReader = void 0;
const node_int64_1 = __importDefault(require("node-int64"));
const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types"));
const parquet_shredder = __importStar(require("./shred"));
const parquet_util = __importStar(require("./util"));
const parquet_schema = __importStar(require("./schema"));
const parquet_codec = __importStar(require("./codec"));
const parquet_compression = __importStar(require("./compression"));
const parquet_types = __importStar(require("./types"));
const bufferReader_1 = __importDefault(require("./bufferReader"));
const bloomFilterReader = __importStar(require("./bloomFilterIO/bloomFilterReader"));
const cross_fetch_1 = __importDefault(require("cross-fetch"));
const declare_1 = require("./declare");
const { getBloomFiltersFor, } = bloomFilterReader;

@@ -22,5 +48,5 @@ /**

/**
* Parquet File Format Version
* Supported Parquet File Format Version for reading
*/
const PARQUET_VERSION = 1;
const PARQUET_VERSIONS = [1, 2];
/**

@@ -35,2 +61,9 @@ * Internal type used for repetition/definition levels

class ParquetCursor {
metadata;
envelopeReader;
schema;
columnList;
rowGroup;
rowGroupIndex;
cursorIndex;
/**

@@ -49,2 +82,3 @@ * Create a new parquet reader from the file metadata and an envelope reader.

this.rowGroupIndex = 0;
this.cursorIndex = 0;
}

@@ -56,3 +90,3 @@ /**

async next() {
if (this.rowGroup.length === 0) {
if (this.cursorIndex >= this.rowGroup.length) {
if (this.rowGroupIndex >= this.metadata.row_groups.length) {

@@ -64,7 +98,8 @@ return null;

this.rowGroupIndex++;
this.cursorIndex = 0;
}
return this.rowGroup.shift();
return this.rowGroup[this.cursorIndex++];
}
/**
* Rewind the cursor the the beginning of the file
* Rewind the cursor to the beginning of the file
*/

@@ -74,2 +109,3 @@ rewind() {

this.rowGroupIndex = 0;
this.cursorIndex = 0;
}

@@ -86,2 +122,5 @@ }

class ParquetReader {
envelopeReader;
metadata;
schema;
/**

@@ -119,3 +158,3 @@ * Open the parquet file pointed to by the specified path and return a new

static async openEnvelopeReader(envelopeReader, opts) {
if (opts && opts.metadata) {
if (opts?.metadata) {
return new ParquetReader(opts.metadata, envelopeReader, opts);

@@ -141,3 +180,3 @@ }

opts = opts || {};
if (metadata.version != PARQUET_VERSION) {
if (!PARQUET_VERSIONS.includes(metadata.version)) {
throw 'invalid parquet version';

@@ -154,3 +193,3 @@ }

else if (o.parquetType === 'INT64') {
return new Int64(Buffer.from(o.value));
return new node_int64_1.default(Buffer.from(o.value));
}

@@ -164,7 +203,7 @@ }

if (column.offsetIndex) {
column.offsetIndex.page_locations.forEach(d => {
Promise.resolve(column.offsetIndex).then(offset => (offset.page_locations.forEach(d => {
if (Array.isArray(d)) {
Object.setPrototypeOf(d, parquet_thrift.PageLocation.prototype);
Object.setPrototypeOf(d, parquet_types_1.default.PageLocation.prototype);
}
});
})));
}

@@ -193,2 +232,19 @@ });

/**
* Support `for await` iterators on the reader object
* Uses `ParquetCursor` still under the hood.
*
* ```js
* for await (const record of reader) {
* console.log(record);
* }
* ```
*/
async *[Symbol.asyncIterator]() {
const cursor = this.getCursor();
let record = null;
while (record = await cursor.next()) {
yield record;
}
}
/**
* Return a cursor to the file. You may open more than one cursor and use

@@ -211,7 +267,13 @@ * them concurrently. All cursors become invalid once close() is called on

const bloomFilterData = await getBloomFiltersFor(columnNames, this.envelopeReader);
return groupBy(bloomFilterData, 'columnName');
return bloomFilterData.reduce((acc, value) => {
if (acc[value.columnName])
acc[value.columnName].push(value);
else
acc[value.columnName] = [value];
return acc;
}, {});
}
/**
* Return the number of rows in this file. Note that the number of rows is
* not neccessarily equal to the number of rows in each column.
* not necessarily equal to the number of rows in each column.
*/

@@ -237,6 +299,6 @@ getRowCount() {

}
exportMetadata(indent) {
function replacer(key, value) {
if (value instanceof parquet_thrift.PageLocation) {
return [value[0], value[1], value[2]];
async exportMetadata(indent) {
function replacer(_key, value) {
if (value instanceof parquet_types_1.default.PageLocation) {
return [value.offset, value.compressed_page_size, value.first_row_index];
}

@@ -256,4 +318,4 @@ if (typeof value === 'object') {

}
if (value instanceof Int64) {
if (isFinite(value)) {
if (value instanceof node_int64_1.default) {
if (isFinite(Number(value))) {
return Number(value);

@@ -273,2 +335,14 @@ }

const metadata = Object.assign({}, this.metadata, { json: true });
for (let i = 0; i < metadata.row_groups.length; i++) {
const rowGroup = metadata.row_groups[i];
for (let j = 0; j < rowGroup.columns.length; j++) {
const column = rowGroup.columns[j];
if (column.offsetIndex instanceof Promise) {
column.offsetIndex = await column.offsetIndex;
}
if (column.columnIndex instanceof Promise) {
column.columnIndex = await column.columnIndex;
}
}
}
return JSON.stringify(metadata, replacer, indent);

@@ -289,2 +363,3 @@ }

}
exports.ParquetReader = ParquetReader;
/**

@@ -298,2 +373,9 @@ * The parquet envelope reader allows direct, unbuffered access to the individual

class ParquetEnvelopeReader {
readFn;
close;
id;
fileSize;
default_dictionary_size;
metadata;
schema;
static async openFile(filePath, options) {

@@ -322,3 +404,3 @@ let fileStat = await parquet_util.fstat(filePath);

static async openS3(client, params, options) {
let fileStat = async () => client.headObject(params).promise().then(d => d.ContentLength);
let fileStat = async () => client.headObject(params).promise().then((d) => d.ContentLength);
let readFn = async (offset, length, file) => {

@@ -335,12 +417,17 @@ if (file) {

}
static async openUrl(params, options) {
if (typeof params === 'string')
params = { url: params };
static async openUrl(url, options) {
let params;
if (typeof url === 'string')
params = { url };
else if (url instanceof URL)
params = { url: url.toString() };
else
params = url;
if (!params.url)
throw new Error('URL missing');
let base = params.url.split('/');
base = base.slice(0, base.length - 1).join('/') + '/';
const baseArr = params.url.split('/');
const base = baseArr.slice(0, baseArr.length - 1).join('/') + '/';
let defaultHeaders = params.headers || {};
let filesize = async () => {
const { headers } = await fetch(params.url);
const { headers } = await (0, cross_fetch_1.default)(params.url);
return headers.get('Content-Length');

@@ -352,3 +439,3 @@ };

let headers = Object.assign({}, defaultHeaders, { range });
const response = await fetch(url, { headers });
const response = await (0, cross_fetch_1.default)(url, { headers });
const arrayBuffer = await response.arrayBuffer();

@@ -361,3 +448,3 @@ const buffer = Buffer.from(arrayBuffer);

}
constructor(readFn, closeFn, fileSize, options) {
constructor(readFn, closeFn, fileSize, options, metadata) {
options = options || {};

@@ -369,4 +456,5 @@ this.readFn = readFn;

this.default_dictionary_size = options.default_dictionary_size || 10000000;
this.metadata = metadata;
if (options.maxLength || options.maxSpan || options.queueWait) {
const bufferReader = new BufferReader(this, options);
const bufferReader = new bufferReader_1.default(this, options);
this.read = (offset, length) => bufferReader.read(offset, length);

@@ -379,3 +467,3 @@ }

readHeader() {
return this.read(0, PARQUET_MAGIC.length).then(buf => {
return this.read(0, PARQUET_MAGIC.length).then((buf) => {
if (buf.toString() != PARQUET_MAGIC) {

@@ -389,10 +477,14 @@ throw 'not valid parquet file';

let column;
if (!isNaN(row_group)) {
row_group = this.metadata.row_groups[row_group];
let parsedRowGroup;
if (!isNaN(Number(row_group))) {
parsedRowGroup = this.metadata?.row_groups[Number(row_group)];
}
else if (row_group instanceof parquet_types_1.default.RowGroup) {
parsedRowGroup = row_group;
}
if (typeof path === 'string') {
if (!row_group) {
if (!parsedRowGroup) {
throw `Missing RowGroup ${row_group}`;
}
column = row_group.columns.find(d => d.meta_data.path_in_schema.join(',') === path);
column = parsedRowGroup.columns.find(d => d.meta_data.path_in_schema.join(',') === path);
if (!column) {

@@ -424,12 +516,9 @@ throw `Column ${path} Not Found`;

}
const data = this.read(+column.offset_index_offset, column.offset_index_length).then(data => {
let offset_index = new parquet_thrift.OffsetIndex();
const data = this.read(+column.offset_index_offset, column.offset_index_length).then((data) => {
let offset_index = new parquet_types_1.default.OffsetIndex();
parquet_util.decodeThrift(offset_index, data);
Object.defineProperty(offset_index, 'column', { value: column, enumerable: false });
if (opts && opts.cache) {
column.offsetIndex = offset_index;
}
return offset_index;
});
if (opts && opts.cache) {
if (opts?.cache) {
column.offsetIndex = data;

@@ -447,5 +536,5 @@ }

}
const data = this.read(+column.column_index_offset, column.column_index_length).then(data => {
let column_index = new parquet_thrift.ColumnIndex();
parquet_util.decodeThrift(column_index, data);
const data = this.read(+column.column_index_offset, column.column_index_length).then((buf) => {
let column_index = new parquet_types_1.default.ColumnIndex();
parquet_util.decodeThrift(column_index, buf);
Object.defineProperty(column_index, 'column', { value: column });

@@ -460,8 +549,5 @@ // decode the statistics values

}
if (opts && opts.cache) {
column.columnIndex = column_index;
}
return column_index;
});
if (opts && opts.cache) {
if (opts?.cache) {
column.columnIndex = data;

@@ -474,13 +560,13 @@ }

column.meta_data = Object.assign({}, column.meta_data);
if (page.offset !== undefined) {
if (isNaN(page.offset) || isNaN(page.compressed_page_size)) {
if (page instanceof parquet_types_1.default.PageLocation && page.offset !== undefined) {
if (isNaN(Number(page.offset)) || isNaN(page.compressed_page_size)) {
throw Error('page offset and/or size missing');
}
column.meta_data.data_page_offset = page.offset;
column.meta_data.total_compressed_size = page.compressed_page_size;
column.meta_data.data_page_offset = parquet_util.cloneInteger(page.offset);
column.meta_data.total_compressed_size = new node_int64_1.default(page.compressed_page_size);
}
else {
const offsetIndex = column.offsetIndex || await this.readOffsetIndex(column, null, opts);
column.meta_data.data_page_offset = offsetIndex.page_locations[page].offset;
column.meta_data.total_compressed_size = offsetIndex.page_locations[page].compressed_page_size;
const offsetIndex = await this.readOffsetIndex(column, null, opts);
column.meta_data.data_page_offset = parquet_util.cloneInteger(offsetIndex.page_locations[page].offset);
column.meta_data.total_compressed_size = new node_int64_1.default(offsetIndex.page_locations[page].compressed_page_size);
}

@@ -497,3 +583,5 @@ const chunk = await this.readColumnChunk(this.schema, column);

rowCount: +rowGroup.num_rows,
columnData: {}
columnData: {},
pageRowCount: 0,
pages: {}
};

@@ -506,3 +594,3 @@ for (let colChunk of rowGroup.columns) {

}
buffer.columnData[colKey] = await this.readColumnChunk(schema, colChunk);
buffer.columnData[colKey.join(',')] = await this.readColumnChunk(schema, colChunk);
}

@@ -512,9 +600,10 @@ return buffer;

async readColumnChunk(schema, colChunk, opts) {
let field = schema.findField(colChunk.meta_data.path_in_schema);
let type = parquet_util.getThriftEnum(parquet_thrift.Type, colChunk.meta_data.type);
let compression = parquet_util.getThriftEnum(parquet_thrift.CompressionCodec, colChunk.meta_data.codec);
let pagesOffset = +colChunk.meta_data.data_page_offset;
let pagesSize = +colChunk.meta_data.total_compressed_size;
let metadata = colChunk.meta_data;
let field = schema.findField(metadata.path_in_schema);
let type = parquet_util.getThriftEnum(parquet_types_1.default.Type, metadata.type);
let compression = parquet_util.getThriftEnum(parquet_types_1.default.CompressionCodec, metadata.codec);
let pagesOffset = +metadata.data_page_offset;
let pagesSize = +metadata.total_compressed_size;
if (!colChunk.file_path) {
pagesSize = Math.min(this.fileSize - pagesOffset, +colChunk.meta_data.total_compressed_size);
pagesSize = Math.min(this.fileSize - pagesOffset, +metadata.total_compressed_size);
}

@@ -527,6 +616,7 @@ opts = Object.assign({}, opts, {

column: field,
num_values: colChunk.meta_data.num_values
num_values: metadata.num_values
});
if (colChunk.meta_data.dictionary_page_offset) {
const offset = +colChunk.meta_data.dictionary_page_offset;
// If this exists and is greater than zero then we need to have an offset
if (metadata.dictionary_page_offset && +metadata.dictionary_page_offset > 0) {
const offset = +metadata.dictionary_page_offset;
const size = Math.min(+this.fileSize - offset, this.default_dictionary_size);

@@ -539,3 +629,3 @@ await this.read(offset, size, colChunk.file_path).then(async (buffer) => {

}
return this.read(pagesOffset, pagesSize, colChunk.file_path).then(pagesBuf => decodePages(pagesBuf, opts));
return this.read(pagesOffset, pagesSize, colChunk.file_path).then((pagesBuf) => decodePages(pagesBuf, opts));
}

@@ -557,3 +647,3 @@ async readFooter() {

let metadataBuf = await this.read(metadataOffset, metadataSize);
let metadata = new parquet_thrift.FileMetaData();
let metadata = new parquet_types_1.default.FileMetaData();
parquet_util.decodeThrift(metadata, metadataBuf);

@@ -563,2 +653,3 @@ return metadata;

}
exports.ParquetEnvelopeReader = ParquetEnvelopeReader;
/**

@@ -583,3 +674,3 @@ * Decode a consecutive array of data using one of the parquet encodings

if (column.originalType) {
value = parquet_types.fromPrimitive(column.originalType, value);
value = parquet_types.fromPrimitive(column.originalType, value, column);
}

@@ -605,7 +696,7 @@ return value;

let page;
const pageHeader = new parquet_thrift.PageHeader();
const pageHeader = new declare_1.NewPageHeader();
const headerOffset = cursor.offset;
const headerSize = parquet_util.decodeThrift(pageHeader, cursor.buffer.slice(cursor.offset));
cursor.offset += headerSize;
const pageType = parquet_util.getThriftEnum(parquet_thrift.PageType, pageHeader.type);
const pageType = parquet_util.getThriftEnum(parquet_types_1.default.PageType, pageHeader.type);
switch (pageType) {

@@ -658,6 +749,10 @@ case 'DATA_PAGE':

}
if (opts.dictionary) {
// It's possible to have a column chunk where some pages should use
// the dictionary (PLAIN_DICTIONARY for example) and others should
// not (PLAIN for example).
if (opts.dictionary && pageData.useDictionary) {
pageData.values = pageData.values.map(d => opts.dictionary[d]);
}
for (let i = 0; i < pageData.rlevels.length; i++) {
let length = pageData.rlevels != undefined ? pageData.rlevels.length : 0;
for (let i = 0; i < length; i++) {
data.rlevels.push(pageData.rlevels[i]);

@@ -691,9 +786,10 @@ data.dlevels.push(pageData.dlevels[i]);

}
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, header.dictionary_page_header.num_values, opts)
.map(d => d.toString());
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, (header.dictionary_page_header).num_values, opts)
.map((d) => d.toString());
}
async function decodeDataPage(cursor, header, opts) {
const cursorEnd = cursor.offset + header.compressed_page_size;
let valueCount = header.data_page_header.num_values;
let valueEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.encoding);
const dataPageHeader = header.data_page_header;
let valueCount = dataPageHeader.num_values;
let valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.encoding);
let valuesBufCursor = cursor;

@@ -709,3 +805,3 @@ if (opts.compression && opts.compression !== 'UNCOMPRESSED') {

/* read repetition levels */
let rLevelEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.repetition_level_encoding);
let rLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.repetition_level_encoding);
let rLevels = new Array(valueCount);

@@ -719,3 +815,3 @@ if (opts.rLevelMax > 0) {

/* read definition levels */
let dLevelEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.definition_level_encoding);
let dLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.definition_level_encoding);
let dLevels = new Array(valueCount);

@@ -738,3 +834,7 @@ if (opts.dLevelMax > 0) {

bitWidth: opts.column.typeLength,
disableEnvelope: opts.column.disableEnvelope
disableEnvelope: opts.column.disableEnvelope,
originalType: opts.column.originalType,
precision: opts.column.precision,
scale: opts.column.scale,
name: opts.column.name
});

@@ -746,3 +846,4 @@ cursor.offset = cursorEnd;

values: values,
count: valueCount
count: valueCount,
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY'
};

@@ -752,5 +853,6 @@ }

const cursorEnd = cursor.offset + header.compressed_page_size;
const valueCount = header.data_page_header_v2.num_values;
const valueCountNonNull = valueCount - header.data_page_header_v2.num_nulls;
const valueEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header_v2.encoding);
const dataPageHeaderV2 = header.data_page_header_v2;
const valueCount = dataPageHeaderV2.num_values;
const valueCountNonNull = valueCount - dataPageHeaderV2.num_nulls;
const valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeaderV2.encoding);
/* read repetition levels */

@@ -780,3 +882,3 @@ let rLevels = new Array(valueCount);

let valuesBufCursor = cursor;
if (header.data_page_header_v2.is_compressed) {
if (dataPageHeaderV2.is_compressed) {
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd));

@@ -791,4 +893,4 @@ valuesBufCursor = {

let values = decodeValues(opts.type, valueEncoding, valuesBufCursor, valueCountNonNull, {
typeLength: opts.column.typeLength,
bitWidth: opts.column.typeLength
bitWidth: opts.column.typeLength,
...opts.column
});

@@ -799,3 +901,4 @@ return {

values: values,
count: valueCount
count: valueCount,
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY'
};

@@ -806,3 +909,3 @@ }

schemaElements.forEach(schemaElement => {
let repetitionType = parquet_util.getThriftEnum(parquet_thrift.FieldRepetitionType, schemaElement.repetition_type);
let repetitionType = parquet_util.getThriftEnum(parquet_types_1.default.FieldRepetitionType, schemaElement.repetition_type);
let optional = false;

@@ -821,3 +924,3 @@ let repeated = false;

;
if (schemaElement.num_children > 0) {
if (schemaElement.num_children != undefined && schemaElement.num_children > 0) {
schema[schemaElement.name] = {

@@ -842,5 +945,5 @@ optional: optional,

else {
let logicalType = parquet_util.getThriftEnum(parquet_thrift.Type, schemaElement.type);
let logicalType = parquet_util.getThriftEnum(parquet_types_1.default.Type, schemaElement.type);
if (schemaElement.converted_type != null) {
logicalType = parquet_util.getThriftEnum(parquet_thrift.ConvertedType, schemaElement.converted_type);
logicalType = parquet_util.getThriftEnum(parquet_types_1.default.ConvertedType, schemaElement.converted_type);
}

@@ -851,3 +954,5 @@ schema[schemaElement.name] = {

optional: optional,
repeated: repeated
repeated: repeated,
scale: schemaElement.scale,
precision: schemaElement.precision
};

@@ -862,6 +967,1 @@ }

}
module.exports = {
ParquetEnvelopeReader,
ParquetReader,
};
//# sourceMappingURL=reader.js.map
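A hypothetical end-to-end sketch combining two additions in reader.js above: openUrl accepting a URL object and the new `for await` support on the reader. It assumes the package's public ParquetReader export wraps ParquetEnvelopeReader.openUrl and exposes close(); the URL is a placeholder.

const { ParquetReader } = require("@dsnp/parquetjs");

(async () => {
  const reader = await ParquetReader.openUrl(new URL("https://example.com/fruits.parquet"));
  for await (const record of reader) { // backed by the Symbol.asyncIterator added above
    console.log(record);
  }
  await reader.close();
})();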

@@ -1,6 +0,31 @@

'use strict';
const parquet_codec = require('./codec');
const parquet_compression = require('./compression');
const parquet_types = require('./types');
const parquet_util = require('./util');
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetSchema = void 0;
const parquet_codec = __importStar(require("./codec"));
const parquet_compression = __importStar(require("./compression"));
const parquet_types = __importStar(require("./types"));
const jsonSchema_1 = require("./jsonSchema");
const PARQUET_COLUMN_KEY_SEPARATOR = '.';

@@ -11,3 +36,13 @@ /**

class ParquetSchema {
schema;
fields;
fieldList;
/**
* Create a new schema from JSON Schema (json-schema.org)
*/
static fromJsonSchema(jsonSchema) {
const schema = (0, jsonSchema_1.fromJsonSchema)(jsonSchema);
return new ParquetSchema(schema);
}
/**
* Create a new schema from a JSON schema definition

@@ -24,3 +59,3 @@ */

findField(path) {
if (path.constructor !== Array) {
if (typeof path === 'string') {
path = path.split(",");

@@ -33,3 +68,6 @@ }

for (; path.length > 1; path.shift()) {
n = n[path[0]].fields;
let fields = n[path[0]]?.fields;
if (isDefined(fields)) {
n = fields;
}
}

@@ -42,3 +80,3 @@ return n[path[0]];

findFieldBranch(path) {
if (path.constructor !== Array) {
if (typeof path === 'string') {
path = path.split(",");

@@ -50,4 +88,5 @@ }

branch.push(n[path[0]]);
if (path.length > 1) {
n = n[path[0]].fields;
let fields = n[path[0]].fields;
if (path.length > 1 && isDefined(fields)) {
n = fields;
}

@@ -58,2 +97,3 @@ }

}
exports.ParquetSchema = ParquetSchema;
;

@@ -71,2 +111,3 @@ function buildFields(schema, rLevelParentMax, dLevelParentMax, path) {

let fieldList = {};
let fieldErrors = [];
for (let name in schema) {

@@ -95,3 +136,3 @@ const opts = schema[name];

name: name,
path: path.concat([name]),
path: path.concat(name),
repetitionType: repetitionType,

@@ -103,3 +144,3 @@ rLevelMax: rLevelMax,

fieldCount: Object.keys(opts.fields).length,
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat([name]))
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name))
};

@@ -110,6 +151,10 @@ if (opts.type == 'LIST' || opts.type == 'MAP')

}
/* field type */
const typeDef = parquet_types.PARQUET_LOGICAL_TYPES[opts.type];
let nameWithPath = (`${name}` || 'missing name');
if (path && path.length > 0) {
nameWithPath = `${path}.${nameWithPath}`;
}
const typeDef = opts.type ? parquet_types.getParquetTypeDataObject(opts.type, opts) : undefined;
if (!typeDef) {
throw 'invalid parquet type: ' + opts.type;
fieldErrors.push(`Invalid parquet type: ${(opts.type || "missing type")}, for Column: ${nameWithPath}`);
continue;
}

@@ -121,3 +166,3 @@ /* field encoding */

if (!(opts.encoding in parquet_codec)) {
throw 'unsupported parquet encoding: ' + opts.encodig;
fieldErrors.push(`Unsupported parquet encoding: ${opts.encoding}, for Column: ${nameWithPath}`);
}

@@ -128,4 +173,10 @@ if (!opts.compression) {

if (!(opts.compression in parquet_compression.PARQUET_COMPRESSION_METHODS)) {
throw 'unsupported compression method: ' + opts.compression;
fieldErrors.push(`Unsupported compression method: ${opts.compression}, for Column: ${nameWithPath}`);
}
if (typeDef.originalType === 'DECIMAL') {
// Default scale to 0 per https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal
if (typeof opts.scale === "undefined")
opts.scale = 0;
fieldErrors = fieldErrors.concat(errorsForDecimalOpts(typeDef.originalType, typeDef.primitiveType, opts, nameWithPath));
}
/* add to schema */

@@ -141,2 +192,4 @@ fieldList[name] = {

compression: opts.compression,
precision: opts.precision,
scale: opts.scale,
typeLength: opts.typeLength || typeDef.typeLength,

@@ -147,2 +200,5 @@ rLevelMax: rLevelMax,

}
if (fieldErrors.length > 0) {
throw fieldErrors.reduce((accumulator, currentVal) => accumulator + '\n' + currentVal);
}
return fieldList;

@@ -154,4 +210,5 @@ }

list.push(fields[k]);
if (fields[k].isNested) {
list = list.concat(listFields(fields[k].fields));
const nestedFields = fields[k].fields;
if (fields[k].isNested && isDefined(nestedFields)) {
list = list.concat(listFields(nestedFields));
}

@@ -161,3 +218,26 @@ }

}
module.exports = { ParquetSchema };
//# sourceMappingURL=schema.js.map
function isDefined(val) {
return val !== undefined;
}
function errorsForDecimalOpts(type, primitiveType, opts, columnName) {
const fieldErrors = [];
if (opts.precision === undefined || opts.precision < 1) {
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, precision is required and must be be greater than 0`);
}
else if (!Number.isInteger(opts.precision)) {
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, precision must be an integer`);
}
else if (primitiveType === "INT64" && opts.precision > 18) {
fieldErrors.push(`invalid schema for type: ${type} and primitive type: ${primitiveType} for Column: ${columnName}, can not handle precision over 18`);
}
if (typeof opts.scale === "undefined" || opts.scale < 0) {
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, scale is required to be 0 or greater`);
}
else if (!Number.isInteger(opts.scale)) {
fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, scale must be an integer`);
}
else if (opts.precision !== undefined && opts.scale > opts.precision) {
fieldErrors.push(`invalid schema or precision for type: ${type}, for Column: ${columnName}, precision must be greater than or equal to scale`);
}
return fieldErrors;
}
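A hypothetical schema sketch exercising the DECIMAL validation added above; the field name is a placeholder and the rejected case is shown commented out.

const { ParquetSchema } = require("@dsnp/parquetjs");

const schema = new ParquetSchema({
  price: { type: "DECIMAL", precision: 9, scale: 2 }, // accepted: integer precision >= 1, 0 <= scale <= precision
  // bad: { type: "DECIMAL", scale: 2 },              // rejected: precision is required for DECIMAL
});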

@@ -1,32 +0,33 @@

'use strict';
const parquet_types = require('./types');
const parquet_schema = require('./schema');
/**
* 'Shred' a record into a list of <value, repetition_level, definition_level>
* tuples per column using the Google Dremel Algorithm..
*
* The buffer argument must point to an object into which the shredded record
* will be returned. You may re-use the buffer for repeated calls to this function
* to append to an existing buffer, as long as the schema is unchanged.
*
* The format in which the shredded records will be stored in the buffer is as
* follows:
*
* buffer = {
* columnData: [
* 'my_col': {
* dlevels: [d1, d2, .. dN],
* rlevels: [r1, r2, .. rN],
* values: [v1, v2, .. vN],
* }, ...
* ],
* rowCount: X,
* }
*
*/
exports.shredRecord = function (schema, record, buffer) {
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.materializeRecords = exports.shredRecord = void 0;
const parquet_types = __importStar(require("./types"));
const shredRecord = function (schema, record, buffer) {
/* shred the record, this may raise an exception */
var recordShredded = {};
for (let field of schema.fieldList) {
recordShredded[field.path] = {
recordShredded[field.path.join(',')] = {
dlevels: [],

@@ -47,3 +48,4 @@ rlevels: [],

for (let field of schema.fieldList) {
buffer.columnData[field.path] = {
let path = field.path.join(',');
buffer.columnData[path] = {
dlevels: [],

@@ -55,3 +57,3 @@ rlevels: [],

};
buffer.pages[field.path] = [];
buffer.pages[path] = [];
}

@@ -62,4 +64,5 @@ }

for (let field of schema.fieldList) {
let record = recordShredded[field.path];
let column = buffer.columnData[field.path];
let path = field.path.join(',');
let record = recordShredded[path];
let column = buffer.columnData[path];
for (let i = 0; i < record.rlevels.length; i++) {

@@ -72,6 +75,7 @@ column.rlevels.push(record.rlevels[i]);

}
[...recordShredded[field.path].distinct_values].forEach(value => buffer.columnData[field.path].distinct_values.add(value));
buffer.columnData[field.path].count += recordShredded[field.path].count;
[...recordShredded[path].distinct_values].forEach(value => buffer.columnData[path].distinct_values.add(value));
buffer.columnData[path].count += recordShredded[path].count;
}
};
exports.shredRecord = shredRecord;
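
For orientation, a hedged usage sketch of the buffer this function fills (the schema, rows, and values are hypothetical): columns are keyed by the field path joined with `','`, and the same buffer object can be reused across repeated calls, as the comment above describes.

```js
const parquet = require('@dsnp/parquetjs');

const schema = new parquet.ParquetSchema({
  name:  { type: 'UTF8' },
  price: { type: 'DOUBLE' },
});

const buffer = {}; // shredRecord populates { rowCount, columnData, pages }
parquet.ParquetShredder.shredRecord(schema, { name: 'apple',  price: 2.5 }, buffer);
parquet.ParquetShredder.shredRecord(schema, { name: 'orange', price: 3.1 }, buffer);

// Row count and the column keys ('name', 'price')
console.log(buffer.rowCount, Object.keys(buffer.columnData));
```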
function shredRecordInternal(fields, record, data, rlvl, dlvl) {

@@ -81,8 +85,18 @@ for (let fieldName in fields) {

const fieldType = field.originalType || field.primitiveType;
const path = field.path.join(',');
// fetch values
let values = [];
if (record && (fieldName in record) && record[fieldName] !== undefined && record[fieldName] !== null) {
if (record[fieldName].constructor === Array) {
if (Array.isArray(record[fieldName])) {
values = record[fieldName];
}
else if (ArrayBuffer.isView(record[fieldName])) { // checks if any typed array
if (record[fieldName] instanceof Uint8Array) {
// wrap in a buffer, since not supported by parquet_thrift
values.push(Buffer.from(record[fieldName]));
}
else {
throw Object.prototype.toString.call(record[fieldName]) + ' is not supported';
}
}
else {

@@ -101,9 +115,9 @@ values.push(record[fieldName]);

if (values.length == 0) {
if (field.isNested) {
if (field.isNested && isDefined(field.fields)) {
shredRecordInternal(field.fields, null, data, rlvl, dlvl);
}
else {
data[field.path].rlevels.push(rlvl);
data[field.path].dlevels.push(dlvl);
data[field.path].count += 1;
data[path].rlevels.push(rlvl);
data[path].dlevels.push(dlvl);
data[path].count += 1;
}

@@ -115,11 +129,11 @@ continue;

const rlvl_i = i === 0 ? rlvl : field.rLevelMax;
if (field.isNested) {
if (field.isNested && isDefined(field.fields)) {
shredRecordInternal(field.fields, values[i], data, rlvl_i, field.dLevelMax);
}
else {
data[field.path].distinct_values.add(values[i]);
data[field.path].values.push(parquet_types.toPrimitive(fieldType, values[i]));
data[field.path].rlevels.push(rlvl_i);
data[field.path].dlevels.push(field.dLevelMax);
data[field.path].count += 1;
data[path].distinct_values.add(values[i]);
data[path].values.push(parquet_types.toPrimitive(fieldType, values[i], field));
data[path].rlevels.push(rlvl_i);
data[path].dlevels.push(field.dLevelMax);
data[path].count += 1;
}

@@ -149,3 +163,3 @@ }

*/
exports.materializeRecords = function (schema, buffer, records) {
const materializeRecords = function (schema, buffer, records) {
if (!records) {

@@ -167,3 +181,3 @@ records = [];

if (dLevel === field.dLevelMax) {
value = parquet_types.fromPrimitive(field.originalType || field.primitiveType, values.next().value);
value = parquet_types.fromPrimitive(field.originalType || field.primitiveType, values.next().value, field);
}

@@ -176,5 +190,8 @@ records[rLevels[0] - 1] = records[rLevels[0] - 1] || {};

};
exports.materializeRecords = materializeRecords;
function materializeRecordField(record, branch, rLevels, dLevel, value) {
const node = branch[0];
if (dLevel < node.dLevelMax) {
// This ensures that nulls are correctly processed
record[node.name] = value;
return;

@@ -187,10 +204,12 @@ }

}
while (record[node.name].length < rLevels[0] + 1) {
record[node.name].push({});
const recordValue = record[node.name];
while (recordValue.length < rLevels[0] + 1) {
recordValue.push({});
}
materializeRecordField(record[node.name][rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value);
materializeRecordField(recordValue[rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value);
}
else {
record[node.name] = record[node.name] || {};
materializeRecordField(record[node.name], branch.slice(1), rLevels, dLevel, value);
const recordValue = record[node.name];
materializeRecordField(recordValue, branch.slice(1), rLevels, dLevel, value);
}

@@ -203,6 +222,7 @@ }

}
while (record[node.name].length < rLevels[0] + 1) {
record[node.name].push(null);
const recordValue = record[node.name];
while (recordValue.length < rLevels[0] + 1) {
recordValue.push(null);
}
record[node.name][rLevels[0]] = value;
recordValue[rLevels[0]] = value;
}

@@ -214,2 +234,4 @@ else {

}
//# sourceMappingURL=shred.js.map
function isDefined(val) {
return val !== undefined;
}
'use strict';
const BSON = require('bson');
const PARQUET_LOGICAL_TYPES = {
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fromPrimitive = exports.toPrimitive = exports.getParquetTypeDataObject = void 0;
// Thanks to https://github.com/kbajalc/parquets for some of the code.
const BSON = __importStar(require("bson"));
function getParquetTypeDataObject(type, field) {
if (type === 'DECIMAL') {
if (field?.typeLength !== undefined) {
return {
primitiveType: 'FIXED_LEN_BYTE_ARRAY',
originalType: 'DECIMAL',
typeLength: field.typeLength,
toPrimitive: toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL
};
}
else if (field?.precision !== undefined && field.precision > 18) {
return {
primitiveType: 'BYTE_ARRAY',
originalType: 'DECIMAL',
typeLength: field.typeLength,
toPrimitive: toPrimitive_BYTE_ARRAY_DECIMAL
};
}
else {
return {
primitiveType: 'INT64',
originalType: 'DECIMAL',
toPrimitive: toPrimitive_INT64
};
}
}
else {
return PARQUET_LOGICAL_TYPE_DATA[type];
}
}
exports.getParquetTypeDataObject = getParquetTypeDataObject;
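
In other words, the primitive type backing a DECIMAL column depends on the field options. A hedged sketch of the three branches above (the deep require path is an assumption about the published file layout):

```js
const { getParquetTypeDataObject } = require('@dsnp/parquetjs/dist/lib/types');

// typeLength set: fixed-width byte array
console.log(getParquetTypeDataObject('DECIMAL', { typeLength: 16 }).primitiveType);          // FIXED_LEN_BYTE_ARRAY
// precision above 18: variable-length byte array
console.log(getParquetTypeDataObject('DECIMAL', { precision: 28, scale: 2 }).primitiveType); // BYTE_ARRAY
// otherwise: INT64
console.log(getParquetTypeDataObject('DECIMAL', { precision: 10, scale: 2 }).primitiveType); // INT64
```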
const PARQUET_LOGICAL_TYPES = new Set([
'BOOLEAN',
'INT32',
'INT64',
'INT96',
'FLOAT',
'DOUBLE',
'BYTE_ARRAY',
'FIXED_LEN_BYTE_ARRAY',
'UTF8',
'ENUM',
'TIME_MILLIS',
'TIME_MICROS',
'DATE',
'TIMESTAMP_MILLIS',
'TIMESTAMP_MICROS',
'UINT_8',
'UINT_16',
'UINT_32',
'UINT_64',
'INT_8',
'INT_16',
'INT_32',
'INT_64',
'DECIMAL',
'JSON',
'BSON',
'INTERVAL',
'MAP',
'LIST'
]);
const PARQUET_LOGICAL_TYPE_DATA = {
'BOOLEAN': {

@@ -135,14 +223,31 @@ primitiveType: 'BOOLEAN',

fromPrimitive: fromPrimitive_INTERVAL
},
MAP: {
originalType: 'MAP',
toPrimitive: toPrimitive_MAP,
},
LIST: {
originalType: 'LIST',
toPrimitive: toPrimitive_LIST,
}
};
/**
* Test if something is a valid Parquet Type
* @param type the string of the type
* @returns if type is a valid Parquet Type
*/
function isParquetType(type) {
return type !== undefined && PARQUET_LOGICAL_TYPES.has(type);
}
/**
 * Convert a value from its native representation to the internal/underlying
* primitive type
*/
function toPrimitive(type, value) {
if (!(type in PARQUET_LOGICAL_TYPES)) {
throw 'invalid type: ' + type;
function toPrimitive(type, value, field) {
if (!isParquetType(type)) {
throw 'invalid type: ' + type || "undefined";
}
return PARQUET_LOGICAL_TYPES[type].toPrimitive(value);
return getParquetTypeDataObject(type, field).toPrimitive(value);
}
exports.toPrimitive = toPrimitive;
/**

@@ -152,8 +257,9 @@ * Convert a value from it's internal/underlying primitive representation to

*/
function fromPrimitive(type, value) {
if (!(type in PARQUET_LOGICAL_TYPES)) {
throw 'invalid type: ' + type;
function fromPrimitive(type, value, field) {
if (!isParquetType(type)) {
throw 'invalid type: ' + type || "undefined";
}
if ("fromPrimitive" in PARQUET_LOGICAL_TYPES[type]) {
return PARQUET_LOGICAL_TYPES[type].fromPrimitive(value);
const typeFromPrimitive = getParquetTypeDataObject(type, field).fromPrimitive;
if (typeFromPrimitive !== undefined) {
return typeFromPrimitive(value);
}

@@ -164,2 +270,3 @@ else {

}
exports.fromPrimitive = fromPrimitive;
function toPrimitive_BOOLEAN(value) {

@@ -172,78 +279,146 @@ return !!value;

function toPrimitive_FLOAT(value) {
const v = parseFloat(value);
if (isNaN(v)) {
throw 'invalid value for FLOAT: ' + value;
if (typeof value === 'string') {
const v = parseFloat(value);
return v;
}
return v;
else if (typeof value === 'number') {
return value;
}
throw 'invalid value for FLOAT: ' + value;
}
function toPrimitive_DOUBLE(value) {
const v = parseFloat(value);
if (isNaN(v)) {
throw 'invalid value for DOUBLE: ' + value;
if (typeof value === 'string') {
const v = parseFloat(value);
return v;
}
return v;
else if (typeof value === 'number') {
return value;
}
throw 'invalid value for DOUBLE: ' + value;
}
function toPrimitive_INT8(value) {
const v = parseInt(value, 10);
if (v < -0x80 || v > 0x7f || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(-0x80, 0x7f, v);
return v;
}
catch {
throw 'invalid value for INT8: ' + value;
}
return v;
}
function toPrimitive_UINT8(value) {
const v = parseInt(value, 10);
if (v < 0 || v > 0xff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(0, 0xff, v);
return v;
}
catch {
throw 'invalid value for UINT8: ' + value;
}
return v;
}
function toPrimitive_INT16(value) {
const v = parseInt(value, 10);
if (v < -0x8000 || v > 0x7fff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(-0x8000, 0x7fff, v);
return v;
}
catch {
throw 'invalid value for INT16: ' + value;
}
return v;
}
function toPrimitive_UINT16(value) {
const v = parseInt(value, 10);
if (v < 0 || v > 0xffff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(0, 0xffff, v);
return v;
}
catch {
throw 'invalid value for UINT16: ' + value;
}
return v;
}
function toPrimitive_INT32(value) {
const v = parseInt(value, 10);
if (v < -0x80000000 || v > 0x7fffffff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(-0x80000000, 0x7fffffff, v);
return v;
}
catch {
throw 'invalid value for INT32: ' + value;
}
return v;
}
function toPrimitive_UINT32(value) {
const v = parseInt(value, 10);
if (v < 0 || v > 0xffffffffffff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(0, 0xffffffffffff, v);
return v;
}
catch {
throw 'invalid value for UINT32: ' + value;
}
return v;
}
const MIN_64 = BigInt('0x8000000000000000') * -1n;
const MAX_64 = BigInt('0x7fffffffffffffff');
function toPrimitive_INT64(value) {
const v = parseInt(value, 10);
if (isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(MIN_64, MAX_64, v);
return v;
}
catch {
throw 'invalid value for INT64: ' + value;
}
return v;
}
const MAX_U64 = BigInt('0xffffffffffffffff');
function toPrimitive_UINT64(value) {
const v = parseInt(value, 10);
if (v < 0 || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(0, MAX_U64, v);
return v;
}
catch {
throw 'invalid value for UINT64: ' + value;
}
return v;
}
const MIN_96 = BigInt('0x800000000000000000000000') * -1n;
const MAX_96 = BigInt('0x7fffffffffffffffffffffff');
function toPrimitive_INT96(value) {
const v = parseInt(value, 10);
if (isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(MIN_96, MAX_96, v);
return v;
}
catch {
throw 'invalid value for INT96: ' + value;
}
return v;
}
function toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL(value) {
return Buffer.from(value);
}
function toPrimitive_BYTE_ARRAY_DECIMAL(value) {
return Buffer.from(value);
}
function toPrimitive_MAP(value) {
return value;
}
function toPrimitive_LIST(value) {
return value;
}
function toPrimitive_BYTE_ARRAY(value) {

@@ -270,13 +445,27 @@ return Buffer.from(value);

}
function toPrimitive_TIME_MILLIS(value) {
const v = parseInt(value, 10);
if (v < 0 || v > 0xffffffffffffffff || isNaN(v)) {
throw 'invalid value for TIME_MILLIS: ' + value;
function toNumberInternal(typeName, value) {
let numberValue = 0;
switch (typeof value) {
case "string":
numberValue = parseInt(value, 10);
break;
case "number":
numberValue = value;
break;
default:
throw `${typeName} has an invalid type: ${typeof value}`;
}
return v;
// Year 2255 bug. Should eventually switch to bigint
if (numberValue < 0 || numberValue >= Number.MAX_SAFE_INTEGER) {
throw `${typeName} value is out of bounds: ${numberValue}`;
}
return numberValue;
}
function toPrimitive_TIME_MILLIS(value) {
return toNumberInternal("TIME_MILLIS", value);
}
function toPrimitive_TIME_MICROS(value) {
const v = BigInt(value);
if (v < 0n || isNaN(v)) {
throw 'invalid value for TIME_MICROS: ' + value;
if (v < 0n) {
throw 'TIME_MICROS value is out of bounds: ' + value;
}

@@ -291,10 +480,3 @@ return v;

}
/* convert from integer */
{
const v = parseInt(value, 10);
if (v < 0 || isNaN(v)) {
throw 'invalid value for DATE: ' + value;
}
return v;
}
return toNumberInternal("DATE", value);
}

@@ -309,10 +491,3 @@ function fromPrimitive_DATE(value) {

}
/* convert from integer */
{
const v = parseInt(value, 10);
if (v < 0 || isNaN(v)) {
throw 'invalid value for TIMESTAMP_MILLIS: ' + value;
}
return v;
}
return toNumberInternal("TIMESTAMP_MILLIS", value);
}

@@ -328,12 +503,18 @@ function fromPrimitive_TIMESTAMP_MILLIS(value) {

/* convert from integer */
{
try {
// Will throw if NaN
const v = BigInt(value);
if (v < 0n /*|| isNaN(v)*/) {
throw 'invalid value for TIMESTAMP_MICROS: ' + value;
if (v < 0n) {
throw 'out of bounds';
}
return v;
}
catch (e) {
throw 'TIMESTAMP_MICROS value is out of bounds: ' + value;
}
}
function fromPrimitive_TIMESTAMP_MICROS(value) {
return new Date(parseInt(value / 1000n));
if (typeof value === 'bigint')
return new Date(Number(value / 1000n));
return new Date(value / 1000);
}

@@ -357,3 +538,6 @@ function toPrimitive_INTERVAL(value) {

}
module.exports = { PARQUET_LOGICAL_TYPES, toPrimitive, fromPrimitive };
//# sourceMappingURL=types.js.map
function checkValidValue(lowerRange, upperRange, v) {
if (v < lowerRange || v > upperRange) {
throw "invalid value";
}
}
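
Taken together, the integer conversions above accept a number, bigint, or numeric string, convert strings via `BigInt`, and range-check the result with `checkValidValue`. A small sketch (the deep require path is an assumption about the published file layout):

```js
const { toPrimitive } = require('@dsnp/parquetjs/dist/lib/types');

toPrimitive('INT_8', 127);    // returned as-is: within [-0x80, 0x7f]
toPrimitive('INT_8', '127');  // numeric string is converted with BigInt first

try {
  toPrimitive('INT_8', 200);  // out of range for INT8
} catch (e) {
  console.log(e);             // 'invalid value for INT8: 200'
}
```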

@@ -1,17 +0,47 @@

'use strict';
const fs = require('fs');
const thrift = require('thrift');
const parquet_thrift = require('../gen-nodejs/parquet_types');
/** We need to use a patched version of TFramedTransport where
* readString returns the original buffer instead of a string if the
* buffer can not be safely encoded as utf8 (see http://bit.ly/2GXeZEF)
*/
class fixedTFramedTransport extends thrift.TFramedTransport {
readString(len) {
this.ensureAvailable(len);
var buffer = this.inBuf.slice(this.readPos, this.readPos + len);
var str = this.inBuf.toString('utf8', this.readPos, this.readPos + len);
this.readPos += len;
return (Buffer.from(str).equals(buffer)) ? str : buffer;
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.cloneInteger = exports.fieldIndexOf = exports.osopen = exports.osend = exports.oswrite = exports.fclose = exports.fread = exports.fstat = exports.fopen = exports.getThriftEnum = exports.getBitWidth = exports.decodeThrift = exports.serializeThrift = void 0;
const thrift_1 = __importDefault(require("thrift"));
const fs_1 = __importDefault(require("fs"));
const parquet_thrift = __importStar(require("../gen-nodejs/parquet_types"));
const thrift_2 = require("thrift");
/**
* We need to patch Thrift's TFramedTransport class bc the TS type definitions
* do not define a `readPos` field, even though the class implementation has
* one.
*/
class fixedTFramedTransport extends thrift_1.default.TFramedTransport {
inBuf;
readPos;
constructor(inBuf) {
super(inBuf);
this.inBuf = inBuf;
this.readPos = 0;
}
}

@@ -23,40 +53,20 @@ /** Patch PageLocation to be three element array that has getters/setters

*/
const previousPageLocation = parquet_thrift.PageLocation.prototype;
const PageLocation = parquet_thrift.PageLocation.prototype = [];
PageLocation.write = previousPageLocation.write;
PageLocation.read = previousPageLocation.read;
const getterSetter = index => ({
const getterSetter = (index) => ({
get: function () { return this[index]; },
set: function (value) { return this[index] = value; }
});
Object.defineProperty(PageLocation, 'offset', getterSetter(0));
Object.defineProperty(PageLocation, 'compressed_page_size', getterSetter(1));
Object.defineProperty(PageLocation, 'first_row_index', getterSetter(2));
exports.force32 = function () {
const protocol = thrift.TCompactProtocol.prototype;
protocol.zigzagToI64 = protocol.zigzagToI32;
protocol.readVarint64 = protocol.readVarint32 = function () {
let lo = 0;
let shift = 0;
let b;
while (true) {
b = this.trans.readByte();
lo = lo | ((b & 0x7f) << shift);
shift += 7;
if (!(b & 0x80)) {
break;
}
}
return lo;
};
};
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'offset', getterSetter(0));
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'compressed_page_size', getterSetter(1));
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'first_row_index', getterSetter(2));
/**
* Helper function that serializes a thrift object into a buffer
*/
exports.serializeThrift = function (obj) {
const serializeThrift = function (obj) {
let output = [];
let transport = new thrift.TBufferedTransport(null, function (buf) {
const callBack = function (buf) {
output.push(buf);
});
let protocol = new thrift.TCompactProtocol(transport);
};
let transport = new thrift_1.default.TBufferedTransport(undefined, callBack);
let protocol = new thrift_1.default.TCompactProtocol(transport);
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872
obj.write(protocol);

@@ -66,3 +76,4 @@ transport.flush();

};
exports.decodeThrift = function (obj, buf, offset) {
exports.serializeThrift = serializeThrift;
const decodeThrift = function (obj, buf, offset) {
if (!offset) {

@@ -73,10 +84,12 @@ offset = 0;

transport.readPos = offset;
var protocol = new thrift.TCompactProtocol(transport);
var protocol = new thrift_1.default.TCompactProtocol(transport);
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872
obj.read(protocol);
return transport.readPos - offset;
};
exports.decodeThrift = decodeThrift;
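
A hedged round-trip sketch of the two helpers above (the deep require paths are assumptions about the published file layout): `serializeThrift` compact-encodes a generated thrift object into a Buffer, and `decodeThrift` reads it back into a fresh instance, returning the number of bytes consumed.

```js
const parquet_util = require('@dsnp/parquetjs/dist/lib/util');
const parquet_thrift = require('@dsnp/parquetjs/dist/gen-nodejs/parquet_types');

// Build a small thrift struct, serialize it, then decode into a new instance.
const kv = new parquet_thrift.KeyValue();
kv.key = 'writer';
kv.value = '@dsnp/parquetjs';

const buf = parquet_util.serializeThrift(kv);
const decoded = new parquet_thrift.KeyValue();
const bytesRead = parquet_util.decodeThrift(decoded, buf); // mutates `decoded`
console.log(decoded.key, decoded.value, bytesRead);
```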
/**
* Get the number of bits required to store a given value
*/
exports.getBitWidth = function (val) {
const getBitWidth = function (val) {
if (val === 0) {

@@ -89,6 +102,7 @@ return 0;

};
exports.getBitWidth = getBitWidth;
/**
* FIXME not ideal that this is linear
*/
exports.getThriftEnum = function (klass, value) {
const getThriftEnum = function (klass, value) {
for (let k in klass) {

@@ -101,5 +115,6 @@ if (klass[k] === value) {

};
exports.fopen = function (filePath) {
exports.getThriftEnum = getThriftEnum;
const fopen = function (filePath) {
return new Promise((resolve, reject) => {
fs.open(filePath, 'r', (err, fd) => {
fs_1.default.open(filePath, 'r', (err, fd) => {
if (err) {

@@ -114,5 +129,6 @@ reject(err);

};
exports.fstat = function (filePath) {
exports.fopen = fopen;
const fstat = function (filePath) {
return new Promise((resolve, reject) => {
fs.stat(filePath, (err, stat) => {
fs_1.default.stat(filePath, (err, stat) => {
if (err) {

@@ -127,6 +143,7 @@ reject(err);

};
exports.fread = function (fd, position, length) {
exports.fstat = fstat;
const fread = function (fd, position, length) {
let buffer = Buffer.alloc(length);
return new Promise((resolve, reject) => {
fs.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => {
fs_1.default.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => {
if (err || bytesRead != length) {

@@ -141,5 +158,6 @@ reject(err || Error('read failed'));

};
exports.fclose = function (fd) {
exports.fread = fread;
const fclose = function (fd) {
return new Promise((resolve, reject) => {
fs.close(fd, (err) => {
fs_1.default.close(fd, (err) => {
if (err) {

@@ -154,3 +172,4 @@ reject(err);

};
exports.oswrite = function (os, buf) {
exports.fclose = fclose;
const oswrite = function (os, buf) {
return new Promise((resolve, reject) => {

@@ -162,3 +181,3 @@ os.write(buf, (err) => {

else {
resolve();
resolve(err);
}

@@ -168,3 +187,4 @@ });

};
exports.osend = function (os) {
exports.oswrite = oswrite;
const osend = function (os) {
return new Promise((resolve, reject) => {

@@ -176,3 +196,3 @@ os.end((err) => {

else {
resolve();
resolve(err);
}

@@ -182,5 +202,6 @@ });

};
exports.osopen = function (path, opts) {
exports.osend = osend;
const osopen = function (path, opts) {
return new Promise((resolve, reject) => {
let outputStream = fs.createWriteStream(path, opts);
let outputStream = fs_1.default.createWriteStream(path, opts);
outputStream.on('open', function (fd) {

@@ -194,3 +215,4 @@ resolve(outputStream);

};
exports.fieldIndexOf = function (arr, elem) {
exports.osopen = osopen;
const fieldIndexOf = function (arr, elem) {
for (let j = 0; j < arr.length; ++j) {

@@ -213,2 +235,6 @@ if (arr[j].length !== elem.length) {

};
//# sourceMappingURL=util.js.map
exports.fieldIndexOf = fieldIndexOf;
const cloneInteger = (int) => {
return new thrift_2.Int64(int.valueOf());
};
exports.cloneInteger = cloneInteger;

@@ -1,11 +0,39 @@

'use strict';
const stream = require('stream');
const parquet_thrift = require('../gen-nodejs/parquet_types');
const parquet_shredder = require('./shred');
const parquet_util = require('./util');
const parquet_codec = require('./codec');
const parquet_compression = require('./compression');
const parquet_types = require('./types');
const bloomFilterWriter = require("./bloomFilterIO/bloomFilterWriter");
const Long = require('long');
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetTransformer = exports.ParquetEnvelopeWriter = exports.ParquetWriter = void 0;
const stream_1 = __importDefault(require("stream"));
const parquet_types_1 = __importStar(require("../gen-nodejs/parquet_types"));
const parquet_shredder = __importStar(require("./shred"));
const parquet_util = __importStar(require("./util"));
const parquet_codec = __importStar(require("./codec"));
const parquet_compression = __importStar(require("./compression"));
const parquet_types = __importStar(require("./types"));
const bloomFilterWriter = __importStar(require("./bloomFilterIO/bloomFilterWriter"));
const node_int64_1 = __importDefault(require("node-int64"));
/**

@@ -35,2 +63,8 @@ * Parquet File Magic String

class ParquetWriter {
schema;
envelopeWriter;
rowBuffer;
rowGroupSize;
closed;
userMetadata;
/**

@@ -78,3 +112,3 @@ * Convenience method to create a new buffered parquet writer that writes to

async appendRow(row) {
if (this.closed) {
if (this.closed || this.envelopeWriter === null) {
throw 'writer was closed';

@@ -107,12 +141,14 @@ }

this.closed = true;
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) {
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters });
await this.envelopeWriter.writeRowGroup(this.rowBuffer);
this.rowBuffer = {};
if (this.envelopeWriter) {
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) {
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters });
await this.envelopeWriter.writeRowGroup(this.rowBuffer);
this.rowBuffer = {};
}
await this.envelopeWriter.writeBloomFilters();
await this.envelopeWriter.writeIndex();
await this.envelopeWriter.writeFooter(this.userMetadata);
await this.envelopeWriter.close();
this.envelopeWriter = null;
}
await this.envelopeWriter.writeBloomFilters();
await this.envelopeWriter.writeIndex();
await this.envelopeWriter.writeFooter(this.userMetadata);
await this.envelopeWriter.close();
this.envelopeWriter = null;
if (callback) {

@@ -142,12 +178,23 @@ callback();

setPageSize(cnt) {
this.writer.setPageSize(cnt);
this.envelopeWriter.setPageSize(cnt);
}
}
exports.ParquetWriter = ParquetWriter;
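
Putting the buffered writer together, a hedged end-to-end sketch (the file path, schema, and rows are hypothetical, and `openFile` is assumed to be the convenience constructor referenced above): rows are buffered, flushed as row groups, and `close()` writes any remaining rows plus bloom filters, the page index, and the footer.

```js
const parquet = require('@dsnp/parquetjs');

const schema = new parquet.ParquetSchema({
  name:  { type: 'UTF8' },
  price: { type: 'DOUBLE' },
});

(async () => {
  const writer = await parquet.ParquetWriter.openFile(schema, '/tmp/fruits.parquet');
  await writer.appendRow({ name: 'apple', price: 2.5 });
  await writer.appendRow({ name: 'orange', price: 3.1 });
  await writer.close(); // flushes buffered rows, then index and footer
})();
```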
/**
* Create a parquet file from a schema and a number of row groups. This class
* performs direct, unbuffered writes to the underlying output stream and is
* intendend for advanced and internal users; the writeXXX methods must be
* intended for advanced and internal users; the writeXXX methods must be
* called in the correct order to produce a valid file.
*/
class ParquetEnvelopeWriter {
schema;
write;
close;
offset;
rowCount;
rowGroups;
pageSize;
useDataPageV2;
pageIndex;
bloomFilters; // TODO: OR filterCollection
/**

@@ -159,3 +206,3 @@ * Create a new parquet envelope writer that writes to the specified stream

let closeFn = parquet_util.osend.bind(undefined, outputStream);
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, new node_int64_1.default(0), opts);
}

@@ -167,3 +214,3 @@ constructor(schema, writeFn, closeFn, fileOffset, opts) {

this.offset = fileOffset;
this.rowCount = 0;
this.rowCount = new node_int64_1.default(0);
this.rowGroups = [];

@@ -179,3 +226,3 @@ this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE;

writeSection(buf) {
this.offset += buf.length;
this.offset.setValue(this.offset.valueOf() + buf.length);
return this.write(buf);

@@ -200,14 +247,17 @@ }

});
this.rowCount += records.rowCount;
this.rowCount.setValue(this.rowCount.valueOf() + records.rowCount);
this.rowGroups.push(rgroup.metadata);
return this.writeSection(rgroup.body);
}
writeBloomFilters(_rowGroups) {
let rowGroups = _rowGroups || this.rowGroups;
rowGroups.forEach(group => {
writeBloomFilters() {
this.rowGroups.forEach(group => {
group.columns.forEach(column => {
const columnName = column.meta_data.path_in_schema[0];
if (columnName in this.bloomFilters === false)
if (!column.meta_data?.path_in_schema.length) {
return;
const serializedBloomFilterData = bloomFilterWriter.getSerializedBloomFilterData(this.bloomFilters[columnName]);
}
const filterName = column.meta_data?.path_in_schema.join(',');
if (!(filterName in this.bloomFilters)) {
return;
}
const serializedBloomFilterData = bloomFilterWriter.getSerializedBloomFilterData(this.bloomFilters[filterName]);
bloomFilterWriter.setFilterOffset(column, this.offset);

@@ -221,20 +271,19 @@ this.writeSection(serializedBloomFilterData);

*/
writeIndex(_rowGroups) {
let rowGroups = _rowGroups || this.rowGroups;
writeIndex() {
this.schema.fieldList.forEach((c, i) => {
rowGroups.forEach(group => {
this.rowGroups.forEach(group => {
let column = group.columns[i];
if (!column)
return;
if (column.meta_data.columnIndex) {
if (column.meta_data?.columnIndex) {
let columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex);
delete column.meta_data.columnIndex;
column.column_index_offset = this.offset;
column.column_index_offset = parquet_util.cloneInteger(this.offset);
column.column_index_length = columnBody.length;
this.writeSection(columnBody);
}
if (column.meta_data.offsetIndex) {
if (column.meta_data?.offsetIndex) {
let offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex);
delete column.meta_data.offsetIndex;
column.offset_index_offset = this.offset;
column.offset_index_offset = parquet_util.cloneInteger(this.offset);
column.offset_index_length = offsetBody.length;

@@ -249,3 +298,3 @@ this.writeSection(offsetBody);

*/
writeFooter(userMetadata, schema, rowCount, rowGroups) {
writeFooter(userMetadata) {
if (!userMetadata) {

@@ -268,6 +317,8 @@ userMetadata = {};

}
exports.ParquetEnvelopeWriter = ParquetEnvelopeWriter;
/**
* Create a parquet transform stream
*/
class ParquetTransformer extends stream.Transform {
class ParquetTransformer extends stream_1.default.Transform {
writer;
constructor(schema, opts = {}) {

@@ -280,9 +331,10 @@ super({ objectMode: true });

})(this);
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, 0, opts), opts);
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () {
}, new node_int64_1.default(0), opts), opts);
}
_transform(row, encoding, callback) {
_transform(row, _encoding, callback) {
if (row) {
this.writer.appendRow(row).then(data => callback(null, data), err => {
const fullErr = new Error(`Error transforming to parquet: ${err.toString()} row:${row}`);
fullErr.origErr = err;
fullErr.message = err;
callback(fullErr);

@@ -296,6 +348,7 @@ });

_flush(callback) {
this.writer.close(callback)
this.writer.close()
.then(d => callback(null, d), callback);
}
}
exports.ParquetTransformer = ParquetTransformer;
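
A hedged sketch of the transform stream in a pipeline (the schema, rows, and output path are hypothetical): object-mode rows go in, parquet bytes come out.

```js
const fs = require('fs');
const { Readable, pipeline } = require('stream');
const parquet = require('@dsnp/parquetjs');

const schema = new parquet.ParquetSchema({
  name:  { type: 'UTF8' },
  price: { type: 'DOUBLE' },
});

pipeline(
  Readable.from([{ name: 'apple', price: 2.5 }, { name: 'orange', price: 3.1 }]),
  new parquet.ParquetTransformer(schema),
  fs.createWriteStream('/tmp/fruits.parquet'),
  (err) => { if (err) console.error('parquet transform failed', err); }
);
```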
/**

@@ -315,3 +368,3 @@ * Encode a consecutive array of data using one of the parquet encodings

if (column.originalType) {
value = parquet_types.toPrimitive(column.originalType, value);
value = parquet_types.toPrimitive(column.originalType, value, column);
}

@@ -329,3 +382,3 @@ if (column.primitiveType !== 'BYTE_ARRAY') {

statistics.min = statistics.min_value;
return new parquet_thrift.Statistics(statistics);
return new parquet_types_1.default.Statistics(statistics);
}

@@ -341,8 +394,9 @@ async function encodePages(schema, rowBuffer, opts) {

let page;
const values = rowBuffer.columnData[field.path];
if (opts.bloomFilters && (field.name in opts.bloomFilters)) {
const splitBlockBloomFilter = opts.bloomFilters[field.name];
const columnPath = field.path.join(',');
const values = rowBuffer.columnData[columnPath];
if (opts.bloomFilters && (columnPath in opts.bloomFilters)) {
const splitBlockBloomFilter = opts.bloomFilters[columnPath];
values.values.forEach(v => splitBlockBloomFilter.insert(v));
}
let statistics;
let statistics = {};
if (field.statistics !== false) {

@@ -358,4 +412,4 @@ statistics = {};

});
statistics.null_count = values.dlevels.length - values.values.length;
statistics.distinct_count = values.distinct_values.size;
statistics.null_count = new node_int64_1.default(values.dlevels.length - values.values.length);
statistics.distinct_count = new node_int64_1.default(values.distinct_values.size);
}

@@ -366,5 +420,5 @@ if (opts.useDataPageV2) {

else {
page = await encodeDataPage(field, values.values, values.rlevels, values.dlevels, statistics);
page = await encodeDataPage(field, values.values || [], values.rlevels || [], values.dlevels || [], statistics);
}
let pages = rowBuffer.pages[field.path];
let pages = rowBuffer.pages[field.path.join(',')];
let lastPage = pages[pages.length - 1];

@@ -393,4 +447,4 @@ let first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0;

let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, {
typeLength: column.typeLength,
bitWidth: column.typeLength
bitWidth: column.typeLength,
...column
});

@@ -409,7 +463,7 @@ /* encode repetition and definition levels */

pageBody = await parquet_compression.deflate(column.compression, pageBody);
let pageHeader = new parquet_thrift.PageHeader();
pageHeader.type = parquet_thrift.PageType['DATA_PAGE'];
let pageHeader = new parquet_types_1.default.PageHeader();
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE'];
pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length;
pageHeader.compressed_page_size = pageBody.length;
pageHeader.data_page_header = new parquet_thrift.DataPageHeader();
pageHeader.data_page_header = new parquet_types_1.default.DataPageHeader();
pageHeader.data_page_header.num_values = dlevels.length;

@@ -419,7 +473,7 @@ if (column.statistics !== false) {

}
pageHeader.data_page_header.encoding = parquet_thrift.Encoding[column.encoding];
pageHeader.data_page_header.encoding = parquet_types_1.default.Encoding[column.encoding];
pageHeader.data_page_header.definition_level_encoding =
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING];
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING];
pageHeader.data_page_header.repetition_level_encoding =
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING];
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING];
/* concat page header, repetition and definition levels and values */

@@ -434,4 +488,4 @@ return Buffer.concat([parquet_util.serializeThrift(pageHeader), pageBody]);

let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, {
typeLength: column.typeLength,
bitWidth: column.typeLength
bitWidth: column.typeLength,
...column,
});

@@ -455,5 +509,5 @@ let valuesBufCompressed = await parquet_compression.deflate(column.compression, valuesBuf);

/* build page header */
let pageHeader = new parquet_thrift.PageHeader();
pageHeader.type = parquet_thrift.PageType['DATA_PAGE_V2'];
pageHeader.data_page_header_v2 = new parquet_thrift.DataPageHeaderV2();
let pageHeader = new parquet_types_1.default.PageHeader();
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE_V2'];
pageHeader.data_page_header_v2 = new parquet_types_1.default.DataPageHeaderV2();
pageHeader.data_page_header_v2.num_values = dlevels.length;

@@ -469,3 +523,3 @@ pageHeader.data_page_header_v2.num_nulls = dlevels.length - values.length;

rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length;
pageHeader.data_page_header_v2.encoding = parquet_thrift.Encoding[column.encoding];
pageHeader.data_page_header_v2.encoding = parquet_types_1.default.Encoding[column.encoding];
pageHeader.data_page_header_v2.definition_levels_byte_length = dLevelsBuf.length;

@@ -491,16 +545,19 @@ pageHeader.data_page_header_v2.repetition_levels_byte_length = rLevelsBuf.length;

/* prepare metadata header */
let metadata = new parquet_thrift.ColumnMetaData();
let metadata = new parquet_types_1.default.ColumnMetaData();
metadata.path_in_schema = opts.column.path;
metadata.num_values = num_values;
metadata.data_page_offset = opts.baseOffset;
metadata.num_values = new node_int64_1.default(num_values);
metadata.data_page_offset = new node_int64_1.default(opts.baseOffset);
metadata.encodings = [];
metadata.total_uncompressed_size = pagesBuf.length;
metadata.total_compressed_size = pagesBuf.length;
metadata.type = parquet_thrift.Type[opts.column.primitiveType];
metadata.codec = await parquet_thrift.CompressionCodec[opts.column.compression];
metadata.total_uncompressed_size = new node_int64_1.default(pagesBuf.length);
metadata.total_compressed_size = new node_int64_1.default(pagesBuf.length);
metadata.type = parquet_types_1.default.Type[opts.column.primitiveType];
metadata.codec = await parquet_types_1.default.CompressionCodec[opts.column.compression];
/* compile statistics ColumnIndex and OffsetIndex*/
let columnIndex = new parquet_thrift.ColumnIndex();
let columnIndex = new parquet_types_1.default.ColumnIndex();
columnIndex.null_pages = [];
columnIndex.max_values = [];
columnIndex.min_values = [];
let offsetIndex = new parquet_thrift.OffsetIndex();
// Default to unordered
columnIndex.boundary_order = 0;
let offsetIndex = new parquet_types_1.default.OffsetIndex();
offsetIndex.page_locations = [];

@@ -510,7 +567,7 @@ /* prepare statistics */

let distinct_values = new Set();
statistics.null_count = 0;
statistics.distinct_count = 0;
statistics.null_count = new node_int64_1.default(0);
statistics.distinct_count = new node_int64_1.default(0);
/* loop through pages and update indices and statistics */
for (let i = 0; i < pages.length; i++) {
let page = pages[i];
const page = pages[i];
if (opts.column.statistics !== false) {

@@ -523,12 +580,14 @@ if (page.statistics.max_value > statistics.max_value || i == 0) {

}
statistics.null_count += page.statistics.null_count;
page.distinct_values.forEach(value => distinct_values.add(value));
statistics.null_count.setValue(statistics.null_count.valueOf() + (page.statistics.null_count?.valueOf() || 0));
page.distinct_values.forEach((value) => distinct_values.add(value));
// If the number of values and the count of nulls are the same, this is a null page
columnIndex.null_pages.push(page.num_values === statistics.null_count.valueOf());
columnIndex.max_values.push(encodeStatisticsValue(page.statistics.max_value, opts.column));
columnIndex.min_values.push(encodeStatisticsValue(page.statistics.min_value, opts.column));
}
let pageLocation = new parquet_thrift.PageLocation();
pageLocation.offset = offset;
let pageLocation = new parquet_types_1.default.PageLocation();
pageLocation.offset = new node_int64_1.default(offset);
offset += page.page.length;
pageLocation.compressed_page_size = page.page.length;
pageLocation.first_row_index = page.first_row_index;
pageLocation.first_row_index = new node_int64_1.default(page.first_row_index);
offsetIndex.page_locations.push(pageLocation);

@@ -540,3 +599,3 @@ }

if (opts.column.statistics !== false) {
statistics.distinct_count = distinct_values.size;
statistics.distinct_count = new node_int64_1.default(distinct_values.size);
metadata.statistics = encodeStatistics(statistics, opts.column);

@@ -548,8 +607,4 @@ if (opts.pageIndex !== false) {

/* list encodings */
let encodingsSet = {};
encodingsSet[PARQUET_RDLVL_ENCODING] = true;
encodingsSet[opts.column.encoding] = true;
for (let k in encodingsSet) {
metadata.encodings.push(parquet_thrift.Encoding[k]);
}
metadata.encodings.push(parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]);
metadata.encodings.push(parquet_types_1.default.Encoding[opts.column.encoding]);
/* concat metadata header and data pages */

@@ -564,6 +619,6 @@ let metadataOffset = opts.baseOffset + pagesBuf.length;

async function encodeRowGroup(schema, data, opts) {
let metadata = new parquet_thrift.RowGroup();
metadata.num_rows = data.rowCount;
let metadata = new parquet_types_1.default.RowGroup();
metadata.num_rows = new node_int64_1.default(data.rowCount);
metadata.columns = [];
metadata.total_byte_size = 0;
metadata.total_byte_size = new node_int64_1.default(0);
let body = Buffer.alloc(0);

@@ -574,16 +629,15 @@ for (let field of schema.fieldList) {

}
let cchunkData = await encodeColumnChunk(data.pages[field.path], {
let cchunkData = await encodeColumnChunk(data.pages[field.path.join(',')], {
column: field,
baseOffset: opts.baseOffset + body.length,
pageSize: opts.pageSize,
encoding: field.encoding,
rowCount: data.rowCount,
useDataPageV2: opts.useDataPageV2,
pageIndex: opts.pageIndex
baseOffset: opts.baseOffset.valueOf() + body.length,
pageSize: opts.pageSize || 0,
rowCount: data.rowCount || 0,
useDataPageV2: opts.useDataPageV2 ?? true,
pageIndex: opts.pageIndex ?? true
});
let cchunk = new parquet_thrift.ColumnChunk();
cchunk.file_offset = cchunkData.metadataOffset;
let cchunk = new parquet_types_1.default.ColumnChunk();
cchunk.file_offset = new node_int64_1.default(cchunkData.metadataOffset);
cchunk.meta_data = cchunkData.metadata;
metadata.columns.push(cchunk);
metadata.total_byte_size += cchunkData.body.length;
metadata.total_byte_size = new node_int64_1.default(metadata.total_byte_size.valueOf() + (cchunkData.body.length));
body = Buffer.concat([body, cchunkData.body]);

@@ -597,5 +651,5 @@ }

function encodeFooter(schema, rowCount, rowGroups, userMetadata) {
let metadata = new parquet_thrift.FileMetaData();
let metadata = new parquet_types_1.default.FileMetaData();
metadata.version = PARQUET_VERSION;
metadata.created_by = 'parquet.js';
metadata.created_by = '@dsnp/parquetjs';
metadata.num_rows = rowCount;

@@ -606,3 +660,3 @@ metadata.row_groups = rowGroups;

for (let k in userMetadata) {
let kv = new parquet_thrift.KeyValue();
let kv = new parquet_types_1.default.KeyValue();
kv.key = k;

@@ -613,3 +667,3 @@ kv.value = userMetadata[k];

{
let schemaRoot = new parquet_thrift.SchemaElement();
let schemaRoot = new parquet_types_1.default.SchemaElement();
schemaRoot.name = 'root';

@@ -620,5 +674,5 @@ schemaRoot.num_children = Object.keys(schema.fields).length;

for (let field of schema.fieldList) {
let schemaElem = new parquet_thrift.SchemaElement();
let schemaElem = new parquet_types_1.default.SchemaElement();
schemaElem.name = field.name;
schemaElem.repetition_type = parquet_thrift.FieldRepetitionType[field.repetitionType];
schemaElem.repetition_type = parquet_types_1.default.FieldRepetitionType[field.repetitionType];
if (field.isNested) {

@@ -628,7 +682,14 @@ schemaElem.num_children = field.fieldCount;

else {
schemaElem.type = parquet_thrift.Type[field.primitiveType];
schemaElem.type = parquet_types_1.default.Type[field.primitiveType];
}
if (field.originalType) {
schemaElem.converted_type = parquet_thrift.ConvertedType[field.originalType];
schemaElem.converted_type = parquet_types_1.default.ConvertedType[field.originalType];
}
// Support Decimal
switch (schemaElem.converted_type) {
case (parquet_types_1.ConvertedType.DECIMAL):
schemaElem.precision = field.precision;
schemaElem.scale = field.scale || 0;
break;
}
schemaElem.type_length = field.typeLength;

@@ -644,7 +705,1 @@ metadata.schema.push(schemaElem);

}
module.exports = {
ParquetEnvelopeWriter,
ParquetWriter,
ParquetTransformer
};
//# sourceMappingURL=writer.js.map
"use strict";
const reader = require('./lib/reader');
const writer = require('./lib/writer');
const schema = require('./lib/schema');
const shredder = require('./lib/shred');
const util = require('./lib/util');
module.exports = {
ParquetEnvelopeReader: reader.ParquetEnvelopeReader,
ParquetReader: reader.ParquetReader,
ParquetEnvelopeWriter: writer.ParquetEnvelopeWriter,
ParquetWriter: writer.ParquetWriter,
ParquetTransformer: writer.ParquetTransformer,
ParquetSchema: schema.ParquetSchema,
ParquetShredder: shredder,
force32: util.force32
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
//# sourceMappingURL=parquet.js.map
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetFieldBuilder = exports.ParquetShredder = exports.ParquetSchema = exports.ParquetTransformer = exports.ParquetWriter = exports.ParquetEnvelopeWriter = exports.ParquetReader = exports.ParquetEnvelopeReader = void 0;
const reader = __importStar(require("./lib/reader"));
const writer = __importStar(require("./lib/writer"));
const schema = __importStar(require("./lib/schema"));
const shredder = __importStar(require("./lib/shred"));
const fields = __importStar(require("./lib/fields"));
exports.ParquetEnvelopeReader = reader.ParquetEnvelopeReader;
exports.ParquetReader = reader.ParquetReader;
exports.ParquetEnvelopeWriter = writer.ParquetEnvelopeWriter;
exports.ParquetWriter = writer.ParquetWriter;
exports.ParquetTransformer = writer.ParquetTransformer;
exports.ParquetSchema = schema.ParquetSchema;
exports.ParquetShredder = shredder;
exports.ParquetFieldBuilder = fields;
exports.default = {
ParquetEnvelopeReader: exports.ParquetEnvelopeReader,
ParquetReader: exports.ParquetReader,
ParquetEnvelopeWriter: exports.ParquetEnvelopeWriter,
ParquetWriter: exports.ParquetWriter,
ParquetTransformer: exports.ParquetTransformer,
ParquetSchema: exports.ParquetSchema,
ParquetShredder: exports.ParquetShredder,
ParquetFieldBuilder: exports.ParquetFieldBuilder,
};

@@ -5,3 +5,4 @@ {

"main": "dist/parquet.js",
"version": "0.0.0-81e38e",
"types": "dist/parquet.d.ts",
"version": "0.0.0-8d34ac",
"homepage": "https://github.com/LibertyDSNP/parquetjs",

@@ -18,51 +19,44 @@ "license": "MIT",

"dependencies": {
"@types/bson": "^4.0.3",
"@types/long": "^4.0.1",
"@types/node": "^14.14.35",
"@types/thrift": "^0.10.10",
"assert": "^2.0.0",
"@types/long": "^4.0.2",
"@types/node-int64": "^0.4.29",
"@types/thrift": "^0.10.11",
"browserify-zlib": "^0.2.0",
"bson": "4.4.0",
"bson": "4.6.3",
"cross-fetch": "^3.1.4",
"esbuild": "^0.12.11",
"events": "^3.3.0",
"int53": "^0.2.4",
"lodash": "^4.17.21",
"long": "^4.0.0",
"object-stream": "0.0.1",
"path-browserify": "^1.0.1",
"readable-stream": "^3.6.0",
"snappyjs": "^0.6.0",
"thrift": "0.14.1",
"util": "^0.12.4",
"varint": "^5.0.0",
"snappyjs": "^0.6.1",
"thrift": "0.16.0",
"varint": "^6.0.0",
"wasm-brotli": "^2.0.2",
"xxhash-wasm": "^0.4.1"
"xxhash-wasm": "^1.0.2"
},
"devDependencies": {
"@babel/core": "^7.14.6",
"@babel/preset-env": "^7.14.7",
"@babel/preset-typescript": "^7.14.5",
"@types/chai": "^4.2.16",
"@types/mocha": "^8.2.2",
"@types/sinon": "^10.0.0",
"babel-loader": "^8.2.2",
"babel-plugin-add-module-exports": "^1.0.4",
"@types/chai": "^4.3.5",
"@types/json-schema": "^7.0.11",
"@types/mocha": "^10.0.1",
"@types/node": "^18.18.2",
"@types/sinon": "^10.0.15",
"@types/varint": "^6.0.1",
"assert": "^2.0.0",
"browserfs": "^1.4.3",
"buffer": "^6.0.3",
"chai": "4.3.4",
"core-js": "^3.15.1",
"mocha": "8.3.2",
"msw": "^0.29.0",
"chai": "4.3.6",
"core-js": "^3.22.5",
"esbuild": "^0.19.2",
"mocha": "^10.2.0",
"msw": "^1.2.1",
"object-stream": "^0.0.1",
"process": "^0.11.10",
"regenerator-runtime": "^0.13.7",
"sinon": "^10.0.0",
"source-map-loader": "^3.0.0",
"regenerator-runtime": "^0.13.11",
"sinon": "^15.1.0",
"sinon-chai": "^3.7.0",
"sinon-chai-in-order": "^0.1.0",
"stream-browserify": "^3.0.0",
"ts-loader": "^9.2.3",
"ts-node": "^9.1.1",
"typescript": "^4.3.4"
"ts-node": "^10.9.1",
"typescript": "^5.0.4"
},
"scripts": {
"build": "npm run build:node",
"build": "npm run build:node && npm run build:types",
"build:types": "tsc -p tsconfig.types.json && cp gen-nodejs/parquet_types.d.ts dist/gen-nodejs/parquet_types.d.ts",
"build:node": "tsc -b",

@@ -72,5 +66,6 @@ "build:browser": "node esbuild.js",

"lint": "echo 'Linting, it is on the TODO list...'",
"test": "mocha -r ts-node/register 'test/**/*.{js,ts}'",
"test": "mocha -r ts-node/register 'test/{,!(browser)/**}/*.{js,ts}'",
"test:only": "mocha -r ts-node/register",
"clean": "rm -Rf ./dist",
"prepublishOnly": "npm run clean && npm run build:node && npm run build:browser",
"prepublishOnly": "npm run clean && npm run build:node && npm run build:types && npm run build:browser",
"thrift": "thrift -out gen-nodejs --gen js:ts parquet.thrift && thrift -out gen-nodejs --gen js:node parquet.thrift",

@@ -90,3 +85,3 @@ "serve": "node esbuild-serve.js"

"engines": {
"node": ">=14.16.0"
"node": ">=18.18.2"
},

@@ -93,0 +88,0 @@ "files": [

@@ -24,3 +24,3 @@ # parquet.js

## Installation
_parquet.js requires node.js >= 14.16.0_
_parquet.js requires node.js >= 18.18.2_

@@ -31,4 +31,4 @@ ```

### NodeJS
To use with nodejs:
### NodeJS
To use with nodejs:
```javascript

@@ -38,13 +38,43 @@ import parquetjs from "@dsnp/parquetjs"

### Browser
To use in a browser, in your bundler, depending on your needs, write the appropriate plugin or resolver to point to:
### Browser with Bundler
To use in a browser with a bundler, depending on your needs, write the appropriate plugin or resolver to point to either the Common JS or ES Module version:
```javascript
"node_modules/@dsnp/parquetjs/browser/parquetjs"
// Common JS
"node_modules/@dsnp/parquetjs/dist/browser/parquetjs.cjs"
// ES Modules
"node_modules/@dsnp/parquetjs/dist/browser/parquetjs.esm"
```
or:
```javascript
import parquetjs from "@dsnp/parquetjs/browser/parquetjs"
// Common JS
import parquetjs from "@dsnp/parquetjs/dist/browser/parquetjs.cjs"
// ES Modules
import parquetjs from "@dsnp/parquetjs/dist/browser/parquetjs.esm"
```
### Browser Direct: ES Modules
To use directly in the browser without a bundler using ES Modules:
1. Build the package: `npm install && npm run build:browser`
2. Copy `dist/browser/parquetjs.esm.js` to the server
3. Use it in your html or other ES Modules:
```html
<script type="module">
import parquetjs from '../parquet.esm.js';
// Use parquetjs
</script>
```
### Browser Direct: Plain Ol' JavaScript
To use directly in the browser without a bundler or ES Modules:
1. Build the package: `npm install && npm run build:browser`
2. Copy `dist/browser/parquetjs.js` to the server
3. Use the global `parquetjs` variable to access parquetjs functions
```html
<script>
// console.log(parquetjs)
</script>
```
## Usage: Writing files

@@ -63,2 +93,4 @@

### Native Schema Definition
``` js

@@ -75,2 +107,43 @@ // declare a schema for the `fruits` table

### Helper Functions
```js
var schema = new parquet.ParquetSchema({
name: parquet.ParquetFieldBuilder.createStringField(),
quantity: parquet.ParquetFieldBuilder.createIntField(64),
price: parquet.ParquetFieldBuilder.createDoubleField(),
date: parquet.ParquetFieldBuilder.createTimestampField(),
in_stock: parquet.ParquetFieldBuilder.createBooleanField()
});
```
### JSON Schema
``` js
// declare a schema for the `fruits` JSON Schema
var schema = new parquet.ParquetSchema.fromJsonSchema({
"type": "object",
"properties": {
"name": {
"type": "string"
},
"quantity": {
"type": "integer"
},
"price": {
"type": "number"
},
"date": {
"type": "string",
"format": "date-time"
},
"in_stock": {
"type": "boolean"
}
},
"required": ["name", "quantity", "price", "date", "in_stock"]
});
```
Note that the Parquet schema supports nesting, so you can store complex, arbitrarily

@@ -82,3 +155,3 @@ nested records into a single row (more on that later) while still maintaining good

take input rows as JSON objects, convert them to the Parquet format and store
them on disk.
them on disk.

@@ -97,3 +170,3 @@ ``` js

### Adding bloom filters
### Adding bloom filters

@@ -217,3 +290,3 @@ Bloom filters can be added to multiple columns as demonstrated below:

Parquet files can be read from an S3 object without having to download the whole file.
You will have to supply the aws-sdk client as first argument and the bucket/key information
You will have to supply the aws-sdk client as first argument and the bucket/key information
as second argument to the function `parquetReader.openS3`.
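
A hedged sketch of reading straight from S3 (the bucket, key, and region are hypothetical, and the reader is assumed to be opened via `ParquetReader.openS3` with aws-sdk v2's S3 client):

```js
const AWS = require('aws-sdk');
const parquet = require('@dsnp/parquetjs');

(async () => {
  const client = new AWS.S3({ region: 'us-east-1' });
  const reader = await parquet.ParquetReader.openS3(client, {
    Bucket: 'my-bucket',
    Key: 'fruits/fruits.parquet',
  });

  const cursor = reader.getCursor();
  let record;
  while ((record = await cursor.next())) {
    console.log(record);
  }
  await reader.close();
})();
```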

@@ -267,3 +340,3 @@

combination with the `BOOLEAN`, `INT32` and `INT64` types. The RLE encoding
requires an additional `bitWidth` parameter that contains the maximum number of
requires an additional `typeLength` parameter that contains the maximum number of
bits required to store the largest value of the field.

@@ -273,3 +346,3 @@

var schema = new parquet.ParquetSchema({
age: { type: 'UINT_32', encoding: 'RLE', bitWidth: 7 },
age: { type: 'UINT_32', encoding: 'RLE', typeLength: 7 },
});

@@ -304,3 +377,3 @@ ```

Consider this example, which allows us to store a more advanced "fruits" table
where each row contains a name, a list of colours and a list of "stock" objects.
where each row contains a name, a list of colours and a list of "stock" objects.

@@ -307,0 +380,0 @@ ``` js
