@dsnp/parquetjs npm package: compare versions

Comparing version 0.0.0-ba6065 to 0.0.0-cadf48

dist/browser/parquet.cjs.js


dist/lib/bloom/sbbf.js
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {

@@ -46,13 +37,29 @@ return (mod && mod.__esModule) ? mod : { "default": mod };

class SplitBlockBloomFilter {
constructor() {
/**
* Instance
*/
this.splitBlockFilter = [];
this.desiredFalsePositiveRate = SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE;
this.numBlocks = 0;
this.numDistinctValues = SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES;
this.hashStrategy = new parquet_types_1.default.BloomFilterHash(new parquet_types_1.default.XxHash());
this.hasher = new xxhasher_1.default();
}
static salt = [
0x47b6137b,
0x44974d91,
0x8824ad5b,
0xa2b7289d,
0x705495c7,
0x2df1424b,
0x9efc4947,
0x5c6bfb31
];
// How many bits are in a single block:
// - Blocks are UInt32 arrays
// - There are 8 UInt32 words in each block.
static WORDS_PER_BLOCK = 8;
static WORD_SIZE = 32;
static BITS_PER_BLOCK = SplitBlockBloomFilter.WORDS_PER_BLOCK * SplitBlockBloomFilter.WORD_SIZE;
// Default number of blocks in a Split Block Bloom filter (SBBF)
static NUMBER_OF_BLOCKS = 32;
// The lower bound of SBBF size in bytes.
// Currently this is 1024
static LOWER_BOUND_BYTES = SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK / 8;
// The upper bound of SBBF size, set to default row group size in bytes.
// Note that the subsequent requirements for an effective bloom filter on a row group this size would mean this
// is unacceptably large for a lightweight client application.
static UPPER_BOUND_BYTES = 128 * 1024 * 1024;
static DEFAULT_FALSE_POSITIVE_RATE = 0.001;
static DEFAULT_DISTINCT_VALUES = 128 * 1024;
/**

@@ -161,5 +168,5 @@ * @function initBlock

for (let j = 0; j < this.WORD_SIZE; j++) {
const isSet = masked[i] & (Math.pow(2, j));
const isSet = masked[i] & (2 ** j);
if (isSet) {
b[i] = b[i] | (Math.pow(2, j));
b[i] = b[i] | (2 ** j);
}

@@ -183,5 +190,5 @@ }

for (let j = 0; j < this.WORD_SIZE; j++) {
const isSet = masked[i] & (Math.pow(2, j));
const isSet = masked[i] & (2 ** j);
if (isSet) {
const match = b[i] & (Math.pow(2, j));
const match = b[i] & (2 ** j);
if (!match) {

@@ -195,2 +202,11 @@ return false;

}
/**
* Instance
*/
splitBlockFilter = [];
desiredFalsePositiveRate = SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE;
numBlocks = 0;
numDistinctValues = SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES;
hashStrategy = new parquet_types_1.default.BloomFilterHash(new parquet_types_1.default.XxHash());
hasher = new xxhasher_1.default();
isInitialized() { return this.splitBlockFilter.length > 0; }

@@ -311,10 +327,8 @@ getFalsePositiveRate() { return this.desiredFalsePositiveRate; }

}
hash(value) {
return __awaiter(this, void 0, void 0, function* () {
if (!this.hashStrategy.hasOwnProperty("XXHASH")) {
throw new Error("unsupported hash strategy");
}
const hashed = yield this.hasher.hash64(value);
return Long.fromString(hashed, true, 16);
});
async hash(value) {
if (!this.hashStrategy.hasOwnProperty("XXHASH")) {
throw new Error("unsupported hash strategy");
}
const hashed = await this.hasher.hash64(value);
return Long.fromString(hashed, true, 16);
}

@@ -335,8 +349,6 @@ insertHash(hashValue) {

*/
insert(value) {
return __awaiter(this, void 0, void 0, function* () {
if (!this.isInitialized())
throw new Error("filter has not been initialized. call init() first");
this.insertHash(yield this.hash(value));
});
async insert(value) {
if (!this.isInitialized())
throw new Error("filter has not been initialized. call init() first");
this.insertHash(await this.hash(value));
}

@@ -358,38 +370,8 @@ checkHash(hashValue) {

*/
check(value) {
return __awaiter(this, void 0, void 0, function* () {
if (!this.isInitialized())
throw new Error("filter has not been initialized");
return this.checkHash(yield this.hash(value));
});
async check(value) {
if (!this.isInitialized())
throw new Error("filter has not been initialized");
return this.checkHash(await this.hash(value));
}
}
SplitBlockBloomFilter.salt = [
0x47b6137b,
0x44974d91,
0x8824ad5b,
0xa2b7289d,
0x705495c7,
0x2df1424b,
0x9efc4947,
0x5c6bfb31
];
// How many bits are in a single block:
// - Blocks are UInt32 arrays
// - There are 8 UInt32 words in each block.
SplitBlockBloomFilter.WORDS_PER_BLOCK = 8;
SplitBlockBloomFilter.WORD_SIZE = 32;
SplitBlockBloomFilter.BITS_PER_BLOCK = SplitBlockBloomFilter.WORDS_PER_BLOCK * SplitBlockBloomFilter.WORD_SIZE;
// Default number of blocks in a Split Block Bloom filter (SBBF)
SplitBlockBloomFilter.NUMBER_OF_BLOCKS = 32;
// The lower bound of SBBF size in bytes.
// Currently this is 1024
SplitBlockBloomFilter.LOWER_BOUND_BYTES = SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK / 8;
// The upper bound of SBBF size, set to default row group size in bytes.
// Note that the subsequent requirements for an effective bloom filter on a row group this size would mean this
// is unacceptably large for a lightweight client application.
SplitBlockBloomFilter.UPPER_BOUND_BYTES = 128 * 1024 * 1024;
SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE = 0.001;
SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES = 128 * 1024;
exports.default = SplitBlockBloomFilter;
//# sourceMappingURL=sbbf.js.map
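
For orientation, a minimal usage sketch of the SplitBlockBloomFilter class above. The init() call and the deep require path are assumptions; neither is fully visible in this diff.

```js
// Hypothetical sketch, not from the package docs: exercise the async insert/check API above.
const SplitBlockBloomFilter = require('@dsnp/parquetjs/dist/lib/bloom/sbbf').default; // path assumed

async function demo() {
  const filter = new SplitBlockBloomFilter();
  filter.init(); // assumed: allocates the block array; insert()/check() throw before this
  await filter.insert('some-row-value');               // hashes with xxhash64 and sets bits in one block
  console.log(await filter.check('some-row-value'));   // true
  console.log(await filter.check('other-value'));      // false, up to the configured false-positive rate
}
demo().catch(console.error);
```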
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const xxhash_wasm_1 = __importDefault(require("xxhash-wasm"));

@@ -27,6 +19,5 @@ const long_1 = __importDefault(require("long"));

class XxHasher {
hashit(value) {
return __awaiter(this, void 0, void 0, function* () {
return (yield XxHasher.h64)(value);
});
static h64 = (0, xxhash_wasm_1.default)().then(x => x.h64);
async hashit(value) {
return (await XxHasher.h64)(value);
}

@@ -39,20 +30,16 @@ /**

*/
hash64(value) {
return __awaiter(this, void 0, void 0, function* () {
if (typeof value === 'string')
return this.hashit(value);
if (value instanceof Buffer ||
value instanceof Uint8Array ||
value instanceof long_1.default ||
typeof value === 'boolean' ||
typeof value === 'number' ||
typeof value === 'bigint') {
return this.hashit(value.toString());
}
throw new Error("unsupported type: " + value);
});
async hash64(value) {
if (typeof value === 'string')
return this.hashit(value);
if (value instanceof Buffer ||
value instanceof Uint8Array ||
value instanceof long_1.default ||
typeof value === 'boolean' ||
typeof value === 'number' ||
typeof value === 'bigint') {
return this.hashit(value.toString());
}
throw new Error("unsupported type: " + value);
}
}
XxHasher.h64 = xxhash_wasm_1.default().then(x => x.h64);
module.exports = XxHasher;
//# sourceMappingURL=xxhasher.js.map
exports.default = XxHasher;
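
A small sketch of the XxHasher wrapper above; the require path is assumed. As the diff shows, hash64 stringifies non-string inputs before hashing.

```js
// Sketch only: hash a string and a number with the XxHasher class shown above.
const XxHasher = require('@dsnp/parquetjs/dist/lib/bloom/xxhasher').default; // path assumed

async function demo() {
  const hasher = new XxHasher();
  console.log(await hasher.hash64('parquet')); // 64-bit xxhash of the string
  console.log(await hasher.hash64(42));        // numbers are converted with toString() first
}
demo().catch(console.error);
```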
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};

@@ -16,3 +26,3 @@ var __importDefault = (this && this.__importDefault) || function (mod) {

exports.getBloomFiltersFor = exports.siftAllByteOffsets = exports.parseBloomFilterOffsets = void 0;
const util_1 = __importDefault(require("../util"));
const parquet_util = __importStar(require("../util"));
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types"));

@@ -22,4 +32,3 @@ const sbbf_1 = __importDefault(require("../bloom/sbbf"));

return columnChunkDataCollection.filter((columnChunk) => {
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, }, }, } = columnChunk;
return bloomFilterOffsetBuffer;
return columnChunk.column.meta_data?.bloom_filter_offset;
});

@@ -35,6 +44,6 @@ };

const parseBloomFilterOffsets = (ColumnChunkDataCollection) => {
return ColumnChunkDataCollection.map((columnChunkData) => {
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, path_in_schema: pathInSchema, }, }, rowGroupIndex, } = columnChunkData;
return ColumnChunkDataCollection.map(({ rowGroupIndex, column }) => {
const { bloom_filter_offset: bloomOffset, path_in_schema: pathInSchema, } = column.meta_data || {};
return {
offsetBytes: toInteger(bloomFilterOffsetBuffer),
offsetBytes: toInteger(bloomOffset.buffer),
columnName: pathInSchema.join(","),

@@ -46,13 +55,16 @@ rowGroupIndex,

exports.parseBloomFilterOffsets = parseBloomFilterOffsets;
const getBloomFilterHeader = (offsetBytes, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () {
const getBloomFilterHeader = async (offsetBytes, envelopeReader) => {
const headerByteSizeEstimate = 200;
let bloomFilterHeaderData;
try {
bloomFilterHeaderData = yield envelopeReader.read(offsetBytes, headerByteSizeEstimate);
bloomFilterHeaderData = await envelopeReader.read(offsetBytes, headerByteSizeEstimate);
}
catch (e) {
throw new Error(e);
if (typeof e === 'string')
throw new Error(e);
else
throw e;
}
const bloomFilterHeader = new parquet_types_1.default.BloomFilterHeader();
const sizeOfBloomFilterHeader = util_1.default.decodeThrift(bloomFilterHeader, bloomFilterHeaderData);
const sizeOfBloomFilterHeader = parquet_util.decodeThrift(bloomFilterHeader, bloomFilterHeaderData);
return {

@@ -62,15 +74,18 @@ bloomFilterHeader,

};
});
const readFilterData = (offsetBytes, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () {
const { bloomFilterHeader, sizeOfBloomFilterHeader, } = yield getBloomFilterHeader(offsetBytes, envelopeReader);
};
const readFilterData = async (offsetBytes, envelopeReader) => {
const { bloomFilterHeader, sizeOfBloomFilterHeader, } = await getBloomFilterHeader(offsetBytes, envelopeReader);
const { numBytes: filterByteSize } = bloomFilterHeader;
try {
const filterBlocksOffset = offsetBytes + sizeOfBloomFilterHeader;
const buffer = yield envelopeReader.read(filterBlocksOffset, filterByteSize);
const buffer = await envelopeReader.read(filterBlocksOffset, filterByteSize);
return buffer;
}
catch (e) {
throw new Error(e);
if (typeof e === 'string')
throw new Error(e);
else
throw e;
}
});
};
const readFilterDataFrom = (offsets, envelopeReader) => {

@@ -80,10 +95,10 @@ return Promise.all(offsets.map((offset) => readFilterData(offset, envelopeReader)));

const siftAllByteOffsets = (columnChunkDataCollection) => {
return exports.parseBloomFilterOffsets(filterColumnChunksWithBloomFilters(columnChunkDataCollection));
return (0, exports.parseBloomFilterOffsets)(filterColumnChunksWithBloomFilters(columnChunkDataCollection));
};
exports.siftAllByteOffsets = siftAllByteOffsets;
const getBloomFiltersFor = (columnNames, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () {
const getBloomFiltersFor = async (columnNames, envelopeReader) => {
const columnChunkDataCollection = envelopeReader.getAllColumnChunkDataFor(columnNames);
const bloomFilterOffsetData = exports.siftAllByteOffsets(columnChunkDataCollection);
const bloomFilterOffsetData = (0, exports.siftAllByteOffsets)(columnChunkDataCollection);
const offsetByteValues = bloomFilterOffsetData.map(({ offsetBytes }) => offsetBytes);
const filterBlocksBuffers = yield readFilterDataFrom(offsetByteValues, envelopeReader);
const filterBlocksBuffers = await readFilterDataFrom(offsetByteValues, envelopeReader);
return filterBlocksBuffers.map((buffer, index) => {

@@ -97,4 +112,3 @@ const { columnName, rowGroupIndex } = bloomFilterOffsetData[index];

});
});
};
exports.getBloomFiltersFor = getBloomFiltersFor;
//# sourceMappingURL=bloomFilterReader.js.map
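
A hypothetical end-to-end sketch of getBloomFiltersFor as exposed through ParquetReader (the reader.js changes later in this diff group results by column name). The top-level export, file name, column name, and result shape are assumptions for illustration.

```js
// Hypothetical sketch: read the bloom filters for one column through ParquetReader.
const { ParquetReader } = require('@dsnp/parquetjs'); // top-level export assumed

async function demo() {
  const reader = await ParquetReader.openFile('rows.parquet');   // file name is illustrative
  const filters = await reader.getBloomFiltersFor(['user_id']);  // column name is illustrative
  // roughly: { user_id: [{ sbbf, columnName, rowGroupIndex }, ...] }
  console.log(Object.keys(filters));
  await reader.close();
}
demo().catch(console.error);
```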
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {

@@ -7,3 +26,3 @@ return (mod && mod.__esModule) ? mod : { "default": mod };

exports.getSerializedBloomFilterData = exports.setFilterOffset = exports.serializeFilterData = exports.serializeFilterHeaders = exports.createSBBF = void 0;
const util_1 = __importDefault(require("../util"));
const parquet_util = __importStar(require("../util"));
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types"));

@@ -37,3 +56,3 @@ const sbbf_1 = __importDefault(require("../bloom/sbbf"));

const bloomFilterHeader = buildFilterHeader(numberOfBytes);
return util_1.default.serializeThrift(bloomFilterHeader);
return parquet_util.serializeThrift(bloomFilterHeader);
};

@@ -43,3 +62,3 @@ exports.serializeFilterHeaders = serializeFilterHeaders;

const serializedFilterBlocks = serializeFilterBlocks(params.filterBlocks);
const serializedFilterHeaders = exports.serializeFilterHeaders(params.filterByteSize);
const serializedFilterHeaders = (0, exports.serializeFilterHeaders)(params.filterByteSize);
return Buffer.concat([serializedFilterHeaders, serializedFilterBlocks]);

@@ -49,3 +68,3 @@ };

const setFilterOffset = (column, offset) => {
column.meta_data.bloom_filter_offset = offset;
column.meta_data.bloom_filter_offset = parquet_util.cloneInteger(offset);
};

@@ -56,5 +75,4 @@ exports.setFilterOffset = setFilterOffset;

const filterByteSize = splitBlockBloomFilter.getNumFilterBytes();
return exports.serializeFilterData({ filterBlocks, filterByteSize });
return (0, exports.serializeFilterData)({ filterBlocks, filterByteSize });
};
exports.getSerializedBloomFilterData = getSerializedBloomFilterData;
//# sourceMappingURL=bloomFilterWriter.js.map
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
module.exports = class BufferReader {
Object.defineProperty(exports, "__esModule", { value: true });
class BufferReader {
maxSpan;
maxLength;
queueWait;
scheduled;
queue;
envelopeReader;
constructor(envelopeReader, options) {

@@ -22,54 +20,51 @@ options = options || {};

read(offset, length) {
return __awaiter(this, void 0, void 0, function* () {
if (!this.scheduled) {
this.scheduled = true;
setTimeout(() => {
this.scheduled = false;
this.processQueue();
}, this.queueWait);
}
return new Promise((resolve, reject) => {
this.queue.push({ offset, length, resolve, reject });
});
if (!this.scheduled) {
this.scheduled = true;
setTimeout(() => {
this.scheduled = false;
this.processQueue();
}, this.queueWait);
}
return new Promise((resolve, reject) => {
this.queue.push({ offset, length, resolve, reject });
});
}
processQueue() {
return __awaiter(this, void 0, void 0, function* () {
const queue = this.queue;
if (!queue.length)
async processQueue() {
const queue = this.queue;
if (!queue.length)
return;
this.queue = [];
queue.sort((a, b) => a.offset - b.offset);
var subqueue = [];
const readSubqueue = async () => {
if (!subqueue.length) {
return;
this.queue = [];
queue.sort((a, b) => a.offset - b.offset);
var subqueue = [];
const readSubqueue = () => __awaiter(this, void 0, void 0, function* () {
if (!subqueue.length) {
return;
}
const processQueue = subqueue;
subqueue = [];
const lastElement = processQueue[processQueue.length - 1];
const start = processQueue[0].offset;
const finish = lastElement.offset + lastElement.length;
const buffer = yield this.envelopeReader.readFn(start, finish - start);
processQueue.forEach((d) => __awaiter(this, void 0, void 0, function* () {
d.resolve(buffer.slice(d.offset - start, d.offset + d.length - start));
}));
}
const processQueue = subqueue;
subqueue = [];
const lastElement = processQueue[processQueue.length - 1];
const start = processQueue[0].offset;
const finish = lastElement.offset + lastElement.length;
const buffer = await this.envelopeReader.readFn(start, finish - start);
processQueue.forEach(async (d) => {
d.resolve(buffer.slice(d.offset - start, d.offset + d.length - start));
});
queue.forEach((d, i) => {
const prev = queue[i - 1];
if (!prev || (d.offset - (prev.offset + prev.length)) < this.maxSpan) {
subqueue.push(d);
if ((d.offset + d.length) - subqueue[0].offset > this.maxLength) {
readSubqueue();
}
}
else {
};
queue.forEach((d, i) => {
const prev = queue[i - 1];
if (!prev || (d.offset - (prev.offset + prev.length)) < this.maxSpan) {
subqueue.push(d);
if ((d.offset + d.length) - subqueue[0].offset > this.maxLength) {
readSubqueue();
subqueue = [d];
}
});
readSubqueue(subqueue);
}
else {
readSubqueue();
subqueue = [d];
}
});
readSubqueue();
}
};
//# sourceMappingURL=bufferReader.js.map
}
exports.default = BufferReader;
;
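
A sketch of the BufferReader above with a stand-in envelopeReader, assuming the constructor copies maxSpan/maxLength/queueWait from options (only the field declarations are visible here) and that only readFn is needed on the envelope reader.

```js
// Sketch with a stand-in envelopeReader: nearby reads are queued and served
// from a single coalesced readFn call after queueWait milliseconds.
const BufferReader = require('@dsnp/parquetjs/dist/lib/bufferReader').default; // path assumed

const fakeEnvelopeReader = {
  readFn: async (offset, length) => Buffer.alloc(length, 0xab), // pretend file read
};
const reader = new BufferReader(fakeEnvelopeReader, { maxSpan: 8, maxLength: 1024, queueWait: 5 });

Promise.all([reader.read(0, 4), reader.read(4, 4)]).then(([a, b]) => {
  console.log(a.length, b.length); // 4 4, both slices of one underlying read
});
```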
"use strict";
module.exports.PLAIN = require('./plain');
module.exports.RLE = require('./rle');
module.exports.PLAIN_DICTIONARY = require('./plain_dictionary');
//# sourceMappingURL=index.js.map
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.PLAIN_DICTIONARY = exports.RLE = exports.PLAIN = void 0;
exports.PLAIN = __importStar(require("./plain"));
exports.RLE = __importStar(require("./rle"));
exports.PLAIN_DICTIONARY = __importStar(require("./plain_dictionary"));
"use strict";
const rle = require('./rle');
exports.decodeValues = function (type, cursor, count, opts) {
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.decodeValues = void 0;
const rle = __importStar(require("./rle"));
const decodeValues = function (type, cursor, count, opts) {
opts.bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0);

@@ -8,2 +29,2 @@ cursor.offset += 1;

};
//# sourceMappingURL=plain_dictionary.js.map
exports.decodeValues = decodeValues;

@@ -1,3 +0,8 @@

'use strict';
const INT53 = require('int53');
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.decodeValues = exports.encodeValues = void 0;
const int53_1 = __importDefault(require("int53"));
function encodeValues_BOOLEAN(values) {

@@ -8,3 +13,3 @@ let buf = Buffer.alloc(Math.ceil(values.length / 8));

if (values[i]) {
buf[Math.floor(i / 8)] |= (1 << (i % 8));
buf[Math.floor(i / 8)] |= 1 << i % 8;
}

@@ -18,3 +23,3 @@ }

let b = cursor.buffer[cursor.offset + Math.floor(i / 8)];
values.push((b & (1 << (i % 8))) > 0);
values.push((b & (1 << i % 8)) > 0);
}

@@ -42,3 +47,2 @@ cursor.offset += Math.ceil(count / 8);

for (let i = 0; i < values.length; i++) {
//console.log(typeof values[i]);
buf.writeBigInt64LE(BigInt(values[i]), i * 8);

@@ -60,7 +64,7 @@ }

if (values[i] >= 0) {
INT53.writeInt64LE(values[i], buf, i * 12);
int53_1.default.writeInt64LE(values[i], buf, i * 12);
buf.writeUInt32LE(0, i * 12 + 8); // truncate to 64 actual precision
}
else {
INT53.writeInt64LE((~-values[i]) + 1, buf, i * 12);
int53_1.default.writeInt64LE(~-values[i] + 1, buf, i * 12);
buf.writeUInt32LE(0xffffffff, i * 12 + 8); // truncate to 64 actual precision

@@ -74,6 +78,6 @@ }

for (let i = 0; i < count; ++i) {
const low = INT53.readInt64LE(cursor.buffer, cursor.offset);
const low = int53_1.default.readInt64LE(cursor.buffer, cursor.offset);
const high = cursor.buffer.readUInt32LE(cursor.offset + 8);
if (high === 0xffffffff) {
values.push((~-low) + 1); // truncate to 64 actual precision
values.push(~-low + 1); // truncate to 64 actual precision
}

@@ -117,14 +121,16 @@ else {

}
// Waylands reminder to check again
function encodeValues_BYTE_ARRAY(values) {
let buf_len = 0;
const returnedValues = [];
for (let i = 0; i < values.length; i++) {
values[i] = Buffer.from(values[i]);
buf_len += 4 + values[i].length;
returnedValues[i] = Buffer.from(values[i]);
buf_len += 4 + returnedValues[i].length;
}
let buf = Buffer.alloc(buf_len);
let buf_pos = 0;
for (let i = 0; i < values.length; i++) {
buf.writeUInt32LE(values[i].length, buf_pos);
values[i].copy(buf, buf_pos + 4);
buf_pos += 4 + values[i].length;
for (let i = 0; i < returnedValues.length; i++) {
buf.writeUInt32LE(returnedValues[i].length, buf_pos);
returnedValues[i].copy(buf, buf_pos + 4);
buf_pos += 4 + returnedValues[i].length;
}

@@ -147,10 +153,10 @@ return buf;

}
let buf_len = 0;
const returnedValues = [];
for (let i = 0; i < values.length; i++) {
values[i] = Buffer.from(values[i]);
if (values[i].length !== opts.typeLength) {
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + values[i];
returnedValues[i] = Buffer.from(values[i]);
if (returnedValues[i].length !== opts.typeLength) {
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + returnedValues[i];
}
}
return Buffer.concat(values);
return Buffer.concat(returnedValues);
}

@@ -168,46 +174,47 @@ function decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts) {

}
exports.encodeValues = function (type, values, opts) {
const encodeValues = function (type, values, opts) {
switch (type) {
case 'BOOLEAN':
case "BOOLEAN":
return encodeValues_BOOLEAN(values);
case 'INT32':
case "INT32":
return encodeValues_INT32(values);
case 'INT64':
case "INT64":
return encodeValues_INT64(values);
case 'INT96':
case "INT96":
return encodeValues_INT96(values);
case 'FLOAT':
case "FLOAT":
return encodeValues_FLOAT(values);
case 'DOUBLE':
case "DOUBLE":
return encodeValues_DOUBLE(values);
case 'BYTE_ARRAY':
case "BYTE_ARRAY":
return encodeValues_BYTE_ARRAY(values);
case 'FIXED_LEN_BYTE_ARRAY':
case "FIXED_LEN_BYTE_ARRAY":
return encodeValues_FIXED_LEN_BYTE_ARRAY(values, opts);
default:
throw 'unsupported type: ' + type;
throw "unsupported type: " + type;
}
};
exports.decodeValues = function (type, cursor, count, opts) {
exports.encodeValues = encodeValues;
const decodeValues = function (type, cursor, count, opts) {
switch (type) {
case 'BOOLEAN':
case "BOOLEAN":
return decodeValues_BOOLEAN(cursor, count);
case 'INT32':
case "INT32":
return decodeValues_INT32(cursor, count);
case 'INT64':
case "INT64":
return decodeValues_INT64(cursor, count);
case 'INT96':
case "INT96":
return decodeValues_INT96(cursor, count);
case 'FLOAT':
case "FLOAT":
return decodeValues_FLOAT(cursor, count);
case 'DOUBLE':
case "DOUBLE":
return decodeValues_DOUBLE(cursor, count);
case 'BYTE_ARRAY':
case "BYTE_ARRAY":
return decodeValues_BYTE_ARRAY(cursor, count);
case 'FIXED_LEN_BYTE_ARRAY':
case "FIXED_LEN_BYTE_ARRAY":
return decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts);
default:
throw 'unsupported type: ' + type;
throw "unsupported type: " + type;
}
};
//# sourceMappingURL=plain.js.map
exports.decodeValues = decodeValues;
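
A round-trip sketch for the PLAIN codec above using the BOOLEAN case; the require path is assumed.

```js
// Sketch: encode three booleans with the PLAIN codec above, then decode them back.
const plain = require('@dsnp/parquetjs/dist/lib/codec/plain'); // path assumed

const buf = plain.encodeValues('BOOLEAN', [true, false, true]);
const cursor = { buffer: buf, offset: 0 };
console.log(plain.decodeValues('BOOLEAN', cursor, 3, {})); // [ true, false, true ]
```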
"use strict";
const varint = require('varint');
// For questions about RLE encoding, see the spec:
//
// https://github.com/apache/parquet-format/blob/master/Encodings.md
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.decodeValues = exports.encodeValues = void 0;
const varint_1 = __importDefault(require("varint"));
function encodeRunBitpacked(values, opts) {

@@ -14,3 +22,3 @@ for (let i = 0; i < values.length % 8; i++) {

return Buffer.concat([
Buffer.from(varint.encode(((values.length / 8) << 1) | 1)),
Buffer.from(varint_1.default.encode(((values.length / 8) << 1) | 1)),
buf

@@ -21,12 +29,23 @@ ]);

let buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8));
let remainingValue = value;
// This is encoded LSB to MSB, so we pick off the least
// significant byte and shift to get the next one.
for (let i = 0; i < buf.length; ++i) {
buf.writeUInt8(value & 0xff, i);
value >> 8;
buf.writeUInt8(remainingValue & 0xff, i);
remainingValue = remainingValue >> 8;
}
return Buffer.concat([
Buffer.from(varint.encode(count << 1)),
Buffer.from(varint_1.default.encode(count << 1)),
buf
]);
}
exports.encodeValues = function (type, values, opts) {
function unknownToParsedInt(value) {
if (typeof value === 'string') {
return parseInt(value, 10);
}
else {
return value;
}
}
const encodeValues = function (type, values, opts) {
if (!('bitWidth' in opts)) {

@@ -39,3 +58,3 @@ throw 'bitWidth is required';

case 'INT64':
values = values.map((x) => parseInt(x, 10));
values = values.map((x) => unknownToParsedInt(x));
break;

@@ -85,2 +104,3 @@ default:

};
exports.encodeValues = encodeValues;
function decodeRunBitpacked(cursor, count, opts) {

@@ -100,6 +120,9 @@ if (count % 8 !== 0) {

function decodeRunRepeated(cursor, count, opts) {
var bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8);
let value = 0;
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); ++i) {
value << 8;
value += cursor.buffer[cursor.offset];
for (let i = 0; i < bytesNeededForFixedBitWidth; ++i) {
const byte = cursor.buffer[cursor.offset];
// Bytes are stored LSB to MSB, so we need to shift
// each new byte appropriately.
value += byte << (i * 8);
cursor.offset += 1;

@@ -109,3 +132,3 @@ }

}
exports.decodeValues = function (type, cursor, count, opts) {
const decodeValues = function (_, cursor, count, opts) {
if (!('bitWidth' in opts)) {

@@ -120,4 +143,4 @@ throw 'bitWidth is required';

while (values.length < count) {
const header = varint.decode(cursor.buffer, cursor.offset);
cursor.offset += varint.encodingLength(header);
const header = varint_1.default.decode(cursor.buffer, cursor.offset);
cursor.offset += varint_1.default.encodingLength(header);
if (header & 1) {

@@ -139,2 +162,2 @@ res = decodeRunBitpacked(cursor, (header >> 1) * 8, opts);

};
//# sourceMappingURL=rle.js.map
exports.decodeValues = decodeValues;
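
The decodeRunRepeated change above rebuilds a multi-byte repeated value from least significant byte to most significant. A self-contained illustration of that byte order (not code from the package):

```js
// Rebuild a repeated-run value stored as ceil(bitWidth / 8) little-endian bytes,
// mirroring the fixed loop in decodeRunRepeated above.
function readRepeatedValue(buffer, offset, bitWidth) {
  const bytes = Math.ceil(bitWidth / 8);
  let value = 0;
  for (let i = 0; i < bytes; ++i) {
    value += buffer[offset + i] << (i * 8); // later bytes are more significant
  }
  return value;
}

console.log(readRepeatedValue(Buffer.from([0x2a, 0x01]), 0, 16)); // 298, i.e. 0x012a
```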

@@ -1,7 +0,12 @@

'use strict';
const zlib = require('zlib');
const snappy = require('snappyjs');
const lzo = require('lzo');
const brotli = require('brotli');
const PARQUET_COMPRESSION_METHODS = {
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.inflate = exports.deflate = exports.PARQUET_COMPRESSION_METHODS = void 0;
const zlib_1 = __importDefault(require("zlib"));
const snappyjs_1 = __importDefault(require("snappyjs"));
const wasm_brotli_1 = require("wasm-brotli");
// LZO compression is disabled. See: https://github.com/LibertyDSNP/parquetjs/issues/18
exports.PARQUET_COMPRESSION_METHODS = {
'UNCOMPRESSED': {

@@ -19,6 +24,2 @@ deflate: deflate_identity,

},
'LZO': {
deflate: deflate_lzo,
inflate: inflate_lzo
},
'BROTLI': {

@@ -32,8 +33,9 @@ deflate: deflate_brotli,

*/
function deflate(method, value) {
if (!(method in PARQUET_COMPRESSION_METHODS)) {
async function deflate(method, value) {
if (!(method in exports.PARQUET_COMPRESSION_METHODS)) {
throw 'invalid compression method: ' + method;
}
return PARQUET_COMPRESSION_METHODS[method].deflate(value);
return exports.PARQUET_COMPRESSION_METHODS[method].deflate(value);
}
exports.deflate = deflate;
function deflate_identity(value) {

@@ -43,26 +45,26 @@ return value;

function deflate_gzip(value) {
return zlib.gzipSync(value);
return zlib_1.default.gzipSync(value);
}
function deflate_snappy(value) {
return snappy.compress(value);
return snappyjs_1.default.compress(value);
}
function deflate_lzo(value) {
return lzo.compress(value);
async function deflate_brotli(value) {
const compressedContent = await (0, wasm_brotli_1.compress)(value /*, {
mode: 0,
quality: 8,
lgwin: 22
}
*/);
return Buffer.from(compressedContent);
}
function deflate_brotli(value) {
return Buffer.from(brotli.compress(value, {
mode: 0,
quality: 8,
lgwin: 22
}));
}
/**
* Inflate a value using compression method `method`
*/
function inflate(method, value) {
if (!(method in PARQUET_COMPRESSION_METHODS)) {
async function inflate(method, value) {
if (!(method in exports.PARQUET_COMPRESSION_METHODS)) {
throw 'invalid compression method: ' + method;
}
return PARQUET_COMPRESSION_METHODS[method].inflate(value);
return await exports.PARQUET_COMPRESSION_METHODS[method].inflate(value);
}
exports.inflate = inflate;
function inflate_identity(value) {

@@ -72,14 +74,10 @@ return value;

function inflate_gzip(value) {
return zlib.gunzipSync(value);
return zlib_1.default.gunzipSync(value);
}
function inflate_snappy(value) {
return snappy.uncompress(value);
return snappyjs_1.default.uncompress(value);
}
function inflate_lzo(value) {
return lzo.decompress(value);
async function inflate_brotli(value) {
const uncompressedContent = await (0, wasm_brotli_1.decompress)(value);
return Buffer.from(uncompressedContent);
}
function inflate_brotli(value) {
return Buffer.from(brotli.decompress(value));
}
module.exports = { PARQUET_COMPRESSION_METHODS, deflate, inflate };
//# sourceMappingURL=compression.js.map
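
With the switch to wasm-brotli above, deflate and inflate are now async, so callers await both directions. A sketch, with the require path assumed:

```js
// Sketch: round-trip a buffer through the now-async compression helpers above.
const compression = require('@dsnp/parquetjs/dist/lib/compression'); // path assumed

async function demo() {
  const original = Buffer.from('hello parquet');
  const packed = await compression.deflate('GZIP', original);   // GZIP uses zlib.gzipSync internally
  const restored = await compression.inflate('GZIP', packed);
  console.log(restored.toString()); // 'hello parquet'
}
demo().catch(console.error);
```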

@@ -1,25 +0,38 @@

'use strict';
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
const fs = require('fs');
const thrift = require('thrift');
const Int64 = require('node-int64');
const parquet_thrift = require('../gen-nodejs/parquet_types');
const parquet_shredder = require('./shred');
const parquet_util = require('./util');
const parquet_schema = require('./schema');
const parquet_codec = require('./codec');
const parquet_compression = require('./compression');
const parquet_types = require('./types');
const BufferReader = require('./bufferReader');
const bloomFilterReader = require('./bloomFilterIO/bloomFilterReader');
const groupBy = require("lodash/groupBy");
const fetch = require('cross-fetch');
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetEnvelopeReader = exports.ParquetReader = void 0;
const node_int64_1 = __importDefault(require("node-int64"));
const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types"));
const parquet_shredder = __importStar(require("./shred"));
const parquet_util = __importStar(require("./util"));
const parquet_schema = __importStar(require("./schema"));
const parquet_codec = __importStar(require("./codec"));
const parquet_compression = __importStar(require("./compression"));
const parquet_types = __importStar(require("./types"));
const bufferReader_1 = __importDefault(require("./bufferReader"));
const bloomFilterReader = __importStar(require("./bloomFilterIO/bloomFilterReader"));
const cross_fetch_1 = __importDefault(require("cross-fetch"));
const types_1 = require("./types/types");
const { getBloomFiltersFor, } = bloomFilterReader;

@@ -43,2 +56,9 @@ /**

class ParquetCursor {
metadata;
envelopeReader;
schema;
columnList;
rowGroup;
rowGroupIndex;
cursorIndex;
/**

@@ -57,2 +77,3 @@ * Create a new parquet reader from the file metadata and an envelope reader.

this.rowGroupIndex = 0;
this.cursorIndex = 0;
}

@@ -63,14 +84,13 @@ /**

*/
next() {
return __awaiter(this, void 0, void 0, function* () {
if (this.rowGroup.length === 0) {
if (this.rowGroupIndex >= this.metadata.row_groups.length) {
return null;
}
let rowBuffer = yield this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList);
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer);
this.rowGroupIndex++;
async next() {
if (this.cursorIndex >= this.rowGroup.length) {
if (this.rowGroupIndex >= this.metadata.row_groups.length) {
return null;
}
return this.rowGroup.shift();
});
let rowBuffer = await this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList);
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer);
this.rowGroupIndex++;
this.cursorIndex = 0;
}
return this.rowGroup[this.cursorIndex++];
}

@@ -83,2 +103,3 @@ /**

this.rowGroupIndex = 0;
this.cursorIndex = 0;
}

@@ -95,2 +116,5 @@ }

class ParquetReader {
envelopeReader;
metadata;
schema;
/**

@@ -100,13 +124,9 @@ * Open the parquet file pointed to by the specified path and return a new

*/
static openFile(filePath, options) {
return __awaiter(this, void 0, void 0, function* () {
let envelopeReader = yield ParquetEnvelopeReader.openFile(filePath, options);
return this.openEnvelopeReader(envelopeReader, options);
});
static async openFile(filePath, options) {
let envelopeReader = await ParquetEnvelopeReader.openFile(filePath, options);
return this.openEnvelopeReader(envelopeReader, options);
}
static openBuffer(buffer, options) {
return __awaiter(this, void 0, void 0, function* () {
let envelopeReader = yield ParquetEnvelopeReader.openBuffer(buffer, options);
return this.openEnvelopeReader(envelopeReader, options);
});
static async openBuffer(buffer, options) {
let envelopeReader = await ParquetEnvelopeReader.openBuffer(buffer, options);
return this.openEnvelopeReader(envelopeReader, options);
}

@@ -118,7 +138,5 @@ /**

*/
static openS3(client, params, options) {
return __awaiter(this, void 0, void 0, function* () {
let envelopeReader = yield ParquetEnvelopeReader.openS3(client, params, options);
return this.openEnvelopeReader(envelopeReader, options);
});
static async openS3(client, params, options) {
let envelopeReader = await ParquetEnvelopeReader.openS3(client, params, options);
return this.openEnvelopeReader(envelopeReader, options);
}

@@ -131,23 +149,19 @@ /**

*/
static openUrl(params, options) {
return __awaiter(this, void 0, void 0, function* () {
let envelopeReader = yield ParquetEnvelopeReader.openUrl(params, options);
return this.openEnvelopeReader(envelopeReader, options);
});
static async openUrl(params, options) {
let envelopeReader = await ParquetEnvelopeReader.openUrl(params, options);
return this.openEnvelopeReader(envelopeReader, options);
}
static openEnvelopeReader(envelopeReader, opts) {
return __awaiter(this, void 0, void 0, function* () {
if (opts && opts.metadata) {
return new ParquetReader(opts.metadata, envelopeReader, opts);
}
try {
yield envelopeReader.readHeader();
let metadata = yield envelopeReader.readFooter();
return new ParquetReader(metadata, envelopeReader, opts);
}
catch (err) {
yield envelopeReader.close();
throw err;
}
});
static async openEnvelopeReader(envelopeReader, opts) {
if (opts && opts.metadata) {
return new ParquetReader(opts.metadata, envelopeReader, opts);
}
try {
await envelopeReader.readHeader();
let metadata = await envelopeReader.readFooter();
return new ParquetReader(metadata, envelopeReader, opts);
}
catch (err) {
await envelopeReader.close();
throw err;
}
}

@@ -174,3 +188,3 @@ /**

else if (o.parquetType === 'INT64') {
return new Int64(Buffer.from(o.value));
return new node_int64_1.default(Buffer.from(o.value));
}

@@ -184,7 +198,7 @@ }

if (column.offsetIndex) {
column.offsetIndex.page_locations.forEach(d => {
Promise.resolve(column.offsetIndex).then(offset => (offset.page_locations.forEach(d => {
if (Array.isArray(d)) {
Object.setPrototypeOf(d, parquet_thrift.PageLocation.prototype);
Object.setPrototypeOf(d, parquet_types_1.default.PageLocation.prototype);
}
});
})));
}

@@ -213,2 +227,19 @@ });

/**
* Support `for await` iterators on the reader object
* Uses `ParquetCursor` still under the hood.
*
* ```js
* for await (const record of reader) {
* console.log(record);
* }
* ```
*/
async *[Symbol.asyncIterator]() {
const cursor = this.getCursor();
let record = null;
while (record = await cursor.next()) {
yield record;
}
}
/**
* Return a cursor to the file. You may open more than one cursor and use

@@ -229,11 +260,15 @@ * them concurrently. All cursors become invalid once close() is called on

}
getBloomFiltersFor(columnNames) {
return __awaiter(this, void 0, void 0, function* () {
const bloomFilterData = yield getBloomFiltersFor(columnNames, this.envelopeReader);
return groupBy(bloomFilterData, 'columnName');
});
async getBloomFiltersFor(columnNames) {
const bloomFilterData = await getBloomFiltersFor(columnNames, this.envelopeReader);
return bloomFilterData.reduce((acc, value) => {
if (acc[value.columnName])
acc[value.columnName].push(value);
else
acc[value.columnName] = [value];
return acc;
}, {});
}
/**
* Return the number of rows in this file. Note that the number of rows is
* not neccessarily equal to the number of rows in each column.
* not necessarily equal to the number of rows in each column.
*/

@@ -259,6 +294,6 @@ getRowCount() {

}
exportMetadata(indent) {
function replacer(key, value) {
if (value instanceof parquet_thrift.PageLocation) {
return [value[0], value[1], value[2]];
async exportMetadata(indent) {
function replacer(_key, value) {
if (value instanceof parquet_types_1.default.PageLocation) {
return [value.offset, value.compressed_page_size, value.first_row_index];
}

@@ -278,4 +313,4 @@ if (typeof value === 'object') {

}
if (value instanceof Int64) {
if (isFinite(value)) {
if (value instanceof node_int64_1.default) {
if (isFinite(Number(value))) {
return Number(value);

@@ -295,2 +330,14 @@ }

const metadata = Object.assign({}, this.metadata, { json: true });
for (let i = 0; i < metadata.row_groups.length; i++) {
const rowGroup = metadata.row_groups[i];
for (let j = 0; j < rowGroup.columns.length; j++) {
const column = rowGroup.columns[j];
if (column.offsetIndex instanceof Promise) {
column.offsetIndex = await column.offsetIndex;
}
if (column.columnIndex instanceof Promise) {
column.columnIndex = await column.columnIndex;
}
}
}
return JSON.stringify(metadata, replacer, indent);

@@ -302,8 +349,6 @@ }

*/
close() {
return __awaiter(this, void 0, void 0, function* () {
yield this.envelopeReader.close();
this.envelopeReader = null;
this.metadata = null;
});
async close() {
await this.envelopeReader.close();
this.envelopeReader = null;
this.metadata = null;
}

@@ -314,2 +359,3 @@ decodePages(buffer, opts) {

}
exports.ParquetReader = ParquetReader;
/**

@@ -323,70 +369,69 @@ * The parquet envelope reader allows direct, unbuffered access to the individual

class ParquetEnvelopeReader {
static openFile(filePath, options) {
return __awaiter(this, void 0, void 0, function* () {
let fileStat = yield parquet_util.fstat(filePath);
let fileDescriptor = yield parquet_util.fopen(filePath);
let readFn = (offset, length, file) => {
if (file) {
return Promise.reject('external references are not supported');
}
return parquet_util.fread(fileDescriptor, offset, length);
};
let closeFn = parquet_util.fclose.bind(undefined, fileDescriptor);
return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size, options);
});
readFn;
close;
id;
fileSize;
default_dictionary_size;
metadata;
schema;
static async openFile(filePath, options) {
let fileStat = await parquet_util.fstat(filePath);
let fileDescriptor = await parquet_util.fopen(filePath);
let readFn = (offset, length, file) => {
if (file) {
return Promise.reject('external references are not supported');
}
return parquet_util.fread(fileDescriptor, offset, length);
};
let closeFn = parquet_util.fclose.bind(undefined, fileDescriptor);
return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size, options);
}
static openBuffer(buffer, options) {
return __awaiter(this, void 0, void 0, function* () {
let readFn = (offset, length, file) => {
if (file) {
return Promise.reject('external references are not supported');
}
return Promise.resolve(buffer.slice(offset, offset + length));
};
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, buffer.length, options);
});
static async openBuffer(buffer, options) {
let readFn = (offset, length, file) => {
if (file) {
return Promise.reject('external references are not supported');
}
return Promise.resolve(buffer.slice(offset, offset + length));
};
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, buffer.length, options);
}
static openS3(client, params, options) {
return __awaiter(this, void 0, void 0, function* () {
let fileStat = () => __awaiter(this, void 0, void 0, function* () { return client.headObject(params).promise().then(d => d.ContentLength); });
let readFn = (offset, length, file) => __awaiter(this, void 0, void 0, function* () {
if (file) {
return Promise.reject('external references are not supported');
}
let Range = `bytes=${offset}-${offset + length - 1}`;
let res = yield client.getObject(Object.assign({ Range }, params)).promise();
return Promise.resolve(res.Body);
});
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options);
});
static async openS3(client, params, options) {
let fileStat = async () => client.headObject(params).promise().then((d) => d.ContentLength);
let readFn = async (offset, length, file) => {
if (file) {
return Promise.reject('external references are not supported');
}
let Range = `bytes=${offset}-${offset + length - 1}`;
let res = await client.getObject(Object.assign({ Range }, params)).promise();
return Promise.resolve(res.Body);
};
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options);
}
static openUrl(params, options) {
return __awaiter(this, void 0, void 0, function* () {
if (typeof params === 'string')
params = { url: params };
if (!params.url)
throw new Error('URL missing');
let base = params.url.split('/');
base = base.slice(0, base.length - 1).join('/') + '/';
let defaultHeaders = params.headers || {};
let filesize = () => __awaiter(this, void 0, void 0, function* () {
const { headers } = yield fetch(params.url);
return headers.get('Content-Length');
});
let readFn = (offset, length, file) => __awaiter(this, void 0, void 0, function* () {
let url = file ? base + file : params.url;
let range = `bytes=${offset}-${offset + length - 1}`;
let headers = Object.assign({}, defaultHeaders, { range });
const response = yield fetch(url, { headers });
const arrayBuffer = yield response.arrayBuffer();
const buffer = Buffer.from(arrayBuffer);
return buffer;
});
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, filesize, options);
});
static async openUrl(params, options) {
if (typeof params === 'string')
params = { url: params };
if (!params.url)
throw new Error('URL missing');
const baseArr = params.url.split('/');
const base = baseArr.slice(0, baseArr.length - 1).join('/') + '/';
let defaultHeaders = params.headers || {};
let filesize = async () => {
const { headers } = await (0, cross_fetch_1.default)(params.url);
return headers.get('Content-Length');
};
let readFn = async (offset, length, file) => {
let url = file ? base + file : params.url;
let range = `bytes=${offset}-${offset + length - 1}`;
let headers = Object.assign({}, defaultHeaders, { range });
const response = await (0, cross_fetch_1.default)(url, { headers });
const arrayBuffer = await response.arrayBuffer();
const buffer = Buffer.from(arrayBuffer);
return buffer;
};
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, filesize, options);
}
constructor(readFn, closeFn, fileSize, options) {
constructor(readFn, closeFn, fileSize, options, metadata) {
options = options || {};

@@ -398,4 +443,5 @@ this.readFn = readFn;

this.default_dictionary_size = options.default_dictionary_size || 10000000;
this.metadata = metadata;
if (options.maxLength || options.maxSpan || options.queueWait) {
const bufferReader = new BufferReader(this, options);
const bufferReader = new bufferReader_1.default(this, options);
this.read = (offset, length) => bufferReader.read(offset, length);

@@ -408,3 +454,3 @@ }

readHeader() {
return this.read(0, PARQUET_MAGIC.length).then(buf => {
return this.read(0, PARQUET_MAGIC.length).then((buf) => {
if (buf.toString() != PARQUET_MAGIC) {

@@ -418,10 +464,14 @@ throw 'not valid parquet file';

let column;
if (!isNaN(row_group)) {
row_group = this.metadata.row_groups[row_group];
let parsedRowGroup;
if (!isNaN(Number(row_group))) {
parsedRowGroup = this.metadata?.row_groups[Number(row_group)];
}
else if (row_group instanceof parquet_types_1.default.RowGroup) {
parsedRowGroup = row_group;
}
if (typeof path === 'string') {
if (!row_group) {
if (!parsedRowGroup) {
throw `Missing RowGroup ${row_group}`;
}
column = row_group.columns.find(d => d.meta_data.path_in_schema.join(',') === path);
column = parsedRowGroup.columns.find(d => d.meta_data.path_in_schema.join(',') === path);
if (!column) {

@@ -453,12 +503,9 @@ throw `Column ${path} Not Found`;

}
const data = this.read(+column.offset_index_offset, column.offset_index_length).then(data => {
let offset_index = new parquet_thrift.OffsetIndex();
const data = this.read(+column.offset_index_offset, column.offset_index_length).then((data) => {
let offset_index = new parquet_types_1.default.OffsetIndex();
parquet_util.decodeThrift(offset_index, data);
Object.defineProperty(offset_index, 'column', { value: column, enumerable: false });
if (opts && opts.cache) {
column.offsetIndex = offset_index;
}
return offset_index;
});
if (opts && opts.cache) {
if (opts?.cache) {
column.offsetIndex = data;

@@ -476,5 +523,5 @@ }

}
const data = this.read(+column.column_index_offset, column.column_index_length).then(data => {
let column_index = new parquet_thrift.ColumnIndex();
parquet_util.decodeThrift(column_index, data);
const data = this.read(+column.column_index_offset, column.column_index_length).then((buf) => {
let column_index = new parquet_types_1.default.ColumnIndex();
parquet_util.decodeThrift(column_index, buf);
Object.defineProperty(column_index, 'column', { value: column });

@@ -489,8 +536,5 @@ // decode the statistics values

}
if (opts && opts.cache) {
column.columnIndex = column_index;
}
return column_index;
});
if (opts && opts.cache) {
if (opts?.cache) {
column.columnIndex = data;

@@ -500,52 +544,50 @@ }

}
readPage(column, page, records, opts) {
return __awaiter(this, void 0, void 0, function* () {
column = Object.assign({}, column);
column.meta_data = Object.assign({}, column.meta_data);
if (page.offset !== undefined) {
if (isNaN(page.offset) || isNaN(page.compressed_page_size)) {
throw Error('page offset and/or size missing');
}
column.meta_data.data_page_offset = page.offset;
column.meta_data.total_compressed_size = page.compressed_page_size;
async readPage(column, page, records, opts) {
column = Object.assign({}, column);
column.meta_data = Object.assign({}, column.meta_data);
if (page instanceof parquet_types_1.default.PageLocation && page.offset !== undefined) {
if (isNaN(Number(page.offset)) || isNaN(page.compressed_page_size)) {
throw Error('page offset and/or size missing');
}
else {
const offsetIndex = column.offsetIndex || (yield this.readOffsetIndex(column, null, opts));
column.meta_data.data_page_offset = offsetIndex.page_locations[page].offset;
column.meta_data.total_compressed_size = offsetIndex.page_locations[page].compressed_page_size;
}
const chunk = yield this.readColumnChunk(this.schema, column);
Object.defineProperty(chunk, 'column', { value: column });
let data = {
columnData: { [chunk.column.meta_data.path_in_schema.join(',')]: chunk }
};
return parquet_shredder.materializeRecords(this.schema, data, records);
});
column.meta_data.data_page_offset = parquet_util.cloneInteger(page.offset);
column.meta_data.total_compressed_size = new node_int64_1.default(page.compressed_page_size);
}
else {
const offsetIndex = await this.readOffsetIndex(column, null, opts);
column.meta_data.data_page_offset = parquet_util.cloneInteger(offsetIndex.page_locations[page].offset);
column.meta_data.total_compressed_size = new node_int64_1.default(offsetIndex.page_locations[page].compressed_page_size);
}
const chunk = await this.readColumnChunk(this.schema, column);
Object.defineProperty(chunk, 'column', { value: column });
let data = {
columnData: { [chunk.column.meta_data.path_in_schema.join(',')]: chunk }
};
return parquet_shredder.materializeRecords(this.schema, data, records);
}
readRowGroup(schema, rowGroup, columnList) {
return __awaiter(this, void 0, void 0, function* () {
var buffer = {
rowCount: +rowGroup.num_rows,
columnData: {}
};
for (let colChunk of rowGroup.columns) {
const colMetadata = colChunk.meta_data;
const colKey = colMetadata.path_in_schema;
if (columnList.length > 0 && parquet_util.fieldIndexOf(columnList, colKey) < 0) {
continue;
}
buffer.columnData[colKey] = yield this.readColumnChunk(schema, colChunk);
async readRowGroup(schema, rowGroup, columnList) {
var buffer = {
rowCount: +rowGroup.num_rows,
columnData: {},
pageRowCount: 0,
pages: {}
};
for (let colChunk of rowGroup.columns) {
const colMetadata = colChunk.meta_data;
const colKey = colMetadata.path_in_schema;
if (columnList.length > 0 && parquet_util.fieldIndexOf(columnList, colKey) < 0) {
continue;
}
return buffer;
});
buffer.columnData[colKey.join(',')] = await this.readColumnChunk(schema, colChunk);
}
return buffer;
}
readColumnChunk(schema, colChunk, opts) {
let dictionary = Promise.resolve();
let field = schema.findField(colChunk.meta_data.path_in_schema);
let type = parquet_util.getThriftEnum(parquet_thrift.Type, colChunk.meta_data.type);
let compression = parquet_util.getThriftEnum(parquet_thrift.CompressionCodec, colChunk.meta_data.codec);
let pagesOffset = +colChunk.meta_data.data_page_offset;
let pagesSize = +colChunk.meta_data.total_compressed_size;
async readColumnChunk(schema, colChunk, opts) {
let metadata = colChunk.meta_data;
let field = schema.findField(metadata.path_in_schema);
let type = parquet_util.getThriftEnum(parquet_types_1.default.Type, metadata.type);
let compression = parquet_util.getThriftEnum(parquet_types_1.default.CompressionCodec, metadata.codec);
let pagesOffset = +metadata.data_page_offset;
let pagesSize = +metadata.total_compressed_size;
if (!colChunk.file_path) {
pagesSize = Math.min(this.fileSize - pagesOffset, +colChunk.meta_data.total_compressed_size);
pagesSize = Math.min(this.fileSize - pagesOffset, +metadata.total_compressed_size);
}

@@ -558,36 +600,36 @@ opts = Object.assign({}, opts, {

column: field,
num_values: colChunk.meta_data.num_values
num_values: metadata.num_values
});
if (colChunk.meta_data.dictionary_page_offset) {
const offset = +colChunk.meta_data.dictionary_page_offset;
if (metadata.dictionary_page_offset) {
const offset = +metadata.dictionary_page_offset;
const size = Math.min(+this.fileSize - offset, this.default_dictionary_size);
dictionary = this.read(offset, size, colChunk.file_path).then(buffer => decodePage({ offset: 0, buffer, size: buffer.length }, opts).dictionary);
await this.read(offset, size, colChunk.file_path).then(async (buffer) => {
await decodePage({ offset: 0, buffer, size: buffer.length }, opts).then(dict => {
opts.dictionary = opts.dictionary || dict.dictionary;
});
});
}
return dictionary.then(dict => {
opts.dictionary = opts.dictionary || dict;
return this.read(pagesOffset, pagesSize, colChunk.file_path).then(pagesBuf => decodePages(pagesBuf, opts));
});
return this.read(pagesOffset, pagesSize, colChunk.file_path).then((pagesBuf) => decodePages(pagesBuf, opts));
}
readFooter() {
return __awaiter(this, void 0, void 0, function* () {
if (typeof this.fileSize === 'function') {
this.fileSize = yield this.fileSize();
}
let trailerLen = PARQUET_MAGIC.length + 4;
let trailerBuf = yield this.read(this.fileSize - trailerLen, trailerLen);
if (trailerBuf.slice(4).toString() != PARQUET_MAGIC) {
throw 'not a valid parquet file';
}
let metadataSize = trailerBuf.readUInt32LE(0);
let metadataOffset = this.fileSize - metadataSize - trailerLen;
if (metadataOffset < PARQUET_MAGIC.length) {
throw 'invalid metadata size';
}
let metadataBuf = yield this.read(metadataOffset, metadataSize);
let metadata = new parquet_thrift.FileMetaData();
parquet_util.decodeThrift(metadata, metadataBuf);
return metadata;
});
async readFooter() {
if (typeof this.fileSize === 'function') {
this.fileSize = await this.fileSize();
}
let trailerLen = PARQUET_MAGIC.length + 4;
let trailerBuf = await this.read(this.fileSize - trailerLen, trailerLen);
if (trailerBuf.slice(4).toString() != PARQUET_MAGIC) {
throw 'not a valid parquet file';
}
let metadataSize = trailerBuf.readUInt32LE(0);
let metadataOffset = this.fileSize - metadataSize - trailerLen;
if (metadataOffset < PARQUET_MAGIC.length) {
throw 'invalid metadata size';
}
let metadataBuf = await this.read(metadataOffset, metadataSize);
let metadata = new parquet_types_1.default.FileMetaData();
parquet_util.decodeThrift(metadata, metadataBuf);
return metadata;
}
}
exports.ParquetEnvelopeReader = ParquetEnvelopeReader;
/**

@@ -630,10 +672,10 @@ * Decode a consecutive array of data using one of the parquet encodings

}
function decodePage(cursor, opts) {
async function decodePage(cursor, opts) {
opts = opts || {};
let page;
const pageHeader = new parquet_thrift.PageHeader();
const pageHeader = new types_1.NewPageHeader();
const headerOffset = cursor.offset;
const headerSize = parquet_util.decodeThrift(pageHeader, cursor.buffer.slice(cursor.offset));
cursor.offset += headerSize;
const pageType = parquet_util.getThriftEnum(parquet_thrift.PageType, pageHeader.type);
const pageType = parquet_util.getThriftEnum(parquet_types_1.default.PageType, pageHeader.type);
switch (pageType) {

@@ -644,3 +686,3 @@ case 'DATA_PAGE':

}
page = decodeDataPage(cursor, pageHeader, opts);
page = await decodeDataPage(cursor, pageHeader, opts);
break;

@@ -651,7 +693,8 @@ case 'DATA_PAGE_V2':

}
page = decodeDataPageV2(cursor, pageHeader, opts);
page = await decodeDataPageV2(cursor, pageHeader, opts);
break;
case 'DICTIONARY_PAGE':
const dict = await decodeDictionaryPage(cursor, pageHeader, opts);
page = {
dictionary: decodeDictionaryPage(cursor, pageHeader, opts)
dictionary: dict
};

@@ -667,3 +710,3 @@ break;

}
function decodePages(buffer, opts) {
async function decodePages(buffer, opts) {
opts = opts || {};

@@ -683,3 +726,3 @@ let cursor = {

while (cursor.offset < cursor.size && (!opts.num_values || data.dlevels.length < opts.num_values)) {
const pageData = decodePage(cursor, opts);
const pageData = await decodePage(cursor, opts);
if (pageData.dictionary) {

@@ -689,6 +732,10 @@ opts.dictionary = pageData.dictionary;

}
if (opts.dictionary) {
// It's possible to have a column chunk where some pages should use
// the dictionary (PLAIN_DICTIONARY for example) and others should
// not (PLAIN for example).
if (opts.dictionary && pageData.useDictionary) {
pageData.values = pageData.values.map(d => opts.dictionary[d]);
}
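// Illustrative sketch (not part of the package source): what the mapping above does.
// Assuming the dictionary page decoded to ['red', 'green', 'blue'], a data page
// encoded with PLAIN_DICTIONARY or RLE_DICTIONARY carries indices that are resolved as:
//   const dictionary = ['red', 'green', 'blue'];
//   const pageValues = [2, 0, 0, 1];                     // raw page values (indices)
//   const resolved = pageValues.map(d => dictionary[d]); // ['blue', 'red', 'red', 'green']
// A page encoded as PLAIN sets useDictionary to false, so its values are kept as-is.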
for (let i = 0; i < pageData.rlevels.length; i++) {
let length = pageData.rlevels != undefined ? pageData.rlevels.length : 0;
for (let i = 0; i < length; i++) {
data.rlevels.push(pageData.rlevels[i]);

@@ -706,3 +753,3 @@ data.dlevels.push(pageData.dlevels[i]);

}
function decodeDictionaryPage(cursor, header, opts) {
async function decodeDictionaryPage(cursor, header, opts) {
const cursorEnd = cursor.offset + header.compressed_page_size;

@@ -716,3 +763,3 @@ let dictCursor = {

if (opts.compression && opts.compression !== 'UNCOMPRESSED') {
let valuesBuf = parquet_compression.inflate(opts.compression, dictCursor.buffer.slice(dictCursor.offset, cursorEnd));
let valuesBuf = await parquet_compression.inflate(opts.compression, dictCursor.buffer.slice(dictCursor.offset, cursorEnd));
dictCursor = {

@@ -724,12 +771,13 @@ buffer: valuesBuf,

}
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, header.dictionary_page_header.num_values, opts)
.map(d => d.toString());
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, (header.dictionary_page_header).num_values, opts)
.map((d) => d.toString());
}
function decodeDataPage(cursor, header, opts) {
async function decodeDataPage(cursor, header, opts) {
const cursorEnd = cursor.offset + header.compressed_page_size;
let valueCount = header.data_page_header.num_values;
let valueEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.encoding);
const dataPageHeader = header.data_page_header;
let valueCount = dataPageHeader.num_values;
let valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.encoding);
let valuesBufCursor = cursor;
if (opts.compression && opts.compression !== 'UNCOMPRESSED') {
let valuesBuf = parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd));
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd));
valuesBufCursor = {

@@ -742,3 +790,3 @@ buffer: valuesBuf,

/* read repetition levels */
let rLevelEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.repetition_level_encoding);
let rLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.repetition_level_encoding);
let rLevels = new Array(valueCount);

@@ -752,3 +800,3 @@ if (opts.rLevelMax > 0) {

/* read definition levels */
let dLevelEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.definition_level_encoding);
let dLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.definition_level_encoding);
let dLevels = new Array(valueCount);

@@ -778,10 +826,12 @@ if (opts.dLevelMax > 0) {

values: values,
count: valueCount
count: valueCount,
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY'
};
}
function decodeDataPageV2(cursor, header, opts) {
async function decodeDataPageV2(cursor, header, opts) {
const cursorEnd = cursor.offset + header.compressed_page_size;
const valueCount = header.data_page_header_v2.num_values;
const valueCountNonNull = valueCount - header.data_page_header_v2.num_nulls;
const valueEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header_v2.encoding);
const dataPageHeaderV2 = header.data_page_header_v2;
const valueCount = dataPageHeaderV2.num_values;
const valueCountNonNull = valueCount - dataPageHeaderV2.num_nulls;
const valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeaderV2.encoding);
/* read repetition levels */

@@ -811,4 +861,4 @@ let rLevels = new Array(valueCount);

let valuesBufCursor = cursor;
if (header.data_page_header_v2.is_compressed) {
let valuesBuf = parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd));
if (dataPageHeaderV2.is_compressed) {
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd));
valuesBufCursor = {

@@ -829,3 +879,4 @@ buffer: valuesBuf,

values: values,
count: valueCount
count: valueCount,
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY'
};

@@ -836,3 +887,3 @@ }

schemaElements.forEach(schemaElement => {
let repetitionType = parquet_util.getThriftEnum(parquet_thrift.FieldRepetitionType, schemaElement.repetition_type);
let repetitionType = parquet_util.getThriftEnum(parquet_types_1.default.FieldRepetitionType, schemaElement.repetition_type);
let optional = false;

@@ -851,3 +902,3 @@ let repeated = false;

;
if (schemaElement.num_children > 0) {
if (schemaElement.num_children != undefined && schemaElement.num_children > 0) {
schema[schemaElement.name] = {

@@ -872,5 +923,5 @@ optional: optional,

else {
let logicalType = parquet_util.getThriftEnum(parquet_thrift.Type, schemaElement.type);
let logicalType = parquet_util.getThriftEnum(parquet_types_1.default.Type, schemaElement.type);
if (schemaElement.converted_type != null) {
logicalType = parquet_util.getThriftEnum(parquet_thrift.ConvertedType, schemaElement.converted_type);
logicalType = parquet_util.getThriftEnum(parquet_types_1.default.ConvertedType, schemaElement.converted_type);
}

@@ -895,2 +946,1 @@ schema[schemaElement.name] = {

};
//# sourceMappingURL=reader.js.map

@@ -1,6 +0,26 @@

'use strict';
const parquet_codec = require('./codec');
const parquet_compression = require('./compression');
const parquet_types = require('./types');
const parquet_util = require('./util');
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetSchema = void 0;
const parquet_codec = __importStar(require("./codec"));
const parquet_compression = __importStar(require("./compression"));
const parquet_types = __importStar(require("./types"));
const PARQUET_COLUMN_KEY_SEPARATOR = '.';

@@ -11,2 +31,5 @@ /**

class ParquetSchema {
schema;
fields;
fieldList;
/**

@@ -24,3 +47,3 @@ * Create a new schema from a JSON schema definition

findField(path) {
if (path.constructor !== Array) {
if (typeof path === 'string') {
path = path.split(",");

@@ -33,3 +56,6 @@ }

for (; path.length > 1; path.shift()) {
n = n[path[0]].fields;
let fields = n[path[0]]?.fields;
if (isDefined(fields)) {
n = fields;
}
}

@@ -42,3 +68,3 @@ return n[path[0]];

findFieldBranch(path) {
if (path.constructor !== Array) {
if (typeof path === 'string') {
path = path.split(",");

@@ -50,4 +76,5 @@ }

branch.push(n[path[0]]);
if (path.length > 1) {
n = n[path[0]].fields;
let fields = n[path[0]].fields;
if (path.length > 1 && isDefined(fields)) {
n = fields;
}

@@ -58,2 +85,3 @@ }

}
exports.ParquetSchema = ParquetSchema;
;

@@ -94,3 +122,3 @@ function buildFields(schema, rLevelParentMax, dLevelParentMax, path) {

name: name,
path: path.concat([name]),
path: path.concat(name),
repetitionType: repetitionType,

@@ -102,3 +130,3 @@ rLevelMax: rLevelMax,

fieldCount: Object.keys(opts.fields).length,
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat([name]))
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name))
};

@@ -109,6 +137,5 @@ if (opts.type == 'LIST' || opts.type == 'MAP')

}
/* field type */
const typeDef = parquet_types.PARQUET_LOGICAL_TYPES[opts.type];
const typeDef = opts.type ? parquet_types.PARQUET_LOGICAL_TYPES[opts.type] : undefined;
if (!typeDef) {
throw 'invalid parquet type: ' + opts.type;
throw 'invalid parquet type: ' + (opts.type || "missing type");
}

@@ -120,3 +147,3 @@ /* field encoding */

if (!(opts.encoding in parquet_codec)) {
throw 'unsupported parquet encoding: ' + opts.encodig;
throw 'unsupported parquet encoding: ' + opts.encoding;
}

@@ -150,4 +177,5 @@ if (!opts.compression) {

list.push(fields[k]);
if (fields[k].isNested) {
list = list.concat(listFields(fields[k].fields));
const nestedFields = fields[k].fields;
if (fields[k].isNested && isDefined(nestedFields)) {
list = list.concat(listFields(nestedFields));
}

@@ -157,3 +185,4 @@ }

}
module.exports = { ParquetSchema };
//# sourceMappingURL=schema.js.map
function isDefined(val) {
return val !== undefined;
}

@@ -1,32 +0,29 @@

'use strict';
const parquet_types = require('./types');
const parquet_schema = require('./schema');
/**
* 'Shred' a record into a list of <value, repetition_level, definition_level>
* tuples per column using the Google Dremel Algorithm..
*
* The buffer argument must point to an object into which the shredded record
* will be returned. You may re-use the buffer for repeated calls to this function
* to append to an existing buffer, as long as the schema is unchanged.
*
* The format in which the shredded records will be stored in the buffer is as
* follows:
*
* buffer = {
* columnData: [
* 'my_col': {
* dlevels: [d1, d2, .. dN],
* rlevels: [r1, r2, .. rN],
* values: [v1, v2, .. vN],
* }, ...
* ],
* rowCount: X,
* }
*
*/
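/*
 * Illustrative sketch (not part of the package source): for a schema with a single
 * optional INT32 column `id`, shredding two records into the same buffer would,
 * per the format described above, leave it looking roughly like:
 *
 *   shredRecord(schema, { id: 10 }, buffer);
 *   shredRecord(schema, { id: 20 }, buffer);
 *   // buffer = {
 *   //   rowCount: 2,
 *   //   columnData: {
 *   //     id: { dlevels: [1, 1], rlevels: [0, 0], values: [10, 20], count: 2 }
 *   //   }
 *   // }
 */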
exports.shredRecord = function (schema, record, buffer) {
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.materializeRecords = exports.shredRecord = void 0;
const parquet_types = __importStar(require("./types"));
const shredRecord = function (schema, record, buffer) {
/* shred the record, this may raise an exception */
var recordShredded = {};
for (let field of schema.fieldList) {
recordShredded[field.path] = {
recordShredded[field.path.join(',')] = {
dlevels: [],

@@ -47,3 +44,4 @@ rlevels: [],

for (let field of schema.fieldList) {
buffer.columnData[field.path] = {
let path = field.path.join(',');
buffer.columnData[path] = {
dlevels: [],

@@ -55,3 +53,3 @@ rlevels: [],

};
buffer.pages[field.path] = [];
buffer.pages[path] = [];
}

@@ -62,4 +60,5 @@ }

for (let field of schema.fieldList) {
let record = recordShredded[field.path];
let column = buffer.columnData[field.path];
let path = field.path.join(',');
let record = recordShredded[path];
let column = buffer.columnData[path];
for (let i = 0; i < record.rlevels.length; i++) {

@@ -72,6 +71,7 @@ column.rlevels.push(record.rlevels[i]);

}
[...recordShredded[field.path].distinct_values].forEach(value => buffer.columnData[field.path].distinct_values.add(value));
buffer.columnData[field.path].count += recordShredded[field.path].count;
[...recordShredded[path].distinct_values].forEach(value => buffer.columnData[path].distinct_values.add(value));
buffer.columnData[path].count += recordShredded[path].count;
}
};
exports.shredRecord = shredRecord;
function shredRecordInternal(fields, record, data, rlvl, dlvl) {

@@ -81,8 +81,18 @@ for (let fieldName in fields) {

const fieldType = field.originalType || field.primitiveType;
const path = field.path.join(',');
// fetch values
let values = [];
if (record && (fieldName in record) && record[fieldName] !== undefined && record[fieldName] !== null) {
if (record[fieldName].constructor === Array) {
if (Array.isArray(record[fieldName])) {
values = record[fieldName];
}
else if (ArrayBuffer.isView(record[fieldName])) { // checks if any typed array
if (record[fieldName] instanceof Uint8Array) {
// wrap in a buffer, since not supported by parquet_thrift
values.push(Buffer.from(record[fieldName]));
}
else {
throw Object.prototype.toString.call(record[fieldName]) + ' is not supported';
}
}
else {

@@ -101,9 +111,9 @@ values.push(record[fieldName]);

if (values.length == 0) {
if (field.isNested) {
if (field.isNested && isDefined(field.fields)) {
shredRecordInternal(field.fields, null, data, rlvl, dlvl);
}
else {
data[field.path].rlevels.push(rlvl);
data[field.path].dlevels.push(dlvl);
data[field.path].count += 1;
data[path].rlevels.push(rlvl);
data[path].dlevels.push(dlvl);
data[path].count += 1;
}

@@ -115,11 +125,11 @@ continue;

const rlvl_i = i === 0 ? rlvl : field.rLevelMax;
if (field.isNested) {
if (field.isNested && isDefined(field.fields)) {
shredRecordInternal(field.fields, values[i], data, rlvl_i, field.dLevelMax);
}
else {
data[field.path].distinct_values.add(values[i]);
data[field.path].values.push(parquet_types.toPrimitive(fieldType, values[i]));
data[field.path].rlevels.push(rlvl_i);
data[field.path].dlevels.push(field.dLevelMax);
data[field.path].count += 1;
data[path].distinct_values.add(values[i]);
data[path].values.push(parquet_types.toPrimitive(fieldType, values[i]));
data[path].rlevels.push(rlvl_i);
data[path].dlevels.push(field.dLevelMax);
data[path].count += 1;
}

@@ -149,3 +159,3 @@ }

*/
exports.materializeRecords = function (schema, buffer, records) {
const materializeRecords = function (schema, buffer, records) {
if (!records) {

@@ -175,2 +185,3 @@ records = [];

};
exports.materializeRecords = materializeRecords;
function materializeRecordField(record, branch, rLevels, dLevel, value) {

@@ -186,10 +197,12 @@ const node = branch[0];

}
while (record[node.name].length < rLevels[0] + 1) {
record[node.name].push({});
const recordValue = record[node.name];
while (recordValue.length < rLevels[0] + 1) {
recordValue.push({});
}
materializeRecordField(record[node.name][rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value);
materializeRecordField(recordValue[rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value);
}
else {
record[node.name] = record[node.name] || {};
materializeRecordField(record[node.name], branch.slice(1), rLevels, dLevel, value);
const recordValue = record[node.name];
materializeRecordField(recordValue, branch.slice(1), rLevels, dLevel, value);
}

@@ -202,6 +215,7 @@ }

}
while (record[node.name].length < rLevels[0] + 1) {
record[node.name].push(null);
const recordValue = record[node.name];
while (recordValue.length < rLevels[0] + 1) {
recordValue.push(null);
}
record[node.name][rLevels[0]] = value;
recordValue[rLevels[0]] = value;
}

@@ -213,2 +227,4 @@ else {

}
//# sourceMappingURL=shred.js.map
function isDefined(val) {
return val !== undefined;
}
'use strict';
const BSON = require('bson');
const PARQUET_LOGICAL_TYPES = {
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fromPrimitive = exports.toPrimitive = exports.PARQUET_LOGICAL_TYPES = void 0;
const BSON = __importStar(require("bson"));
exports.PARQUET_LOGICAL_TYPES = {
'BOOLEAN': {

@@ -142,7 +163,8 @@ primitiveType: 'BOOLEAN',

function toPrimitive(type, value) {
if (!(type in PARQUET_LOGICAL_TYPES)) {
throw 'invalid type: ' + type;
if (type === undefined || !(type in exports.PARQUET_LOGICAL_TYPES)) {
throw 'invalid type: ' + type || "undefined";
}
return PARQUET_LOGICAL_TYPES[type].toPrimitive(value);
return exports.PARQUET_LOGICAL_TYPES[type].toPrimitive(value);
}
exports.toPrimitive = toPrimitive;
/**

@@ -153,7 +175,8 @@ * Convert a value from it's internal/underlying primitive representation to

function fromPrimitive(type, value) {
if (!(type in PARQUET_LOGICAL_TYPES)) {
throw 'invalid type: ' + type;
if (type === undefined || !(type in exports.PARQUET_LOGICAL_TYPES)) {
throw 'invalid type: ' + type || "undefined";
}
if ("fromPrimitive" in PARQUET_LOGICAL_TYPES[type]) {
return PARQUET_LOGICAL_TYPES[type].fromPrimitive(value);
const typeFromPrimitive = exports.PARQUET_LOGICAL_TYPES[type].fromPrimitive;
if (typeFromPrimitive !== undefined) {
return typeFromPrimitive(value);
}

@@ -164,2 +187,3 @@ else {

}
exports.fromPrimitive = fromPrimitive;
function toPrimitive_BOOLEAN(value) {

@@ -172,77 +196,128 @@ return !!value;

function toPrimitive_FLOAT(value) {
const v = parseFloat(value);
if (isNaN(v)) {
throw 'invalid value for FLOAT: ' + value;
if (typeof value === 'string') {
const v = parseFloat(value);
return v;
}
return v;
else if (typeof value === 'number') {
return value;
}
throw 'invalid value for FLOAT: ' + value;
}
function toPrimitive_DOUBLE(value) {
const v = parseFloat(value);
if (isNaN(v)) {
throw 'invalid value for DOUBLE: ' + value;
if (typeof value === 'string') {
const v = parseFloat(value);
return v;
}
return v;
else if (typeof value === 'number') {
return value;
}
throw 'invalid value for DOUBLE: ' + value;
}
function toPrimitive_INT8(value) {
const v = parseInt(value, 10);
if (v < -0x80 || v > 0x7f || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(-0x80, 0x7f, v);
return v;
}
catch {
throw 'invalid value for INT8: ' + value;
}
return v;
}
function toPrimitive_UINT8(value) {
const v = parseInt(value, 10);
if (v < 0 || v > 0xff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(0, 0xff, v);
return v;
}
catch {
throw 'invalid value for UINT8: ' + value;
}
return v;
}
function toPrimitive_INT16(value) {
const v = parseInt(value, 10);
if (v < -0x8000 || v > 0x7fff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(-0x8000, 0x7fff, v);
return v;
}
catch {
throw 'invalid value for INT16: ' + value;
}
return v;
}
function toPrimitive_UINT16(value) {
const v = parseInt(value, 10);
if (v < 0 || v > 0xffff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(0, 0xffff, v);
return v;
}
catch {
throw 'invalid value for UINT16: ' + value;
}
return v;
}
function toPrimitive_INT32(value) {
const v = parseInt(value, 10);
if (v < -0x80000000 || v > 0x7fffffff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(-0x80000000, 0x7fffffff, v);
return v;
}
catch {
throw 'invalid value for INT32: ' + value;
}
return v;
}
function toPrimitive_UINT32(value) {
const v = parseInt(value, 10);
if (v < 0 || v > 0xffffffffffff || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(0, 0xffffffffffff, v);
return v;
}
catch {
throw 'invalid value for UINT32: ' + value;
}
return v;
}
function toPrimitive_INT64(value) {
const v = parseInt(value, 10);
if (isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(-0x8000000000000000, 0x7fffffffffffffff, v);
return v;
}
catch {
throw 'invalid value for INT64: ' + value;
}
return v;
}
function toPrimitive_UINT64(value) {
const v = parseInt(value, 10);
if (v < 0 || isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(0, 0xffffffffffffffff, v);
return v;
}
catch {
throw 'invalid value for UINT64: ' + value;
}
return v;
}
function toPrimitive_INT96(value) {
const v = parseInt(value, 10);
if (isNaN(v)) {
try {
let v = value;
if (typeof v === 'string')
v = BigInt(value);
checkValidValue(-0x800000000000000000000000, 0x7fffffffffffffffffffffff, v);
return v;
}
catch {
throw 'invalid value for INT96: ' + value;
}
return v;
}

@@ -265,12 +340,13 @@ function toPrimitive_BYTE_ARRAY(value) {

function toPrimitive_BSON(value) {
var encoder = new BSON();
return Buffer.from(encoder.serialize(value));
return Buffer.from(BSON.serialize(value));
}
function fromPrimitive_BSON(value) {
var decoder = new BSON();
return decoder.deserialize(value);
return BSON.deserialize(value);
}
function toPrimitive_TIME_MILLIS(value) {
const v = parseInt(value, 10);
if (v < 0 || v > 0xffffffffffffffff || isNaN(v)) {
let v = value;
if (typeof value === `string`) {
v = parseInt(value, 10);
}
if (v < 0 || v > 0xffffffffffffffff || typeof v !== 'number') {
throw 'invalid value for TIME_MILLIS: ' + value;

@@ -282,3 +358,3 @@ }

const v = BigInt(value);
if (v < 0n || isNaN(v)) {
if (v < 0n) {
throw 'invalid value for TIME_MICROS: ' + value;

@@ -295,9 +371,10 @@ }

/* convert from integer */
{
const v = parseInt(value, 10);
if (v < 0 || isNaN(v)) {
throw 'invalid value for DATE: ' + value;
}
return v;
let v = value;
if (typeof value === 'string') {
v = parseInt(value, 10);
}
if (v < 0 || typeof v !== 'number') {
throw 'invalid value for DATE: ' + value;
}
return v;
}

@@ -313,9 +390,10 @@ function fromPrimitive_DATE(value) {

/* convert from integer */
{
const v = parseInt(value, 10);
if (v < 0 || isNaN(v)) {
throw 'invalid value for TIMESTAMP_MILLIS: ' + value;
}
return v;
let v = value;
if (typeof value === 'string') {
v = parseInt(value, 10);
}
if (v < 0 || typeof v !== 'number') {
throw 'invalid value for TIMESTAMP_MILLIS: ' + value;
}
return v;
}

@@ -340,3 +418,3 @@ function fromPrimitive_TIMESTAMP_MILLIS(value) {

function fromPrimitive_TIMESTAMP_MICROS(value) {
return new Date(parseInt(value / 1000n));
return typeof value === 'bigint' ? new Date(Number(value / 1000n)) : new Date(value / 1000);
}

@@ -360,3 +438,6 @@ function toPrimitive_INTERVAL(value) {

}
module.exports = { PARQUET_LOGICAL_TYPES, toPrimitive, fromPrimitive };
//# sourceMappingURL=types.js.map
function checkValidValue(lowerRange, upperRange, v) {
if (v < lowerRange || v > upperRange) {
throw "invalid value";
}
}
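// Illustrative sketch (not part of the package source): with the BigInt handling
// above, the integer helpers accept either a number or a decimal string and
// range-check the result, e.g. for INT8:
//   toPrimitive_INT8(100);     // 100
//   toPrimitive_INT8('100');   // 100n (strings are converted via BigInt)
//   toPrimitive_INT8(1000);    // throws 'invalid value for INT8: 1000'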

@@ -132,2 +132,10 @@ "use strict";

fromPrimitive: fromPrimitive_INTERVAL
},
MAP: {
originalType: 'MAP',
toPrimitive: toPrimitive_MAP,
},
LIST: {
originalType: 'LIST',
toPrimitive: toPrimitive_LIST,
}

@@ -246,2 +254,8 @@ };

}
function toPrimitive_MAP(value) {
return value;
}
function toPrimitive_LIST(value) {
return value;
}
function toPrimitive_BYTE_ARRAY(value) {

@@ -351,2 +365,1 @@ return Buffer.from(value);

}
//# sourceMappingURL=parquet_primitives.js.map
"use strict";
// Lifted from https://github.com/kbajalc/parquets
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.NewPageHeader = void 0;
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types"));
;
//# sourceMappingURL=types.js.map
class NewPageHeader extends parquet_types_1.default.PageHeader {
offset;
headerSize;
constructor() {
super();
}
}
exports.NewPageHeader = NewPageHeader;

@@ -1,16 +0,42 @@

'use strict';
const fs = require('fs');
const thrift = require('thrift');
const parquet_thrift = require('../gen-nodejs/parquet_types');
/** We need to use a patched version of TFramedTransport where
* readString returns the original buffer instead of a string if the
* buffer can not be safely encoded as utf8 (see http://bit.ly/2GXeZEF)
*/
class fixedTFramedTransport extends thrift.TFramedTransport {
readString(len) {
this.ensureAvailable(len);
var buffer = this.inBuf.slice(this.readPos, this.readPos + len);
var str = this.inBuf.toString('utf8', this.readPos, this.readPos + len);
this.readPos += len;
return (Buffer.from(str).equals(buffer)) ? str : buffer;
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.cloneInteger = exports.fieldIndexOf = exports.osopen = exports.osend = exports.oswrite = exports.fclose = exports.fread = exports.fstat = exports.fopen = exports.getThriftEnum = exports.getBitWidth = exports.decodeThrift = exports.serializeThrift = void 0;
const thrift_1 = __importDefault(require("thrift"));
const fs_1 = __importDefault(require("fs"));
const parquet_thrift = __importStar(require("../gen-nodejs/parquet_types"));
const thrift_2 = require("thrift");
/**
* We need to patch Thrift's TFramedTransport class bc the TS type definitions
* do not define a `readPos` field, even though the class implementation has
* one.
*/
class fixedTFramedTransport extends thrift_1.default.TFramedTransport {
inBuf;
readPos;
constructor(inBuf) {
super(inBuf);
this.inBuf = inBuf;
this.readPos = 0;
}

@@ -23,40 +49,20 @@ }

*/
const previousPageLocation = parquet_thrift.PageLocation.prototype;
const PageLocation = parquet_thrift.PageLocation.prototype = [];
PageLocation.write = previousPageLocation.write;
PageLocation.read = previousPageLocation.read;
const getterSetter = index => ({
const getterSetter = (index) => ({
get: function () { return this[index]; },
set: function (value) { return this[index] = value; }
});
Object.defineProperty(PageLocation, 'offset', getterSetter(0));
Object.defineProperty(PageLocation, 'compressed_page_size', getterSetter(1));
Object.defineProperty(PageLocation, 'first_row_index', getterSetter(2));
exports.force32 = function () {
const protocol = thrift.TCompactProtocol.prototype;
protocol.zigzagToI64 = protocol.zigzagToI32;
protocol.readVarint64 = protocol.readVarint32 = function () {
let lo = 0;
let shift = 0;
let b;
while (true) {
b = this.trans.readByte();
lo = lo | ((b & 0x7f) << shift);
shift += 7;
if (!(b & 0x80)) {
break;
}
}
return lo;
};
};
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'offset', getterSetter(0));
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'compressed_page_size', getterSetter(1));
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'first_row_index', getterSetter(2));
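// Illustrative sketch (not part of the package source): with the prototype swapped
// for an array and the accessors above, a PageLocation behaves like a 3-element
// tuple whose named fields alias the array slots:
//   const loc = new parquet_thrift.PageLocation();
//   loc.offset = 4;                    // stored at loc[0]
//   loc.compressed_page_size = 180;    // stored at loc[1]
//   loc.first_row_index = 0;           // stored at loc[2]
//   // loc[0] === 4, loc[1] === 180, loc[2] === 0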
/**
* Helper function that serializes a thrift object into a buffer
*/
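/*
 * Illustrative sketch (not part of the package source): round-tripping a small
 * thrift object through serializeThrift and decodeThrift below.
 *
 *   const kv = new parquet_thrift.KeyValue();
 *   kv.key = 'writer';
 *   kv.value = 'example';
 *   const buf = serializeThrift(kv);          // compact-protocol bytes in a Buffer
 *   const copy = new parquet_thrift.KeyValue();
 *   decodeThrift(copy, buf);                  // returns the number of bytes consumed
 *   // copy.key === 'writer', copy.value === 'example'
 */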
exports.serializeThrift = function (obj) {
const serializeThrift = function (obj) {
let output = [];
let transport = new thrift.TBufferedTransport(null, function (buf) {
const callBack = function (buf) {
output.push(buf);
});
let protocol = new thrift.TCompactProtocol(transport);
};
let transport = new thrift_1.default.TBufferedTransport(undefined, callBack);
let protocol = new thrift_1.default.TCompactProtocol(transport);
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872
obj.write(protocol);

@@ -66,3 +72,4 @@ transport.flush();

};
exports.decodeThrift = function (obj, buf, offset) {
exports.serializeThrift = serializeThrift;
const decodeThrift = function (obj, buf, offset) {
if (!offset) {

@@ -73,10 +80,12 @@ offset = 0;

transport.readPos = offset;
var protocol = new thrift.TCompactProtocol(transport);
var protocol = new thrift_1.default.TCompactProtocol(transport);
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872
obj.read(protocol);
return transport.readPos - offset;
};
exports.decodeThrift = decodeThrift;
/**
* Get the number of bits required to store a given value
*/
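/*
 * Illustrative sketch (not part of the package source), assuming the usual
 * "bits needed to represent val" semantics used when packing levels:
 *
 *   getBitWidth(0);   // 0
 *   getBitWidth(1);   // 1
 *   getBitWidth(7);   // 3
 *   getBitWidth(8);   // 4
 */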
exports.getBitWidth = function (val) {
const getBitWidth = function (val) {
if (val === 0) {

@@ -89,6 +98,7 @@ return 0;

};
exports.getBitWidth = getBitWidth;
/**
* FIXME not ideal that this is linear
*/
exports.getThriftEnum = function (klass, value) {
const getThriftEnum = function (klass, value) {
for (let k in klass) {

@@ -101,5 +111,6 @@ if (klass[k] === value) {

};
exports.fopen = function (filePath) {
exports.getThriftEnum = getThriftEnum;
const fopen = function (filePath) {
return new Promise((resolve, reject) => {
fs.open(filePath, 'r', (err, fd) => {
fs_1.default.open(filePath, 'r', (err, fd) => {
if (err) {

@@ -114,5 +125,6 @@ reject(err);

};
exports.fstat = function (filePath) {
exports.fopen = fopen;
const fstat = function (filePath) {
return new Promise((resolve, reject) => {
fs.stat(filePath, (err, stat) => {
fs_1.default.stat(filePath, (err, stat) => {
if (err) {

@@ -127,6 +139,7 @@ reject(err);

};
exports.fread = function (fd, position, length) {
exports.fstat = fstat;
const fread = function (fd, position, length) {
let buffer = Buffer.alloc(length);
return new Promise((resolve, reject) => {
fs.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => {
fs_1.default.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => {
if (err || bytesRead != length) {

@@ -141,5 +154,6 @@ reject(err || Error('read failed'));

};
exports.fclose = function (fd) {
exports.fread = fread;
const fclose = function (fd) {
return new Promise((resolve, reject) => {
fs.close(fd, (err) => {
fs_1.default.close(fd, (err) => {
if (err) {

@@ -154,3 +168,4 @@ reject(err);

};
exports.oswrite = function (os, buf) {
exports.fclose = fclose;
const oswrite = function (os, buf) {
return new Promise((resolve, reject) => {

@@ -162,3 +177,3 @@ os.write(buf, (err) => {

else {
resolve();
resolve(err);
}

@@ -168,3 +183,4 @@ });

};
exports.osend = function (os) {
exports.oswrite = oswrite;
const osend = function (os) {
return new Promise((resolve, reject) => {

@@ -176,3 +192,3 @@ os.end((err) => {

else {
resolve();
resolve(err);
}

@@ -182,5 +198,6 @@ });

};
exports.osopen = function (path, opts) {
exports.osend = osend;
const osopen = function (path, opts) {
return new Promise((resolve, reject) => {
let outputStream = fs.createWriteStream(path, opts);
let outputStream = fs_1.default.createWriteStream(path, opts);
outputStream.on('open', function (fd) {

@@ -194,3 +211,4 @@ resolve(outputStream);

};
exports.fieldIndexOf = function (arr, elem) {
exports.osopen = osopen;
const fieldIndexOf = function (arr, elem) {
for (let j = 0; j < arr.length; ++j) {

@@ -213,2 +231,6 @@ if (arr[j].length !== elem.length) {

};
//# sourceMappingURL=util.js.map
exports.fieldIndexOf = fieldIndexOf;
const cloneInteger = (int) => {
return new thrift_2.Int64(int.valueOf());
};
exports.cloneInteger = cloneInteger;

@@ -1,20 +0,35 @@

'use strict';
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
const stream = require('stream');
const parquet_thrift = require('../gen-nodejs/parquet_types');
const parquet_shredder = require('./shred');
const parquet_util = require('./util');
const parquet_codec = require('./codec');
const parquet_compression = require('./compression');
const parquet_types = require('./types');
const bloomFilterWriter = require("./bloomFilterIO/bloomFilterWriter");
const Long = require('long');
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetTransformer = exports.ParquetEnvelopeWriter = exports.ParquetWriter = void 0;
const stream_1 = __importDefault(require("stream"));
const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types"));
const parquet_shredder = __importStar(require("./shred"));
const parquet_util = __importStar(require("./util"));
const parquet_codec = __importStar(require("./codec"));
const parquet_compression = __importStar(require("./compression"));
const parquet_types = __importStar(require("./types"));
const bloomFilterWriter = __importStar(require("./bloomFilterIO/bloomFilterWriter"));
const node_int64_1 = __importDefault(require("node-int64"));
/**

@@ -44,2 +59,8 @@ * Parquet File Magic String

class ParquetWriter {
schema;
envelopeWriter;
rowBuffer;
rowGroupSize;
closed;
userMetadata;
/**

@@ -49,7 +70,5 @@ * Convenience method to create a new buffered parquet writer that writes to

*/
static openFile(schema, path, opts) {
return __awaiter(this, void 0, void 0, function* () {
let outputStream = yield parquet_util.osopen(path, opts);
return ParquetWriter.openStream(schema, outputStream, opts);
});
static async openFile(schema, path, opts) {
let outputStream = await parquet_util.osopen(path, opts);
return ParquetWriter.openStream(schema, outputStream, opts);
}

@@ -60,10 +79,8 @@ /**

*/
static openStream(schema, outputStream, opts) {
return __awaiter(this, void 0, void 0, function* () {
if (!opts) {
opts = {};
}
let envelopeWriter = yield ParquetEnvelopeWriter.openStream(schema, outputStream, opts);
return new ParquetWriter(schema, envelopeWriter, opts);
});
static async openStream(schema, outputStream, opts) {
if (!opts) {
opts = {};
}
let envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts);
return new ParquetWriter(schema, envelopeWriter, opts);
}

@@ -92,21 +109,19 @@ /**

*/
appendRow(row) {
return __awaiter(this, void 0, void 0, function* () {
if (this.closed) {
throw 'writer was closed';
}
parquet_shredder.shredRecord(this.schema, row, this.rowBuffer);
const options = {
useDataPageV2: this.envelopeWriter.useDataPageV2,
bloomFilters: this.envelopeWriter.bloomFilters
};
if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) {
encodePages(this.schema, this.rowBuffer, options);
}
if (this.rowBuffer.rowCount >= this.rowGroupSize) {
encodePages(this.schema, this.rowBuffer, options);
yield this.envelopeWriter.writeRowGroup(this.rowBuffer);
this.rowBuffer = {};
}
});
async appendRow(row) {
if (this.closed || this.envelopeWriter === null) {
throw 'writer was closed';
}
parquet_shredder.shredRecord(this.schema, row, this.rowBuffer);
const options = {
useDataPageV2: this.envelopeWriter.useDataPageV2,
bloomFilters: this.envelopeWriter.bloomFilters
};
if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) {
await encodePages(this.schema, this.rowBuffer, options);
}
if (this.rowBuffer.rowCount >= this.rowGroupSize) {
await encodePages(this.schema, this.rowBuffer, options);
await this.envelopeWriter.writeRowGroup(this.rowBuffer);
this.rowBuffer = {};
}
}

@@ -119,22 +134,22 @@ /**

*/
close(callback) {
return __awaiter(this, void 0, void 0, function* () {
if (this.closed) {
throw 'writer was closed';
}
this.closed = true;
async close(callback) {
if (this.closed) {
throw 'writer was closed';
}
this.closed = true;
if (this.envelopeWriter) {
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) {
encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters });
yield this.envelopeWriter.writeRowGroup(this.rowBuffer);
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters });
await this.envelopeWriter.writeRowGroup(this.rowBuffer);
this.rowBuffer = {};
}
yield this.envelopeWriter.writeBloomFilters();
yield this.envelopeWriter.writeIndex();
yield this.envelopeWriter.writeFooter(this.userMetadata);
yield this.envelopeWriter.close();
await this.envelopeWriter.writeBloomFilters();
await this.envelopeWriter.writeIndex();
await this.envelopeWriter.writeFooter(this.userMetadata);
await this.envelopeWriter.close();
this.envelopeWriter = null;
if (callback) {
callback();
}
});
}
if (callback) {
callback();
}
}

@@ -161,21 +176,30 @@ /**

setPageSize(cnt) {
this.writer.setPageSize(cnt);
this.envelopeWriter.setPageSize(cnt);
}
}
exports.ParquetWriter = ParquetWriter;
/**
* Create a parquet file from a schema and a number of row groups. This class
* performs direct, unbuffered writes to the underlying output stream and is
* intendend for advanced and internal users; the writeXXX methods must be
* intended for advanced and internal users; the writeXXX methods must be
* called in the correct order to produce a valid file.
*/
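/*
 * Illustrative sketch (not part of the package source): the writeXXX ordering that
 * ParquetWriter.close() follows above when flushing the last buffered row group:
 *
 *   await envelopeWriter.writeRowGroup(rowBuffer);   // repeat per row group
 *   await envelopeWriter.writeBloomFilters();
 *   await envelopeWriter.writeIndex();
 *   await envelopeWriter.writeFooter(userMetadata);
 *   await envelopeWriter.close();
 */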
class ParquetEnvelopeWriter {
schema;
write;
close;
offset;
rowCount;
rowGroups;
pageSize;
useDataPageV2;
pageIndex;
bloomFilters; // TODO: OR filterCollection
/**
* Create a new parquet envelope writer that writes to the specified stream
*/
static openStream(schema, outputStream, opts) {
return __awaiter(this, void 0, void 0, function* () {
let writeFn = parquet_util.oswrite.bind(undefined, outputStream);
let closeFn = parquet_util.osend.bind(undefined, outputStream);
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);
});
static async openStream(schema, outputStream, opts) {
let writeFn = parquet_util.oswrite.bind(undefined, outputStream);
let closeFn = parquet_util.osend.bind(undefined, outputStream);
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, new node_int64_1.default(0), opts);
}

@@ -187,3 +211,3 @@ constructor(schema, writeFn, closeFn, fileOffset, opts) {

this.offset = fileOffset;
this.rowCount = 0;
this.rowCount = new node_int64_1.default(0);
this.rowGroups = [];

@@ -199,3 +223,3 @@ this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE;

writeSection(buf) {
this.offset += buf.length;
this.offset.setValue(this.offset.valueOf() + buf.length);
return this.write(buf);

@@ -213,4 +237,4 @@ }

*/
writeRowGroup(records) {
let rgroup = encodeRowGroup(this.schema, records, {
async writeRowGroup(records) {
let rgroup = await encodeRowGroup(this.schema, records, {
baseOffset: this.offset,

@@ -221,12 +245,11 @@ pageSize: this.pageSize,

});
this.rowCount += records.rowCount;
this.rowCount.setValue(this.rowCount.valueOf() + records.rowCount);
this.rowGroups.push(rgroup.metadata);
return this.writeSection(rgroup.body);
}
writeBloomFilters(_rowGroups) {
let rowGroups = _rowGroups || this.rowGroups;
rowGroups.forEach(group => {
writeBloomFilters() {
this.rowGroups.forEach(group => {
group.columns.forEach(column => {
const columnName = column.meta_data.path_in_schema[0];
if (columnName in this.bloomFilters === false)
const columnName = column.meta_data?.path_in_schema[0];
if (!columnName || columnName in this.bloomFilters === false)
return;

@@ -242,20 +265,19 @@ const serializedBloomFilterData = bloomFilterWriter.getSerializedBloomFilterData(this.bloomFilters[columnName]);

*/
writeIndex(_rowGroups) {
let rowGroups = _rowGroups || this.rowGroups;
writeIndex() {
this.schema.fieldList.forEach((c, i) => {
rowGroups.forEach(group => {
this.rowGroups.forEach(group => {
let column = group.columns[i];
if (!column)
return;
if (column.meta_data.columnIndex) {
if (column.meta_data?.columnIndex) {
let columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex);
delete column.meta_data.columnIndex;
column.column_index_offset = this.offset;
column.column_index_offset = parquet_util.cloneInteger(this.offset);
column.column_index_length = columnBody.length;
this.writeSection(columnBody);
}
if (column.meta_data.offsetIndex) {
if (column.meta_data?.offsetIndex) {
let offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex);
delete column.meta_data.offsetIndex;
column.offset_index_offset = this.offset;
column.offset_index_offset = parquet_util.cloneInteger(this.offset);
column.offset_index_length = offsetBody.length;

@@ -270,3 +292,3 @@ this.writeSection(offsetBody);

*/
writeFooter(userMetadata, schema, rowCount, rowGroups) {
writeFooter(userMetadata) {
if (!userMetadata) {

@@ -289,6 +311,8 @@ userMetadata = {};

}
exports.ParquetEnvelopeWriter = ParquetEnvelopeWriter;
/**
* Create a parquet transform stream
*/
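/*
 * Illustrative sketch (not part of the package source): the transformer consumes
 * row objects in object mode and emits parquet bytes, so a typical pipeline is:
 *
 *   const transformer = new ParquetTransformer(schema, {});
 *   rowObjectStream                                  // any object-mode Readable of rows
 *     .pipe(transformer)
 *     .pipe(require('fs').createWriteStream('out.parquet'));
 */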
class ParquetTransformer extends stream.Transform {
class ParquetTransformer extends stream_1.default.Transform {
writer;
constructor(schema, opts = {}) {

@@ -301,9 +325,9 @@ super({ objectMode: true });

})(this);
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, 0, opts), opts);
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, new node_int64_1.default(0), opts), opts);
}
_transform(row, encoding, callback) {
_transform(row, _encoding, callback) {
if (row) {
this.writer.appendRow(row).then(data => callback(null, data), err => {
const fullErr = new Error(`Error transforming to parquet: ${err.toString()} row:${row}`);
fullErr.origErr = err;
fullErr.message = err;
callback(fullErr);

@@ -317,6 +341,7 @@ });

_flush(callback) {
this.writer.close(callback)
this.writer.close()
.then(d => callback(null, d), callback);
}
}
exports.ParquetTransformer = ParquetTransformer;
/**

@@ -349,5 +374,5 @@ * Encode a consecutive array of data using one of the parquet encodings

statistics.min = statistics.min_value;
return new parquet_thrift.Statistics(statistics);
return new parquet_types_1.default.Statistics(statistics);
}
function encodePages(schema, rowBuffer, opts) {
async function encodePages(schema, rowBuffer, opts) {
if (!rowBuffer.pageRowCount) {

@@ -361,3 +386,3 @@ return;

let page;
const values = rowBuffer.columnData[field.path];
const values = rowBuffer.columnData[field.path.join(',')];
if (opts.bloomFilters && (field.name in opts.bloomFilters)) {

@@ -367,3 +392,3 @@ const splitBlockBloomFilter = opts.bloomFilters[field.name];

}
let statistics;
let statistics = {};
if (field.statistics !== false) {

@@ -379,12 +404,12 @@ statistics = {};

});
statistics.null_count = values.dlevels.length - values.values.length;
statistics.distinct_count = values.distinct_values.size;
statistics.null_count = new node_int64_1.default(values.dlevels.length - values.values.length);
statistics.distinct_count = new node_int64_1.default(values.distinct_values.size);
}
if (opts.useDataPageV2) {
page = encodeDataPageV2(field, values.count, values.values, values.rlevels, values.dlevels, statistics);
page = await encodeDataPageV2(field, values.count, values.values, values.rlevels, values.dlevels, statistics);
}
else {
page = encodeDataPage(field, values.values, values.rlevels, values.dlevels, statistics);
page = await encodeDataPage(field, values.values || [], values.rlevels || [], values.dlevels || [], statistics);
}
let pages = rowBuffer.pages[field.path];
let pages = rowBuffer.pages[field.path.join(',')];
let lastPage = pages[pages.length - 1];

@@ -410,3 +435,3 @@ let first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0;

*/
function encodeDataPage(column, values, rlevels, dlevels, statistics) {
async function encodeDataPage(column, values, rlevels, dlevels, statistics) {
/* encode values */

@@ -428,8 +453,8 @@ let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, {

let pageBody = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]);
pageBody = parquet_compression.deflate(column.compression, pageBody);
let pageHeader = new parquet_thrift.PageHeader();
pageHeader.type = parquet_thrift.PageType['DATA_PAGE'];
pageBody = await parquet_compression.deflate(column.compression, pageBody);
let pageHeader = new parquet_types_1.default.PageHeader();
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE'];
pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length;
pageHeader.compressed_page_size = pageBody.length;
pageHeader.data_page_header = new parquet_thrift.DataPageHeader();
pageHeader.data_page_header = new parquet_types_1.default.DataPageHeader();
pageHeader.data_page_header.num_values = dlevels.length;

@@ -439,7 +464,7 @@ if (column.statistics !== false) {

}
pageHeader.data_page_header.encoding = parquet_thrift.Encoding[column.encoding];
pageHeader.data_page_header.encoding = parquet_types_1.default.Encoding[column.encoding];
pageHeader.data_page_header.definition_level_encoding =
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING];
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING];
pageHeader.data_page_header.repetition_level_encoding =
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING];
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING];
/* concat page header, repetition and definition levels and values */

@@ -451,3 +476,3 @@ return Buffer.concat([parquet_util.serializeThrift(pageHeader), pageBody]);

*/
function encodeDataPageV2(column, rowCount, values, rlevels, dlevels, statistics) {
async function encodeDataPageV2(column, rowCount, values, rlevels, dlevels, statistics) {
/* encode values */

@@ -458,3 +483,3 @@ let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, {

});
let valuesBufCompressed = parquet_compression.deflate(column.compression, valuesBuf);
let valuesBufCompressed = await parquet_compression.deflate(column.compression, valuesBuf);
/* encode repetition and definition levels */

@@ -476,5 +501,5 @@ let rLevelsBuf = Buffer.alloc(0);

/* build page header */
let pageHeader = new parquet_thrift.PageHeader();
pageHeader.type = parquet_thrift.PageType['DATA_PAGE_V2'];
pageHeader.data_page_header_v2 = new parquet_thrift.DataPageHeaderV2();
let pageHeader = new parquet_types_1.default.PageHeader();
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE_V2'];
pageHeader.data_page_header_v2 = new parquet_types_1.default.DataPageHeaderV2();
pageHeader.data_page_header_v2.num_values = dlevels.length;

@@ -490,3 +515,3 @@ pageHeader.data_page_header_v2.num_nulls = dlevels.length - values.length;

rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length;
pageHeader.data_page_header_v2.encoding = parquet_thrift.Encoding[column.encoding];
pageHeader.data_page_header_v2.encoding = parquet_types_1.default.Encoding[column.encoding];
pageHeader.data_page_header_v2.definition_levels_byte_length = dLevelsBuf.length;

@@ -507,3 +532,3 @@ pageHeader.data_page_header_v2.repetition_levels_byte_length = rLevelsBuf.length;

*/
function encodeColumnChunk(pages, opts) {
async function encodeColumnChunk(pages, opts) {
let pagesBuf = Buffer.concat(pages.map(d => d.page));

@@ -513,16 +538,16 @@ let num_values = pages.reduce((p, d) => p + d.num_values, 0);

/* prepare metadata header */
let metadata = new parquet_thrift.ColumnMetaData();
let metadata = new parquet_types_1.default.ColumnMetaData();
metadata.path_in_schema = opts.column.path;
metadata.num_values = num_values;
metadata.data_page_offset = opts.baseOffset;
metadata.num_values = new node_int64_1.default(num_values);
metadata.data_page_offset = new node_int64_1.default(opts.baseOffset);
metadata.encodings = [];
metadata.total_uncompressed_size = pagesBuf.length;
metadata.total_compressed_size = pagesBuf.length;
metadata.type = parquet_thrift.Type[opts.column.primitiveType];
metadata.codec = parquet_thrift.CompressionCodec[opts.column.compression];
metadata.total_uncompressed_size = new node_int64_1.default(pagesBuf.length);
metadata.total_compressed_size = new node_int64_1.default(pagesBuf.length);
metadata.type = parquet_types_1.default.Type[opts.column.primitiveType];
metadata.codec = await parquet_types_1.default.CompressionCodec[opts.column.compression];
/* compile statistics ColumnIndex and OffsetIndex*/
let columnIndex = new parquet_thrift.ColumnIndex();
let columnIndex = new parquet_types_1.default.ColumnIndex();
columnIndex.max_values = [];
columnIndex.min_values = [];
let offsetIndex = new parquet_thrift.OffsetIndex();
let offsetIndex = new parquet_types_1.default.OffsetIndex();
offsetIndex.page_locations = [];

@@ -532,7 +557,7 @@ /* prepare statistics */

let distinct_values = new Set();
statistics.null_count = 0;
statistics.distinct_count = 0;
statistics.null_count = new node_int64_1.default(0);
statistics.distinct_count = new node_int64_1.default(0);
/* loop through pages and update indices and statistics */
for (let i = 0; i < pages.length; i++) {
let page = pages[i];
const page = pages[i];
if (opts.column.statistics !== false) {

@@ -545,12 +570,12 @@ if (page.statistics.max_value > statistics.max_value || i == 0) {

}
statistics.null_count += page.statistics.null_count;
page.distinct_values.forEach(value => distinct_values.add(value));
statistics.null_count.setValue(statistics.null_count.valueOf() + (page.statistics.null_count?.valueOf() || 0));
page.distinct_values.forEach((value) => distinct_values.add(value));
columnIndex.max_values.push(encodeStatisticsValue(page.statistics.max_value, opts.column));
columnIndex.min_values.push(encodeStatisticsValue(page.statistics.min_value, opts.column));
}
let pageLocation = new parquet_thrift.PageLocation();
pageLocation.offset = offset;
let pageLocation = new parquet_types_1.default.PageLocation();
pageLocation.offset = new node_int64_1.default(offset);
offset += page.page.length;
pageLocation.compressed_page_size = page.page.length;
pageLocation.first_row_index = page.first_row_index;
pageLocation.first_row_index = new node_int64_1.default(page.first_row_index);
offsetIndex.page_locations.push(pageLocation);

@@ -562,3 +587,3 @@ }

if (opts.column.statistics !== false) {
statistics.distinct_count = distinct_values.size;
statistics.distinct_count = new node_int64_1.default(distinct_values.size);
metadata.statistics = encodeStatistics(statistics, opts.column);

@@ -570,8 +595,4 @@ if (opts.pageIndex !== false) {

/* list encodings */
let encodingsSet = {};
encodingsSet[PARQUET_RDLVL_ENCODING] = true;
encodingsSet[opts.column.encoding] = true;
for (let k in encodingsSet) {
metadata.encodings.push(parquet_thrift.Encoding[k]);
}
metadata.encodings.push(parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]);
metadata.encodings.push(parquet_types_1.default.Encoding[opts.column.encoding]);
/* concat metadata header and data pages */

@@ -585,7 +606,7 @@ let metadataOffset = opts.baseOffset + pagesBuf.length;

*/
function encodeRowGroup(schema, data, opts) {
let metadata = new parquet_thrift.RowGroup();
metadata.num_rows = data.rowCount;
async function encodeRowGroup(schema, data, opts) {
let metadata = new parquet_types_1.default.RowGroup();
metadata.num_rows = new node_int64_1.default(data.rowCount);
metadata.columns = [];
metadata.total_byte_size = 0;
metadata.total_byte_size = new node_int64_1.default(0);
let body = Buffer.alloc(0);

@@ -596,16 +617,15 @@ for (let field of schema.fieldList) {

}
let cchunkData = encodeColumnChunk(data.pages[field.path], {
let cchunkData = await encodeColumnChunk(data.pages[field.path.join(',')], {
column: field,
baseOffset: opts.baseOffset + body.length,
pageSize: opts.pageSize,
encoding: field.encoding,
rowCount: data.rowCount,
useDataPageV2: opts.useDataPageV2,
pageIndex: opts.pageIndex
baseOffset: opts.baseOffset.valueOf() + body.length,
pageSize: opts.pageSize || 0,
rowCount: data.rowCount || 0,
useDataPageV2: opts.useDataPageV2 ?? true,
pageIndex: opts.pageIndex ?? true
});
let cchunk = new parquet_thrift.ColumnChunk();
cchunk.file_offset = cchunkData.metadataOffset;
let cchunk = new parquet_types_1.default.ColumnChunk();
cchunk.file_offset = new node_int64_1.default(cchunkData.metadataOffset);
cchunk.meta_data = cchunkData.metadata;
metadata.columns.push(cchunk);
metadata.total_byte_size += cchunkData.body.length;
metadata.total_byte_size = new node_int64_1.default(metadata.total_byte_size.valueOf() + (cchunkData.body.length));
body = Buffer.concat([body, cchunkData.body]);

@@ -619,5 +639,5 @@ }

function encodeFooter(schema, rowCount, rowGroups, userMetadata) {
let metadata = new parquet_thrift.FileMetaData();
let metadata = new parquet_types_1.default.FileMetaData();
metadata.version = PARQUET_VERSION;
metadata.created_by = 'parquet.js';
metadata.created_by = '@dsnp/parquetjs';
metadata.num_rows = rowCount;

@@ -628,3 +648,3 @@ metadata.row_groups = rowGroups;

for (let k in userMetadata) {
let kv = new parquet_thrift.KeyValue();
let kv = new parquet_types_1.default.KeyValue();
kv.key = k;

@@ -635,3 +655,3 @@ kv.value = userMetadata[k];

{
let schemaRoot = new parquet_thrift.SchemaElement();
let schemaRoot = new parquet_types_1.default.SchemaElement();
schemaRoot.name = 'root';

@@ -642,5 +662,5 @@ schemaRoot.num_children = Object.keys(schema.fields).length;

for (let field of schema.fieldList) {
let schemaElem = new parquet_thrift.SchemaElement();
let schemaElem = new parquet_types_1.default.SchemaElement();
schemaElem.name = field.name;
schemaElem.repetition_type = parquet_thrift.FieldRepetitionType[field.repetitionType];
schemaElem.repetition_type = parquet_types_1.default.FieldRepetitionType[field.repetitionType];
if (field.isNested) {

@@ -650,6 +670,6 @@ schemaElem.num_children = field.fieldCount;

else {
schemaElem.type = parquet_thrift.Type[field.primitiveType];
schemaElem.type = parquet_types_1.default.Type[field.primitiveType];
}
if (field.originalType) {
schemaElem.converted_type = parquet_thrift.ConvertedType[field.originalType];
schemaElem.converted_type = parquet_types_1.default.ConvertedType[field.originalType];
}

@@ -671,2 +691,1 @@ schemaElem.type_length = field.typeLength;

};
//# sourceMappingURL=writer.js.map
"use strict";
const reader = require('./lib/reader');
const writer = require('./lib/writer');
const schema = require('./lib/schema');
const shredder = require('./lib/shred');
const util = require('./lib/util');
module.exports = {
ParquetEnvelopeReader: reader.ParquetEnvelopeReader,
ParquetReader: reader.ParquetReader,
ParquetEnvelopeWriter: writer.ParquetEnvelopeWriter,
ParquetWriter: writer.ParquetWriter,
ParquetTransformer: writer.ParquetTransformer,
ParquetSchema: schema.ParquetSchema,
ParquetShredder: shredder,
force32: util.force32
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
//# sourceMappingURL=parquet.js.map
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetShredder = exports.ParquetSchema = exports.ParquetTransformer = exports.ParquetWriter = exports.ParquetEnvelopeWriter = exports.ParquetReader = exports.ParquetEnvelopeReader = void 0;
const reader = __importStar(require("./lib/reader"));
const writer = __importStar(require("./lib/writer"));
const schema = __importStar(require("./lib/schema"));
const shredder = __importStar(require("./lib/shred"));
exports.ParquetEnvelopeReader = reader.ParquetEnvelopeReader;
exports.ParquetReader = reader.ParquetReader;
exports.ParquetEnvelopeWriter = writer.ParquetEnvelopeWriter;
exports.ParquetWriter = writer.ParquetWriter;
exports.ParquetTransformer = writer.ParquetTransformer;
exports.ParquetSchema = schema.ParquetSchema;
exports.ParquetShredder = shredder;

@@ -5,3 +5,4 @@ {

"main": "dist/parquet.js",
"version": "0.0.0-ba6065",
"types": "dist/parquet.d.ts",
"version": "0.0.0-cadf48",
"homepage": "https://github.com/LibertyDSNP/parquetjs",

@@ -18,36 +19,52 @@ "license": "MIT",

"dependencies": {
"@types/brotli": "^1.3.0",
"@types/bson": "^4.0.3",
"@types/long": "^4.0.1",
"@types/node": "^14.14.35",
"@types/thrift": "^0.10.10",
"brotli": "^1.3.0",
"bson": "^2.0.8",
"@types/varint": "^6.0.0",
"browserify-zlib": "^0.2.0",
"bson": "4.4.0",
"cross-fetch": "^3.1.4",
"int53": "^0.2.4",
"lodash": "^4.17.21",
"long": "^4.0.0",
"lzo": "^0.4.0",
"object-stream": "0.0.1",
"snappyjs": "^0.6.0",
"thrift": "0.14.1",
"typescript": "^4.3.2",
"varint": "^5.0.0",
"wasm-brotli": "^2.0.2",
"xxhash-wasm": "^0.4.1"
},
"devDependencies": {
"@babel/core": "7.13.10",
"@babel/preset-env": "7.13.12",
"@babel/preset-typescript": "7.13.0",
"@babel/core": "^7.14.6",
"@babel/preset-env": "^7.14.7",
"@babel/preset-typescript": "^7.14.5",
"@types/bson": "^4.0.3",
"@types/chai": "^4.2.16",
"@types/long": "^4.0.1",
"@types/mocha": "^8.2.2",
"@types/node": "^14.14.35",
"@types/sinon": "^10.0.0",
"@types/thrift": "^0.10.10",
"assert": "^2.0.0",
"babel-loader": "^8.2.2",
"babel-plugin-add-module-exports": "^1.0.4",
"browserfs": "^1.4.3",
"buffer": "^6.0.3",
"chai": "4.3.4",
"mocha": "8.3.2",
"msw": "^0.29.0",
"core-js": "^3.15.1",
"esbuild": "^0.14.38",
"mocha": "9.2.2",
"msw": "^0.39.2",
"object-stream": "0.0.1",
"process": "^0.11.10",
"regenerator-runtime": "^0.13.7",
"sinon": "^10.0.0",
"ts-node": "^9.1.1"
"sinon-chai": "^3.7.0",
"sinon-chai-in-order": "^0.1.0",
"source-map-loader": "^3.0.0",
"stream-browserify": "^3.0.0",
"ts-loader": "^9.2.3",
"ts-node": "^9.1.1",
"typescript": "^4.5.2"
},
"scripts": {
"build": "tsc -b",
"build": "npm run build:node && npm run build:types",
"build:types": "tsc -p tsconfig.types.json && cp gen-nodejs/parquet_types.d.ts dist/gen-nodejs/parquet_types.d.ts",
"build:node": "tsc -b",
"build:browser": "node esbuild.js",
"type": "tsc --noEmit",

@@ -57,5 +74,16 @@ "lint": "echo 'Linting, it is on the TODO list...'",

"clean": "rm -Rf ./dist",
"prepublishOnly": "npm run clean && npm run build",
"thrift": "thrift -out gen-nodejs --gen js:ts parquet.thrift && thrift -out gen-nodejs --gen js:node parquet.thrift"
"prepublishOnly": "npm run clean && npm run build:node && npm run build:types && npm run build:browser",
"thrift": "thrift -out gen-nodejs --gen js:ts parquet.thrift && thrift -out gen-nodejs --gen js:node parquet.thrift",
"serve": "node esbuild-serve.js"
},
"browser": {
"assert": "assert",
"events": "events",
"fs": "browserfs",
"path": "path-browserify",
"stream": "readable-stream",
"thrift": "./node_modules/thrift/lib/nodejs/lib/thrift/browser.js",
"util": "util",
"zlib": "browserify-zlib"
},
"engines": {

@@ -62,0 +90,0 @@ "node": ">=14.16.0"

@@ -17,4 +17,4 @@ # parquet.js

Forked Notice
-------------
## Forked Notice
This is a forked repository with code from various sources:

@@ -24,7 +24,5 @@ - Primary source [ironSource](https://github.com/ironSource/parquetjs) [npm: parquetjs](https://www.npmjs.com/package/parquetjs)

Installation
------------
## Installation
_parquet.js requires node.js >= 14.16.0_
To use parquet.js with node.js, install it using npm:
```

@@ -34,8 +32,21 @@ $ npm install @dsnp/parquetjs

_parquet.js requires node.js >= 14.16.0_
### NodeJS
To use with nodejs:
```javascript
import parquetjs from "@dsnp/parquetjs"
```
### Browser
To use in a browser, configure your bundler, with whichever plugin or resolver suits your setup, to point to:
```javascript
"node_modules/@dsnp/parquetjs/browser/parquetjs"
```
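For instance, with webpack the resolver approach above could be expressed as an alias. This is a hedged sketch only: webpack is just one possible bundler, and the alias key and path simply follow the location given above.
```javascript
// webpack.config.js — route imports of "@dsnp/parquetjs" to the prebuilt browser bundle
const path = require('path');

module.exports = {
  resolve: {
    alias: {
      '@dsnp/parquetjs': path.resolve(__dirname, 'node_modules/@dsnp/parquetjs/browser/parquetjs'),
    },
  },
};
```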
or, alternatively, import the browser build directly:
```javascript
import parquetjs from "@dsnp/parquetjs/browser/parquetjs"
```
Usage: Writing files
--------------------
## Usage: Writing files
Once you have installed the parquet.js library, you can import it as a single

@@ -116,4 +127,3 @@ module:
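As a minimal sketch of the writing flow this section goes on to describe, based on the upstream parquetjs API (the file name and schema fields below are placeholders):
```javascript
const parquet = require('@dsnp/parquetjs');

// Placeholder schema: declare the columns each row will contain
const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },
  quantity: { type: 'INT64' },
  price: { type: 'DOUBLE' },
  date: { type: 'TIMESTAMP_MILLIS' },
});

async function writeExample() {
  // Open a writer, append rows, then close to flush the final row group and footer
  const writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet');
  await writer.appendRow({ name: 'apples', quantity: 10, price: 2.5, date: new Date() });
  await writer.appendRow({ name: 'oranges', quantity: 20, price: 2.0, date: new Date() });
  await writer.close();
}
```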

Usage: Reading files
--------------------
## Usage: Reading files

@@ -231,4 +241,3 @@ A parquet reader allows retrieving the rows from a parquet file in order.
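And a matching sketch of the reading flow, again assuming the upstream parquetjs API (the file name is a placeholder):
```javascript
const parquet = require('@dsnp/parquetjs');

async function readExample() {
  // Open the file, walk the rows with a cursor, then release the file handle
  const reader = await parquet.ParquetReader.openFile('fruits.parquet');
  const cursor = reader.getCursor();
  let record = null;
  while ((record = await cursor.next())) {
    console.log(record);
  }
  await reader.close();
}
```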

Encodings
---------
## Encodings

@@ -265,4 +274,3 @@ Internally, the Parquet format will store values from each field as consecutive
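A hedged sketch of how a per-field encoding is selected in the schema (PLAIN is the default; for RLE the upstream API expects a `typeLength` bit-width hint; field names are placeholders):
```javascript
const parquet = require('@dsnp/parquetjs');

const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8', encoding: 'PLAIN' },                // explicit, though PLAIN is the default
  age: { type: 'UINT_32', encoding: 'RLE', typeLength: 7 },  // RLE with a 7-bit width hint
});
```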

Optional Fields
---------------
### Optional Fields

@@ -284,4 +292,3 @@ By default, all fields are required to be present in each row. You can also mark
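For illustration, a minimal sketch of marking a field optional (field names are placeholders):
```javascript
const parquet = require('@dsnp/parquetjs');

const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },                      // required by default
  nickname: { type: 'UTF8', optional: true },  // may be omitted from individual rows
});
```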

Nested Rows & Arrays
--------------------
### Nested Rows & Arrays

@@ -356,4 +363,3 @@ Parquet supports nested schemas that allow you to store rows that have a more
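A short sketch of a nested schema with a repeated group, following the upstream parquetjs conventions (names are placeholders):
```javascript
const parquet = require('@dsnp/parquetjs');

const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },
  // A repeated group: each row may carry any number of stock records
  stock: {
    repeated: true,
    fields: {
      quantity: { type: 'INT64' },
      warehouse: { type: 'UTF8' },
    },
  },
  price: { type: 'DOUBLE' },
});
```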

Nested Lists for Hive / Athena
-----------------------
### Nested Lists for Hive / Athena

@@ -363,4 +369,3 @@ Lists have to be annotated to be queryable with AWS Athena. See [parquet-format](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists) for more detail and a full working example with comments in the test directory ([`test/list.js`](test/list.js))

List of Supported Types & Encodings
-----------------------------------
### List of Supported Types & Encodings

@@ -398,4 +403,3 @@ We aim to be feature-complete and add new features as they are added to the

Buffering & Row Group Size
--------------------------
## Buffering & Row Group Size

@@ -416,4 +420,3 @@ When writing a Parquet file, the `ParquetWriter` will buffer rows in memory
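As a sketch of tuning that buffer, the upstream writer API exposes `setRowGroupSize`, which sets how many rows are buffered before a row group is flushed (the value and file name below are arbitrary placeholders):
```javascript
const parquet = require('@dsnp/parquetjs');

async function writeWithLargerRowGroups(schema) {
  const writer = await parquet.ParquetWriter.openFile(schema, 'out.parquet');
  // Flush a row group after every 8192 buffered rows (arbitrary example value)
  writer.setRowGroupSize(8192);
  // ... appendRow calls ...
  await writer.close();
}
```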

Dependencies
-------------
## Dependencies

@@ -423,6 +426,6 @@ Parquet uses [thrift](https://thrift.apache.org/) to encode the schema and other

Notes
-----
## Notes
Currently parquet-cpp doesn't fully support DATA_PAGE_V2. You can work around this
by setting the useDataPageV2 option to false.
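For example, a hedged sketch of passing that option when opening a writer (the option name matches the note above; the file name and helper are placeholders):
```javascript
const parquet = require('@dsnp/parquetjs');

async function writeForParquetCpp(schema) {
  // Emit DATA_PAGE (v1) pages so parquet-cpp based readers can consume the file
  const writer = await parquet.ParquetWriter.openFile(schema, 'compat.parquet', {
    useDataPageV2: false,
  });
  // ... appendRow calls ...
  await writer.close();
}
```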

Sorry, the diff of this file is too big to display
