@dsnp/parquetjs
Comparing version 0.0.0-ba6065 to 0.0.0-cadf48
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
@@ -46,13 +37,29 @@ return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
class SplitBlockBloomFilter { | ||
constructor() { | ||
/** | ||
* Instance | ||
*/ | ||
this.splitBlockFilter = []; | ||
this.desiredFalsePositiveRate = SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE; | ||
this.numBlocks = 0; | ||
this.numDistinctValues = SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES; | ||
this.hashStrategy = new parquet_types_1.default.BloomFilterHash(new parquet_types_1.default.XxHash()); | ||
this.hasher = new xxhasher_1.default(); | ||
} | ||
static salt = [ | ||
0x47b6137b, | ||
0x44974d91, | ||
0x8824ad5b, | ||
0xa2b7289d, | ||
0x705495c7, | ||
0x2df1424b, | ||
0x9efc4947, | ||
0x5c6bfb31 | ||
]; | ||
// How many bits are in a single block: | ||
// - Blocks are UInt32 arrays | ||
// - There are 8 UInt32 words in each block. | ||
static WORDS_PER_BLOCK = 8; | ||
static WORD_SIZE = 32; | ||
static BITS_PER_BLOCK = SplitBlockBloomFilter.WORDS_PER_BLOCK * SplitBlockBloomFilter.WORD_SIZE; | ||
// Default number of blocks in a Split Block Bloom filter (SBBF) | ||
static NUMBER_OF_BLOCKS = 32; | ||
// The lower bound of SBBF size in bytes. | ||
// Currently this is 1024 | ||
static LOWER_BOUND_BYTES = SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK / 8; | ||
// The upper bound of SBBF size, set to default row group size in bytes. | ||
// Note that the subsequent requirements for an effective bloom filter on a row group this size would mean this | ||
// is unacceptably large for a lightweight client application. | ||
static UPPER_BOUND_BYTES = 128 * 1024 * 1024; | ||
static DEFAULT_FALSE_POSITIVE_RATE = 0.001; | ||
static DEFAULT_DISTINCT_VALUES = 128 * 1024; | ||
/** | ||
@@ -161,5 +168,5 @@ * @function initBlock | ||
for (let j = 0; j < this.WORD_SIZE; j++) { | ||
const isSet = masked[i] & (Math.pow(2, j)); | ||
const isSet = masked[i] & (2 ** j); | ||
if (isSet) { | ||
b[i] = b[i] | (Math.pow(2, j)); | ||
b[i] = b[i] | (2 ** j); | ||
} | ||
@@ -183,5 +190,5 @@ } | ||
for (let j = 0; j < this.WORD_SIZE; j++) { | ||
const isSet = masked[i] & (Math.pow(2, j)); | ||
const isSet = masked[i] & (2 ** j); | ||
if (isSet) { | ||
const match = b[i] & (Math.pow(2, j)); | ||
const match = b[i] & (2 ** j); | ||
if (!match) { | ||
@@ -195,2 +202,11 @@ return false; | ||
} | ||
/** | ||
* Instance | ||
*/ | ||
splitBlockFilter = []; | ||
desiredFalsePositiveRate = SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE; | ||
numBlocks = 0; | ||
numDistinctValues = SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES; | ||
hashStrategy = new parquet_types_1.default.BloomFilterHash(new parquet_types_1.default.XxHash()); | ||
hasher = new xxhasher_1.default(); | ||
isInitialized() { return this.splitBlockFilter.length > 0; } | ||
@@ -311,10 +327,8 @@ getFalsePositiveRate() { return this.desiredFalsePositiveRate; } | ||
} | ||
hash(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!this.hashStrategy.hasOwnProperty("XXHASH")) { | ||
throw new Error("unsupported hash strategy"); | ||
} | ||
const hashed = yield this.hasher.hash64(value); | ||
return Long.fromString(hashed, true, 16); | ||
}); | ||
async hash(value) { | ||
if (!this.hashStrategy.hasOwnProperty("XXHASH")) { | ||
throw new Error("unsupported hash strategy"); | ||
} | ||
const hashed = await this.hasher.hash64(value); | ||
return Long.fromString(hashed, true, 16); | ||
} | ||
@@ -335,8 +349,6 @@ insertHash(hashValue) { | ||
*/ | ||
insert(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized. call init() first"); | ||
this.insertHash(yield this.hash(value)); | ||
}); | ||
async insert(value) { | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized. call init() first"); | ||
this.insertHash(await this.hash(value)); | ||
} | ||
@@ -358,38 +370,8 @@ checkHash(hashValue) { | ||
*/ | ||
check(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized"); | ||
return this.checkHash(yield this.hash(value)); | ||
}); | ||
async check(value) { | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized"); | ||
return this.checkHash(await this.hash(value)); | ||
} | ||
} | ||
SplitBlockBloomFilter.salt = [ | ||
0x47b6137b, | ||
0x44974d91, | ||
0x8824ad5b, | ||
0xa2b7289d, | ||
0x705495c7, | ||
0x2df1424b, | ||
0x9efc4947, | ||
0x5c6bfb31 | ||
]; | ||
// How many bits are in a single block: | ||
// - Blocks are UInt32 arrays | ||
// - There are 8 UInt32 words in each block. | ||
SplitBlockBloomFilter.WORDS_PER_BLOCK = 8; | ||
SplitBlockBloomFilter.WORD_SIZE = 32; | ||
SplitBlockBloomFilter.BITS_PER_BLOCK = SplitBlockBloomFilter.WORDS_PER_BLOCK * SplitBlockBloomFilter.WORD_SIZE; | ||
// Default number of blocks in a Split Block Bloom filter (SBBF) | ||
SplitBlockBloomFilter.NUMBER_OF_BLOCKS = 32; | ||
// The lower bound of SBBF size in bytes. | ||
// Currently this is 1024 | ||
SplitBlockBloomFilter.LOWER_BOUND_BYTES = SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK / 8; | ||
// The upper bound of SBBF size, set to default row group size in bytes. | ||
// Note that the subsequent requirements for an effective bloom filter on a row group this size would mean this | ||
// is unacceptably large for a lightweight client application. | ||
SplitBlockBloomFilter.UPPER_BOUND_BYTES = 128 * 1024 * 1024; | ||
SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE = 0.001; | ||
SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES = 128 * 1024; | ||
exports.default = SplitBlockBloomFilter; | ||
//# sourceMappingURL=sbbf.js.map |
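The async/await side of this diff turns hash(), insert() and check() into plain async methods and moves the salt and size constants into static class fields. A minimal usage sketch, assuming the compiled module lives at lib/bloom/sbbf and that init() with no arguments sizes the filter from the defaults shown above (DEFAULT_FALSE_POSITIVE_RATE = 0.001, DEFAULT_DISTINCT_VALUES = 128 * 1024); the require path is an assumption:

// Sketch only: the require path is an assumption, not taken from the diff.
const SplitBlockBloomFilter = require("./lib/bloom/sbbf").default;

async function sbbfDemo() {
  const filter = new SplitBlockBloomFilter();
  filter.init(); // allocates the blocks; insert()/check() throw before this is called

  await filter.insert("alice@example.com"); // xxhash64 the value, then set bits in one block
  console.log(await filter.check("alice@example.com")); // true
  console.log(await filter.check("bob@example.com"));   // false, up to the ~0.1% false-positive rate
}

sbbfDemo().catch(console.error);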
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
const xxhash_wasm_1 = __importDefault(require("xxhash-wasm")); | ||
@@ -27,6 +19,5 @@ const long_1 = __importDefault(require("long")); | ||
class XxHasher { | ||
hashit(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
return (yield XxHasher.h64)(value); | ||
}); | ||
static h64 = (0, xxhash_wasm_1.default)().then(x => x.h64); | ||
async hashit(value) { | ||
return (await XxHasher.h64)(value); | ||
} | ||
@@ -39,20 +30,16 @@ /** | ||
*/ | ||
hash64(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (typeof value === 'string') | ||
return this.hashit(value); | ||
if (value instanceof Buffer || | ||
value instanceof Uint8Array || | ||
value instanceof long_1.default || | ||
typeof value === 'boolean' || | ||
typeof value === 'number' || | ||
typeof value === 'bigint') { | ||
return this.hashit(value.toString()); | ||
} | ||
throw new Error("unsupported type: " + value); | ||
}); | ||
async hash64(value) { | ||
if (typeof value === 'string') | ||
return this.hashit(value); | ||
if (value instanceof Buffer || | ||
value instanceof Uint8Array || | ||
value instanceof long_1.default || | ||
typeof value === 'boolean' || | ||
typeof value === 'number' || | ||
typeof value === 'bigint') { | ||
return this.hashit(value.toString()); | ||
} | ||
throw new Error("unsupported type: " + value); | ||
} | ||
} | ||
XxHasher.h64 = xxhash_wasm_1.default().then(x => x.h64); | ||
module.exports = XxHasher; | ||
//# sourceMappingURL=xxhasher.js.map | ||
exports.default = XxHasher; |
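XxHasher keeps the xxhash-wasm instance in a static h64 field and exposes hash64() as an async method that accepts strings directly and stringifies everything else it supports. A short sketch, assuming the compiled module sits at lib/bloom/xxhasher (the require path is an assumption):

const XxHasher = require("./lib/bloom/xxhasher").default; // require path is an assumption

async function hashDemo() {
  const hasher = new XxHasher();
  // Strings are hashed as-is; Buffer, Uint8Array, Long, boolean, number and
  // bigint values are converted with toString() first (see hash64 above).
  const hex = await hasher.hash64("parquet");
  console.log(hex); // 64-bit xxhash as a hex string, which sbbf.js parses with Long.fromString(hex, true, 16)
}

hashDemo().catch(console.error);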
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
@@ -16,3 +26,3 @@ var __importDefault = (this && this.__importDefault) || function (mod) { | ||
exports.getBloomFiltersFor = exports.siftAllByteOffsets = exports.parseBloomFilterOffsets = void 0; | ||
const util_1 = __importDefault(require("../util")); | ||
const parquet_util = __importStar(require("../util")); | ||
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types")); | ||
@@ -22,4 +32,3 @@ const sbbf_1 = __importDefault(require("../bloom/sbbf")); | ||
return columnChunkDataCollection.filter((columnChunk) => { | ||
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, }, }, } = columnChunk; | ||
return bloomFilterOffsetBuffer; | ||
return columnChunk.column.meta_data?.bloom_filter_offset; | ||
}); | ||
@@ -35,6 +44,6 @@ }; | ||
const parseBloomFilterOffsets = (ColumnChunkDataCollection) => { | ||
return ColumnChunkDataCollection.map((columnChunkData) => { | ||
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, path_in_schema: pathInSchema, }, }, rowGroupIndex, } = columnChunkData; | ||
return ColumnChunkDataCollection.map(({ rowGroupIndex, column }) => { | ||
const { bloom_filter_offset: bloomOffset, path_in_schema: pathInSchema, } = column.meta_data || {}; | ||
return { | ||
offsetBytes: toInteger(bloomFilterOffsetBuffer), | ||
offsetBytes: toInteger(bloomOffset.buffer), | ||
columnName: pathInSchema.join(","), | ||
@@ -46,13 +55,16 @@ rowGroupIndex, | ||
exports.parseBloomFilterOffsets = parseBloomFilterOffsets; | ||
const getBloomFilterHeader = (offsetBytes, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () { | ||
const getBloomFilterHeader = async (offsetBytes, envelopeReader) => { | ||
const headerByteSizeEstimate = 200; | ||
let bloomFilterHeaderData; | ||
try { | ||
bloomFilterHeaderData = yield envelopeReader.read(offsetBytes, headerByteSizeEstimate); | ||
bloomFilterHeaderData = await envelopeReader.read(offsetBytes, headerByteSizeEstimate); | ||
} | ||
catch (e) { | ||
throw new Error(e); | ||
if (typeof e === 'string') | ||
throw new Error(e); | ||
else | ||
throw e; | ||
} | ||
const bloomFilterHeader = new parquet_types_1.default.BloomFilterHeader(); | ||
const sizeOfBloomFilterHeader = util_1.default.decodeThrift(bloomFilterHeader, bloomFilterHeaderData); | ||
const sizeOfBloomFilterHeader = parquet_util.decodeThrift(bloomFilterHeader, bloomFilterHeaderData); | ||
return { | ||
@@ -62,15 +74,18 @@ bloomFilterHeader, | ||
}; | ||
}); | ||
const readFilterData = (offsetBytes, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () { | ||
const { bloomFilterHeader, sizeOfBloomFilterHeader, } = yield getBloomFilterHeader(offsetBytes, envelopeReader); | ||
}; | ||
const readFilterData = async (offsetBytes, envelopeReader) => { | ||
const { bloomFilterHeader, sizeOfBloomFilterHeader, } = await getBloomFilterHeader(offsetBytes, envelopeReader); | ||
const { numBytes: filterByteSize } = bloomFilterHeader; | ||
try { | ||
const filterBlocksOffset = offsetBytes + sizeOfBloomFilterHeader; | ||
const buffer = yield envelopeReader.read(filterBlocksOffset, filterByteSize); | ||
const buffer = await envelopeReader.read(filterBlocksOffset, filterByteSize); | ||
return buffer; | ||
} | ||
catch (e) { | ||
throw new Error(e); | ||
if (typeof e === 'string') | ||
throw new Error(e); | ||
else | ||
throw e; | ||
} | ||
}); | ||
}; | ||
const readFilterDataFrom = (offsets, envelopeReader) => { | ||
@@ -80,10 +95,10 @@ return Promise.all(offsets.map((offset) => readFilterData(offset, envelopeReader))); | ||
const siftAllByteOffsets = (columnChunkDataCollection) => { | ||
return exports.parseBloomFilterOffsets(filterColumnChunksWithBloomFilters(columnChunkDataCollection)); | ||
return (0, exports.parseBloomFilterOffsets)(filterColumnChunksWithBloomFilters(columnChunkDataCollection)); | ||
}; | ||
exports.siftAllByteOffsets = siftAllByteOffsets; | ||
const getBloomFiltersFor = (columnNames, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () { | ||
const getBloomFiltersFor = async (columnNames, envelopeReader) => { | ||
const columnChunkDataCollection = envelopeReader.getAllColumnChunkDataFor(columnNames); | ||
const bloomFilterOffsetData = exports.siftAllByteOffsets(columnChunkDataCollection); | ||
const bloomFilterOffsetData = (0, exports.siftAllByteOffsets)(columnChunkDataCollection); | ||
const offsetByteValues = bloomFilterOffsetData.map(({ offsetBytes }) => offsetBytes); | ||
const filterBlocksBuffers = yield readFilterDataFrom(offsetByteValues, envelopeReader); | ||
const filterBlocksBuffers = await readFilterDataFrom(offsetByteValues, envelopeReader); | ||
return filterBlocksBuffers.map((buffer, index) => { | ||
@@ -97,4 +112,3 @@ const { columnName, rowGroupIndex } = bloomFilterOffsetData[index]; | ||
}); | ||
}); | ||
}; | ||
exports.getBloomFiltersFor = getBloomFiltersFor; | ||
//# sourceMappingURL=bloomFilterReader.js.map |
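getBloomFiltersFor() is what ParquetReader.getBloomFiltersFor() (further down in reader.js) delegates to, so consumers normally never touch the envelope reader directly. A sketch of that consumer-facing call, assuming a file written with a bloom filter on the name column; the file name, the package entry point, and the sbbf field on each entry are assumptions based on the surrounding code:

const { ParquetReader } = require("@dsnp/parquetjs"); // package entry point is an assumption

async function bloomDemo() {
  const reader = await ParquetReader.openFile("fruits.parquet");
  // Result is keyed by column name, one entry per row group that carries a filter,
  // roughly: { name: [ { sbbf, columnName: "name", rowGroupIndex: 0 } ] }
  const filters = await reader.getBloomFiltersFor(["name"]);
  for (const { sbbf, rowGroupIndex } of filters["name"]) {
    console.log(rowGroupIndex, await sbbf.check("banana"));
  }
  await reader.close();
}

bloomDemo().catch(console.error);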
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
@@ -7,3 +26,3 @@ return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
exports.getSerializedBloomFilterData = exports.setFilterOffset = exports.serializeFilterData = exports.serializeFilterHeaders = exports.createSBBF = void 0; | ||
const util_1 = __importDefault(require("../util")); | ||
const parquet_util = __importStar(require("../util")); | ||
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types")); | ||
@@ -37,3 +56,3 @@ const sbbf_1 = __importDefault(require("../bloom/sbbf")); | ||
const bloomFilterHeader = buildFilterHeader(numberOfBytes); | ||
return util_1.default.serializeThrift(bloomFilterHeader); | ||
return parquet_util.serializeThrift(bloomFilterHeader); | ||
}; | ||
@@ -43,3 +62,3 @@ exports.serializeFilterHeaders = serializeFilterHeaders; | ||
const serializedFilterBlocks = serializeFilterBlocks(params.filterBlocks); | ||
const serializedFilterHeaders = exports.serializeFilterHeaders(params.filterByteSize); | ||
const serializedFilterHeaders = (0, exports.serializeFilterHeaders)(params.filterByteSize); | ||
return Buffer.concat([serializedFilterHeaders, serializedFilterBlocks]); | ||
@@ -49,3 +68,3 @@ }; | ||
const setFilterOffset = (column, offset) => { | ||
column.meta_data.bloom_filter_offset = offset; | ||
column.meta_data.bloom_filter_offset = parquet_util.cloneInteger(offset); | ||
}; | ||
@@ -56,5 +75,4 @@ exports.setFilterOffset = setFilterOffset; | ||
const filterByteSize = splitBlockBloomFilter.getNumFilterBytes(); | ||
return exports.serializeFilterData({ filterBlocks, filterByteSize }); | ||
return (0, exports.serializeFilterData)({ filterBlocks, filterByteSize }); | ||
}; | ||
exports.getSerializedBloomFilterData = getSerializedBloomFilterData; | ||
//# sourceMappingURL=bloomFilterWriter.js.map |
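On the write side, getSerializedBloomFilterData() concatenates the Thrift BloomFilterHeader with the raw filter blocks, and setFilterOffset() later records where that blob landed on the column chunk metadata (now via cloneInteger rather than aliasing the Int64). A minimal sketch of the serialization step, assuming getSerializedBloomFilterData takes the filter instance; the require paths are assumptions:

const bloomFilterWriter = require("./lib/bloomFilterIO/bloomFilterWriter"); // path is an assumption
const SplitBlockBloomFilter = require("./lib/bloom/sbbf").default;          // path is an assumption

async function writerDemo() {
  const filter = new SplitBlockBloomFilter();
  filter.init();
  await filter.insert("banana");

  // Thrift header followed by the filter blocks, ready to be written into the
  // file; the writer then calls setFilterOffset(column, offset) with the
  // position the blob was written at.
  const serialized = bloomFilterWriter.getSerializedBloomFilterData(filter);
  console.log(serialized.length); // header bytes + filter.getNumFilterBytes()
}

writerDemo().catch(console.error);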
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
}; | ||
module.exports = class BufferReader { | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
class BufferReader { | ||
maxSpan; | ||
maxLength; | ||
queueWait; | ||
scheduled; | ||
queue; | ||
envelopeReader; | ||
constructor(envelopeReader, options) { | ||
@@ -22,54 +20,51 @@ options = options || {}; | ||
read(offset, length) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!this.scheduled) { | ||
this.scheduled = true; | ||
setTimeout(() => { | ||
this.scheduled = false; | ||
this.processQueue(); | ||
}, this.queueWait); | ||
} | ||
return new Promise((resolve, reject) => { | ||
this.queue.push({ offset, length, resolve, reject }); | ||
}); | ||
if (!this.scheduled) { | ||
this.scheduled = true; | ||
setTimeout(() => { | ||
this.scheduled = false; | ||
this.processQueue(); | ||
}, this.queueWait); | ||
} | ||
return new Promise((resolve, reject) => { | ||
this.queue.push({ offset, length, resolve, reject }); | ||
}); | ||
} | ||
processQueue() { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
const queue = this.queue; | ||
if (!queue.length) | ||
async processQueue() { | ||
const queue = this.queue; | ||
if (!queue.length) | ||
return; | ||
this.queue = []; | ||
queue.sort((a, b) => a.offset - b.offset); | ||
var subqueue = []; | ||
const readSubqueue = async () => { | ||
if (!subqueue.length) { | ||
return; | ||
this.queue = []; | ||
queue.sort((a, b) => a.offset - b.offset); | ||
var subqueue = []; | ||
const readSubqueue = () => __awaiter(this, void 0, void 0, function* () { | ||
if (!subqueue.length) { | ||
return; | ||
} | ||
const processQueue = subqueue; | ||
subqueue = []; | ||
const lastElement = processQueue[processQueue.length - 1]; | ||
const start = processQueue[0].offset; | ||
const finish = lastElement.offset + lastElement.length; | ||
const buffer = yield this.envelopeReader.readFn(start, finish - start); | ||
processQueue.forEach((d) => __awaiter(this, void 0, void 0, function* () { | ||
d.resolve(buffer.slice(d.offset - start, d.offset + d.length - start)); | ||
})); | ||
} | ||
const processQueue = subqueue; | ||
subqueue = []; | ||
const lastElement = processQueue[processQueue.length - 1]; | ||
const start = processQueue[0].offset; | ||
const finish = lastElement.offset + lastElement.length; | ||
const buffer = await this.envelopeReader.readFn(start, finish - start); | ||
processQueue.forEach(async (d) => { | ||
d.resolve(buffer.slice(d.offset - start, d.offset + d.length - start)); | ||
}); | ||
queue.forEach((d, i) => { | ||
const prev = queue[i - 1]; | ||
if (!prev || (d.offset - (prev.offset + prev.length)) < this.maxSpan) { | ||
subqueue.push(d); | ||
if ((d.offset + d.length) - subqueue[0].offset > this.maxLength) { | ||
readSubqueue(); | ||
} | ||
} | ||
else { | ||
}; | ||
queue.forEach((d, i) => { | ||
const prev = queue[i - 1]; | ||
if (!prev || (d.offset - (prev.offset + prev.length)) < this.maxSpan) { | ||
subqueue.push(d); | ||
if ((d.offset + d.length) - subqueue[0].offset > this.maxLength) { | ||
readSubqueue(); | ||
subqueue = [d]; | ||
} | ||
}); | ||
readSubqueue(subqueue); | ||
} | ||
else { | ||
readSubqueue(); | ||
subqueue = [d]; | ||
} | ||
}); | ||
readSubqueue(); | ||
} | ||
}; | ||
//# sourceMappingURL=bufferReader.js.map | ||
} | ||
exports.default = BufferReader; | ||
; |
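BufferReader batches small reads: every read() call is queued, and after queueWait milliseconds processQueue() sorts the queue by offset and merges requests whose gap is below maxSpan into a single readFn call. A self-contained sketch with a stand-in envelope reader backed by an in-memory buffer (in the library, ParquetEnvelopeReader plays this role; the require path is an assumption):

const BufferReader = require("./lib/bufferReader").default; // require path is an assumption

const data = Buffer.from("0123456789abcdef");
const fakeEnvelopeReader = {
  readFn: async (offset, length) => {
    console.log(`physical read: offset=${offset} length=${length}`);
    return data.slice(offset, offset + length);
  },
};

const reader = new BufferReader(fakeEnvelopeReader, { maxSpan: 8, maxLength: 64, queueWait: 5 });

// The two requests are 2 bytes apart (< maxSpan), so they are coalesced into
// one physical read of bytes 0..9 and then sliced apart.
Promise.all([reader.read(0, 4), reader.read(6, 4)]).then(([a, b]) => {
  console.log(a.toString(), b.toString()); // "0123" "6789"
});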
"use strict"; | ||
module.exports.PLAIN = require('./plain'); | ||
module.exports.RLE = require('./rle'); | ||
module.exports.PLAIN_DICTIONARY = require('./plain_dictionary'); | ||
//# sourceMappingURL=index.js.map | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.PLAIN_DICTIONARY = exports.RLE = exports.PLAIN = void 0; | ||
exports.PLAIN = __importStar(require("./plain")); | ||
exports.RLE = __importStar(require("./rle")); | ||
exports.PLAIN_DICTIONARY = __importStar(require("./plain_dictionary")); |
"use strict"; | ||
const rle = require('./rle'); | ||
exports.decodeValues = function (type, cursor, count, opts) { | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.decodeValues = void 0; | ||
const rle = __importStar(require("./rle")); | ||
const decodeValues = function (type, cursor, count, opts) { | ||
opts.bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0); | ||
@@ -8,2 +29,2 @@ cursor.offset += 1; | ||
}; | ||
//# sourceMappingURL=plain_dictionary.js.map | ||
exports.decodeValues = decodeValues; |
@@ -1,3 +0,8 @@ | ||
'use strict'; | ||
const INT53 = require('int53'); | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.decodeValues = exports.encodeValues = void 0; | ||
const int53_1 = __importDefault(require("int53")); | ||
function encodeValues_BOOLEAN(values) { | ||
@@ -8,3 +13,3 @@ let buf = Buffer.alloc(Math.ceil(values.length / 8)); | ||
if (values[i]) { | ||
buf[Math.floor(i / 8)] |= (1 << (i % 8)); | ||
buf[Math.floor(i / 8)] |= 1 << i % 8; | ||
} | ||
@@ -18,3 +23,3 @@ } | ||
let b = cursor.buffer[cursor.offset + Math.floor(i / 8)]; | ||
values.push((b & (1 << (i % 8))) > 0); | ||
values.push((b & (1 << i % 8)) > 0); | ||
} | ||
@@ -42,3 +47,2 @@ cursor.offset += Math.ceil(count / 8); | ||
for (let i = 0; i < values.length; i++) { | ||
//console.log(typeof values[i]); | ||
buf.writeBigInt64LE(BigInt(values[i]), i * 8); | ||
@@ -60,7 +64,7 @@ } | ||
if (values[i] >= 0) { | ||
INT53.writeInt64LE(values[i], buf, i * 12); | ||
int53_1.default.writeInt64LE(values[i], buf, i * 12); | ||
buf.writeUInt32LE(0, i * 12 + 8); // truncate to 64 actual precision | ||
} | ||
else { | ||
INT53.writeInt64LE((~-values[i]) + 1, buf, i * 12); | ||
int53_1.default.writeInt64LE(~-values[i] + 1, buf, i * 12); | ||
buf.writeUInt32LE(0xffffffff, i * 12 + 8); // truncate to 64 actual precision | ||
@@ -74,6 +78,6 @@ } | ||
for (let i = 0; i < count; ++i) { | ||
const low = INT53.readInt64LE(cursor.buffer, cursor.offset); | ||
const low = int53_1.default.readInt64LE(cursor.buffer, cursor.offset); | ||
const high = cursor.buffer.readUInt32LE(cursor.offset + 8); | ||
if (high === 0xffffffff) { | ||
values.push((~-low) + 1); // truncate to 64 actual precision | ||
values.push(~-low + 1); // truncate to 64 actual precision | ||
} | ||
@@ -117,14 +121,16 @@ else { | ||
} | ||
// Waylands reminder to check again | ||
function encodeValues_BYTE_ARRAY(values) { | ||
let buf_len = 0; | ||
const returnedValues = []; | ||
for (let i = 0; i < values.length; i++) { | ||
values[i] = Buffer.from(values[i]); | ||
buf_len += 4 + values[i].length; | ||
returnedValues[i] = Buffer.from(values[i]); | ||
buf_len += 4 + returnedValues[i].length; | ||
} | ||
let buf = Buffer.alloc(buf_len); | ||
let buf_pos = 0; | ||
for (let i = 0; i < values.length; i++) { | ||
buf.writeUInt32LE(values[i].length, buf_pos); | ||
values[i].copy(buf, buf_pos + 4); | ||
buf_pos += 4 + values[i].length; | ||
for (let i = 0; i < returnedValues.length; i++) { | ||
buf.writeUInt32LE(returnedValues[i].length, buf_pos); | ||
returnedValues[i].copy(buf, buf_pos + 4); | ||
buf_pos += 4 + returnedValues[i].length; | ||
} | ||
@@ -147,10 +153,10 @@ return buf; | ||
} | ||
let buf_len = 0; | ||
const returnedValues = []; | ||
for (let i = 0; i < values.length; i++) { | ||
values[i] = Buffer.from(values[i]); | ||
if (values[i].length !== opts.typeLength) { | ||
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + values[i]; | ||
returnedValues[i] = Buffer.from(values[i]); | ||
if (returnedValues[i].length !== opts.typeLength) { | ||
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + returnedValues[i]; | ||
} | ||
} | ||
return Buffer.concat(values); | ||
return Buffer.concat(returnedValues); | ||
} | ||
@@ -168,46 +174,47 @@ function decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts) { | ||
} | ||
exports.encodeValues = function (type, values, opts) { | ||
const encodeValues = function (type, values, opts) { | ||
switch (type) { | ||
case 'BOOLEAN': | ||
case "BOOLEAN": | ||
return encodeValues_BOOLEAN(values); | ||
case 'INT32': | ||
case "INT32": | ||
return encodeValues_INT32(values); | ||
case 'INT64': | ||
case "INT64": | ||
return encodeValues_INT64(values); | ||
case 'INT96': | ||
case "INT96": | ||
return encodeValues_INT96(values); | ||
case 'FLOAT': | ||
case "FLOAT": | ||
return encodeValues_FLOAT(values); | ||
case 'DOUBLE': | ||
case "DOUBLE": | ||
return encodeValues_DOUBLE(values); | ||
case 'BYTE_ARRAY': | ||
case "BYTE_ARRAY": | ||
return encodeValues_BYTE_ARRAY(values); | ||
case 'FIXED_LEN_BYTE_ARRAY': | ||
case "FIXED_LEN_BYTE_ARRAY": | ||
return encodeValues_FIXED_LEN_BYTE_ARRAY(values, opts); | ||
default: | ||
throw 'unsupported type: ' + type; | ||
throw "unsupported type: " + type; | ||
} | ||
}; | ||
exports.decodeValues = function (type, cursor, count, opts) { | ||
exports.encodeValues = encodeValues; | ||
const decodeValues = function (type, cursor, count, opts) { | ||
switch (type) { | ||
case 'BOOLEAN': | ||
case "BOOLEAN": | ||
return decodeValues_BOOLEAN(cursor, count); | ||
case 'INT32': | ||
case "INT32": | ||
return decodeValues_INT32(cursor, count); | ||
case 'INT64': | ||
case "INT64": | ||
return decodeValues_INT64(cursor, count); | ||
case 'INT96': | ||
case "INT96": | ||
return decodeValues_INT96(cursor, count); | ||
case 'FLOAT': | ||
case "FLOAT": | ||
return decodeValues_FLOAT(cursor, count); | ||
case 'DOUBLE': | ||
case "DOUBLE": | ||
return decodeValues_DOUBLE(cursor, count); | ||
case 'BYTE_ARRAY': | ||
case "BYTE_ARRAY": | ||
return decodeValues_BYTE_ARRAY(cursor, count); | ||
case 'FIXED_LEN_BYTE_ARRAY': | ||
case "FIXED_LEN_BYTE_ARRAY": | ||
return decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts); | ||
default: | ||
throw 'unsupported type: ' + type; | ||
throw "unsupported type: " + type; | ||
} | ||
}; | ||
//# sourceMappingURL=plain.js.map | ||
exports.decodeValues = decodeValues; |
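The PLAIN codec exposes the same encodeValues/decodeValues pair as every other codec in lib/codec, with decoding driven by a cursor object of the form { buffer, offset }. A round-trip sketch for INT32 (the require path is an assumption):

const plain = require("./lib/codec/plain"); // require path is an assumption

const encoded = plain.encodeValues("INT32", [1, 2, 3, 4], {});
const cursor = { buffer: encoded, offset: 0 };
const decoded = plain.decodeValues("INT32", cursor, 4, {});

console.log(decoded);       // [ 1, 2, 3, 4 ]
console.log(cursor.offset); // 16 -- four values, four bytes each, cursor advanced past them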
"use strict"; | ||
const varint = require('varint'); | ||
// For questions about RLE encoding, see the spec: | ||
// | ||
// https://github.com/apache/parquet-format/blob/master/Encodings.md | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.decodeValues = exports.encodeValues = void 0; | ||
const varint_1 = __importDefault(require("varint")); | ||
function encodeRunBitpacked(values, opts) { | ||
@@ -14,3 +22,3 @@ for (let i = 0; i < values.length % 8; i++) { | ||
return Buffer.concat([ | ||
Buffer.from(varint.encode(((values.length / 8) << 1) | 1)), | ||
Buffer.from(varint_1.default.encode(((values.length / 8) << 1) | 1)), | ||
buf | ||
@@ -21,12 +29,23 @@ ]); | ||
let buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8)); | ||
let remainingValue = value; | ||
// This is encoded LSB to MSB, so we pick off the least | ||
// significant byte and shift to get the next one. | ||
for (let i = 0; i < buf.length; ++i) { | ||
buf.writeUInt8(value & 0xff, i); | ||
value >> 8; | ||
buf.writeUInt8(remainingValue & 0xff, i); | ||
remainingValue = remainingValue >> 8; | ||
} | ||
return Buffer.concat([ | ||
Buffer.from(varint.encode(count << 1)), | ||
Buffer.from(varint_1.default.encode(count << 1)), | ||
buf | ||
]); | ||
} | ||
exports.encodeValues = function (type, values, opts) { | ||
function unknownToParsedInt(value) { | ||
if (typeof value === 'string') { | ||
return parseInt(value, 10); | ||
} | ||
else { | ||
return value; | ||
} | ||
} | ||
const encodeValues = function (type, values, opts) { | ||
if (!('bitWidth' in opts)) { | ||
@@ -39,3 +58,3 @@ throw 'bitWidth is required'; | ||
case 'INT64': | ||
values = values.map((x) => parseInt(x, 10)); | ||
values = values.map((x) => unknownToParsedInt(x)); | ||
break; | ||
@@ -85,2 +104,3 @@ default: | ||
}; | ||
exports.encodeValues = encodeValues; | ||
function decodeRunBitpacked(cursor, count, opts) { | ||
@@ -100,6 +120,9 @@ if (count % 8 !== 0) { | ||
function decodeRunRepeated(cursor, count, opts) { | ||
var bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8); | ||
let value = 0; | ||
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); ++i) { | ||
value << 8; | ||
value += cursor.buffer[cursor.offset]; | ||
for (let i = 0; i < bytesNeededForFixedBitWidth; ++i) { | ||
const byte = cursor.buffer[cursor.offset]; | ||
// Bytes are stored LSB to MSB, so we need to shift | ||
// each new byte appropriately. | ||
value += byte << (i * 8); | ||
cursor.offset += 1; | ||
@@ -109,3 +132,3 @@ } | ||
} | ||
exports.decodeValues = function (type, cursor, count, opts) { | ||
const decodeValues = function (_, cursor, count, opts) { | ||
if (!('bitWidth' in opts)) { | ||
@@ -120,4 +143,4 @@ throw 'bitWidth is required'; | ||
while (values.length < count) { | ||
const header = varint.decode(cursor.buffer, cursor.offset); | ||
cursor.offset += varint.encodingLength(header); | ||
const header = varint_1.default.decode(cursor.buffer, cursor.offset); | ||
cursor.offset += varint_1.default.encodingLength(header); | ||
if (header & 1) { | ||
@@ -139,2 +162,2 @@ res = decodeRunBitpacked(cursor, (header >> 1) * 8, opts); | ||
}; | ||
//# sourceMappingURL=rle.js.map | ||
exports.decodeValues = decodeValues; |
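The RLE codec mixes repeated runs (varint header with the low bit clear) and bit-packed runs (low bit set), which is why both encodeValues and decodeValues insist on opts.bitWidth. A round-trip sketch with values that exercise both run types; the require path is an assumption:

const rle = require("./lib/codec/rle"); // require path is an assumption

const values = [4, 4, 4, 4, 4, 4, 4, 4, 0, 1, 2, 3, 4, 5, 6, 7];
const encoded = rle.encodeValues("INT32", values, { bitWidth: 3 }); // 3 bits covers 0..7

const cursor = { buffer: encoded, offset: 0 };
const decoded = rle.decodeValues("INT32", cursor, values.length, { bitWidth: 3 });

console.log(decoded.join(",") === values.join(",")); // true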
@@ -1,7 +0,12 @@ | ||
'use strict'; | ||
const zlib = require('zlib'); | ||
const snappy = require('snappyjs'); | ||
const lzo = require('lzo'); | ||
const brotli = require('brotli'); | ||
const PARQUET_COMPRESSION_METHODS = { | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.inflate = exports.deflate = exports.PARQUET_COMPRESSION_METHODS = void 0; | ||
const zlib_1 = __importDefault(require("zlib")); | ||
const snappyjs_1 = __importDefault(require("snappyjs")); | ||
const wasm_brotli_1 = require("wasm-brotli"); | ||
// LZO compression is disabled. See: https://github.com/LibertyDSNP/parquetjs/issues/18 | ||
exports.PARQUET_COMPRESSION_METHODS = { | ||
'UNCOMPRESSED': { | ||
@@ -19,6 +24,2 @@ deflate: deflate_identity, | ||
}, | ||
'LZO': { | ||
deflate: deflate_lzo, | ||
inflate: inflate_lzo | ||
}, | ||
'BROTLI': { | ||
@@ -32,8 +33,9 @@ deflate: deflate_brotli, | ||
*/ | ||
function deflate(method, value) { | ||
if (!(method in PARQUET_COMPRESSION_METHODS)) { | ||
async function deflate(method, value) { | ||
if (!(method in exports.PARQUET_COMPRESSION_METHODS)) { | ||
throw 'invalid compression method: ' + method; | ||
} | ||
return PARQUET_COMPRESSION_METHODS[method].deflate(value); | ||
return exports.PARQUET_COMPRESSION_METHODS[method].deflate(value); | ||
} | ||
exports.deflate = deflate; | ||
function deflate_identity(value) { | ||
@@ -43,26 +45,26 @@ return value; | ||
function deflate_gzip(value) { | ||
return zlib.gzipSync(value); | ||
return zlib_1.default.gzipSync(value); | ||
} | ||
function deflate_snappy(value) { | ||
return snappy.compress(value); | ||
return snappyjs_1.default.compress(value); | ||
} | ||
function deflate_lzo(value) { | ||
return lzo.compress(value); | ||
async function deflate_brotli(value) { | ||
const compressedContent = await (0, wasm_brotli_1.compress)(value /*, { | ||
mode: 0, | ||
quality: 8, | ||
lgwin: 22 | ||
} | ||
*/); | ||
return Buffer.from(compressedContent); | ||
} | ||
function deflate_brotli(value) { | ||
return Buffer.from(brotli.compress(value, { | ||
mode: 0, | ||
quality: 8, | ||
lgwin: 22 | ||
})); | ||
} | ||
/** | ||
* Inflate a value using compression method `method` | ||
*/ | ||
function inflate(method, value) { | ||
if (!(method in PARQUET_COMPRESSION_METHODS)) { | ||
async function inflate(method, value) { | ||
if (!(method in exports.PARQUET_COMPRESSION_METHODS)) { | ||
throw 'invalid compression method: ' + method; | ||
} | ||
return PARQUET_COMPRESSION_METHODS[method].inflate(value); | ||
return await exports.PARQUET_COMPRESSION_METHODS[method].inflate(value); | ||
} | ||
exports.inflate = inflate; | ||
function inflate_identity(value) { | ||
@@ -72,14 +74,10 @@ return value; | ||
function inflate_gzip(value) { | ||
return zlib.gunzipSync(value); | ||
return zlib_1.default.gunzipSync(value); | ||
} | ||
function inflate_snappy(value) { | ||
return snappy.uncompress(value); | ||
return snappyjs_1.default.uncompress(value); | ||
} | ||
function inflate_lzo(value) { | ||
return lzo.decompress(value); | ||
async function inflate_brotli(value) { | ||
const uncompressedContent = await (0, wasm_brotli_1.decompress)(value); | ||
return Buffer.from(uncompressedContent); | ||
} | ||
function inflate_brotli(value) { | ||
return Buffer.from(brotli.decompress(value)); | ||
} | ||
module.exports = { PARQUET_COMPRESSION_METHODS, deflate, inflate }; | ||
//# sourceMappingURL=compression.js.map |
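With LZO dropped (see the linked issue) and brotli moved to wasm-brotli, deflate() and inflate() are now async for every method, even though GZIP and SNAPPY still resolve synchronously under the hood. A round-trip sketch (the require path is an assumption):

const compression = require("./lib/compression"); // require path is an assumption

async function compressionDemo() {
  const original = Buffer.from("deflate and inflate now both return promises");

  const compressed = await compression.deflate("BROTLI", original);
  const restored = await compression.inflate("BROTLI", compressed);

  console.log(restored.equals(original)); // true
  console.log(Object.keys(compression.PARQUET_COMPRESSION_METHODS)); // UNCOMPRESSED, GZIP, SNAPPY, BROTLI
}

compressionDemo().catch(console.error);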
@@ -1,25 +0,38 @@ | ||
'use strict'; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
const fs = require('fs'); | ||
const thrift = require('thrift'); | ||
const Int64 = require('node-int64'); | ||
const parquet_thrift = require('../gen-nodejs/parquet_types'); | ||
const parquet_shredder = require('./shred'); | ||
const parquet_util = require('./util'); | ||
const parquet_schema = require('./schema'); | ||
const parquet_codec = require('./codec'); | ||
const parquet_compression = require('./compression'); | ||
const parquet_types = require('./types'); | ||
const BufferReader = require('./bufferReader'); | ||
const bloomFilterReader = require('./bloomFilterIO/bloomFilterReader'); | ||
const groupBy = require("lodash/groupBy"); | ||
const fetch = require('cross-fetch'); | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetEnvelopeReader = exports.ParquetReader = void 0; | ||
const node_int64_1 = __importDefault(require("node-int64")); | ||
const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types")); | ||
const parquet_shredder = __importStar(require("./shred")); | ||
const parquet_util = __importStar(require("./util")); | ||
const parquet_schema = __importStar(require("./schema")); | ||
const parquet_codec = __importStar(require("./codec")); | ||
const parquet_compression = __importStar(require("./compression")); | ||
const parquet_types = __importStar(require("./types")); | ||
const bufferReader_1 = __importDefault(require("./bufferReader")); | ||
const bloomFilterReader = __importStar(require("./bloomFilterIO/bloomFilterReader")); | ||
const cross_fetch_1 = __importDefault(require("cross-fetch")); | ||
const types_1 = require("./types/types"); | ||
const { getBloomFiltersFor, } = bloomFilterReader; | ||
@@ -43,2 +56,9 @@ /** | ||
class ParquetCursor { | ||
metadata; | ||
envelopeReader; | ||
schema; | ||
columnList; | ||
rowGroup; | ||
rowGroupIndex; | ||
cursorIndex; | ||
/** | ||
@@ -57,2 +77,3 @@ * Create a new parquet reader from the file metadata and an envelope reader. | ||
this.rowGroupIndex = 0; | ||
this.cursorIndex = 0; | ||
} | ||
@@ -63,14 +84,13 @@ /** | ||
*/ | ||
next() { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (this.rowGroup.length === 0) { | ||
if (this.rowGroupIndex >= this.metadata.row_groups.length) { | ||
return null; | ||
} | ||
let rowBuffer = yield this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList); | ||
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer); | ||
this.rowGroupIndex++; | ||
async next() { | ||
if (this.cursorIndex >= this.rowGroup.length) { | ||
if (this.rowGroupIndex >= this.metadata.row_groups.length) { | ||
return null; | ||
} | ||
return this.rowGroup.shift(); | ||
}); | ||
let rowBuffer = await this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList); | ||
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer); | ||
this.rowGroupIndex++; | ||
this.cursorIndex = 0; | ||
} | ||
return this.rowGroup[this.cursorIndex++]; | ||
} | ||
@@ -83,2 +103,3 @@ /** | ||
this.rowGroupIndex = 0; | ||
this.cursorIndex = 0; | ||
} | ||
@@ -95,2 +116,5 @@ } | ||
class ParquetReader { | ||
envelopeReader; | ||
metadata; | ||
schema; | ||
/** | ||
@@ -100,13 +124,9 @@ * Open the parquet file pointed to by the specified path and return a new | ||
*/ | ||
static openFile(filePath, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let envelopeReader = yield ParquetEnvelopeReader.openFile(filePath, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
}); | ||
static async openFile(filePath, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openFile(filePath, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
static openBuffer(buffer, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let envelopeReader = yield ParquetEnvelopeReader.openBuffer(buffer, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
}); | ||
static async openBuffer(buffer, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openBuffer(buffer, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
@@ -118,7 +138,5 @@ /** | ||
*/ | ||
static openS3(client, params, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let envelopeReader = yield ParquetEnvelopeReader.openS3(client, params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
}); | ||
static async openS3(client, params, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openS3(client, params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
@@ -131,23 +149,19 @@ /** | ||
*/ | ||
static openUrl(params, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let envelopeReader = yield ParquetEnvelopeReader.openUrl(params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
}); | ||
static async openUrl(params, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openUrl(params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
static openEnvelopeReader(envelopeReader, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (opts && opts.metadata) { | ||
return new ParquetReader(opts.metadata, envelopeReader, opts); | ||
} | ||
try { | ||
yield envelopeReader.readHeader(); | ||
let metadata = yield envelopeReader.readFooter(); | ||
return new ParquetReader(metadata, envelopeReader, opts); | ||
} | ||
catch (err) { | ||
yield envelopeReader.close(); | ||
throw err; | ||
} | ||
}); | ||
static async openEnvelopeReader(envelopeReader, opts) { | ||
if (opts && opts.metadata) { | ||
return new ParquetReader(opts.metadata, envelopeReader, opts); | ||
} | ||
try { | ||
await envelopeReader.readHeader(); | ||
let metadata = await envelopeReader.readFooter(); | ||
return new ParquetReader(metadata, envelopeReader, opts); | ||
} | ||
catch (err) { | ||
await envelopeReader.close(); | ||
throw err; | ||
} | ||
} | ||
@@ -174,3 +188,3 @@ /** | ||
else if (o.parquetType === 'INT64') { | ||
return new Int64(Buffer.from(o.value)); | ||
return new node_int64_1.default(Buffer.from(o.value)); | ||
} | ||
@@ -184,7 +198,7 @@ } | ||
if (column.offsetIndex) { | ||
column.offsetIndex.page_locations.forEach(d => { | ||
Promise.resolve(column.offsetIndex).then(offset => (offset.page_locations.forEach(d => { | ||
if (Array.isArray(d)) { | ||
Object.setPrototypeOf(d, parquet_thrift.PageLocation.prototype); | ||
Object.setPrototypeOf(d, parquet_types_1.default.PageLocation.prototype); | ||
} | ||
}); | ||
}))); | ||
} | ||
@@ -213,2 +227,19 @@ }); | ||
/** | ||
* Support `for await` iterators on the reader object | ||
* Uses `ParquetCursor` still under the hood. | ||
* | ||
* ```js | ||
* for await (const record of reader) { | ||
* console.log(record); | ||
* } | ||
* ``` | ||
*/ | ||
async *[Symbol.asyncIterator]() { | ||
const cursor = this.getCursor(); | ||
let record = null; | ||
while (record = await cursor.next()) { | ||
yield record; | ||
} | ||
} | ||
/** | ||
* Return a cursor to the file. You may open more than one cursor and use | ||
@@ -229,11 +260,15 @@ * them concurrently. All cursors become invalid once close() is called on | ||
} | ||
getBloomFiltersFor(columnNames) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
const bloomFilterData = yield getBloomFiltersFor(columnNames, this.envelopeReader); | ||
return groupBy(bloomFilterData, 'columnName'); | ||
}); | ||
async getBloomFiltersFor(columnNames) { | ||
const bloomFilterData = await getBloomFiltersFor(columnNames, this.envelopeReader); | ||
return bloomFilterData.reduce((acc, value) => { | ||
if (acc[value.columnName]) | ||
acc[value.columnName].push(value); | ||
else | ||
acc[value.columnName] = [value]; | ||
return acc; | ||
}, {}); | ||
} | ||
/** | ||
* Return the number of rows in this file. Note that the number of rows is | ||
* not neccessarily equal to the number of rows in each column. | ||
* not necessarily equal to the number of rows in each column. | ||
*/ | ||
@@ -259,6 +294,6 @@ getRowCount() { | ||
} | ||
exportMetadata(indent) { | ||
function replacer(key, value) { | ||
if (value instanceof parquet_thrift.PageLocation) { | ||
return [value[0], value[1], value[2]]; | ||
async exportMetadata(indent) { | ||
function replacer(_key, value) { | ||
if (value instanceof parquet_types_1.default.PageLocation) { | ||
return [value.offset, value.compressed_page_size, value.first_row_index]; | ||
} | ||
@@ -278,4 +313,4 @@ if (typeof value === 'object') { | ||
} | ||
if (value instanceof Int64) { | ||
if (isFinite(value)) { | ||
if (value instanceof node_int64_1.default) { | ||
if (isFinite(Number(value))) { | ||
return Number(value); | ||
@@ -295,2 +330,14 @@ } | ||
const metadata = Object.assign({}, this.metadata, { json: true }); | ||
for (let i = 0; i < metadata.row_groups.length; i++) { | ||
const rowGroup = metadata.row_groups[i]; | ||
for (let j = 0; j < rowGroup.columns.length; j++) { | ||
const column = rowGroup.columns[j]; | ||
if (column.offsetIndex instanceof Promise) { | ||
column.offsetIndex = await column.offsetIndex; | ||
} | ||
if (column.columnIndex instanceof Promise) { | ||
column.columnIndex = await column.columnIndex; | ||
} | ||
} | ||
} | ||
return JSON.stringify(metadata, replacer, indent); | ||
@@ -302,8 +349,6 @@ } | ||
*/ | ||
close() { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
yield this.envelopeReader.close(); | ||
this.envelopeReader = null; | ||
this.metadata = null; | ||
}); | ||
async close() { | ||
await this.envelopeReader.close(); | ||
this.envelopeReader = null; | ||
this.metadata = null; | ||
} | ||
@@ -314,2 +359,3 @@ decodePages(buffer, opts) { | ||
} | ||
exports.ParquetReader = ParquetReader; | ||
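ParquetCursor.next() now walks the materialized row group with a cursorIndex instead of shift()-ing rows off the front, and the new Symbol.asyncIterator lets callers iterate the reader directly. A consumer-level sketch, assuming a file named fruits.parquet and that the package entry point re-exports ParquetReader:

const { ParquetReader } = require("@dsnp/parquetjs"); // package entry point is an assumption

async function readDemo() {
  const reader = await ParquetReader.openFile("fruits.parquet");

  // Explicit cursor: next() returns null once every row group has been read.
  const cursor = reader.getCursor();
  let record;
  while ((record = await cursor.next())) {
    console.log(record);
  }

  // Or use the new async iterator, which drives a cursor internally.
  for await (const row of reader) {
    console.log(row);
  }

  await reader.close();
}

readDemo().catch(console.error);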
/** | ||
@@ -323,70 +369,69 @@ * The parquet envelope reader allows direct, unbuffered access to the individual | ||
class ParquetEnvelopeReader { | ||
static openFile(filePath, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let fileStat = yield parquet_util.fstat(filePath); | ||
let fileDescriptor = yield parquet_util.fopen(filePath); | ||
let readFn = (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
return parquet_util.fread(fileDescriptor, offset, length); | ||
}; | ||
let closeFn = parquet_util.fclose.bind(undefined, fileDescriptor); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size, options); | ||
}); | ||
readFn; | ||
close; | ||
id; | ||
fileSize; | ||
default_dictionary_size; | ||
metadata; | ||
schema; | ||
static async openFile(filePath, options) { | ||
let fileStat = await parquet_util.fstat(filePath); | ||
let fileDescriptor = await parquet_util.fopen(filePath); | ||
let readFn = (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
return parquet_util.fread(fileDescriptor, offset, length); | ||
}; | ||
let closeFn = parquet_util.fclose.bind(undefined, fileDescriptor); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size, options); | ||
} | ||
static openBuffer(buffer, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let readFn = (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
return Promise.resolve(buffer.slice(offset, offset + length)); | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, buffer.length, options); | ||
}); | ||
static async openBuffer(buffer, options) { | ||
let readFn = (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
return Promise.resolve(buffer.slice(offset, offset + length)); | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, buffer.length, options); | ||
} | ||
static openS3(client, params, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let fileStat = () => __awaiter(this, void 0, void 0, function* () { return client.headObject(params).promise().then(d => d.ContentLength); }); | ||
let readFn = (offset, length, file) => __awaiter(this, void 0, void 0, function* () { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
let Range = `bytes=${offset}-${offset + length - 1}`; | ||
let res = yield client.getObject(Object.assign({ Range }, params)).promise(); | ||
return Promise.resolve(res.Body); | ||
}); | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options); | ||
}); | ||
static async openS3(client, params, options) { | ||
let fileStat = async () => client.headObject(params).promise().then((d) => d.ContentLength); | ||
let readFn = async (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
let Range = `bytes=${offset}-${offset + length - 1}`; | ||
let res = await client.getObject(Object.assign({ Range }, params)).promise(); | ||
return Promise.resolve(res.Body); | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options); | ||
} | ||
static openUrl(params, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (typeof params === 'string') | ||
params = { url: params }; | ||
if (!params.url) | ||
throw new Error('URL missing'); | ||
let base = params.url.split('/'); | ||
base = base.slice(0, base.length - 1).join('/') + '/'; | ||
let defaultHeaders = params.headers || {}; | ||
let filesize = () => __awaiter(this, void 0, void 0, function* () { | ||
const { headers } = yield fetch(params.url); | ||
return headers.get('Content-Length'); | ||
}); | ||
let readFn = (offset, length, file) => __awaiter(this, void 0, void 0, function* () { | ||
let url = file ? base + file : params.url; | ||
let range = `bytes=${offset}-${offset + length - 1}`; | ||
let headers = Object.assign({}, defaultHeaders, { range }); | ||
const response = yield fetch(url, { headers }); | ||
const arrayBuffer = yield response.arrayBuffer(); | ||
const buffer = Buffer.from(arrayBuffer); | ||
return buffer; | ||
}); | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, filesize, options); | ||
}); | ||
static async openUrl(params, options) { | ||
if (typeof params === 'string') | ||
params = { url: params }; | ||
if (!params.url) | ||
throw new Error('URL missing'); | ||
const baseArr = params.url.split('/'); | ||
const base = baseArr.slice(0, baseArr.length - 1).join('/') + '/'; | ||
let defaultHeaders = params.headers || {}; | ||
let filesize = async () => { | ||
const { headers } = await (0, cross_fetch_1.default)(params.url); | ||
return headers.get('Content-Length'); | ||
}; | ||
let readFn = async (offset, length, file) => { | ||
let url = file ? base + file : params.url; | ||
let range = `bytes=${offset}-${offset + length - 1}`; | ||
let headers = Object.assign({}, defaultHeaders, { range }); | ||
const response = await (0, cross_fetch_1.default)(url, { headers }); | ||
const arrayBuffer = await response.arrayBuffer(); | ||
const buffer = Buffer.from(arrayBuffer); | ||
return buffer; | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, filesize, options); | ||
} | ||
constructor(readFn, closeFn, fileSize, options) { | ||
constructor(readFn, closeFn, fileSize, options, metadata) { | ||
options = options || {}; | ||
@@ -398,4 +443,5 @@ this.readFn = readFn; | ||
this.default_dictionary_size = options.default_dictionary_size || 10000000; | ||
this.metadata = metadata; | ||
if (options.maxLength || options.maxSpan || options.queueWait) { | ||
const bufferReader = new BufferReader(this, options); | ||
const bufferReader = new bufferReader_1.default(this, options); | ||
this.read = (offset, length) => bufferReader.read(offset, length); | ||
@@ -408,3 +454,3 @@ } | ||
readHeader() { | ||
return this.read(0, PARQUET_MAGIC.length).then(buf => { | ||
return this.read(0, PARQUET_MAGIC.length).then((buf) => { | ||
if (buf.toString() != PARQUET_MAGIC) { | ||
@@ -418,10 +464,14 @@ throw 'not valid parquet file'; | ||
let column; | ||
if (!isNaN(row_group)) { | ||
row_group = this.metadata.row_groups[row_group]; | ||
let parsedRowGroup; | ||
if (!isNaN(Number(row_group))) { | ||
parsedRowGroup = this.metadata?.row_groups[Number(row_group)]; | ||
} | ||
else if (row_group instanceof parquet_types_1.default.RowGroup) { | ||
parsedRowGroup = row_group; | ||
} | ||
if (typeof path === 'string') { | ||
if (!row_group) { | ||
if (!parsedRowGroup) { | ||
throw `Missing RowGroup ${row_group}`; | ||
} | ||
column = row_group.columns.find(d => d.meta_data.path_in_schema.join(',') === path); | ||
column = parsedRowGroup.columns.find(d => d.meta_data.path_in_schema.join(',') === path); | ||
if (!column) { | ||
@@ -453,12 +503,9 @@ throw `Column ${path} Not Found`; | ||
} | ||
const data = this.read(+column.offset_index_offset, column.offset_index_length).then(data => { | ||
let offset_index = new parquet_thrift.OffsetIndex(); | ||
const data = this.read(+column.offset_index_offset, column.offset_index_length).then((data) => { | ||
let offset_index = new parquet_types_1.default.OffsetIndex(); | ||
parquet_util.decodeThrift(offset_index, data); | ||
Object.defineProperty(offset_index, 'column', { value: column, enumerable: false }); | ||
if (opts && opts.cache) { | ||
column.offsetIndex = offset_index; | ||
} | ||
return offset_index; | ||
}); | ||
if (opts && opts.cache) { | ||
if (opts?.cache) { | ||
column.offsetIndex = data; | ||
@@ -476,5 +523,5 @@ } | ||
} | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then(data => { | ||
let column_index = new parquet_thrift.ColumnIndex(); | ||
parquet_util.decodeThrift(column_index, data); | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then((buf) => { | ||
let column_index = new parquet_types_1.default.ColumnIndex(); | ||
parquet_util.decodeThrift(column_index, buf); | ||
Object.defineProperty(column_index, 'column', { value: column }); | ||
@@ -489,8 +536,5 @@ // decode the statistics values | ||
} | ||
if (opts && opts.cache) { | ||
column.columnIndex = column_index; | ||
} | ||
return column_index; | ||
}); | ||
if (opts && opts.cache) { | ||
if (opts?.cache) { | ||
column.columnIndex = data; | ||
@@ -500,52 +544,50 @@ } | ||
} | ||
readPage(column, page, records, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
column = Object.assign({}, column); | ||
column.meta_data = Object.assign({}, column.meta_data); | ||
if (page.offset !== undefined) { | ||
if (isNaN(page.offset) || isNaN(page.compressed_page_size)) { | ||
throw Error('page offset and/or size missing'); | ||
} | ||
column.meta_data.data_page_offset = page.offset; | ||
column.meta_data.total_compressed_size = page.compressed_page_size; | ||
async readPage(column, page, records, opts) { | ||
column = Object.assign({}, column); | ||
column.meta_data = Object.assign({}, column.meta_data); | ||
if (page instanceof parquet_types_1.default.PageLocation && page.offset !== undefined) { | ||
if (isNaN(Number(page.offset)) || isNaN(page.compressed_page_size)) { | ||
throw Error('page offset and/or size missing'); | ||
} | ||
else { | ||
const offsetIndex = column.offsetIndex || (yield this.readOffsetIndex(column, null, opts)); | ||
column.meta_data.data_page_offset = offsetIndex.page_locations[page].offset; | ||
column.meta_data.total_compressed_size = offsetIndex.page_locations[page].compressed_page_size; | ||
} | ||
const chunk = yield this.readColumnChunk(this.schema, column); | ||
Object.defineProperty(chunk, 'column', { value: column }); | ||
let data = { | ||
columnData: { [chunk.column.meta_data.path_in_schema.join(',')]: chunk } | ||
}; | ||
return parquet_shredder.materializeRecords(this.schema, data, records); | ||
}); | ||
column.meta_data.data_page_offset = parquet_util.cloneInteger(page.offset); | ||
column.meta_data.total_compressed_size = new node_int64_1.default(page.compressed_page_size); | ||
} | ||
else { | ||
const offsetIndex = await this.readOffsetIndex(column, null, opts); | ||
column.meta_data.data_page_offset = parquet_util.cloneInteger(offsetIndex.page_locations[page].offset); | ||
column.meta_data.total_compressed_size = new node_int64_1.default(offsetIndex.page_locations[page].compressed_page_size); | ||
} | ||
const chunk = await this.readColumnChunk(this.schema, column); | ||
Object.defineProperty(chunk, 'column', { value: column }); | ||
let data = { | ||
columnData: { [chunk.column.meta_data.path_in_schema.join(',')]: chunk } | ||
}; | ||
return parquet_shredder.materializeRecords(this.schema, data, records); | ||
} | ||
readRowGroup(schema, rowGroup, columnList) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
var buffer = { | ||
rowCount: +rowGroup.num_rows, | ||
columnData: {} | ||
}; | ||
for (let colChunk of rowGroup.columns) { | ||
const colMetadata = colChunk.meta_data; | ||
const colKey = colMetadata.path_in_schema; | ||
if (columnList.length > 0 && parquet_util.fieldIndexOf(columnList, colKey) < 0) { | ||
continue; | ||
} | ||
buffer.columnData[colKey] = yield this.readColumnChunk(schema, colChunk); | ||
async readRowGroup(schema, rowGroup, columnList) { | ||
var buffer = { | ||
rowCount: +rowGroup.num_rows, | ||
columnData: {}, | ||
pageRowCount: 0, | ||
pages: {} | ||
}; | ||
for (let colChunk of rowGroup.columns) { | ||
const colMetadata = colChunk.meta_data; | ||
const colKey = colMetadata.path_in_schema; | ||
if (columnList.length > 0 && parquet_util.fieldIndexOf(columnList, colKey) < 0) { | ||
continue; | ||
} | ||
return buffer; | ||
}); | ||
buffer.columnData[colKey.join(',')] = await this.readColumnChunk(schema, colChunk); | ||
} | ||
return buffer; | ||
} | ||
readColumnChunk(schema, colChunk, opts) { | ||
let dictionary = Promise.resolve(); | ||
let field = schema.findField(colChunk.meta_data.path_in_schema); | ||
let type = parquet_util.getThriftEnum(parquet_thrift.Type, colChunk.meta_data.type); | ||
let compression = parquet_util.getThriftEnum(parquet_thrift.CompressionCodec, colChunk.meta_data.codec); | ||
let pagesOffset = +colChunk.meta_data.data_page_offset; | ||
let pagesSize = +colChunk.meta_data.total_compressed_size; | ||
async readColumnChunk(schema, colChunk, opts) { | ||
let metadata = colChunk.meta_data; | ||
let field = schema.findField(metadata.path_in_schema); | ||
let type = parquet_util.getThriftEnum(parquet_types_1.default.Type, metadata.type); | ||
let compression = parquet_util.getThriftEnum(parquet_types_1.default.CompressionCodec, metadata.codec); | ||
let pagesOffset = +metadata.data_page_offset; | ||
let pagesSize = +metadata.total_compressed_size; | ||
if (!colChunk.file_path) { | ||
pagesSize = Math.min(this.fileSize - pagesOffset, +colChunk.meta_data.total_compressed_size); | ||
pagesSize = Math.min(this.fileSize - pagesOffset, +metadata.total_compressed_size); | ||
} | ||
@@ -558,36 +600,36 @@ opts = Object.assign({}, opts, { | ||
column: field, | ||
num_values: colChunk.meta_data.num_values | ||
num_values: metadata.num_values | ||
}); | ||
if (colChunk.meta_data.dictionary_page_offset) { | ||
const offset = +colChunk.meta_data.dictionary_page_offset; | ||
if (metadata.dictionary_page_offset) { | ||
const offset = +metadata.dictionary_page_offset; | ||
const size = Math.min(+this.fileSize - offset, this.default_dictionary_size); | ||
dictionary = this.read(offset, size, colChunk.file_path).then(buffer => decodePage({ offset: 0, buffer, size: buffer.length }, opts).dictionary); | ||
await this.read(offset, size, colChunk.file_path).then(async (buffer) => { | ||
await decodePage({ offset: 0, buffer, size: buffer.length }, opts).then(dict => { | ||
opts.dictionary = opts.dictionary || dict.dictionary; | ||
}); | ||
}); | ||
} | ||
return dictionary.then(dict => { | ||
opts.dictionary = opts.dictionary || dict; | ||
return this.read(pagesOffset, pagesSize, colChunk.file_path).then(pagesBuf => decodePages(pagesBuf, opts)); | ||
}); | ||
return this.read(pagesOffset, pagesSize, colChunk.file_path).then((pagesBuf) => decodePages(pagesBuf, opts)); | ||
} | ||
readFooter() { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (typeof this.fileSize === 'function') { | ||
this.fileSize = yield this.fileSize(); | ||
} | ||
let trailerLen = PARQUET_MAGIC.length + 4; | ||
let trailerBuf = yield this.read(this.fileSize - trailerLen, trailerLen); | ||
if (trailerBuf.slice(4).toString() != PARQUET_MAGIC) { | ||
throw 'not a valid parquet file'; | ||
} | ||
let metadataSize = trailerBuf.readUInt32LE(0); | ||
let metadataOffset = this.fileSize - metadataSize - trailerLen; | ||
if (metadataOffset < PARQUET_MAGIC.length) { | ||
throw 'invalid metadata size'; | ||
} | ||
let metadataBuf = yield this.read(metadataOffset, metadataSize); | ||
let metadata = new parquet_thrift.FileMetaData(); | ||
parquet_util.decodeThrift(metadata, metadataBuf); | ||
return metadata; | ||
}); | ||
async readFooter() { | ||
if (typeof this.fileSize === 'function') { | ||
this.fileSize = await this.fileSize(); | ||
} | ||
let trailerLen = PARQUET_MAGIC.length + 4; | ||
let trailerBuf = await this.read(this.fileSize - trailerLen, trailerLen); | ||
if (trailerBuf.slice(4).toString() != PARQUET_MAGIC) { | ||
throw 'not a valid parquet file'; | ||
} | ||
let metadataSize = trailerBuf.readUInt32LE(0); | ||
let metadataOffset = this.fileSize - metadataSize - trailerLen; | ||
if (metadataOffset < PARQUET_MAGIC.length) { | ||
throw 'invalid metadata size'; | ||
} | ||
let metadataBuf = await this.read(metadataOffset, metadataSize); | ||
let metadata = new parquet_types_1.default.FileMetaData(); | ||
parquet_util.decodeThrift(metadata, metadataBuf); | ||
return metadata; | ||
} | ||
} | ||
exports.ParquetEnvelopeReader = ParquetEnvelopeReader; | ||
/** | ||
@@ -630,10 +672,10 @@ * Decode a consecutive array of data using one of the parquet encodings | ||
} | ||
function decodePage(cursor, opts) { | ||
async function decodePage(cursor, opts) { | ||
opts = opts || {}; | ||
let page; | ||
const pageHeader = new parquet_thrift.PageHeader(); | ||
const pageHeader = new types_1.NewPageHeader(); | ||
const headerOffset = cursor.offset; | ||
const headerSize = parquet_util.decodeThrift(pageHeader, cursor.buffer.slice(cursor.offset)); | ||
cursor.offset += headerSize; | ||
const pageType = parquet_util.getThriftEnum(parquet_thrift.PageType, pageHeader.type); | ||
const pageType = parquet_util.getThriftEnum(parquet_types_1.default.PageType, pageHeader.type); | ||
switch (pageType) { | ||
@@ -644,3 +686,3 @@ case 'DATA_PAGE': | ||
} | ||
page = decodeDataPage(cursor, pageHeader, opts); | ||
page = await decodeDataPage(cursor, pageHeader, opts); | ||
break; | ||
@@ -651,7 +693,8 @@ case 'DATA_PAGE_V2': | ||
} | ||
page = decodeDataPageV2(cursor, pageHeader, opts); | ||
page = await decodeDataPageV2(cursor, pageHeader, opts); | ||
break; | ||
case 'DICTIONARY_PAGE': | ||
const dict = await decodeDictionaryPage(cursor, pageHeader, opts); | ||
page = { | ||
dictionary: decodeDictionaryPage(cursor, pageHeader, opts) | ||
dictionary: dict | ||
}; | ||
@@ -667,3 +710,3 @@ break; | ||
} | ||
function decodePages(buffer, opts) { | ||
async function decodePages(buffer, opts) { | ||
opts = opts || {}; | ||
@@ -683,3 +726,3 @@ let cursor = { | ||
while (cursor.offset < cursor.size && (!opts.num_values || data.dlevels.length < opts.num_values)) { | ||
const pageData = decodePage(cursor, opts); | ||
const pageData = await decodePage(cursor, opts); | ||
if (pageData.dictionary) { | ||
@@ -689,6 +732,10 @@ opts.dictionary = pageData.dictionary; | ||
} | ||
if (opts.dictionary) { | ||
// It's possible to have a column chunk where some pages should use | ||
// the dictionary (PLAIN_DICTIONARY for example) and others should | ||
// not (PLAIN for example). | ||
if (opts.dictionary && pageData.useDictionary) { | ||
pageData.values = pageData.values.map(d => opts.dictionary[d]); | ||
} | ||
for (let i = 0; i < pageData.rlevels.length; i++) { | ||
let length = pageData.rlevels != undefined ? pageData.rlevels.length : 0; | ||
for (let i = 0; i < length; i++) { | ||
data.rlevels.push(pageData.rlevels[i]); | ||
@@ -706,3 +753,3 @@ data.dlevels.push(pageData.dlevels[i]); | ||
} | ||
function decodeDictionaryPage(cursor, header, opts) { | ||
async function decodeDictionaryPage(cursor, header, opts) { | ||
const cursorEnd = cursor.offset + header.compressed_page_size; | ||
@@ -716,3 +763,3 @@ let dictCursor = { | ||
if (opts.compression && opts.compression !== 'UNCOMPRESSED') { | ||
let valuesBuf = parquet_compression.inflate(opts.compression, dictCursor.buffer.slice(dictCursor.offset, cursorEnd)); | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, dictCursor.buffer.slice(dictCursor.offset, cursorEnd)); | ||
dictCursor = { | ||
@@ -724,12 +771,13 @@ buffer: valuesBuf, | ||
} | ||
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, header.dictionary_page_header.num_values, opts) | ||
.map(d => d.toString()); | ||
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, (header.dictionary_page_header).num_values, opts) | ||
.map((d) => d.toString()); | ||
} | ||
function decodeDataPage(cursor, header, opts) { | ||
async function decodeDataPage(cursor, header, opts) { | ||
const cursorEnd = cursor.offset + header.compressed_page_size; | ||
let valueCount = header.data_page_header.num_values; | ||
let valueEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.encoding); | ||
const dataPageHeader = header.data_page_header; | ||
let valueCount = dataPageHeader.num_values; | ||
let valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.encoding); | ||
let valuesBufCursor = cursor; | ||
if (opts.compression && opts.compression !== 'UNCOMPRESSED') { | ||
let valuesBuf = parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd)); | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd)); | ||
valuesBufCursor = { | ||
@@ -742,3 +790,3 @@ buffer: valuesBuf, | ||
/* read repetition levels */ | ||
let rLevelEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.repetition_level_encoding); | ||
let rLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.repetition_level_encoding); | ||
let rLevels = new Array(valueCount); | ||
@@ -752,3 +800,3 @@ if (opts.rLevelMax > 0) { | ||
/* read definition levels */ | ||
let dLevelEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.definition_level_encoding); | ||
let dLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.definition_level_encoding); | ||
let dLevels = new Array(valueCount); | ||
@@ -778,10 +826,12 @@ if (opts.dLevelMax > 0) { | ||
values: values, | ||
count: valueCount | ||
count: valueCount, | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' | ||
}; | ||
} | ||
function decodeDataPageV2(cursor, header, opts) { | ||
async function decodeDataPageV2(cursor, header, opts) { | ||
const cursorEnd = cursor.offset + header.compressed_page_size; | ||
const valueCount = header.data_page_header_v2.num_values; | ||
const valueCountNonNull = valueCount - header.data_page_header_v2.num_nulls; | ||
const valueEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header_v2.encoding); | ||
const dataPageHeaderV2 = header.data_page_header_v2; | ||
const valueCount = dataPageHeaderV2.num_values; | ||
const valueCountNonNull = valueCount - dataPageHeaderV2.num_nulls; | ||
const valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeaderV2.encoding); | ||
/* read repetition levels */ | ||
@@ -811,4 +861,4 @@ let rLevels = new Array(valueCount); | ||
let valuesBufCursor = cursor; | ||
if (header.data_page_header_v2.is_compressed) { | ||
let valuesBuf = parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd)); | ||
if (dataPageHeaderV2.is_compressed) { | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd)); | ||
valuesBufCursor = { | ||
@@ -829,3 +879,4 @@ buffer: valuesBuf, | ||
values: values, | ||
count: valueCount | ||
count: valueCount, | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' | ||
}; | ||
@@ -836,3 +887,3 @@ } | ||
schemaElements.forEach(schemaElement => { | ||
let repetitionType = parquet_util.getThriftEnum(parquet_thrift.FieldRepetitionType, schemaElement.repetition_type); | ||
let repetitionType = parquet_util.getThriftEnum(parquet_types_1.default.FieldRepetitionType, schemaElement.repetition_type); | ||
let optional = false; | ||
@@ -851,3 +902,3 @@ let repeated = false; | ||
; | ||
if (schemaElement.num_children > 0) { | ||
if (schemaElement.num_children != undefined && schemaElement.num_children > 0) { | ||
schema[schemaElement.name] = { | ||
@@ -872,5 +923,5 @@ optional: optional, | ||
else { | ||
let logicalType = parquet_util.getThriftEnum(parquet_thrift.Type, schemaElement.type); | ||
let logicalType = parquet_util.getThriftEnum(parquet_types_1.default.Type, schemaElement.type); | ||
if (schemaElement.converted_type != null) { | ||
logicalType = parquet_util.getThriftEnum(parquet_thrift.ConvertedType, schemaElement.converted_type); | ||
logicalType = parquet_util.getThriftEnum(parquet_types_1.default.ConvertedType, schemaElement.converted_type); | ||
} | ||
@@ -895,2 +946,1 @@ schema[schemaElement.name] = { | ||
}; | ||
//# sourceMappingURL=reader.js.map |
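The reader above now exposes a fully async API: `openUrl` accepts either a URL string or a `{ url, headers }` object, issues byte-range requests through `cross-fetch`, and `readFooter` decodes the Thrift `FileMetaData` directly. A minimal usage sketch follows; the URL is a placeholder and the import is assumed to come from the package's public exports.

// Sketch only: open a remote parquet file via HTTP range requests.
// 'https://example.com/data.parquet' is a hypothetical URL; the import path is an assumption.
const { ParquetEnvelopeReader } = require('@dsnp/parquetjs');

async function inspectRemoteFile() {
    const reader = await ParquetEnvelopeReader.openUrl('https://example.com/data.parquet');
    await reader.readHeader();                  // verifies the leading PAR1 magic bytes
    const metadata = await reader.readFooter(); // reads the trailer and decodes FileMetaData
    console.log('row groups:', metadata.row_groups.length);
}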
@@ -1,6 +0,26 @@ | ||
'use strict'; | ||
const parquet_codec = require('./codec'); | ||
const parquet_compression = require('./compression'); | ||
const parquet_types = require('./types'); | ||
const parquet_util = require('./util'); | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetSchema = void 0; | ||
const parquet_codec = __importStar(require("./codec")); | ||
const parquet_compression = __importStar(require("./compression")); | ||
const parquet_types = __importStar(require("./types")); | ||
const PARQUET_COLUMN_KEY_SEPARATOR = '.'; | ||
@@ -11,2 +31,5 @@ /** | ||
class ParquetSchema { | ||
schema; | ||
fields; | ||
fieldList; | ||
/** | ||
@@ -24,3 +47,3 @@ * Create a new schema from a JSON schema definition | ||
findField(path) { | ||
if (path.constructor !== Array) { | ||
if (typeof path === 'string') { | ||
path = path.split(","); | ||
@@ -33,3 +56,6 @@ } | ||
for (; path.length > 1; path.shift()) { | ||
n = n[path[0]].fields; | ||
let fields = n[path[0]]?.fields; | ||
if (isDefined(fields)) { | ||
n = fields; | ||
} | ||
} | ||
@@ -42,3 +68,3 @@ return n[path[0]]; | ||
findFieldBranch(path) { | ||
if (path.constructor !== Array) { | ||
if (typeof path === 'string') { | ||
path = path.split(","); | ||
@@ -50,4 +76,5 @@ } | ||
branch.push(n[path[0]]); | ||
if (path.length > 1) { | ||
n = n[path[0]].fields; | ||
let fields = n[path[0]].fields; | ||
if (path.length > 1 && isDefined(fields)) { | ||
n = fields; | ||
} | ||
@@ -58,2 +85,3 @@ } | ||
} | ||
exports.ParquetSchema = ParquetSchema; | ||
; | ||
@@ -94,3 +122,3 @@ function buildFields(schema, rLevelParentMax, dLevelParentMax, path) { | ||
name: name, | ||
path: path.concat([name]), | ||
path: path.concat(name), | ||
repetitionType: repetitionType, | ||
@@ -102,3 +130,3 @@ rLevelMax: rLevelMax, | ||
fieldCount: Object.keys(opts.fields).length, | ||
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat([name])) | ||
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name)) | ||
}; | ||
@@ -109,6 +137,5 @@ if (opts.type == 'LIST' || opts.type == 'MAP') | ||
} | ||
/* field type */ | ||
const typeDef = parquet_types.PARQUET_LOGICAL_TYPES[opts.type]; | ||
const typeDef = opts.type ? parquet_types.PARQUET_LOGICAL_TYPES[opts.type] : undefined; | ||
if (!typeDef) { | ||
throw 'invalid parquet type: ' + opts.type; | ||
throw 'invalid parquet type: ' + (opts.type || "missing type"); | ||
} | ||
@@ -120,3 +147,3 @@ /* field encoding */ | ||
if (!(opts.encoding in parquet_codec)) { | ||
throw 'unsupported parquet encoding: ' + opts.encodig; | ||
throw 'unsupported parquet encoding: ' + opts.encoding; | ||
} | ||
@@ -150,4 +177,5 @@ if (!opts.compression) { | ||
list.push(fields[k]); | ||
if (fields[k].isNested) { | ||
list = list.concat(listFields(fields[k].fields)); | ||
const nestedFields = fields[k].fields; | ||
if (fields[k].isNested && isDefined(nestedFields)) { | ||
list = list.concat(listFields(nestedFields)); | ||
} | ||
@@ -157,3 +185,4 @@ } | ||
} | ||
module.exports = { ParquetSchema }; | ||
//# sourceMappingURL=schema.js.map | ||
function isDefined(val) { | ||
return val !== undefined; | ||
} |
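With the changes above, `findField` and `findFieldBranch` accept either an array path or a comma-separated string, and nested lookups only descend when the intermediate node actually has `fields`. A small sketch of a lookup; the schema definition is illustrative only and the import path is an assumption.

// Sketch only: hypothetical schema; both path spellings should resolve to the same leaf field.
const { ParquetSchema } = require('@dsnp/parquetjs');

const schema = new ParquetSchema({
    name: { type: 'UTF8' },
    stock: {
        repeated: true,
        fields: {
            quantity: { type: 'INT64' },
            warehouse: { type: 'UTF8' },
        },
    },
});

const byArray = schema.findField(['stock', 'quantity']);
const byString = schema.findField('stock,quantity');
console.log(byArray === byString); // expected: true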
@@ -1,32 +0,29 @@ | ||
'use strict'; | ||
const parquet_types = require('./types'); | ||
const parquet_schema = require('./schema'); | ||
/** | ||
* 'Shred' a record into a list of <value, repetition_level, definition_level> | ||
* tuples per column using the Google Dremel Algorithm.. | ||
* | ||
* The buffer argument must point to an object into which the shredded record | ||
* will be returned. You may re-use the buffer for repeated calls to this function | ||
* to append to an existing buffer, as long as the schema is unchanged. | ||
* | ||
* The format in which the shredded records will be stored in the buffer is as | ||
* follows: | ||
* | ||
* buffer = { | ||
* columnData: [ | ||
* 'my_col': { | ||
* dlevels: [d1, d2, .. dN], | ||
* rlevels: [r1, r2, .. rN], | ||
* values: [v1, v2, .. vN], | ||
* }, ... | ||
* ], | ||
* rowCount: X, | ||
* } | ||
* | ||
*/ | ||
exports.shredRecord = function (schema, record, buffer) { | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.materializeRecords = exports.shredRecord = void 0; | ||
const parquet_types = __importStar(require("./types")); | ||
const shredRecord = function (schema, record, buffer) { | ||
/* shred the record, this may raise an exception */ | ||
var recordShredded = {}; | ||
for (let field of schema.fieldList) { | ||
recordShredded[field.path] = { | ||
recordShredded[field.path.join(',')] = { | ||
dlevels: [], | ||
@@ -47,3 +44,4 @@ rlevels: [], | ||
for (let field of schema.fieldList) { | ||
buffer.columnData[field.path] = { | ||
let path = field.path.join(','); | ||
buffer.columnData[path] = { | ||
dlevels: [], | ||
@@ -55,3 +53,3 @@ rlevels: [], | ||
}; | ||
buffer.pages[field.path] = []; | ||
buffer.pages[path] = []; | ||
} | ||
@@ -62,4 +60,5 @@ } | ||
for (let field of schema.fieldList) { | ||
let record = recordShredded[field.path]; | ||
let column = buffer.columnData[field.path]; | ||
let path = field.path.join(','); | ||
let record = recordShredded[path]; | ||
let column = buffer.columnData[path]; | ||
for (let i = 0; i < record.rlevels.length; i++) { | ||
@@ -72,6 +71,7 @@ column.rlevels.push(record.rlevels[i]); | ||
} | ||
[...recordShredded[field.path].distinct_values].forEach(value => buffer.columnData[field.path].distinct_values.add(value)); | ||
buffer.columnData[field.path].count += recordShredded[field.path].count; | ||
[...recordShredded[path].distinct_values].forEach(value => buffer.columnData[path].distinct_values.add(value)); | ||
buffer.columnData[path].count += recordShredded[path].count; | ||
} | ||
}; | ||
exports.shredRecord = shredRecord; | ||
function shredRecordInternal(fields, record, data, rlvl, dlvl) { | ||
@@ -81,8 +81,18 @@ for (let fieldName in fields) { | ||
const fieldType = field.originalType || field.primitiveType; | ||
const path = field.path.join(','); | ||
// fetch values | ||
let values = []; | ||
if (record && (fieldName in record) && record[fieldName] !== undefined && record[fieldName] !== null) { | ||
if (record[fieldName].constructor === Array) { | ||
if (Array.isArray(record[fieldName])) { | ||
values = record[fieldName]; | ||
} | ||
else if (ArrayBuffer.isView(record[fieldName])) { // checks if any typed array | ||
if (record[fieldName] instanceof Uint8Array) { | ||
// wrap in a buffer, since not supported by parquet_thrift | ||
values.push(Buffer.from(record[fieldName])); | ||
} | ||
else { | ||
throw Object.prototype.toString.call(record[fieldName]) + ' is not supported'; | ||
} | ||
} | ||
else { | ||
@@ -101,9 +111,9 @@ values.push(record[fieldName]); | ||
if (values.length == 0) { | ||
if (field.isNested) { | ||
if (field.isNested && isDefined(field.fields)) { | ||
shredRecordInternal(field.fields, null, data, rlvl, dlvl); | ||
} | ||
else { | ||
data[field.path].rlevels.push(rlvl); | ||
data[field.path].dlevels.push(dlvl); | ||
data[field.path].count += 1; | ||
data[path].rlevels.push(rlvl); | ||
data[path].dlevels.push(dlvl); | ||
data[path].count += 1; | ||
} | ||
@@ -115,11 +125,11 @@ continue; | ||
const rlvl_i = i === 0 ? rlvl : field.rLevelMax; | ||
if (field.isNested) { | ||
if (field.isNested && isDefined(field.fields)) { | ||
shredRecordInternal(field.fields, values[i], data, rlvl_i, field.dLevelMax); | ||
} | ||
else { | ||
data[field.path].distinct_values.add(values[i]); | ||
data[field.path].values.push(parquet_types.toPrimitive(fieldType, values[i])); | ||
data[field.path].rlevels.push(rlvl_i); | ||
data[field.path].dlevels.push(field.dLevelMax); | ||
data[field.path].count += 1; | ||
data[path].distinct_values.add(values[i]); | ||
data[path].values.push(parquet_types.toPrimitive(fieldType, values[i])); | ||
data[path].rlevels.push(rlvl_i); | ||
data[path].dlevels.push(field.dLevelMax); | ||
data[path].count += 1; | ||
} | ||
@@ -149,3 +159,3 @@ } | ||
*/ | ||
exports.materializeRecords = function (schema, buffer, records) { | ||
const materializeRecords = function (schema, buffer, records) { | ||
if (!records) { | ||
@@ -175,2 +185,3 @@ records = []; | ||
}; | ||
exports.materializeRecords = materializeRecords; | ||
function materializeRecordField(record, branch, rLevels, dLevel, value) { | ||
@@ -186,10 +197,12 @@ const node = branch[0]; | ||
} | ||
while (record[node.name].length < rLevels[0] + 1) { | ||
record[node.name].push({}); | ||
const recordValue = record[node.name]; | ||
while (recordValue.length < rLevels[0] + 1) { | ||
recordValue.push({}); | ||
} | ||
materializeRecordField(record[node.name][rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value); | ||
materializeRecordField(recordValue[rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value); | ||
} | ||
else { | ||
record[node.name] = record[node.name] || {}; | ||
materializeRecordField(record[node.name], branch.slice(1), rLevels, dLevel, value); | ||
const recordValue = record[node.name]; | ||
materializeRecordField(recordValue, branch.slice(1), rLevels, dLevel, value); | ||
} | ||
@@ -202,6 +215,7 @@ } | ||
} | ||
while (record[node.name].length < rLevels[0] + 1) { | ||
record[node.name].push(null); | ||
const recordValue = record[node.name]; | ||
while (recordValue.length < rLevels[0] + 1) { | ||
recordValue.push(null); | ||
} | ||
record[node.name][rLevels[0]] = value; | ||
recordValue[rLevels[0]] = value; | ||
} | ||
@@ -213,2 +227,4 @@ else { | ||
} | ||
//# sourceMappingURL=shred.js.map | ||
function isDefined(val) { | ||
return val !== undefined; | ||
} |
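The shredder now keys `columnData`, `pages` and the per-record scratch buffers by `field.path.join(',')`, so nested columns land under a stable string key rather than an array coerced to a string. A rough sketch of the resulting buffer shape, assuming `shredRecord` and the `schema` from the previous sketch are in scope; the values shown are illustrative.

// Sketch only: shape of the shred buffer for one nested record (values are illustrative).
const buffer = {};
shredRecord(schema, { name: 'widget', stock: [{ quantity: 10, warehouse: 'A' }] }, buffer);

// Nested column keys are comma-joined paths:
// buffer.columnData['stock,quantity'] ~ {
//     dlevels: [1], rlevels: [0], values: [...], count: 1, distinct_values: Set(1)
// }
// buffer.columnData['name'] holds the top-level column in the same shape.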
'use strict'; | ||
const BSON = require('bson'); | ||
const PARQUET_LOGICAL_TYPES = { | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.fromPrimitive = exports.toPrimitive = exports.PARQUET_LOGICAL_TYPES = void 0; | ||
const BSON = __importStar(require("bson")); | ||
exports.PARQUET_LOGICAL_TYPES = { | ||
'BOOLEAN': { | ||
@@ -142,7 +163,8 @@ primitiveType: 'BOOLEAN', | ||
function toPrimitive(type, value) { | ||
if (!(type in PARQUET_LOGICAL_TYPES)) { | ||
throw 'invalid type: ' + type; | ||
if (type === undefined || !(type in exports.PARQUET_LOGICAL_TYPES)) { | ||
throw 'invalid type: ' + (type || "undefined"); | ||
} | ||
return PARQUET_LOGICAL_TYPES[type].toPrimitive(value); | ||
return exports.PARQUET_LOGICAL_TYPES[type].toPrimitive(value); | ||
} | ||
exports.toPrimitive = toPrimitive; | ||
/** | ||
@@ -153,7 +175,8 @@ * Convert a value from it's internal/underlying primitive representation to | ||
function fromPrimitive(type, value) { | ||
if (!(type in PARQUET_LOGICAL_TYPES)) { | ||
throw 'invalid type: ' + type; | ||
if (type === undefined || !(type in exports.PARQUET_LOGICAL_TYPES)) { | ||
throw 'invalid type: ' + (type || "undefined"); | ||
} | ||
if ("fromPrimitive" in PARQUET_LOGICAL_TYPES[type]) { | ||
return PARQUET_LOGICAL_TYPES[type].fromPrimitive(value); | ||
const typeFromPrimitive = exports.PARQUET_LOGICAL_TYPES[type].fromPrimitive; | ||
if (typeFromPrimitive !== undefined) { | ||
return typeFromPrimitive(value); | ||
} | ||
@@ -164,2 +187,3 @@ else { | ||
} | ||
exports.fromPrimitive = fromPrimitive; | ||
function toPrimitive_BOOLEAN(value) { | ||
@@ -172,77 +196,128 @@ return !!value; | ||
function toPrimitive_FLOAT(value) { | ||
const v = parseFloat(value); | ||
if (isNaN(v)) { | ||
throw 'invalid value for FLOAT: ' + value; | ||
if (typeof value === 'string') { | ||
const v = parseFloat(value); | ||
return v; | ||
} | ||
return v; | ||
else if (typeof value === 'number') { | ||
return value; | ||
} | ||
throw 'invalid value for FLOAT: ' + value; | ||
} | ||
function toPrimitive_DOUBLE(value) { | ||
const v = parseFloat(value); | ||
if (isNaN(v)) { | ||
throw 'invalid value for DOUBLE: ' + value; | ||
if (typeof value === 'string') { | ||
const v = parseFloat(value); | ||
return v; | ||
} | ||
return v; | ||
else if (typeof value === 'number') { | ||
return value; | ||
} | ||
throw 'invalid value for DOUBLE: ' + value; | ||
} | ||
function toPrimitive_INT8(value) { | ||
const v = parseInt(value, 10); | ||
if (v < -0x80 || v > 0x7f || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x80, 0x7f, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT8: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_UINT8(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || v > 0xff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(0, 0xff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for UINT8: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_INT16(value) { | ||
const v = parseInt(value, 10); | ||
if (v < -0x8000 || v > 0x7fff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x8000, 0x7fff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT16: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_UINT16(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || v > 0xffff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(0, 0xffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for UINT16: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_INT32(value) { | ||
const v = parseInt(value, 10); | ||
if (v < -0x80000000 || v > 0x7fffffff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x80000000, 0x7fffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT32: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_UINT32(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || v > 0xffffffffffff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(0, 0xffffffffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for UINT32: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_INT64(value) { | ||
const v = parseInt(value, 10); | ||
if (isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x8000000000000000, 0x7fffffffffffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT64: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_UINT64(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(0, 0xffffffffffffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for UINT64: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_INT96(value) { | ||
const v = parseInt(value, 10); | ||
if (isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x800000000000000000000000, 0x7fffffffffffffffffffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT96: ' + value; | ||
} | ||
return v; | ||
} | ||
@@ -265,12 +340,13 @@ function toPrimitive_BYTE_ARRAY(value) { | ||
function toPrimitive_BSON(value) { | ||
var encoder = new BSON(); | ||
return Buffer.from(encoder.serialize(value)); | ||
return Buffer.from(BSON.serialize(value)); | ||
} | ||
function fromPrimitive_BSON(value) { | ||
var decoder = new BSON(); | ||
return decoder.deserialize(value); | ||
return BSON.deserialize(value); | ||
} | ||
function toPrimitive_TIME_MILLIS(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || v > 0xffffffffffffffff || isNaN(v)) { | ||
let v = value; | ||
if (typeof value === `string`) { | ||
v = parseInt(value, 10); | ||
} | ||
if (v < 0 || v > 0xffffffffffffffff || typeof v !== 'number') { | ||
throw 'invalid value for TIME_MILLIS: ' + value; | ||
@@ -282,3 +358,3 @@ } | ||
const v = BigInt(value); | ||
if (v < 0n || isNaN(v)) { | ||
if (v < 0n) { | ||
throw 'invalid value for TIME_MICROS: ' + value; | ||
@@ -295,9 +371,10 @@ } | ||
/* convert from integer */ | ||
{ | ||
const v = parseInt(value, 10); | ||
if (v < 0 || isNaN(v)) { | ||
throw 'invalid value for DATE: ' + value; | ||
} | ||
return v; | ||
let v = value; | ||
if (typeof value === 'string') { | ||
v = parseInt(value, 10); | ||
} | ||
if (v < 0 || typeof v !== 'number') { | ||
throw 'invalid value for DATE: ' + value; | ||
} | ||
return v; | ||
} | ||
@@ -313,9 +390,10 @@ function fromPrimitive_DATE(value) { | ||
/* convert from integer */ | ||
{ | ||
const v = parseInt(value, 10); | ||
if (v < 0 || isNaN(v)) { | ||
throw 'invalid value for TIMESTAMP_MILLIS: ' + value; | ||
} | ||
return v; | ||
let v = value; | ||
if (typeof value === 'string') { | ||
v = parseInt(value, 10); | ||
} | ||
if (v < 0 || typeof v !== 'number') { | ||
throw 'invalid value for TIMESTAMP_MILLIS: ' + value; | ||
} | ||
return v; | ||
} | ||
@@ -340,3 +418,3 @@ function fromPrimitive_TIMESTAMP_MILLIS(value) { | ||
function fromPrimitive_TIMESTAMP_MICROS(value) { | ||
return new Date(parseInt(value / 1000n)); | ||
return typeof value === 'bigint' ? new Date(Number(value / 1000n)) : new Date(value / 1000); | ||
} | ||
@@ -360,3 +438,6 @@ function toPrimitive_INTERVAL(value) { | ||
} | ||
module.exports = { PARQUET_LOGICAL_TYPES, toPrimitive, fromPrimitive }; | ||
//# sourceMappingURL=types.js.map | ||
function checkValidValue(lowerRange, upperRange, v) { | ||
if (v < lowerRange || v > upperRange) { | ||
throw "invalid value"; | ||
} | ||
} |
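The rewritten integer converters accept numbers, numeric strings or BigInts; string input goes through `BigInt()` and every value is range-checked by `checkValidValue`, so out-of-range input throws instead of silently wrapping. Expected behaviour, assuming `toPrimitive` is imported from this module:

// Sketch only: observable behaviour of the converters above.
toPrimitive('INT32', 42);     // 42   (numbers pass the range check and are returned as-is)
toPrimitive('INT32', '42');   // 42n  (strings are converted with BigInt first)
toPrimitive('UINT8', '256');  // throws 'invalid value for UINT8: 256'
toPrimitive('TIME_MICROS', '1000'); // 1000n (64-bit time values are handled as BigInt)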
@@ -132,2 +132,10 @@ "use strict"; | ||
fromPrimitive: fromPrimitive_INTERVAL | ||
}, | ||
MAP: { | ||
originalType: 'MAP', | ||
toPrimitive: toPrimitive_MAP, | ||
}, | ||
LIST: { | ||
originalType: 'LIST', | ||
toPrimitive: toPrimitive_LIST, | ||
} | ||
@@ -246,2 +254,8 @@ }; | ||
} | ||
function toPrimitive_MAP(value) { | ||
return value; | ||
} | ||
function toPrimitive_LIST(value) { | ||
return value; | ||
} | ||
function toPrimitive_BYTE_ARRAY(value) { | ||
@@ -351,2 +365,1 @@ return Buffer.from(value); | ||
} | ||
//# sourceMappingURL=parquet_primitives.js.map |
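The two added entries register MAP and LIST as logical types whose `toPrimitive` is simply the identity function, so group values pass through the shredder unchanged:

// Sketch only: MAP and LIST values are returned untouched.
toPrimitive_MAP({ a: 1 });   // { a: 1 }
toPrimitive_LIST([1, 2, 3]); // [1, 2, 3]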
"use strict"; | ||
// Lifted from https://github.com/kbajalc/parquets | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.NewPageHeader = void 0; | ||
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types")); | ||
; | ||
//# sourceMappingURL=types.js.map | ||
class NewPageHeader extends parquet_types_1.default.PageHeader { | ||
offset; | ||
headerSize; | ||
constructor() { | ||
super(); | ||
} | ||
} | ||
exports.NewPageHeader = NewPageHeader; |
@@ -1,16 +0,42 @@ | ||
'use strict'; | ||
const fs = require('fs'); | ||
const thrift = require('thrift'); | ||
const parquet_thrift = require('../gen-nodejs/parquet_types'); | ||
/** We need to use a patched version of TFramedTransport where | ||
* readString returns the original buffer instead of a string if the | ||
* buffer can not be safely encoded as utf8 (see http://bit.ly/2GXeZEF) | ||
*/ | ||
class fixedTFramedTransport extends thrift.TFramedTransport { | ||
readString(len) { | ||
this.ensureAvailable(len); | ||
var buffer = this.inBuf.slice(this.readPos, this.readPos + len); | ||
var str = this.inBuf.toString('utf8', this.readPos, this.readPos + len); | ||
this.readPos += len; | ||
return (Buffer.from(str).equals(buffer)) ? str : buffer; | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.cloneInteger = exports.fieldIndexOf = exports.osopen = exports.osend = exports.oswrite = exports.fclose = exports.fread = exports.fstat = exports.fopen = exports.getThriftEnum = exports.getBitWidth = exports.decodeThrift = exports.serializeThrift = void 0; | ||
const thrift_1 = __importDefault(require("thrift")); | ||
const fs_1 = __importDefault(require("fs")); | ||
const parquet_thrift = __importStar(require("../gen-nodejs/parquet_types")); | ||
const thrift_2 = require("thrift"); | ||
/** | ||
* We need to patch Thrift's TFramedTransport class because the TS type definitions | ||
* do not define a `readPos` field, even though the class implementation has | ||
* one. | ||
*/ | ||
class fixedTFramedTransport extends thrift_1.default.TFramedTransport { | ||
inBuf; | ||
readPos; | ||
constructor(inBuf) { | ||
super(inBuf); | ||
this.inBuf = inBuf; | ||
this.readPos = 0; | ||
} | ||
@@ -23,40 +49,20 @@ } | ||
*/ | ||
const previousPageLocation = parquet_thrift.PageLocation.prototype; | ||
const PageLocation = parquet_thrift.PageLocation.prototype = []; | ||
PageLocation.write = previousPageLocation.write; | ||
PageLocation.read = previousPageLocation.read; | ||
const getterSetter = index => ({ | ||
const getterSetter = (index) => ({ | ||
get: function () { return this[index]; }, | ||
set: function (value) { return this[index] = value; } | ||
}); | ||
Object.defineProperty(PageLocation, 'offset', getterSetter(0)); | ||
Object.defineProperty(PageLocation, 'compressed_page_size', getterSetter(1)); | ||
Object.defineProperty(PageLocation, 'first_row_index', getterSetter(2)); | ||
exports.force32 = function () { | ||
const protocol = thrift.TCompactProtocol.prototype; | ||
protocol.zigzagToI64 = protocol.zigzagToI32; | ||
protocol.readVarint64 = protocol.readVarint32 = function () { | ||
let lo = 0; | ||
let shift = 0; | ||
let b; | ||
while (true) { | ||
b = this.trans.readByte(); | ||
lo = lo | ((b & 0x7f) << shift); | ||
shift += 7; | ||
if (!(b & 0x80)) { | ||
break; | ||
} | ||
} | ||
return lo; | ||
}; | ||
}; | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'offset', getterSetter(0)); | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'compressed_page_size', getterSetter(1)); | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'first_row_index', getterSetter(2)); | ||
/** | ||
* Helper function that serializes a thrift object into a buffer | ||
*/ | ||
exports.serializeThrift = function (obj) { | ||
const serializeThrift = function (obj) { | ||
let output = []; | ||
let transport = new thrift.TBufferedTransport(null, function (buf) { | ||
const callBack = function (buf) { | ||
output.push(buf); | ||
}); | ||
let protocol = new thrift.TCompactProtocol(transport); | ||
}; | ||
let transport = new thrift_1.default.TBufferedTransport(undefined, callBack); | ||
let protocol = new thrift_1.default.TCompactProtocol(transport); | ||
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872 | ||
obj.write(protocol); | ||
@@ -66,3 +72,4 @@ transport.flush(); | ||
}; | ||
exports.decodeThrift = function (obj, buf, offset) { | ||
exports.serializeThrift = serializeThrift; | ||
const decodeThrift = function (obj, buf, offset) { | ||
if (!offset) { | ||
@@ -73,10 +80,12 @@ offset = 0; | ||
transport.readPos = offset; | ||
var protocol = new thrift.TCompactProtocol(transport); | ||
var protocol = new thrift_1.default.TCompactProtocol(transport); | ||
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872 | ||
obj.read(protocol); | ||
return transport.readPos - offset; | ||
}; | ||
exports.decodeThrift = decodeThrift; | ||
/** | ||
* Get the number of bits required to store a given value | ||
*/ | ||
exports.getBitWidth = function (val) { | ||
const getBitWidth = function (val) { | ||
if (val === 0) { | ||
@@ -89,6 +98,7 @@ return 0; | ||
}; | ||
exports.getBitWidth = getBitWidth; | ||
/** | ||
* FIXME not ideal that this is linear | ||
*/ | ||
exports.getThriftEnum = function (klass, value) { | ||
const getThriftEnum = function (klass, value) { | ||
for (let k in klass) { | ||
@@ -101,5 +111,6 @@ if (klass[k] === value) { | ||
}; | ||
exports.fopen = function (filePath) { | ||
exports.getThriftEnum = getThriftEnum; | ||
const fopen = function (filePath) { | ||
return new Promise((resolve, reject) => { | ||
fs.open(filePath, 'r', (err, fd) => { | ||
fs_1.default.open(filePath, 'r', (err, fd) => { | ||
if (err) { | ||
@@ -114,5 +125,6 @@ reject(err); | ||
}; | ||
exports.fstat = function (filePath) { | ||
exports.fopen = fopen; | ||
const fstat = function (filePath) { | ||
return new Promise((resolve, reject) => { | ||
fs.stat(filePath, (err, stat) => { | ||
fs_1.default.stat(filePath, (err, stat) => { | ||
if (err) { | ||
@@ -127,6 +139,7 @@ reject(err); | ||
}; | ||
exports.fread = function (fd, position, length) { | ||
exports.fstat = fstat; | ||
const fread = function (fd, position, length) { | ||
let buffer = Buffer.alloc(length); | ||
return new Promise((resolve, reject) => { | ||
fs.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => { | ||
fs_1.default.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => { | ||
if (err || bytesRead != length) { | ||
@@ -141,5 +154,6 @@ reject(err || Error('read failed')); | ||
}; | ||
exports.fclose = function (fd) { | ||
exports.fread = fread; | ||
const fclose = function (fd) { | ||
return new Promise((resolve, reject) => { | ||
fs.close(fd, (err) => { | ||
fs_1.default.close(fd, (err) => { | ||
if (err) { | ||
@@ -154,3 +168,4 @@ reject(err); | ||
}; | ||
exports.oswrite = function (os, buf) { | ||
exports.fclose = fclose; | ||
const oswrite = function (os, buf) { | ||
return new Promise((resolve, reject) => { | ||
@@ -162,3 +177,3 @@ os.write(buf, (err) => { | ||
else { | ||
resolve(); | ||
resolve(err); | ||
} | ||
@@ -168,3 +183,4 @@ }); | ||
}; | ||
exports.osend = function (os) { | ||
exports.oswrite = oswrite; | ||
const osend = function (os) { | ||
return new Promise((resolve, reject) => { | ||
@@ -176,3 +192,3 @@ os.end((err) => { | ||
else { | ||
resolve(); | ||
resolve(err); | ||
} | ||
@@ -182,5 +198,6 @@ }); | ||
}; | ||
exports.osopen = function (path, opts) { | ||
exports.osend = osend; | ||
const osopen = function (path, opts) { | ||
return new Promise((resolve, reject) => { | ||
let outputStream = fs.createWriteStream(path, opts); | ||
let outputStream = fs_1.default.createWriteStream(path, opts); | ||
outputStream.on('open', function (fd) { | ||
@@ -194,3 +211,4 @@ resolve(outputStream); | ||
}; | ||
exports.fieldIndexOf = function (arr, elem) { | ||
exports.osopen = osopen; | ||
const fieldIndexOf = function (arr, elem) { | ||
for (let j = 0; j < arr.length; ++j) { | ||
@@ -213,2 +231,6 @@ if (arr[j].length !== elem.length) { | ||
}; | ||
//# sourceMappingURL=util.js.map | ||
exports.fieldIndexOf = fieldIndexOf; | ||
const cloneInteger = (int) => { | ||
return new thrift_2.Int64(int.valueOf()); | ||
}; | ||
exports.cloneInteger = cloneInteger; |
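`cloneInteger` exists because Thrift `Int64` offsets are mutable objects: the writer advances `this.offset` with `setValue`, so recording the live object by reference would retroactively corrupt previously written positions. A small sketch of the intended use, assuming `cloneInteger` is imported from this module:

// Sketch only: clone before recording, then keep mutating the live offset.
const { Int64 } = require('thrift');

const offset = new Int64(100);
const recorded = cloneInteger(offset);   // independent copy via new Int64(offset.valueOf())
offset.setValue(offset.valueOf() + 64);  // advance the live write offset

console.log(offset.valueOf());   // 164
console.log(recorded.valueOf()); // still 100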
@@ -1,20 +0,35 @@ | ||
'use strict'; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
const stream = require('stream'); | ||
const parquet_thrift = require('../gen-nodejs/parquet_types'); | ||
const parquet_shredder = require('./shred'); | ||
const parquet_util = require('./util'); | ||
const parquet_codec = require('./codec'); | ||
const parquet_compression = require('./compression'); | ||
const parquet_types = require('./types'); | ||
const bloomFilterWriter = require("./bloomFilterIO/bloomFilterWriter"); | ||
const Long = require('long'); | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetTransformer = exports.ParquetEnvelopeWriter = exports.ParquetWriter = void 0; | ||
const stream_1 = __importDefault(require("stream")); | ||
const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types")); | ||
const parquet_shredder = __importStar(require("./shred")); | ||
const parquet_util = __importStar(require("./util")); | ||
const parquet_codec = __importStar(require("./codec")); | ||
const parquet_compression = __importStar(require("./compression")); | ||
const parquet_types = __importStar(require("./types")); | ||
const bloomFilterWriter = __importStar(require("./bloomFilterIO/bloomFilterWriter")); | ||
const node_int64_1 = __importDefault(require("node-int64")); | ||
/** | ||
@@ -44,2 +59,8 @@ * Parquet File Magic String | ||
class ParquetWriter { | ||
schema; | ||
envelopeWriter; | ||
rowBuffer; | ||
rowGroupSize; | ||
closed; | ||
userMetadata; | ||
/** | ||
@@ -49,7 +70,5 @@ * Convenience method to create a new buffered parquet writer that writes to | ||
*/ | ||
static openFile(schema, path, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let outputStream = yield parquet_util.osopen(path, opts); | ||
return ParquetWriter.openStream(schema, outputStream, opts); | ||
}); | ||
static async openFile(schema, path, opts) { | ||
let outputStream = await parquet_util.osopen(path, opts); | ||
return ParquetWriter.openStream(schema, outputStream, opts); | ||
} | ||
@@ -60,10 +79,8 @@ /** | ||
*/ | ||
static openStream(schema, outputStream, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!opts) { | ||
opts = {}; | ||
} | ||
let envelopeWriter = yield ParquetEnvelopeWriter.openStream(schema, outputStream, opts); | ||
return new ParquetWriter(schema, envelopeWriter, opts); | ||
}); | ||
static async openStream(schema, outputStream, opts) { | ||
if (!opts) { | ||
opts = {}; | ||
} | ||
let envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts); | ||
return new ParquetWriter(schema, envelopeWriter, opts); | ||
} | ||
@@ -92,21 +109,19 @@ /** | ||
*/ | ||
appendRow(row) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (this.closed) { | ||
throw 'writer was closed'; | ||
} | ||
parquet_shredder.shredRecord(this.schema, row, this.rowBuffer); | ||
const options = { | ||
useDataPageV2: this.envelopeWriter.useDataPageV2, | ||
bloomFilters: this.envelopeWriter.bloomFilters | ||
}; | ||
if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) { | ||
encodePages(this.schema, this.rowBuffer, options); | ||
} | ||
if (this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
encodePages(this.schema, this.rowBuffer, options); | ||
yield this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
}); | ||
async appendRow(row) { | ||
if (this.closed || this.envelopeWriter === null) { | ||
throw 'writer was closed'; | ||
} | ||
parquet_shredder.shredRecord(this.schema, row, this.rowBuffer); | ||
const options = { | ||
useDataPageV2: this.envelopeWriter.useDataPageV2, | ||
bloomFilters: this.envelopeWriter.bloomFilters | ||
}; | ||
if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) { | ||
await encodePages(this.schema, this.rowBuffer, options); | ||
} | ||
if (this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
await encodePages(this.schema, this.rowBuffer, options); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
} | ||
@@ -119,22 +134,22 @@ /** | ||
*/ | ||
close(callback) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (this.closed) { | ||
throw 'writer was closed'; | ||
} | ||
this.closed = true; | ||
async close(callback) { | ||
if (this.closed) { | ||
throw 'writer was closed'; | ||
} | ||
this.closed = true; | ||
if (this.envelopeWriter) { | ||
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
yield this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
yield this.envelopeWriter.writeBloomFilters(); | ||
yield this.envelopeWriter.writeIndex(); | ||
yield this.envelopeWriter.writeFooter(this.userMetadata); | ||
yield this.envelopeWriter.close(); | ||
await this.envelopeWriter.writeBloomFilters(); | ||
await this.envelopeWriter.writeIndex(); | ||
await this.envelopeWriter.writeFooter(this.userMetadata); | ||
await this.envelopeWriter.close(); | ||
this.envelopeWriter = null; | ||
if (callback) { | ||
callback(); | ||
} | ||
}); | ||
} | ||
if (callback) { | ||
callback(); | ||
} | ||
} | ||
@@ -161,21 +176,30 @@ /** | ||
setPageSize(cnt) { | ||
this.writer.setPageSize(cnt); | ||
this.envelopeWriter.setPageSize(cnt); | ||
} | ||
} | ||
exports.ParquetWriter = ParquetWriter; | ||
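The writer methods above are now plain `async` functions, so page encoding, row-group flushes and the final footer write can simply be awaited. A minimal usage sketch; the file name and row contents are placeholders, and `schema` is assumed to be a `ParquetSchema` like the one sketched earlier.

// Sketch only: buffered writing with the async API shown above.
async function writeExample(schema) {
    const writer = await ParquetWriter.openFile(schema, '/tmp/example.parquet');
    await writer.appendRow({ name: 'widget', stock: [{ quantity: 5, warehouse: 'A' }] });
    await writer.close(); // flushes the last row group, bloom filters, index and footer
}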
/** | ||
* Create a parquet file from a schema and a number of row groups. This class | ||
* performs direct, unbuffered writes to the underlying output stream and is | ||
* intendend for advanced and internal users; the writeXXX methods must be | ||
* intended for advanced and internal users; the writeXXX methods must be | ||
* called in the correct order to produce a valid file. | ||
*/ | ||
class ParquetEnvelopeWriter { | ||
schema; | ||
write; | ||
close; | ||
offset; | ||
rowCount; | ||
rowGroups; | ||
pageSize; | ||
useDataPageV2; | ||
pageIndex; | ||
bloomFilters; // TODO: OR filterCollection | ||
/** | ||
* Create a new parquet envelope writer that writes to the specified stream | ||
*/ | ||
static openStream(schema, outputStream, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let writeFn = parquet_util.oswrite.bind(undefined, outputStream); | ||
let closeFn = parquet_util.osend.bind(undefined, outputStream); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts); | ||
}); | ||
static async openStream(schema, outputStream, opts) { | ||
let writeFn = parquet_util.oswrite.bind(undefined, outputStream); | ||
let closeFn = parquet_util.osend.bind(undefined, outputStream); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, new node_int64_1.default(0), opts); | ||
} | ||
@@ -187,3 +211,3 @@ constructor(schema, writeFn, closeFn, fileOffset, opts) { | ||
this.offset = fileOffset; | ||
this.rowCount = 0; | ||
this.rowCount = new node_int64_1.default(0); | ||
this.rowGroups = []; | ||
@@ -199,3 +223,3 @@ this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE; | ||
writeSection(buf) { | ||
this.offset += buf.length; | ||
this.offset.setValue(this.offset.valueOf() + buf.length); | ||
return this.write(buf); | ||
@@ -213,4 +237,4 @@ } | ||
*/ | ||
writeRowGroup(records) { | ||
let rgroup = encodeRowGroup(this.schema, records, { | ||
async writeRowGroup(records) { | ||
let rgroup = await encodeRowGroup(this.schema, records, { | ||
baseOffset: this.offset, | ||
@@ -221,12 +245,11 @@ pageSize: this.pageSize, | ||
}); | ||
this.rowCount += records.rowCount; | ||
this.rowCount.setValue(this.rowCount.valueOf() + records.rowCount); | ||
this.rowGroups.push(rgroup.metadata); | ||
return this.writeSection(rgroup.body); | ||
} | ||
writeBloomFilters(_rowGroups) { | ||
let rowGroups = _rowGroups || this.rowGroups; | ||
rowGroups.forEach(group => { | ||
writeBloomFilters() { | ||
this.rowGroups.forEach(group => { | ||
group.columns.forEach(column => { | ||
const columnName = column.meta_data.path_in_schema[0]; | ||
if (columnName in this.bloomFilters === false) | ||
const columnName = column.meta_data?.path_in_schema[0]; | ||
if (!columnName || columnName in this.bloomFilters === false) | ||
return; | ||
@@ -242,20 +265,19 @@ const serializedBloomFilterData = bloomFilterWriter.getSerializedBloomFilterData(this.bloomFilters[columnName]); | ||
*/ | ||
writeIndex(_rowGroups) { | ||
let rowGroups = _rowGroups || this.rowGroups; | ||
writeIndex() { | ||
this.schema.fieldList.forEach((c, i) => { | ||
rowGroups.forEach(group => { | ||
this.rowGroups.forEach(group => { | ||
let column = group.columns[i]; | ||
if (!column) | ||
return; | ||
if (column.meta_data.columnIndex) { | ||
if (column.meta_data?.columnIndex) { | ||
let columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex); | ||
delete column.meta_data.columnIndex; | ||
column.column_index_offset = this.offset; | ||
column.column_index_offset = parquet_util.cloneInteger(this.offset); | ||
column.column_index_length = columnBody.length; | ||
this.writeSection(columnBody); | ||
} | ||
if (column.meta_data.offsetIndex) { | ||
if (column.meta_data?.offsetIndex) { | ||
let offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex); | ||
delete column.meta_data.offsetIndex; | ||
column.offset_index_offset = this.offset; | ||
column.offset_index_offset = parquet_util.cloneInteger(this.offset); | ||
column.offset_index_length = offsetBody.length; | ||
@@ -270,3 +292,3 @@ this.writeSection(offsetBody); | ||
*/ | ||
writeFooter(userMetadata, schema, rowCount, rowGroups) { | ||
writeFooter(userMetadata) { | ||
if (!userMetadata) { | ||
@@ -289,6 +311,8 @@ userMetadata = {}; | ||
} | ||
exports.ParquetEnvelopeWriter = ParquetEnvelopeWriter; | ||
/** | ||
* Create a parquet transform stream | ||
*/ | ||
class ParquetTransformer extends stream.Transform { | ||
class ParquetTransformer extends stream_1.default.Transform { | ||
writer; | ||
constructor(schema, opts = {}) { | ||
@@ -301,9 +325,9 @@ super({ objectMode: true }); | ||
})(this); | ||
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, 0, opts), opts); | ||
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, new node_int64_1.default(0), opts), opts); | ||
} | ||
_transform(row, encoding, callback) { | ||
_transform(row, _encoding, callback) { | ||
if (row) { | ||
this.writer.appendRow(row).then(data => callback(null, data), err => { | ||
const fullErr = new Error(`Error transforming to parquet: ${err.toString()} row:${row}`); | ||
fullErr.origErr = err; | ||
fullErr.message = err; | ||
callback(fullErr); | ||
@@ -317,6 +341,7 @@ }); | ||
_flush(callback) { | ||
this.writer.close(callback) | ||
this.writer.close() | ||
.then(d => callback(null, d), callback); | ||
} | ||
} | ||
exports.ParquetTransformer = ParquetTransformer; | ||
/** | ||
@@ -349,5 +374,5 @@ * Encode a consecutive array of data using one of the parquet encodings | ||
statistics.min = statistics.min_value; | ||
return new parquet_thrift.Statistics(statistics); | ||
return new parquet_types_1.default.Statistics(statistics); | ||
} | ||
function encodePages(schema, rowBuffer, opts) { | ||
async function encodePages(schema, rowBuffer, opts) { | ||
if (!rowBuffer.pageRowCount) { | ||
@@ -361,3 +386,3 @@ return; | ||
let page; | ||
const values = rowBuffer.columnData[field.path]; | ||
const values = rowBuffer.columnData[field.path.join(',')]; | ||
if (opts.bloomFilters && (field.name in opts.bloomFilters)) { | ||
@@ -367,3 +392,3 @@ const splitBlockBloomFilter = opts.bloomFilters[field.name]; | ||
} | ||
let statistics; | ||
let statistics = {}; | ||
if (field.statistics !== false) { | ||
@@ -379,12 +404,12 @@ statistics = {}; | ||
}); | ||
statistics.null_count = values.dlevels.length - values.values.length; | ||
statistics.distinct_count = values.distinct_values.size; | ||
statistics.null_count = new node_int64_1.default(values.dlevels.length - values.values.length); | ||
statistics.distinct_count = new node_int64_1.default(values.distinct_values.size); | ||
} | ||
if (opts.useDataPageV2) { | ||
page = encodeDataPageV2(field, values.count, values.values, values.rlevels, values.dlevels, statistics); | ||
page = await encodeDataPageV2(field, values.count, values.values, values.rlevels, values.dlevels, statistics); | ||
} | ||
else { | ||
page = encodeDataPage(field, values.values, values.rlevels, values.dlevels, statistics); | ||
page = await encodeDataPage(field, values.values || [], values.rlevels || [], values.dlevels || [], statistics); | ||
} | ||
let pages = rowBuffer.pages[field.path]; | ||
let pages = rowBuffer.pages[field.path.join(',')]; | ||
let lastPage = pages[pages.length - 1]; | ||
@@ -410,3 +435,3 @@ let first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0; | ||
*/ | ||
function encodeDataPage(column, values, rlevels, dlevels, statistics) { | ||
async function encodeDataPage(column, values, rlevels, dlevels, statistics) { | ||
/* encode values */ | ||
@@ -428,8 +453,8 @@ let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
let pageBody = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]); | ||
pageBody = parquet_compression.deflate(column.compression, pageBody); | ||
let pageHeader = new parquet_thrift.PageHeader(); | ||
pageHeader.type = parquet_thrift.PageType['DATA_PAGE']; | ||
pageBody = await parquet_compression.deflate(column.compression, pageBody); | ||
let pageHeader = new parquet_types_1.default.PageHeader(); | ||
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE']; | ||
pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; | ||
pageHeader.compressed_page_size = pageBody.length; | ||
pageHeader.data_page_header = new parquet_thrift.DataPageHeader(); | ||
pageHeader.data_page_header = new parquet_types_1.default.DataPageHeader(); | ||
pageHeader.data_page_header.num_values = dlevels.length; | ||
@@ -439,7 +464,7 @@ if (column.statistics !== false) { | ||
} | ||
pageHeader.data_page_header.encoding = parquet_thrift.Encoding[column.encoding]; | ||
pageHeader.data_page_header.encoding = parquet_types_1.default.Encoding[column.encoding]; | ||
pageHeader.data_page_header.definition_level_encoding = | ||
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING]; | ||
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
pageHeader.data_page_header.repetition_level_encoding = | ||
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING]; | ||
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
/* concat page header, repetition and definition levels and values */ | ||
@@ -451,3 +476,3 @@ return Buffer.concat([parquet_util.serializeThrift(pageHeader), pageBody]); | ||
*/ | ||
function encodeDataPageV2(column, rowCount, values, rlevels, dlevels, statistics) { | ||
async function encodeDataPageV2(column, rowCount, values, rlevels, dlevels, statistics) { | ||
/* encode values */ | ||
@@ -458,3 +483,3 @@ let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
}); | ||
let valuesBufCompressed = parquet_compression.deflate(column.compression, valuesBuf); | ||
let valuesBufCompressed = await parquet_compression.deflate(column.compression, valuesBuf); | ||
/* encode repetition and definition levels */ | ||
@@ -476,5 +501,5 @@ let rLevelsBuf = Buffer.alloc(0); | ||
/* build page header */ | ||
let pageHeader = new parquet_thrift.PageHeader(); | ||
pageHeader.type = parquet_thrift.PageType['DATA_PAGE_V2']; | ||
pageHeader.data_page_header_v2 = new parquet_thrift.DataPageHeaderV2(); | ||
let pageHeader = new parquet_types_1.default.PageHeader(); | ||
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE_V2']; | ||
pageHeader.data_page_header_v2 = new parquet_types_1.default.DataPageHeaderV2(); | ||
pageHeader.data_page_header_v2.num_values = dlevels.length; | ||
@@ -490,3 +515,3 @@ pageHeader.data_page_header_v2.num_nulls = dlevels.length - values.length; | ||
rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length; | ||
pageHeader.data_page_header_v2.encoding = parquet_thrift.Encoding[column.encoding]; | ||
pageHeader.data_page_header_v2.encoding = parquet_types_1.default.Encoding[column.encoding]; | ||
pageHeader.data_page_header_v2.definition_levels_byte_length = dLevelsBuf.length; | ||
@@ -507,3 +532,3 @@ pageHeader.data_page_header_v2.repetition_levels_byte_length = rLevelsBuf.length; | ||
*/ | ||
function encodeColumnChunk(pages, opts) { | ||
async function encodeColumnChunk(pages, opts) { | ||
let pagesBuf = Buffer.concat(pages.map(d => d.page)); | ||
@@ -513,16 +538,16 @@ let num_values = pages.reduce((p, d) => p + d.num_values, 0); | ||
/* prepare metadata header */ | ||
let metadata = new parquet_thrift.ColumnMetaData(); | ||
let metadata = new parquet_types_1.default.ColumnMetaData(); | ||
metadata.path_in_schema = opts.column.path; | ||
metadata.num_values = num_values; | ||
metadata.data_page_offset = opts.baseOffset; | ||
metadata.num_values = new node_int64_1.default(num_values); | ||
metadata.data_page_offset = new node_int64_1.default(opts.baseOffset); | ||
metadata.encodings = []; | ||
metadata.total_uncompressed_size = pagesBuf.length; | ||
metadata.total_compressed_size = pagesBuf.length; | ||
metadata.type = parquet_thrift.Type[opts.column.primitiveType]; | ||
metadata.codec = parquet_thrift.CompressionCodec[opts.column.compression]; | ||
metadata.total_uncompressed_size = new node_int64_1.default(pagesBuf.length); | ||
metadata.total_compressed_size = new node_int64_1.default(pagesBuf.length); | ||
metadata.type = parquet_types_1.default.Type[opts.column.primitiveType]; | ||
metadata.codec = await parquet_types_1.default.CompressionCodec[opts.column.compression]; | ||
/* compile statistics ColumnIndex and OffsetIndex*/ | ||
let columnIndex = new parquet_thrift.ColumnIndex(); | ||
let columnIndex = new parquet_types_1.default.ColumnIndex(); | ||
columnIndex.max_values = []; | ||
columnIndex.min_values = []; | ||
let offsetIndex = new parquet_thrift.OffsetIndex(); | ||
let offsetIndex = new parquet_types_1.default.OffsetIndex(); | ||
offsetIndex.page_locations = []; | ||
@@ -532,7 +557,7 @@ /* prepare statistics */ | ||
let distinct_values = new Set(); | ||
statistics.null_count = 0; | ||
statistics.distinct_count = 0; | ||
statistics.null_count = new node_int64_1.default(0); | ||
statistics.distinct_count = new node_int64_1.default(0); | ||
/* loop through pages and update indices and statistics */ | ||
for (let i = 0; i < pages.length; i++) { | ||
let page = pages[i]; | ||
const page = pages[i]; | ||
if (opts.column.statistics !== false) { | ||
@@ -545,12 +570,12 @@ if (page.statistics.max_value > statistics.max_value || i == 0) { | ||
} | ||
statistics.null_count += page.statistics.null_count; | ||
page.distinct_values.forEach(value => distinct_values.add(value)); | ||
statistics.null_count.setValue(statistics.null_count.valueOf() + (page.statistics.null_count?.valueOf() || 0)); | ||
page.distinct_values.forEach((value) => distinct_values.add(value)); | ||
columnIndex.max_values.push(encodeStatisticsValue(page.statistics.max_value, opts.column)); | ||
columnIndex.min_values.push(encodeStatisticsValue(page.statistics.min_value, opts.column)); | ||
} | ||
let pageLocation = new parquet_thrift.PageLocation(); | ||
pageLocation.offset = offset; | ||
let pageLocation = new parquet_types_1.default.PageLocation(); | ||
pageLocation.offset = new node_int64_1.default(offset); | ||
offset += page.page.length; | ||
pageLocation.compressed_page_size = page.page.length; | ||
pageLocation.first_row_index = page.first_row_index; | ||
pageLocation.first_row_index = new node_int64_1.default(page.first_row_index); | ||
offsetIndex.page_locations.push(pageLocation); | ||
@@ -562,3 +587,3 @@ } | ||
if (opts.column.statistics !== false) { | ||
statistics.distinct_count = distinct_values.size; | ||
statistics.distinct_count = new node_int64_1.default(distinct_values.size); | ||
metadata.statistics = encodeStatistics(statistics, opts.column); | ||
@@ -570,8 +595,4 @@ if (opts.pageIndex !== false) { | ||
/* list encodings */ | ||
let encodingsSet = {}; | ||
encodingsSet[PARQUET_RDLVL_ENCODING] = true; | ||
encodingsSet[opts.column.encoding] = true; | ||
for (let k in encodingsSet) { | ||
metadata.encodings.push(parquet_thrift.Encoding[k]); | ||
} | ||
metadata.encodings.push(parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]); | ||
metadata.encodings.push(parquet_types_1.default.Encoding[opts.column.encoding]); | ||
/* concat metadata header and data pages */ | ||
@@ -585,7 +606,7 @@ let metadataOffset = opts.baseOffset + pagesBuf.length; | ||
*/ | ||
function encodeRowGroup(schema, data, opts) { | ||
let metadata = new parquet_thrift.RowGroup(); | ||
metadata.num_rows = data.rowCount; | ||
async function encodeRowGroup(schema, data, opts) { | ||
let metadata = new parquet_types_1.default.RowGroup(); | ||
metadata.num_rows = new node_int64_1.default(data.rowCount); | ||
metadata.columns = []; | ||
metadata.total_byte_size = 0; | ||
metadata.total_byte_size = new node_int64_1.default(0); | ||
let body = Buffer.alloc(0); | ||
@@ -596,16 +617,15 @@ for (let field of schema.fieldList) { | ||
} | ||
let cchunkData = encodeColumnChunk(data.pages[field.path], { | ||
let cchunkData = await encodeColumnChunk(data.pages[field.path.join(',')], { | ||
column: field, | ||
baseOffset: opts.baseOffset + body.length, | ||
pageSize: opts.pageSize, | ||
encoding: field.encoding, | ||
rowCount: data.rowCount, | ||
useDataPageV2: opts.useDataPageV2, | ||
pageIndex: opts.pageIndex | ||
baseOffset: opts.baseOffset.valueOf() + body.length, | ||
pageSize: opts.pageSize || 0, | ||
rowCount: data.rowCount || 0, | ||
useDataPageV2: opts.useDataPageV2 ?? true, | ||
pageIndex: opts.pageIndex ?? true | ||
}); | ||
let cchunk = new parquet_thrift.ColumnChunk(); | ||
cchunk.file_offset = cchunkData.metadataOffset; | ||
let cchunk = new parquet_types_1.default.ColumnChunk(); | ||
cchunk.file_offset = new node_int64_1.default(cchunkData.metadataOffset); | ||
cchunk.meta_data = cchunkData.metadata; | ||
metadata.columns.push(cchunk); | ||
metadata.total_byte_size += cchunkData.body.length; | ||
metadata.total_byte_size = new node_int64_1.default(metadata.total_byte_size.valueOf() + (cchunkData.body.length)); | ||
body = Buffer.concat([body, cchunkData.body]); | ||
@@ -619,5 +639,5 @@ } | ||
function encodeFooter(schema, rowCount, rowGroups, userMetadata) { | ||
let metadata = new parquet_thrift.FileMetaData(); | ||
let metadata = new parquet_types_1.default.FileMetaData(); | ||
metadata.version = PARQUET_VERSION; | ||
metadata.created_by = 'parquet.js'; | ||
metadata.created_by = '@dsnp/parquetjs'; | ||
metadata.num_rows = rowCount; | ||
@@ -628,3 +648,3 @@ metadata.row_groups = rowGroups; | ||
for (let k in userMetadata) { | ||
let kv = new parquet_thrift.KeyValue(); | ||
let kv = new parquet_types_1.default.KeyValue(); | ||
kv.key = k; | ||
@@ -635,3 +655,3 @@ kv.value = userMetadata[k]; | ||
{ | ||
let schemaRoot = new parquet_thrift.SchemaElement(); | ||
let schemaRoot = new parquet_types_1.default.SchemaElement(); | ||
schemaRoot.name = 'root'; | ||
@@ -642,5 +662,5 @@ schemaRoot.num_children = Object.keys(schema.fields).length; | ||
for (let field of schema.fieldList) { | ||
let schemaElem = new parquet_thrift.SchemaElement(); | ||
let schemaElem = new parquet_types_1.default.SchemaElement(); | ||
schemaElem.name = field.name; | ||
schemaElem.repetition_type = parquet_thrift.FieldRepetitionType[field.repetitionType]; | ||
schemaElem.repetition_type = parquet_types_1.default.FieldRepetitionType[field.repetitionType]; | ||
if (field.isNested) { | ||
@@ -650,6 +670,6 @@ schemaElem.num_children = field.fieldCount; | ||
else { | ||
schemaElem.type = parquet_thrift.Type[field.primitiveType]; | ||
schemaElem.type = parquet_types_1.default.Type[field.primitiveType]; | ||
} | ||
if (field.originalType) { | ||
schemaElem.converted_type = parquet_thrift.ConvertedType[field.originalType]; | ||
schemaElem.converted_type = parquet_types_1.default.ConvertedType[field.originalType]; | ||
} | ||
@@ -671,2 +691,1 @@ schemaElem.type_length = field.typeLength; | ||
}; | ||
//# sourceMappingURL=writer.js.map |
"use strict"; | ||
const reader = require('./lib/reader'); | ||
const writer = require('./lib/writer'); | ||
const schema = require('./lib/schema'); | ||
const shredder = require('./lib/shred'); | ||
const util = require('./lib/util'); | ||
module.exports = { | ||
ParquetEnvelopeReader: reader.ParquetEnvelopeReader, | ||
ParquetReader: reader.ParquetReader, | ||
ParquetEnvelopeWriter: writer.ParquetEnvelopeWriter, | ||
ParquetWriter: writer.ParquetWriter, | ||
ParquetTransformer: writer.ParquetTransformer, | ||
ParquetSchema: schema.ParquetSchema, | ||
ParquetShredder: shredder, | ||
force32: util.force32 | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
//# sourceMappingURL=parquet.js.map | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetShredder = exports.ParquetSchema = exports.ParquetTransformer = exports.ParquetWriter = exports.ParquetEnvelopeWriter = exports.ParquetReader = exports.ParquetEnvelopeReader = void 0; | ||
const reader = __importStar(require("./lib/reader")); | ||
const writer = __importStar(require("./lib/writer")); | ||
const schema = __importStar(require("./lib/schema")); | ||
const shredder = __importStar(require("./lib/shred")); | ||
exports.ParquetEnvelopeReader = reader.ParquetEnvelopeReader; | ||
exports.ParquetReader = reader.ParquetReader; | ||
exports.ParquetEnvelopeWriter = writer.ParquetEnvelopeWriter; | ||
exports.ParquetWriter = writer.ParquetWriter; | ||
exports.ParquetTransformer = writer.ParquetTransformer; | ||
exports.ParquetSchema = schema.ParquetSchema; | ||
exports.ParquetShredder = shredder; |
@@ -5,3 +5,4 @@ { | ||
"main": "dist/parquet.js", | ||
"version": "0.0.0-ba6065", | ||
"types": "dist/parquet.d.ts", | ||
"version": "0.0.0-cadf48", | ||
"homepage": "https://github.com/LibertyDSNP/parquetjs", | ||
@@ -18,36 +19,52 @@ "license": "MIT", | ||
"dependencies": { | ||
"@types/brotli": "^1.3.0", | ||
"@types/bson": "^4.0.3", | ||
"@types/long": "^4.0.1", | ||
"@types/node": "^14.14.35", | ||
"@types/thrift": "^0.10.10", | ||
"brotli": "^1.3.0", | ||
"bson": "^2.0.8", | ||
"@types/varint": "^6.0.0", | ||
"browserify-zlib": "^0.2.0", | ||
"bson": "4.4.0", | ||
"cross-fetch": "^3.1.4", | ||
"int53": "^0.2.4", | ||
"lodash": "^4.17.21", | ||
"long": "^4.0.0", | ||
"lzo": "^0.4.0", | ||
"object-stream": "0.0.1", | ||
"snappyjs": "^0.6.0", | ||
"thrift": "0.14.1", | ||
"typescript": "^4.3.2", | ||
"varint": "^5.0.0", | ||
"wasm-brotli": "^2.0.2", | ||
"xxhash-wasm": "^0.4.1" | ||
}, | ||
"devDependencies": { | ||
"@babel/core": "7.13.10", | ||
"@babel/preset-env": "7.13.12", | ||
"@babel/preset-typescript": "7.13.0", | ||
"@babel/core": "^7.14.6", | ||
"@babel/preset-env": "^7.14.7", | ||
"@babel/preset-typescript": "^7.14.5", | ||
"@types/bson": "^4.0.3", | ||
"@types/chai": "^4.2.16", | ||
"@types/long": "^4.0.1", | ||
"@types/mocha": "^8.2.2", | ||
"@types/node": "^14.14.35", | ||
"@types/sinon": "^10.0.0", | ||
"@types/thrift": "^0.10.10", | ||
"assert": "^2.0.0", | ||
"babel-loader": "^8.2.2", | ||
"babel-plugin-add-module-exports": "^1.0.4", | ||
"browserfs": "^1.4.3", | ||
"buffer": "^6.0.3", | ||
"chai": "4.3.4", | ||
"mocha": "8.3.2", | ||
"msw": "^0.29.0", | ||
"core-js": "^3.15.1", | ||
"esbuild": "^0.14.38", | ||
"mocha": "9.2.2", | ||
"msw": "^0.39.2", | ||
"object-stream": "0.0.1", | ||
"process": "^0.11.10", | ||
"regenerator-runtime": "^0.13.7", | ||
"sinon": "^10.0.0", | ||
"ts-node": "^9.1.1" | ||
"sinon-chai": "^3.7.0", | ||
"sinon-chai-in-order": "^0.1.0", | ||
"source-map-loader": "^3.0.0", | ||
"stream-browserify": "^3.0.0", | ||
"ts-loader": "^9.2.3", | ||
"ts-node": "^9.1.1", | ||
"typescript": "^4.5.2" | ||
}, | ||
"scripts": { | ||
"build": "tsc -b", | ||
"build": "npm run build:node && npm run build:types", | ||
"build:types": "tsc -p tsconfig.types.json && cp gen-nodejs/parquet_types.d.ts dist/gen-nodejs/parquet_types.d.ts", | ||
"build:node": "tsc -b", | ||
"build:browser": "node esbuild.js", | ||
"type": "tsc --noEmit", | ||
@@ -57,5 +74,16 @@ "lint": "echo 'Linting, it is on the TODO list...'", | ||
"clean": "rm -Rf ./dist", | ||
"prepublishOnly": "npm run clean && npm run build", | ||
"thrift": "thrift -out gen-nodejs --gen js:ts parquet.thrift && thrift -out gen-nodejs --gen js:node parquet.thrift" | ||
"prepublishOnly": "npm run clean && npm run build:node && npm run build:types && npm run build:browser", | ||
"thrift": "thrift -out gen-nodejs --gen js:ts parquet.thrift && thrift -out gen-nodejs --gen js:node parquet.thrift", | ||
"serve": "node esbuild-serve.js" | ||
}, | ||
"browser": { | ||
"assert": "assert", | ||
"events": "events", | ||
"fs": "browserfs", | ||
"path": "path-browserify", | ||
"stream": "readable-stream", | ||
"thrift": "./node_modules/thrift/lib/nodejs/lib/thrift/browser.js", | ||
"util": "util", | ||
"zlib": "browserify-zlib" | ||
}, | ||
"engines": { | ||
@@ -62,0 +90,0 @@ "node": ">=14.16.0" |
@@ -17,4 +17,4 @@ # parquet.js | ||
Forked Notice | ||
------------- | ||
## Forked Notice | ||
This is a forked repository with code from various sources: | ||
@@ -24,7 +24,5 @@ - Primary source [ironSource](https://github.com/ironSource/parquetjs) [npm: parquetjs](https://www.npmjs.com/package/parquetjs) | ||
Installation | ||
------------ | ||
## Installation | ||
_parquet.js requires node.js >= 14.16.0_ | ||
To use parquet.js with node.js, install it using npm: | ||
``` | ||
@@ -34,8 +32,21 @@ $ npm install @dsnp/parquetjs | ||
_parquet.js requires node.js >= 14.16.0_ | ||
### NodeJS | ||
To use with nodejs: | ||
```javascript | ||
import parquetjs from "@dsnp/parquetjs" | ||
``` | ||
### Browser | ||
To use in a browser, configure your bundler with a plugin or resolver (depending on your needs) that points to: | ||
```javascript | ||
"node_modules/@dsnp/parquetjs/browser/parquetjs" | ||
``` | ||
or: | ||
Usage: Writing files | ||
-------------------- | ||
```javascript | ||
import parquetjs from "@dsnp/parquetjs/browser/parquetjs" | ||
``` | ||
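For example, with a webpack-style resolver (one possible setup; the alias below simply maps the package name to the pre-built browser bundle, and other bundlers have equivalent mechanisms):

```javascript
// webpack.config.js (illustrative)
module.exports = {
  // ...the rest of your configuration
  resolve: {
    alias: {
      // resolve the package name to the browser build shipped in the package
      "@dsnp/parquetjs": "@dsnp/parquetjs/browser/parquetjs",
    },
  },
};
```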
## Usage: Writing files | ||
Once you have installed the parquet.js library, you can import it as a single | ||
@@ -116,4 +127,3 @@ module: | ||
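As a minimal sketch (field names, types, and the output path are illustrative; run inside an async function):

```javascript
import parquetjs from "@dsnp/parquetjs";

// declare a schema for the rows to be written
const schema = new parquetjs.ParquetSchema({
  name: { type: "UTF8" },
  quantity: { type: "INT64" },
  price: { type: "DOUBLE" },
});

// open a file, append rows (buffered in memory), then flush and close
const writer = await parquetjs.ParquetWriter.openFile(schema, "fruits.parquet");
await writer.appendRow({ name: "apple", quantity: 10, price: 2.5 });
await writer.appendRow({ name: "banana", quantity: 20, price: 1.25 });
await writer.close();
```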
Usage: Reading files | ||
-------------------- | ||
## Usage: Reading files | ||
@@ -231,4 +241,3 @@ A parquet reader allows retrieving the rows from a parquet file in order. | ||
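A minimal reading sketch (the file name matches the writing example above and is illustrative):

```javascript
import parquetjs from "@dsnp/parquetjs";

const reader = await parquetjs.ParquetReader.openFile("fruits.parquet");

// iterate over the rows in order using a cursor
const cursor = reader.getCursor();
let record = null;
while ((record = await cursor.next())) {
  console.log(record);
}
await reader.close();
```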
Encodings | ||
--------- | ||
## Encodings | ||
@@ -265,4 +274,3 @@ Internally, the Parquet format will store values from each field as consecutive | ||
Optional Fields | ||
--------------- | ||
### Optional Fields | ||
@@ -284,4 +292,3 @@ By default, all fields are required to be present in each row. You can also mark | ||
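For example (a sketch with illustrative field names), a field is marked optional in the schema definition:

```javascript
const schema = new parquetjs.ParquetSchema({
  name: { type: "UTF8" },
  // `quantity` may be missing or undefined in a given row
  quantity: { type: "INT64", optional: true },
});
```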
Nested Rows & Arrays | ||
-------------------- | ||
### Nested Rows & Arrays | ||
@@ -356,4 +363,3 @@ Parquet supports nested schemas that allow you to store rows that have a more | ||
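A sketch of a nested schema with a repeated group (field names and values are illustrative):

```javascript
const schema = new parquetjs.ParquetSchema({
  name: { type: "UTF8" },
  stock: {
    repeated: true, // an array of nested records
    fields: {
      quantity: { type: "INT64" },
      warehouse: { type: "UTF8" },
    },
  },
});

await writer.appendRow({
  name: "apple",
  stock: [
    { quantity: 10, warehouse: "A" },
    { quantity: 20, warehouse: "B" },
  ],
});
```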
Nested Lists for Hive / Athena | ||
----------------------- | ||
### Nested Lists for Hive / Athena | ||
@@ -363,4 +369,3 @@ Lists have to be annotated to be queryable with AWS Athena. See [parquet-format](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists) for more detail and a full working example with comments in the test directory ([`test/list.js`](test/list.js)) | ||
List of Supported Types & Encodings | ||
----------------------------------- | ||
### List of Supported Types & Encodings | ||
@@ -398,4 +403,3 @@ We aim to be feature-complete and add new features as they are added to the | ||
Buffering & Row Group Size | ||
-------------------------- | ||
## Buffering & Row Group Size | ||
@@ -416,4 +420,3 @@ When writing a Parquet file, the `ParquetWriter` will buffer rows in memory | ||
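The buffered row group size can be tuned on the writer, for example (the value below is illustrative):

```javascript
const writer = await parquetjs.ParquetWriter.openFile(schema, "fruits.parquet");
// flush a row group to the output roughly every 8192 buffered rows
writer.setRowGroupSize(8192);
```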
Dependencies | ||
------------- | ||
## Dependencies | ||
@@ -423,6 +426,6 @@ Parquet uses [thrift](https://thrift.apache.org/) to encode the schema and other | ||
Notes | ||
----- | ||
## Notes | ||
Currently parquet-cpp doesn't fully support DATA_PAGE_V2. You can work around this | ||
by setting the useDataPageV2 option to false. |
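A sketch of that workaround, assuming the option is passed through the writer options object as in the upstream README:

```javascript
const writer = await parquetjs.ParquetWriter.openFile(schema, "fruits.parquet", {
  useDataPageV2: false, // emit DATA_PAGE (v1) pages for parquet-cpp compatibility
});
```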
Sorry, the diff of this file is too big to display
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
Network access
Supply chain risk: This module accesses the network.
Found 1 instance in 1 package
+ Added @types/varint@^6.0.0
+ Added browserify-zlib@^0.2.0
+ Added wasm-brotli@^2.0.2
+ Added @types/node@22.13.4 (transitive)
+ Added @types/varint@6.0.3 (transitive)
+ Added browserify-zlib@0.2.0 (transitive)
+ Added bson@4.4.0 (transitive)
+ Added buffer@5.7.1 (transitive)
+ Added ieee754@1.2.1 (transitive)
+ Added pako@1.0.11 (transitive)
+ Added undici-types@6.20.0 (transitive)
+ Added wasm-brotli@2.0.2 (transitive)
- Removed @types/brotli@^1.3.0
- Removed @types/bson@^4.0.3
- Removed @types/long@^4.0.1
- Removed @types/node@^14.14.35
- Removed @types/thrift@^0.10.10
- Removed brotli@^1.3.0
- Removed lodash@^4.17.21
- Removed lzo@^0.4.0
- Removed object-stream@0.0.1
- Removed typescript@^4.3.2
- Removed @types/brotli@1.3.4 (transitive)
- Removed @types/bson@4.2.4 (transitive)
- Removed @types/long@4.0.2 (transitive)
- Removed @types/node@14.18.63 (transitive)
- Removed @types/node-int64@0.4.32 (transitive)
- Removed @types/q@1.5.8 (transitive)
- Removed @types/thrift@0.10.17 (transitive)
- Removed bindings@1.2.1 (transitive)
- Removed brotli@1.3.3 (transitive)
- Removed bson@2.0.8 (transitive)
- Removed lodash@4.17.21 (transitive)
- Removed lzo@0.4.11 (transitive)
- Removed object-stream@0.0.1 (transitive)
- Removed typescript@4.9.5 (transitive)
Updated bson@4.4.0