@dsnp/parquetjs
Comparing version 0.0.0-2ccc4e to 0.0.0-33c80f
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
@@ -46,13 +37,29 @@ return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
class SplitBlockBloomFilter { | ||
constructor() { | ||
/** | ||
* Instance | ||
*/ | ||
this.splitBlockFilter = []; | ||
this.desiredFalsePositiveRate = SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE; | ||
this.numBlocks = 0; | ||
this.numDistinctValues = SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES; | ||
this.hashStrategy = new parquet_types_1.default.BloomFilterHash(parquet_types_1.default.XxHash); | ||
this.hasher = new xxhasher_1.default(); | ||
} | ||
static salt = [ | ||
0x47b6137b, | ||
0x44974d91, | ||
0x8824ad5b, | ||
0xa2b7289d, | ||
0x705495c7, | ||
0x2df1424b, | ||
0x9efc4947, | ||
0x5c6bfb31 | ||
]; | ||
// How many bits are in a single block: | ||
// - Blocks are UInt32 arrays | ||
// - There are 8 UInt32 words in each block. | ||
static WORDS_PER_BLOCK = 8; | ||
static WORD_SIZE = 32; | ||
static BITS_PER_BLOCK = SplitBlockBloomFilter.WORDS_PER_BLOCK * SplitBlockBloomFilter.WORD_SIZE; | ||
// Default number of blocks in a Split Block Bloom filter (SBBF) | ||
static NUMBER_OF_BLOCKS = 32; | ||
// The lower bound of SBBF size in bytes. | ||
// Currently this is 1024 | ||
static LOWER_BOUND_BYTES = SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK / 8; | ||
// The upper bound of SBBF size, set to default row group size in bytes. | ||
// Note that the subsequent requirements for an effective bloom filter on a row group this size would mean this | ||
// is unacceptably large for a lightweight client application. | ||
static UPPER_BOUND_BYTES = 128 * 1024 * 1024; | ||
static DEFAULT_FALSE_POSITIVE_RATE = 0.001; | ||
static DEFAULT_DISTINCT_VALUES = 128 * 1024; | ||
/** | ||
@@ -161,5 +168,5 @@ * @function initBlock | ||
for (let j = 0; j < this.WORD_SIZE; j++) { | ||
const isSet = masked[i] & (Math.pow(2, j)); | ||
const isSet = masked[i] & (2 ** j); | ||
if (isSet) { | ||
b[i] = b[i] | (Math.pow(2, j)); | ||
b[i] = b[i] | (2 ** j); | ||
} | ||
@@ -183,5 +190,5 @@ } | ||
for (let j = 0; j < this.WORD_SIZE; j++) { | ||
const isSet = masked[i] & (Math.pow(2, j)); | ||
const isSet = masked[i] & (2 ** j); | ||
if (isSet) { | ||
const match = b[i] & (Math.pow(2, j)); | ||
const match = b[i] & (2 ** j); | ||
if (!match) { | ||
@@ -195,2 +202,11 @@ return false; | ||
} | ||
/** | ||
* Instance | ||
*/ | ||
splitBlockFilter = []; | ||
desiredFalsePositiveRate = SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE; | ||
numBlocks = 0; | ||
numDistinctValues = SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES; | ||
hashStrategy = new parquet_types_1.default.BloomFilterHash(new parquet_types_1.default.XxHash()); | ||
hasher = new xxhasher_1.default(); | ||
isInitialized() { return this.splitBlockFilter.length > 0; } | ||
@@ -311,10 +327,8 @@ getFalsePositiveRate() { return this.desiredFalsePositiveRate; } | ||
} | ||
hash(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!this.hashStrategy.hasOwnProperty("XXHASH")) { | ||
throw new Error("unsupported hash strategy"); | ||
} | ||
const hashed = yield this.hasher.hash64(value); | ||
return Long.fromString(hashed, true, 16); | ||
}); | ||
async hash(value) { | ||
if (!this.hashStrategy.hasOwnProperty("XXHASH")) { | ||
throw new Error("unsupported hash strategy"); | ||
} | ||
const hashed = await this.hasher.hash64(value); | ||
return Long.fromString(hashed, true, 16); | ||
} | ||
@@ -335,8 +349,6 @@ insertHash(hashValue) { | ||
*/ | ||
insert(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized. call init() first"); | ||
this.insertHash(yield this.hash(value)); | ||
}); | ||
async insert(value) { | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized. call init() first"); | ||
this.insertHash(await this.hash(value)); | ||
} | ||
@@ -358,38 +370,8 @@ checkHash(hashValue) { | ||
*/ | ||
check(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized"); | ||
return this.checkHash(yield this.hash(value)); | ||
}); | ||
async check(value) { | ||
if (!this.isInitialized()) | ||
throw new Error("filter has not been initialized"); | ||
return this.checkHash(await this.hash(value)); | ||
} | ||
} | ||
SplitBlockBloomFilter.salt = [ | ||
0x47b6137b, | ||
0x44974d91, | ||
0x8824ad5b, | ||
0xa2b7289d, | ||
0x705495c7, | ||
0x2df1424b, | ||
0x9efc4947, | ||
0x5c6bfb31 | ||
]; | ||
// How many bits are in a single block: | ||
// - Blocks are UInt32 arrays | ||
// - There are 8 UInt32 words in each block. | ||
SplitBlockBloomFilter.WORDS_PER_BLOCK = 8; | ||
SplitBlockBloomFilter.WORD_SIZE = 32; | ||
SplitBlockBloomFilter.BITS_PER_BLOCK = SplitBlockBloomFilter.WORDS_PER_BLOCK * SplitBlockBloomFilter.WORD_SIZE; | ||
// Default number of blocks in a Split Block Bloom filter (SBBF) | ||
SplitBlockBloomFilter.NUMBER_OF_BLOCKS = 32; | ||
// The lower bound of SBBF size in bytes. | ||
// Currently this is 1024 | ||
SplitBlockBloomFilter.LOWER_BOUND_BYTES = SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK / 8; | ||
// The upper bound of SBBF size, set to default row group size in bytes. | ||
// Note that the subsequent requirements for an effective bloom filter on a row group this size would mean this | ||
// is unacceptably large for a lightweight client application. | ||
SplitBlockBloomFilter.UPPER_BOUND_BYTES = 128 * 1024 * 1024; | ||
SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE = 0.001; | ||
SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES = 128 * 1024; | ||
exports.default = SplitBlockBloomFilter; | ||
//# sourceMappingURL=sbbf.js.map |
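The constants above imply 8 words × 32 bits = 256 bits per block, so the default 32-block filter occupies 32 × 256 / 8 = 1024 bytes (LOWER_BOUND_BYTES). A minimal usage sketch of the insert/check flow; the dist require path is assumed, and a no-argument init() is inferred only from the error messages above:

const SplitBlockBloomFilter = require("@dsnp/parquetjs/dist/lib/bloom/sbbf").default; // path assumed

async function sbbfExample() {
  const filter = new SplitBlockBloomFilter();
  filter.init();                                         // allocate the blocks; insert()/check() throw before this
  await filter.insert("alice@example.com");
  console.log(await filter.check("alice@example.com"));  // true
  console.log(await filter.check("bob@example.com"));    // false with high probability (default FPP 0.001)
}

sbbfExample();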
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
const xxhash_wasm_1 = __importDefault(require("xxhash-wasm")); | ||
@@ -27,6 +19,5 @@ const long_1 = __importDefault(require("long")); | ||
class XxHasher { | ||
hashit(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
return (yield XxHasher.h64)(value); | ||
}); | ||
static h64 = (0, xxhash_wasm_1.default)().then(x => x.h64); | ||
async hashit(value) { | ||
return (await XxHasher.h64)(value); | ||
} | ||
@@ -39,20 +30,16 @@ /** | ||
*/ | ||
hash64(value) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (typeof value === 'string') | ||
return this.hashit(value); | ||
if (value instanceof Buffer || | ||
value instanceof Uint8Array || | ||
value instanceof long_1.default || | ||
typeof value === 'boolean' || | ||
typeof value === 'number' || | ||
typeof value === 'bigint') { | ||
return this.hashit(value.toString()); | ||
} | ||
throw new Error("unsupported type: " + value); | ||
}); | ||
async hash64(value) { | ||
if (typeof value === 'string') | ||
return this.hashit(value); | ||
if (value instanceof Buffer || | ||
value instanceof Uint8Array || | ||
value instanceof long_1.default || | ||
typeof value === 'boolean' || | ||
typeof value === 'number' || | ||
typeof value === 'bigint') { | ||
return this.hashit(value.toString()); | ||
} | ||
throw new Error("unsupported type: " + value); | ||
} | ||
} | ||
XxHasher.h64 = xxhash_wasm_1.default().then(x => x.h64); | ||
module.exports = XxHasher; | ||
//# sourceMappingURL=xxhasher.js.map | ||
exports.default = XxHasher; |
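XxHasher is a thin wrapper over xxhash-wasm's 64-bit hash: hash64 accepts strings, Buffers, Uint8Arrays, Longs, booleans, numbers and bigints, stringifying non-strings before hashing. A hedged sketch; the dist path is assumed and the hex-string result shape is inferred from Long.fromString(hashed, true, 16) in sbbf.js above:

const XxHasher = require("@dsnp/parquetjs/dist/lib/bloom/xxhasher").default; // path assumed

async function hashExample() {
  const hasher = new XxHasher();
  console.log(await hasher.hash64("parquet"));  // 64-bit xxHash, apparently as a hex string
  console.log(await hasher.hash64(42n));        // bigint input is hashed as its decimal string "42"
}

hashExample();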
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
@@ -16,3 +26,3 @@ var __importDefault = (this && this.__importDefault) || function (mod) { | ||
exports.getBloomFiltersFor = exports.siftAllByteOffsets = exports.parseBloomFilterOffsets = void 0; | ||
const util_1 = __importDefault(require("../util")); | ||
const parquet_util = __importStar(require("../util")); | ||
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types")); | ||
@@ -44,13 +54,16 @@ const sbbf_1 = __importDefault(require("../bloom/sbbf")); | ||
exports.parseBloomFilterOffsets = parseBloomFilterOffsets; | ||
const getBloomFilterHeader = (offsetBytes, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () { | ||
const getBloomFilterHeader = async (offsetBytes, envelopeReader) => { | ||
const headerByteSizeEstimate = 200; | ||
let bloomFilterHeaderData; | ||
try { | ||
bloomFilterHeaderData = yield envelopeReader.read(offsetBytes, headerByteSizeEstimate); | ||
bloomFilterHeaderData = await envelopeReader.read(offsetBytes, headerByteSizeEstimate); | ||
} | ||
catch (e) { | ||
throw new Error(e); | ||
if (typeof e === 'string') | ||
throw new Error(e); | ||
else | ||
throw e; | ||
} | ||
const bloomFilterHeader = new parquet_types_1.default.BloomFilterHeader(); | ||
const sizeOfBloomFilterHeader = util_1.default.decodeThrift(bloomFilterHeader, bloomFilterHeaderData); | ||
const sizeOfBloomFilterHeader = parquet_util.decodeThrift(bloomFilterHeader, bloomFilterHeaderData); | ||
return { | ||
@@ -60,15 +73,18 @@ bloomFilterHeader, | ||
}; | ||
}); | ||
const readFilterData = (offsetBytes, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () { | ||
const { bloomFilterHeader, sizeOfBloomFilterHeader, } = yield getBloomFilterHeader(offsetBytes, envelopeReader); | ||
}; | ||
const readFilterData = async (offsetBytes, envelopeReader) => { | ||
const { bloomFilterHeader, sizeOfBloomFilterHeader, } = await getBloomFilterHeader(offsetBytes, envelopeReader); | ||
const { numBytes: filterByteSize } = bloomFilterHeader; | ||
try { | ||
const filterBlocksOffset = offsetBytes + sizeOfBloomFilterHeader; | ||
const buffer = yield envelopeReader.read(filterBlocksOffset, filterByteSize); | ||
const buffer = await envelopeReader.read(filterBlocksOffset, filterByteSize); | ||
return buffer; | ||
} | ||
catch (e) { | ||
throw new Error(e); | ||
if (typeof e === 'string') | ||
throw new Error(e); | ||
else | ||
throw e; | ||
} | ||
}); | ||
}; | ||
const readFilterDataFrom = (offsets, envelopeReader) => { | ||
@@ -78,10 +94,10 @@ return Promise.all(offsets.map((offset) => readFilterData(offset, envelopeReader))); | ||
const siftAllByteOffsets = (columnChunkDataCollection) => { | ||
return exports.parseBloomFilterOffsets(filterColumnChunksWithBloomFilters(columnChunkDataCollection)); | ||
return (0, exports.parseBloomFilterOffsets)(filterColumnChunksWithBloomFilters(columnChunkDataCollection)); | ||
}; | ||
exports.siftAllByteOffsets = siftAllByteOffsets; | ||
const getBloomFiltersFor = (columnNames, envelopeReader) => __awaiter(void 0, void 0, void 0, function* () { | ||
const getBloomFiltersFor = async (columnNames, envelopeReader) => { | ||
const columnChunkDataCollection = envelopeReader.getAllColumnChunkDataFor(columnNames); | ||
const bloomFilterOffsetData = exports.siftAllByteOffsets(columnChunkDataCollection); | ||
const bloomFilterOffsetData = (0, exports.siftAllByteOffsets)(columnChunkDataCollection); | ||
const offsetByteValues = bloomFilterOffsetData.map(({ offsetBytes }) => offsetBytes); | ||
const filterBlocksBuffers = yield readFilterDataFrom(offsetByteValues, envelopeReader); | ||
const filterBlocksBuffers = await readFilterDataFrom(offsetByteValues, envelopeReader); | ||
return filterBlocksBuffers.map((buffer, index) => { | ||
@@ -95,4 +111,3 @@ const { columnName, rowGroupIndex } = bloomFilterOffsetData[index]; | ||
}); | ||
}); | ||
}; | ||
exports.getBloomFiltersFor = getBloomFiltersFor; | ||
//# sourceMappingURL=bloomFilterReader.js.map |
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
@@ -7,3 +26,3 @@ return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
exports.getSerializedBloomFilterData = exports.setFilterOffset = exports.serializeFilterData = exports.serializeFilterHeaders = exports.createSBBF = void 0; | ||
const util_1 = __importDefault(require("../util")); | ||
const parquet_util = __importStar(require("../util")); | ||
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types")); | ||
@@ -37,3 +56,3 @@ const sbbf_1 = __importDefault(require("../bloom/sbbf")); | ||
const bloomFilterHeader = buildFilterHeader(numberOfBytes); | ||
return util_1.default.serializeThrift(bloomFilterHeader); | ||
return parquet_util.serializeThrift(bloomFilterHeader); | ||
}; | ||
@@ -43,3 +62,3 @@ exports.serializeFilterHeaders = serializeFilterHeaders; | ||
const serializedFilterBlocks = serializeFilterBlocks(params.filterBlocks); | ||
const serializedFilterHeaders = exports.serializeFilterHeaders(params.filterByteSize); | ||
const serializedFilterHeaders = (0, exports.serializeFilterHeaders)(params.filterByteSize); | ||
return Buffer.concat([serializedFilterHeaders, serializedFilterBlocks]); | ||
@@ -55,5 +74,4 @@ }; | ||
const filterByteSize = splitBlockBloomFilter.getNumFilterBytes(); | ||
return exports.serializeFilterData({ filterBlocks, filterByteSize }); | ||
return (0, exports.serializeFilterData)({ filterBlocks, filterByteSize }); | ||
}; | ||
exports.getSerializedBloomFilterData = getSerializedBloomFilterData; | ||
//# sourceMappingURL=bloomFilterWriter.js.map |
"use strict"; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
}; | ||
module.exports = class BufferReader { | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
class BufferReader { | ||
maxSpan; | ||
maxLength; | ||
queueWait; | ||
scheduled; | ||
queue; | ||
envelopeReader; | ||
constructor(envelopeReader, options) { | ||
@@ -21,55 +19,52 @@ options = options || {}; | ||
} | ||
read(offset, length) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!this.scheduled) { | ||
this.scheduled = true; | ||
setTimeout(() => { | ||
this.scheduled = false; | ||
this.processQueue(); | ||
}, this.queueWait); | ||
} | ||
return new Promise((resolve, reject) => { | ||
this.queue.push({ offset, length, resolve, reject }); | ||
}); | ||
async read(offset, length) { | ||
if (!this.scheduled) { | ||
this.scheduled = true; | ||
setTimeout(() => { | ||
this.scheduled = false; | ||
this.processQueue(); | ||
}, this.queueWait); | ||
} | ||
return new Promise((resolve, reject) => { | ||
this.queue.push({ offset, length, resolve, reject }); | ||
}); | ||
} | ||
processQueue() { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
const queue = this.queue; | ||
if (!queue.length) | ||
async processQueue() { | ||
const queue = this.queue; | ||
if (!queue.length) | ||
return; | ||
this.queue = []; | ||
queue.sort((a, b) => a.offset - b.offset); | ||
var subqueue = []; | ||
const readSubqueue = async () => { | ||
if (!subqueue.length) { | ||
return; | ||
this.queue = []; | ||
queue.sort((a, b) => a.offset - b.offset); | ||
var subqueue = []; | ||
const readSubqueue = () => __awaiter(this, void 0, void 0, function* () { | ||
if (!subqueue.length) { | ||
return; | ||
} | ||
const processQueue = subqueue; | ||
subqueue = []; | ||
const lastElement = processQueue[processQueue.length - 1]; | ||
const start = processQueue[0].offset; | ||
const finish = lastElement.offset + lastElement.length; | ||
const buffer = yield this.envelopeReader.readFn(start, finish - start); | ||
processQueue.forEach((d) => __awaiter(this, void 0, void 0, function* () { | ||
d.resolve(buffer.slice(d.offset - start, d.offset + d.length - start)); | ||
})); | ||
} | ||
const processQueue = subqueue; | ||
subqueue = []; | ||
const lastElement = processQueue[processQueue.length - 1]; | ||
const start = processQueue[0].offset; | ||
const finish = lastElement.offset + lastElement.length; | ||
const buffer = await this.envelopeReader.readFn(start, finish - start); | ||
processQueue.forEach(async (d) => { | ||
d.resolve(buffer.slice(d.offset - start, d.offset + d.length - start)); | ||
}); | ||
queue.forEach((d, i) => { | ||
const prev = queue[i - 1]; | ||
if (!prev || (d.offset - (prev.offset + prev.length)) < this.maxSpan) { | ||
subqueue.push(d); | ||
if ((d.offset + d.length) - subqueue[0].offset > this.maxLength) { | ||
readSubqueue(); | ||
} | ||
} | ||
else { | ||
}; | ||
queue.forEach((d, i) => { | ||
const prev = queue[i - 1]; | ||
if (!prev || (d.offset - (prev.offset + prev.length)) < this.maxSpan) { | ||
subqueue.push(d); | ||
if ((d.offset + d.length) - subqueue[0].offset > this.maxLength) { | ||
readSubqueue(); | ||
subqueue = [d]; | ||
} | ||
}); | ||
readSubqueue(subqueue); | ||
} | ||
else { | ||
readSubqueue(); | ||
subqueue = [d]; | ||
} | ||
}); | ||
readSubqueue(); | ||
} | ||
}; | ||
//# sourceMappingURL=bufferReader.js.map | ||
} | ||
exports.default = BufferReader; | ||
; |
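BufferReader coalesces reads: read() calls are queued for queueWait milliseconds, sorted by offset, and neighbouring requests (gap below maxSpan, combined span below maxLength) are served from a single readFn call whose buffer is then sliced per request. A self-contained sketch with a stub envelope reader, assuming the compiled dist path:

const BufferReader = require("@dsnp/parquetjs/dist/lib/bufferReader").default; // path assumed

// BufferReader only needs readFn(offset, length) -> Promise<Buffer> from the envelope reader.
const stubEnvelopeReader = {
  readFn: async (offset, length) => {
    console.log(`physical read: offset=${offset} length=${length}`);
    return Buffer.alloc(length, 0xab);
  },
};

async function bufferReaderExample() {
  const reader = new BufferReader(stubEnvelopeReader, { maxSpan: 100000, maxLength: 10000000, queueWait: 5 });
  // Both requests are close together and queued in the same tick,
  // so they should trigger only one physical read (offset=0, length=250).
  const [a, b] = await Promise.all([reader.read(0, 100), reader.read(150, 100)]);
  console.log(a.length, b.length); // 100 100
}

bufferReaderExample();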
"use strict"; | ||
module.exports.PLAIN = require('./plain'); | ||
module.exports.RLE = require('./rle'); | ||
module.exports.PLAIN_DICTIONARY = require('./plain_dictionary'); | ||
//# sourceMappingURL=index.js.map | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.PLAIN_DICTIONARY = exports.RLE = exports.PLAIN = void 0; | ||
exports.PLAIN = __importStar(require("./plain")); | ||
exports.RLE = __importStar(require("./rle")); | ||
exports.PLAIN_DICTIONARY = __importStar(require("./plain_dictionary")); |
"use strict"; | ||
const rle = require('./rle'); | ||
exports.decodeValues = function (type, cursor, count, opts) { | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.decodeValues = void 0; | ||
const rle = __importStar(require("./rle")); | ||
const decodeValues = function (type, cursor, count, opts) { | ||
opts.bitWidth = cursor.buffer.slice(cursor.offset, cursor.offset + 1).readInt8(0); | ||
@@ -8,2 +29,2 @@ cursor.offset += 1; | ||
}; | ||
//# sourceMappingURL=plain_dictionary.js.map | ||
exports.decodeValues = decodeValues; |
@@ -1,3 +0,8 @@ | ||
'use strict'; | ||
const INT53 = require('int53'); | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.decodeValues = exports.encodeValues = void 0; | ||
const int53_1 = __importDefault(require("int53")); | ||
function encodeValues_BOOLEAN(values) { | ||
@@ -8,3 +13,3 @@ let buf = Buffer.alloc(Math.ceil(values.length / 8)); | ||
if (values[i]) { | ||
buf[Math.floor(i / 8)] |= (1 << (i % 8)); | ||
buf[Math.floor(i / 8)] |= 1 << i % 8; | ||
} | ||
@@ -18,3 +23,3 @@ } | ||
let b = cursor.buffer[cursor.offset + Math.floor(i / 8)]; | ||
values.push((b & (1 << (i % 8))) > 0); | ||
values.push((b & (1 << i % 8)) > 0); | ||
} | ||
@@ -42,3 +47,2 @@ cursor.offset += Math.ceil(count / 8); | ||
for (let i = 0; i < values.length; i++) { | ||
//console.log(typeof values[i]); | ||
buf.writeBigInt64LE(BigInt(values[i]), i * 8); | ||
@@ -60,7 +64,7 @@ } | ||
if (values[i] >= 0) { | ||
INT53.writeInt64LE(values[i], buf, i * 12); | ||
int53_1.default.writeInt64LE(values[i], buf, i * 12); | ||
buf.writeUInt32LE(0, i * 12 + 8); // truncate to 64 actual precision | ||
} | ||
else { | ||
INT53.writeInt64LE((~-values[i]) + 1, buf, i * 12); | ||
int53_1.default.writeInt64LE(~-values[i] + 1, buf, i * 12); | ||
buf.writeUInt32LE(0xffffffff, i * 12 + 8); // truncate to 64 actual precision | ||
@@ -74,6 +78,6 @@ } | ||
for (let i = 0; i < count; ++i) { | ||
const low = INT53.readInt64LE(cursor.buffer, cursor.offset); | ||
const low = int53_1.default.readInt64LE(cursor.buffer, cursor.offset); | ||
const high = cursor.buffer.readUInt32LE(cursor.offset + 8); | ||
if (high === 0xffffffff) { | ||
values.push((~-low) + 1); // truncate to 64 actual precision | ||
values.push(~-low + 1); // truncate to 64 actual precision | ||
} | ||
@@ -117,14 +121,16 @@ else { | ||
} | ||
// Waylands reminder to check again | ||
function encodeValues_BYTE_ARRAY(values) { | ||
let buf_len = 0; | ||
const returnedValues = []; | ||
for (let i = 0; i < values.length; i++) { | ||
values[i] = Buffer.from(values[i]); | ||
buf_len += 4 + values[i].length; | ||
returnedValues[i] = Buffer.from(values[i]); | ||
buf_len += 4 + returnedValues[i].length; | ||
} | ||
let buf = Buffer.alloc(buf_len); | ||
let buf_pos = 0; | ||
for (let i = 0; i < values.length; i++) { | ||
buf.writeUInt32LE(values[i].length, buf_pos); | ||
values[i].copy(buf, buf_pos + 4); | ||
buf_pos += 4 + values[i].length; | ||
for (let i = 0; i < returnedValues.length; i++) { | ||
buf.writeUInt32LE(returnedValues[i].length, buf_pos); | ||
returnedValues[i].copy(buf, buf_pos + 4); | ||
buf_pos += 4 + returnedValues[i].length; | ||
} | ||
@@ -147,10 +153,10 @@ return buf; | ||
} | ||
let buf_len = 0; | ||
const returnedValues = []; | ||
for (let i = 0; i < values.length; i++) { | ||
values[i] = Buffer.from(values[i]); | ||
if (values[i].length !== opts.typeLength) { | ||
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + values[i]; | ||
returnedValues[i] = Buffer.from(values[i]); | ||
if (returnedValues[i].length !== opts.typeLength) { | ||
throw "invalid value for FIXED_LEN_BYTE_ARRAY: " + returnedValues[i]; | ||
} | ||
} | ||
return Buffer.concat(values); | ||
return Buffer.concat(returnedValues); | ||
} | ||
@@ -168,46 +174,47 @@ function decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts) { | ||
} | ||
exports.encodeValues = function (type, values, opts) { | ||
const encodeValues = function (type, values, opts) { | ||
switch (type) { | ||
case 'BOOLEAN': | ||
case "BOOLEAN": | ||
return encodeValues_BOOLEAN(values); | ||
case 'INT32': | ||
case "INT32": | ||
return encodeValues_INT32(values); | ||
case 'INT64': | ||
case "INT64": | ||
return encodeValues_INT64(values); | ||
case 'INT96': | ||
case "INT96": | ||
return encodeValues_INT96(values); | ||
case 'FLOAT': | ||
case "FLOAT": | ||
return encodeValues_FLOAT(values); | ||
case 'DOUBLE': | ||
case "DOUBLE": | ||
return encodeValues_DOUBLE(values); | ||
case 'BYTE_ARRAY': | ||
case "BYTE_ARRAY": | ||
return encodeValues_BYTE_ARRAY(values); | ||
case 'FIXED_LEN_BYTE_ARRAY': | ||
case "FIXED_LEN_BYTE_ARRAY": | ||
return encodeValues_FIXED_LEN_BYTE_ARRAY(values, opts); | ||
default: | ||
throw 'unsupported type: ' + type; | ||
throw "unsupported type: " + type; | ||
} | ||
}; | ||
exports.decodeValues = function (type, cursor, count, opts) { | ||
exports.encodeValues = encodeValues; | ||
const decodeValues = function (type, cursor, count, opts) { | ||
switch (type) { | ||
case 'BOOLEAN': | ||
case "BOOLEAN": | ||
return decodeValues_BOOLEAN(cursor, count); | ||
case 'INT32': | ||
case "INT32": | ||
return decodeValues_INT32(cursor, count); | ||
case 'INT64': | ||
case "INT64": | ||
return decodeValues_INT64(cursor, count); | ||
case 'INT96': | ||
case "INT96": | ||
return decodeValues_INT96(cursor, count); | ||
case 'FLOAT': | ||
case "FLOAT": | ||
return decodeValues_FLOAT(cursor, count); | ||
case 'DOUBLE': | ||
case "DOUBLE": | ||
return decodeValues_DOUBLE(cursor, count); | ||
case 'BYTE_ARRAY': | ||
case "BYTE_ARRAY": | ||
return decodeValues_BYTE_ARRAY(cursor, count); | ||
case 'FIXED_LEN_BYTE_ARRAY': | ||
case "FIXED_LEN_BYTE_ARRAY": | ||
return decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts); | ||
default: | ||
throw 'unsupported type: ' + type; | ||
throw "unsupported type: " + type; | ||
} | ||
}; | ||
//# sourceMappingURL=plain.js.map | ||
exports.decodeValues = decodeValues; |
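The PLAIN codec dispatches on the Parquet physical type; BOOLEAN values are packed eight to a byte with the `1 << i % 8` masks above. A round-trip sketch, assuming the compiled dist path:

const plain = require("@dsnp/parquetjs/dist/lib/codec/plain"); // path assumed

const bools = [true, false, true, true, false, false, true, false, true];
const encoded = plain.encodeValues("BOOLEAN", bools, {});
console.log(encoded.length); // 2 bytes: Math.ceil(9 / 8)

const cursor = { buffer: encoded, offset: 0 };
console.log(plain.decodeValues("BOOLEAN", cursor, bools.length, {}));
// [ true, false, true, true, false, false, true, false, true ]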
"use strict"; | ||
const varint = require('varint'); | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.decodeValues = exports.encodeValues = void 0; | ||
const varint_1 = __importDefault(require("varint")); | ||
function encodeRunBitpacked(values, opts) { | ||
@@ -14,3 +19,3 @@ for (let i = 0; i < values.length % 8; i++) { | ||
return Buffer.concat([ | ||
Buffer.from(varint.encode(((values.length / 8) << 1) | 1)), | ||
Buffer.from(varint_1.default.encode(((values.length / 8) << 1) | 1)), | ||
buf | ||
@@ -26,7 +31,15 @@ ]); | ||
return Buffer.concat([ | ||
Buffer.from(varint.encode(count << 1)), | ||
Buffer.from(varint_1.default.encode(count << 1)), | ||
buf | ||
]); | ||
} | ||
exports.encodeValues = function (type, values, opts) { | ||
function unknownToParsedInt(value) { | ||
if (typeof value === 'string') { | ||
return parseInt(value, 10); | ||
} | ||
else { | ||
return value; | ||
} | ||
} | ||
const encodeValues = function (type, values, opts) { | ||
if (!('bitWidth' in opts)) { | ||
@@ -39,3 +52,3 @@ throw 'bitWidth is required'; | ||
case 'INT64': | ||
values = values.map((x) => parseInt(x, 10)); | ||
values = values.map((x) => unknownToParsedInt(x)); | ||
break; | ||
@@ -85,2 +98,3 @@ default: | ||
}; | ||
exports.encodeValues = encodeValues; | ||
function decodeRunBitpacked(cursor, count, opts) { | ||
@@ -108,3 +122,3 @@ if (count % 8 !== 0) { | ||
} | ||
exports.decodeValues = function (type, cursor, count, opts) { | ||
const decodeValues = function (_, cursor, count, opts) { | ||
if (!('bitWidth' in opts)) { | ||
@@ -119,4 +133,4 @@ throw 'bitWidth is required'; | ||
while (values.length < count) { | ||
const header = varint.decode(cursor.buffer, cursor.offset); | ||
cursor.offset += varint.encodingLength(header); | ||
const header = varint_1.default.decode(cursor.buffer, cursor.offset); | ||
cursor.offset += varint_1.default.encodingLength(header); | ||
if (header & 1) { | ||
@@ -138,2 +152,2 @@ res = decodeRunBitpacked(cursor, (header >> 1) * 8, opts); | ||
}; | ||
//# sourceMappingURL=rle.js.map | ||
exports.decodeValues = decodeValues; |
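Each run in the RLE/bit-packing hybrid starts with a varint header: `(groups << 1) | 1` announces a bit-packed run of groups * 8 values, while `count << 1` (low bit clear) announces a repeated run of count values, which is exactly what the `header & 1` branch in decodeValues above distinguishes. A small worked example of the header arithmetic:

const varint = require("varint");

// Bit-packed run of 16 values (2 groups of 8): low bit set.
const packedHeader = ((16 / 8) << 1) | 1;    // 5
// Repeated (RLE) run of 16 identical values: low bit clear.
const repeatedHeader = 16 << 1;              // 32

console.log(varint.encode(packedHeader));    // [ 5 ]
console.log(varint.encode(repeatedHeader));  // [ 32 ]
// On decode: header & 1 ? bit-packed run of (header >> 1) * 8 values
//                       : repeated run of (header >> 1) values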
@@ -1,7 +0,12 @@ | ||
'use strict'; | ||
const zlib = require('zlib'); | ||
const snappy = require('snappyjs'); | ||
const lzo = require('lzo'); | ||
const brotli = require('brotli'); | ||
const PARQUET_COMPRESSION_METHODS = { | ||
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.inflate = exports.deflate = exports.PARQUET_COMPRESSION_METHODS = void 0; | ||
const zlib_1 = __importDefault(require("zlib")); | ||
const snappyjs_1 = __importDefault(require("snappyjs")); | ||
const wasm_brotli_1 = require("wasm-brotli"); | ||
// LZO compression is disabled. See: https://github.com/LibertyDSNP/parquetjs/issues/18 | ||
exports.PARQUET_COMPRESSION_METHODS = { | ||
'UNCOMPRESSED': { | ||
@@ -19,6 +24,2 @@ deflate: deflate_identity, | ||
}, | ||
'LZO': { | ||
deflate: deflate_lzo, | ||
inflate: inflate_lzo | ||
}, | ||
'BROTLI': { | ||
@@ -32,8 +33,9 @@ deflate: deflate_brotli, | ||
*/ | ||
function deflate(method, value) { | ||
if (!(method in PARQUET_COMPRESSION_METHODS)) { | ||
async function deflate(method, value) { | ||
if (!(method in exports.PARQUET_COMPRESSION_METHODS)) { | ||
throw 'invalid compression method: ' + method; | ||
} | ||
return PARQUET_COMPRESSION_METHODS[method].deflate(value); | ||
return exports.PARQUET_COMPRESSION_METHODS[method].deflate(value); | ||
} | ||
exports.deflate = deflate; | ||
function deflate_identity(value) { | ||
@@ -43,26 +45,26 @@ return value; | ||
function deflate_gzip(value) { | ||
return zlib.gzipSync(value); | ||
return zlib_1.default.gzipSync(value); | ||
} | ||
function deflate_snappy(value) { | ||
return snappy.compress(value); | ||
return snappyjs_1.default.compress(value); | ||
} | ||
function deflate_lzo(value) { | ||
return lzo.compress(value); | ||
async function deflate_brotli(value) { | ||
const compressedContent = await (0, wasm_brotli_1.compress)(value /*, { | ||
mode: 0, | ||
quality: 8, | ||
lgwin: 22 | ||
} | ||
*/); | ||
return Buffer.from(compressedContent); | ||
} | ||
function deflate_brotli(value) { | ||
return Buffer.from(brotli.compress(value, { | ||
mode: 0, | ||
quality: 8, | ||
lgwin: 22 | ||
})); | ||
} | ||
/** | ||
* Inflate a value using compression method `method` | ||
*/ | ||
function inflate(method, value) { | ||
if (!(method in PARQUET_COMPRESSION_METHODS)) { | ||
async function inflate(method, value) { | ||
if (!(method in exports.PARQUET_COMPRESSION_METHODS)) { | ||
throw 'invalid compression method: ' + method; | ||
} | ||
return PARQUET_COMPRESSION_METHODS[method].inflate(value); | ||
return await exports.PARQUET_COMPRESSION_METHODS[method].inflate(value); | ||
} | ||
exports.inflate = inflate; | ||
function inflate_identity(value) { | ||
@@ -72,14 +74,10 @@ return value; | ||
function inflate_gzip(value) { | ||
return zlib.gunzipSync(value); | ||
return zlib_1.default.gunzipSync(value); | ||
} | ||
function inflate_snappy(value) { | ||
return snappy.uncompress(value); | ||
return snappyjs_1.default.uncompress(value); | ||
} | ||
function inflate_lzo(value) { | ||
return lzo.decompress(value); | ||
async function inflate_brotli(value) { | ||
const uncompressedContent = await (0, wasm_brotli_1.decompress)(value); | ||
return Buffer.from(uncompressedContent); | ||
} | ||
function inflate_brotli(value) { | ||
return Buffer.from(brotli.decompress(value)); | ||
} | ||
module.exports = { PARQUET_COMPRESSION_METHODS, deflate, inflate }; | ||
//# sourceMappingURL=compression.js.map |
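With LZO removed and Brotli switched to wasm-brotli, deflate and inflate are now async for every codec, so callers await even the synchronous gzip/snappy paths. A round-trip sketch, assuming the compiled dist path:

const compression = require("@dsnp/parquetjs/dist/lib/compression"); // path assumed

async function compressionExample() {
  const original = Buffer.from("hello parquet");
  for (const method of ["UNCOMPRESSED", "GZIP", "SNAPPY", "BROTLI"]) {
    const deflated = await compression.deflate(method, original);
    const inflated = await compression.inflate(method, deflated);
    console.log(method, Buffer.from(inflated).equals(original)); // true for each codec
  }
}

compressionExample();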
@@ -1,24 +0,38 @@ | ||
'use strict'; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
const fs = require('fs'); | ||
const thrift = require('thrift'); | ||
const Int64 = require('node-int64'); | ||
const parquet_thrift = require('../gen-nodejs/parquet_types'); | ||
const parquet_shredder = require('./shred'); | ||
const parquet_util = require('./util'); | ||
const parquet_schema = require('./schema'); | ||
const parquet_codec = require('./codec'); | ||
const parquet_compression = require('./compression'); | ||
const parquet_types = require('./types'); | ||
const BufferReader = require('./bufferReader'); | ||
const bloomFilterReader = require('./bloomFilterIO/bloomFilterReader'); | ||
const groupBy = require("lodash/groupBy"); | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetEnvelopeReader = exports.ParquetReader = void 0; | ||
const node_int64_1 = __importDefault(require("node-int64")); | ||
const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types")); | ||
const parquet_shredder = __importStar(require("./shred")); | ||
const parquet_util = __importStar(require("./util")); | ||
const parquet_schema = __importStar(require("./schema")); | ||
const parquet_codec = __importStar(require("./codec")); | ||
const parquet_compression = __importStar(require("./compression")); | ||
const parquet_types = __importStar(require("./types")); | ||
const bufferReader_1 = __importDefault(require("./bufferReader")); | ||
const bloomFilterReader = __importStar(require("./bloomFilterIO/bloomFilterReader")); | ||
const cross_fetch_1 = __importDefault(require("cross-fetch")); | ||
const types_1 = require("./types/types"); | ||
const { getBloomFiltersFor, } = bloomFilterReader; | ||
@@ -42,2 +56,8 @@ /** | ||
class ParquetCursor { | ||
metadata; | ||
envelopeReader; | ||
schema; | ||
columnList; | ||
rowGroup; | ||
rowGroupIndex; | ||
/** | ||
@@ -61,14 +81,12 @@ * Create a new parquet reader from the file metadata and an envelope reader. | ||
*/ | ||
next() { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (this.rowGroup.length === 0) { | ||
if (this.rowGroupIndex >= this.metadata.row_groups.length) { | ||
return null; | ||
} | ||
let rowBuffer = yield this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList); | ||
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer); | ||
this.rowGroupIndex++; | ||
async next() { | ||
if (this.rowGroup.length === 0) { | ||
if (this.rowGroupIndex >= this.metadata.row_groups.length) { | ||
return null; | ||
} | ||
return this.rowGroup.shift(); | ||
}); | ||
let rowBuffer = await this.envelopeReader.readRowGroup(this.schema, this.metadata.row_groups[this.rowGroupIndex], this.columnList); | ||
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer); | ||
this.rowGroupIndex++; | ||
} | ||
return this.rowGroup.shift(); | ||
} | ||
@@ -92,2 +110,5 @@ /** | ||
class ParquetReader { | ||
envelopeReader; | ||
metadata; | ||
schema; | ||
/** | ||
@@ -97,13 +118,9 @@ * Open the parquet file pointed to by the specified path and return a new | ||
*/ | ||
static openFile(filePath, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let envelopeReader = yield ParquetEnvelopeReader.openFile(filePath, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
}); | ||
static async openFile(filePath, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openFile(filePath, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
static openBuffer(buffer, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let envelopeReader = yield ParquetEnvelopeReader.openBuffer(buffer, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
}); | ||
static async openBuffer(buffer, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openBuffer(buffer, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
@@ -115,7 +132,5 @@ /** | ||
*/ | ||
static openS3(client, params, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let envelopeReader = yield ParquetEnvelopeReader.openS3(client, params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
}); | ||
static async openS3(client, params, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openS3(client, params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
@@ -128,23 +143,19 @@ /** | ||
*/ | ||
static openUrl(request, params, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let envelopeReader = yield ParquetEnvelopeReader.openUrl(request, params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
}); | ||
static async openUrl(params, options) { | ||
let envelopeReader = await ParquetEnvelopeReader.openUrl(params, options); | ||
return this.openEnvelopeReader(envelopeReader, options); | ||
} | ||
static openEnvelopeReader(envelopeReader, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (opts && opts.metadata) { | ||
return new ParquetReader(opts.metadata, envelopeReader, opts); | ||
} | ||
try { | ||
yield envelopeReader.readHeader(); | ||
let metadata = yield envelopeReader.readFooter(); | ||
return new ParquetReader(metadata, envelopeReader, opts); | ||
} | ||
catch (err) { | ||
yield envelopeReader.close(); | ||
throw err; | ||
} | ||
}); | ||
static async openEnvelopeReader(envelopeReader, opts) { | ||
if (opts && opts.metadata) { | ||
return new ParquetReader(opts.metadata, envelopeReader, opts); | ||
} | ||
try { | ||
await envelopeReader.readHeader(); | ||
let metadata = await envelopeReader.readFooter(); | ||
return new ParquetReader(metadata, envelopeReader, opts); | ||
} | ||
catch (err) { | ||
await envelopeReader.close(); | ||
throw err; | ||
} | ||
} | ||
@@ -171,3 +182,3 @@ /** | ||
else if (o.parquetType === 'INT64') { | ||
return new Int64(Buffer.from(o.value)); | ||
return new node_int64_1.default(Buffer.from(o.value)); | ||
} | ||
@@ -183,3 +194,3 @@ } | ||
if (Array.isArray(d)) { | ||
Object.setPrototypeOf(d, parquet_thrift.PageLocation.prototype); | ||
Object.setPrototypeOf(d, parquet_types_1.default.PageLocation.prototype); | ||
} | ||
@@ -210,2 +221,19 @@ }); | ||
/** | ||
* Support `for await` iterators on the reader object | ||
* Uses `ParquetCursor` still under the hood. | ||
* | ||
* ```js | ||
* for await (const record of reader) { | ||
* console.log(record); | ||
* } | ||
* ``` | ||
*/ | ||
async *[Symbol.asyncIterator]() { | ||
const cursor = this.getCursor(); | ||
let record = null; | ||
while (record = await cursor.next()) { | ||
yield record; | ||
} | ||
} | ||
/** | ||
* Return a cursor to the file. You may open more than one cursor and use | ||
@@ -226,11 +254,15 @@ * them concurrently. All cursors become invalid once close() is called on | ||
} | ||
getBloomFiltersFor(columnNames) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
const bloomFilterData = yield getBloomFiltersFor(columnNames, this.envelopeReader); | ||
return groupBy(bloomFilterData, 'columnName'); | ||
}); | ||
async getBloomFiltersFor(columnNames) { | ||
const bloomFilterData = await getBloomFiltersFor(columnNames, this.envelopeReader); | ||
return bloomFilterData.reduce((acc, value) => { | ||
if (acc[value.columnName]) | ||
acc[value.columnName].push(value); | ||
else | ||
acc[value.columnName] = [value]; | ||
return acc; | ||
}, {}); | ||
} | ||
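// Usage sketch for the grouped result above. Hedged: only columnName and
// rowGroupIndex are visible in bloomFilterReader.js, so the `sbbf` property
// name on each entry is an assumption.
//
//   const filters = await reader.getBloomFiltersFor(["name"]);
//   // => { name: [ { sbbf, columnName: "name", rowGroupIndex: 0 }, ... ] }
//   const maybePresent = await filters.name[0].sbbf.check("alice");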
/** | ||
* Return the number of rows in this file. Note that the number of rows is | ||
* not neccessarily equal to the number of rows in each column. | ||
* not necessarily equal to the number of rows in each column. | ||
*/ | ||
@@ -257,5 +289,5 @@ getRowCount() { | ||
exportMetadata(indent) { | ||
function replacer(key, value) { | ||
if (value instanceof parquet_thrift.PageLocation) { | ||
return [value[0], value[1], value[2]]; | ||
function replacer(_key, value) { | ||
if (value instanceof parquet_types_1.default.PageLocation) { | ||
return [value.offset, value.compressed_page_size, value.first_row_index]; | ||
} | ||
@@ -275,4 +307,4 @@ if (typeof value === 'object') { | ||
} | ||
if (value instanceof Int64) { | ||
if (isFinite(value)) { | ||
if (value instanceof node_int64_1.default) { | ||
if (isFinite(Number(value))) { | ||
return Number(value); | ||
@@ -298,8 +330,6 @@ } | ||
*/ | ||
close() { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
yield this.envelopeReader.close(); | ||
this.envelopeReader = null; | ||
this.metadata = null; | ||
}); | ||
async close() { | ||
await this.envelopeReader.close(); | ||
this.envelopeReader = null; | ||
this.metadata = null; | ||
} | ||
@@ -310,2 +340,3 @@ decodePages(buffer, opts) { | ||
} | ||
exports.ParquetReader = ParquetReader; | ||
/** | ||
@@ -319,84 +350,69 @@ * The parquet envelope reader allows direct, unbuffered access to the individual | ||
class ParquetEnvelopeReader { | ||
static openFile(filePath, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let fileStat = yield parquet_util.fstat(filePath); | ||
let fileDescriptor = yield parquet_util.fopen(filePath); | ||
let readFn = (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
return parquet_util.fread(fileDescriptor, offset, length); | ||
}; | ||
let closeFn = parquet_util.fclose.bind(undefined, fileDescriptor); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size, options); | ||
}); | ||
readFn; | ||
close; | ||
id; | ||
fileSize; | ||
default_dictionary_size; | ||
metadata; | ||
schema; | ||
static async openFile(filePath, options) { | ||
let fileStat = await parquet_util.fstat(filePath); | ||
let fileDescriptor = await parquet_util.fopen(filePath); | ||
let readFn = (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
return parquet_util.fread(fileDescriptor, offset, length); | ||
}; | ||
let closeFn = parquet_util.fclose.bind(undefined, fileDescriptor); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size, options); | ||
} | ||
static openBuffer(buffer, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let readFn = (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
return Promise.resolve(buffer.slice(offset, offset + length)); | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, buffer.length, options); | ||
}); | ||
static async openBuffer(buffer, options) { | ||
let readFn = (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
return Promise.resolve(buffer.slice(offset, offset + length)); | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, buffer.length, options); | ||
} | ||
static openS3(client, params, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let fileStat = () => __awaiter(this, void 0, void 0, function* () { return client.headObject(params).promise().then(d => d.ContentLength); }); | ||
let readFn = (offset, length, file) => __awaiter(this, void 0, void 0, function* () { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
let Range = `bytes=${offset}-${offset + length - 1}`; | ||
let res = yield client.getObject(Object.assign({ Range }, params)).promise(); | ||
return Promise.resolve(res.Body); | ||
}); | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options); | ||
}); | ||
static async openS3(client, params, options) { | ||
let fileStat = async () => client.headObject(params).promise().then((d) => d.ContentLength); | ||
let readFn = async (offset, length, file) => { | ||
if (file) { | ||
return Promise.reject('external references are not supported'); | ||
} | ||
let Range = `bytes=${offset}-${offset + length - 1}`; | ||
let res = await client.getObject(Object.assign({ Range }, params)).promise(); | ||
return Promise.resolve(res.Body); | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options); | ||
} | ||
static openUrl(request, params, options) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (typeof params === 'string') | ||
params = { url: params }; | ||
if (!params.url) | ||
throw new Error('URL missing'); | ||
let base = params.url.split('/'); | ||
base = base.slice(0, base.length - 1).join('/') + '/'; | ||
params.encoding = params.encoding || null; | ||
let defaultHeaders = params.headers || {}; | ||
let filesize = () => __awaiter(this, void 0, void 0, function* () { | ||
return new Promise((resolve, reject) => { | ||
let req = request(params); | ||
req.on('response', res => { | ||
req.abort(); | ||
resolve(res.headers['content-length']); | ||
}); | ||
req.on('error', reject); | ||
}); | ||
}); | ||
let readFn = (offset, length, file) => { | ||
let url = file ? base + file : params.url; | ||
let range = `bytes=${offset}-${offset + length - 1}`; | ||
let headers = Object.assign({}, defaultHeaders, { range }); | ||
let req = Object.assign({}, params, { headers, url }); | ||
return new Promise((resolve, reject) => { | ||
request(req, (err, res) => { | ||
if (err) { | ||
reject(err); | ||
} | ||
else { | ||
resolve(res.body); | ||
} | ||
}); | ||
}); | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, filesize, options); | ||
}); | ||
static async openUrl(params, options) { | ||
if (typeof params === 'string') | ||
params = { url: params }; | ||
if (!params.url) | ||
throw new Error('URL missing'); | ||
const baseArr = params.url.split('/'); | ||
const base = baseArr.slice(0, baseArr.length - 1).join('/') + '/'; | ||
let defaultHeaders = params.headers || {}; | ||
let filesize = async () => { | ||
const { headers } = await (0, cross_fetch_1.default)(params.url); | ||
return headers.get('Content-Length'); | ||
}; | ||
let readFn = async (offset, length, file) => { | ||
let url = file ? base + file : params.url; | ||
let range = `bytes=${offset}-${offset + length - 1}`; | ||
let headers = Object.assign({}, defaultHeaders, { range }); | ||
const response = await (0, cross_fetch_1.default)(url, { headers }); | ||
const arrayBuffer = await response.arrayBuffer(); | ||
const buffer = Buffer.from(arrayBuffer); | ||
return buffer; | ||
}; | ||
let closeFn = () => ({}); | ||
return new ParquetEnvelopeReader(readFn, closeFn, filesize, options); | ||
} | ||
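// Usage sketch for the new fetch-based openUrl: the old `request` argument is
// gone, so callers pass a URL string or a { url, headers } object (the URL
// below is hypothetical). ParquetReader.openUrl forwards here:
//
//   const reader = await ParquetReader.openUrl("https://example.com/data.parquet");
//   const cursor = reader.getCursor();
//   let record = null;
//   while ((record = await cursor.next())) {
//     console.log(record);
//   }
//   await reader.close();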
constructor(readFn, closeFn, fileSize, options) { | ||
constructor(readFn, closeFn, fileSize, options, metadata) { | ||
options = options || {}; | ||
@@ -408,4 +424,5 @@ this.readFn = readFn; | ||
this.default_dictionary_size = options.default_dictionary_size || 10000000; | ||
this.metadata = metadata; | ||
if (options.maxLength || options.maxSpan || options.queueWait) { | ||
const bufferReader = new BufferReader(this, options); | ||
const bufferReader = new bufferReader_1.default(this, options); | ||
this.read = (offset, length) => bufferReader.read(offset, length); | ||
@@ -418,3 +435,3 @@ } | ||
readHeader() { | ||
return this.read(0, PARQUET_MAGIC.length).then(buf => { | ||
return this.read(0, PARQUET_MAGIC.length).then((buf) => { | ||
if (buf.toString() != PARQUET_MAGIC) { | ||
@@ -462,4 +479,4 @@ throw 'not valid parquet file'; | ||
} | ||
const data = this.read(+column.offset_index_offset, column.offset_index_length).then(data => { | ||
let offset_index = new parquet_thrift.OffsetIndex(); | ||
const data = this.read(+column.offset_index_offset, column.offset_index_length).then((data) => { | ||
let offset_index = new parquet_types_1.default.OffsetIndex(); | ||
parquet_util.decodeThrift(offset_index, data); | ||
@@ -473,2 +490,3 @@ Object.defineProperty(offset_index, 'column', { value: column, enumerable: false }); | ||
if (opts && opts.cache) { | ||
//@ts-ignore | ||
column.offsetIndex = data; | ||
@@ -486,4 +504,4 @@ } | ||
} | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then(data => { | ||
let column_index = new parquet_thrift.ColumnIndex(); | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then((data) => { | ||
let column_index = new parquet_types_1.default.ColumnIndex(); | ||
parquet_util.decodeThrift(column_index, data); | ||
@@ -505,2 +523,3 @@ Object.defineProperty(column_index, 'column', { value: column }); | ||
if (opts && opts.cache) { | ||
//@ts-ignore | ||
column.columnIndex = data; | ||
@@ -510,52 +529,50 @@ } | ||
} | ||
readPage(column, page, records, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
column = Object.assign({}, column); | ||
column.meta_data = Object.assign({}, column.meta_data); | ||
if (page.offset !== undefined) { | ||
if (isNaN(page.offset) || isNaN(page.compressed_page_size)) { | ||
throw Error('page offset and/or size missing'); | ||
} | ||
column.meta_data.data_page_offset = page.offset; | ||
column.meta_data.total_compressed_size = page.compressed_page_size; | ||
async readPage(column, page, records, opts) { | ||
column = Object.assign({}, column); | ||
column.meta_data = Object.assign({}, column.meta_data); | ||
if (page instanceof parquet_types_1.default.PageLocation && page.offset !== undefined) { | ||
if (isNaN(Number(page.offset)) || isNaN(page.compressed_page_size)) { | ||
throw Error('page offset and/or size missing'); | ||
} | ||
else { | ||
const offsetIndex = column.offsetIndex || (yield this.readOffsetIndex(column, null, opts)); | ||
column.meta_data.data_page_offset = offsetIndex.page_locations[page].offset; | ||
column.meta_data.total_compressed_size = offsetIndex.page_locations[page].compressed_page_size; | ||
} | ||
const chunk = yield this.readColumnChunk(this.schema, column); | ||
Object.defineProperty(chunk, 'column', { value: column }); | ||
let data = { | ||
columnData: { [chunk.column.meta_data.path_in_schema.join(',')]: chunk } | ||
}; | ||
return parquet_shredder.materializeRecords(this.schema, data, records); | ||
}); | ||
column.meta_data.data_page_offset = page.offset; | ||
column.meta_data.total_compressed_size = page.compressed_page_size; | ||
} | ||
else { | ||
const offsetIndex = column.offsetIndex || await this.readOffsetIndex(column, null, opts); | ||
column.meta_data.data_page_offset = offsetIndex.page_locations[page].offset; | ||
column.meta_data.total_compressed_size = offsetIndex.page_locations[page].compressed_page_size; | ||
} | ||
const chunk = await this.readColumnChunk(this.schema, column); | ||
Object.defineProperty(chunk, 'column', { value: column }); | ||
let data = { | ||
columnData: { [chunk.column.meta_data.path_in_schema.join(',')]: chunk } | ||
}; | ||
return parquet_shredder.materializeRecords(this.schema, data, records); | ||
} | ||
readRowGroup(schema, rowGroup, columnList) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
var buffer = { | ||
rowCount: +rowGroup.num_rows, | ||
columnData: {} | ||
}; | ||
for (let colChunk of rowGroup.columns) { | ||
const colMetadata = colChunk.meta_data; | ||
const colKey = colMetadata.path_in_schema; | ||
if (columnList.length > 0 && parquet_util.fieldIndexOf(columnList, colKey) < 0) { | ||
continue; | ||
} | ||
buffer.columnData[colKey] = yield this.readColumnChunk(schema, colChunk); | ||
async readRowGroup(schema, rowGroup, columnList) { | ||
var buffer = { | ||
rowCount: +rowGroup.num_rows, | ||
columnData: {}, | ||
pageRowCount: 0, | ||
pages: {} | ||
}; | ||
for (let colChunk of rowGroup.columns) { | ||
const colMetadata = colChunk.meta_data; | ||
const colKey = colMetadata.path_in_schema; | ||
if (columnList.length > 0 && parquet_util.fieldIndexOf(columnList, colKey) < 0) { | ||
continue; | ||
} | ||
return buffer; | ||
}); | ||
buffer.columnData[colKey.join(',')] = await this.readColumnChunk(schema, colChunk); | ||
} | ||
return buffer; | ||
} | ||
readColumnChunk(schema, colChunk, opts) { | ||
let dictionary = Promise.resolve(); | ||
let field = schema.findField(colChunk.meta_data.path_in_schema); | ||
let type = parquet_util.getThriftEnum(parquet_thrift.Type, colChunk.meta_data.type); | ||
let compression = parquet_util.getThriftEnum(parquet_thrift.CompressionCodec, colChunk.meta_data.codec); | ||
let pagesOffset = +colChunk.meta_data.data_page_offset; | ||
let pagesSize = +colChunk.meta_data.total_compressed_size; | ||
async readColumnChunk(schema, colChunk, opts) { | ||
let metadata = colChunk.meta_data; | ||
let field = schema.findField(metadata.path_in_schema); | ||
let type = parquet_util.getThriftEnum(parquet_types_1.default.Type, metadata.type); | ||
let compression = parquet_util.getThriftEnum(parquet_types_1.default.CompressionCodec, metadata.codec); | ||
let pagesOffset = +metadata.data_page_offset; | ||
let pagesSize = +metadata.total_compressed_size; | ||
if (!colChunk.file_path) { | ||
pagesSize = Math.min(this.fileSize - pagesOffset, +colChunk.meta_data.total_compressed_size); | ||
pagesSize = Math.min(this.fileSize - pagesOffset, +metadata.total_compressed_size); | ||
} | ||
@@ -568,36 +585,36 @@ opts = Object.assign({}, opts, { | ||
column: field, | ||
num_values: colChunk.meta_data.num_values | ||
num_values: metadata.num_values | ||
}); | ||
if (colChunk.meta_data.dictionary_page_offset) { | ||
const offset = +colChunk.meta_data.dictionary_page_offset; | ||
if (metadata.dictionary_page_offset) { | ||
const offset = +metadata.dictionary_page_offset; | ||
const size = Math.min(+this.fileSize - offset, this.default_dictionary_size); | ||
dictionary = this.read(offset, size, colChunk.file_path).then(buffer => decodePage({ offset: 0, buffer, size: buffer.length }, opts).dictionary); | ||
await this.read(offset, size, colChunk.file_path).then(async (buffer) => { | ||
await decodePage({ offset: 0, buffer, size: buffer.length }, opts).then(dict => { | ||
opts.dictionary = opts.dictionary || dict.dictionary; | ||
}); | ||
}); | ||
} | ||
return dictionary.then(dict => { | ||
opts.dictionary = opts.dictionary || dict; | ||
return this.read(pagesOffset, pagesSize, colChunk.file_path).then(pagesBuf => decodePages(pagesBuf, opts)); | ||
}); | ||
return this.read(pagesOffset, pagesSize, colChunk.file_path).then((pagesBuf) => decodePages(pagesBuf, opts)); | ||
} | ||
readFooter() { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (typeof this.fileSize === 'function') { | ||
this.fileSize = yield this.fileSize(); | ||
} | ||
let trailerLen = PARQUET_MAGIC.length + 4; | ||
let trailerBuf = yield this.read(this.fileSize - trailerLen, trailerLen); | ||
if (trailerBuf.slice(4).toString() != PARQUET_MAGIC) { | ||
throw 'not a valid parquet file'; | ||
} | ||
let metadataSize = trailerBuf.readUInt32LE(0); | ||
let metadataOffset = this.fileSize - metadataSize - trailerLen; | ||
if (metadataOffset < PARQUET_MAGIC.length) { | ||
throw 'invalid metadata size'; | ||
} | ||
let metadataBuf = yield this.read(metadataOffset, metadataSize); | ||
let metadata = new parquet_thrift.FileMetaData(); | ||
parquet_util.decodeThrift(metadata, metadataBuf); | ||
return metadata; | ||
}); | ||
async readFooter() { | ||
if (typeof this.fileSize === 'function') { | ||
this.fileSize = await this.fileSize(); | ||
} | ||
let trailerLen = PARQUET_MAGIC.length + 4; | ||
let trailerBuf = await this.read(this.fileSize - trailerLen, trailerLen); | ||
if (trailerBuf.slice(4).toString() != PARQUET_MAGIC) { | ||
throw 'not a valid parquet file'; | ||
} | ||
let metadataSize = trailerBuf.readUInt32LE(0); | ||
let metadataOffset = this.fileSize - metadataSize - trailerLen; | ||
if (metadataOffset < PARQUET_MAGIC.length) { | ||
throw 'invalid metadata size'; | ||
} | ||
let metadataBuf = await this.read(metadataOffset, metadataSize); | ||
let metadata = new types_1.NewFileMetaData(); | ||
parquet_util.decodeThrift(metadata, metadataBuf); | ||
return metadata; | ||
} | ||
} | ||
exports.ParquetEnvelopeReader = ParquetEnvelopeReader; | ||
/** | ||
@@ -640,10 +657,10 @@ * Decode a consecutive array of data using one of the parquet encodings | ||
} | ||
function decodePage(cursor, opts) { | ||
async function decodePage(cursor, opts) { | ||
opts = opts || {}; | ||
let page; | ||
const pageHeader = new parquet_thrift.PageHeader(); | ||
const pageHeader = new types_1.NewPageHeader(); | ||
const headerOffset = cursor.offset; | ||
const headerSize = parquet_util.decodeThrift(pageHeader, cursor.buffer.slice(cursor.offset)); | ||
cursor.offset += headerSize; | ||
const pageType = parquet_util.getThriftEnum(parquet_thrift.PageType, pageHeader.type); | ||
const pageType = parquet_util.getThriftEnum(parquet_types_1.default.PageType, pageHeader.type); | ||
switch (pageType) { | ||
@@ -654,3 +671,3 @@ case 'DATA_PAGE': | ||
} | ||
page = decodeDataPage(cursor, pageHeader, opts); | ||
page = await decodeDataPage(cursor, pageHeader, opts); | ||
break; | ||
@@ -661,7 +678,8 @@ case 'DATA_PAGE_V2': | ||
} | ||
page = decodeDataPageV2(cursor, pageHeader, opts); | ||
page = await decodeDataPageV2(cursor, pageHeader, opts); | ||
break; | ||
case 'DICTIONARY_PAGE': | ||
const dict = await decodeDictionaryPage(cursor, pageHeader, opts); | ||
page = { | ||
dictionary: decodeDictionaryPage(cursor, pageHeader, opts) | ||
dictionary: dict | ||
}; | ||
@@ -677,3 +695,3 @@ break; | ||
} | ||
function decodePages(buffer, opts) { | ||
async function decodePages(buffer, opts) { | ||
opts = opts || {}; | ||
@@ -693,3 +711,3 @@ let cursor = { | ||
while (cursor.offset < cursor.size && (!opts.num_values || data.dlevels.length < opts.num_values)) { | ||
const pageData = decodePage(cursor, opts); | ||
const pageData = await decodePage(cursor, opts); | ||
if (pageData.dictionary) { | ||
@@ -702,3 +720,4 @@ opts.dictionary = pageData.dictionary; | ||
} | ||
for (let i = 0; i < pageData.rlevels.length; i++) { | ||
let length = pageData.rlevels != undefined ? pageData.rlevels.length : 0; | ||
for (let i = 0; i < length; i++) { | ||
data.rlevels.push(pageData.rlevels[i]); | ||
@@ -716,3 +735,3 @@ data.dlevels.push(pageData.dlevels[i]); | ||
} | ||
function decodeDictionaryPage(cursor, header, opts) { | ||
async function decodeDictionaryPage(cursor, header, opts) { | ||
const cursorEnd = cursor.offset + header.compressed_page_size; | ||
@@ -726,3 +745,3 @@ let dictCursor = { | ||
if (opts.compression && opts.compression !== 'UNCOMPRESSED') { | ||
let valuesBuf = parquet_compression.inflate(opts.compression, dictCursor.buffer.slice(dictCursor.offset, cursorEnd)); | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, dictCursor.buffer.slice(dictCursor.offset, cursorEnd)); | ||
dictCursor = { | ||
@@ -734,12 +753,13 @@ buffer: valuesBuf, | ||
} | ||
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, header.dictionary_page_header.num_values, opts) | ||
.map(d => d.toString()); | ||
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, (header.dictionary_page_header).num_values, opts) | ||
.map((d) => d.toString()); | ||
} | ||
function decodeDataPage(cursor, header, opts) { | ||
async function decodeDataPage(cursor, header, opts) { | ||
const cursorEnd = cursor.offset + header.compressed_page_size; | ||
let valueCount = header.data_page_header.num_values; | ||
let valueEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.encoding); | ||
const dataPageHeader = header.data_page_header; | ||
let valueCount = dataPageHeader.num_values; | ||
let valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.encoding); | ||
let valuesBufCursor = cursor; | ||
if (opts.compression && opts.compression !== 'UNCOMPRESSED') { | ||
let valuesBuf = parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd)); | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd)); | ||
valuesBufCursor = { | ||
@@ -752,3 +772,3 @@ buffer: valuesBuf, | ||
/* read repetition levels */ | ||
let rLevelEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.repetition_level_encoding); | ||
let rLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.repetition_level_encoding); | ||
let rLevels = new Array(valueCount); | ||
@@ -762,3 +782,3 @@ if (opts.rLevelMax > 0) { | ||
/* read definition levels */ | ||
let dLevelEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header.definition_level_encoding); | ||
let dLevelEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeader.definition_level_encoding); | ||
let dLevels = new Array(valueCount); | ||
@@ -791,7 +811,8 @@ if (opts.dLevelMax > 0) { | ||
} | ||
function decodeDataPageV2(cursor, header, opts) { | ||
async function decodeDataPageV2(cursor, header, opts) { | ||
const cursorEnd = cursor.offset + header.compressed_page_size; | ||
const valueCount = header.data_page_header_v2.num_values; | ||
const valueCountNonNull = valueCount - header.data_page_header_v2.num_nulls; | ||
const valueEncoding = parquet_util.getThriftEnum(parquet_thrift.Encoding, header.data_page_header_v2.encoding); | ||
const dataPageHeaderV2 = header.data_page_header_v2; | ||
const valueCount = dataPageHeaderV2.num_values; | ||
const valueCountNonNull = valueCount - dataPageHeaderV2.num_nulls; | ||
const valueEncoding = parquet_util.getThriftEnum(parquet_types_1.default.Encoding, dataPageHeaderV2.encoding); | ||
/* read repetition levels */ | ||
@@ -821,4 +842,4 @@ let rLevels = new Array(valueCount); | ||
let valuesBufCursor = cursor; | ||
if (header.data_page_header_v2.is_compressed) { | ||
let valuesBuf = parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd)); | ||
if (dataPageHeaderV2.is_compressed) { | ||
let valuesBuf = await parquet_compression.inflate(opts.compression, cursor.buffer.slice(cursor.offset, cursorEnd)); | ||
valuesBufCursor = { | ||
@@ -845,3 +866,3 @@ buffer: valuesBuf, | ||
schemaElements.forEach(schemaElement => { | ||
let repetitionType = parquet_util.getThriftEnum(parquet_thrift.FieldRepetitionType, schemaElement.repetition_type); | ||
let repetitionType = parquet_util.getThriftEnum(parquet_types_1.default.FieldRepetitionType, schemaElement.repetition_type); | ||
let optional = false; | ||
@@ -860,3 +881,3 @@ let repeated = false; | ||
; | ||
if (schemaElement.num_children > 0) { | ||
if (schemaElement.num_children != undefined && schemaElement.num_children > 0) { | ||
schema[schemaElement.name] = { | ||
@@ -881,5 +902,5 @@ optional: optional, | ||
else { | ||
let logicalType = parquet_util.getThriftEnum(parquet_thrift.Type, schemaElement.type); | ||
let logicalType = parquet_util.getThriftEnum(parquet_types_1.default.Type, schemaElement.type); | ||
if (schemaElement.converted_type != null) { | ||
logicalType = parquet_util.getThriftEnum(parquet_thrift.ConvertedType, schemaElement.converted_type); | ||
logicalType = parquet_util.getThriftEnum(parquet_types_1.default.ConvertedType, schemaElement.converted_type); | ||
} | ||
@@ -904,2 +925,1 @@ schema[schemaElement.name] = { | ||
}; | ||
//# sourceMappingURL=reader.js.map |
@@ -1,6 +0,26 @@ | ||
'use strict'; | ||
const parquet_codec = require('./codec'); | ||
const parquet_compression = require('./compression'); | ||
const parquet_types = require('./types'); | ||
const parquet_util = require('./util'); | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetSchema = void 0; | ||
const parquet_codec = __importStar(require("./codec")); | ||
const parquet_compression = __importStar(require("./compression")); | ||
const parquet_types = __importStar(require("./types")); | ||
const PARQUET_COLUMN_KEY_SEPARATOR = '.'; | ||
@@ -11,2 +31,5 @@ /** | ||
class ParquetSchema { | ||
schema; | ||
fields; | ||
fieldList; | ||
/** | ||
@@ -24,3 +47,3 @@ * Create a new schema from a JSON schema definition | ||
findField(path) { | ||
if (path.constructor !== Array) { | ||
if (typeof path === 'string') { | ||
path = path.split(","); | ||
@@ -33,3 +56,6 @@ } | ||
for (; path.length > 1; path.shift()) { | ||
n = n[path[0]].fields; | ||
let fields = n[path[0]]?.fields; | ||
if (isDefined(fields)) { | ||
n = fields; | ||
} | ||
} | ||
@@ -42,3 +68,3 @@ return n[path[0]]; | ||
findFieldBranch(path) { | ||
if (path.constructor !== Array) { | ||
if (typeof path === 'string') { | ||
path = path.split(","); | ||
@@ -50,4 +76,5 @@ } | ||
branch.push(n[path[0]]); | ||
if (path.length > 1) { | ||
n = n[path[0]].fields; | ||
let fields = n[path[0]].fields; | ||
if (path.length > 1 && isDefined(fields)) { | ||
n = fields; | ||
} | ||
@@ -58,2 +85,3 @@ } | ||
} | ||
exports.ParquetSchema = ParquetSchema; | ||
; | ||
@@ -94,3 +122,3 @@ function buildFields(schema, rLevelParentMax, dLevelParentMax, path) { | ||
name: name, | ||
path: path.concat([name]), | ||
path: path.concat(name), | ||
repetitionType: repetitionType, | ||
@@ -102,3 +130,3 @@ rLevelMax: rLevelMax, | ||
fieldCount: Object.keys(opts.fields).length, | ||
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat([name])) | ||
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat(name)) | ||
}; | ||
@@ -109,6 +137,5 @@ if (opts.type == 'LIST' || opts.type == 'MAP') | ||
} | ||
/* field type */ | ||
const typeDef = parquet_types.PARQUET_LOGICAL_TYPES[opts.type]; | ||
const typeDef = opts.type ? parquet_types.PARQUET_LOGICAL_TYPES[opts.type] : undefined; | ||
if (!typeDef) { | ||
throw 'invalid parquet type: ' + opts.type; | ||
throw 'invalid parquet type: ' + (opts.type || "missing type"); | ||
} | ||
@@ -120,3 +147,3 @@ /* field encoding */ | ||
if (!(opts.encoding in parquet_codec)) { | ||
throw 'unsupported parquet encoding: ' + opts.encodig; | ||
throw 'unsupported parquet encoding: ' + opts.encoding; | ||
} | ||
@@ -150,4 +177,5 @@ if (!opts.compression) { | ||
list.push(fields[k]); | ||
if (fields[k].isNested) { | ||
list = list.concat(listFields(fields[k].fields)); | ||
const nestedFields = fields[k].fields; | ||
if (fields[k].isNested && isDefined(nestedFields)) { | ||
list = list.concat(listFields(nestedFields)); | ||
} | ||
@@ -157,3 +185,4 @@ } | ||
} | ||
module.exports = { ParquetSchema }; | ||
//# sourceMappingURL=schema.js.map | ||
function isDefined(val) { | ||
return val !== undefined; | ||
} |
@@ -1,32 +0,29 @@ | ||
'use strict'; | ||
const parquet_types = require('./types'); | ||
const parquet_schema = require('./schema'); | ||
/** | ||
* 'Shred' a record into a list of <value, repetition_level, definition_level> | ||
* tuples per column using the Google Dremel Algorithm.. | ||
* | ||
* The buffer argument must point to an object into which the shredded record | ||
* will be returned. You may re-use the buffer for repeated calls to this function | ||
* to append to an existing buffer, as long as the schema is unchanged. | ||
* | ||
* The format in which the shredded records will be stored in the buffer is as | ||
* follows: | ||
* | ||
* buffer = { | ||
* columnData: [ | ||
* 'my_col': { | ||
* dlevels: [d1, d2, .. dN], | ||
* rlevels: [r1, r2, .. rN], | ||
* values: [v1, v2, .. vN], | ||
* }, ... | ||
* ], | ||
* rowCount: X, | ||
* } | ||
* | ||
*/ | ||
exports.shredRecord = function (schema, record, buffer) { | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.materializeRecords = exports.shredRecord = void 0; | ||
const parquet_types = __importStar(require("./types")); | ||
const shredRecord = function (schema, record, buffer) { | ||
/* shred the record, this may raise an exception */ | ||
var recordShredded = {}; | ||
for (let field of schema.fieldList) { | ||
recordShredded[field.path] = { | ||
recordShredded[field.path.join(',')] = { | ||
dlevels: [], | ||
@@ -47,3 +44,4 @@ rlevels: [], | ||
for (let field of schema.fieldList) { | ||
buffer.columnData[field.path] = { | ||
let path = field.path.join(','); | ||
buffer.columnData[path] = { | ||
dlevels: [], | ||
@@ -55,3 +53,3 @@ rlevels: [], | ||
}; | ||
buffer.pages[field.path] = []; | ||
buffer.pages[path] = []; | ||
} | ||
@@ -62,4 +60,5 @@ } | ||
for (let field of schema.fieldList) { | ||
let record = recordShredded[field.path]; | ||
let column = buffer.columnData[field.path]; | ||
let path = field.path.join(','); | ||
let record = recordShredded[path]; | ||
let column = buffer.columnData[path]; | ||
for (let i = 0; i < record.rlevels.length; i++) { | ||
@@ -72,6 +71,7 @@ column.rlevels.push(record.rlevels[i]); | ||
} | ||
[...recordShredded[field.path].distinct_values].forEach(value => buffer.columnData[field.path].distinct_values.add(value)); | ||
buffer.columnData[field.path].count += recordShredded[field.path].count; | ||
[...recordShredded[path].distinct_values].forEach(value => buffer.columnData[path].distinct_values.add(value)); | ||
buffer.columnData[path].count += recordShredded[path].count; | ||
} | ||
}; | ||
exports.shredRecord = shredRecord; | ||
function shredRecordInternal(fields, record, data, rlvl, dlvl) { | ||
@@ -81,8 +81,18 @@ for (let fieldName in fields) { | ||
const fieldType = field.originalType || field.primitiveType; | ||
const path = field.path.join(','); | ||
// fetch values | ||
let values = []; | ||
if (record && (fieldName in record) && record[fieldName] !== undefined && record[fieldName] !== null) { | ||
if (record[fieldName].constructor === Array) { | ||
if (Array.isArray(record[fieldName])) { | ||
values = record[fieldName]; | ||
} | ||
else if (ArrayBuffer.isView(record[fieldName])) { // checks if any typed array | ||
if (record[fieldName] instanceof Uint8Array) { | ||
// wrap in a buffer, since not supported by parquet_thrift | ||
values.push(Buffer.from(record[fieldName])); | ||
} | ||
else { | ||
throw Object.prototype.toString.call(record[fieldName]) + ' is not supported'; | ||
} | ||
} | ||
else { | ||
@@ -101,9 +111,9 @@ values.push(record[fieldName]); | ||
if (values.length == 0) { | ||
if (field.isNested) { | ||
if (field.isNested && isDefined(field.fields)) { | ||
shredRecordInternal(field.fields, null, data, rlvl, dlvl); | ||
} | ||
else { | ||
data[field.path].rlevels.push(rlvl); | ||
data[field.path].dlevels.push(dlvl); | ||
data[field.path].count += 1; | ||
data[path].rlevels.push(rlvl); | ||
data[path].dlevels.push(dlvl); | ||
data[path].count += 1; | ||
} | ||
@@ -115,11 +125,11 @@ continue; | ||
const rlvl_i = i === 0 ? rlvl : field.rLevelMax; | ||
if (field.isNested) { | ||
if (field.isNested && isDefined(field.fields)) { | ||
shredRecordInternal(field.fields, values[i], data, rlvl_i, field.dLevelMax); | ||
} | ||
else { | ||
data[field.path].distinct_values.add(values[i]); | ||
data[field.path].values.push(parquet_types.toPrimitive(fieldType, values[i])); | ||
data[field.path].rlevels.push(rlvl_i); | ||
data[field.path].dlevels.push(field.dLevelMax); | ||
data[field.path].count += 1; | ||
data[path].distinct_values.add(values[i]); | ||
data[path].values.push(parquet_types.toPrimitive(fieldType, values[i])); | ||
data[path].rlevels.push(rlvl_i); | ||
data[path].dlevels.push(field.dLevelMax); | ||
data[path].count += 1; | ||
} | ||
@@ -149,3 +159,3 @@ } | ||
*/ | ||
exports.materializeRecords = function (schema, buffer, records) { | ||
const materializeRecords = function (schema, buffer, records) { | ||
if (!records) { | ||
@@ -175,2 +185,3 @@ records = []; | ||
}; | ||
exports.materializeRecords = materializeRecords; | ||
function materializeRecordField(record, branch, rLevels, dLevel, value) { | ||
@@ -186,10 +197,12 @@ const node = branch[0]; | ||
} | ||
while (record[node.name].length < rLevels[0] + 1) { | ||
record[node.name].push({}); | ||
const recordValue = record[node.name]; | ||
while (recordValue.length < rLevels[0] + 1) { | ||
recordValue.push({}); | ||
} | ||
materializeRecordField(record[node.name][rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value); | ||
materializeRecordField(recordValue[rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value); | ||
} | ||
else { | ||
record[node.name] = record[node.name] || {}; | ||
materializeRecordField(record[node.name], branch.slice(1), rLevels, dLevel, value); | ||
const recordValue = record[node.name]; | ||
materializeRecordField(recordValue, branch.slice(1), rLevels, dLevel, value); | ||
} | ||
@@ -202,6 +215,7 @@ } | ||
} | ||
while (record[node.name].length < rLevels[0] + 1) { | ||
record[node.name].push(null); | ||
const recordValue = record[node.name]; | ||
while (recordValue.length < rLevels[0] + 1) { | ||
recordValue.push(null); | ||
} | ||
record[node.name][rLevels[0]] = value; | ||
recordValue[rLevels[0]] = value; | ||
} | ||
@@ -213,2 +227,4 @@ else { | ||
} | ||
//# sourceMappingURL=shred.js.map | ||
function isDefined(val) { | ||
return val !== undefined; | ||
} |
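The shredded buffer is now keyed by the comma-joined field path rather than the path array itself. A rough round-trip sketch using this package's exported shredder (the schema and record below are illustrative, not taken from the source): | ||
``` js | ||
const { ParquetSchema, ParquetShredder } = require('@dsnp/parquetjs'); | ||
 | ||
const schema = new ParquetSchema({ | ||
  name: { type: 'UTF8' }, | ||
  qty: { type: 'INT64' } | ||
}); | ||
 | ||
// Shred one record into an empty buffer; shredRecord initializes the buffer on first use. | ||
const buffer = {}; | ||
ParquetShredder.shredRecord(schema, { name: 'apple', qty: 10 }, buffer); | ||
 | ||
// Column data is keyed by the joined path, e.g. buffer.columnData['name'] holds | ||
// { dlevels, rlevels, values, distinct_values, count } for that column. | ||
const rows = ParquetShredder.materializeRecords(schema, buffer); | ||
// rows[0] resembles the original record. | ||
``` | ||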
'use strict'; | ||
const BSON = require('bson'); | ||
const PARQUET_LOGICAL_TYPES = { | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.fromPrimitive = exports.toPrimitive = exports.PARQUET_LOGICAL_TYPES = void 0; | ||
const BSON = __importStar(require("bson")); | ||
exports.PARQUET_LOGICAL_TYPES = { | ||
'BOOLEAN': { | ||
@@ -142,7 +163,8 @@ primitiveType: 'BOOLEAN', | ||
function toPrimitive(type, value) { | ||
if (!(type in PARQUET_LOGICAL_TYPES)) { | ||
throw 'invalid type: ' + type; | ||
if (type === undefined || !(type in exports.PARQUET_LOGICAL_TYPES)) { | ||
throw 'invalid type: ' + (type || "undefined"); | ||
} | ||
return PARQUET_LOGICAL_TYPES[type].toPrimitive(value); | ||
return exports.PARQUET_LOGICAL_TYPES[type].toPrimitive(value); | ||
} | ||
exports.toPrimitive = toPrimitive; | ||
/** | ||
@@ -153,7 +175,8 @@ * Convert a value from it's internal/underlying primitive representation to | ||
function fromPrimitive(type, value) { | ||
if (!(type in PARQUET_LOGICAL_TYPES)) { | ||
throw 'invalid type: ' + type; | ||
if (type === undefined || !(type in exports.PARQUET_LOGICAL_TYPES)) { | ||
throw 'invalid type: ' + (type || "undefined"); | ||
} | ||
if ("fromPrimitive" in PARQUET_LOGICAL_TYPES[type]) { | ||
return PARQUET_LOGICAL_TYPES[type].fromPrimitive(value); | ||
const typeFromPrimitive = exports.PARQUET_LOGICAL_TYPES[type].fromPrimitive; | ||
if (typeFromPrimitive !== undefined) { | ||
return typeFromPrimitive(value); | ||
} | ||
@@ -164,2 +187,3 @@ else { | ||
} | ||
exports.fromPrimitive = fromPrimitive; | ||
function toPrimitive_BOOLEAN(value) { | ||
@@ -172,77 +196,128 @@ return !!value; | ||
function toPrimitive_FLOAT(value) { | ||
const v = parseFloat(value); | ||
if (isNaN(v)) { | ||
throw 'invalid value for FLOAT: ' + value; | ||
if (typeof value === 'string') { | ||
const v = parseFloat(value); | ||
return v; | ||
} | ||
return v; | ||
else if (typeof value === 'number') { | ||
return value; | ||
} | ||
throw 'invalid value for FLOAT: ' + value; | ||
} | ||
function toPrimitive_DOUBLE(value) { | ||
const v = parseFloat(value); | ||
if (isNaN(v)) { | ||
throw 'invalid value for DOUBLE: ' + value; | ||
if (typeof value === 'string') { | ||
const v = parseFloat(value); | ||
return v; | ||
} | ||
return v; | ||
else if (typeof value === 'number') { | ||
return value; | ||
} | ||
throw 'invalid value for DOUBLE: ' + value; | ||
} | ||
function toPrimitive_INT8(value) { | ||
const v = parseInt(value, 10); | ||
if (v < -0x80 || v > 0x7f || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x80, 0x7f, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT8: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_UINT8(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || v > 0xff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(0, 0xff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for UINT8: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_INT16(value) { | ||
const v = parseInt(value, 10); | ||
if (v < -0x8000 || v > 0x7fff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x8000, 0x7fff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT16: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_UINT16(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || v > 0xffff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(0, 0xffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for UINT16: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_INT32(value) { | ||
const v = parseInt(value, 10); | ||
if (v < -0x80000000 || v > 0x7fffffff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x80000000, 0x7fffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT32: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_UINT32(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || v > 0xffffffffffff || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(0, 0xffffffffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for UINT32: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_INT64(value) { | ||
const v = parseInt(value, 10); | ||
if (isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x8000000000000000, 0x7fffffffffffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT64: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_UINT64(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(0, 0xffffffffffffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for UINT64: ' + value; | ||
} | ||
return v; | ||
} | ||
function toPrimitive_INT96(value) { | ||
const v = parseInt(value, 10); | ||
if (isNaN(v)) { | ||
try { | ||
let v = value; | ||
if (typeof v === 'string') | ||
v = BigInt(value); | ||
checkValidValue(-0x800000000000000000000000, 0x7fffffffffffffffffffffff, v); | ||
return v; | ||
} | ||
catch { | ||
throw 'invalid value for INT96: ' + value; | ||
} | ||
return v; | ||
} | ||
@@ -265,12 +340,13 @@ function toPrimitive_BYTE_ARRAY(value) { | ||
function toPrimitive_BSON(value) { | ||
var encoder = new BSON(); | ||
return Buffer.from(encoder.serialize(value)); | ||
return Buffer.from(BSON.serialize(value)); | ||
} | ||
function fromPrimitive_BSON(value) { | ||
var decoder = new BSON(); | ||
return decoder.deserialize(value); | ||
return BSON.deserialize(value); | ||
} | ||
function toPrimitive_TIME_MILLIS(value) { | ||
const v = parseInt(value, 10); | ||
if (v < 0 || v > 0xffffffffffffffff || isNaN(v)) { | ||
let v = value; | ||
if (typeof value === `string`) { | ||
v = parseInt(value, 10); | ||
} | ||
if (v < 0 || v > 0xffffffffffffffff || typeof v !== 'number') { | ||
throw 'invalid value for TIME_MILLIS: ' + value; | ||
@@ -282,3 +358,3 @@ } | ||
const v = BigInt(value); | ||
if (v < 0n || isNaN(v)) { | ||
if (v < 0n) { | ||
throw 'invalid value for TIME_MICROS: ' + value; | ||
@@ -295,9 +371,10 @@ } | ||
/* convert from integer */ | ||
{ | ||
const v = parseInt(value, 10); | ||
if (v < 0 || isNaN(v)) { | ||
throw 'invalid value for DATE: ' + value; | ||
} | ||
return v; | ||
let v = value; | ||
if (typeof value === 'string') { | ||
v = parseInt(value, 10); | ||
} | ||
if (v < 0 || typeof v !== 'number') { | ||
throw 'invalid value for DATE: ' + value; | ||
} | ||
return v; | ||
} | ||
@@ -313,9 +390,10 @@ function fromPrimitive_DATE(value) { | ||
/* convert from integer */ | ||
{ | ||
const v = parseInt(value, 10); | ||
if (v < 0 || isNaN(v)) { | ||
throw 'invalid value for TIMESTAMP_MILLIS: ' + value; | ||
} | ||
return v; | ||
let v = value; | ||
if (typeof value === 'string') { | ||
v = parseInt(value, 10); | ||
} | ||
if (v < 0 || typeof v !== 'number') { | ||
throw 'invalid value for TIMESTAMP_MILLIS: ' + value; | ||
} | ||
return v; | ||
} | ||
@@ -340,3 +418,3 @@ function fromPrimitive_TIMESTAMP_MILLIS(value) { | ||
function fromPrimitive_TIMESTAMP_MICROS(value) { | ||
return new Date(parseInt(value / 1000n)); | ||
return typeof value === 'bigint' ? new Date(Number(value / 1000n)) : new Date(value / 1000); | ||
} | ||
@@ -360,3 +438,6 @@ function toPrimitive_INTERVAL(value) { | ||
} | ||
module.exports = { PARQUET_LOGICAL_TYPES, toPrimitive, fromPrimitive }; | ||
//# sourceMappingURL=types.js.map | ||
function checkValidValue(lowerRange, upperRange, v) { | ||
if (v < lowerRange || v > upperRange) { | ||
throw "invalid value"; | ||
} | ||
} |
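The reworked converters above accept either native numbers or numeric strings, coercing strings through BigInt before range-checking. A minimal sketch of the resulting behaviour (the require path to the compiled lib/types module is an assumption): | ||
``` js | ||
const parquet_types = require('./lib/types'); | ||
 | ||
parquet_types.toPrimitive('INT32', 42);                  // numbers pass the range check unchanged -> 42 | ||
parquet_types.toPrimitive('INT64', '9007199254740993');  // strings are coerced via BigInt -> 9007199254740993n | ||
parquet_types.toPrimitive('UINT8', 300);                 // out of range -> throws 'invalid value for UINT8: 300' | ||
``` | ||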
@@ -132,2 +132,10 @@ "use strict"; | ||
fromPrimitive: fromPrimitive_INTERVAL | ||
}, | ||
MAP: { | ||
originalType: 'MAP', | ||
toPrimitive: toPrimitive_MAP, | ||
}, | ||
LIST: { | ||
originalType: 'LIST', | ||
toPrimitive: toPrimitive_LIST, | ||
} | ||
@@ -246,2 +254,8 @@ }; | ||
} | ||
function toPrimitive_MAP(value) { | ||
return value; | ||
} | ||
function toPrimitive_LIST(value) { | ||
return value; | ||
} | ||
function toPrimitive_BYTE_ARRAY(value) { | ||
@@ -351,2 +365,1 @@ return Buffer.from(value); | ||
} | ||
//# sourceMappingURL=parquet_primitives.js.map |
"use strict"; | ||
// Lifted from https://github.com/kbajalc/parquets | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.NewPageHeader = exports.NewFileMetaData = void 0; | ||
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types")); | ||
; | ||
//# sourceMappingURL=types.js.map | ||
class NewFileMetaData extends parquet_types_1.default.FileMetaData { | ||
json; | ||
//@ts-ignore | ||
row_groups; | ||
constructor() { | ||
super(); | ||
} | ||
} | ||
exports.NewFileMetaData = NewFileMetaData; | ||
class NewPageHeader extends parquet_types_1.default.PageHeader { | ||
offset; | ||
headerSize; | ||
constructor() { | ||
super(); | ||
} | ||
} | ||
exports.NewPageHeader = NewPageHeader; |
@@ -1,61 +0,61 @@ | ||
'use strict'; | ||
const fs = require('fs'); | ||
const thrift = require('thrift'); | ||
const parquet_thrift = require('../gen-nodejs/parquet_types'); | ||
/** We need to use a patched version of TFramedTransport where | ||
* readString returns the original buffer instead of a string if the | ||
* buffer can not be safely encoded as utf8 (see http://bit.ly/2GXeZEF) | ||
*/ | ||
class fixedTFramedTransport extends thrift.TFramedTransport { | ||
readString(len) { | ||
this.ensureAvailable(len); | ||
var buffer = this.inBuf.slice(this.readPos, this.readPos + len); | ||
var str = this.inBuf.toString('utf8', this.readPos, this.readPos + len); | ||
this.readPos += len; | ||
return (Buffer.from(str).equals(buffer)) ? str : buffer; | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.fieldIndexOf = exports.osopen = exports.osend = exports.oswrite = exports.fclose = exports.fread = exports.fstat = exports.fopen = exports.getThriftEnum = exports.getBitWidth = exports.decodeThrift = exports.serializeThrift = void 0; | ||
const thrift_1 = __importDefault(require("thrift")); | ||
const fs_1 = __importDefault(require("fs")); | ||
const parquet_thrift = __importStar(require("../gen-nodejs/parquet_types")); | ||
/** | ||
* We need to patch Thrift's TFramedTransport class because the TS type definitions | ||
* do not define a `readPos` field, even though the class implementation has | ||
* one. | ||
*/ | ||
class fixedTFramedTransport extends thrift_1.default.TFramedTransport { | ||
inBuf; | ||
readPos; | ||
constructor(inBuf) { | ||
super(inBuf); | ||
this.inBuf = inBuf; | ||
this.readPos = 0; | ||
} | ||
} | ||
/** Patch PageLocation to be three element array that has getters/setters | ||
* for each of the properties (offset, compressed_page_size, first_row_index) | ||
* This saves space considerably as we do not need to store the full variable | ||
* names for every PageLocation | ||
*/ | ||
const previousPageLocation = parquet_thrift.PageLocation.prototype; | ||
const PageLocation = parquet_thrift.PageLocation.prototype = []; | ||
PageLocation.write = previousPageLocation.write; | ||
PageLocation.read = previousPageLocation.read; | ||
const getterSetter = index => ({ | ||
const getterSetter = (index) => ({ | ||
get: function () { return this[index]; }, | ||
set: function (value) { return this[index] = value; } | ||
}); | ||
Object.defineProperty(PageLocation, 'offset', getterSetter(0)); | ||
Object.defineProperty(PageLocation, 'compressed_page_size', getterSetter(1)); | ||
Object.defineProperty(PageLocation, 'first_row_index', getterSetter(2)); | ||
exports.force32 = function () { | ||
const protocol = thrift.TCompactProtocol.prototype; | ||
protocol.zigzagToI64 = protocol.zigzagToI32; | ||
protocol.readVarint64 = protocol.readVarint32 = function () { | ||
let lo = 0; | ||
let shift = 0; | ||
let b; | ||
while (true) { | ||
b = this.trans.readByte(); | ||
lo = lo | ((b & 0x7f) << shift); | ||
shift += 7; | ||
if (!(b & 0x80)) { | ||
break; | ||
} | ||
} | ||
return lo; | ||
}; | ||
}; | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'offset', getterSetter(0)); | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'compressed_page_size', getterSetter(1)); | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'first_row_index', getterSetter(2)); | ||
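A hypothetical illustration of the patched accessors: a PageLocation decoded from an offset index behaves like a three-element array with named getters (the offsetIndex variable below is assumed to come from readOffsetIndex): | ||
``` js | ||
const loc = offsetIndex.page_locations[0]; | ||
loc.offset === loc[0];                // true: 'offset' is backed by index 0 | ||
loc.compressed_page_size === loc[1];  // true: backed by index 1 | ||
loc.first_row_index === loc[2];       // true: backed by index 2 | ||
``` | ||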
/** | ||
* Helper function that serializes a thrift object into a buffer | ||
*/ | ||
exports.serializeThrift = function (obj) { | ||
const serializeThrift = function (obj) { | ||
let output = []; | ||
let transport = new thrift.TBufferedTransport(null, function (buf) { | ||
const callBack = function (buf) { | ||
output.push(buf); | ||
}); | ||
let protocol = new thrift.TCompactProtocol(transport); | ||
}; | ||
let transport = new thrift_1.default.TBufferedTransport(undefined, callBack); | ||
let protocol = new thrift_1.default.TCompactProtocol(transport); | ||
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872 | ||
obj.write(protocol); | ||
@@ -65,3 +65,4 @@ transport.flush(); | ||
}; | ||
exports.decodeThrift = function (obj, buf, offset) { | ||
exports.serializeThrift = serializeThrift; | ||
const decodeThrift = function (obj, buf, offset) { | ||
if (!offset) { | ||
@@ -72,10 +73,12 @@ offset = 0; | ||
transport.readPos = offset; | ||
var protocol = new thrift.TCompactProtocol(transport); | ||
var protocol = new thrift_1.default.TCompactProtocol(transport); | ||
//@ts-ignore, https://issues.apache.org/jira/browse/THRIFT-3872 | ||
obj.read(protocol); | ||
return transport.readPos - offset; | ||
}; | ||
exports.decodeThrift = decodeThrift; | ||
/** | ||
* Get the number of bits required to store a given value | ||
*/ | ||
exports.getBitWidth = function (val) { | ||
const getBitWidth = function (val) { | ||
if (val === 0) { | ||
@@ -88,6 +91,7 @@ return 0; | ||
}; | ||
exports.getBitWidth = getBitWidth; | ||
/** | ||
* FIXME not ideal that this is linear | ||
*/ | ||
exports.getThriftEnum = function (klass, value) { | ||
const getThriftEnum = function (klass, value) { | ||
for (let k in klass) { | ||
@@ -100,5 +104,6 @@ if (klass[k] === value) { | ||
}; | ||
exports.fopen = function (filePath) { | ||
exports.getThriftEnum = getThriftEnum; | ||
const fopen = function (filePath) { | ||
return new Promise((resolve, reject) => { | ||
fs.open(filePath, 'r', (err, fd) => { | ||
fs_1.default.open(filePath, 'r', (err, fd) => { | ||
if (err) { | ||
@@ -113,5 +118,6 @@ reject(err); | ||
}; | ||
exports.fstat = function (filePath) { | ||
exports.fopen = fopen; | ||
const fstat = function (filePath) { | ||
return new Promise((resolve, reject) => { | ||
fs.stat(filePath, (err, stat) => { | ||
fs_1.default.stat(filePath, (err, stat) => { | ||
if (err) { | ||
@@ -126,6 +132,7 @@ reject(err); | ||
}; | ||
exports.fread = function (fd, position, length) { | ||
exports.fstat = fstat; | ||
const fread = function (fd, position, length) { | ||
let buffer = Buffer.alloc(length); | ||
return new Promise((resolve, reject) => { | ||
fs.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => { | ||
fs_1.default.read(fd, buffer, 0, length, position, (err, bytesRead, buf) => { | ||
if (err || bytesRead != length) { | ||
@@ -140,5 +147,6 @@ reject(err || Error('read failed')); | ||
}; | ||
exports.fclose = function (fd) { | ||
exports.fread = fread; | ||
const fclose = function (fd) { | ||
return new Promise((resolve, reject) => { | ||
fs.close(fd, (err) => { | ||
fs_1.default.close(fd, (err) => { | ||
if (err) { | ||
@@ -153,3 +161,4 @@ reject(err); | ||
}; | ||
exports.oswrite = function (os, buf) { | ||
exports.fclose = fclose; | ||
const oswrite = function (os, buf) { | ||
return new Promise((resolve, reject) => { | ||
@@ -161,3 +170,3 @@ os.write(buf, (err) => { | ||
else { | ||
resolve(); | ||
resolve(err); | ||
} | ||
@@ -167,3 +176,4 @@ }); | ||
}; | ||
exports.osend = function (os) { | ||
exports.oswrite = oswrite; | ||
const osend = function (os) { | ||
return new Promise((resolve, reject) => { | ||
@@ -175,3 +185,3 @@ os.end((err) => { | ||
else { | ||
resolve(); | ||
resolve(err); | ||
} | ||
@@ -181,5 +191,6 @@ }); | ||
}; | ||
exports.osopen = function (path, opts) { | ||
exports.osend = osend; | ||
const osopen = function (path, opts) { | ||
return new Promise((resolve, reject) => { | ||
let outputStream = fs.createWriteStream(path, opts); | ||
let outputStream = fs_1.default.createWriteStream(path, opts); | ||
outputStream.on('open', function (fd) { | ||
@@ -193,3 +204,4 @@ resolve(outputStream); | ||
}; | ||
exports.fieldIndexOf = function (arr, elem) { | ||
exports.osopen = osopen; | ||
const fieldIndexOf = function (arr, elem) { | ||
for (let j = 0; j < arr.length; ++j) { | ||
@@ -212,2 +224,2 @@ if (arr[j].length !== elem.length) { | ||
}; | ||
//# sourceMappingURL=util.js.map | ||
exports.fieldIndexOf = fieldIndexOf; |
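serializeThrift and decodeThrift are inverses of each other; a rough usage sketch (the PageHeader field names come from the generated parquet_types module, and the require paths are assumptions): | ||
``` js | ||
const parquet_util = require('./lib/util'); | ||
const parquet_thrift = require('./gen-nodejs/parquet_types'); | ||
 | ||
const header = new parquet_thrift.PageHeader(); | ||
header.type = parquet_thrift.PageType.DATA_PAGE; | ||
header.uncompressed_page_size = 128; | ||
header.compressed_page_size = 128; | ||
 | ||
const buf = parquet_util.serializeThrift(header);          // Buffer of compact-protocol bytes | ||
const decoded = new parquet_thrift.PageHeader(); | ||
const bytesRead = parquet_util.decodeThrift(decoded, buf); // number of bytes consumed | ||
``` | ||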
'use strict'; | ||
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { | ||
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } | ||
return new (P || (P = Promise))(function (resolve, reject) { | ||
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } | ||
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } | ||
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } | ||
step((generator = generator.apply(thisArg, _arguments || [])).next()); | ||
}); | ||
}; | ||
const stream = require('stream'); | ||
@@ -48,7 +39,5 @@ const parquet_thrift = require('../gen-nodejs/parquet_types'); | ||
*/ | ||
static openFile(schema, path, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let outputStream = yield parquet_util.osopen(path, opts); | ||
return ParquetWriter.openStream(schema, outputStream, opts); | ||
}); | ||
static async openFile(schema, path, opts) { | ||
let outputStream = await parquet_util.osopen(path, opts); | ||
return ParquetWriter.openStream(schema, outputStream, opts); | ||
} | ||
@@ -59,10 +48,8 @@ /** | ||
*/ | ||
static openStream(schema, outputStream, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (!opts) { | ||
opts = {}; | ||
} | ||
let envelopeWriter = yield ParquetEnvelopeWriter.openStream(schema, outputStream, opts); | ||
return new ParquetWriter(schema, envelopeWriter, opts); | ||
}); | ||
static async openStream(schema, outputStream, opts) { | ||
if (!opts) { | ||
opts = {}; | ||
} | ||
let envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts); | ||
return new ParquetWriter(schema, envelopeWriter, opts); | ||
} | ||
@@ -91,21 +78,19 @@ /** | ||
*/ | ||
appendRow(row) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (this.closed) { | ||
throw 'writer was closed'; | ||
} | ||
parquet_shredder.shredRecord(this.schema, row, this.rowBuffer); | ||
const options = { | ||
useDataPageV2: this.envelopeWriter.useDataPageV2, | ||
bloomFilters: this.envelopeWriter.bloomFilters | ||
}; | ||
if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) { | ||
encodePages(this.schema, this.rowBuffer, options); | ||
} | ||
if (this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
encodePages(this.schema, this.rowBuffer, options); | ||
yield this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
}); | ||
async appendRow(row) { | ||
if (this.closed) { | ||
throw 'writer was closed'; | ||
} | ||
parquet_shredder.shredRecord(this.schema, row, this.rowBuffer); | ||
const options = { | ||
useDataPageV2: this.envelopeWriter.useDataPageV2, | ||
bloomFilters: this.envelopeWriter.bloomFilters | ||
}; | ||
if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) { | ||
await encodePages(this.schema, this.rowBuffer, options); | ||
} | ||
if (this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
await encodePages(this.schema, this.rowBuffer, options); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
} | ||
@@ -118,22 +103,20 @@ /** | ||
*/ | ||
close(callback) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
if (this.closed) { | ||
throw 'writer was closed'; | ||
} | ||
this.closed = true; | ||
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
yield this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
yield this.envelopeWriter.writeBloomFilters(); | ||
yield this.envelopeWriter.writeIndex(); | ||
yield this.envelopeWriter.writeFooter(this.userMetadata); | ||
yield this.envelopeWriter.close(); | ||
this.envelopeWriter = null; | ||
if (callback) { | ||
callback(); | ||
} | ||
}); | ||
async close(callback) { | ||
if (this.closed) { | ||
throw 'writer was closed'; | ||
} | ||
this.closed = true; | ||
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
await this.envelopeWriter.writeBloomFilters(); | ||
await this.envelopeWriter.writeIndex(); | ||
await this.envelopeWriter.writeFooter(this.userMetadata); | ||
await this.envelopeWriter.close(); | ||
this.envelopeWriter = null; | ||
if (callback) { | ||
callback(); | ||
} | ||
} | ||
@@ -173,8 +156,6 @@ /** | ||
*/ | ||
static openStream(schema, outputStream, opts) { | ||
return __awaiter(this, void 0, void 0, function* () { | ||
let writeFn = parquet_util.oswrite.bind(undefined, outputStream); | ||
let closeFn = parquet_util.osend.bind(undefined, outputStream); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts); | ||
}); | ||
static async openStream(schema, outputStream, opts) { | ||
let writeFn = parquet_util.oswrite.bind(undefined, outputStream); | ||
let closeFn = parquet_util.osend.bind(undefined, outputStream); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts); | ||
} | ||
@@ -210,4 +191,4 @@ constructor(schema, writeFn, closeFn, fileOffset, opts) { | ||
*/ | ||
writeRowGroup(records) { | ||
let rgroup = encodeRowGroup(this.schema, records, { | ||
async writeRowGroup(records) { | ||
let rgroup = await encodeRowGroup(this.schema, records, { | ||
baseOffset: this.offset, | ||
@@ -309,3 +290,3 @@ pageSize: this.pageSize, | ||
_flush(callback) { | ||
this.writer.close(callback) | ||
this.writer.close() | ||
.then(d => callback(null, d), callback); | ||
@@ -343,3 +324,3 @@ } | ||
} | ||
function encodePages(schema, rowBuffer, opts) { | ||
async function encodePages(schema, rowBuffer, opts) { | ||
if (!rowBuffer.pageRowCount) { | ||
@@ -373,6 +354,6 @@ return; | ||
if (opts.useDataPageV2) { | ||
page = encodeDataPageV2(field, values.count, values.values, values.rlevels, values.dlevels, statistics); | ||
page = await encodeDataPageV2(field, values.count, values.values, values.rlevels, values.dlevels, statistics); | ||
} | ||
else { | ||
page = encodeDataPage(field, values.values, values.rlevels, values.dlevels, statistics); | ||
page = await encodeDataPage(field, values.values, values.rlevels, values.dlevels, statistics); | ||
} | ||
@@ -400,3 +381,3 @@ let pages = rowBuffer.pages[field.path]; | ||
*/ | ||
function encodeDataPage(column, values, rlevels, dlevels, statistics) { | ||
async function encodeDataPage(column, values, rlevels, dlevels, statistics) { | ||
/* encode values */ | ||
@@ -418,3 +399,3 @@ let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
let pageBody = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]); | ||
pageBody = parquet_compression.deflate(column.compression, pageBody); | ||
pageBody = await parquet_compression.deflate(column.compression, pageBody); | ||
let pageHeader = new parquet_thrift.PageHeader(); | ||
@@ -440,3 +421,3 @@ pageHeader.type = parquet_thrift.PageType['DATA_PAGE']; | ||
*/ | ||
function encodeDataPageV2(column, rowCount, values, rlevels, dlevels, statistics) { | ||
async function encodeDataPageV2(column, rowCount, values, rlevels, dlevels, statistics) { | ||
/* encode values */ | ||
@@ -447,3 +428,3 @@ let valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { | ||
}); | ||
let valuesBufCompressed = parquet_compression.deflate(column.compression, valuesBuf); | ||
let valuesBufCompressed = await parquet_compression.deflate(column.compression, valuesBuf); | ||
/* encode repetition and definition levels */ | ||
@@ -494,3 +475,3 @@ let rLevelsBuf = Buffer.alloc(0); | ||
*/ | ||
function encodeColumnChunk(pages, opts) { | ||
async function encodeColumnChunk(pages, opts) { | ||
let pagesBuf = Buffer.concat(pages.map(d => d.page)); | ||
@@ -508,3 +489,3 @@ let num_values = pages.reduce((p, d) => p + d.num_values, 0); | ||
metadata.type = parquet_thrift.Type[opts.column.primitiveType]; | ||
metadata.codec = parquet_thrift.CompressionCodec[opts.column.compression]; | ||
metadata.codec = await parquet_thrift.CompressionCodec[opts.column.compression]; | ||
/* compile statistics ColumnIndex and OffsetIndex*/ | ||
@@ -568,3 +549,3 @@ let columnIndex = new parquet_thrift.ColumnIndex(); | ||
*/ | ||
function encodeRowGroup(schema, data, opts) { | ||
async function encodeRowGroup(schema, data, opts) { | ||
let metadata = new parquet_thrift.RowGroup(); | ||
@@ -579,3 +560,3 @@ metadata.num_rows = data.rowCount; | ||
} | ||
let cchunkData = encodeColumnChunk(data.pages[field.path], { | ||
let cchunkData = await encodeColumnChunk(data.pages[field.path], { | ||
column: field, | ||
@@ -604,3 +585,3 @@ baseOffset: opts.baseOffset + body.length, | ||
metadata.version = PARQUET_VERSION; | ||
metadata.created_by = 'parquet.js'; | ||
metadata.created_by = '@dsnp/parquetjs'; | ||
metadata.num_rows = rowCount; | ||
@@ -650,2 +631,1 @@ metadata.row_groups = rowGroups; | ||
}; | ||
//# sourceMappingURL=writer.js.map |
"use strict"; | ||
const reader = require('./lib/reader'); | ||
const writer = require('./lib/writer'); | ||
const schema = require('./lib/schema'); | ||
const shredder = require('./lib/shred'); | ||
const util = require('./lib/util'); | ||
module.exports = { | ||
ParquetEnvelopeReader: reader.ParquetEnvelopeReader, | ||
ParquetReader: reader.ParquetReader, | ||
ParquetEnvelopeWriter: writer.ParquetEnvelopeWriter, | ||
ParquetWriter: writer.ParquetWriter, | ||
ParquetTransformer: writer.ParquetTransformer, | ||
ParquetSchema: schema.ParquetSchema, | ||
ParquetShredder: shredder, | ||
force32: util.force32 | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
//# sourceMappingURL=parquet.js.map | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetShredder = exports.ParquetSchema = exports.ParquetTransformer = exports.ParquetWriter = exports.ParquetEnvelopeWriter = exports.ParquetReader = exports.ParquetEnvelopeReader = void 0; | ||
const reader = __importStar(require("./lib/reader")); | ||
const writer = __importStar(require("./lib/writer")); | ||
const schema = __importStar(require("./lib/schema")); | ||
const shredder = __importStar(require("./lib/shred")); | ||
exports.ParquetEnvelopeReader = reader.ParquetEnvelopeReader; | ||
exports.ParquetReader = reader.ParquetReader; | ||
exports.ParquetEnvelopeWriter = writer.ParquetEnvelopeWriter; | ||
exports.ParquetWriter = writer.ParquetWriter; | ||
exports.ParquetTransformer = writer.ParquetTransformer; | ||
exports.ParquetSchema = schema.ParquetSchema; | ||
exports.ParquetShredder = shredder; |
@@ -5,3 +5,4 @@ { | ||
"main": "dist/parquet.js", | ||
"version": "0.0.0-2ccc4e", | ||
"types": "dist/parquet.d.ts", | ||
"version": "0.0.0-33c80f", | ||
"homepage": "https://github.com/LibertyDSNP/parquetjs", | ||
@@ -18,35 +19,52 @@ "license": "MIT", | ||
"dependencies": { | ||
"@types/brotli": "^1.3.0", | ||
"@types/bson": "^4.0.3", | ||
"@types/express": "^4.17.11", | ||
"@types/long": "^4.0.1", | ||
"@types/node": "^14.14.35", | ||
"@types/request": "^2.48.5", | ||
"@types/sinon": "^10.0.0", | ||
"brotli": "^1.3.0", | ||
"bson": "^2.0.8", | ||
"@types/varint": "^6.0.0", | ||
"browserify-zlib": "^0.2.0", | ||
"bson": "4.4.0", | ||
"cross-fetch": "^3.1.4", | ||
"int53": "^0.2.4", | ||
"lodash": "^4.17.21", | ||
"long": "^4.0.0", | ||
"lzo": "^0.4.0", | ||
"object-stream": "0.0.1", | ||
"snappyjs": "^0.6.0", | ||
"thrift": "^0.14.1", | ||
"typescript": "^4.2.3", | ||
"thrift": "0.14.1", | ||
"varint": "^5.0.0", | ||
"wasm-brotli": "^2.0.2", | ||
"xxhash-wasm": "^0.4.1" | ||
}, | ||
"devDependencies": { | ||
"@babel/core": "7.13.10", | ||
"@babel/preset-env": "7.13.12", | ||
"@babel/preset-typescript": "7.13.0", | ||
"@babel/core": "^7.14.6", | ||
"@babel/preset-env": "^7.14.7", | ||
"@babel/preset-typescript": "^7.14.5", | ||
"@types/bson": "^4.0.3", | ||
"@types/chai": "^4.2.16", | ||
"@types/long": "^4.0.1", | ||
"@types/mocha": "^8.2.2", | ||
"@types/node": "^14.14.35", | ||
"@types/sinon": "^10.0.0", | ||
"@types/thrift": "^0.10.10", | ||
"assert": "^2.0.0", | ||
"babel-loader": "^8.2.2", | ||
"babel-plugin-add-module-exports": "^1.0.4", | ||
"browserfs": "^1.4.3", | ||
"buffer": "^6.0.3", | ||
"chai": "4.3.4", | ||
"mocha": "8.3.2", | ||
"core-js": "^3.15.1", | ||
"esbuild": "^0.14.38", | ||
"mocha": "9.2.2", | ||
"msw": "^0.39.2", | ||
"object-stream": "0.0.1", | ||
"process": "^0.11.10", | ||
"regenerator-runtime": "^0.13.7", | ||
"sinon": "^10.0.0", | ||
"ts-node": "^9.1.1" | ||
"sinon-chai": "^3.7.0", | ||
"sinon-chai-in-order": "^0.1.0", | ||
"source-map-loader": "^3.0.0", | ||
"stream-browserify": "^3.0.0", | ||
"ts-loader": "^9.2.3", | ||
"ts-node": "^9.1.1", | ||
"typescript": "^4.5.2" | ||
}, | ||
"scripts": { | ||
"build": "tsc -b", | ||
"build": "npm run build:node && npm run build:types", | ||
"build:types": "tsc -p tsconfig.types.json && cp gen-nodejs/parquet_types.d.ts dist/gen-nodejs/parquet_types.d.ts", | ||
"build:node": "tsc -b", | ||
"build:browser": "node esbuild.js", | ||
"type": "tsc --noEmit", | ||
@@ -56,4 +74,16 @@ "lint": "echo 'Linting, it is on the TODO list...'", | ||
"clean": "rm -Rf ./dist", | ||
"prepare": "npm run build" | ||
"prepublishOnly": "npm run clean && npm run build:node && npm run build:types && npm run build:browser", | ||
"thrift": "thrift -out gen-nodejs --gen js:ts parquet.thrift && thrift -out gen-nodejs --gen js:node parquet.thrift", | ||
"serve": "node esbuild-serve.js" | ||
}, | ||
"browser": { | ||
"assert": "assert", | ||
"events": "events", | ||
"fs": "browserfs", | ||
"path": "path-browserify", | ||
"stream": "readable-stream", | ||
"thrift": "./node_modules/thrift/lib/nodejs/lib/thrift/browser.js", | ||
"util": "util", | ||
"zlib": "browserify-zlib" | ||
}, | ||
"engines": { | ||
@@ -60,0 +90,0 @@ "node": ">=14.16.0" |
@@ -6,3 +6,3 @@ # parquet.js | ||
[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) | ||
[![npm version](https://badge.fury.io/js/%40unfinishedlabs%2Fparquet-js.svg)](https://badge.fury.io/js/%40unfinishedlabs%2Fparquet-js) | ||
[![npm version](https://badge.fury.io/js/%40unfinishedlabs%2Fparquet-js.svg)](https://badge.fury.io/js/%40dsnp%2Fparquetjs) | ||
@@ -18,4 +18,4 @@ This package contains a fully asynchronous, pure JavaScript implementation of | ||
Forked Notice | ||
------------- | ||
## Forked Notice | ||
This is a forked repository with code from various sources: | ||
@@ -25,7 +25,5 @@ - Primary source [ironSource](https://github.com/ironSource/parquetjs) [npm: parquetjs](https://www.npmjs.com/package/parquetjs) | ||
Installation | ||
------------ | ||
## Installation | ||
_parquet.js requires node.js >= 14.16.0_ | ||
To use parquet.js with node.js, install it using npm: | ||
``` | ||
@@ -35,8 +33,21 @@ $ npm install @dsnp/parquetjs | ||
_parquet.js requires node.js >= 7.6.0_ | ||
### NodeJS | ||
To use with nodejs: | ||
```javascript | ||
import parquetjs from "@dsnp/parquetjs" | ||
``` | ||
### Browser | ||
To use in a browser, configure your bundler (whichever plugin or resolver fits your setup) to point to: | ||
```javascript | ||
"node_modules/@dsnp/parquetjs/browser/parquetjs" | ||
``` | ||
or: | ||
Usage: Writing files | ||
-------------------- | ||
```javascript | ||
import parquetjs from "@dsnp/parquetjs/browser/parquetjs" | ||
``` | ||
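For example, with webpack this mapping could be expressed as a resolve alias. This is only an illustrative sketch (webpack and the alias layout are assumptions, not part of this package's docs); adapt it to your own bundler:

```javascript
// webpack.config.js (hypothetical sketch; adjust for your build setup)
module.exports = {
  resolve: {
    alias: {
      // Redirect imports of the package to the prebuilt browser bundle
      '@dsnp/parquetjs': '@dsnp/parquetjs/browser/parquetjs',
    },
  },
};
```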
## Usage: Writing files | ||
Once you have installed the parquet.js library, you can import it as a single | ||
@@ -46,3 +57,3 @@ module: | ||
``` js | ||
var parquet = require('parquetjs-lite'); | ||
var parquet = require('@dsnp/parquetjs'); | ||
``` | ||
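As a minimal sketch of the write path (field names and the file path are made up for illustration; the calls follow the upstream parquetjs README, so verify them against this fork):

```javascript
// Declare a schema, open a writer, append rows, then close.
// Run inside an async function; `parquet` is the module required above.
const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },
  quantity: { type: 'INT64' },
});

const writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet');
await writer.appendRow({ name: 'apples', quantity: 10 });
await writer.close();
```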
@@ -118,4 +129,3 @@ | ||
Usage: Reading files | ||
-------------------- | ||
## Usage: Reading files | ||
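A corresponding read-path sketch (reuses the `parquet` import and the `fruits.parquet` file assumed above; the cursor API mirrors the upstream parquetjs README):

```javascript
// Open the file, iterate rows via a cursor, then close the reader.
const reader = await parquet.ParquetReader.openFile('fruits.parquet');
const cursor = reader.getCursor();

let record = null;
while ((record = await cursor.next())) {
  console.log(record); // one row object per iteration
}
await reader.close();
```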
@@ -233,4 +243,3 @@ A parquet reader allows retrieving the rows from a parquet file in order. | ||
Encodings | ||
--------- | ||
## Encodings | ||
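Encodings are chosen per column in the schema definition. A small sketch, assuming the PLAIN and RLE encoding names from the upstream parquetjs README (RLE additionally needs a `typeLength`):

```javascript
// Per-column encoding selection (illustrative field names and values)
const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8', encoding: 'PLAIN' },
  age: { type: 'UINT_32', encoding: 'RLE', typeLength: 7 },
});
```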
@@ -267,4 +276,3 @@ Internally, the Parquet format will store values from each field as consecutive | ||
Optional Fields | ||
--------------- | ||
### Optional Fields | ||
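A field is marked optional in the schema, after which individual rows may omit it. Sketch (field names are illustrative):

```javascript
// 'quantity' may be missing from any given row
const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },
  quantity: { type: 'INT64', optional: true },
});
```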
@@ -286,4 +294,3 @@ By default, all fields are required to be present in each row. You can also mark | ||
Nested Rows & Arrays | ||
-------------------- | ||
### Nested Rows & Arrays | ||
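Nested groups are declared with a `fields` map, and arrays with `repeated: true`. A sketch modeled on the upstream parquetjs README example (names are illustrative):

```javascript
// A row with a repeated scalar column and a repeated nested group
const schema = new parquet.ParquetSchema({
  name: { type: 'UTF8' },
  colours: { type: 'UTF8', repeated: true },
  stock: {
    repeated: true,
    fields: {
      price: { type: 'DOUBLE' },
      quantity: { type: 'INT64' },
    },
  },
});
```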
@@ -358,4 +365,3 @@ Parquet supports nested schemas that allow you to store rows that have a more | ||
Nested Lists for Hive / Athena | ||
----------------------- | ||
### Nested Lists for Hive / Athena | ||
@@ -365,4 +371,3 @@ Lists have to be annotated to be queriable with AWS Athena. See [parquet-format](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists) for more detail and a full working example with comments in the test directory ([`test/list.js`](test/list.js)) | ||
List of Supported Types & Encodings | ||
----------------------------------- | ||
### List of Supported Types & Encodings | ||
@@ -400,4 +405,3 @@ We aim to be feature-complete and add new features as they are added to the | ||
Buffering & Row Group Size | ||
-------------------------- | ||
## Buffering & Row Group Size | ||
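The row group size can be tuned on the writer. A sketch, assuming the `setRowGroupSize` setter from the upstream parquetjs API and reusing the schema from the writing sketch above (the value is arbitrary):

```javascript
// Flush a row group roughly every 8192 buffered rows (illustrative value)
const writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet');
writer.setRowGroupSize(8192);
```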
@@ -418,4 +422,3 @@ When writing a Parquet file, the `ParquetWriter` will buffer rows in memory | ||
Dependencies | ||
------------- | ||
## Dependencies | ||
@@ -425,6 +428,6 @@ Parquet uses [thrift](https://thrift.apache.org/) to encode the schema and other | ||
Notes | ||
----- | ||
## Notes | ||
Currently parquet-cpp doesn't fully support DATA_PAGE_V2. You can work around this | ||
by setting the useDataPageV2 option to false. |
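For that parquet-cpp workaround, the option is passed when opening the writer. Sketch, assuming `useDataPageV2` is accepted in the writer options as the note above implies:

```javascript
// Emit DATA_PAGE (v1) pages so parquet-cpp readers can consume the file
const writer = await parquet.ParquetWriter.openFile(schema, 'fruits.parquet', {
  useDataPageV2: false,
});
```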
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Filesystem access
Supply chain risk: Accesses the file system, and could potentially read sensitive data.
Found 1 instance in 1 package
+ Added @types/varint@^6.0.0
+ Added browserify-zlib@^0.2.0
+ Added cross-fetch@^3.1.4
+ Added wasm-brotli@^2.0.2
+ Added @types/node@22.9.1 (transitive)
+ Added @types/varint@6.0.3 (transitive)
+ Added browserify-zlib@0.2.0 (transitive)
+ Added bson@4.4.0 (transitive)
+ Added buffer@5.7.1 (transitive)
+ Added cross-fetch@3.1.8 (transitive)
+ Added ieee754@1.2.1 (transitive)
+ Added node-fetch@2.7.0 (transitive)
+ Added pako@1.0.11 (transitive)
+ Added thrift@0.14.1 (transitive)
+ Added tr46@0.0.3 (transitive)
+ Added undici-types@6.19.8 (transitive)
+ Added wasm-brotli@2.0.2 (transitive)
+ Added webidl-conversions@3.0.1 (transitive)
+ Added whatwg-url@5.0.0 (transitive)
- Removed @types/brotli@^1.3.0
- Removed @types/bson@^4.0.3
- Removed @types/express@^4.17.11
- Removed @types/long@^4.0.1
- Removed @types/node@^14.14.35
- Removed @types/request@^2.48.5
- Removed @types/sinon@^10.0.0
- Removed brotli@^1.3.0
- Removed lodash@^4.17.21
- Removed lzo@^0.4.0
- Removed object-stream@0.0.1
- Removed typescript@^4.2.3
- Removed @types/body-parser@1.19.5 (transitive)
- Removed @types/brotli@1.3.4 (transitive)
- Removed @types/bson@4.2.4 (transitive)
- Removed @types/caseless@0.12.5 (transitive)
- Removed @types/connect@3.4.38 (transitive)
- Removed @types/express@4.17.21 (transitive)
- Removed @types/express-serve-static-core@4.19.6 (transitive)
- Removed @types/http-errors@2.0.4 (transitive)
- Removed @types/long@4.0.2 (transitive)
- Removed @types/mime@1.3.5 (transitive)
- Removed @types/node@14.18.63 (transitive)
- Removed @types/qs@6.9.17 (transitive)
- Removed @types/range-parser@1.2.7 (transitive)
- Removed @types/request@2.48.12 (transitive)
- Removed @types/send@0.17.4 (transitive)
- Removed @types/serve-static@1.15.7 (transitive)
- Removed @types/sinon@10.0.20 (transitive)
- Removed @types/sinonjs__fake-timers@8.1.5 (transitive)
- Removed @types/tough-cookie@4.0.5 (transitive)
- Removed asynckit@0.4.0 (transitive)
- Removed bindings@1.2.1 (transitive)
- Removed brotli@1.3.3 (transitive)
- Removed bson@2.0.8 (transitive)
- Removed combined-stream@1.0.8 (transitive)
- Removed delayed-stream@1.0.0 (transitive)
- Removed form-data@2.5.2 (transitive)
- Removed lodash@4.17.21 (transitive)
- Removed lzo@0.4.11 (transitive)
- Removed mime-db@1.52.0 (transitive)
- Removed mime-types@2.1.35 (transitive)
- Removed object-stream@0.0.1 (transitive)
- Removed safe-buffer@5.2.1 (transitive)
- Removed thrift@0.14.2 (transitive)
- Removed typescript@4.9.5 (transitive)
Updated bson@4.4.0
Updated thrift@0.14.1