@dsnp/parquetjs
Advanced tools
Comparing version 0.0.0-4bd382 to 0.0.0-555af7
@@ -371,2 +371,1 @@ "use strict"; | ||
exports.default = SplitBlockBloomFilter; | ||
//# sourceMappingURL=sbbf.js.map |
@@ -44,2 +44,1 @@ "use strict"; | ||
exports.default = XxHasher; | ||
//# sourceMappingURL=xxhasher.js.map |
@@ -31,4 +31,3 @@ "use strict"; | ||
return columnChunkDataCollection.filter((columnChunk) => { | ||
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, }, }, } = columnChunk; | ||
return bloomFilterOffsetBuffer; | ||
return columnChunk.column.meta_data?.bloom_filter_offset; | ||
}); | ||
@@ -44,6 +43,6 @@ }; | ||
const parseBloomFilterOffsets = (ColumnChunkDataCollection) => { | ||
return ColumnChunkDataCollection.map((columnChunkData) => { | ||
const { column: { meta_data: { bloom_filter_offset: { buffer: bloomFilterOffsetBuffer }, path_in_schema: pathInSchema, }, }, rowGroupIndex, } = columnChunkData; | ||
return ColumnChunkDataCollection.map(({ rowGroupIndex, column }) => { | ||
const { bloom_filter_offset: bloomOffset, path_in_schema: pathInSchema, } = column.meta_data || {}; | ||
return { | ||
offsetBytes: toInteger(bloomFilterOffsetBuffer), | ||
offsetBytes: toInteger(bloomOffset.buffer), | ||
columnName: pathInSchema.join(","), | ||
@@ -111,2 +110,1 @@ rowGroupIndex, | ||
exports.getBloomFiltersFor = getBloomFiltersFor; | ||
//# sourceMappingURL=bloomFilterReader.js.map |
@@ -65,3 +65,3 @@ "use strict"; | ||
const setFilterOffset = (column, offset) => { | ||
column.meta_data.bloom_filter_offset = offset; | ||
column.meta_data.bloom_filter_offset = parquet_util.cloneInteger(offset); | ||
}; | ||
@@ -75,2 +75,1 @@ exports.setFilterOffset = setFilterOffset; | ||
exports.getSerializedBloomFilterData = getSerializedBloomFilterData; | ||
//# sourceMappingURL=bloomFilterWriter.js.map |
@@ -57,2 +57,1 @@ 'use strict'; | ||
exports.inflate = inflate; | ||
//# sourceMappingURL=compression.js.map |
@@ -19,3 +19,3 @@ "use strict"; | ||
} | ||
async read(offset, length) { | ||
read(offset, length) { | ||
if (!this.scheduled) { | ||
@@ -71,2 +71,1 @@ this.scheduled = true; | ||
; | ||
//# sourceMappingURL=bufferReader.js.map |
@@ -26,2 +26,1 @@ "use strict"; | ||
exports.PLAIN_DICTIONARY = __importStar(require("./plain_dictionary")); | ||
//# sourceMappingURL=index.js.map |
@@ -30,2 +30,1 @@ "use strict"; | ||
exports.decodeValues = decodeValues; | ||
//# sourceMappingURL=plain_dictionary.js.map |
@@ -213,2 +213,1 @@ "use strict"; | ||
exports.decodeValues = decodeValues; | ||
//# sourceMappingURL=plain.js.map |
"use strict"; | ||
// For questions about RLE encoding, see the spec: | ||
// | ||
// https://github.com/apache/parquet-format/blob/master/Encodings.md | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
@@ -25,5 +28,8 @@ return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
let buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8)); | ||
let remainingValue = value; | ||
// This is encoded LSB to MSB, so we pick off the least | ||
// significant byte and shift to get the next one. | ||
for (let i = 0; i < buf.length; ++i) { | ||
buf.writeUInt8(value & 0xff, i); | ||
value >> 8; | ||
buf.writeUInt8(remainingValue & 0xff, i); | ||
remainingValue = remainingValue >> 8; | ||
} | ||
@@ -111,6 +117,9 @@ return Buffer.concat([ | ||
function decodeRunRepeated(cursor, count, opts) { | ||
var bytesNeededForFixedBitWidth = Math.ceil(opts.bitWidth / 8); | ||
let value = 0; | ||
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); ++i) { | ||
value << 8; | ||
value += cursor.buffer[cursor.offset]; | ||
for (let i = 0; i < bytesNeededForFixedBitWidth; ++i) { | ||
const byte = cursor.buffer[cursor.offset]; | ||
// Bytes are stored LSB to MSB, so we need to shift | ||
// each new byte appropriately. | ||
value += byte << (i * 8); | ||
cursor.offset += 1; | ||
@@ -149,2 +158,1 @@ } | ||
exports.decodeValues = decodeValues; | ||
//# sourceMappingURL=rle.js.map |
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
//# sourceMappingURL=types.js.map |
@@ -80,2 +80,1 @@ "use strict"; | ||
} | ||
//# sourceMappingURL=compression.js.map |
@@ -25,3 +25,3 @@ "use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetEnvelopeReader = void 0; | ||
exports.ParquetEnvelopeReader = exports.ParquetReader = void 0; | ||
const node_int64_1 = __importDefault(require("node-int64")); | ||
@@ -186,7 +186,7 @@ const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types")); | ||
if (column.offsetIndex) { | ||
column.offsetIndex.page_locations.forEach(d => { | ||
Promise.resolve(column.offsetIndex).then(offset => (offset.page_locations.forEach(d => { | ||
if (Array.isArray(d)) { | ||
Object.setPrototypeOf(d, parquet_types_1.default.PageLocation.prototype); | ||
} | ||
}); | ||
}))); | ||
} | ||
@@ -215,2 +215,19 @@ }); | ||
/** | ||
* Support `for await` iterators on the reader object | ||
* Uses `ParquetCursor` still under the hood. | ||
* | ||
* ```js | ||
* for await (const record of reader) { | ||
* console.log(record); | ||
* } | ||
* ``` | ||
*/ | ||
async *[Symbol.asyncIterator]() { | ||
const cursor = this.getCursor(); | ||
let record = null; | ||
while (record = await cursor.next()) { | ||
yield record; | ||
} | ||
} | ||
/** | ||
* Return a cursor to the file. You may open more than one cursor and use | ||
@@ -243,3 +260,3 @@ * them concurrently. All cursors become invalid once close() is called on | ||
* Return the number of rows in this file. Note that the number of rows is | ||
* not neccessarily equal to the number of rows in each column. | ||
* not necessarily equal to the number of rows in each column. | ||
*/ | ||
@@ -265,3 +282,3 @@ getRowCount() { | ||
} | ||
exportMetadata(indent) { | ||
async exportMetadata(indent) { | ||
function replacer(_key, value) { | ||
@@ -300,2 +317,14 @@ if (value instanceof parquet_types_1.default.PageLocation) { | ||
const metadata = Object.assign({}, this.metadata, { json: true }); | ||
for (let i = 0; i < metadata.row_groups.length; i++) { | ||
const rowGroup = metadata.row_groups[i]; | ||
for (let j = 0; j < rowGroup.columns.length; j++) { | ||
const column = rowGroup.columns[j]; | ||
if (column.offsetIndex instanceof Promise) { | ||
column.offsetIndex = await column.offsetIndex; | ||
} | ||
if (column.columnIndex instanceof Promise) { | ||
column.columnIndex = await column.columnIndex; | ||
} | ||
} | ||
} | ||
return JSON.stringify(metadata, replacer, indent); | ||
@@ -316,2 +345,3 @@ } | ||
} | ||
exports.ParquetReader = ParquetReader; | ||
/** | ||
@@ -417,10 +447,14 @@ * The parquet envelope reader allows direct, unbuffered access to the individual | ||
let column; | ||
if (!isNaN(row_group)) { | ||
row_group = this.metadata.row_groups[row_group]; | ||
let parsedRowGroup; | ||
if (!isNaN(Number(row_group))) { | ||
parsedRowGroup = this.metadata?.row_groups[Number(row_group)]; | ||
} | ||
else if (row_group instanceof parquet_types_1.default.RowGroup) { | ||
parsedRowGroup = row_group; | ||
} | ||
if (typeof path === 'string') { | ||
if (!row_group) { | ||
if (!parsedRowGroup) { | ||
throw `Missing RowGroup ${row_group}`; | ||
} | ||
column = row_group.columns.find(d => d.meta_data.path_in_schema.join(',') === path); | ||
column = parsedRowGroup.columns.find(d => d.meta_data.path_in_schema.join(',') === path); | ||
if (!column) { | ||
@@ -456,9 +490,5 @@ throw `Column ${path} Not Found`; | ||
Object.defineProperty(offset_index, 'column', { value: column, enumerable: false }); | ||
if (opts && opts.cache) { | ||
column.offsetIndex = offset_index; | ||
} | ||
return offset_index; | ||
}); | ||
if (opts && opts.cache) { | ||
//@ts-ignore | ||
if (opts?.cache) { | ||
column.offsetIndex = data; | ||
@@ -476,5 +506,5 @@ } | ||
} | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then((data) => { | ||
const data = this.read(+column.column_index_offset, column.column_index_length).then((buf) => { | ||
let column_index = new parquet_types_1.default.ColumnIndex(); | ||
parquet_util.decodeThrift(column_index, data); | ||
parquet_util.decodeThrift(column_index, buf); | ||
Object.defineProperty(column_index, 'column', { value: column }); | ||
@@ -489,9 +519,5 @@ // decode the statistics values | ||
} | ||
if (opts && opts.cache) { | ||
column.columnIndex = column_index; | ||
} | ||
return column_index; | ||
}); | ||
if (opts && opts.cache) { | ||
//@ts-ignore | ||
if (opts?.cache) { | ||
column.columnIndex = data; | ||
@@ -508,9 +534,9 @@ } | ||
} | ||
column.meta_data.data_page_offset = page.offset; | ||
column.meta_data.total_compressed_size = page.compressed_page_size; | ||
column.meta_data.data_page_offset = parquet_util.cloneInteger(page.offset); | ||
column.meta_data.total_compressed_size = new node_int64_1.default(page.compressed_page_size); | ||
} | ||
else { | ||
const offsetIndex = column.offsetIndex || await this.readOffsetIndex(column, null, opts); | ||
column.meta_data.data_page_offset = offsetIndex.page_locations[page].offset; | ||
column.meta_data.total_compressed_size = offsetIndex.page_locations[page].compressed_page_size; | ||
const offsetIndex = await this.readOffsetIndex(column, null, opts); | ||
column.meta_data.data_page_offset = parquet_util.cloneInteger(offsetIndex.page_locations[page].offset); | ||
column.meta_data.total_compressed_size = new node_int64_1.default(offsetIndex.page_locations[page].compressed_page_size); | ||
} | ||
@@ -585,3 +611,3 @@ const chunk = await this.readColumnChunk(this.schema, column); | ||
let metadataBuf = await this.read(metadataOffset, metadataSize); | ||
let metadata = new types_1.NewFileMetaData(); | ||
let metadata = new parquet_types_1.default.FileMetaData(); | ||
parquet_util.decodeThrift(metadata, metadataBuf); | ||
@@ -684,3 +710,6 @@ return metadata; | ||
} | ||
if (opts.dictionary) { | ||
// It's possible to have a column chunk where some pages should use | ||
// the dictionary (PLAIN_DICTIONARY for example) and others should | ||
// not (PLAIN for example). | ||
if (opts.dictionary && pageData.useDictionary) { | ||
pageData.values = pageData.values.map(d => opts.dictionary[d]); | ||
@@ -770,3 +799,4 @@ } | ||
values: values, | ||
count: valueCount | ||
count: valueCount, | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' | ||
}; | ||
@@ -821,3 +851,4 @@ } | ||
values: values, | ||
count: valueCount | ||
count: valueCount, | ||
useDictionary: valueEncoding === 'PLAIN_DICTIONARY' || valueEncoding === 'RLE_DICTIONARY' | ||
}; | ||
@@ -884,2 +915,1 @@ } | ||
}; | ||
//# sourceMappingURL=reader.js.map |
@@ -177,2 +177,1 @@ "use strict"; | ||
} | ||
//# sourceMappingURL=schema.js.map |
@@ -219,2 +219,1 @@ "use strict"; | ||
} | ||
//# sourceMappingURL=shred.js.map |
@@ -434,2 +434,1 @@ 'use strict'; | ||
} | ||
//# sourceMappingURL=types.js.map |
@@ -363,2 +363,1 @@ "use strict"; | ||
} | ||
//# sourceMappingURL=parquet_primitives.js.map |
@@ -7,14 +7,5 @@ "use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.NewPageHeader = exports.NewFileMetaData = void 0; | ||
exports.NewPageHeader = void 0; | ||
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types")); | ||
; | ||
class NewFileMetaData extends parquet_types_1.default.FileMetaData { | ||
json; | ||
//@ts-ignore | ||
row_groups; | ||
constructor() { | ||
super(); | ||
} | ||
} | ||
exports.NewFileMetaData = NewFileMetaData; | ||
class NewPageHeader extends parquet_types_1.default.PageHeader { | ||
@@ -28,2 +19,1 @@ offset; | ||
exports.NewPageHeader = NewPageHeader; | ||
//# sourceMappingURL=types.js.map |
@@ -25,6 +25,7 @@ "use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.fieldIndexOf = exports.osopen = exports.osend = exports.oswrite = exports.fclose = exports.fread = exports.fstat = exports.fopen = exports.getThriftEnum = exports.getBitWidth = exports.decodeThrift = exports.serializeThrift = void 0; | ||
exports.cloneInteger = exports.fieldIndexOf = exports.osopen = exports.osend = exports.oswrite = exports.fclose = exports.fread = exports.fstat = exports.fopen = exports.getThriftEnum = exports.getBitWidth = exports.decodeThrift = exports.serializeThrift = void 0; | ||
const thrift_1 = __importDefault(require("thrift")); | ||
const fs_1 = __importDefault(require("fs")); | ||
const parquet_thrift = __importStar(require("../gen-nodejs/parquet_types")); | ||
const thrift_2 = require("thrift"); | ||
/** | ||
@@ -44,10 +45,7 @@ * We need to patch Thrift's TFramedTransport class bc the TS type definitions | ||
} | ||
// Issue at https://github.com/LibertyDSNP/parquetjs/issues/42 | ||
const previousPageLocation = new parquet_thrift.PageLocation(); | ||
//@ts-ignore | ||
const PageLocation = parquet_thrift.PageLocation.prototype = []; | ||
//@ts-ignore | ||
PageLocation.write = previousPageLocation.write; | ||
//@ts-ignore | ||
PageLocation.read = previousPageLocation.read; | ||
/** Patch PageLocation to be three element array that has getters/setters | ||
* for each of the properties (offset, compressed_page_size, first_row_index) | ||
* This saves space considerably as we do not need to store the full variable | ||
* names for every PageLocation | ||
*/ | ||
const getterSetter = (index) => ({ | ||
@@ -57,5 +55,5 @@ get: function () { return this[index]; }, | ||
}); | ||
Object.defineProperty(PageLocation, 'offset', getterSetter(0)); | ||
Object.defineProperty(PageLocation, 'compressed_page_size', getterSetter(1)); | ||
Object.defineProperty(PageLocation, 'first_row_index', getterSetter(2)); | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'offset', getterSetter(0)); | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'compressed_page_size', getterSetter(1)); | ||
Object.defineProperty(parquet_thrift.PageLocation.prototype, 'first_row_index', getterSetter(2)); | ||
/** | ||
@@ -223,2 +221,5 @@ * Helper function that serializes a thrift object into a buffer | ||
exports.fieldIndexOf = fieldIndexOf; | ||
//# sourceMappingURL=util.js.map | ||
const cloneInteger = (int) => { | ||
return new thrift_2.Int64(int.valueOf()); | ||
}; | ||
exports.cloneInteger = cloneInteger; |
@@ -1,11 +0,35 @@ | ||
'use strict'; | ||
const stream = require('stream'); | ||
const parquet_thrift = require('../gen-nodejs/parquet_types'); | ||
const parquet_shredder = require('./shred'); | ||
const parquet_util = require('./util'); | ||
const parquet_codec = require('./codec'); | ||
const parquet_compression = require('./compression'); | ||
const parquet_types = require('./types'); | ||
const bloomFilterWriter = require("./bloomFilterIO/bloomFilterWriter"); | ||
const Long = require('long'); | ||
"use strict"; | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetTransformer = exports.ParquetEnvelopeWriter = exports.ParquetWriter = void 0; | ||
const stream_1 = __importDefault(require("stream")); | ||
const parquet_types_1 = __importDefault(require("../gen-nodejs/parquet_types")); | ||
const parquet_shredder = __importStar(require("./shred")); | ||
const parquet_util = __importStar(require("./util")); | ||
const parquet_codec = __importStar(require("./codec")); | ||
const parquet_compression = __importStar(require("./compression")); | ||
const parquet_types = __importStar(require("./types")); | ||
const bloomFilterWriter = __importStar(require("./bloomFilterIO/bloomFilterWriter")); | ||
const node_int64_1 = __importDefault(require("node-int64")); | ||
/** | ||
@@ -35,2 +59,8 @@ * Parquet File Magic String | ||
class ParquetWriter { | ||
schema; | ||
envelopeWriter; | ||
rowBuffer; | ||
rowGroupSize; | ||
closed; | ||
userMetadata; | ||
/** | ||
@@ -78,3 +108,3 @@ * Convenience method to create a new buffered parquet writer that writes to | ||
async appendRow(row) { | ||
if (this.closed) { | ||
if (this.closed || this.envelopeWriter === null) { | ||
throw 'writer was closed'; | ||
@@ -107,12 +137,14 @@ } | ||
this.closed = true; | ||
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
if (this.envelopeWriter) { | ||
if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { | ||
await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters }); | ||
await this.envelopeWriter.writeRowGroup(this.rowBuffer); | ||
this.rowBuffer = {}; | ||
} | ||
await this.envelopeWriter.writeBloomFilters(); | ||
await this.envelopeWriter.writeIndex(); | ||
await this.envelopeWriter.writeFooter(this.userMetadata); | ||
await this.envelopeWriter.close(); | ||
this.envelopeWriter = null; | ||
} | ||
await this.envelopeWriter.writeBloomFilters(); | ||
await this.envelopeWriter.writeIndex(); | ||
await this.envelopeWriter.writeFooter(this.userMetadata); | ||
await this.envelopeWriter.close(); | ||
this.envelopeWriter = null; | ||
if (callback) { | ||
@@ -142,12 +174,23 @@ callback(); | ||
setPageSize(cnt) { | ||
this.writer.setPageSize(cnt); | ||
this.envelopeWriter.setPageSize(cnt); | ||
} | ||
} | ||
exports.ParquetWriter = ParquetWriter; | ||
/** | ||
* Create a parquet file from a schema and a number of row groups. This class | ||
* performs direct, unbuffered writes to the underlying output stream and is | ||
* intendend for advanced and internal users; the writeXXX methods must be | ||
* intended for advanced and internal users; the writeXXX methods must be | ||
* called in the correct order to produce a valid file. | ||
*/ | ||
class ParquetEnvelopeWriter { | ||
schema; | ||
write; | ||
close; | ||
offset; | ||
rowCount; | ||
rowGroups; | ||
pageSize; | ||
useDataPageV2; | ||
pageIndex; | ||
bloomFilters; // TODO: OR filterCollection | ||
/** | ||
@@ -159,3 +202,3 @@ * Create a new parquet envelope writer that writes to the specified stream | ||
let closeFn = parquet_util.osend.bind(undefined, outputStream); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts); | ||
return new ParquetEnvelopeWriter(schema, writeFn, closeFn, new node_int64_1.default(0), opts); | ||
} | ||
@@ -167,3 +210,3 @@ constructor(schema, writeFn, closeFn, fileOffset, opts) { | ||
this.offset = fileOffset; | ||
this.rowCount = 0; | ||
this.rowCount = new node_int64_1.default(0); | ||
this.rowGroups = []; | ||
@@ -179,3 +222,3 @@ this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE; | ||
writeSection(buf) { | ||
this.offset += buf.length; | ||
this.offset.setValue(this.offset.valueOf() + buf.length); | ||
return this.write(buf); | ||
@@ -200,12 +243,11 @@ } | ||
}); | ||
this.rowCount += records.rowCount; | ||
this.rowCount.setValue(this.rowCount.valueOf() + records.rowCount); | ||
this.rowGroups.push(rgroup.metadata); | ||
return this.writeSection(rgroup.body); | ||
} | ||
writeBloomFilters(_rowGroups) { | ||
let rowGroups = _rowGroups || this.rowGroups; | ||
rowGroups.forEach(group => { | ||
writeBloomFilters() { | ||
this.rowGroups.forEach(group => { | ||
group.columns.forEach(column => { | ||
const columnName = column.meta_data.path_in_schema[0]; | ||
if (columnName in this.bloomFilters === false) | ||
const columnName = column.meta_data?.path_in_schema[0]; | ||
if (!columnName || columnName in this.bloomFilters === false) | ||
return; | ||
@@ -221,20 +263,19 @@ const serializedBloomFilterData = bloomFilterWriter.getSerializedBloomFilterData(this.bloomFilters[columnName]); | ||
*/ | ||
writeIndex(_rowGroups) { | ||
let rowGroups = _rowGroups || this.rowGroups; | ||
writeIndex() { | ||
this.schema.fieldList.forEach((c, i) => { | ||
rowGroups.forEach(group => { | ||
this.rowGroups.forEach(group => { | ||
let column = group.columns[i]; | ||
if (!column) | ||
return; | ||
if (column.meta_data.columnIndex) { | ||
if (column.meta_data?.columnIndex) { | ||
let columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex); | ||
delete column.meta_data.columnIndex; | ||
column.column_index_offset = this.offset; | ||
column.column_index_offset = parquet_util.cloneInteger(this.offset); | ||
column.column_index_length = columnBody.length; | ||
this.writeSection(columnBody); | ||
} | ||
if (column.meta_data.offsetIndex) { | ||
if (column.meta_data?.offsetIndex) { | ||
let offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex); | ||
delete column.meta_data.offsetIndex; | ||
column.offset_index_offset = this.offset; | ||
column.offset_index_offset = parquet_util.cloneInteger(this.offset); | ||
column.offset_index_length = offsetBody.length; | ||
@@ -249,3 +290,3 @@ this.writeSection(offsetBody); | ||
*/ | ||
writeFooter(userMetadata, schema, rowCount, rowGroups) { | ||
writeFooter(userMetadata) { | ||
if (!userMetadata) { | ||
@@ -268,6 +309,8 @@ userMetadata = {}; | ||
} | ||
exports.ParquetEnvelopeWriter = ParquetEnvelopeWriter; | ||
/** | ||
* Create a parquet transform stream | ||
*/ | ||
class ParquetTransformer extends stream.Transform { | ||
class ParquetTransformer extends stream_1.default.Transform { | ||
writer; | ||
constructor(schema, opts = {}) { | ||
@@ -280,9 +323,9 @@ super({ objectMode: true }); | ||
})(this); | ||
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, 0, opts), opts); | ||
this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, function () { }, new node_int64_1.default(0), opts), opts); | ||
} | ||
_transform(row, encoding, callback) { | ||
_transform(row, _encoding, callback) { | ||
if (row) { | ||
this.writer.appendRow(row).then(data => callback(null, data), err => { | ||
const fullErr = new Error(`Error transforming to parquet: ${err.toString()} row:${row}`); | ||
fullErr.origErr = err; | ||
fullErr.message = err; | ||
callback(fullErr); | ||
@@ -300,2 +343,3 @@ }); | ||
} | ||
exports.ParquetTransformer = ParquetTransformer; | ||
/** | ||
@@ -328,3 +372,3 @@ * Encode a consecutive array of data using one of the parquet encodings | ||
statistics.min = statistics.min_value; | ||
return new parquet_thrift.Statistics(statistics); | ||
return new parquet_types_1.default.Statistics(statistics); | ||
} | ||
@@ -340,3 +384,3 @@ async function encodePages(schema, rowBuffer, opts) { | ||
let page; | ||
const values = rowBuffer.columnData[field.path]; | ||
const values = rowBuffer.columnData[field.path.join(',')]; | ||
if (opts.bloomFilters && (field.name in opts.bloomFilters)) { | ||
@@ -346,3 +390,3 @@ const splitBlockBloomFilter = opts.bloomFilters[field.name]; | ||
} | ||
let statistics; | ||
let statistics = {}; | ||
if (field.statistics !== false) { | ||
@@ -358,4 +402,4 @@ statistics = {}; | ||
}); | ||
statistics.null_count = values.dlevels.length - values.values.length; | ||
statistics.distinct_count = values.distinct_values.size; | ||
statistics.null_count = new node_int64_1.default(values.dlevels.length - values.values.length); | ||
statistics.distinct_count = new node_int64_1.default(values.distinct_values.size); | ||
} | ||
@@ -366,5 +410,5 @@ if (opts.useDataPageV2) { | ||
else { | ||
page = await encodeDataPage(field, values.values, values.rlevels, values.dlevels, statistics); | ||
page = await encodeDataPage(field, values.values || [], values.rlevels || [], values.dlevels || [], statistics); | ||
} | ||
let pages = rowBuffer.pages[field.path]; | ||
let pages = rowBuffer.pages[field.path.join(',')]; | ||
let lastPage = pages[pages.length - 1]; | ||
@@ -408,7 +452,7 @@ let first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0; | ||
pageBody = await parquet_compression.deflate(column.compression, pageBody); | ||
let pageHeader = new parquet_thrift.PageHeader(); | ||
pageHeader.type = parquet_thrift.PageType['DATA_PAGE']; | ||
let pageHeader = new parquet_types_1.default.PageHeader(); | ||
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE']; | ||
pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; | ||
pageHeader.compressed_page_size = pageBody.length; | ||
pageHeader.data_page_header = new parquet_thrift.DataPageHeader(); | ||
pageHeader.data_page_header = new parquet_types_1.default.DataPageHeader(); | ||
pageHeader.data_page_header.num_values = dlevels.length; | ||
@@ -418,7 +462,7 @@ if (column.statistics !== false) { | ||
} | ||
pageHeader.data_page_header.encoding = parquet_thrift.Encoding[column.encoding]; | ||
pageHeader.data_page_header.encoding = parquet_types_1.default.Encoding[column.encoding]; | ||
pageHeader.data_page_header.definition_level_encoding = | ||
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING]; | ||
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
pageHeader.data_page_header.repetition_level_encoding = | ||
parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING]; | ||
parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; | ||
/* concat page header, repetition and definition levels and values */ | ||
@@ -453,5 +497,5 @@ return Buffer.concat([parquet_util.serializeThrift(pageHeader), pageBody]); | ||
/* build page header */ | ||
let pageHeader = new parquet_thrift.PageHeader(); | ||
pageHeader.type = parquet_thrift.PageType['DATA_PAGE_V2']; | ||
pageHeader.data_page_header_v2 = new parquet_thrift.DataPageHeaderV2(); | ||
let pageHeader = new parquet_types_1.default.PageHeader(); | ||
pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE_V2']; | ||
pageHeader.data_page_header_v2 = new parquet_types_1.default.DataPageHeaderV2(); | ||
pageHeader.data_page_header_v2.num_values = dlevels.length; | ||
@@ -467,3 +511,3 @@ pageHeader.data_page_header_v2.num_nulls = dlevels.length - values.length; | ||
rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length; | ||
pageHeader.data_page_header_v2.encoding = parquet_thrift.Encoding[column.encoding]; | ||
pageHeader.data_page_header_v2.encoding = parquet_types_1.default.Encoding[column.encoding]; | ||
pageHeader.data_page_header_v2.definition_levels_byte_length = dLevelsBuf.length; | ||
@@ -489,16 +533,16 @@ pageHeader.data_page_header_v2.repetition_levels_byte_length = rLevelsBuf.length; | ||
/* prepare metadata header */ | ||
let metadata = new parquet_thrift.ColumnMetaData(); | ||
let metadata = new parquet_types_1.default.ColumnMetaData(); | ||
metadata.path_in_schema = opts.column.path; | ||
metadata.num_values = num_values; | ||
metadata.data_page_offset = opts.baseOffset; | ||
metadata.num_values = new node_int64_1.default(num_values); | ||
metadata.data_page_offset = new node_int64_1.default(opts.baseOffset); | ||
metadata.encodings = []; | ||
metadata.total_uncompressed_size = pagesBuf.length; | ||
metadata.total_compressed_size = pagesBuf.length; | ||
metadata.type = parquet_thrift.Type[opts.column.primitiveType]; | ||
metadata.codec = await parquet_thrift.CompressionCodec[opts.column.compression]; | ||
metadata.total_uncompressed_size = new node_int64_1.default(pagesBuf.length); | ||
metadata.total_compressed_size = new node_int64_1.default(pagesBuf.length); | ||
metadata.type = parquet_types_1.default.Type[opts.column.primitiveType]; | ||
metadata.codec = await parquet_types_1.default.CompressionCodec[opts.column.compression]; | ||
/* compile statistics ColumnIndex and OffsetIndex*/ | ||
let columnIndex = new parquet_thrift.ColumnIndex(); | ||
let columnIndex = new parquet_types_1.default.ColumnIndex(); | ||
columnIndex.max_values = []; | ||
columnIndex.min_values = []; | ||
let offsetIndex = new parquet_thrift.OffsetIndex(); | ||
let offsetIndex = new parquet_types_1.default.OffsetIndex(); | ||
offsetIndex.page_locations = []; | ||
@@ -508,7 +552,7 @@ /* prepare statistics */ | ||
let distinct_values = new Set(); | ||
statistics.null_count = 0; | ||
statistics.distinct_count = 0; | ||
statistics.null_count = new node_int64_1.default(0); | ||
statistics.distinct_count = new node_int64_1.default(0); | ||
/* loop through pages and update indices and statistics */ | ||
for (let i = 0; i < pages.length; i++) { | ||
let page = pages[i]; | ||
const page = pages[i]; | ||
if (opts.column.statistics !== false) { | ||
@@ -521,12 +565,12 @@ if (page.statistics.max_value > statistics.max_value || i == 0) { | ||
} | ||
statistics.null_count += page.statistics.null_count; | ||
page.distinct_values.forEach(value => distinct_values.add(value)); | ||
statistics.null_count.setValue(statistics.null_count.valueOf() + (page.statistics.null_count?.valueOf() || 0)); | ||
page.distinct_values.forEach((value) => distinct_values.add(value)); | ||
columnIndex.max_values.push(encodeStatisticsValue(page.statistics.max_value, opts.column)); | ||
columnIndex.min_values.push(encodeStatisticsValue(page.statistics.min_value, opts.column)); | ||
} | ||
let pageLocation = new parquet_thrift.PageLocation(); | ||
pageLocation.offset = offset; | ||
let pageLocation = new parquet_types_1.default.PageLocation(); | ||
pageLocation.offset = new node_int64_1.default(offset); | ||
offset += page.page.length; | ||
pageLocation.compressed_page_size = page.page.length; | ||
pageLocation.first_row_index = page.first_row_index; | ||
pageLocation.first_row_index = new node_int64_1.default(page.first_row_index); | ||
offsetIndex.page_locations.push(pageLocation); | ||
@@ -538,3 +582,3 @@ } | ||
if (opts.column.statistics !== false) { | ||
statistics.distinct_count = distinct_values.size; | ||
statistics.distinct_count = new node_int64_1.default(distinct_values.size); | ||
metadata.statistics = encodeStatistics(statistics, opts.column); | ||
@@ -546,8 +590,4 @@ if (opts.pageIndex !== false) { | ||
/* list encodings */ | ||
let encodingsSet = {}; | ||
encodingsSet[PARQUET_RDLVL_ENCODING] = true; | ||
encodingsSet[opts.column.encoding] = true; | ||
for (let k in encodingsSet) { | ||
metadata.encodings.push(parquet_thrift.Encoding[k]); | ||
} | ||
metadata.encodings.push(parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]); | ||
metadata.encodings.push(parquet_types_1.default.Encoding[opts.column.encoding]); | ||
/* concat metadata header and data pages */ | ||
@@ -562,6 +602,6 @@ let metadataOffset = opts.baseOffset + pagesBuf.length; | ||
async function encodeRowGroup(schema, data, opts) { | ||
let metadata = new parquet_thrift.RowGroup(); | ||
metadata.num_rows = data.rowCount; | ||
let metadata = new parquet_types_1.default.RowGroup(); | ||
metadata.num_rows = new node_int64_1.default(data.rowCount); | ||
metadata.columns = []; | ||
metadata.total_byte_size = 0; | ||
metadata.total_byte_size = new node_int64_1.default(0); | ||
let body = Buffer.alloc(0); | ||
@@ -572,16 +612,15 @@ for (let field of schema.fieldList) { | ||
} | ||
let cchunkData = await encodeColumnChunk(data.pages[field.path], { | ||
let cchunkData = await encodeColumnChunk(data.pages[field.path.join(',')], { | ||
column: field, | ||
baseOffset: opts.baseOffset + body.length, | ||
pageSize: opts.pageSize, | ||
encoding: field.encoding, | ||
rowCount: data.rowCount, | ||
useDataPageV2: opts.useDataPageV2, | ||
pageIndex: opts.pageIndex | ||
baseOffset: opts.baseOffset.valueOf() + body.length, | ||
pageSize: opts.pageSize || 0, | ||
rowCount: data.rowCount || 0, | ||
useDataPageV2: opts.useDataPageV2 ?? true, | ||
pageIndex: opts.pageIndex ?? true | ||
}); | ||
let cchunk = new parquet_thrift.ColumnChunk(); | ||
cchunk.file_offset = cchunkData.metadataOffset; | ||
let cchunk = new parquet_types_1.default.ColumnChunk(); | ||
cchunk.file_offset = new node_int64_1.default(cchunkData.metadataOffset); | ||
cchunk.meta_data = cchunkData.metadata; | ||
metadata.columns.push(cchunk); | ||
metadata.total_byte_size += cchunkData.body.length; | ||
metadata.total_byte_size = new node_int64_1.default(metadata.total_byte_size.valueOf() + (cchunkData.body.length)); | ||
body = Buffer.concat([body, cchunkData.body]); | ||
@@ -595,5 +634,5 @@ } | ||
function encodeFooter(schema, rowCount, rowGroups, userMetadata) { | ||
let metadata = new parquet_thrift.FileMetaData(); | ||
let metadata = new parquet_types_1.default.FileMetaData(); | ||
metadata.version = PARQUET_VERSION; | ||
metadata.created_by = 'parquet.js'; | ||
metadata.created_by = '@dsnp/parquetjs'; | ||
metadata.num_rows = rowCount; | ||
@@ -604,3 +643,3 @@ metadata.row_groups = rowGroups; | ||
for (let k in userMetadata) { | ||
let kv = new parquet_thrift.KeyValue(); | ||
let kv = new parquet_types_1.default.KeyValue(); | ||
kv.key = k; | ||
@@ -611,3 +650,3 @@ kv.value = userMetadata[k]; | ||
{ | ||
let schemaRoot = new parquet_thrift.SchemaElement(); | ||
let schemaRoot = new parquet_types_1.default.SchemaElement(); | ||
schemaRoot.name = 'root'; | ||
@@ -618,5 +657,5 @@ schemaRoot.num_children = Object.keys(schema.fields).length; | ||
for (let field of schema.fieldList) { | ||
let schemaElem = new parquet_thrift.SchemaElement(); | ||
let schemaElem = new parquet_types_1.default.SchemaElement(); | ||
schemaElem.name = field.name; | ||
schemaElem.repetition_type = parquet_thrift.FieldRepetitionType[field.repetitionType]; | ||
schemaElem.repetition_type = parquet_types_1.default.FieldRepetitionType[field.repetitionType]; | ||
if (field.isNested) { | ||
@@ -626,6 +665,6 @@ schemaElem.num_children = field.fieldCount; | ||
else { | ||
schemaElem.type = parquet_thrift.Type[field.primitiveType]; | ||
schemaElem.type = parquet_types_1.default.Type[field.primitiveType]; | ||
} | ||
if (field.originalType) { | ||
schemaElem.converted_type = parquet_thrift.ConvertedType[field.originalType]; | ||
schemaElem.converted_type = parquet_types_1.default.ConvertedType[field.originalType]; | ||
} | ||
@@ -647,2 +686,1 @@ schemaElem.type_length = field.typeLength; | ||
}; | ||
//# sourceMappingURL=writer.js.map |
"use strict"; | ||
const reader = require('./lib/reader'); | ||
const writer = require('./lib/writer'); | ||
const schema = require('./lib/schema'); | ||
const shredder = require('./lib/shred'); | ||
const util = require('./lib/util'); | ||
module.exports = { | ||
ParquetEnvelopeReader: reader.ParquetEnvelopeReader, | ||
ParquetReader: reader.ParquetReader, | ||
ParquetEnvelopeWriter: writer.ParquetEnvelopeWriter, | ||
ParquetWriter: writer.ParquetWriter, | ||
ParquetTransformer: writer.ParquetTransformer, | ||
ParquetSchema: schema.ParquetSchema, | ||
ParquetShredder: shredder, | ||
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); | ||
}) : (function(o, m, k, k2) { | ||
if (k2 === undefined) k2 = k; | ||
o[k2] = m[k]; | ||
})); | ||
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { | ||
Object.defineProperty(o, "default", { enumerable: true, value: v }); | ||
}) : function(o, v) { | ||
o["default"] = v; | ||
}); | ||
var __importStar = (this && this.__importStar) || function (mod) { | ||
if (mod && mod.__esModule) return mod; | ||
var result = {}; | ||
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); | ||
__setModuleDefault(result, mod); | ||
return result; | ||
}; | ||
//# sourceMappingURL=parquet.js.map | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
exports.ParquetShredder = exports.ParquetSchema = exports.ParquetTransformer = exports.ParquetWriter = exports.ParquetEnvelopeWriter = exports.ParquetReader = exports.ParquetEnvelopeReader = void 0; | ||
const reader = __importStar(require("./lib/reader")); | ||
const writer = __importStar(require("./lib/writer")); | ||
const schema = __importStar(require("./lib/schema")); | ||
const shredder = __importStar(require("./lib/shred")); | ||
exports.ParquetEnvelopeReader = reader.ParquetEnvelopeReader; | ||
exports.ParquetReader = reader.ParquetReader; | ||
exports.ParquetEnvelopeWriter = writer.ParquetEnvelopeWriter; | ||
exports.ParquetWriter = writer.ParquetWriter; | ||
exports.ParquetTransformer = writer.ParquetTransformer; | ||
exports.ParquetSchema = schema.ParquetSchema; | ||
exports.ParquetShredder = shredder; |
@@ -5,3 +5,4 @@ { | ||
"main": "dist/parquet.js", | ||
"version": "0.0.0-4bd382", | ||
"types": "dist/parquet.d.ts", | ||
"version": "0.0.0-555af7", | ||
"homepage": "https://github.com/LibertyDSNP/parquetjs", | ||
@@ -48,5 +49,5 @@ "license": "MIT", | ||
"core-js": "^3.15.1", | ||
"esbuild": "^0.14.1", | ||
"mocha": "8.3.2", | ||
"msw": "^0.29.0", | ||
"esbuild": "^0.14.38", | ||
"mocha": "9.2.2", | ||
"msw": "^0.39.2", | ||
"object-stream": "0.0.1", | ||
@@ -56,2 +57,4 @@ "process": "^0.11.10", | ||
"sinon": "^10.0.0", | ||
"sinon-chai": "^3.7.0", | ||
"sinon-chai-in-order": "^0.1.0", | ||
"source-map-loader": "^3.0.0", | ||
@@ -64,3 +67,4 @@ "stream-browserify": "^3.0.0", | ||
"scripts": { | ||
"build": "npm run build:node", | ||
"build": "npm run build:node && npm run build:types", | ||
"build:types": "tsc -p tsconfig.types.json && cp gen-nodejs/parquet_types.d.ts dist/gen-nodejs/parquet_types.d.ts", | ||
"build:node": "tsc -b", | ||
@@ -72,3 +76,3 @@ "build:browser": "node esbuild.js", | ||
"clean": "rm -Rf ./dist", | ||
"prepublishOnly": "npm run clean && npm run build:node && npm run build:browser", | ||
"prepublishOnly": "npm run clean && npm run build:node && npm run build:types && npm run build:browser", | ||
"thrift": "thrift -out gen-nodejs --gen js:ts parquet.thrift && thrift -out gen-nodejs --gen js:node parquet.thrift", | ||
@@ -75,0 +79,0 @@ "serve": "node esbuild-serve.js" |
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
Sorry, the diff of this file is too big to display
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
License Policy Violation
LicenseThis package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
19438
2786982
31