Comparing version 0.10.6 to 0.10.7
@@ -53,3 +53,3 @@ "use strict"; | ||
buf.fill(0); | ||
for (let i = 0; i < values.length; ++i) { | ||
for (let i = 0; i < values.length; i++) { | ||
if (values[i]) { | ||
@@ -63,3 +63,3 @@ buf[Math.floor(i / 8)] |= (1 << (i % 8)); | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
const b = cursor.buffer[cursor.offset + Math.floor(i / 8)]; | ||
@@ -80,3 +80,3 @@ values.push((b & (1 << (i % 8))) > 0); | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
values.push(cursor.buffer.readInt32LE(cursor.offset)); | ||
@@ -96,3 +96,3 @@ cursor.offset += 4; | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
values.push(INT53.readInt64LE(cursor.buffer, cursor.offset)); | ||
@@ -119,3 +119,3 @@ cursor.offset += 8; | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
const low = INT53.readInt64LE(cursor.buffer, cursor.offset); | ||
@@ -142,3 +142,3 @@ const high = cursor.buffer.readUInt32LE(cursor.offset + 8); | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
values.push(cursor.buffer.readFloatLE(cursor.offset)); | ||
@@ -158,3 +158,3 @@ cursor.offset += 4; | ||
const values = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
values.push(cursor.buffer.readDoubleLE(cursor.offset)); | ||
@@ -161,0 +161,0 @@ cursor.offset += 8; |
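The hunks above only change loop style (`++i` to `i++`), but they live in the PLAIN codec, where booleans are packed LSB-first: value `i` goes into bit `i % 8` of byte `Math.floor(i / 8)`. A minimal standalone sketch of that layout, using the same arithmetic as the hunk (the function names here are illustrative, not the module's exports):

```ts
// PLAIN boolean packing: bit i of the stream = byte floor(i / 8), bit (i % 8).
function packBooleans(values: boolean[]): Buffer {
  const buf = Buffer.alloc(Math.ceil(values.length / 8)); // zero-filled
  for (let i = 0; i < values.length; i++) {
    if (values[i]) buf[Math.floor(i / 8)] |= 1 << (i % 8);
  }
  return buf;
}

function unpackBooleans(buf: Buffer, count: number): boolean[] {
  const values: boolean[] = [];
  for (let i = 0; i < count; i++) {
    values.push((buf[Math.floor(i / 8)] & (1 << (i % 8))) > 0);
  }
  return values;
}

// packBooleans([true, false, true]) -> <Buffer 05>
// unpackBooleans(Buffer.from([0x05]), 3) -> [true, false, true]
```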
@@ -9,3 +9,3 @@ "use strict"; | ||
const buf = Buffer.alloc(Math.ceil(opts.bitWidth * (values.length / 8))); | ||
for (let b = 0; b < opts.bitWidth * values.length; ++b) { | ||
for (let b = 0; b < opts.bitWidth * values.length; b++) { | ||
if ((values[Math.floor(b / opts.bitWidth)] & (1 << b % opts.bitWidth)) > 0) { | ||
@@ -22,3 +22,3 @@ buf[Math.floor(b / 8)] |= (1 << (b % 8)); | ||
const buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8)); | ||
for (let i = 0; i < buf.length; ++i) { | ||
for (let i = 0; i < buf.length; i++) { | ||
buf.writeUInt8(value & 0xff, i); | ||
@@ -93,3 +93,3 @@ value >> 8; | ||
const values = new Array(count).fill(0); | ||
for (let b = 0; b < opts.bitWidth * count; ++b) { | ||
for (let b = 0; b < opts.bitWidth * count; b++) { | ||
if (cursor.buffer[cursor.offset + Math.floor(b / 8)] & (1 << (b % 8))) { | ||
@@ -104,3 +104,3 @@ values[Math.floor(b / opts.bitWidth)] |= (1 << b % opts.bitWidth); | ||
let value = 0; | ||
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); ++i) { | ||
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); i++) { | ||
value << 8; | ||
@@ -107,0 +107,0 @@ value += cursor.buffer[cursor.offset]; |
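Again only the increment style changes here, but the surrounding RLE codec is the bit-packed half of Parquet's RLE/bit-packing hybrid: bit `b % bitWidth` of value `Math.floor(b / bitWidth)` is copied to bit `b % 8` of output byte `Math.floor(b / 8)`. A small sketch of that packing, mirroring the index arithmetic in the hunk (illustrative name, not the module's export):

```ts
// Bit-packed encoding at a fixed bitWidth, LSB-first within each value.
function bitPack(values: number[], bitWidth: number): Buffer {
  const buf = Buffer.alloc(Math.ceil((bitWidth * values.length) / 8)); // zero-filled
  for (let b = 0; b < bitWidth * values.length; b++) {
    if ((values[Math.floor(b / bitWidth)] & (1 << (b % bitWidth))) > 0) {
      buf[Math.floor(b / 8)] |= 1 << (b % 8);
    }
  }
  return buf;
}

// With bitWidth = 3, bitPack([1, 2, 3]) lays out 001, 010, 011 back to back
// (low bits first), giving <Buffer d1 00>.
```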
@@ -5,3 +5,3 @@ "use strict"; | ||
const zlib = require("zlib"); | ||
const snappyjs = require("snappyjs"); | ||
const snappyjs = require("./snappy"); | ||
let brotli; | ||
@@ -8,0 +8,0 @@ let lzo; |
@@ -8,5 +8,5 @@ export declare type ParquetCodec = 'PLAIN' | 'RLE'; | ||
export interface SchemaDefinition { | ||
[string: string]: ElementDefinition; | ||
[string: string]: FieldDefinition; | ||
} | ||
export interface ElementDefinition { | ||
export interface FieldDefinition { | ||
type?: ParquetType; | ||
@@ -20,5 +20,6 @@ typeLength?: number; | ||
} | ||
export interface FieldDefinition { | ||
export interface ParquetField { | ||
name: string; | ||
path: string[]; | ||
key: string; | ||
primitiveType?: PrimitiveType; | ||
@@ -34,3 +35,3 @@ originalType?: OriginalType; | ||
fieldCount?: number; | ||
fields?: Record<string, FieldDefinition>; | ||
fields?: Record<string, ParquetField>; | ||
} | ||
@@ -37,0 +38,0 @@ export interface ParquetBuffer { |
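The declaration changes are a pure rename plus one addition: the user-facing schema entry formerly called ElementDefinition is now FieldDefinition, the resolved internal field record formerly called FieldDefinition is now ParquetField, and ParquetField gains a precomputed `key`. A simplified illustration of how the two shapes relate, using only members visible in this diff (the real interfaces carry more optional members):

```ts
// Simplified shapes for illustration only.
interface FieldDefinition {              // was ElementDefinition in 0.10.6
  type?: string;
  repeated?: boolean;
  fields?: Record<string, FieldDefinition>;
}

interface ParquetField {                 // was FieldDefinition in 0.10.6
  name: string;
  path: string[];
  key: string;                           // new in 0.10.7: the joined path
  fields?: Record<string, ParquetField>;
}

// A user-supplied definition...
const definition: Record<string, FieldDefinition> = {
  stock: { repeated: true, fields: { quantity: { type: 'INT64' } } }
};

// ...expands internally into resolved fields keyed by their joined path:
const resolved: ParquetField = {
  name: 'quantity',
  path: ['stock', 'quantity'],
  key: ['stock', 'quantity'].join()      // 'stock,quantity'
};
```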
/// <reference types="node" /> | ||
import { ParquetBuffer, ParquetData, ParquetRecord } from './declare'; | ||
import { ColumnChunk, FileMetaData, RowGroup } from './gen'; | ||
import { ParquetSchema } from './schema'; | ||
import { ColumnChunk, FileMetaData, RowGroup } from './thrift'; | ||
/** | ||
@@ -6,0 +6,0 @@ * A parquet cursor is used to retrieve rows from a parquet file in order |
@@ -5,6 +5,6 @@ "use strict"; | ||
const Compression = require("./compression"); | ||
// tslint:disable-next-line:max-line-length | ||
const gen_1 = require("./gen"); | ||
const schema_1 = require("./schema"); | ||
const Shred = require("./shred"); | ||
// tslint:disable-next-line:max-line-length | ||
const thrift_1 = require("./thrift"); | ||
const Util = require("./util"); | ||
@@ -195,6 +195,6 @@ // import Fs = require('fs'); | ||
const field = schema.findField(colChunk.meta_data.path_in_schema); | ||
const type = Util.getThriftEnum(gen_1.Type, colChunk.meta_data.type); | ||
const type = Util.getThriftEnum(thrift_1.Type, colChunk.meta_data.type); | ||
if (type !== field.primitiveType) | ||
throw new Error('chunk type not matching schema: ' + type); | ||
const compression = Util.getThriftEnum(gen_1.CompressionCodec, colChunk.meta_data.codec); | ||
const compression = Util.getThriftEnum(thrift_1.CompressionCodec, colChunk.meta_data.codec); | ||
const pagesOffset = +colChunk.meta_data.data_page_offset; | ||
@@ -250,3 +250,3 @@ const pagesSize = +colChunk.meta_data.total_compressed_size; | ||
cursor.offset += length; | ||
const pageType = Util.getThriftEnum(gen_1.PageType, pageHeader.type); | ||
const pageType = Util.getThriftEnum(thrift_1.PageType, pageHeader.type); | ||
let pageData = null; | ||
@@ -300,3 +300,3 @@ switch (pageType) { | ||
/* read repetition levels */ | ||
const rLevelEncoding = Util.getThriftEnum(gen_1.Encoding, header.data_page_header.repetition_level_encoding); | ||
const rLevelEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.repetition_level_encoding); | ||
// tslint:disable-next-line:prefer-array-literal | ||
@@ -315,3 +315,3 @@ let rLevels = new Array(valueCount); | ||
/* read definition levels */ | ||
const dLevelEncoding = Util.getThriftEnum(gen_1.Encoding, header.data_page_header.definition_level_encoding); | ||
const dLevelEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.definition_level_encoding); | ||
// tslint:disable-next-line:prefer-array-literal | ||
@@ -332,7 +332,7 @@ let dLevels = new Array(valueCount); | ||
if (dlvl === column.dLevelMax) { | ||
++valueCountNonNull; | ||
valueCountNonNull++; | ||
} | ||
} | ||
/* read values */ | ||
const valueEncoding = Util.getThriftEnum(gen_1.Encoding, header.data_page_header.encoding); | ||
const valueEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header.encoding); | ||
const values = decodeValues(column.primitiveType, valueEncoding, dataCursor, valueCountNonNull, { | ||
@@ -356,3 +356,3 @@ typeLength: column.typeLength, | ||
const valueCountNonNull = valueCount - header.data_page_header_v2.num_nulls; | ||
const valueEncoding = Util.getThriftEnum(gen_1.Encoding, header.data_page_header_v2.encoding); | ||
const valueEncoding = Util.getThriftEnum(thrift_1.Encoding, header.data_page_header_v2.encoding); | ||
/* read repetition levels */ | ||
@@ -409,3 +409,3 @@ // tslint:disable-next-line:prefer-array-literal | ||
const schemaElement = schemaElements[next]; | ||
const repetitionType = next > 0 ? Util.getThriftEnum(gen_1.FieldRepetitionType, schemaElement.repetition_type) : 'ROOT'; | ||
const repetitionType = next > 0 ? Util.getThriftEnum(thrift_1.FieldRepetitionType, schemaElement.repetition_type) : 'ROOT'; | ||
let optional = false; | ||
@@ -434,5 +434,5 @@ let repeated = false; | ||
else { | ||
let logicalType = Util.getThriftEnum(gen_1.Type, schemaElement.type); | ||
let logicalType = Util.getThriftEnum(thrift_1.Type, schemaElement.type); | ||
if (schemaElement.converted_type != null) { | ||
logicalType = Util.getThriftEnum(gen_1.ConvertedType, schemaElement.converted_type); | ||
logicalType = Util.getThriftEnum(thrift_1.ConvertedType, schemaElement.converted_type); | ||
} | ||
@@ -439,0 +439,0 @@ schema[schemaElement.name] = { |
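Every hunk in this reader file is the same mechanical change: the generated Thrift bindings are now imported from './thrift' instead of './gen', so `gen_1.X` becomes `thrift_1.X`; the decoding logic is untouched. For readers unfamiliar with the `Util.getThriftEnum` calls above, the helper is presumably a reverse lookup over the generated enum objects, which thrift's JS codegen emits as plain name-to-number maps — the sketch below is an assumption about that helper, not the library's verbatim code:

```ts
// Assumed behaviour of a getThriftEnum-style helper: map a numeric Thrift
// enum value back to its symbolic name.
function getThriftEnum(enumObj: Record<string, number>, value: number): string {
  for (const name of Object.keys(enumObj)) {
    if (enumObj[name] === value) return name;
  }
  throw new Error(`invalid enum value: ${value}`);
}

// e.g. getThriftEnum({ UNCOMPRESSED: 0, SNAPPY: 1, GZIP: 2 }, 1) === 'SNAPPY'
```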
@@ -1,2 +0,2 @@ | ||
import { ElementDefinition, FieldDefinition, ParquetBuffer, ParquetCompression, ParquetRecord, SchemaDefinition } from './declare'; | ||
import { FieldDefinition, ParquetBuffer, ParquetCompression, ParquetField, ParquetRecord, SchemaDefinition } from './declare'; | ||
/** | ||
@@ -6,5 +6,5 @@ * A parquet file schema | ||
export declare class ParquetSchema { | ||
schema: Record<string, ElementDefinition>; | ||
fields: Record<string, FieldDefinition>; | ||
fieldList: FieldDefinition[]; | ||
schema: Record<string, FieldDefinition>; | ||
fields: Record<string, ParquetField>; | ||
fieldList: ParquetField[]; | ||
/** | ||
@@ -17,12 +17,13 @@ * Create a new schema from a JSON schema definition | ||
*/ | ||
findField(path: string): FieldDefinition; | ||
findField(path: string[]): FieldDefinition; | ||
findField(path: string): ParquetField; | ||
findField(path: string[]): ParquetField; | ||
/** | ||
* Retrieve a field definition and all the field's ancestors | ||
*/ | ||
findFieldBranch(path: string): FieldDefinition[]; | ||
findFieldBranch(path: string[]): FieldDefinition[]; | ||
findFieldBranch(path: string): ParquetField[]; | ||
findFieldBranch(path: string[]): ParquetField[]; | ||
shredRecord(record: ParquetRecord, buffer: ParquetBuffer): void; | ||
materializeRecords(buffer: ParquetBuffer): ParquetRecord[]; | ||
compress(type: ParquetCompression): this; | ||
buffer(): ParquetBuffer; | ||
} |
@@ -60,2 +60,5 @@ "use strict"; | ||
} | ||
buffer() { | ||
return shred_1.shredBuffer(this); | ||
} | ||
} | ||
@@ -96,5 +99,7 @@ exports.ParquetSchema = ParquetSchema; | ||
if (opts.fields) { | ||
const cpath = path.concat([name]); | ||
fieldList[name] = { | ||
name, | ||
path: path.concat([name]), | ||
path: cpath, | ||
key: cpath.join(), | ||
repetitionType, | ||
@@ -105,3 +110,3 @@ rLevelMax, | ||
fieldCount: Object.keys(opts.fields).length, | ||
fields: buildFields(opts.fields, rLevelMax, dLevelMax, path.concat([name])) | ||
fields: buildFields(opts.fields, rLevelMax, dLevelMax, cpath) | ||
}; | ||
@@ -123,2 +128,3 @@ continue; | ||
/* add to schema */ | ||
const cpath = path.concat([name]); | ||
fieldList[name] = { | ||
@@ -128,3 +134,4 @@ name, | ||
originalType: typeDef.originalType, | ||
path: path.concat([name]), | ||
path: cpath, | ||
key: cpath.join(), | ||
repetitionType, | ||
@@ -131,0 +138,0 @@ encoding: opts.encoding, |
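Two functional changes ride along with the renames in schema.js: every resolved field now caches `key = cpath.join()` (a comma-joined path, since Array.prototype.join defaults to ','), and ParquetSchema gains a buffer() method that hands back an empty shredded buffer via shredBuffer. A usage sketch, assuming the package's top-level exports mirror parquetjs:

```ts
import { ParquetSchema } from 'parquets';

const schema = new ParquetSchema({
  name: { type: 'UTF8' },
  stock: { repeated: true, fields: { quantity: { type: 'INT64' } } }
});

// Nested fields are now keyed by their comma-joined path.
console.log(schema.fields.stock.fields.quantity.key);  // 'stock,quantity'

// New in 0.10.7: a pre-initialized, empty shred buffer for this schema.
const buf = schema.buffer();
console.log(buf.rowCount);                 // 0
console.log(Object.keys(buf.columnData));  // [ 'name', 'stock', 'stock,quantity' ]
```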
import { ParquetBuffer, ParquetRecord } from './declare'; | ||
import { ParquetSchema } from './schema'; | ||
export declare function shredBuffer(schema: ParquetSchema): ParquetBuffer; | ||
/** | ||
@@ -24,3 +25,2 @@ * 'Shred' a record into a list of <value, repetition_level, definition_level> | ||
* } | ||
* | ||
*/ | ||
@@ -46,4 +46,3 @@ export declare function shredRecord(schema: ParquetSchema, record: any, buffer: ParquetBuffer): void; | ||
* } | ||
* | ||
*/ | ||
export declare function materializeRecords(schema: ParquetSchema, buffer: ParquetBuffer): ParquetRecord[]; |

lib/shred.js
"use strict"; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
const Types = require("./types"); | ||
function shredBuffer(schema) { | ||
const columnData = {}; | ||
for (const field of schema.fieldList) { | ||
columnData[field.key] = { | ||
dlevels: [], | ||
rlevels: [], | ||
values: [], | ||
count: 0 | ||
}; | ||
} | ||
return { rowCount: 0, columnData }; | ||
} | ||
exports.shredBuffer = shredBuffer; | ||
/** | ||
@@ -25,51 +38,33 @@ * 'Shred' a record into a list of <value, repetition_level, definition_level> | ||
* } | ||
* | ||
*/ | ||
function shredRecord(schema, record, buffer) { | ||
/* shred the record, this may raise an exception */ | ||
const recordShredded = {}; | ||
for (const field of schema.fieldList) { | ||
recordShredded[field.path.join()] = { | ||
dlevels: [], | ||
rlevels: [], | ||
values: [], | ||
count: 0 | ||
}; | ||
} | ||
shredRecordInternal(schema.fields, record, recordShredded, 0, 0); | ||
const data = shredBuffer(schema).columnData; | ||
shredRecordFields(schema.fields, record, data, 0, 0); | ||
/* if no error during shredding, add the shredded record to the buffer */ | ||
if (!('columnData' in buffer) || !('rowCount' in buffer)) { | ||
buffer.rowCount = 0; | ||
buffer.columnData = {}; | ||
for (const field of schema.fieldList) { | ||
const cd = { | ||
dlevels: [], | ||
rlevels: [], | ||
values: [], | ||
count: 0 | ||
}; | ||
buffer.columnData[field.path.join()] = cd; | ||
} | ||
buffer.rowCount = 1; | ||
buffer.columnData = data; | ||
return; | ||
} | ||
buffer.rowCount += 1; | ||
for (const field of schema.fieldList) { | ||
Array.prototype.push.apply(buffer.columnData[field.path.join()].rlevels, recordShredded[field.path.join()].rlevels); | ||
Array.prototype.push.apply(buffer.columnData[field.path.join()].dlevels, recordShredded[field.path.join()].dlevels); | ||
Array.prototype.push.apply(buffer.columnData[field.path.join()].values, recordShredded[field.path.join()].values); | ||
buffer.columnData[field.path.join()].count += recordShredded[field.path.join()].count; | ||
Array.prototype.push.apply(buffer.columnData[field.key].rlevels, data[field.key].rlevels); | ||
Array.prototype.push.apply(buffer.columnData[field.key].dlevels, data[field.key].dlevels); | ||
Array.prototype.push.apply(buffer.columnData[field.key].values, data[field.key].values); | ||
buffer.columnData[field.key].count += data[field.key].count; | ||
} | ||
} | ||
exports.shredRecord = shredRecord; | ||
function shredRecordInternal(fields, record, data, rlvl, dlvl) { | ||
for (const fieldName in fields) { | ||
const field = fields[fieldName]; | ||
const fieldType = field.originalType || field.primitiveType; | ||
function shredRecordFields(fields, record, data, rLevel, dLevel) { | ||
for (const name in fields) { | ||
const field = fields[name]; | ||
// fetch values | ||
let values = []; | ||
if (record && (fieldName in record) && record[fieldName] !== undefined && record[fieldName] !== null) { | ||
if (record[fieldName].constructor === Array) { | ||
values = record[fieldName]; | ||
if (record && (field.name in record) && record[field.name] !== undefined && record[field.name] !== null) { | ||
if (record[field.name].constructor === Array) { | ||
values = record[field.name]; | ||
} | ||
else { | ||
values.push(record[fieldName]); | ||
values.push(record[field.name]); | ||
} | ||
@@ -87,8 +82,8 @@ } | ||
if (field.isNested) { | ||
shredRecordInternal(field.fields, null, data, rlvl, dlvl); | ||
shredRecordFields(field.fields, null, data, rLevel, dLevel); | ||
} | ||
else { | ||
data[field.path.join()].rlevels.push(rlvl); | ||
data[field.path.join()].dlevels.push(dlvl); | ||
data[field.path.join()].count += 1; | ||
data[field.key].count += 1; | ||
data[field.key].rlevels.push(rLevel); | ||
data[field.key].dlevels.push(dLevel); | ||
} | ||
@@ -98,13 +93,12 @@ continue; | ||
// push values | ||
for (let i = 0; i < values.length; ++i) { | ||
// tslint:disable-next-line:variable-name | ||
const rlvl_i = i === 0 ? rlvl : field.rLevelMax; | ||
for (let i = 0; i < values.length; i++) { | ||
const rlvl = i === 0 ? rLevel : field.rLevelMax; | ||
if (field.isNested) { | ||
shredRecordInternal(field.fields, values[i], data, rlvl_i, field.dLevelMax); | ||
shredRecordFields(field.fields, values[i], data, rlvl, field.dLevelMax); | ||
} | ||
else { | ||
data[field.path.join()].values.push(Types.toPrimitive(fieldType, values[i])); | ||
data[field.path.join()].rlevels.push(rlvl_i); | ||
data[field.path.join()].dlevels.push(field.dLevelMax); | ||
data[field.path.join()].count += 1; | ||
data[field.key].count += 1; | ||
data[field.key].rlevels.push(rlvl); | ||
data[field.key].dlevels.push(field.dLevelMax); | ||
data[field.key].values.push(Types.toPrimitive(field.originalType || field.primitiveType, values[i])); | ||
} | ||
@@ -132,66 +126,66 @@ } | ||
* } | ||
* | ||
*/ | ||
function materializeRecords(schema, buffer) { | ||
const records = []; | ||
for (let i = 0; i < buffer.rowCount; ++i) { | ||
for (let i = 0; i < buffer.rowCount; i++) | ||
records.push({}); | ||
for (const key in buffer.columnData) { | ||
materializeColumn(schema, buffer, key, records); | ||
} | ||
for (const k in buffer.columnData) { | ||
const field = schema.findField(k); | ||
const fieldBranch = schema.findFieldBranch(k); | ||
const values = buffer.columnData[k].values[Symbol.iterator](); | ||
// tslint:disable-next-line:prefer-array-literal | ||
const rLevels = new Array(field.rLevelMax + 1); | ||
rLevels.fill(0); | ||
for (let i = 0; i < buffer.columnData[k].count; ++i) { | ||
const dLevel = buffer.columnData[k].dlevels[i]; | ||
const rLevel = buffer.columnData[k].rlevels[i]; | ||
rLevels[rLevel]++; | ||
rLevels.fill(0, rLevel + 1); | ||
let value = null; | ||
if (dLevel === field.dLevelMax) { | ||
value = Types.fromPrimitive(field.originalType || field.primitiveType, values.next().value); | ||
} | ||
materializeRecordField(records[rLevels[0] - 1], fieldBranch, rLevels.slice(1), dLevel, value); | ||
} | ||
} | ||
return records; | ||
} | ||
exports.materializeRecords = materializeRecords; | ||
function materializeRecordField(record, branch, rLevels, dLevel, value) { | ||
const node = branch[0]; | ||
if (dLevel < node.dLevelMax) { | ||
function materializeColumn(schema, buffer, key, records) { | ||
const data = buffer.columnData[key]; | ||
if (!data.count) | ||
return; | ||
} | ||
if (branch.length > 1) { | ||
if (node.repetitionType === 'REPEATED') { | ||
if (!(node.name in record)) { | ||
record[node.name] = []; | ||
const field = schema.findField(key); | ||
const branch = schema.findFieldBranch(key); | ||
// tslint:disable-next-line:prefer-array-literal | ||
const rLevels = new Array(field.rLevelMax + 1).fill(0); | ||
let vIndex = 0; | ||
for (let i = 0; i < data.count; i++) { | ||
const dLevel = data.dlevels[i]; | ||
const rLevel = data.rlevels[i]; | ||
rLevels[rLevel]++; | ||
rLevels.fill(0, rLevel + 1); | ||
let rIndex = 0; | ||
let record = records[rLevels[rIndex++] - 1]; | ||
// Internal nodes | ||
for (const step of branch) { | ||
if (step === field) | ||
break; | ||
if (dLevel < step.dLevelMax) | ||
break; | ||
if (step.repetitionType === 'REPEATED') { | ||
if (!(step.name in record)) | ||
record[step.name] = []; | ||
const ix = rLevels[rIndex++]; | ||
while (record[step.name].length <= ix) | ||
record[step.name].push({}); | ||
record = record[step.name][ix]; | ||
} | ||
while (record[node.name].length < rLevels[0] + 1) { | ||
record[node.name].push({}); | ||
else { | ||
record[step.name] = record[step.name] || {}; | ||
record = record[step.name]; | ||
} | ||
materializeRecordField(record[node.name][rLevels[0]], branch.slice(1), rLevels.slice(1), dLevel, value); | ||
} | ||
else { | ||
record[node.name] = record[node.name] || {}; | ||
materializeRecordField(record[node.name], branch.slice(1), rLevels, dLevel, value); | ||
} | ||
} | ||
else { | ||
if (node.repetitionType === 'REPEATED') { | ||
if (!(node.name in record)) { | ||
record[node.name] = []; | ||
// Leaf node | ||
if (dLevel === field.dLevelMax) { | ||
const value = Types.fromPrimitive(field.originalType || field.primitiveType, data.values[vIndex]); | ||
vIndex++; | ||
if (field.repetitionType === 'REPEATED') { | ||
if (!(field.name in record)) | ||
record[field.name] = []; | ||
const ix = rLevels[rIndex]; | ||
while (record[field.name].length <= ix) | ||
record[field.name].push(null); | ||
record[field.name][ix] = value; | ||
} | ||
while (record[node.name].length < rLevels[0] + 1) { | ||
record[node.name].push(null); | ||
else { | ||
record[field.name] = value; | ||
} | ||
record[node.name][rLevels[0]] = value; | ||
} | ||
else { | ||
record[node.name] = value; | ||
} | ||
} | ||
} | ||
//# sourceMappingURL=shred.js.map |
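The shred.js refactor swaps the recursive materializeRecordField for an iterative per-column materializeColumn and routes everything through the new field.key, but the observable contract of shredRecord/materializeRecords is unchanged: shredRecord appends one record's column data to a buffer, and materializeRecords reassembles the records. A roundtrip sketch of that contract (module paths and exports assumed to match the compiled layout shown here):

```ts
import { ParquetSchema } from 'parquets';
import * as Shred from 'parquets/lib/shred';

const schema = new ParquetSchema({
  name: { type: 'UTF8' },
  colors: { type: 'UTF8', repeated: true }
});

const buffer = schema.buffer();   // empty ParquetBuffer, new in this release
Shred.shredRecord(schema, { name: 'apple', colors: ['red', 'green'] }, buffer);
Shred.shredRecord(schema, { name: 'sky', colors: ['blue'] }, buffer);

console.log(buffer.rowCount);     // 2
console.log(Shred.materializeRecords(schema, buffer));
// [ { name: 'apple', colors: [ 'red', 'green' ] },
//   { name: 'sky', colors: [ 'blue' ] } ]
```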
/// <reference types="node" /> | ||
import fs = require('fs'); | ||
import { FileMetaData, PageHeader } from './gen'; | ||
import { FileMetaData, PageHeader } from './thrift'; | ||
export interface WriteStreamOptions { | ||
@@ -5,0 +5,0 @@ flags?: string; |
@@ -5,3 +5,3 @@ "use strict"; | ||
const thrift_1 = require("thrift"); | ||
const gen_1 = require("./gen"); | ||
const thrift_2 = require("./thrift"); | ||
class UFramedTransport extends thrift_1.TFramedTransport { | ||
@@ -43,3 +43,3 @@ } | ||
const protocol = new thrift_1.TCompactProtocol(transport); | ||
const metadata = gen_1.FileMetaData.read(protocol); | ||
const metadata = thrift_2.FileMetaData.read(protocol); | ||
return { length: transport.readPos - offset, metadata }; | ||
@@ -56,3 +56,3 @@ } | ||
const protocol = new thrift_1.TCompactProtocol(transport); | ||
const pageHeader = gen_1.PageHeader.read(protocol); | ||
const pageHeader = thrift_2.PageHeader.read(protocol); | ||
return { length: transport.readPos - offset, pageHeader }; | ||
@@ -177,7 +177,7 @@ } | ||
function fieldIndexOf(arr, elem) { | ||
for (let j = 0; j < arr.length; ++j) { | ||
for (let j = 0; j < arr.length; j++) { | ||
if (arr[j].length > elem.length) | ||
continue; | ||
let m = true; | ||
for (let i = 0; i < elem.length; ++i) { | ||
for (let i = 0; i < elem.length; i++) { | ||
if (arr[j][i] === elem[i] || arr[j][i] === '+' || arr[j][i] === '#') | ||
@@ -184,0 +184,0 @@ continue; |
@@ -5,4 +5,4 @@ /// <reference types="node" /> | ||
import { ParquetBuffer } from './declare'; | ||
import { RowGroup } from './gen'; | ||
import { ParquetSchema } from './schema'; | ||
import { RowGroup } from './thrift'; | ||
export interface ParquetWriterOptions { | ||
@@ -9,0 +9,0 @@ baseOffset?: number; |
@@ -6,5 +6,5 @@ "use strict"; | ||
const Compression = require("./compression"); | ||
const Shred = require("./shred"); | ||
// tslint:disable-next-line:max-line-length | ||
const gen_1 = require("./gen"); | ||
const Shred = require("./shred"); | ||
const thrift_1 = require("./thrift"); | ||
const Util = require("./util"); | ||
@@ -275,9 +275,9 @@ const Int64 = require("node-int64"); | ||
/* build page header */ | ||
const header = new gen_1.PageHeader({ | ||
type: gen_1.PageType.DATA_PAGE, | ||
data_page_header: new gen_1.DataPageHeader({ | ||
const header = new thrift_1.PageHeader({ | ||
type: thrift_1.PageType.DATA_PAGE, | ||
data_page_header: new thrift_1.DataPageHeader({ | ||
num_values: data.count, | ||
encoding: gen_1.Encoding[column.encoding], | ||
definition_level_encoding: gen_1.Encoding[PARQUET_RDLVL_ENCODING], | ||
repetition_level_encoding: gen_1.Encoding[PARQUET_RDLVL_ENCODING], | ||
encoding: thrift_1.Encoding[column.encoding], | ||
definition_level_encoding: thrift_1.Encoding[PARQUET_RDLVL_ENCODING], | ||
repetition_level_encoding: thrift_1.Encoding[PARQUET_RDLVL_ENCODING], | ||
}), | ||
@@ -322,9 +322,9 @@ uncompressed_page_size: dataBuf.length, | ||
/* build page header */ | ||
const header = new gen_1.PageHeader({ | ||
type: gen_1.PageType.DATA_PAGE_V2, | ||
data_page_header_v2: new gen_1.DataPageHeaderV2({ | ||
const header = new thrift_1.PageHeader({ | ||
type: thrift_1.PageType.DATA_PAGE_V2, | ||
data_page_header_v2: new thrift_1.DataPageHeaderV2({ | ||
num_values: data.count, | ||
num_nulls: data.count - data.values.length, | ||
num_rows: rowCount, | ||
encoding: gen_1.Encoding[column.encoding], | ||
encoding: thrift_1.Encoding[column.encoding], | ||
definition_levels_byte_length: dLevelsBuf.length, | ||
@@ -376,3 +376,3 @@ repetition_levels_byte_length: rLevelsBuf.length, | ||
/* prepare metadata header */ | ||
const metadata = new gen_1.ColumnMetaData({ | ||
const metadata = new thrift_1.ColumnMetaData({ | ||
path_in_schema: column.path, | ||
@@ -384,4 +384,4 @@ num_values: data.count, | ||
total_compressed_size, | ||
type: gen_1.Type[column.primitiveType], | ||
codec: gen_1.CompressionCodec[column.compression] | ||
type: thrift_1.Type[column.primitiveType], | ||
codec: thrift_1.CompressionCodec[column.compression] | ||
}); | ||
@@ -393,3 +393,3 @@ /* list encodings */ | ||
for (const k in encodingsSet) { | ||
metadata.encodings.push(gen_1.Encoding[k]); | ||
metadata.encodings.push(thrift_1.Encoding[k]); | ||
} | ||
@@ -405,3 +405,3 @@ /* concat metadata header and data pages */ | ||
function encodeRowGroup(schema, data, opts) { | ||
const metadata = new gen_1.RowGroup({ | ||
const metadata = new thrift_1.RowGroup({ | ||
num_rows: data.rowCount, | ||
@@ -417,3 +417,3 @@ columns: [], | ||
const cchunkData = encodeColumnChunk(field, data, body.length, opts); | ||
const cchunk = new gen_1.ColumnChunk({ | ||
const cchunk = new thrift_1.ColumnChunk({ | ||
file_offset: cchunkData.metadataOffset, | ||
@@ -432,3 +432,3 @@ meta_data: cchunkData.metadata | ||
function encodeFooter(schema, rowCount, rowGroups, userMetadata) { | ||
const metadata = new gen_1.FileMetaData({ | ||
const metadata = new thrift_1.FileMetaData({ | ||
version: PARQUET_VERSION, | ||
@@ -442,3 +442,3 @@ created_by: 'parquets', | ||
for (const key in userMetadata) { | ||
const kv = new gen_1.KeyValue({ | ||
const kv = new thrift_1.KeyValue({ | ||
key, | ||
@@ -450,3 +450,3 @@ value: userMetadata[key] | ||
{ | ||
const schemaRoot = new gen_1.SchemaElement({ | ||
const schemaRoot = new thrift_1.SchemaElement({ | ||
name: 'root', | ||
@@ -458,4 +458,4 @@ num_children: Object.keys(schema.fields).length | ||
for (const field of schema.fieldList) { | ||
const relt = gen_1.FieldRepetitionType[field.repetitionType]; | ||
const schemaElem = new gen_1.SchemaElement({ | ||
const relt = thrift_1.FieldRepetitionType[field.repetitionType]; | ||
const schemaElem = new thrift_1.SchemaElement({ | ||
name: field.name, | ||
@@ -468,6 +468,6 @@ repetition_type: relt | ||
else { | ||
schemaElem.type = gen_1.Type[field.primitiveType]; | ||
schemaElem.type = thrift_1.Type[field.primitiveType]; | ||
} | ||
if (field.originalType) { | ||
schemaElem.converted_type = gen_1.ConvertedType[field.originalType]; | ||
schemaElem.converted_type = thrift_1.ConvertedType[field.originalType]; | ||
} | ||
@@ -474,0 +474,0 @@ schemaElem.type_length = field.typeLength; |
{ | ||
"name": "parquets", | ||
"description": "TypeScript implementation of the Parquet file format, based on parquet.js", | ||
"version": "0.10.6", | ||
"version": "0.10.7", | ||
"upstream": "0.10.1", | ||
@@ -42,6 +42,4 @@ "homepage": "https://github.com/kbajalc/parquets", | ||
"bson": "^4.0.2", | ||
"debug": "^4.1.1", | ||
"int53": "^1.0.0", | ||
"node-int64": "^0.4.0", | ||
"snappyjs": "^0.6.0", | ||
"thrift": "^0.12.0", | ||
@@ -60,6 +58,7 @@ "varint": "^5.0.0" | ||
"@types/debug": "^4.1.4", | ||
"@types/jest": "^24.0.16", | ||
"@types/jest": "^24.0.17", | ||
"@types/mocha": "^5.2.7", | ||
"@types/node": "^10.14.14", | ||
"@types/node": "^10.14.15", | ||
"@types/node-int64": "^0.4.29", | ||
"@types/snappy": "^6.0.0", | ||
"@types/thrift": "^0.10.8", | ||
@@ -70,2 +69,3 @@ "@types/varint": "^5.0.0", | ||
"chai": "^4.2.0", | ||
"debug": "^4.1.1", | ||
"jest": "^24.8.0", | ||
@@ -75,7 +75,6 @@ "jest-environment-node": "^24.8.0", | ||
"lzo": "^0.4.11", | ||
"mocha": "^6.2.0", | ||
"node-snappy": "^0.1.4", | ||
"object-stream": "0.0.1", | ||
"snappy": "^6.2.3", | ||
"ts-jest": "^24.0.2", | ||
"ts-mocha": "^6.0.0", | ||
"ts-node": "^8.3.0", | ||
@@ -82,0 +81,0 @@ "tslint": "^5.18.0", |
@@ -13,9 +13,8 @@ # parquets | ||
with the [Parquet specification](https://github.com/apache/parquet-format) and is being tested | ||
for compatibility with Apache's Java [reference implementation](https://github.com/apache/parquet-mr). | ||
for compatibility with Apache's [reference implementation](https://github.com/apache/parquet-mr). | ||
**WARNING**: *There are compatibility issues with the reference implementation [Appache Drill](https://drill.apache.org)*: | ||
**WARNING**: *There are compatibility issues with the reference implementation*: | ||
- only GZIP and SNAPPY compressions are compatible | ||
- resolved problem with columns 'optional': true and with 'compression' enabled | ||
- files with 'repeated' columns can not be read with Drill | ||
- columns with nested 'fields' are not compatible | ||
- [Parquet Tools](https://github.com/apache/parquet-mr/tree/master/parquet-tools) are command line tools that aid in the inspection of Parquet files. | ||
- always verify your table structure loaded with realistic data sample can be read by Parquet Tools! | ||
@@ -22,0 +21,0 @@ **What is Parquet?**: Parquet is a column-oriented file format; it allows you to |
@@ -54,3 +54,3 @@ import { PrimitiveType } from '../declare'; | ||
buf.fill(0); | ||
for (let i = 0; i < values.length; ++i) { | ||
for (let i = 0; i < values.length; i++) { | ||
if (values[i]) { | ||
@@ -65,3 +65,3 @@ buf[Math.floor(i / 8)] |= (1 << (i % 8)); | ||
const values: boolean[] = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
const b = cursor.buffer[cursor.offset + Math.floor(i / 8)]; | ||
@@ -84,3 +84,3 @@ values.push((b & (1 << (i % 8))) > 0); | ||
const values: number[] = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
values.push(cursor.buffer.readInt32LE(cursor.offset)); | ||
@@ -102,3 +102,3 @@ cursor.offset += 4; | ||
const values: number[] = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
values.push(INT53.readInt64LE(cursor.buffer, cursor.offset)); | ||
@@ -126,3 +126,3 @@ cursor.offset += 8; | ||
const values: number[] = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
const low = INT53.readInt64LE(cursor.buffer, cursor.offset); | ||
@@ -150,3 +150,3 @@ const high = cursor.buffer.readUInt32LE(cursor.offset + 8); | ||
const values: number[] = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
values.push(cursor.buffer.readFloatLE(cursor.offset)); | ||
@@ -168,3 +168,3 @@ cursor.offset += 4; | ||
const values: number[] = []; | ||
for (let i = 0; i < count; ++i) { | ||
for (let i = 0; i < count; i++) { | ||
values.push(cursor.buffer.readDoubleLE(cursor.offset)); | ||
@@ -171,0 +171,0 @@ cursor.offset += 8; |
@@ -11,3 +11,3 @@ import varint = require('varint'); | ||
const buf = Buffer.alloc(Math.ceil(opts.bitWidth * (values.length / 8))); | ||
for (let b = 0; b < opts.bitWidth * values.length; ++b) { | ||
for (let b = 0; b < opts.bitWidth * values.length; b++) { | ||
if ((values[Math.floor(b / opts.bitWidth)] & (1 << b % opts.bitWidth)) > 0) { | ||
@@ -27,3 +27,3 @@ buf[Math.floor(b / 8)] |= (1 << (b % 8)); | ||
for (let i = 0; i < buf.length; ++i) { | ||
for (let i = 0; i < buf.length; i++) { | ||
buf.writeUInt8(value & 0xff, i); | ||
@@ -107,3 +107,3 @@ value >> 8; | ||
const values = new Array(count).fill(0); | ||
for (let b = 0; b < opts.bitWidth * count; ++b) { | ||
for (let b = 0; b < opts.bitWidth * count; b++) { | ||
if (cursor.buffer[cursor.offset + Math.floor(b / 8)] & (1 << (b % 8))) { | ||
@@ -120,3 +120,3 @@ values[Math.floor(b / opts.bitWidth)] |= (1 << b % opts.bitWidth); | ||
let value = 0; | ||
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); ++i) { | ||
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); i++) { | ||
value << 8; | ||
@@ -123,0 +123,0 @@ value += cursor.buffer[cursor.offset]; |
import { ParquetCompression } from './declare'; | ||
import * as Util from './util'; | ||
import zlib = require('zlib'); | ||
import snappyjs = require('snappyjs'); | ||
import snappyjs = require('./snappy'); | ||
@@ -6,0 +6,0 @@ let brotli: any; |
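The only change in compression (both the compiled lib/compression.js earlier and this src/compression.ts) is that the snappyjs npm dependency is replaced by a vendored './snappy' module; the ambient declaration for 'snappyjs' is correspondingly commented out later in this diff. That declaration also documents the surface the vendored module is expected to keep, sketched here as an interface:

```ts
// Surface the vendored ./snappy module is expected to provide, per the
// (now commented-out) ambient declaration for 'snappyjs' in this diff.
interface SnappyCodec {
  compress(uncompressed: Buffer): Buffer;
  compress(uncompressed: ArrayBuffer): ArrayBuffer;
  compress(uncompressed: Uint8Array): Uint8Array;
  uncompress(compressed: Buffer): Buffer;
  uncompress(compressed: ArrayBuffer): ArrayBuffer;
  uncompress(compressed: Uint8Array): Uint8Array;
}

// Illustrative roundtrip:
//   const snappy: SnappyCodec = require('./snappy');
//   const original = Buffer.from('hello');
//   const restored = snappy.uncompress(snappy.compress(original));  // equals original
```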
@@ -0,1 +1,2 @@ | ||
export type ParquetCodec = 'PLAIN' | 'RLE'; | ||
@@ -43,6 +44,6 @@ export type ParquetCompression = 'UNCOMPRESSED' | 'GZIP' | 'SNAPPY' | 'LZO' | 'BROTLI' | 'LZ4'; | ||
export interface SchemaDefinition { | ||
[string: string]: ElementDefinition; | ||
[string: string]: FieldDefinition; | ||
} | ||
export interface ElementDefinition { | ||
export interface FieldDefinition { | ||
type?: ParquetType; | ||
@@ -57,5 +58,6 @@ typeLength?: number; | ||
export interface FieldDefinition { | ||
export interface ParquetField { | ||
name: string; | ||
path: string[]; | ||
key: string; | ||
primitiveType?: PrimitiveType; | ||
@@ -71,3 +73,3 @@ originalType?: OriginalType; | ||
fieldCount?: number; | ||
fields?: Record<string, FieldDefinition>; | ||
fields?: Record<string, ParquetField>; | ||
} | ||
@@ -74,0 +76,0 @@ |
@@ -12,9 +12,9 @@ declare module 'int53' { | ||
declare module 'snappyjs' { | ||
declare function compress(uncompressed: Buffer): Buffer; | ||
declare function compress(uncompressed: ArrayBuffer): ArrayBuffer; | ||
declare function compress(uncompressed: Uint8Array): Uint8Array; | ||
declare function uncompress(compressed: Buffer): Buffer; | ||
declare function uncompress(compressed: ArrayBuffer): ArrayBuffer; | ||
declare function uncompress(compressed: Uint8Array): Uint8Array; | ||
} | ||
// declare module 'snappyjs' { | ||
// declare function compress(uncompressed: Buffer): Buffer; | ||
// declare function compress(uncompressed: ArrayBuffer): ArrayBuffer; | ||
// declare function compress(uncompressed: Uint8Array): Uint8Array; | ||
// declare function uncompress(compressed: Buffer): Buffer; | ||
// declare function uncompress(compressed: ArrayBuffer): ArrayBuffer; | ||
// declare function uncompress(compressed: Uint8Array): Uint8Array; | ||
// } |
import { CursorBuffer, ParquetCodecOptions, PARQUET_CODEC } from './codec'; | ||
import * as Compression from './compression'; | ||
import { FieldDefinition, ParquetBuffer, ParquetCodec, ParquetCompression, ParquetData, ParquetRecord, ParquetType, PrimitiveType, SchemaDefinition } from './declare'; | ||
// tslint:disable-next-line:max-line-length | ||
import { ColumnChunk, CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, FileMetaData, PageHeader, PageType, RowGroup, SchemaElement, Type } from './gen'; | ||
import { ParquetBuffer, ParquetCodec, ParquetCompression, ParquetData, ParquetField, ParquetRecord, ParquetType, PrimitiveType, SchemaDefinition } from './declare'; | ||
import { ParquetSchema } from './schema'; | ||
import * as Shred from './shred'; | ||
// tslint:disable-next-line:max-line-length | ||
import { ColumnChunk, CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, FileMetaData, PageHeader, PageType, RowGroup, SchemaElement, Type } from './thrift'; | ||
import * as Util from './util'; | ||
@@ -301,3 +301,3 @@ // import Fs = require('fs'); | ||
function decodeDataPages(buffer: Buffer, column: FieldDefinition, compression: ParquetCompression): ParquetData { | ||
function decodeDataPages(buffer: Buffer, column: ParquetField, compression: ParquetCompression): ParquetData { | ||
const cursor: CursorBuffer = { | ||
@@ -348,3 +348,3 @@ buffer, | ||
function decodeDataPage(cursor: CursorBuffer, header: PageHeader, column: FieldDefinition, compression: ParquetCompression): ParquetData { | ||
function decodeDataPage(cursor: CursorBuffer, header: PageHeader, column: ParquetField, compression: ParquetCompression): ParquetData { | ||
const cursorEnd = cursor.offset + header.compressed_page_size; | ||
@@ -433,3 +433,3 @@ const valueCount = header.data_page_header.num_values; | ||
if (dlvl === column.dLevelMax) { | ||
++valueCountNonNull; | ||
valueCountNonNull++; | ||
} | ||
@@ -466,3 +466,3 @@ } | ||
function decodeDataPageV2(cursor: CursorBuffer, header: PageHeader, column: FieldDefinition, compression: ParquetCompression): ParquetData { | ||
function decodeDataPageV2(cursor: CursorBuffer, header: PageHeader, column: ParquetField, compression: ParquetCompression): ParquetData { | ||
const cursorEnd = cursor.offset + header.compressed_page_size; | ||
@@ -469,0 +469,0 @@ |
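The reader source changes mirror the compiled lib/reader.js above: the Thrift bindings import moves from './gen' to './thrift' and the page decoders now take ParquetField instead of FieldDefinition; the reading flow itself is untouched. For orientation, a typical read loop looks like this, assuming the parquetjs-style ParquetReader API (openFile, getCursor, next, close) that parquets inherits:

```ts
import { ParquetReader } from 'parquets';

async function dump(path: string): Promise<void> {
  const reader = await ParquetReader.openFile(path);
  try {
    // A cursor walks the file row by row and resolves null when exhausted.
    const cursor = reader.getCursor();
    let record: any;
    while ((record = await cursor.next())) {
      console.log(record);
    }
  } finally {
    await reader.close();
  }
}
```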
import { PARQUET_CODEC } from './codec'; | ||
import { PARQUET_COMPRESSION_METHODS } from './compression'; | ||
import { ElementDefinition, FieldDefinition, ParquetBuffer, ParquetCompression, ParquetRecord, RepetitionType, SchemaDefinition } from './declare'; | ||
import { materializeRecords, shredRecord } from './shred'; | ||
import { FieldDefinition, ParquetBuffer, ParquetCompression, ParquetField, ParquetRecord, RepetitionType, SchemaDefinition } from './declare'; | ||
import { materializeRecords, shredBuffer, shredRecord } from './shred'; | ||
import { PARQUET_LOGICAL_TYPES } from './types'; | ||
@@ -11,5 +11,5 @@ | ||
export class ParquetSchema { | ||
public schema: Record<string, ElementDefinition>; | ||
public fields: Record<string, FieldDefinition>; | ||
public fieldList: FieldDefinition[]; | ||
public schema: Record<string, FieldDefinition>; | ||
public fields: Record<string, ParquetField>; | ||
public fieldList: ParquetField[]; | ||
@@ -28,5 +28,5 @@ /** | ||
*/ | ||
findField(path: string): FieldDefinition; | ||
findField(path: string[]): FieldDefinition; | ||
findField(path: any): FieldDefinition { | ||
findField(path: string): ParquetField; | ||
findField(path: string[]): ParquetField; | ||
findField(path: any): ParquetField { | ||
if (path.constructor !== Array) { | ||
@@ -51,4 +51,4 @@ // tslint:disable-next-line:no-parameter-reassignment | ||
*/ | ||
findFieldBranch(path: string): FieldDefinition[]; | ||
findFieldBranch(path: string[]): FieldDefinition[]; | ||
findFieldBranch(path: string): ParquetField[]; | ||
findFieldBranch(path: string[]): ParquetField[]; | ||
findFieldBranch(path: any): any[] { | ||
@@ -83,2 +83,6 @@ if (path.constructor !== Array) { | ||
} | ||
buffer(): ParquetBuffer { | ||
return shredBuffer(this); | ||
} | ||
} | ||
@@ -102,4 +106,4 @@ | ||
path: string[] | ||
): Record<string, FieldDefinition> { | ||
const fieldList: Record<string, FieldDefinition> = {}; | ||
): Record<string, ParquetField> { | ||
const fieldList: Record<string, ParquetField> = {}; | ||
@@ -128,5 +132,7 @@ for (const name in schema) { | ||
if (opts.fields) { | ||
const cpath = path.concat([name]); | ||
fieldList[name] = { | ||
name, | ||
path: path.concat([name]), | ||
path: cpath, | ||
key: cpath.join(), | ||
repetitionType, | ||
@@ -141,3 +147,4 @@ rLevelMax, | ||
dLevelMax, | ||
path.concat([name])) | ||
cpath | ||
) | ||
}; | ||
@@ -163,2 +170,3 @@ continue; | ||
/* add to schema */ | ||
const cpath = path.concat([name]); | ||
fieldList[name] = { | ||
@@ -168,3 +176,4 @@ name, | ||
originalType: typeDef.originalType, | ||
path: path.concat([name]), | ||
path: cpath, | ||
key: cpath.join(), | ||
repetitionType, | ||
@@ -181,4 +190,4 @@ encoding: opts.encoding, | ||
function listFields(fields: Record<string, FieldDefinition>): FieldDefinition[] { | ||
let list: FieldDefinition[] = []; | ||
function listFields(fields: Record<string, ParquetField>): ParquetField[] { | ||
let list: ParquetField[] = []; | ||
for (const k in fields) { | ||
@@ -185,0 +194,0 @@ list.push(fields[k]); |
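In schema.ts the findField/findFieldBranch overloads keep their string and string[] forms but now return ParquetField, and both resolve against the same comma-joined path that buildFields caches in field.key. A lookup sketch under that assumption:

```ts
import { ParquetSchema } from 'parquets';

const schema = new ParquetSchema({
  stock: {
    repeated: true,
    fields: {
      quantity: { type: 'INT64', optional: true },
      warehouse: { type: 'UTF8' }
    }
  }
});

// Array path and comma-joined key resolve to the same ParquetField.
const byArray = schema.findField(['stock', 'quantity']);
const byKey = schema.findField('stock,quantity');
console.log(byKey === byArray);                   // true
console.log(byKey.rLevelMax, byKey.dLevelMax);    // 1 2

// findFieldBranch returns the leaf plus all of its ancestors, root first.
console.log(schema.findFieldBranch('stock,quantity').map(f => f.name));
// [ 'stock', 'quantity' ]
```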
src/shred.ts
@@ -1,5 +0,18 @@ | ||
import { FieldDefinition, ParquetBuffer, ParquetData, ParquetRecord } from './declare'; | ||
import { ParquetBuffer, ParquetData, ParquetField, ParquetRecord } from './declare'; | ||
import { ParquetSchema } from './schema'; | ||
import * as Types from './types'; | ||
export function shredBuffer(schema: ParquetSchema): ParquetBuffer { | ||
const columnData: Record<string, ParquetData> = {}; | ||
for (const field of schema.fieldList) { | ||
columnData[field.key] = { | ||
dlevels: [], | ||
rlevels: [], | ||
values: [], | ||
count: 0 | ||
}; | ||
} | ||
return { rowCount: 0, columnData }; | ||
} | ||
/** | ||
@@ -26,73 +39,43 @@ * 'Shred' a record into a list of <value, repetition_level, definition_level> | ||
* } | ||
* | ||
*/ | ||
export function shredRecord(schema: ParquetSchema, record: any, buffer: ParquetBuffer): void { | ||
/* shred the record, this may raise an exception */ | ||
const recordShredded: Record<string, ParquetData> = {}; | ||
for (const field of schema.fieldList) { | ||
recordShredded[field.path.join()] = { | ||
dlevels: [], | ||
rlevels: [], | ||
values: [], | ||
count: 0 | ||
}; | ||
} | ||
const data = shredBuffer(schema).columnData; | ||
shredRecordInternal(schema.fields, record, recordShredded, 0, 0); | ||
shredRecordFields(schema.fields, record, data, 0, 0); | ||
/* if no error during shredding, add the shredded record to the buffer */ | ||
if (!('columnData' in buffer) || !('rowCount' in buffer)) { | ||
buffer.rowCount = 0; | ||
buffer.columnData = {}; | ||
for (const field of schema.fieldList) { | ||
const cd: ParquetData = { | ||
dlevels: [], | ||
rlevels: [], | ||
values: [], | ||
count: 0 | ||
}; | ||
buffer.columnData[field.path.join()] = cd; | ||
} | ||
buffer.rowCount = 1; | ||
buffer.columnData = data; | ||
return; | ||
} | ||
buffer.rowCount += 1; | ||
for (const field of schema.fieldList) { | ||
Array.prototype.push.apply( | ||
buffer.columnData[field.path.join()].rlevels, | ||
recordShredded[field.path.join()].rlevels); | ||
Array.prototype.push.apply( | ||
buffer.columnData[field.path.join()].dlevels, | ||
recordShredded[field.path.join()].dlevels); | ||
Array.prototype.push.apply( | ||
buffer.columnData[field.path.join()].values, | ||
recordShredded[field.path.join()].values); | ||
buffer.columnData[field.path.join()].count += recordShredded[field.path.join()].count; | ||
Array.prototype.push.apply(buffer.columnData[field.key].rlevels, data[field.key].rlevels); | ||
Array.prototype.push.apply(buffer.columnData[field.key].dlevels, data[field.key].dlevels); | ||
Array.prototype.push.apply(buffer.columnData[field.key].values, data[field.key].values); | ||
buffer.columnData[field.key].count += data[field.key].count; | ||
} | ||
} | ||
function shredRecordInternal( | ||
fields: Record<string, FieldDefinition>, | ||
function shredRecordFields( | ||
fields: Record<string, ParquetField>, | ||
record: any, | ||
data: Record<string, ParquetData>, | ||
rlvl: number, | ||
dlvl: number | ||
rLevel: number, | ||
dLevel: number | ||
) { | ||
for (const fieldName in fields) { | ||
const field = fields[fieldName]; | ||
const fieldType = field.originalType || field.primitiveType; | ||
for (const name in fields) { | ||
const field = fields[name]; | ||
// fetch values | ||
let values = []; | ||
if (record && (fieldName in record) && record[fieldName] !== undefined && record[fieldName] !== null) { | ||
if (record[fieldName].constructor === Array) { | ||
values = record[fieldName]; | ||
if (record && (field.name in record) && record[field.name] !== undefined && record[field.name] !== null) { | ||
if (record[field.name].constructor === Array) { | ||
values = record[field.name]; | ||
} else { | ||
values.push(record[fieldName]); | ||
values.push(record[field.name]); | ||
} | ||
} | ||
// check values | ||
@@ -102,3 +85,2 @@ if (values.length === 0 && !!record && field.repetitionType === 'REQUIRED') { | ||
} | ||
if (values.length > 1 && field.repetitionType !== 'REPEATED') { | ||
@@ -111,12 +93,12 @@ throw new Error(`too many values for field: ${field.name}`); | ||
if (field.isNested) { | ||
shredRecordInternal( | ||
shredRecordFields( | ||
field.fields, | ||
null, | ||
data, | ||
rlvl, | ||
dlvl); | ||
rLevel, | ||
dLevel); | ||
} else { | ||
data[field.path.join()].rlevels.push(rlvl); | ||
data[field.path.join()].dlevels.push(dlvl); | ||
data[field.path.join()].count += 1; | ||
data[field.key].count += 1; | ||
data[field.key].rlevels.push(rLevel); | ||
data[field.key].dlevels.push(dLevel); | ||
} | ||
@@ -127,18 +109,19 @@ continue; | ||
// push values | ||
for (let i = 0; i < values.length; ++i) { | ||
// tslint:disable-next-line:variable-name | ||
const rlvl_i = i === 0 ? rlvl : field.rLevelMax; | ||
for (let i = 0; i < values.length; i++) { | ||
const rlvl = i === 0 ? rLevel : field.rLevelMax; | ||
if (field.isNested) { | ||
shredRecordInternal( | ||
shredRecordFields( | ||
field.fields, | ||
values[i], | ||
data, | ||
rlvl_i, | ||
rlvl, | ||
field.dLevelMax); | ||
} else { | ||
data[field.path.join()].values.push(Types.toPrimitive(fieldType, values[i])); | ||
data[field.path.join()].rlevels.push(rlvl_i); | ||
data[field.path.join()].dlevels.push(field.dLevelMax); | ||
data[field.path.join()].count += 1; | ||
data[field.key].count += 1; | ||
data[field.key].rlevels.push(rlvl); | ||
data[field.key].dlevels.push(field.dLevelMax); | ||
data[field.key].values.push(Types.toPrimitive( | ||
field.originalType || field.primitiveType, | ||
values[i] | ||
)); | ||
} | ||
@@ -167,93 +150,63 @@ } | ||
* } | ||
* | ||
*/ | ||
export function materializeRecords(schema: ParquetSchema, buffer: ParquetBuffer): ParquetRecord[] { | ||
const records: ParquetRecord[] = []; | ||
for (let i = 0; i < buffer.rowCount; ++i) { | ||
records.push({}); | ||
for (let i = 0; i < buffer.rowCount; i++) records.push({}); | ||
for (const key in buffer.columnData) { | ||
materializeColumn(schema, buffer, key, records); | ||
} | ||
for (const k in buffer.columnData) { | ||
const field = schema.findField(k); | ||
const fieldBranch = schema.findFieldBranch(k); | ||
const values = buffer.columnData[k].values[Symbol.iterator](); | ||
// tslint:disable-next-line:prefer-array-literal | ||
const rLevels = new Array(field.rLevelMax + 1); | ||
rLevels.fill(0); | ||
for (let i = 0; i < buffer.columnData[k].count; ++i) { | ||
const dLevel = buffer.columnData[k].dlevels[i]; | ||
const rLevel = buffer.columnData[k].rlevels[i]; | ||
rLevels[rLevel]++; | ||
rLevels.fill(0, rLevel + 1); | ||
let value = null; | ||
if (dLevel === field.dLevelMax) { | ||
value = Types.fromPrimitive( | ||
field.originalType || field.primitiveType, | ||
values.next().value); | ||
} | ||
materializeRecordField( | ||
records[rLevels[0] - 1], | ||
fieldBranch, | ||
rLevels.slice(1), | ||
dLevel, | ||
value); | ||
} | ||
} | ||
return records; | ||
} | ||
function materializeRecordField(record: any, branch: FieldDefinition[], rLevels: number[], dLevel: number, value: any): void { | ||
const node = branch[0]; | ||
function materializeColumn(schema: ParquetSchema, buffer: ParquetBuffer, key: string, records: ParquetRecord[]) { | ||
const data = buffer.columnData[key]; | ||
if (!data.count) return; | ||
if (dLevel < node.dLevelMax) { | ||
return; | ||
} | ||
const field = schema.findField(key); | ||
const branch = schema.findFieldBranch(key); | ||
if (branch.length > 1) { | ||
if (node.repetitionType === 'REPEATED') { | ||
if (!(node.name in record)) { | ||
record[node.name] = []; | ||
} | ||
// tslint:disable-next-line:prefer-array-literal | ||
const rLevels: number[] = new Array(field.rLevelMax + 1).fill(0); | ||
let vIndex = 0; | ||
for (let i = 0; i < data.count; i++) { | ||
const dLevel = data.dlevels[i]; | ||
const rLevel = data.rlevels[i]; | ||
rLevels[rLevel]++; | ||
rLevels.fill(0, rLevel + 1); | ||
while (record[node.name].length < rLevels[0] + 1) { | ||
record[node.name].push({}); | ||
let rIndex = 0; | ||
let record = records[rLevels[rIndex++] - 1]; | ||
// Internal nodes | ||
for (const step of branch) { | ||
if (step === field) break; | ||
if (dLevel < step.dLevelMax) break; | ||
if (step.repetitionType === 'REPEATED') { | ||
if (!(step.name in record)) record[step.name] = []; | ||
const ix = rLevels[rIndex++]; | ||
while (record[step.name].length <= ix) record[step.name].push({}); | ||
record = record[step.name][ix]; | ||
} else { | ||
record[step.name] = record[step.name] || {}; | ||
record = record[step.name]; | ||
} | ||
materializeRecordField( | ||
record[node.name][rLevels[0]], | ||
branch.slice(1), | ||
rLevels.slice(1), | ||
dLevel, | ||
value); | ||
} else { | ||
record[node.name] = record[node.name] || {}; | ||
materializeRecordField( | ||
record[node.name], | ||
branch.slice(1), | ||
rLevels, | ||
dLevel, | ||
value); | ||
} | ||
} else { | ||
if (node.repetitionType === 'REPEATED') { | ||
if (!(node.name in record)) { | ||
record[node.name] = []; | ||
} | ||
while (record[node.name].length < rLevels[0] + 1) { | ||
record[node.name].push(null); | ||
// Leaf node | ||
if (dLevel === field.dLevelMax) { | ||
const value = Types.fromPrimitive( | ||
field.originalType || field.primitiveType, | ||
data.values[vIndex] | ||
); | ||
vIndex++; | ||
if (field.repetitionType === 'REPEATED') { | ||
if (!(field.name in record)) record[field.name] = []; | ||
const ix = rLevels[rIndex]; | ||
while (record[field.name].length <= ix) record[field.name].push(null); | ||
record[field.name][ix] = value; | ||
} else { | ||
record[field.name] = value; | ||
} | ||
record[node.name][rLevels[0]] = value; | ||
} else { | ||
record[node.name] = value; | ||
} | ||
} | ||
} |
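To make the level bookkeeping in shredRecordFields concrete: the first value of a repeated field carries the inherited repetition level, every later value carries the field's rLevelMax, and the definition level is dLevelMax whenever a value is actually present. Worked out by hand for a two-field schema and a single record (values are stored in primitive form, e.g. Buffers for UTF8, via Types.toPrimitive):

```ts
// schema: name   -> REQUIRED UTF8  (rLevelMax = 0, dLevelMax = 0)
//         colors -> REPEATED UTF8  (rLevelMax = 1, dLevelMax = 1)
//
// record: { name: 'apple', colors: ['red', 'green'] }
//
// Expected columnData after shredding this one record:
const expected = {
  name:   { rlevels: [0],    dlevels: [0],    count: 1 },  // values: ['apple']
  colors: { rlevels: [0, 1], dlevels: [1, 1], count: 2 }   // values: ['red', 'green']
};
// 'red' is the first element of the list, so it keeps rlevel 0 (new row);
// 'green' repeats within the same row, so it takes rLevelMax = 1.
```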
import fs = require('fs'); | ||
import { TBufferedTransport, TCompactProtocol, TFramedTransport } from 'thrift'; | ||
import { FileMetaData, PageHeader } from './gen'; | ||
import { FileMetaData, PageHeader } from './thrift'; | ||
@@ -183,6 +183,6 @@ export interface WriteStreamOptions { | ||
export function fieldIndexOf(arr: string[][], elem: string[]): number { | ||
for (let j = 0; j < arr.length; ++j) { | ||
for (let j = 0; j < arr.length; j++) { | ||
if (arr[j].length > elem.length) continue; | ||
let m = true; | ||
for (let i = 0; i < elem.length; ++i) { | ||
for (let i = 0; i < elem.length; i++) { | ||
if (arr[j][i] === elem[i] || arr[j][i] === '+' || arr[j][i] === '#') continue; | ||
@@ -189,0 +189,0 @@ if (i >= arr[j].length && arr[j][arr[j].length - 1] === '#') continue; |
@@ -5,7 +5,7 @@ import { WriteStream } from 'fs'; | ||
import * as Compression from './compression'; | ||
import { FieldDefinition, ParquetBuffer, ParquetCodec, ParquetData, PrimitiveType } from './declare'; | ||
// tslint:disable-next-line:max-line-length | ||
import { ColumnChunk, ColumnMetaData, CompressionCodec, ConvertedType, DataPageHeader, DataPageHeaderV2, Encoding, FieldRepetitionType, FileMetaData, KeyValue, PageHeader, PageType, RowGroup, SchemaElement, Type } from './gen'; | ||
import { ParquetBuffer, ParquetCodec, ParquetData, ParquetField, PrimitiveType } from './declare'; | ||
import { ParquetSchema } from './schema'; | ||
import * as Shred from './shred'; | ||
// tslint:disable-next-line:max-line-length | ||
import { ColumnChunk, ColumnMetaData, CompressionCodec, ConvertedType, DataPageHeader, DataPageHeaderV2, Encoding, FieldRepetitionType, FileMetaData, KeyValue, PageHeader, PageType, RowGroup, SchemaElement, Type } from './thrift'; | ||
import * as Util from './util'; | ||
@@ -332,3 +332,3 @@ import Int64 = require('node-int64'); | ||
function encodeDataPage( | ||
column: FieldDefinition, | ||
column: ParquetField, | ||
data: ParquetData | ||
@@ -413,3 +413,3 @@ ): { | ||
function encodeDataPageV2( | ||
column: FieldDefinition, | ||
column: ParquetField, | ||
data: ParquetData, | ||
@@ -491,3 +491,3 @@ rowCount: number | ||
function encodeColumnChunk( | ||
column: FieldDefinition, | ||
column: ParquetField, | ||
buffer: ParquetBuffer, | ||
@@ -494,0 +494,0 @@ offset: number, |
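The writer source mirrors lib/writer.js: only the Thrift bindings import and the ParquetField parameter types change, while the page/column/footer encoding itself is untouched. For context, the write path these encoders serve looks roughly like this at the API level, assuming the parquetjs-style ParquetWriter (openFile, appendRow, close) that parquets inherits:

```ts
import { ParquetSchema, ParquetWriter } from 'parquets';

async function writeSample(path: string): Promise<void> {
  const schema = new ParquetSchema({
    name: { type: 'UTF8' },
    quantity: { type: 'INT64', optional: true },
    price: { type: 'DOUBLE', compression: 'SNAPPY' }
  });

  const writer = await ParquetWriter.openFile(schema, path);
  await writer.appendRow({ name: 'apples', quantity: 10, price: 2.5 });
  await writer.appendRow({ name: 'oranges', price: 2.0 });
  await writer.close();   // flushes the final row group and writes the footer
}
```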
License Policy Violation
License: This package is not allowed per your license policy. Review the package's license to ensure compliance.
Found 1 instance in 1 package
Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package
- Removed debug@^4.1.1
- Removed snappyjs@^0.6.0
- Removed debug@4.3.7 (transitive)
- Removed ms@2.1.3 (transitive)
- Removed snappyjs@0.6.1 (transitive)