mongodb-schema
Comparing version 10.0.1 to 10.0.2
@@ -5,3 +5,4 @@ /// <reference types="node" />
 import stream from './stream';
-import type { SchemaParseOptions, Schema, SchemaField } from './stream';
+import { SchemaAnalyzer } from './schema-analyzer';
+import type { SchemaParseOptions, Schema, SchemaField } from './schema-analyzer';
 import * as schemaStats from './stats';
@@ -12,2 +13,2 @@ type MongoDBCursor = AggregationCursor | FindCursor;
 export type { Schema, SchemaField };
-export { stream, schemaStats };
+export { stream, SchemaAnalyzer, schemaStats };
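The entry point now re-exports SchemaAnalyzer next to the existing stream and schemaStats exports, so documents can be analyzed without any stream plumbing. A minimal sketch, assuming the analyzer API visible in the stream.js diff further down (analyzeDoc, getResult); the sample documents are made up:

import { SchemaAnalyzer } from 'mongodb-schema';

// Feed documents to the analyzer one at a time.
const analyzer = new SchemaAnalyzer({ storeValues: true });
for (const doc of [{ _id: 1, title: 'a' }, { _id: 2, title: 'b', tags: ['x'] }]) {
    analyzer.analyzeDoc(doc);
}

// The accumulated result has the exported Schema shape: { count, fields }.
const schema = analyzer.getResult();
console.log(schema.count); // 2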
@@ -29,3 +29,3 @@ "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.schemaStats = exports.stream = void 0;
+exports.schemaStats = exports.SchemaAnalyzer = exports.stream = void 0;
 const stream_1 = require("stream");
@@ -35,2 +35,4 @@ const util_1 = require("util");
 exports.stream = stream_2.default;
+const schema_analyzer_1 = require("./schema-analyzer");
+Object.defineProperty(exports, "SchemaAnalyzer", { enumerable: true, get: function () { return schema_analyzer_1.SchemaAnalyzer; } });
 const schemaStats = __importStar(require("./stats"));
@@ -37,0 +39,0 @@ exports.schemaStats = schemaStats;

@@ -1,2 +1,2 @@
-import type { Schema } from './stream';
+import type { Schema } from './schema-analyzer';
 declare function widthRecursive(schema?: Schema): number;
@@ -3,0 +3,0 @@ declare function depthRecursive(schema?: Schema): number;
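The stats helpers only needed their Schema import repointed at the new module. A sketch of how they might be called through the schemaStats namespace export, assuming widthRecursive and depthRecursive are exported under their declared names (the export statement itself is truncated in this diff):

import { schemaStats } from 'mongodb-schema';
import type { Schema } from 'mongodb-schema';

declare const schema: Schema; // e.g. the result of SchemaAnalyzer#getResult()

const width = schemaStats.widthRecursive(schema); // overall field count across nesting levels (assumed semantics)
const depth = schemaStats.depthRecursive(schema); // deepest level of nesting (assumed semantics)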
 /// <reference types="node" />
 import { Duplex } from 'stream';
-import type { ObjectId, MinKey, MaxKey, Long, Double, Int32, Decimal128, Binary, BSONRegExp, Code, BSONSymbol, Timestamp } from 'bson';
-type BaseSchemaType = {
-    path: string;
-    count: number;
-    probability: number;
-    has_duplicates: boolean;
-    unique: number;
-};
-type ConstantSchemaType = BaseSchemaType & {
-    name: 'Null' | 'Undefined';
-};
-type TypeCastMap = {
-    Array: unknown[];
-    Binary: Binary;
-    Boolean: boolean;
-    Code: Code;
-    Date: Date;
-    Decimal128: Decimal128;
-    Double: Double;
-    Int32: Int32;
-    Int64: Long;
-    MaxKey: MaxKey;
-    MinKey: MinKey;
-    Null: null;
-    Object: Record<string, unknown>;
-    ObjectId: ObjectId;
-    BSONRegExp: BSONRegExp;
-    String: string;
-    BSONSymbol: BSONSymbol;
-    Timestamp: Timestamp;
-    Undefined: undefined;
-};
-type TypeCastTypes = keyof TypeCastMap;
-type BSONValue = TypeCastMap[TypeCastTypes];
-export type PrimitiveSchemaType = BaseSchemaType & {
-    name: 'String' | 'Number' | 'Int32' | 'Boolean' | 'Decimal128' | 'Long' | 'ObjectId' | 'Date' | 'RegExp' | 'Symbol' | 'MaxKey' | 'MinKey' | 'Binary' | 'Code' | 'Timestamp' | 'DBRef';
-    values: BSONValue[];
-};
-export type ArraySchemaType = BaseSchemaType & {
-    name: 'Array';
-    lengths: number[];
-    average_length: number;
-    total_count: number;
-    types: SchemaType[];
-};
-export type DocumentSchemaType = BaseSchemaType & {
-    name: 'Document';
-    fields: SchemaField[];
-};
-export type SchemaType = ConstantSchemaType | PrimitiveSchemaType | ArraySchemaType | DocumentSchemaType;
-export type SchemaField = {
-    name: string;
-    count: number;
-    path: string;
-    type: string | string[];
-    probability: number;
-    has_duplicates: boolean;
-    types: SchemaType[];
-};
-export type Schema = {
-    count: number;
-    fields: SchemaField[];
-};
-type SemanticTypeFunction = ((value: string, path?: string) => boolean);
-type SemanticTypeMap = {
-    [typeName: string]: SemanticTypeFunction | boolean;
-};
-export type SchemaParseOptions = {
-    semanticTypes?: boolean | SemanticTypeMap;
-    storeValues?: boolean;
-};
-declare function parse(options?: SchemaParseOptions): Duplex;
-export default parse;
+import type { Document } from 'bson';
+import { SchemaAnalyzer } from './schema-analyzer';
+import type { SchemaParseOptions } from './schema-analyzer';
+export declare class ParseStream extends Duplex {
+    analyzer: SchemaAnalyzer;
+    constructor(options?: SchemaParseOptions);
+    _write(obj: Document, enc: unknown, cb: () => void): void;
+    _read(): void;
+    _final(cb: () => void): void;
+}
+export default function makeParseStream(options?: SchemaParseOptions): ParseStream;
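The rewritten stream module is now a thin object-mode Duplex around SchemaAnalyzer: _write hands each document to analyzeDoc, and _final pushes the finished schema before ending the stream. A usage sketch under those assumptions, with made-up sample documents:

import { Readable } from 'stream';
import { stream as makeParseStream } from 'mongodb-schema';
import type { Schema } from 'mongodb-schema';

const docs = Readable.from([{ a: 1 }, { a: 2, b: 'x' }]);
const parser = makeParseStream({ storeValues: true });

parser.on('progress', () => { /* emitted once per analyzed document */ });
parser.on('data', (schema: Schema) => {
    // Pushed once, after the writable side finishes.
    console.log(schema.count, schema.fields.map((f) => f.name));
});

docs.pipe(parser);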
"use strict"; | ||
var __importDefault = (this && this.__importDefault) || function (mod) { | ||
return (mod && mod.__esModule) ? mod : { "default": mod }; | ||
}; | ||
Object.defineProperty(exports, "__esModule", { value: true }); | ||
const reservoir_1 = __importDefault(require("reservoir")); | ||
exports.ParseStream = void 0; | ||
const stream_1 = require("stream"); | ||
const lodash_1 = __importDefault(require("lodash")); | ||
const semantic_types_1 = __importDefault(require("./semantic-types")); | ||
function extractStringValueFromBSON(value) { | ||
if (value && value._bsontype) { | ||
if (['Decimal128', 'Long'].includes(value._bsontype)) { | ||
return value.toString(); | ||
} | ||
if (['Double', 'Int32'].includes(value._bsontype)) { | ||
return String(value.value); | ||
} | ||
const schema_analyzer_1 = require("./schema-analyzer"); | ||
class ParseStream extends stream_1.Duplex { | ||
constructor(options) { | ||
super({ objectMode: true }); | ||
this.analyzer = new schema_analyzer_1.SchemaAnalyzer(options); | ||
} | ||
if (typeof value === 'string') { | ||
return value; | ||
_write(obj, enc, cb) { | ||
this.analyzer.analyzeDoc(obj); | ||
this.emit('progress', obj); | ||
cb(); | ||
} | ||
return String(value); | ||
} | ||
function fieldComparator(a, b) { | ||
const aName = a.name; | ||
const bName = b.name; | ||
if (aName === '_id') { | ||
return -1; | ||
_read() { } | ||
_final(cb) { | ||
this.push(this.analyzer.getResult()); | ||
this.push(null); | ||
cb(); | ||
} | ||
if (bName === '_id') { | ||
return 1; | ||
} | ||
return aName.toLowerCase() < bName.toLowerCase() ? -1 : 1; | ||
} | ||
function finalizeSchema(schema, parent, tag) { | ||
if (schema === undefined) { | ||
return; | ||
} | ||
if (tag === undefined) { | ||
finalizeSchema(schema.fields, schema, 'fields'); | ||
} | ||
if (tag === 'fields') { | ||
Object.values(schema).forEach((field) => { | ||
const missing = parent.count - field.count; | ||
if (missing > 0) { | ||
field.types.Undefined = { | ||
name: 'Undefined', | ||
type: 'Undefined', | ||
path: field.path, | ||
count: missing | ||
}; | ||
} | ||
field.total_count = Object.values(field.types) | ||
.map((v) => v.count) | ||
.reduce((p, c) => p + c, 0); | ||
finalizeSchema(field.types, field, 'types'); | ||
field.type = field.types.map((v) => v.name); | ||
if (field.type.length === 1) { | ||
field.type = field.type[0]; | ||
} | ||
field.has_duplicates = !!field.types.find((v) => v.has_duplicates); | ||
field.probability = field.count / parent.count; | ||
}); | ||
parent.fields = Object.values(parent.fields).sort(fieldComparator); | ||
} | ||
if (tag === 'types') { | ||
Object.values(schema).forEach((type) => { | ||
type.total_count = (type.lengths || []).reduce((p, c) => p + c || 0, 0); | ||
finalizeSchema(type.fields, type, 'fields'); | ||
finalizeSchema(type.types, type, 'types'); | ||
type.probability = type.count / (parent.total_count || parent.count); | ||
if (type.name === 'Null' || type.name === 'Undefined') { | ||
delete type.values; | ||
type.unique = type.count === 0 ? 0 : 1; | ||
type.has_duplicates = type.count > 1; | ||
} | ||
else if (type.values) { | ||
type.unique = new Set(type.values.map(extractStringValueFromBSON)).size; | ||
type.has_duplicates = type.unique !== type.values.length; | ||
} | ||
if (type.lengths) { | ||
type.average_length = type.total_count / type.lengths.length; | ||
} | ||
}); | ||
parent.types = Object.values(parent.types).sort((a, b) => b.probability - a.probability); | ||
} | ||
exports.ParseStream = ParseStream; | ||
function makeParseStream(options) { | ||
return new ParseStream(options); | ||
} | ||
function parse(options) { | ||
options = { semanticTypes: false, storeValues: true, ...options }; | ||
let semanticTypes = { | ||
...semantic_types_1.default | ||
}; | ||
if (typeof options.semanticTypes === 'object') { | ||
const enabledTypes = Object.entries(options.semanticTypes) | ||
.filter(([, v]) => typeof v === 'boolean' && v) | ||
.map(([k]) => k.toLowerCase()); | ||
semanticTypes = { | ||
...Object.entries(semanticTypes) | ||
.filter(([k]) => enabledTypes.includes(k.toLowerCase())) | ||
.reduce((p, [k, v]) => ({ ...p, [k]: v }), {}) | ||
}; | ||
Object.entries(options.semanticTypes) | ||
.filter(([, v]) => typeof v === 'function') | ||
.forEach(([k, v]) => { semanticTypes[k] = v; }); | ||
} | ||
const rootSchema = { | ||
fields: {}, | ||
count: 0 | ||
}; | ||
let finalized = false; | ||
function getBSONType(value) { | ||
let T; | ||
if (value && value._bsontype) { | ||
T = value._bsontype; | ||
} | ||
else { | ||
T = Object.prototype.toString.call(value).replace(/^\[object (\w+)\]$/, '$1'); | ||
} | ||
if (T === 'Object') { | ||
T = 'Document'; | ||
} | ||
return T; | ||
} | ||
function getSemanticType(value, path) { | ||
const returnValue = Object.entries(semanticTypes) | ||
.filter(([, v]) => { | ||
return v(value, path); | ||
}) | ||
.map(([k]) => k)[0]; | ||
return returnValue; | ||
} | ||
function addToValue(type, value) { | ||
if (type.name === 'String') { | ||
if (value.length > 10000) { | ||
value = value.charCodeAt(10000 - 1) === value.codePointAt(10000 - 1) | ||
? value.slice(0, 10000) | ||
: value.slice(0, 10000 - 1); | ||
} | ||
} | ||
type.values.pushSome(value); | ||
} | ||
function addToType(path, value, schema) { | ||
var _a; | ||
const bsonType = getBSONType(value); | ||
const typeName = (options === null || options === void 0 ? void 0 : options.semanticTypes) ? getSemanticType(value, path) || bsonType : bsonType; | ||
const type = schema[typeName] = lodash_1.default.get(schema, typeName, { | ||
name: typeName, | ||
bsonType: bsonType, | ||
path: path, | ||
count: 0 | ||
}); | ||
type.count++; | ||
if (typeName === 'Array') { | ||
type.types = (_a = type.types) !== null && _a !== void 0 ? _a : {}; | ||
type.lengths = type.lengths || []; | ||
type.lengths.push(value.length); | ||
value.forEach((v) => addToType(path, v, type.types)); | ||
} | ||
else if (typeName === 'Document') { | ||
type.fields = lodash_1.default.get(type, 'fields', {}); | ||
Object.entries(value).forEach(([k, v]) => addToField(`${path}.${k}`, v, type.fields)); | ||
} | ||
else if (options === null || options === void 0 ? void 0 : options.storeValues) { | ||
const defaultValue = bsonType === 'String' | ||
? (0, reservoir_1.default)(100) : (0, reservoir_1.default)(10000); | ||
type.values = type.values || defaultValue; | ||
addToValue(type, value); | ||
} | ||
} | ||
function addToField(path, value, schema) { | ||
const pathSplitOnDot = path.split('.'); | ||
const defaults = { | ||
[path]: { | ||
name: pathSplitOnDot[pathSplitOnDot.length - 1], | ||
path: path, | ||
count: 0, | ||
types: {} | ||
} | ||
}; | ||
lodash_1.default.defaultsDeep(schema, defaults); | ||
const field = schema[path]; | ||
field.count++; | ||
addToType(path, value, field.types); | ||
} | ||
function cleanup() { | ||
if (!finalized) { | ||
finalizeSchema(rootSchema); | ||
finalized = true; | ||
} | ||
} | ||
return new stream_1.Duplex({ | ||
objectMode: true, | ||
write(obj, enc, cb) { | ||
for (const key of Object.keys(obj)) { | ||
addToField(key, obj[key], rootSchema.fields); | ||
} | ||
rootSchema.count += 1; | ||
this.emit('progress', obj); | ||
cb(); | ||
}, | ||
read() { }, | ||
final(cb) { | ||
cleanup(); | ||
this.push(rootSchema); | ||
this.push(null); | ||
cb(); | ||
} | ||
}); | ||
} | ||
exports.default = parse; | ||
exports.default = makeParseStream; | ||
//# sourceMappingURL=stream.js.map |
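One behavior worth noting from the removed parse() above (and which the analyzer it moved into presumably keeps): options.semanticTypes accepts either a boolean or a map, where boolean entries enable built-in detectors by case-insensitive name and function entries register custom detectors called with each value and its path. A hedged sketch; the built-in detector name 'email' and the phone regex are illustrative assumptions:

import { stream as makeParseStream } from 'mongodb-schema';

const parser = makeParseStream({
    semanticTypes: {
        // Boolean entry: enables a built-in detector by name (assumed to exist).
        email: true,
        // Function entry: a custom detector; its key becomes the reported type name.
        PhoneNumber: (value: string) => /^\+?[\d\s()-]{7,}$/.test(value),
    },
});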
 {
   "name": "mongodb-schema",
   "description": "Infer the probabilistic schema for a MongoDB collection.",
-  "version": "10.0.1",
+  "version": "10.0.2",
   "author": {
@@ -36,3 +36,3 @@ "name": "MongoDB Inc",
   "scripts": {
-    "test": "nyc mocha --colors -r ts-node/register test/*.ts",
+    "test": "nyc mocha --timeout 5000 --colors -r ts-node/register test/*.ts",
     "test-example-parse-from-file": "ts-node examples/parse-from-file.ts",
@@ -39,0 +39,0 @@ "test-example-parse-schema": "ts-node examples/parse-schema.ts",

Major refactor
Supply chain risk: Package has recently undergone a major refactor. It may be unstable or indicate significant internal changes. Use caution when updating to versions that include significant changes.
Found 1 instance in 1 package

New author
Supply chain risk: A new npm collaborator published a version of the package for the first time. New collaborators are usually benign additions to a project, but do indicate a change to the security surface area of a package.
Found 1 instance in 1 package