@useatlas/bigquery
Advanced tools
+427
| /** | ||
| * BigQuery profiler — the introspection half of the datasource contract | ||
| * (ADR-0017). Enumerates objects (`listBigQueryObjects`) and profiles columns | ||
| * + sample values (`profileBigQuery`) into the SDK's structural | ||
| * `ProfilingResult` mirror, which the host's registry-resolved profiler seam | ||
| * feeds into `SemanticGenerator` without importing this package. The CLI | ||
| * (`atlas init` / `atlas diff`) consumes these exports directly. | ||
| * | ||
| * BigQuery is net-new on the contract — there is no prior CLI profiler to port. | ||
| * | ||
| * Cost posture (the whole point of profiling BigQuery carefully): BigQuery | ||
| * bills by bytes SCANNED, so a naive profiler that runs `COUNT(*)`, | ||
| * `COUNT(DISTINCT col)`, or `SELECT DISTINCT col` over a table would scan the | ||
| * whole table and bill the operator for every profiled column. This profiler | ||
| * NEVER does a full scan: | ||
| * | ||
| * - Structure (datasets, tables, views, columns, types) comes from | ||
| * `INFORMATION_SCHEMA.{TABLES,COLUMNS}` — metadata queries that process | ||
| * ~0 bytes of table data. | ||
| * - Row counts come from `INFORMATION_SCHEMA.TABLE_STORAGE.total_rows` — | ||
| * table metadata, NOT a scanning `COUNT(*)`. | ||
| * - Sample values come from a single bounded read per table (one small read | ||
| * shared across all columns), not a per-column scan. Enum-like detection is | ||
| * derived from that bounded sample, never a full `COUNT(DISTINCT)`. | ||
| * | ||
| * CRITICAL: a plain `SELECT * ... LIMIT n` does NOT bound BigQuery cost — a | ||
| * `LIMIT` is applied AFTER the scan, so BigQuery still bills for every byte | ||
| * of every selected column across the whole table. For base tables we use | ||
| * `TABLESAMPLE SYSTEM (n PERCENT)`, which reads (and bills) only the | ||
| * sampled storage blocks, then `LIMIT` to cap rows returned. Views can't be | ||
| * `TABLESAMPLE`d, so they fall back to `LIMIT` — but a view's underlying | ||
| * scan is bounded by its own definition, and views report 0 storage rows, | ||
| * so the exposure is far smaller than an unbounded base-table scan. | ||
| * | ||
| * Every query runs through {@link createBigQueryConnection}, whose `query()` | ||
| * issues read-only BigQuery jobs (no DML/DDL), so profiling honors the same | ||
| * read-only posture as the query path and never mutates the project. Caught | ||
| * errors are type-narrowed and never echo credentials. | ||
| */ | ||
| import type { | ||
| PluginDatabaseObject, | ||
| PluginColumnProfile, | ||
| PluginTableProfile, | ||
| PluginProfileError, | ||
| PluginProfilingResult, | ||
| PluginListObjectsOptions, | ||
| PluginProfileOptions, | ||
| } from "@useatlas/plugin-sdk"; | ||
| import { | ||
| createBigQueryConnection, | ||
| parseBigQueryUrl, | ||
| normalizeBigQueryConfigFields, | ||
| type BigQueryConnectionConfig, | ||
| } from "./connection"; | ||
| /** Connection-level errors that will fail every remaining table — abort fast. */ | ||
| const FATAL_ERROR_PATTERN = | ||
| /\bECONNRESET\b|\bECONNREFUSED\b|\bEHOSTUNREACH\b|\bENOTFOUND\b|\bEPIPE\b|\bETIMEDOUT\b/i; | ||
| /** | ||
| * Detect a connection-level (vs. per-table) failure. A fatal error means the | ||
| * whole profile should abort rather than recording N identical per-table | ||
| * errors. Mirrors the host's `isFatalConnectionError` (kept local to avoid | ||
| * importing `@atlas/api` into the plugin package — ADR-0013). | ||
| */ | ||
| function isFatalConnectionError(err: unknown): boolean { | ||
| if (!(err instanceof Error)) return FATAL_ERROR_PATTERN.test(String(err)); | ||
| if (FATAL_ERROR_PATTERN.test(err.message)) return true; | ||
| const code = (err as NodeJS.ErrnoException).code; | ||
| if (code && FATAL_ERROR_PATTERN.test(code)) return true; | ||
| if (err.cause) return isFatalConnectionError(err.cause); | ||
| return false; | ||
| } | ||
| /** Backtick-escape a BigQuery identifier (doubles any embedded backtick). */ | ||
| function bqIdentifier(name: string): string { | ||
| return `\`${name.replace(/`/g, "``")}\``; | ||
| } | ||
| /** Single-quote escape for a BigQuery string literal. */ | ||
| function bqLiteral(value: string): string { | ||
| return value.replace(/\\/g, "\\\\").replace(/'/g, "\\'"); | ||
| } | ||
| /** | ||
| * Build the base connection config from the host-carried tenant config | ||
| * (#3664 — the ADR-0017 amendment that lets a multi-field-credential profiler | ||
| * authenticate with the TENANT's own creds over the registry/MCP seam, mirroring | ||
| * Elasticsearch's `configForOptions`). | ||
| * | ||
| * BigQuery is non-url-shaped: its credentials (`service_account_json`) live in a | ||
| * SEPARATE config field, NEVER in the connection string. The Admin → Connections | ||
| * catalog form collects `service_account_json` (raw key JSON string) + `project_id` | ||
| * (snake_case) + the generic `schema` routing hint; map them onto the connection | ||
| * factory's `credentials` (object) + `projectId` + `dataset` shape. A | ||
| * programmatic caller passing camelCase `projectId`/`credentials`/`dataset` | ||
| * directly still works. | ||
| * | ||
| * @throws {Error} when `service_account_json` is present but not valid JSON. | ||
| */ | ||
| function bigQueryConfigFromTenantConfig( | ||
| raw: Readonly<Record<string, unknown>>, | ||
| ): BigQueryConnectionConfig { | ||
| // Shared with the connection factory's `createFromConfig` (index.ts) so the | ||
| // profiling path and the query path resolve project + credentials identically: | ||
| // snake_case `project_id` → `projectId`, `service_account_json` → | ||
| // `credentials` (validated as a JSON object, never a silent ADC fallback), | ||
| // keyFilename-wins precedence. | ||
| const normalized = normalizeBigQueryConfigFields(raw); | ||
| const str = (v: unknown): string | undefined => | ||
| typeof v === "string" && v.length > 0 ? v : undefined; | ||
| // BigQuery's dataset rides on the generic `schema` routing hint over the seam | ||
| // (the catalog form has no dedicated dataset field); an explicit `dataset` | ||
| // still wins for a programmatic caller. | ||
| const dataset = str(normalized.dataset) ?? str(normalized.schema); | ||
| const credentials = | ||
| normalized.credentials && | ||
| typeof normalized.credentials === "object" && | ||
| !Array.isArray(normalized.credentials) | ||
| ? (normalized.credentials as Record<string, unknown>) | ||
| : undefined; | ||
| return { | ||
| ...(str(normalized.projectId) !== undefined ? { projectId: str(normalized.projectId)! } : {}), | ||
| ...(dataset !== undefined ? { dataset } : {}), | ||
| ...(str(normalized.location) !== undefined ? { location: str(normalized.location)! } : {}), | ||
| ...(str(normalized.keyFilename) !== undefined ? { keyFilename: str(normalized.keyFilename)! } : {}), | ||
| ...(credentials !== undefined ? { credentials } : {}), | ||
| }; | ||
| } | ||
| /** | ||
| * Resolve the dataset to introspect and the base connection config. | ||
| * | ||
| * Two credential sources, chosen by whether the host seam supplied the tenant's | ||
| * config (registry/MCP path) or only a url (CLI / static-config path): | ||
| * - `options.config` present → the host carried the datasource's DECRYPTED | ||
| * config; build from the tenant's own service-account creds (#3664). | ||
| * - `options.config` absent → parse the `bigquery://` url (operator shell). | ||
| * | ||
| * `options.schema` (the SDK routing hint) wins over the dataset embedded in | ||
| * either source, mirroring how the CLI passes `--schema`. | ||
| * | ||
| * @throws {Error} when no dataset or no project can be resolved. | ||
| */ | ||
| function resolveConfig( | ||
| options: PluginListObjectsOptions, | ||
| ): { config: BigQueryConnectionConfig; projectId: string; dataset: string } { | ||
| // Prefer the tenant config (registry/MCP path) ONLY when it actually carries | ||
| // connection fields (project / credentials / key file). An empty or | ||
| // label-only `config` (e.g. `{}` or `{ description }`) must NOT suppress URL | ||
| // parsing — fall back to the `bigquery://` url (CLI / static path) so a valid | ||
| // url isn't discarded just because a truthy-but-empty config was passed (#3664 review). | ||
| const fromConfig = options.config ? bigQueryConfigFromTenantConfig(options.config) : undefined; | ||
| const base = | ||
| fromConfig && (fromConfig.projectId || fromConfig.credentials || fromConfig.keyFilename) | ||
| ? fromConfig | ||
| : parseBigQueryUrl(options.url); | ||
| const dataset = options.schema ?? base.dataset; | ||
| if (!dataset) { | ||
| throw new Error( | ||
| "BigQuery profiling requires a dataset. Pass it in the URL " + | ||
| "(bigquery://project/dataset), via the schema option, or as a `schema` " + | ||
| "field on the datasource config.", | ||
| ); | ||
| } | ||
| const projectId = base.projectId; | ||
| if (!projectId) { | ||
| throw new Error( | ||
| "BigQuery profiling requires a project (the bigquery:// URL host or a " + | ||
| "`project_id` datasource config field).", | ||
| ); | ||
| } | ||
| // Scope the connection to the resolved dataset so unqualified references | ||
| // (e.g. INFORMATION_SCHEMA) resolve against it. | ||
| return { config: { ...base, dataset }, projectId, dataset }; | ||
| } | ||
| /** A row from INFORMATION_SCHEMA.TABLES. */ | ||
| interface TablesRow { | ||
| table_name: string; | ||
| table_type: string; | ||
| } | ||
| function tablesRowToObject(r: TablesRow): PluginDatabaseObject { | ||
| // BigQuery table_type: "BASE TABLE", "VIEW", "MATERIALIZED VIEW", | ||
| // "EXTERNAL", "SNAPSHOT", "CLONE". Map views to "view", materialized views | ||
| // to "materialized_view", everything else to "table". | ||
| const t = r.table_type.toUpperCase(); | ||
| if (t === "VIEW") return { name: r.table_name, type: "view" }; | ||
| if (t === "MATERIALIZED VIEW") return { name: r.table_name, type: "materialized_view" }; | ||
| return { name: r.table_name, type: "table" }; | ||
| } | ||
| /** | ||
| * Build the INFORMATION_SCHEMA.TABLES query for a dataset. The | ||
| * dataset-qualified `INFORMATION_SCHEMA.TABLES` view is a metadata read — it | ||
| * scans no table data. | ||
| */ | ||
| function listObjectsSql(dataset: string): string { | ||
| return `SELECT table_name, table_type | ||
| FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.TABLES | ||
| ORDER BY table_name`; | ||
| } | ||
| /** | ||
| * Enumerate the queryable tables/views in a BigQuery dataset via | ||
| * INFORMATION_SCHEMA.TABLES (metadata-only — no table scan). | ||
| */ | ||
| export async function listBigQueryObjects( | ||
| options: PluginListObjectsOptions, | ||
| ): Promise<PluginDatabaseObject[]> { | ||
| const { config, dataset } = resolveConfig(options); | ||
| const conn = createBigQueryConnection(config); | ||
| try { | ||
| const { rows } = await conn.query(listObjectsSql(dataset)); | ||
| return (rows as unknown as TablesRow[]).map(tablesRowToObject); | ||
| } finally { | ||
| await conn.close(); | ||
| } | ||
| } | ||
| /** A row from INFORMATION_SCHEMA.COLUMNS. */ | ||
| interface ColumnsRow { | ||
| table_name: string; | ||
| column_name: string; | ||
| data_type: string; | ||
| is_nullable: string; | ||
| } | ||
| /** | ||
| * Build the bounded sample-read SQL for one object. Cost posture (see header): | ||
| * a `LIMIT` alone does NOT bound BigQuery bytes billed — it is applied after the | ||
| * scan. For base tables we use `TABLESAMPLE SYSTEM`, which reads and bills only | ||
| * the sampled storage blocks, capping cost on large tables. Views and | ||
| * materialized views can't be `TABLESAMPLE`d (BigQuery rejects it), so they fall | ||
| * back to a plain `LIMIT`; a view's scan is bounded by its own definition and | ||
| * views carry no base storage of their own. | ||
| */ | ||
| function sampleSql( | ||
| dataset: string, | ||
| tableName: string, | ||
| objectType: PluginDatabaseObject["type"], | ||
| ): string { | ||
| const target = `${bqIdentifier(dataset)}.${bqIdentifier(tableName)}`; | ||
| if (objectType === "table") { | ||
| // 1 PERCENT keeps large tables cheap; small tables return whole blocks | ||
| // (still cheap). LIMIT then caps the rows we materialize. | ||
| return `SELECT * FROM ${target} TABLESAMPLE SYSTEM (1 PERCENT) LIMIT 100`; | ||
| } | ||
| return `SELECT * FROM ${target} LIMIT 100`; | ||
| } | ||
| /** Heuristic: which BigQuery types are text-like and can be enum-like. */ | ||
| function isTextType(dataType: string): boolean { | ||
| const base = dataType.toUpperCase(); | ||
| return base === "STRING" || base.startsWith("STRING("); | ||
| } | ||
| /** | ||
| * Profile a BigQuery dataset into a {@link PluginProfilingResult}. For each | ||
| * object: row count (from table metadata), column types/nullability (from | ||
| * INFORMATION_SCHEMA), and a single bounded sample read used to derive | ||
| * per-column sample values and enum-like-ness. No full table scans — see the | ||
| * file header. | ||
| * | ||
| * BigQuery has no enforced primary/foreign keys (constraints are unenforced | ||
| * metadata, rarely populated), so PK/FK fields are left empty — the semantic | ||
| * generator infers relationships from naming + sample values downstream. | ||
| */ | ||
| export async function profileBigQuery( | ||
| options: PluginProfileOptions, | ||
| ): Promise<PluginProfilingResult> { | ||
| const { selectedTables, prefetchedObjects, progress } = options; | ||
| const { config, dataset } = resolveConfig(options); | ||
| const conn = createBigQueryConnection(config); | ||
| const profiles: PluginTableProfile[] = []; | ||
| const errors: PluginProfileError[] = []; | ||
| try { | ||
| const allObjects: PluginDatabaseObject[] = prefetchedObjects | ||
| ? prefetchedObjects | ||
| : ((await conn.query(listObjectsSql(dataset))).rows as unknown as TablesRow[]).map( | ||
| tablesRowToObject, | ||
| ); | ||
| const objectsToProfile = selectedTables | ||
| ? allObjects.filter((o) => selectedTables.includes(o.name)) | ||
| : allObjects; | ||
| progress?.onStart(objectsToProfile.length); | ||
| // Row counts for the whole dataset come from one metadata read of | ||
| // INFORMATION_SCHEMA.TABLE_STORAGE (total_rows) — no per-table COUNT(*). | ||
| const rowCounts = new Map<string, number>(); | ||
| if (objectsToProfile.length > 0) { | ||
| try { | ||
| const storageRows = ( | ||
| await conn.query( | ||
| `SELECT table_name, total_rows | ||
| FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.TABLE_STORAGE`, | ||
| ) | ||
| ).rows as unknown as { table_name: string; total_rows: string | number }[]; | ||
| for (const r of storageRows) { | ||
| rowCounts.set(r.table_name, Number(r.total_rows ?? 0)); | ||
| } | ||
| } catch (storageErr) { | ||
| if (isFatalConnectionError(storageErr)) throw storageErr; | ||
| // Non-fatal: TABLE_STORAGE may be unavailable (e.g. for some view-only | ||
| // datasets or restricted permissions). Fall back to a 0 row count per | ||
| // table rather than failing the whole profile — views report 0 anyway. | ||
| } | ||
| } | ||
| for (const [i, obj] of objectsToProfile.entries()) { | ||
| const tableName = obj.name; | ||
| const objectType = obj.type; | ||
| const objectLabel = objectType === "view" ? " [view]" : ""; | ||
| progress?.onTableStart(tableName + objectLabel, i, objectsToProfile.length); | ||
| try { | ||
| const rowCount = rowCounts.get(tableName) ?? 0; | ||
| const colRows = ( | ||
| await conn.query( | ||
| `SELECT column_name, data_type, is_nullable | ||
| FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.COLUMNS | ||
| WHERE table_name = '${bqLiteral(tableName)}' | ||
| ORDER BY ordinal_position`, | ||
| ) | ||
| ).rows as unknown as ColumnsRow[]; | ||
| // One bounded sample read shared across every column — never a full | ||
| // scan. Base tables use TABLESAMPLE SYSTEM so BigQuery bills only the | ||
| // sampled storage blocks (a bare LIMIT would still bill the whole | ||
| // table); views fall back to LIMIT. See sampleSql / the file header. | ||
| let sampleRows: Record<string, unknown>[] = []; | ||
| try { | ||
| sampleRows = ( | ||
| await conn.query(sampleSql(dataset, tableName, objectType)) | ||
| ).rows as Record<string, unknown>[]; | ||
| } catch (sampleErr) { | ||
| if (isFatalConnectionError(sampleErr)) throw sampleErr; | ||
| // Non-fatal: emit columns with metadata but no sample values rather | ||
| // than failing the table (e.g. a view that errors on read). | ||
| } | ||
| const columns: PluginColumnProfile[] = colRows.map((col) => { | ||
| const present = sampleRows | ||
| .map((r) => r[col.column_name]) | ||
| .filter((v) => v !== null && v !== undefined); | ||
| const distinct = Array.from(new Set(present.map((v) => String(v)))); | ||
| const nullCount = sampleRows.length - present.length; | ||
| // Enum-like derived from the bounded sample only (no full COUNT | ||
| // DISTINCT): a text column with few distinct sampled values where | ||
| // repetition is observed (distinct < sampled rows). The ratio is | ||
| // sample-scoped and intentionally lenient — the bounded sample can't | ||
| // prove true low cardinality, so this is a hint for the generator, | ||
| // not a guarantee. | ||
| const isEnumLike = | ||
| isTextType(col.data_type) && | ||
| present.length > 0 && | ||
| distinct.length > 0 && | ||
| distinct.length < 20 && | ||
| distinct.length < present.length; | ||
| const sampleLimit = isEnumLike ? 100 : 10; | ||
| const sampleValues = distinct.slice(0, sampleLimit); | ||
| return { | ||
| name: col.column_name, | ||
| type: col.data_type, | ||
| nullable: col.is_nullable.toUpperCase() === "YES", | ||
| // Cardinality from a bounded sample is not a true table-wide count, | ||
| // so report null rather than a misleading sample-scoped number. | ||
| unique_count: null, | ||
| null_count: sampleRows.length > 0 ? nullCount : null, | ||
| sample_values: sampleValues, | ||
| is_primary_key: false, | ||
| is_foreign_key: false, | ||
| fk_target_table: null, | ||
| fk_target_column: null, | ||
| is_enum_like: isEnumLike, | ||
| profiler_notes: | ||
| sampleRows.length > 0 | ||
| ? [ | ||
| `Sampled ${sampleRows.length} row(s) (${ | ||
| objectType === "table" ? "TABLESAMPLE-bounded" : "LIMIT-bounded" | ||
| }, no full scan)`, | ||
| ] | ||
| : [], | ||
| } satisfies PluginColumnProfile; | ||
| }); | ||
| profiles.push({ | ||
| table_name: tableName, | ||
| object_type: objectType, | ||
| row_count: rowCount, | ||
| columns, | ||
| primary_key_columns: [], | ||
| foreign_keys: [], | ||
| inferred_foreign_keys: [], | ||
| profiler_notes: [], | ||
| table_flags: { possibly_abandoned: false, possibly_denormalized: false }, | ||
| }); | ||
| progress?.onTableDone(tableName, i, objectsToProfile.length); | ||
| } catch (err) { | ||
| const msg = err instanceof Error ? err.message : String(err); | ||
| if (isFatalConnectionError(err)) { | ||
| // Connection-level — every remaining table will fail the same way. | ||
| throw new Error(`Fatal database error while profiling ${tableName}: ${msg}`, { | ||
| cause: err, | ||
| }); | ||
| } | ||
| progress?.onTableError(tableName, msg, i, objectsToProfile.length); | ||
| errors.push({ table: tableName, error: msg }); | ||
| } | ||
| } | ||
| } finally { | ||
| await conn.close(); | ||
| } | ||
| return { profiles, errors }; | ||
| } |
+5
-5
| { | ||
| "name": "@useatlas/bigquery", | ||
| "version": "0.0.5", | ||
| "version": "0.0.6", | ||
| "description": "Atlas BigQuery datasource plugin", | ||
@@ -20,3 +20,3 @@ "type": "module", | ||
| "scripts": { | ||
| "test": "bun test __tests__/bigquery.test.ts" | ||
| "test": "bun test __tests__/bigquery.test.ts __tests__/profiler.test.ts __tests__/cost-estimator.test.ts" | ||
| }, | ||
@@ -46,5 +46,5 @@ "keywords": [ | ||
| "devDependencies": { | ||
| "ai": "^6.0.141", | ||
| "hono": "^4.12.9", | ||
| "zod": "^4.3.6", | ||
| "ai": "^6.0.193", | ||
| "hono": "^4.12.23", | ||
| "zod": "^4.4.3", | ||
| "@useatlas/plugin-sdk": "workspace:*" | ||
@@ -51,0 +51,0 @@ }, |
+128
-1
@@ -35,2 +35,119 @@ /** | ||
| /** | ||
| * Parse a `bigquery://` connection URL into a {@link BigQueryConnectionConfig}. | ||
| * | ||
| * Format: `bigquery://[location@]<project>[/<dataset>][?keyFilename=<path>]` | ||
| * | ||
| * Credentials are NOT embedded in the URL — they resolve the same way the | ||
| * BigQuery client always resolves them: an explicit `keyFilename` query param | ||
| * (or the `GOOGLE_APPLICATION_CREDENTIALS` env var the client reads), or | ||
| * Application Default Credentials in a GCP environment. This mirrors the | ||
| * Snowflake/Salesforce CLI url shape (the operator runs in their own shell — | ||
| * the same trust model as the static atlas.config.ts path) and keeps the | ||
| * url-based profiler-options contract (ADR-0017) usable for BigQuery without | ||
| * ever surfacing a service-account key in a connection string. | ||
| * | ||
| * @throws {Error} If the scheme is not `bigquery://` or no project is present. | ||
| */ | ||
| export function parseBigQueryUrl(url: string): BigQueryConnectionConfig { | ||
| if (!url.startsWith("bigquery://")) { | ||
| throw new Error('BigQuery URL must start with "bigquery://".'); | ||
| } | ||
| let parsed: URL; | ||
| try { | ||
| parsed = new URL(url); | ||
| } catch (err) { | ||
| throw new Error( | ||
| `Invalid bigquery:// URL: ${err instanceof Error ? err.message : String(err)}`, | ||
| { cause: err }, | ||
| ); | ||
| } | ||
| // `bigquery://location@project/dataset` — the URL host is `project`, and an | ||
| // optional `location@` prefix lands in the username. The path's first | ||
| // segment is the optional default dataset. | ||
| const location = parsed.username ? decodeURIComponent(parsed.username) : undefined; | ||
| const projectId = parsed.hostname ? decodeURIComponent(parsed.hostname) : ""; | ||
| if (!projectId) { | ||
| throw new Error( | ||
| "bigquery:// URL must include a project: bigquery://[location@]<project>[/<dataset>].", | ||
| ); | ||
| } | ||
| const datasetSegment = parsed.pathname.replace(/^\//, "").split("/")[0]; | ||
| const dataset = datasetSegment ? decodeURIComponent(datasetSegment) : undefined; | ||
| const keyFilename = parsed.searchParams.get("keyFilename") ?? undefined; | ||
| const locationParam = parsed.searchParams.get("location") ?? undefined; | ||
| return { | ||
| projectId, | ||
| dataset, | ||
| location: location ?? locationParam, | ||
| keyFilename, | ||
| }; | ||
| } | ||
| /** | ||
| * Coerce a service-account JSON string into a credentials object, with an | ||
| * actionable error for malformed input. Shared by the connection factory's | ||
| * config normalization ({@link normalizeBigQueryConfigFields}) and the | ||
| * registry/MCP profiler so the two credential paths can't drift (#3664). | ||
| * | ||
| * @throws {Error} when the string is not valid JSON, OR parses to something | ||
| * other than a JSON object (a non-null, non-array object). The shape check | ||
| * matters: `JSON.parse("null")` / a parsed array/number/string would | ||
| * otherwise be handed on as a falsy-or-malformed `credentials`, and the | ||
| * BigQuery client would silently fall back to Application Default Credentials | ||
| * (operator/ambient env) — a per-tenant-credential-isolation violation. Fail | ||
| * loud instead. | ||
| */ | ||
| export function parseServiceAccountJson(serviceAccountJson: string): Record<string, unknown> { | ||
| let parsed: unknown; | ||
| try { | ||
| parsed = JSON.parse(serviceAccountJson); | ||
| } catch (err) { | ||
| throw new Error( | ||
| `BigQuery service_account_json is not valid JSON: ${err instanceof Error ? err.message : String(err)}`, | ||
| { cause: err }, | ||
| ); | ||
| } | ||
| if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) { | ||
| throw new Error( | ||
| "BigQuery service_account_json must be a JSON object (the service account key contents).", | ||
| ); | ||
| } | ||
| return parsed as Record<string, unknown>; | ||
| } | ||
| /** | ||
| * Normalize a DB-stored / tenant BigQuery config (the Admin → Connections | ||
| * catalog form shape: snake_case `service_account_json` + `project_id`) onto the | ||
| * connection factory's field shape (camelCase `credentials` + `projectId`), | ||
| * WITHOUT clobbering explicit camelCase values a programmatic caller passed. | ||
| * Shared by `createFromConfig` (index.ts) and the registry/MCP profiler | ||
| * (profiler.ts `bigQueryConfigFromTenantConfig`) so the connection path and the | ||
| * profiling path resolve credentials identically and can't drift (#3664). An | ||
| * explicit `keyFilename` takes precedence over `service_account_json` (the key | ||
| * file wins), matching the connection factory's historical behavior. | ||
| * | ||
| * @throws {Error} when `service_account_json` is present but not a valid JSON object. | ||
| */ | ||
| export function normalizeBigQueryConfigFields( | ||
| raw: Readonly<Record<string, unknown>>, | ||
| ): Record<string, unknown> { | ||
| const out: Record<string, unknown> = { ...raw }; | ||
| if (out.projectId === undefined && typeof out.project_id === "string") { | ||
| out.projectId = out.project_id; | ||
| } | ||
| if ( | ||
| out.credentials === undefined && | ||
| out.keyFilename === undefined && | ||
| typeof out.service_account_json === "string" && | ||
| out.service_account_json.trim().length > 0 | ||
| ) { | ||
| out.credentials = parseServiceAccountJson(out.service_account_json); | ||
| } | ||
| return out; | ||
| } | ||
| /** | ||
| * Create a PluginDBConnection backed by @google-cloud/bigquery. | ||
@@ -55,5 +172,15 @@ * BigQuery is stateless REST — no pool to manage. | ||
| (err as NodeJS.ErrnoException).code === "MODULE_NOT_FOUND"; | ||
| if (isNotFound) { | ||
| // Only surface the install hint when the missing module is THIS package, not | ||
| // a transitive dep that failed to load (same MODULE_NOT_FOUND code, different | ||
| // named module). Node and bun both name the missing request quoted in the | ||
| // message, so a transitive failure won't match our own specifier. | ||
| const ownPackageMissing = | ||
| isNotFound && | ||
| (err instanceof Error ? err.message : String(err)).includes( | ||
| "'@google-cloud/bigquery'", | ||
| ); | ||
| if (ownPackageMissing) { | ||
| throw new Error( | ||
| "BigQuery support requires the @google-cloud/bigquery package. Install it with: bun add @google-cloud/bigquery", | ||
| { cause: err }, | ||
| ); | ||
@@ -60,0 +187,0 @@ } |
+127
-17
@@ -8,3 +8,6 @@ /** | ||
| * | ||
| * Usage in atlas.config.ts: | ||
| * Two registration modes: | ||
| * | ||
| * 1. Static config-defined datasource (self-host / operator-baked) — pass a | ||
| * `projectId` and the plugin wires a single connection at boot: | ||
| * ```typescript | ||
@@ -24,2 +27,12 @@ * import { defineConfig } from "@atlas/api/lib/config"; | ||
| * ``` | ||
| * | ||
| * 2. Adapter-only (SaaS per-workspace) — pass no `projectId` and the plugin | ||
| * registers purely as an adapter, so customers add their own BigQuery | ||
| * project per workspace via Admin → Connections (DB-stored, encrypted). No | ||
| * operator config, no static datasource: | ||
| * ```typescript | ||
| * export default defineConfig({ | ||
| * plugins: [bigqueryPlugin({})], | ||
| * }); | ||
| * ``` | ||
| */ | ||
@@ -33,3 +46,5 @@ | ||
| extractProjectId, | ||
| normalizeBigQueryConfigFields, | ||
| } from "./connection"; | ||
| import { listBigQueryObjects, profileBigQuery } from "./profiler"; | ||
| import { BIGQUERY_FORBIDDEN_PATTERNS } from "./validation"; | ||
@@ -63,2 +78,16 @@ import { estimateQueryCost, formatBytes } from "./cost-estimator"; | ||
| /** | ||
| * Strict runtime schema for a DB-stored (admin-UI-registered) BigQuery | ||
| * datasource install. The config-time {@link BigQueryConfigSchema} leaves | ||
| * `projectId` optional so the plugin can register as an ADAPTER ONLY | ||
| * (`bigqueryPlugin({})` parses), but a per-(workspace, install) datasource | ||
| * must identify a concrete project — credentials may still come from a | ||
| * keyFilename, inline credentials, or ADC, but the project is mandatory. | ||
| * Used by `connection.createFromConfig` so a runtime config missing | ||
| * `projectId` is REJECTED. | ||
| */ | ||
| const BigQueryRuntimeConfigSchema = BigQueryConfigSchema.extend({ | ||
| projectId: z.string().min(1, "BigQuery projectId must not be empty"), | ||
| }); | ||
| /** | ||
| * BigQuery config type. Uses `z.input` so both the `bigqueryPlugin()` factory | ||
@@ -83,2 +112,11 @@ * (which applies Zod defaults) and direct `buildBigQueryPlugin()` callers work | ||
| // A static config-defined BigQuery datasource is identified by ANY connection | ||
| // field: an explicit projectId, a keyFilename, or inline credentials. Unlike | ||
| // the url-based datasources, BigQuery can run static with NO projectId — the | ||
| // project is inferred from the key/credentials or Application Default | ||
| // Credentials (createBigQueryConnection omits projectId in that case). Only | ||
| // when none of these is present does the plugin register adapter-only (SaaS | ||
| // per-workspace — datasources are DB-stored, added via Admin → Connections). | ||
| const hasStaticConfig = !!(config.projectId || config.keyFilename || config.credentials); | ||
| const hooks: PluginHooks = { | ||
@@ -123,2 +161,70 @@ beforeQuery: [ | ||
| const connection: AtlasDatasourcePlugin<BigQueryConfig>["connection"] = { | ||
| // DB-driven (admin-UI-registered) datasources: build a connection from the | ||
| // per-(workspace, install) config decrypted from `workspace_plugins`, | ||
| // re-validated through the strict runtime schema (projectId required). | ||
| // Always available — this is the SaaS per-workspace path and the only path | ||
| // in adapter-only mode. | ||
| createFromConfig: (runtimeConfig) => { | ||
| // Normalize the catalog form's snake_case / service_account_json shape onto | ||
| // the factory's camelCase / credentials shape before validating. | ||
| const parsed = BigQueryRuntimeConfigSchema.parse( | ||
| normalizeBigQueryConfigFields(runtimeConfig), | ||
| ); | ||
| const built = createBigQueryConnection({ | ||
| projectId: parsed.projectId, | ||
| dataset: parsed.dataset, | ||
| location: parsed.location, | ||
| keyFilename: parsed.keyFilename, | ||
| credentials: parsed.credentials, | ||
| logger: log, | ||
| }); | ||
| // #3667 — introspection as a capability of the built connection. BigQuery | ||
| // is non-url-shaped: the profiler reads the tenant's service-account creds | ||
| // from the bound `config` (the same record `createFromConfig` received), | ||
| // never a url — there is no synthetic url / URL-shape gate. | ||
| return { | ||
| ...built, | ||
| listObjects: (o) => listBigQueryObjects({ url: "", schema: o?.schema ?? parsed.dataset, config: runtimeConfig }), | ||
| profile: (o) => | ||
| profileBigQuery({ | ||
| url: "", | ||
| schema: o?.schema ?? parsed.dataset, | ||
| config: runtimeConfig, | ||
| selectedTables: o?.selectedTables, | ||
| prefetchedObjects: o?.prefetchedObjects, | ||
| progress: o?.progress, | ||
| logger: o?.logger, | ||
| }), | ||
| }; | ||
| }, | ||
| dbType: "bigquery", | ||
| parserDialect: "BigQuery", | ||
| forbiddenPatterns: BIGQUERY_FORBIDDEN_PATTERNS, | ||
| // Introspection half of the datasource contract (ADR-0017). The host | ||
| // resolves `profile` off the registry (same predicate as `createFromConfig`) | ||
| // and feeds it into SemanticGenerator's profiler seam; the CLI consumes | ||
| // these exports directly. Both run read-only and never full-scan a table — | ||
| // structure/row-counts come from INFORMATION_SCHEMA metadata, sampling is | ||
| // LIMIT-bounded (BigQuery bills by bytes scanned). | ||
| listObjects: listBigQueryObjects, | ||
| profile: profileBigQuery, | ||
| }; | ||
| // When any static connection field is configured the plugin wires a | ||
| // config-defined connection at boot; without one it is registered adapter-only. | ||
| // `config.projectId` may be undefined here (key/credentials/ADC supply it) — | ||
| // createBigQueryConnection handles an absent projectId. | ||
| if (hasStaticConfig) { | ||
| connection.create = () => | ||
| createBigQueryConnection({ | ||
| projectId: config.projectId, | ||
| dataset: config.dataset, | ||
| location: config.location, | ||
| keyFilename: config.keyFilename, | ||
| credentials: config.credentials, | ||
| logger: log, | ||
| }); | ||
| } | ||
| return { | ||
@@ -131,16 +237,3 @@ id: "bigquery-datasource", | ||
| connection: { | ||
| create: () => | ||
| createBigQueryConnection({ | ||
| projectId: config.projectId, | ||
| dataset: config.dataset, | ||
| location: config.location, | ||
| keyFilename: config.keyFilename, | ||
| credentials: config.credentials, | ||
| logger: log, | ||
| }), | ||
| dbType: "bigquery", | ||
| parserDialect: "BigQuery", | ||
| forbiddenPatterns: BIGQUERY_FORBIDDEN_PATTERNS, | ||
| }, | ||
| connection, | ||
@@ -168,6 +261,18 @@ entities: [], | ||
| log = ctx.logger; | ||
| ctx.logger.info(`BigQuery datasource plugin initialized (project: ${extractProjectId(config)})`); | ||
| if (hasStaticConfig) { | ||
| ctx.logger.info(`BigQuery datasource plugin initialized (project: ${extractProjectId(config)})`); | ||
| } else { | ||
| ctx.logger.info( | ||
| "BigQuery datasource plugin registered as adapter-only — per-workspace datasources via Admin → Connections", | ||
| ); | ||
| } | ||
| }, | ||
| async healthCheck(): Promise<PluginHealthResult> { | ||
| // Adapter-only: no static datasource to probe. The plugin itself is a | ||
| // healthy adapter; per-workspace connections are health-checked by the | ||
| // ConnectionRegistry once installed. | ||
| if (!hasStaticConfig) { | ||
| return { healthy: true, message: "adapter-only: no static datasource configured" }; | ||
| } | ||
| const start = performance.now(); | ||
@@ -211,3 +316,6 @@ let conn: PluginDBConnection | undefined; | ||
| * ```typescript | ||
| * // Static datasource (self-host): | ||
| * plugins: [bigqueryPlugin({ projectId: "my-project", dataset: "analytics" })] | ||
| * // Adapter-only (SaaS — customers bring their own per workspace): | ||
| * plugins: [bigqueryPlugin({})] | ||
| * ``` | ||
@@ -220,5 +328,7 @@ */ | ||
| export { createBigQueryConnection, extractProjectId } from "./connection"; | ||
| export { createBigQueryConnection, extractProjectId, parseBigQueryUrl } from "./connection"; | ||
| export type { BigQueryConnectionConfig } from "./connection"; | ||
| export { listBigQueryObjects, profileBigQuery } from "./profiler"; | ||
| export { BIGQUERY_FORBIDDEN_PATTERNS } from "./validation"; | ||
| export { estimateQueryCost, formatBytes, _resetCachedClient } from "./cost-estimator"; | ||
| export type { CostEstimate } from "./cost-estimator"; |
49519
145.3%8
14.29%1049
146.24%