@useatlas/bigquery - npm Package Compare versions

Comparing version

0.0.5

0.0.6

+427

src/profiler.ts

		/**
		* BigQuery profiler — the introspection half of the datasource contract
		* (ADR-0017). Enumerates objects (`listBigQueryObjects`) and profiles columns
		* + sample values (`profileBigQuery`) into the SDK's structural
		* `ProfilingResult` mirror, which the host's registry-resolved profiler seam
		* feeds into `SemanticGenerator` without importing this package. The CLI
		* (`atlas init` / `atlas diff`) consumes these exports directly.
		*
		* BigQuery is net-new on the contract — there is no prior CLI profiler to port.
		*
		* Cost posture (the whole point of profiling BigQuery carefully): BigQuery
		* bills by bytes SCANNED, so a naive profiler that runs `COUNT(*)`,
		* `COUNT(DISTINCT col)`, or `SELECT DISTINCT col` over a table would scan the
		* whole table and bill the operator for every profiled column. This profiler
		* NEVER does a full scan:
		*
		* - Structure (datasets, tables, views, columns, types) comes from
		* `INFORMATION_SCHEMA.{TABLES,COLUMNS}` — metadata queries that process
		* ~0 bytes of table data.
		* - Row counts come from `INFORMATION_SCHEMA.TABLE_STORAGE.total_rows` —
		* table metadata, NOT a scanning `COUNT(*)`.
		* - Sample values come from a single bounded read per table (one small read
		* shared across all columns), not a per-column scan. Enum-like detection is
		* derived from that bounded sample, never a full `COUNT(DISTINCT)`.
		*
		* CRITICAL: a plain `SELECT * ... LIMIT n` does NOT bound BigQuery cost — a
		* `LIMIT` is applied AFTER the scan, so BigQuery still bills for every byte
		* of every selected column across the whole table. For base tables we use
		* `TABLESAMPLE SYSTEM (n PERCENT)`, which reads (and bills) only the
		* sampled storage blocks, then `LIMIT` to cap rows returned. Views can't be
		* `TABLESAMPLE`d, so they fall back to `LIMIT` — but a view's underlying
		* scan is bounded by its own definition, and views report 0 storage rows,
		* so the exposure is far smaller than an unbounded base-table scan.
		*
		* Every query runs through {@link createBigQueryConnection}, whose `query()`
		* issues read-only BigQuery jobs (no DML/DDL), so profiling honors the same
		* read-only posture as the query path and never mutates the project. Caught
		* errors are type-narrowed and never echo credentials.
		*/

		import type {
		PluginDatabaseObject,
		PluginColumnProfile,
		PluginTableProfile,
		PluginProfileError,
		PluginProfilingResult,
		PluginListObjectsOptions,
		PluginProfileOptions,
		} from "@useatlas/plugin-sdk";
		import {
		createBigQueryConnection,
		parseBigQueryUrl,
		normalizeBigQueryConfigFields,
		type BigQueryConnectionConfig,
		} from "./connection";

		/** Connection-level errors that will fail every remaining table — abort fast. */
		const FATAL_ERROR_PATTERN =
		/\bECONNRESET\b\|\bECONNREFUSED\b\|\bEHOSTUNREACH\b\|\bENOTFOUND\b\|\bEPIPE\b\|\bETIMEDOUT\b/i;

		/**
		* Detect a connection-level (vs. per-table) failure. A fatal error means the
		* whole profile should abort rather than recording N identical per-table
		* errors. Mirrors the host's `isFatalConnectionError` (kept local to avoid
		* importing `@atlas/api` into the plugin package — ADR-0013).
		*/
		function isFatalConnectionError(err: unknown): boolean {
		if (!(err instanceof Error)) return FATAL_ERROR_PATTERN.test(String(err));
		if (FATAL_ERROR_PATTERN.test(err.message)) return true;
		const code = (err as NodeJS.ErrnoException).code;
		if (code && FATAL_ERROR_PATTERN.test(code)) return true;
		if (err.cause) return isFatalConnectionError(err.cause);
		return false;
		}

		/** Backtick-escape a BigQuery identifier (doubles any embedded backtick). */
		function bqIdentifier(name: string): string {
		return `\`${name.replace(/`/g, "``")}\``;
		}

		/** Single-quote escape for a BigQuery string literal. */
		function bqLiteral(value: string): string {
		return value.replace(/\\/g, "\\\\").replace(/'/g, "\\'");
		}

		/**
		* Build the base connection config from the host-carried tenant config
		* (#3664 — the ADR-0017 amendment that lets a multi-field-credential profiler
		* authenticate with the TENANT's own creds over the registry/MCP seam, mirroring
		* Elasticsearch's `configForOptions`).
		*
		* BigQuery is non-url-shaped: its credentials (`service_account_json`) live in a
		* SEPARATE config field, NEVER in the connection string. The Admin → Connections
		* catalog form collects `service_account_json` (raw key JSON string) + `project_id`
		* (snake_case) + the generic `schema` routing hint; map them onto the connection
		* factory's `credentials` (object) + `projectId` + `dataset` shape. A
		* programmatic caller passing camelCase `projectId`/`credentials`/`dataset`
		* directly still works.
		*
		* @throws {Error} when `service_account_json` is present but not valid JSON.
		*/
		function bigQueryConfigFromTenantConfig(
		raw: Readonly<Record<string, unknown>>,
		): BigQueryConnectionConfig {
		// Shared with the connection factory's `createFromConfig` (index.ts) so the
		// profiling path and the query path resolve project + credentials identically:
		// snake_case `project_id` → `projectId`, `service_account_json` →
		// `credentials` (validated as a JSON object, never a silent ADC fallback),
		// keyFilename-wins precedence.
		const normalized = normalizeBigQueryConfigFields(raw);
		const str = (v: unknown): string \| undefined =>
		typeof v === "string" && v.length > 0 ? v : undefined;
		// BigQuery's dataset rides on the generic `schema` routing hint over the seam
		// (the catalog form has no dedicated dataset field); an explicit `dataset`
		// still wins for a programmatic caller.
		const dataset = str(normalized.dataset) ?? str(normalized.schema);
		const credentials =
		normalized.credentials &&
		typeof normalized.credentials === "object" &&
		!Array.isArray(normalized.credentials)
		? (normalized.credentials as Record<string, unknown>)
		: undefined;

		return {
		...(str(normalized.projectId) !== undefined ? { projectId: str(normalized.projectId)! } : {}),
		...(dataset !== undefined ? { dataset } : {}),
		...(str(normalized.location) !== undefined ? { location: str(normalized.location)! } : {}),
		...(str(normalized.keyFilename) !== undefined ? { keyFilename: str(normalized.keyFilename)! } : {}),
		...(credentials !== undefined ? { credentials } : {}),
		};
		}

		/**
		* Resolve the dataset to introspect and the base connection config.
		*
		* Two credential sources, chosen by whether the host seam supplied the tenant's
		* config (registry/MCP path) or only a url (CLI / static-config path):
		* - `options.config` present → the host carried the datasource's DECRYPTED
		* config; build from the tenant's own service-account creds (#3664).
		* - `options.config` absent → parse the `bigquery://` url (operator shell).
		*
		* `options.schema` (the SDK routing hint) wins over the dataset embedded in
		* either source, mirroring how the CLI passes `--schema`.
		*
		* @throws {Error} when no dataset or no project can be resolved.
		*/
		function resolveConfig(
		options: PluginListObjectsOptions,
		): { config: BigQueryConnectionConfig; projectId: string; dataset: string } {
		// Prefer the tenant config (registry/MCP path) ONLY when it actually carries
		// connection fields (project / credentials / key file). An empty or
		// label-only `config` (e.g. `{}` or `{ description }`) must NOT suppress URL
		// parsing — fall back to the `bigquery://` url (CLI / static path) so a valid
		// url isn't discarded just because a truthy-but-empty config was passed (#3664 review).
		const fromConfig = options.config ? bigQueryConfigFromTenantConfig(options.config) : undefined;
		const base =
		fromConfig && (fromConfig.projectId \|\| fromConfig.credentials \|\| fromConfig.keyFilename)
		? fromConfig
		: parseBigQueryUrl(options.url);
		const dataset = options.schema ?? base.dataset;
		if (!dataset) {
		throw new Error(
		"BigQuery profiling requires a dataset. Pass it in the URL " +
		"(bigquery://project/dataset), via the schema option, or as a `schema` " +
		"field on the datasource config.",
		);
		}
		const projectId = base.projectId;
		if (!projectId) {
		throw new Error(
		"BigQuery profiling requires a project (the bigquery:// URL host or a " +
		"`project_id` datasource config field).",
		);
		}
		// Scope the connection to the resolved dataset so unqualified references
		// (e.g. INFORMATION_SCHEMA) resolve against it.
		return { config: { ...base, dataset }, projectId, dataset };
		}

		/** A row from INFORMATION_SCHEMA.TABLES. */
		interface TablesRow {
		table_name: string;
		table_type: string;
		}

		function tablesRowToObject(r: TablesRow): PluginDatabaseObject {
		// BigQuery table_type: "BASE TABLE", "VIEW", "MATERIALIZED VIEW",
		// "EXTERNAL", "SNAPSHOT", "CLONE". Map views to "view", materialized views
		// to "materialized_view", everything else to "table".
		const t = r.table_type.toUpperCase();
		if (t === "VIEW") return { name: r.table_name, type: "view" };
		if (t === "MATERIALIZED VIEW") return { name: r.table_name, type: "materialized_view" };
		return { name: r.table_name, type: "table" };
		}

		/**
		* Build the INFORMATION_SCHEMA.TABLES query for a dataset. The
		* dataset-qualified `INFORMATION_SCHEMA.TABLES` view is a metadata read — it
		* scans no table data.
		*/
		function listObjectsSql(dataset: string): string {
		return `SELECT table_name, table_type
		FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.TABLES
		ORDER BY table_name`;
		}

		/**
		* Enumerate the queryable tables/views in a BigQuery dataset via
		* INFORMATION_SCHEMA.TABLES (metadata-only — no table scan).
		*/
		export async function listBigQueryObjects(
		options: PluginListObjectsOptions,
		): Promise<PluginDatabaseObject[]> {
		const { config, dataset } = resolveConfig(options);
		const conn = createBigQueryConnection(config);
		try {
		const { rows } = await conn.query(listObjectsSql(dataset));
		return (rows as unknown as TablesRow[]).map(tablesRowToObject);
		} finally {
		await conn.close();
		}
		}

		/** A row from INFORMATION_SCHEMA.COLUMNS. */
		interface ColumnsRow {
		table_name: string;
		column_name: string;
		data_type: string;
		is_nullable: string;
		}

		/**
		* Build the bounded sample-read SQL for one object. Cost posture (see header):
		* a `LIMIT` alone does NOT bound BigQuery bytes billed — it is applied after the
		* scan. For base tables we use `TABLESAMPLE SYSTEM`, which reads and bills only
		* the sampled storage blocks, capping cost on large tables. Views and
		* materialized views can't be `TABLESAMPLE`d (BigQuery rejects it), so they fall
		* back to a plain `LIMIT`; a view's scan is bounded by its own definition and
		* views carry no base storage of their own.
		*/
		function sampleSql(
		dataset: string,
		tableName: string,
		objectType: PluginDatabaseObject["type"],
		): string {
		const target = `${bqIdentifier(dataset)}.${bqIdentifier(tableName)}`;
		if (objectType === "table") {
		// 1 PERCENT keeps large tables cheap; small tables return whole blocks
		// (still cheap). LIMIT then caps the rows we materialize.
		return `SELECT * FROM ${target} TABLESAMPLE SYSTEM (1 PERCENT) LIMIT 100`;
		}
		return `SELECT * FROM ${target} LIMIT 100`;
		}

		/** Heuristic: which BigQuery types are text-like and can be enum-like. */
		function isTextType(dataType: string): boolean {
		const base = dataType.toUpperCase();
		return base === "STRING" \|\| base.startsWith("STRING(");
		}

		/**
		* Profile a BigQuery dataset into a {@link PluginProfilingResult}. For each
		* object: row count (from table metadata), column types/nullability (from
		* INFORMATION_SCHEMA), and a single bounded sample read used to derive
		* per-column sample values and enum-like-ness. No full table scans — see the
		* file header.
		*
		* BigQuery has no enforced primary/foreign keys (constraints are unenforced
		* metadata, rarely populated), so PK/FK fields are left empty — the semantic
		* generator infers relationships from naming + sample values downstream.
		*/
		export async function profileBigQuery(
		options: PluginProfileOptions,
		): Promise<PluginProfilingResult> {
		const { selectedTables, prefetchedObjects, progress } = options;
		const { config, dataset } = resolveConfig(options);
		const conn = createBigQueryConnection(config);

		const profiles: PluginTableProfile[] = [];
		const errors: PluginProfileError[] = [];

		try {
		const allObjects: PluginDatabaseObject[] = prefetchedObjects
		? prefetchedObjects
		: ((await conn.query(listObjectsSql(dataset))).rows as unknown as TablesRow[]).map(
		tablesRowToObject,
		);

		const objectsToProfile = selectedTables
		? allObjects.filter((o) => selectedTables.includes(o.name))
		: allObjects;

		progress?.onStart(objectsToProfile.length);

		// Row counts for the whole dataset come from one metadata read of
		// INFORMATION_SCHEMA.TABLE_STORAGE (total_rows) — no per-table COUNT(*).
		const rowCounts = new Map<string, number>();
		if (objectsToProfile.length > 0) {
		try {
		const storageRows = (
		await conn.query(
		`SELECT table_name, total_rows
		FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.TABLE_STORAGE`,
		)
		).rows as unknown as { table_name: string; total_rows: string \| number }[];
		for (const r of storageRows) {
		rowCounts.set(r.table_name, Number(r.total_rows ?? 0));
		}
		} catch (storageErr) {
		if (isFatalConnectionError(storageErr)) throw storageErr;
		// Non-fatal: TABLE_STORAGE may be unavailable (e.g. for some view-only
		// datasets or restricted permissions). Fall back to a 0 row count per
		// table rather than failing the whole profile — views report 0 anyway.
		}
		}

		for (const [i, obj] of objectsToProfile.entries()) {
		const tableName = obj.name;
		const objectType = obj.type;
		const objectLabel = objectType === "view" ? " [view]" : "";
		progress?.onTableStart(tableName + objectLabel, i, objectsToProfile.length);

		try {
		const rowCount = rowCounts.get(tableName) ?? 0;

		const colRows = (
		await conn.query(
		`SELECT column_name, data_type, is_nullable
		FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.COLUMNS
		WHERE table_name = '${bqLiteral(tableName)}'
		ORDER BY ordinal_position`,
		)
		).rows as unknown as ColumnsRow[];

		// One bounded sample read shared across every column — never a full
		// scan. Base tables use TABLESAMPLE SYSTEM so BigQuery bills only the
		// sampled storage blocks (a bare LIMIT would still bill the whole
		// table); views fall back to LIMIT. See sampleSql / the file header.
		let sampleRows: Record<string, unknown>[] = [];
		try {
		sampleRows = (
		await conn.query(sampleSql(dataset, tableName, objectType))
		).rows as Record<string, unknown>[];
		} catch (sampleErr) {
		if (isFatalConnectionError(sampleErr)) throw sampleErr;
		// Non-fatal: emit columns with metadata but no sample values rather
		// than failing the table (e.g. a view that errors on read).
		}

		const columns: PluginColumnProfile[] = colRows.map((col) => {
		const present = sampleRows
		.map((r) => r[col.column_name])
		.filter((v) => v !== null && v !== undefined);
		const distinct = Array.from(new Set(present.map((v) => String(v))));
		const nullCount = sampleRows.length - present.length;

		// Enum-like derived from the bounded sample only (no full COUNT
		// DISTINCT): a text column with few distinct sampled values where
		// repetition is observed (distinct < sampled rows). The ratio is
		// sample-scoped and intentionally lenient — the bounded sample can't
		// prove true low cardinality, so this is a hint for the generator,
		// not a guarantee.
		const isEnumLike =
		isTextType(col.data_type) &&
		present.length > 0 &&
		distinct.length > 0 &&
		distinct.length < 20 &&
		distinct.length < present.length;

		const sampleLimit = isEnumLike ? 100 : 10;
		const sampleValues = distinct.slice(0, sampleLimit);

		return {
		name: col.column_name,
		type: col.data_type,
		nullable: col.is_nullable.toUpperCase() === "YES",
		// Cardinality from a bounded sample is not a true table-wide count,
		// so report null rather than a misleading sample-scoped number.
		unique_count: null,
		null_count: sampleRows.length > 0 ? nullCount : null,
		sample_values: sampleValues,
		is_primary_key: false,
		is_foreign_key: false,
		fk_target_table: null,
		fk_target_column: null,
		is_enum_like: isEnumLike,
		profiler_notes:
		sampleRows.length > 0
		? [
		`Sampled ${sampleRows.length} row(s) (${
		objectType === "table" ? "TABLESAMPLE-bounded" : "LIMIT-bounded"
		}, no full scan)`,
		]
		: [],
		} satisfies PluginColumnProfile;
		});

		profiles.push({
		table_name: tableName,
		object_type: objectType,
		row_count: rowCount,
		columns,
		primary_key_columns: [],
		foreign_keys: [],
		inferred_foreign_keys: [],
		profiler_notes: [],
		table_flags: { possibly_abandoned: false, possibly_denormalized: false },
		});
		progress?.onTableDone(tableName, i, objectsToProfile.length);
		} catch (err) {
		const msg = err instanceof Error ? err.message : String(err);
		if (isFatalConnectionError(err)) {
		// Connection-level — every remaining table will fail the same way.
		throw new Error(`Fatal database error while profiling ${tableName}: ${msg}`, {
		cause: err,
		});
		}
		progress?.onTableError(tableName, msg, i, objectsToProfile.length);
		errors.push({ table: tableName, error: msg });
		}
		}
		} finally {
		await conn.close();
		}

		return { profiles, errors };
		}

+5

-5

package.json

		{
		"name": "@useatlas/bigquery",
		"version": "0.0.5",
		"version": "0.0.6",
		"description": "Atlas BigQuery datasource plugin",
		@@ -20,3 +20,3 @@ "type": "module",
		"scripts": {
		"test": "bun test __tests__/bigquery.test.ts"
		"test": "bun test __tests__/bigquery.test.ts __tests__/profiler.test.ts __tests__/cost-estimator.test.ts"
		},
		@@ -46,5 +46,5 @@ "keywords": [
		"devDependencies": {
		"ai": "^6.0.141",
		"hono": "^4.12.9",
		"zod": "^4.3.6",
		"ai": "^6.0.193",
		"hono": "^4.12.23",
		"zod": "^4.4.3",
		"@useatlas/plugin-sdk": "workspace:*"
		@@ -51,0 +51,0 @@ },

+128

-1

src/connection.ts

		@@ -35,2 +35,119 @@ /**
		/**
		* Parse a `bigquery://` connection URL into a {@link BigQueryConnectionConfig}.
		*
		* Format: `bigquery://[location@]<project>[/<dataset>][?keyFilename=<path>]`
		*
		* Credentials are NOT embedded in the URL — they resolve the same way the
		* BigQuery client always resolves them: an explicit `keyFilename` query param
		* (or the `GOOGLE_APPLICATION_CREDENTIALS` env var the client reads), or
		* Application Default Credentials in a GCP environment. This mirrors the
		* Snowflake/Salesforce CLI url shape (the operator runs in their own shell —
		* the same trust model as the static atlas.config.ts path) and keeps the
		* url-based profiler-options contract (ADR-0017) usable for BigQuery without
		* ever surfacing a service-account key in a connection string.
		*
		* @throws {Error} If the scheme is not `bigquery://` or no project is present.
		*/
		export function parseBigQueryUrl(url: string): BigQueryConnectionConfig {
		if (!url.startsWith("bigquery://")) {
		throw new Error('BigQuery URL must start with "bigquery://".');
		}

		let parsed: URL;
		try {
		parsed = new URL(url);
		} catch (err) {
		throw new Error(
		`Invalid bigquery:// URL: ${err instanceof Error ? err.message : String(err)}`,
		{ cause: err },
		);
		}

		// `bigquery://location@project/dataset` — the URL host is `project`, and an
		// optional `location@` prefix lands in the username. The path's first
		// segment is the optional default dataset.
		const location = parsed.username ? decodeURIComponent(parsed.username) : undefined;
		const projectId = parsed.hostname ? decodeURIComponent(parsed.hostname) : "";
		if (!projectId) {
		throw new Error(
		"bigquery:// URL must include a project: bigquery://[location@]<project>[/<dataset>].",
		);
		}
		const datasetSegment = parsed.pathname.replace(/^\//, "").split("/")[0];
		const dataset = datasetSegment ? decodeURIComponent(datasetSegment) : undefined;
		const keyFilename = parsed.searchParams.get("keyFilename") ?? undefined;
		const locationParam = parsed.searchParams.get("location") ?? undefined;

		return {
		projectId,
		dataset,
		location: location ?? locationParam,
		keyFilename,
		};
		}

		/**
		* Coerce a service-account JSON string into a credentials object, with an
		* actionable error for malformed input. Shared by the connection factory's
		* config normalization ({@link normalizeBigQueryConfigFields}) and the
		* registry/MCP profiler so the two credential paths can't drift (#3664).
		*
		* @throws {Error} when the string is not valid JSON, OR parses to something
		* other than a JSON object (a non-null, non-array object). The shape check
		* matters: `JSON.parse("null")` / a parsed array/number/string would
		* otherwise be handed on as a falsy-or-malformed `credentials`, and the
		* BigQuery client would silently fall back to Application Default Credentials
		* (operator/ambient env) — a per-tenant-credential-isolation violation. Fail
		* loud instead.
		*/
		export function parseServiceAccountJson(serviceAccountJson: string): Record<string, unknown> {
		let parsed: unknown;
		try {
		parsed = JSON.parse(serviceAccountJson);
		} catch (err) {
		throw new Error(
		`BigQuery service_account_json is not valid JSON: ${err instanceof Error ? err.message : String(err)}`,
		{ cause: err },
		);
		}
		if (parsed === null \|\| typeof parsed !== "object" \|\| Array.isArray(parsed)) {
		throw new Error(
		"BigQuery service_account_json must be a JSON object (the service account key contents).",
		);
		}
		return parsed as Record<string, unknown>;
		}

		/**
		* Normalize a DB-stored / tenant BigQuery config (the Admin → Connections
		* catalog form shape: snake_case `service_account_json` + `project_id`) onto the
		* connection factory's field shape (camelCase `credentials` + `projectId`),
		* WITHOUT clobbering explicit camelCase values a programmatic caller passed.
		* Shared by `createFromConfig` (index.ts) and the registry/MCP profiler
		* (profiler.ts `bigQueryConfigFromTenantConfig`) so the connection path and the
		* profiling path resolve credentials identically and can't drift (#3664). An
		* explicit `keyFilename` takes precedence over `service_account_json` (the key
		* file wins), matching the connection factory's historical behavior.
		*
		* @throws {Error} when `service_account_json` is present but not a valid JSON object.
		*/
		export function normalizeBigQueryConfigFields(
		raw: Readonly<Record<string, unknown>>,
		): Record<string, unknown> {
		const out: Record<string, unknown> = { ...raw };
		if (out.projectId === undefined && typeof out.project_id === "string") {
		out.projectId = out.project_id;
		}
		if (
		out.credentials === undefined &&
		out.keyFilename === undefined &&
		typeof out.service_account_json === "string" &&
		out.service_account_json.trim().length > 0
		) {
		out.credentials = parseServiceAccountJson(out.service_account_json);
		}
		return out;
		}

		/**
		* Create a PluginDBConnection backed by @google-cloud/bigquery.
		@@ -55,5 +172,15 @@ * BigQuery is stateless REST — no pool to manage.
		(err as NodeJS.ErrnoException).code === "MODULE_NOT_FOUND";
		if (isNotFound) {
		// Only surface the install hint when the missing module is THIS package, not
		// a transitive dep that failed to load (same MODULE_NOT_FOUND code, different
		// named module). Node and bun both name the missing request quoted in the
		// message, so a transitive failure won't match our own specifier.
		const ownPackageMissing =
		isNotFound &&
		(err instanceof Error ? err.message : String(err)).includes(
		"'@google-cloud/bigquery'",
		);
		if (ownPackageMissing) {
		throw new Error(
		"BigQuery support requires the @google-cloud/bigquery package. Install it with: bun add @google-cloud/bigquery",
		{ cause: err },
		);
		@@ -60,0 +187,0 @@ }

+127

-17

src/index.ts

		@@ -8,3 +8,6 @@ /**
		*
		* Usage in atlas.config.ts:
		* Two registration modes:
		*
		* 1. Static config-defined datasource (self-host / operator-baked) — pass a
		* `projectId` and the plugin wires a single connection at boot:
		* ```typescript
		@@ -24,2 +27,12 @@ * import { defineConfig } from "@atlas/api/lib/config";
		* ```
		*
		* 2. Adapter-only (SaaS per-workspace) — pass no `projectId` and the plugin
		* registers purely as an adapter, so customers add their own BigQuery
		* project per workspace via Admin → Connections (DB-stored, encrypted). No
		* operator config, no static datasource:
		* ```typescript
		* export default defineConfig({
		* plugins: [bigqueryPlugin({})],
		* });
		* ```
		*/
		@@ -33,3 +46,5 @@
		extractProjectId,
		normalizeBigQueryConfigFields,
		} from "./connection";
		import { listBigQueryObjects, profileBigQuery } from "./profiler";
		import { BIGQUERY_FORBIDDEN_PATTERNS } from "./validation";
		@@ -63,2 +78,16 @@ import { estimateQueryCost, formatBytes } from "./cost-estimator";
		/**
		* Strict runtime schema for a DB-stored (admin-UI-registered) BigQuery
		* datasource install. The config-time {@link BigQueryConfigSchema} leaves
		* `projectId` optional so the plugin can register as an ADAPTER ONLY
		* (`bigqueryPlugin({})` parses), but a per-(workspace, install) datasource
		* must identify a concrete project — credentials may still come from a
		* keyFilename, inline credentials, or ADC, but the project is mandatory.
		* Used by `connection.createFromConfig` so a runtime config missing
		* `projectId` is REJECTED.
		*/
		const BigQueryRuntimeConfigSchema = BigQueryConfigSchema.extend({
		projectId: z.string().min(1, "BigQuery projectId must not be empty"),
		});

		/**
		* BigQuery config type. Uses `z.input` so both the `bigqueryPlugin()` factory
		@@ -83,2 +112,11 @@ * (which applies Zod defaults) and direct `buildBigQueryPlugin()` callers work

		// A static config-defined BigQuery datasource is identified by ANY connection
		// field: an explicit projectId, a keyFilename, or inline credentials. Unlike
		// the url-based datasources, BigQuery can run static with NO projectId — the
		// project is inferred from the key/credentials or Application Default
		// Credentials (createBigQueryConnection omits projectId in that case). Only
		// when none of these is present does the plugin register adapter-only (SaaS
		// per-workspace — datasources are DB-stored, added via Admin → Connections).
		const hasStaticConfig = !!(config.projectId \|\| config.keyFilename \|\| config.credentials);

		const hooks: PluginHooks = {
		@@ -123,2 +161,70 @@ beforeQuery: [

		const connection: AtlasDatasourcePlugin<BigQueryConfig>["connection"] = {
		// DB-driven (admin-UI-registered) datasources: build a connection from the
		// per-(workspace, install) config decrypted from `workspace_plugins`,
		// re-validated through the strict runtime schema (projectId required).
		// Always available — this is the SaaS per-workspace path and the only path
		// in adapter-only mode.
		createFromConfig: (runtimeConfig) => {
		// Normalize the catalog form's snake_case / service_account_json shape onto
		// the factory's camelCase / credentials shape before validating.
		const parsed = BigQueryRuntimeConfigSchema.parse(
		normalizeBigQueryConfigFields(runtimeConfig),
		);
		const built = createBigQueryConnection({
		projectId: parsed.projectId,
		dataset: parsed.dataset,
		location: parsed.location,
		keyFilename: parsed.keyFilename,
		credentials: parsed.credentials,
		logger: log,
		});
		// #3667 — introspection as a capability of the built connection. BigQuery
		// is non-url-shaped: the profiler reads the tenant's service-account creds
		// from the bound `config` (the same record `createFromConfig` received),
		// never a url — there is no synthetic url / URL-shape gate.
		return {
		...built,
		listObjects: (o) => listBigQueryObjects({ url: "", schema: o?.schema ?? parsed.dataset, config: runtimeConfig }),
		profile: (o) =>
		profileBigQuery({
		url: "",
		schema: o?.schema ?? parsed.dataset,
		config: runtimeConfig,
		selectedTables: o?.selectedTables,
		prefetchedObjects: o?.prefetchedObjects,
		progress: o?.progress,
		logger: o?.logger,
		}),
		};
		},
		dbType: "bigquery",
		parserDialect: "BigQuery",
		forbiddenPatterns: BIGQUERY_FORBIDDEN_PATTERNS,
		// Introspection half of the datasource contract (ADR-0017). The host
		// resolves `profile` off the registry (same predicate as `createFromConfig`)
		// and feeds it into SemanticGenerator's profiler seam; the CLI consumes
		// these exports directly. Both run read-only and never full-scan a table —
		// structure/row-counts come from INFORMATION_SCHEMA metadata, sampling is
		// LIMIT-bounded (BigQuery bills by bytes scanned).
		listObjects: listBigQueryObjects,
		profile: profileBigQuery,
		};

		// When any static connection field is configured the plugin wires a
		// config-defined connection at boot; without one it is registered adapter-only.
		// `config.projectId` may be undefined here (key/credentials/ADC supply it) —
		// createBigQueryConnection handles an absent projectId.
		if (hasStaticConfig) {
		connection.create = () =>
		createBigQueryConnection({
		projectId: config.projectId,
		dataset: config.dataset,
		location: config.location,
		keyFilename: config.keyFilename,
		credentials: config.credentials,
		logger: log,
		});
		}

		return {
		@@ -131,16 +237,3 @@ id: "bigquery-datasource",

		connection: {
		create: () =>
		createBigQueryConnection({
		projectId: config.projectId,
		dataset: config.dataset,
		location: config.location,
		keyFilename: config.keyFilename,
		credentials: config.credentials,
		logger: log,
		}),
		dbType: "bigquery",
		parserDialect: "BigQuery",
		forbiddenPatterns: BIGQUERY_FORBIDDEN_PATTERNS,
		},
		connection,

		@@ -168,6 +261,18 @@ entities: [],
		log = ctx.logger;
		ctx.logger.info(`BigQuery datasource plugin initialized (project: ${extractProjectId(config)})`);
		if (hasStaticConfig) {
		ctx.logger.info(`BigQuery datasource plugin initialized (project: ${extractProjectId(config)})`);
		} else {
		ctx.logger.info(
		"BigQuery datasource plugin registered as adapter-only — per-workspace datasources via Admin → Connections",
		);
		}
		},

		async healthCheck(): Promise<PluginHealthResult> {
		// Adapter-only: no static datasource to probe. The plugin itself is a
		// healthy adapter; per-workspace connections are health-checked by the
		// ConnectionRegistry once installed.
		if (!hasStaticConfig) {
		return { healthy: true, message: "adapter-only: no static datasource configured" };
		}
		const start = performance.now();
		@@ -211,3 +316,6 @@ let conn: PluginDBConnection \| undefined;
		* ```typescript
		* // Static datasource (self-host):
		* plugins: [bigqueryPlugin({ projectId: "my-project", dataset: "analytics" })]
		* // Adapter-only (SaaS — customers bring their own per workspace):
		* plugins: [bigqueryPlugin({})]
		* ```
		@@ -220,5 +328,7 @@ */

		export { createBigQueryConnection, extractProjectId } from "./connection";
		export { createBigQueryConnection, extractProjectId, parseBigQueryUrl } from "./connection";
		export type { BigQueryConnectionConfig } from "./connection";
		export { listBigQueryObjects, profileBigQuery } from "./profiler";
		export { BIGQUERY_FORBIDDEN_PATTERNS } from "./validation";
		export { estimateQueryCost, formatBytes, _resetCachedClient } from "./cost-estimator";
		export type { CostEstimate } from "./cost-estimator";

@useatlas/bigquery - npm Package Compare versions

Improved metrics