🚀 Socket Launch Week Day 5:Introducing Repository Access Permissions and Custom Roles.Learn more
Sign In

@useatlas/bigquery

Package Overview
Dependencies
Maintainers
1
Versions
2
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@useatlas/bigquery - npm Package Compare versions

Comparing version
0.0.5
to
0.0.6
+427
src/profiler.ts
/**
* BigQuery profiler — the introspection half of the datasource contract
* (ADR-0017). Enumerates objects (`listBigQueryObjects`) and profiles columns
* + sample values (`profileBigQuery`) into the SDK's structural
* `ProfilingResult` mirror, which the host's registry-resolved profiler seam
* feeds into `SemanticGenerator` without importing this package. The CLI
* (`atlas init` / `atlas diff`) consumes these exports directly.
*
* BigQuery is net-new on the contract — there is no prior CLI profiler to port.
*
* Cost posture (the whole point of profiling BigQuery carefully): BigQuery
* bills by bytes SCANNED, so a naive profiler that runs `COUNT(*)`,
* `COUNT(DISTINCT col)`, or `SELECT DISTINCT col` over a table would scan the
* whole table and bill the operator for every profiled column. This profiler
* NEVER does a full scan:
*
* - Structure (datasets, tables, views, columns, types) comes from
* `INFORMATION_SCHEMA.{TABLES,COLUMNS}` — metadata queries that process
* ~0 bytes of table data.
* - Row counts come from `INFORMATION_SCHEMA.TABLE_STORAGE.total_rows` —
* table metadata, NOT a scanning `COUNT(*)`.
* - Sample values come from a single bounded read per table (one small read
* shared across all columns), not a per-column scan. Enum-like detection is
* derived from that bounded sample, never a full `COUNT(DISTINCT)`.
*
* CRITICAL: a plain `SELECT * ... LIMIT n` does NOT bound BigQuery cost — a
* `LIMIT` is applied AFTER the scan, so BigQuery still bills for every byte
* of every selected column across the whole table. For base tables we use
* `TABLESAMPLE SYSTEM (n PERCENT)`, which reads (and bills) only the
* sampled storage blocks, then `LIMIT` to cap rows returned. Views can't be
* `TABLESAMPLE`d, so they fall back to `LIMIT` — but a view's underlying
* scan is bounded by its own definition, and views report 0 storage rows,
* so the exposure is far smaller than an unbounded base-table scan.
*
* Every query runs through {@link createBigQueryConnection}, whose `query()`
* issues read-only BigQuery jobs (no DML/DDL), so profiling honors the same
* read-only posture as the query path and never mutates the project. Caught
* errors are type-narrowed and never echo credentials.
*/
import type {
PluginDatabaseObject,
PluginColumnProfile,
PluginTableProfile,
PluginProfileError,
PluginProfilingResult,
PluginListObjectsOptions,
PluginProfileOptions,
} from "@useatlas/plugin-sdk";
import {
createBigQueryConnection,
parseBigQueryUrl,
normalizeBigQueryConfigFields,
type BigQueryConnectionConfig,
} from "./connection";
/** Connection-level errors that will fail every remaining table — abort fast. */
const FATAL_ERROR_PATTERN =
/\bECONNRESET\b|\bECONNREFUSED\b|\bEHOSTUNREACH\b|\bENOTFOUND\b|\bEPIPE\b|\bETIMEDOUT\b/i;
/**
* Detect a connection-level (vs. per-table) failure. A fatal error means the
* whole profile should abort rather than recording N identical per-table
* errors. Mirrors the host's `isFatalConnectionError` (kept local to avoid
* importing `@atlas/api` into the plugin package — ADR-0013).
*/
function isFatalConnectionError(err: unknown): boolean {
if (!(err instanceof Error)) return FATAL_ERROR_PATTERN.test(String(err));
if (FATAL_ERROR_PATTERN.test(err.message)) return true;
const code = (err as NodeJS.ErrnoException).code;
if (code && FATAL_ERROR_PATTERN.test(code)) return true;
if (err.cause) return isFatalConnectionError(err.cause);
return false;
}
/** Backtick-escape a BigQuery identifier (doubles any embedded backtick). */
function bqIdentifier(name: string): string {
return `\`${name.replace(/`/g, "``")}\``;
}
/** Single-quote escape for a BigQuery string literal. */
function bqLiteral(value: string): string {
return value.replace(/\\/g, "\\\\").replace(/'/g, "\\'");
}
/**
* Build the base connection config from the host-carried tenant config
* (#3664 — the ADR-0017 amendment that lets a multi-field-credential profiler
* authenticate with the TENANT's own creds over the registry/MCP seam, mirroring
* Elasticsearch's `configForOptions`).
*
* BigQuery is non-url-shaped: its credentials (`service_account_json`) live in a
* SEPARATE config field, NEVER in the connection string. The Admin → Connections
* catalog form collects `service_account_json` (raw key JSON string) + `project_id`
* (snake_case) + the generic `schema` routing hint; map them onto the connection
* factory's `credentials` (object) + `projectId` + `dataset` shape. A
* programmatic caller passing camelCase `projectId`/`credentials`/`dataset`
* directly still works.
*
* @throws {Error} when `service_account_json` is present but not valid JSON.
*/
function bigQueryConfigFromTenantConfig(
raw: Readonly<Record<string, unknown>>,
): BigQueryConnectionConfig {
// Shared with the connection factory's `createFromConfig` (index.ts) so the
// profiling path and the query path resolve project + credentials identically:
// snake_case `project_id` → `projectId`, `service_account_json` →
// `credentials` (validated as a JSON object, never a silent ADC fallback),
// keyFilename-wins precedence.
const normalized = normalizeBigQueryConfigFields(raw);
const str = (v: unknown): string | undefined =>
typeof v === "string" && v.length > 0 ? v : undefined;
// BigQuery's dataset rides on the generic `schema` routing hint over the seam
// (the catalog form has no dedicated dataset field); an explicit `dataset`
// still wins for a programmatic caller.
const dataset = str(normalized.dataset) ?? str(normalized.schema);
const credentials =
normalized.credentials &&
typeof normalized.credentials === "object" &&
!Array.isArray(normalized.credentials)
? (normalized.credentials as Record<string, unknown>)
: undefined;
return {
...(str(normalized.projectId) !== undefined ? { projectId: str(normalized.projectId)! } : {}),
...(dataset !== undefined ? { dataset } : {}),
...(str(normalized.location) !== undefined ? { location: str(normalized.location)! } : {}),
...(str(normalized.keyFilename) !== undefined ? { keyFilename: str(normalized.keyFilename)! } : {}),
...(credentials !== undefined ? { credentials } : {}),
};
}
/**
* Resolve the dataset to introspect and the base connection config.
*
* Two credential sources, chosen by whether the host seam supplied the tenant's
* config (registry/MCP path) or only a url (CLI / static-config path):
* - `options.config` present → the host carried the datasource's DECRYPTED
* config; build from the tenant's own service-account creds (#3664).
* - `options.config` absent → parse the `bigquery://` url (operator shell).
*
* `options.schema` (the SDK routing hint) wins over the dataset embedded in
* either source, mirroring how the CLI passes `--schema`.
*
* @throws {Error} when no dataset or no project can be resolved.
*/
function resolveConfig(
options: PluginListObjectsOptions,
): { config: BigQueryConnectionConfig; projectId: string; dataset: string } {
// Prefer the tenant config (registry/MCP path) ONLY when it actually carries
// connection fields (project / credentials / key file). An empty or
// label-only `config` (e.g. `{}` or `{ description }`) must NOT suppress URL
// parsing — fall back to the `bigquery://` url (CLI / static path) so a valid
// url isn't discarded just because a truthy-but-empty config was passed (#3664 review).
const fromConfig = options.config ? bigQueryConfigFromTenantConfig(options.config) : undefined;
const base =
fromConfig && (fromConfig.projectId || fromConfig.credentials || fromConfig.keyFilename)
? fromConfig
: parseBigQueryUrl(options.url);
const dataset = options.schema ?? base.dataset;
if (!dataset) {
throw new Error(
"BigQuery profiling requires a dataset. Pass it in the URL " +
"(bigquery://project/dataset), via the schema option, or as a `schema` " +
"field on the datasource config.",
);
}
const projectId = base.projectId;
if (!projectId) {
throw new Error(
"BigQuery profiling requires a project (the bigquery:// URL host or a " +
"`project_id` datasource config field).",
);
}
// Scope the connection to the resolved dataset so unqualified references
// (e.g. INFORMATION_SCHEMA) resolve against it.
return { config: { ...base, dataset }, projectId, dataset };
}
/** A row from INFORMATION_SCHEMA.TABLES. */
interface TablesRow {
table_name: string;
table_type: string;
}
function tablesRowToObject(r: TablesRow): PluginDatabaseObject {
// BigQuery table_type: "BASE TABLE", "VIEW", "MATERIALIZED VIEW",
// "EXTERNAL", "SNAPSHOT", "CLONE". Map views to "view", materialized views
// to "materialized_view", everything else to "table".
const t = r.table_type.toUpperCase();
if (t === "VIEW") return { name: r.table_name, type: "view" };
if (t === "MATERIALIZED VIEW") return { name: r.table_name, type: "materialized_view" };
return { name: r.table_name, type: "table" };
}
/**
* Build the INFORMATION_SCHEMA.TABLES query for a dataset. The
* dataset-qualified `INFORMATION_SCHEMA.TABLES` view is a metadata read — it
* scans no table data.
*/
function listObjectsSql(dataset: string): string {
return `SELECT table_name, table_type
FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.TABLES
ORDER BY table_name`;
}
/**
* Enumerate the queryable tables/views in a BigQuery dataset via
* INFORMATION_SCHEMA.TABLES (metadata-only — no table scan).
*/
export async function listBigQueryObjects(
options: PluginListObjectsOptions,
): Promise<PluginDatabaseObject[]> {
const { config, dataset } = resolveConfig(options);
const conn = createBigQueryConnection(config);
try {
const { rows } = await conn.query(listObjectsSql(dataset));
return (rows as unknown as TablesRow[]).map(tablesRowToObject);
} finally {
await conn.close();
}
}
/** A row from INFORMATION_SCHEMA.COLUMNS. */
interface ColumnsRow {
table_name: string;
column_name: string;
data_type: string;
is_nullable: string;
}
/**
* Build the bounded sample-read SQL for one object. Cost posture (see header):
* a `LIMIT` alone does NOT bound BigQuery bytes billed — it is applied after the
* scan. For base tables we use `TABLESAMPLE SYSTEM`, which reads and bills only
* the sampled storage blocks, capping cost on large tables. Views and
* materialized views can't be `TABLESAMPLE`d (BigQuery rejects it), so they fall
* back to a plain `LIMIT`; a view's scan is bounded by its own definition and
* views carry no base storage of their own.
*/
function sampleSql(
dataset: string,
tableName: string,
objectType: PluginDatabaseObject["type"],
): string {
const target = `${bqIdentifier(dataset)}.${bqIdentifier(tableName)}`;
if (objectType === "table") {
// 1 PERCENT keeps large tables cheap; small tables return whole blocks
// (still cheap). LIMIT then caps the rows we materialize.
return `SELECT * FROM ${target} TABLESAMPLE SYSTEM (1 PERCENT) LIMIT 100`;
}
return `SELECT * FROM ${target} LIMIT 100`;
}
/** Heuristic: which BigQuery types are text-like and can be enum-like. */
function isTextType(dataType: string): boolean {
const base = dataType.toUpperCase();
return base === "STRING" || base.startsWith("STRING(");
}
/**
* Profile a BigQuery dataset into a {@link PluginProfilingResult}. For each
* object: row count (from table metadata), column types/nullability (from
* INFORMATION_SCHEMA), and a single bounded sample read used to derive
* per-column sample values and enum-like-ness. No full table scans — see the
* file header.
*
* BigQuery has no enforced primary/foreign keys (constraints are unenforced
* metadata, rarely populated), so PK/FK fields are left empty — the semantic
* generator infers relationships from naming + sample values downstream.
*/
export async function profileBigQuery(
options: PluginProfileOptions,
): Promise<PluginProfilingResult> {
const { selectedTables, prefetchedObjects, progress } = options;
const { config, dataset } = resolveConfig(options);
const conn = createBigQueryConnection(config);
const profiles: PluginTableProfile[] = [];
const errors: PluginProfileError[] = [];
try {
const allObjects: PluginDatabaseObject[] = prefetchedObjects
? prefetchedObjects
: ((await conn.query(listObjectsSql(dataset))).rows as unknown as TablesRow[]).map(
tablesRowToObject,
);
const objectsToProfile = selectedTables
? allObjects.filter((o) => selectedTables.includes(o.name))
: allObjects;
progress?.onStart(objectsToProfile.length);
// Row counts for the whole dataset come from one metadata read of
// INFORMATION_SCHEMA.TABLE_STORAGE (total_rows) — no per-table COUNT(*).
const rowCounts = new Map<string, number>();
if (objectsToProfile.length > 0) {
try {
const storageRows = (
await conn.query(
`SELECT table_name, total_rows
FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.TABLE_STORAGE`,
)
).rows as unknown as { table_name: string; total_rows: string | number }[];
for (const r of storageRows) {
rowCounts.set(r.table_name, Number(r.total_rows ?? 0));
}
} catch (storageErr) {
if (isFatalConnectionError(storageErr)) throw storageErr;
// Non-fatal: TABLE_STORAGE may be unavailable (e.g. for some view-only
// datasets or restricted permissions). Fall back to a 0 row count per
// table rather than failing the whole profile — views report 0 anyway.
}
}
for (const [i, obj] of objectsToProfile.entries()) {
const tableName = obj.name;
const objectType = obj.type;
const objectLabel = objectType === "view" ? " [view]" : "";
progress?.onTableStart(tableName + objectLabel, i, objectsToProfile.length);
try {
const rowCount = rowCounts.get(tableName) ?? 0;
const colRows = (
await conn.query(
`SELECT column_name, data_type, is_nullable
FROM ${bqIdentifier(dataset)}.INFORMATION_SCHEMA.COLUMNS
WHERE table_name = '${bqLiteral(tableName)}'
ORDER BY ordinal_position`,
)
).rows as unknown as ColumnsRow[];
// One bounded sample read shared across every column — never a full
// scan. Base tables use TABLESAMPLE SYSTEM so BigQuery bills only the
// sampled storage blocks (a bare LIMIT would still bill the whole
// table); views fall back to LIMIT. See sampleSql / the file header.
let sampleRows: Record<string, unknown>[] = [];
try {
sampleRows = (
await conn.query(sampleSql(dataset, tableName, objectType))
).rows as Record<string, unknown>[];
} catch (sampleErr) {
if (isFatalConnectionError(sampleErr)) throw sampleErr;
// Non-fatal: emit columns with metadata but no sample values rather
// than failing the table (e.g. a view that errors on read).
}
const columns: PluginColumnProfile[] = colRows.map((col) => {
const present = sampleRows
.map((r) => r[col.column_name])
.filter((v) => v !== null && v !== undefined);
const distinct = Array.from(new Set(present.map((v) => String(v))));
const nullCount = sampleRows.length - present.length;
// Enum-like derived from the bounded sample only (no full COUNT
// DISTINCT): a text column with few distinct sampled values where
// repetition is observed (distinct < sampled rows). The ratio is
// sample-scoped and intentionally lenient — the bounded sample can't
// prove true low cardinality, so this is a hint for the generator,
// not a guarantee.
const isEnumLike =
isTextType(col.data_type) &&
present.length > 0 &&
distinct.length > 0 &&
distinct.length < 20 &&
distinct.length < present.length;
const sampleLimit = isEnumLike ? 100 : 10;
const sampleValues = distinct.slice(0, sampleLimit);
return {
name: col.column_name,
type: col.data_type,
nullable: col.is_nullable.toUpperCase() === "YES",
// Cardinality from a bounded sample is not a true table-wide count,
// so report null rather than a misleading sample-scoped number.
unique_count: null,
null_count: sampleRows.length > 0 ? nullCount : null,
sample_values: sampleValues,
is_primary_key: false,
is_foreign_key: false,
fk_target_table: null,
fk_target_column: null,
is_enum_like: isEnumLike,
profiler_notes:
sampleRows.length > 0
? [
`Sampled ${sampleRows.length} row(s) (${
objectType === "table" ? "TABLESAMPLE-bounded" : "LIMIT-bounded"
}, no full scan)`,
]
: [],
} satisfies PluginColumnProfile;
});
profiles.push({
table_name: tableName,
object_type: objectType,
row_count: rowCount,
columns,
primary_key_columns: [],
foreign_keys: [],
inferred_foreign_keys: [],
profiler_notes: [],
table_flags: { possibly_abandoned: false, possibly_denormalized: false },
});
progress?.onTableDone(tableName, i, objectsToProfile.length);
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
if (isFatalConnectionError(err)) {
// Connection-level — every remaining table will fail the same way.
throw new Error(`Fatal database error while profiling ${tableName}: ${msg}`, {
cause: err,
});
}
progress?.onTableError(tableName, msg, i, objectsToProfile.length);
errors.push({ table: tableName, error: msg });
}
}
} finally {
await conn.close();
}
return { profiles, errors };
}
+5
-5
{
"name": "@useatlas/bigquery",
"version": "0.0.5",
"version": "0.0.6",
"description": "Atlas BigQuery datasource plugin",

@@ -20,3 +20,3 @@ "type": "module",

"scripts": {
"test": "bun test __tests__/bigquery.test.ts"
"test": "bun test __tests__/bigquery.test.ts __tests__/profiler.test.ts __tests__/cost-estimator.test.ts"
},

@@ -46,5 +46,5 @@ "keywords": [

"devDependencies": {
"ai": "^6.0.141",
"hono": "^4.12.9",
"zod": "^4.3.6",
"ai": "^6.0.193",
"hono": "^4.12.23",
"zod": "^4.4.3",
"@useatlas/plugin-sdk": "workspace:*"

@@ -51,0 +51,0 @@ },

@@ -35,2 +35,119 @@ /**

/**
* Parse a `bigquery://` connection URL into a {@link BigQueryConnectionConfig}.
*
* Format: `bigquery://[location@]<project>[/<dataset>][?keyFilename=<path>]`
*
* Credentials are NOT embedded in the URL — they resolve the same way the
* BigQuery client always resolves them: an explicit `keyFilename` query param
* (or the `GOOGLE_APPLICATION_CREDENTIALS` env var the client reads), or
* Application Default Credentials in a GCP environment. This mirrors the
* Snowflake/Salesforce CLI url shape (the operator runs in their own shell —
* the same trust model as the static atlas.config.ts path) and keeps the
* url-based profiler-options contract (ADR-0017) usable for BigQuery without
* ever surfacing a service-account key in a connection string.
*
* @throws {Error} If the scheme is not `bigquery://` or no project is present.
*/
export function parseBigQueryUrl(url: string): BigQueryConnectionConfig {
if (!url.startsWith("bigquery://")) {
throw new Error('BigQuery URL must start with "bigquery://".');
}
let parsed: URL;
try {
parsed = new URL(url);
} catch (err) {
throw new Error(
`Invalid bigquery:// URL: ${err instanceof Error ? err.message : String(err)}`,
{ cause: err },
);
}
// `bigquery://location@project/dataset` — the URL host is `project`, and an
// optional `location@` prefix lands in the username. The path's first
// segment is the optional default dataset.
const location = parsed.username ? decodeURIComponent(parsed.username) : undefined;
const projectId = parsed.hostname ? decodeURIComponent(parsed.hostname) : "";
if (!projectId) {
throw new Error(
"bigquery:// URL must include a project: bigquery://[location@]<project>[/<dataset>].",
);
}
const datasetSegment = parsed.pathname.replace(/^\//, "").split("/")[0];
const dataset = datasetSegment ? decodeURIComponent(datasetSegment) : undefined;
const keyFilename = parsed.searchParams.get("keyFilename") ?? undefined;
const locationParam = parsed.searchParams.get("location") ?? undefined;
return {
projectId,
dataset,
location: location ?? locationParam,
keyFilename,
};
}
/**
* Coerce a service-account JSON string into a credentials object, with an
* actionable error for malformed input. Shared by the connection factory's
* config normalization ({@link normalizeBigQueryConfigFields}) and the
* registry/MCP profiler so the two credential paths can't drift (#3664).
*
* @throws {Error} when the string is not valid JSON, OR parses to something
* other than a JSON object (a non-null, non-array object). The shape check
* matters: `JSON.parse("null")` / a parsed array/number/string would
* otherwise be handed on as a falsy-or-malformed `credentials`, and the
* BigQuery client would silently fall back to Application Default Credentials
* (operator/ambient env) — a per-tenant-credential-isolation violation. Fail
* loud instead.
*/
export function parseServiceAccountJson(serviceAccountJson: string): Record<string, unknown> {
let parsed: unknown;
try {
parsed = JSON.parse(serviceAccountJson);
} catch (err) {
throw new Error(
`BigQuery service_account_json is not valid JSON: ${err instanceof Error ? err.message : String(err)}`,
{ cause: err },
);
}
if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
throw new Error(
"BigQuery service_account_json must be a JSON object (the service account key contents).",
);
}
return parsed as Record<string, unknown>;
}
/**
* Normalize a DB-stored / tenant BigQuery config (the Admin → Connections
* catalog form shape: snake_case `service_account_json` + `project_id`) onto the
* connection factory's field shape (camelCase `credentials` + `projectId`),
* WITHOUT clobbering explicit camelCase values a programmatic caller passed.
* Shared by `createFromConfig` (index.ts) and the registry/MCP profiler
* (profiler.ts `bigQueryConfigFromTenantConfig`) so the connection path and the
* profiling path resolve credentials identically and can't drift (#3664). An
* explicit `keyFilename` takes precedence over `service_account_json` (the key
* file wins), matching the connection factory's historical behavior.
*
* @throws {Error} when `service_account_json` is present but not a valid JSON object.
*/
export function normalizeBigQueryConfigFields(
raw: Readonly<Record<string, unknown>>,
): Record<string, unknown> {
const out: Record<string, unknown> = { ...raw };
if (out.projectId === undefined && typeof out.project_id === "string") {
out.projectId = out.project_id;
}
if (
out.credentials === undefined &&
out.keyFilename === undefined &&
typeof out.service_account_json === "string" &&
out.service_account_json.trim().length > 0
) {
out.credentials = parseServiceAccountJson(out.service_account_json);
}
return out;
}
/**
* Create a PluginDBConnection backed by @google-cloud/bigquery.

@@ -55,5 +172,15 @@ * BigQuery is stateless REST — no pool to manage.

(err as NodeJS.ErrnoException).code === "MODULE_NOT_FOUND";
if (isNotFound) {
// Only surface the install hint when the missing module is THIS package, not
// a transitive dep that failed to load (same MODULE_NOT_FOUND code, different
// named module). Node and bun both name the missing request quoted in the
// message, so a transitive failure won't match our own specifier.
const ownPackageMissing =
isNotFound &&
(err instanceof Error ? err.message : String(err)).includes(
"'@google-cloud/bigquery'",
);
if (ownPackageMissing) {
throw new Error(
"BigQuery support requires the @google-cloud/bigquery package. Install it with: bun add @google-cloud/bigquery",
{ cause: err },
);

@@ -60,0 +187,0 @@ }

@@ -8,3 +8,6 @@ /**

*
* Usage in atlas.config.ts:
* Two registration modes:
*
* 1. Static config-defined datasource (self-host / operator-baked) — pass a
* `projectId` and the plugin wires a single connection at boot:
* ```typescript

@@ -24,2 +27,12 @@ * import { defineConfig } from "@atlas/api/lib/config";

* ```
*
* 2. Adapter-only (SaaS per-workspace) — pass no `projectId` and the plugin
* registers purely as an adapter, so customers add their own BigQuery
* project per workspace via Admin → Connections (DB-stored, encrypted). No
* operator config, no static datasource:
* ```typescript
* export default defineConfig({
* plugins: [bigqueryPlugin({})],
* });
* ```
*/

@@ -33,3 +46,5 @@

extractProjectId,
normalizeBigQueryConfigFields,
} from "./connection";
import { listBigQueryObjects, profileBigQuery } from "./profiler";
import { BIGQUERY_FORBIDDEN_PATTERNS } from "./validation";

@@ -63,2 +78,16 @@ import { estimateQueryCost, formatBytes } from "./cost-estimator";

/**
* Strict runtime schema for a DB-stored (admin-UI-registered) BigQuery
* datasource install. The config-time {@link BigQueryConfigSchema} leaves
* `projectId` optional so the plugin can register as an ADAPTER ONLY
* (`bigqueryPlugin({})` parses), but a per-(workspace, install) datasource
* must identify a concrete project — credentials may still come from a
* keyFilename, inline credentials, or ADC, but the project is mandatory.
* Used by `connection.createFromConfig` so a runtime config missing
* `projectId` is REJECTED.
*/
const BigQueryRuntimeConfigSchema = BigQueryConfigSchema.extend({
projectId: z.string().min(1, "BigQuery projectId must not be empty"),
});
/**
* BigQuery config type. Uses `z.input` so both the `bigqueryPlugin()` factory

@@ -83,2 +112,11 @@ * (which applies Zod defaults) and direct `buildBigQueryPlugin()` callers work

// A static config-defined BigQuery datasource is identified by ANY connection
// field: an explicit projectId, a keyFilename, or inline credentials. Unlike
// the url-based datasources, BigQuery can run static with NO projectId — the
// project is inferred from the key/credentials or Application Default
// Credentials (createBigQueryConnection omits projectId in that case). Only
// when none of these is present does the plugin register adapter-only (SaaS
// per-workspace — datasources are DB-stored, added via Admin → Connections).
const hasStaticConfig = !!(config.projectId || config.keyFilename || config.credentials);
const hooks: PluginHooks = {

@@ -123,2 +161,70 @@ beforeQuery: [

const connection: AtlasDatasourcePlugin<BigQueryConfig>["connection"] = {
// DB-driven (admin-UI-registered) datasources: build a connection from the
// per-(workspace, install) config decrypted from `workspace_plugins`,
// re-validated through the strict runtime schema (projectId required).
// Always available — this is the SaaS per-workspace path and the only path
// in adapter-only mode.
createFromConfig: (runtimeConfig) => {
// Normalize the catalog form's snake_case / service_account_json shape onto
// the factory's camelCase / credentials shape before validating.
const parsed = BigQueryRuntimeConfigSchema.parse(
normalizeBigQueryConfigFields(runtimeConfig),
);
const built = createBigQueryConnection({
projectId: parsed.projectId,
dataset: parsed.dataset,
location: parsed.location,
keyFilename: parsed.keyFilename,
credentials: parsed.credentials,
logger: log,
});
// #3667 — introspection as a capability of the built connection. BigQuery
// is non-url-shaped: the profiler reads the tenant's service-account creds
// from the bound `config` (the same record `createFromConfig` received),
// never a url — there is no synthetic url / URL-shape gate.
return {
...built,
listObjects: (o) => listBigQueryObjects({ url: "", schema: o?.schema ?? parsed.dataset, config: runtimeConfig }),
profile: (o) =>
profileBigQuery({
url: "",
schema: o?.schema ?? parsed.dataset,
config: runtimeConfig,
selectedTables: o?.selectedTables,
prefetchedObjects: o?.prefetchedObjects,
progress: o?.progress,
logger: o?.logger,
}),
};
},
dbType: "bigquery",
parserDialect: "BigQuery",
forbiddenPatterns: BIGQUERY_FORBIDDEN_PATTERNS,
// Introspection half of the datasource contract (ADR-0017). The host
// resolves `profile` off the registry (same predicate as `createFromConfig`)
// and feeds it into SemanticGenerator's profiler seam; the CLI consumes
// these exports directly. Both run read-only and never full-scan a table —
// structure/row-counts come from INFORMATION_SCHEMA metadata, sampling is
// LIMIT-bounded (BigQuery bills by bytes scanned).
listObjects: listBigQueryObjects,
profile: profileBigQuery,
};
// When any static connection field is configured the plugin wires a
// config-defined connection at boot; without one it is registered adapter-only.
// `config.projectId` may be undefined here (key/credentials/ADC supply it) —
// createBigQueryConnection handles an absent projectId.
if (hasStaticConfig) {
connection.create = () =>
createBigQueryConnection({
projectId: config.projectId,
dataset: config.dataset,
location: config.location,
keyFilename: config.keyFilename,
credentials: config.credentials,
logger: log,
});
}
return {

@@ -131,16 +237,3 @@ id: "bigquery-datasource",

connection: {
create: () =>
createBigQueryConnection({
projectId: config.projectId,
dataset: config.dataset,
location: config.location,
keyFilename: config.keyFilename,
credentials: config.credentials,
logger: log,
}),
dbType: "bigquery",
parserDialect: "BigQuery",
forbiddenPatterns: BIGQUERY_FORBIDDEN_PATTERNS,
},
connection,

@@ -168,6 +261,18 @@ entities: [],

log = ctx.logger;
ctx.logger.info(`BigQuery datasource plugin initialized (project: ${extractProjectId(config)})`);
if (hasStaticConfig) {
ctx.logger.info(`BigQuery datasource plugin initialized (project: ${extractProjectId(config)})`);
} else {
ctx.logger.info(
"BigQuery datasource plugin registered as adapter-only — per-workspace datasources via Admin → Connections",
);
}
},
async healthCheck(): Promise<PluginHealthResult> {
// Adapter-only: no static datasource to probe. The plugin itself is a
// healthy adapter; per-workspace connections are health-checked by the
// ConnectionRegistry once installed.
if (!hasStaticConfig) {
return { healthy: true, message: "adapter-only: no static datasource configured" };
}
const start = performance.now();

@@ -211,3 +316,6 @@ let conn: PluginDBConnection | undefined;

* ```typescript
* // Static datasource (self-host):
* plugins: [bigqueryPlugin({ projectId: "my-project", dataset: "analytics" })]
* // Adapter-only (SaaS — customers bring their own per workspace):
* plugins: [bigqueryPlugin({})]
* ```

@@ -220,5 +328,7 @@ */

export { createBigQueryConnection, extractProjectId } from "./connection";
export { createBigQueryConnection, extractProjectId, parseBigQueryUrl } from "./connection";
export type { BigQueryConnectionConfig } from "./connection";
export { listBigQueryObjects, profileBigQuery } from "./profiler";
export { BIGQUERY_FORBIDDEN_PATTERNS } from "./validation";
export { estimateQueryCost, formatBytes, _resetCachedClient } from "./cost-estimator";
export type { CostEstimate } from "./cost-estimator";