@ai-sdk/provider - npm Package Compare versions

+1

src/realtime-model/index.ts

export * from './v4/index';

+20

src/realtime-model/v4/index.ts

		/**
		* Public barrel for the realtime-model v4 specification types.
		*
		* Every type is re-exported under an `Experimental_`-prefixed alias, so
		* importers of this module only see the prefixed names. All exports are
		* type-only (`export type`).
		*/
		export type {
		RealtimeFactoryV4 as Experimental_RealtimeFactoryV4,
		RealtimeFactoryV4GetTokenOptions as Experimental_RealtimeFactoryV4GetTokenOptions,
		RealtimeFactoryV4GetTokenResult as Experimental_RealtimeFactoryV4GetTokenResult,
		} from './realtime-factory-v4';
		export type { RealtimeModelV4 as Experimental_RealtimeModelV4 } from './realtime-model-v4';
		export type { RealtimeModelV4ClientEvent as Experimental_RealtimeModelV4ClientEvent } from './realtime-model-v4-client-event';
		export type {
		RealtimeModelV4ClientSecretOptions as Experimental_RealtimeModelV4ClientSecretOptions,
		RealtimeModelV4ClientSecretResult as Experimental_RealtimeModelV4ClientSecretResult,
		} from './realtime-model-v4-client-secret';
		export type {
		RealtimeModelV4ConversationItem as Experimental_RealtimeModelV4ConversationItem,
		RealtimeModelV4TextMessage as Experimental_RealtimeModelV4TextMessage,
		RealtimeModelV4AudioMessage as Experimental_RealtimeModelV4AudioMessage,
		RealtimeModelV4FunctionCallOutput as Experimental_RealtimeModelV4FunctionCallOutput,
		} from './realtime-model-v4-conversation-item';
		export type { RealtimeModelV4ServerEvent as Experimental_RealtimeModelV4ServerEvent } from './realtime-model-v4-server-event';
		export type { RealtimeModelV4SessionConfig as Experimental_RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
		export type { RealtimeModelV4ToolDefinition as Experimental_RealtimeModelV4ToolDefinition } from './realtime-model-v4-tool-definition';

+20

src/realtime-model/v4/realtime-factory-v4.ts

		import type { RealtimeModelV4 } from './realtime-model-v4';
		import type { RealtimeModelV4ClientSecretOptions } from './realtime-model-v4-client-secret';

		/**
		* Options accepted by `RealtimeFactoryV4.getToken`: a required `model`
		* string intersected with every field of
		* `RealtimeModelV4ClientSecretOptions`.
		*/
		export type RealtimeFactoryV4GetTokenOptions = {
		// NOTE(review): presumably the provider-specific model ID the token is issued for — confirm.
		model: string;
		} & RealtimeModelV4ClientSecretOptions;

		/**
		* Value that the promise returned by `RealtimeFactoryV4.getToken`
		* resolves to.
		*
		* The fields have the same names and types as those of
		* `RealtimeModelV4ClientSecretResult`.
		* NOTE(review): presumably they also carry the same meaning (ephemeral
		* token, WebSocket URL, expiry as a Unix timestamp in seconds) — confirm.
		*/
		export type RealtimeFactoryV4GetTokenResult = {
		token: string;
		url: string;
		// Optional: may be absent from the result.
		expiresAt?: number;
		};

		/**
		* Callable factory for realtime models.
		*
		* Invoking the factory with a model ID returns a `RealtimeModelV4`; the
		* `getToken` member asynchronously yields a
		* `RealtimeFactoryV4GetTokenResult`.
		*/
		export interface RealtimeFactoryV4 {
		/**
		* Returns the realtime model for the given model ID.
		*/
		(modelId: string): RealtimeModelV4;

		/**
		* Produces a token result for the given options.
		* NOTE(review): presumably intended for server-side creation of
		* ephemeral tokens — confirm against the provider implementations.
		*/
		getToken(
		options: RealtimeFactoryV4GetTokenOptions,
		): Promise<RealtimeFactoryV4GetTokenResult>;
		}

+68

src/realtime-model/v4/realtime-model-v4-client-event.ts

		import type { RealtimeModelV4ConversationItem } from './realtime-model-v4-conversation-item';
		import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';

		/**
		 * Normalized events sent from the browser to the realtime model.
		 * Each provider maps this to its native event format before sending
		 * over the WebSocket.
		 *
		 * The union is discriminated by the `type` field, so consumers can
		 * narrow to a single variant with a `switch` on `event.type`.
		 */
		export type RealtimeModelV4ClientEvent =
		  // ── Session ────────────────────────────────────────────────────────
		  | {
		      type: 'session-update';

		      /**
		       * The normalized session configuration carried by the update.
		       */
		      config: RealtimeModelV4SessionConfig;
		    }

		  // ── Input audio buffer ─────────────────────────────────────────────
		  | {
		      type: 'input-audio-append';

		      /**
		       * Base64-encoded audio chunk to append to the input buffer.
		       */
		      audio: string;
		    }
		  | {
		      // Carries no payload beyond the discriminator.
		      type: 'input-audio-commit';
		    }
		  | {
		      // Carries no payload beyond the discriminator.
		      type: 'input-audio-clear';
		    }

		  // ── Conversation items ─────────────────────────────────────────────
		  | {
		      type: 'conversation-item-create';

		      /**
		       * The conversation item to create.
		       */
		      item: RealtimeModelV4ConversationItem;
		    }
		  | {
		      type: 'conversation-item-truncate';

		      /**
		       * The ID of the assistant message item to truncate.
		       */
		      itemId: string;

		      /**
		       * The index of the content part to truncate.
		       */
		      contentIndex: number;

		      /**
		       * Truncate audio after this many milliseconds.
		       */
		      audioEndMs: number;
		    }

		  // ── Response control ───────────────────────────────────────────────
		  | {
		      type: 'response-create';

		      /**
		       * Optional per-response settings; every field is optional.
		       */
		      options?: {
		        // NOTE(review): typed as string[] here, whereas the session config
		        // narrows output modalities to 'text' | 'audio' — consider aligning.
		        modalities?: string[];
		        instructions?: string;
		        metadata?: Record<string, unknown>;
		      };
		    }
		  | {
		      // Carries no payload beyond the discriminator.
		      type: 'response-cancel';
		    };

+40

src/realtime-model/v4/realtime-model-v4-client-secret.ts

		import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';

		/**
		* Options for creating an ephemeral client secret for browser-side
		* WebSocket connections to a realtime model.
		*
		* Both fields are optional.
		*/
		export type RealtimeModelV4ClientSecretOptions = {
		/**
		* Number of seconds until the client secret expires.
		* NOTE(review): the lifetime applied when this is omitted is not defined
		* here — presumably a provider default; confirm.
		*/
		expiresAfterSeconds?: number;

		/**
		* Optional session configuration to embed in the token request.
		* Some providers (e.g. Google) require the full session config at token creation time.
		*/
		sessionConfig?: RealtimeModelV4SessionConfig;
		};

		/**
		* Result of creating an ephemeral client secret.
		*
		* `token` and `url` are always present; `expiresAt` is optional.
		*/
		export type RealtimeModelV4ClientSecretResult = {
		/**
		* The ephemeral token value. Used as a Bearer token or in the
		* WebSocket subprotocol header for authentication.
		*/
		token: string;

		/**
		* The WebSocket URL to connect to. Includes any provider-specific
		* query parameters (e.g. model ID).
		*/
		url: string;

		/**
		* Unix timestamp (seconds) when this client secret expires.
		* May be omitted from the result.
		*/
		expiresAt?: number;
		};

+55

src/realtime-model/v4/realtime-model-v4-conversation-item.ts

		/**
		 * A conversation item that can be created by the client and sent to
		 * the model as the `item` of a normalized `conversation-item-create`
		 * client event.
		 *
		 * The union is discriminated by each member's `type` field.
		 */
		export type RealtimeModelV4ConversationItem =
		  | RealtimeModelV4TextMessage
		  | RealtimeModelV4AudioMessage
		  | RealtimeModelV4FunctionCallOutput;

		/**
		* A text message from the user.
		*
		* Identified by the literal `type: 'text-message'`; `role` is fixed to
		* the literal `'user'`.
		*/
		export type RealtimeModelV4TextMessage = {
		type: 'text-message';
		role: 'user';
		// The message text.
		text: string;
		};

		/**
		* An audio message from the user (complete audio, not streamed).
		*
		* Identified by the literal `type: 'audio-message'`; `role` is fixed to
		* the literal `'user'`.
		*/
		export type RealtimeModelV4AudioMessage = {
		type: 'audio-message';
		role: 'user';

		/**
		* Base64-encoded audio data.
		* NOTE(review): the underlying audio encoding is not specified here —
		* presumably it matches the session's input audio format; confirm.
		*/
		audio: string;
		};

		/**
		* The output of a function call, sent back to the model so it can
		* continue generating a response using the tool result.
		*
		* Identified by the literal `type: 'function-call-output'`.
		*/
		export type RealtimeModelV4FunctionCallOutput = {
		type: 'function-call-output';

		/**
		* The call ID from the function-call-arguments-done event.
		* Must match so the model knows which function call this result is for.
		*/
		callId: string;

		/**
		* The name of the function that was called.
		* Required by some providers (e.g. Google) in the tool response routing.
		* The type itself keeps this optional, so it is not enforced at compile
		* time.
		*/
		name?: string;

		/**
		* JSON string containing the function call result.
		*/
		output: string;
		};

+199

src/realtime-model/v4/realtime-model-v4-server-event.ts

		/**
		 * Normalized events emitted by the realtime model (model → browser).
		 * Each provider maps its native event format to this discriminated union.
		 *
		 * Every event includes a `raw` field with the original provider-specific
		 * event data for debugging and provider-specific access. `raw` is typed
		 * `unknown`, so it must be narrowed before use.
		 *
		 * The union is discriminated by the `type` field.
		 */
		export type RealtimeModelV4ServerEvent =
		  // ── Session lifecycle ──────────────────────────────────────────────
		  | {
		      type: 'session-created';
		      sessionId?: string;
		      raw: unknown;
		    }
		  | {
		      type: 'session-updated';
		      raw: unknown;
		    }

		  // ── Input audio buffer ─────────────────────────────────────────────
		  | {
		      type: 'speech-started';
		      itemId?: string;
		      raw: unknown;
		    }
		  | {
		      type: 'speech-stopped';
		      itemId?: string;
		      raw: unknown;
		    }
		  | {
		      type: 'audio-committed';
		      itemId?: string;
		      previousItemId?: string;
		      raw: unknown;
		    }

		  // ── Conversation items ─────────────────────────────────────────────
		  | {
		      type: 'conversation-item-added';
		      itemId: string;

		      /**
		       * The added item. Typed `unknown`: its shape is not normalized here.
		       */
		      item: unknown;
		      raw: unknown;
		    }
		  | {
		      type: 'input-transcription-completed';
		      itemId: string;
		      transcript: string;
		      raw: unknown;
		    }

		  // ── Response lifecycle ─────────────────────────────────────────────
		  | {
		      type: 'response-created';
		      responseId: string;
		      raw: unknown;
		    }
		  | {
		      type: 'response-done';
		      responseId: string;

		      /**
		       * Free-form status string; the set of values is not constrained here.
		       */
		      status: string;
		      raw: unknown;
		    }

		  // ── Output item lifecycle ──────────────────────────────────────────
		  | {
		      type: 'output-item-added';
		      responseId: string;
		      itemId: string;
		      raw: unknown;
		    }
		  | {
		      type: 'output-item-done';
		      responseId: string;
		      itemId: string;
		      raw: unknown;
		    }
		  | {
		      type: 'content-part-added';
		      responseId: string;
		      itemId: string;
		      raw: unknown;
		    }
		  | {
		      type: 'content-part-done';
		      responseId: string;
		      itemId: string;
		      raw: unknown;
		    }

		  // ── Audio output ───────────────────────────────────────────────────
		  | {
		      type: 'audio-delta';
		      responseId: string;
		      itemId: string;

		      /**
		       * Base64-encoded audio chunk.
		       */
		      delta: string;
		      raw: unknown;
		    }
		  | {
		      type: 'audio-done';
		      responseId: string;
		      itemId: string;
		      raw: unknown;
		    }

		  // ── Audio transcript output ────────────────────────────────────────
		  | {
		      type: 'audio-transcript-delta';
		      responseId: string;
		      itemId: string;

		      /**
		       * Text chunk of the audio transcript.
		       */
		      delta: string;
		      raw: unknown;
		    }
		  | {
		      type: 'audio-transcript-done';
		      responseId: string;
		      itemId: string;
		      transcript?: string;
		      raw: unknown;
		    }

		  // ── Text output ────────────────────────────────────────────────────
		  | {
		      type: 'text-delta';
		      responseId: string;
		      itemId: string;

		      /**
		       * Text chunk of the model's text response.
		       */
		      delta: string;
		      raw: unknown;
		    }
		  | {
		      type: 'text-done';
		      responseId: string;
		      itemId: string;
		      text?: string;
		      raw: unknown;
		    }

		  // ── Function calling ───────────────────────────────────────────────
		  | {
		      type: 'function-call-arguments-delta';
		      responseId: string;
		      itemId: string;
		      callId: string;

		      /**
		       * Partial JSON string of function call arguments.
		       */
		      delta: string;
		      raw: unknown;
		    }
		  | {
		      type: 'function-call-arguments-done';
		      responseId: string;
		      itemId: string;
		      callId: string;

		      /**
		       * The name of the function to call.
		       */
		      name: string;

		      /**
		       * Complete JSON string of function call arguments.
		       */
		      arguments: string;
		      raw: unknown;
		    }

		  // ── Error ──────────────────────────────────────────────────────────
		  | {
		      type: 'error';
		      message: string;
		      code?: string;
		      raw: unknown;
		    }

		  // ── Custom / provider-specific ────────────────────────────────────
		  | {
		      type: 'custom';

		      /**
		       * The original event type string from the provider.
		       */
		      rawType: string;
		      raw: unknown;
		    };

+142

src/realtime-model/v4/realtime-model-v4-session-config.ts

		import type { RealtimeModelV4ToolDefinition } from './realtime-model-v4-tool-definition';

		/**
		 * Provider-neutral configuration for a realtime session.
		 * Each provider maps this to their specific session.update payload.
		 *
		 * Every field is optional.
		 */
		export type RealtimeModelV4SessionConfig = {
		  /**
		   * System instructions for the model.
		   */
		  instructions?: string;

		  /**
		   * Voice to use for audio output.
		   */
		  voice?: string;

		  /**
		   * Which output modalities the model should produce.
		   */
		  outputModalities?: Array<'text' | 'audio'>;

		  /**
		   * Audio format configuration for input audio.
		   */
		  inputAudioFormat?: {
		    /**
		     * Audio format type (e.g. "audio/pcm", "audio/pcmu", "audio/pcma").
		     */
		    type: string;

		    /**
		     * Sample rate in Hz. Only applicable for PCM format.
		     */
		    rate?: number;
		  };

		  /**
		   * Input audio transcription configuration.
		   *
		   * When enabled, providers that support input transcription emit normalized
		   * `input-transcription-completed` events that can be rendered as user
		   * messages.
		   */
		  inputAudioTranscription?: {
		    /**
		     * Provider-specific transcription model.
		     */
		    model?: string;

		    /**
		     * Optional language hint for the input audio.
		     */
		    language?: string;

		    /**
		     * Optional prompt to guide transcription.
		     */
		    prompt?: string;
		  };

		  /**
		   * Output audio transcription configuration.
		   *
		   * When enabled, providers that support output transcription emit normalized
		   * `audio-transcript-delta` / `audio-transcript-done` events for the model's
		   * spoken response. Some providers transcribe output by default; setting this
		   * makes the behavior explicit rather than relying on that default.
		   */
		  outputAudioTranscription?: {
		    /**
		     * Provider-specific transcription model.
		     */
		    model?: string;

		    /**
		     * Optional language hint for the output audio.
		     */
		    language?: string;

		    /**
		     * Optional prompt to guide transcription.
		     */
		    prompt?: string;
		  };

		  /**
		   * Audio format configuration for output audio.
		   */
		  outputAudioFormat?: {
		    /**
		     * Audio format type (e.g. "audio/pcm", "audio/pcmu", "audio/pcma").
		     */
		    type: string;

		    /**
		     * Sample rate in Hz. Only applicable for PCM format.
		     */
		    rate?: number;
		  };

		  /**
		   * Voice activity detection configuration.
		   * Set to null or type 'disabled' to turn off VAD (push-to-talk mode).
		   */
		  turnDetection?: {
		    /**
		     * VAD mode. 'server-vad' for automatic detection,
		     * 'semantic-vad' for OpenAI's semantic detection,
		     * 'disabled' to turn off VAD.
		     */
		    type: 'server-vad' | 'semantic-vad' | 'disabled';

		    /**
		     * VAD activation threshold (0.0-1.0).
		     * Higher values require louder audio to trigger.
		     */
		    threshold?: number;

		    /**
		     * How long the user must be silent (in ms) before
		     * the server ends the turn.
		     */
		    silenceDurationMs?: number;

		    /**
		     * Amount of audio (in ms) to include before the
		     * detected start of speech.
		     */
		    prefixPaddingMs?: number;
		  } | null;

		  /**
		   * Tool definitions available to the model in this session.
		   */
		  tools?: RealtimeModelV4ToolDefinition[];

		  /**
		   * Provider-specific options that are passed through to the provider.
		   */
		  providerOptions?: Record<string, unknown>;
		};

+28

src/realtime-model/v4/realtime-model-v4-tool-definition.ts

		import type { JSONSchema7 } from 'json-schema';

		/**
		* A tool definition for realtime models. Sent as part of the session
		* configuration so the model knows which functions it can call.
		*
		* `type`, `name` and `parameters` are required; `description` is optional.
		*/
		export type RealtimeModelV4ToolDefinition = {
		/**
		* The type of the tool (always 'function').
		*/
		type: 'function';

		/**
		* The name of the tool. Unique within the session.
		* NOTE(review): uniqueness is a documented expectation only; nothing in
		* this type enforces it.
		*/
		name: string;

		/**
		* A description of what the tool does. The model uses this to decide
		* whether to call the tool.
		*/
		description?: string;

		/**
		* JSON Schema describing the parameters the tool expects, typed as
		* `JSONSchema7` from the `json-schema` type package.
		*/
		parameters: JSONSchema7;
		};

+89

src/realtime-model/v4/realtime-model-v4.ts

		import type {
		RealtimeModelV4ClientSecretOptions,
		RealtimeModelV4ClientSecretResult,
		} from './realtime-model-v4-client-secret';
		import type { RealtimeModelV4ClientEvent } from './realtime-model-v4-client-event';
		import type { RealtimeModelV4ServerEvent } from './realtime-model-v4-server-event';
		import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';

		/**
		 * Specification for a realtime model that supports bidirectional
		 * audio/text communication over WebSocket.
		 *
		 * Providers implement this interface to enable realtime voice
		 * conversations through the AI SDK.
		 */
		export type RealtimeModelV4 = {
		  /**
		   * The realtime model must specify which interface version it implements.
		   */
		  readonly specificationVersion: 'v4';

		  /**
		   * Provider ID (e.g. 'openai', 'xai').
		   */
		  readonly provider: string;

		  /**
		   * Provider-specific model ID (e.g. 'gpt-4o-realtime', 'grok-3').
		   */
		  readonly modelId: string;

		  /**
		   * Server-side: Creates an ephemeral client secret for authenticating
		   * browser-side WebSocket connections. The secret is short-lived and
		   * safe to expose to client code.
		   *
		   * Naming: "do" prefix to prevent accidental direct usage by the user.
		   */
		  doCreateClientSecret(
		    options: RealtimeModelV4ClientSecretOptions,
		  ): PromiseLike<RealtimeModelV4ClientSecretResult>;

		  /**
		   * Browser-side: Returns the WebSocket URL and subprotocols to use
		   * when connecting. Each provider has its own authentication mechanism
		   * (e.g. OpenAI uses subprotocol headers, xAI may use query params).
		   */
		  getWebSocketConfig(options: { token: string; url: string }): {
		    url: string;
		    protocols?: string[];
		  };

		  /**
		   * Browser-side: Parses a raw JSON event received over the WebSocket
		   * and returns one or more normalized events. Providers map their native
		   * event format to the common RealtimeModelV4ServerEvent union.
		   *
		   * Returns an array when a single provider message maps to multiple
		   * normalized events (e.g. Google's serverContent can contain audio,
		   * text, and turn-complete data in one message).
		   */
		  parseServerEvent(
		    raw: unknown,
		  ): RealtimeModelV4ServerEvent | RealtimeModelV4ServerEvent[];

		  /**
		   * Browser-side: Serializes a normalized client event into the
		   * provider's native JSON format for sending over the WebSocket.
		   *
		   * The return type spells out `PromiseLike<unknown>` to signal that an
		   * implementation may return a thenable; to the compiler the union is
		   * equivalent to `unknown`.
		   */
		  serializeClientEvent(
		    event: RealtimeModelV4ClientEvent,
		  ): unknown | PromiseLike<unknown>;

		  /**
		   * Browser-side: Builds the provider-specific session configuration
		   * payload from a normalized session config. Used to construct the
		   * session.update event sent after WebSocket connection.
		   */
		  buildSessionConfig(config: RealtimeModelV4SessionConfig): unknown;

		  /**
		   * Browser-side: Returns a message to auto-send back over the WebSocket
		   * in response to a raw incoming message, or null if no response is needed.
		   *
		   * Used for provider-specific keepalive protocols (e.g. ping/pong).
		   * Called by the session layer before parseServerEvent.
		   *
		   * Optional: providers without such a protocol may omit this method.
		   */
		  getHealthCheckResponse?(raw: unknown): unknown | null;
		};

+40

-0

CHANGELOG.md

		# @ai-sdk/provider

		## 4.0.0-beta.19

		### Patch Changes

		- b8396f0: trigger initial beta release

		## 4.0.0-canary.18

		### Patch Changes

		- ce769dd: feat(provider): add experimental Realtime API support for voice conversations

		Adds first-class support for realtime (speech-to-speech) APIs:

		- `Experimental_RealtimeModelV4` spec in `@ai-sdk/provider` with normalized event types and factory
		- OpenAI, Google, and xAI realtime provider implementations
		- `openai.experimental_realtime()` / `google.experimental_realtime()` / `xai.experimental_realtime()` work in both server and browser
		- `.getToken()` static method on each provider for server-side ephemeral token creation
		- `experimental_getRealtimeToolDefinitions` helper for provider session tool definitions
		- `experimental_useRealtime` hook in `@ai-sdk/react` returning `UIMessage[]` (aligned with `useChat`), with `onToolCall` and `addToolOutput` for client-driven tool execution
		- `inputAudioTranscription` session config for showing transcribed user audio messages when supported by the provider

		## 4.0.0-canary.17

		### Patch Changes

		- 7fc6bd6: Raise minimum supported Node.js version to 22. Supported versions: 22, 24, and 26.

		## 4.0.0-canary.16

		### Major Changes

		- 5463d0d: feat(provider): align tool result output content file part types with top-level message file part types

		## 4.0.0-canary.15

		### Patch Changes

		- 0c4c275: trigger initial canary release

		## 4.0.0-beta.14
		@@ -4,0 +44,0 @@

+4

-4

package.json

		{
		"name": "@ai-sdk/provider",
		"version": "4.0.0-beta.14",
		"version": "4.0.0-beta.19",
		"type": "module",
		@@ -33,4 +33,4 @@ "license": "Apache-2.0",
		"@types/json-schema": "7.0.15",
		"@types/node": "20.17.24",
		"tsup": "^8",
		"@types/node": "22.19.19",
		"tsup": "^8.5.1",
		"typescript": "5.8.3",
		@@ -40,3 +40,3 @@ "@vercel/ai-tsconfig": "0.0.0"
		"engines": {
		"node": ">=18"
		"node": ">=22"
		},
		@@ -43,0 +43,0 @@ "publishConfig": {

+1

-0

src/index.ts

		@@ -11,2 +11,3 @@ export * from './embedding-model/index';
		export * from './provider/index';
		export * from './realtime-model/v4/index';
		export * from './reranking-model/index';
		@@ -13,0 +14,0 @@ export * from './shared/index';

+17

-38

src/language-model/v4/language-model-v4-prompt.ts

		@@ -8,3 +8,2 @@ import type { JSONValue } from '../../json-value/json-value';
		import type { SharedV4ProviderOptions } from '../../shared/v4/shared-v4-provider-options';
		import type { SharedV4ProviderReference } from '../../shared/v4/shared-v4-provider-reference';

		@@ -363,11 +362,24 @@ /**
		\| {
		type: 'file-data';
		type: 'file';

		/**
		* Base-64 encoded media data.
		* File data as a tagged discriminated union:
		*
		* - `{ type: 'data', data }`: raw bytes (Uint8Array) or base64-encoded string.
		* - `{ type: 'url', url }`: a URL that points to the file.
		* - `{ type: 'reference', reference }`: a provider reference (`{ [provider]: id }`).
		* - `{ type: 'text', text }`: inline text content (e.g. an inline text document).
		*/
		data: string;
		data: SharedV4FileData;

		/**
		* IANA media type.
		* Either a full IANA media type (`type/subtype`, e.g. `image/png`) or just
		* the top-level IANA segment (e.g. `image`, `audio`, `video`, `text`).
		*
		* `*`-subtype wildcards (e.g. `image/*`) are normalized as equivalent to the
		* top-level segment alone (e.g. `image`). Providers can use the helpers in
		* `@ai-sdk/provider-utils` (`isFullMediaType`, `getTopLevelMediaType`,
		* `detectMediaType`) to resolve the field according to their API
		* requirements.
		*
		* @see https://www.iana.org/assignments/media-types/media-types.xhtml
		@@ -388,36 +400,3 @@ */
		\| {
		type: 'file-url';

		/**
		* URL of the file.
		*/
		url: string;

		/**
		* IANA media type.
		* @see https://www.iana.org/assignments/media-types/media-types.xhtml
		*/
		mediaType: string;

		/**
		* Provider-specific options.
		*/
		providerOptions?: SharedV4ProviderOptions;
		}
		\| {
		type: 'file-reference';

		/**
		* Provider-specific references for the file.
		* The key is the provider name, e.g. 'openai' or 'anthropic'.
		*/
		providerReference: SharedV4ProviderReference;

		/**
		* Provider-specific options.
		*/
		providerOptions?: SharedV4ProviderOptions;
		}
		\| {
		/**
		* Custom content part. This can be used to implement
		@@ -424,0 +403,0 @@ * provider-specific content parts.

dist/index.d.ts

Sorry, the diff of this file is too big to display

@ai-sdk/provider - npm Package Compare versions

Improved metrics