🚀 Socket Launch Week Day 5: Introducing Repository Access Permissions and Custom Roles. Learn more
Sign In

@ai-sdk/provider

Package Overview
Dependencies
Maintainers
3
Versions
157
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

@ai-sdk/provider - npm Package Compare versions

Comparing version
4.0.0-beta.14
to
4.0.0-beta.19
+1
src/realtime-model/index.ts
export * from './v4/index';
/**
* Re-exports the V4 realtime model types.
*
* Every export here is type-only, and each type is published under an
* `Experimental_`-prefixed alias of its internal name
* (e.g. `RealtimeModelV4` is exported as `Experimental_RealtimeModelV4`).
*/
export type {
RealtimeFactoryV4 as Experimental_RealtimeFactoryV4,
RealtimeFactoryV4GetTokenOptions as Experimental_RealtimeFactoryV4GetTokenOptions,
RealtimeFactoryV4GetTokenResult as Experimental_RealtimeFactoryV4GetTokenResult,
} from './realtime-factory-v4';
export type { RealtimeModelV4 as Experimental_RealtimeModelV4 } from './realtime-model-v4';
export type { RealtimeModelV4ClientEvent as Experimental_RealtimeModelV4ClientEvent } from './realtime-model-v4-client-event';
export type {
RealtimeModelV4ClientSecretOptions as Experimental_RealtimeModelV4ClientSecretOptions,
RealtimeModelV4ClientSecretResult as Experimental_RealtimeModelV4ClientSecretResult,
} from './realtime-model-v4-client-secret';
export type {
RealtimeModelV4ConversationItem as Experimental_RealtimeModelV4ConversationItem,
RealtimeModelV4TextMessage as Experimental_RealtimeModelV4TextMessage,
RealtimeModelV4AudioMessage as Experimental_RealtimeModelV4AudioMessage,
RealtimeModelV4FunctionCallOutput as Experimental_RealtimeModelV4FunctionCallOutput,
} from './realtime-model-v4-conversation-item';
export type { RealtimeModelV4ServerEvent as Experimental_RealtimeModelV4ServerEvent } from './realtime-model-v4-server-event';
export type { RealtimeModelV4SessionConfig as Experimental_RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
export type { RealtimeModelV4ToolDefinition as Experimental_RealtimeModelV4ToolDefinition } from './realtime-model-v4-tool-definition';
import type { RealtimeModelV4 } from './realtime-model-v4';
import type { RealtimeModelV4ClientSecretOptions } from './realtime-model-v4-client-secret';
/**
 * Options accepted by `RealtimeFactoryV4.getToken`.
 *
 * Combines every `RealtimeModelV4ClientSecretOptions` field with the model
 * the token is requested for.
 */
export type RealtimeFactoryV4GetTokenOptions =
  RealtimeModelV4ClientSecretOptions & {
    /**
     * Model to create the token for.
     * NOTE(review): presumably the same provider-specific model ID that the
     * factory call signature accepts — confirm.
     */
    model: string;
  };
/**
 * Value resolved by `RealtimeFactoryV4.getToken`.
 *
 * NOTE(review): the shape matches `RealtimeModelV4ClientSecretResult`; the
 * field meanings below are presumed to be the same — confirm.
 */
export type RealtimeFactoryV4GetTokenResult = {
  /** The token value. */
  token: string;

  /** URL that goes with the token (presumably the WebSocket endpoint). */
  url: string;

  /** Optional expiry (presumably a Unix timestamp in seconds). */
  expiresAt?: number;
};
/**
 * Provider factory for realtime models.
 *
 * The factory is itself callable (to obtain a model for a model ID) and
 * additionally exposes a `getToken` method.
 */
export interface RealtimeFactoryV4 {
  /**
   * Returns the realtime model for the given model ID.
   */
  (modelId: string): RealtimeModelV4;

  /**
   * Asynchronously obtains a token for the requested model.
   *
   * @param options - The model plus the client secret options.
   * @returns A promise for the token, its URL, and an optional expiry.
   */
  getToken(
    options: RealtimeFactoryV4GetTokenOptions,
  ): Promise<RealtimeFactoryV4GetTokenResult>;
}
import type { RealtimeModelV4ConversationItem } from './realtime-model-v4-conversation-item';
import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
/**
 * Normalized browser → model events.
 *
 * A discriminated union keyed on `type`. Before an event is sent over the
 * WebSocket, the provider converts it into its own native event format.
 */
export type RealtimeModelV4ClientEvent =
  // Session
  | {
      type: 'session-update';
      config: RealtimeModelV4SessionConfig;
    }

  // Input audio buffer
  | {
      type: 'input-audio-append';

      /** Audio chunk (base64-encoded) appended to the input buffer. */
      audio: string;
    }
  | { type: 'input-audio-commit' }
  | { type: 'input-audio-clear' }

  // Conversation items
  | {
      type: 'conversation-item-create';
      item: RealtimeModelV4ConversationItem;
    }
  | {
      type: 'conversation-item-truncate';

      /** ID of the assistant message item that gets truncated. */
      itemId: string;

      /** Index of the content part that gets truncated. */
      contentIndex: number;

      /** Audio past this offset (in milliseconds) is truncated. */
      audioEndMs: number;
    }

  // Response control
  | {
      type: 'response-create';
      options?: {
        modalities?: Array<string>;
        instructions?: string;
        metadata?: Record<string, unknown>;
      };
    }
  | { type: 'response-cancel' };
import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
/**
 * Settings for requesting an ephemeral client secret that a browser uses to
 * open a WebSocket connection to a realtime model.
 */
export type RealtimeModelV4ClientSecretOptions = {
  /** Lifetime of the client secret, in seconds. */
  expiresAfterSeconds?: number;

  /**
   * Session configuration to include in the token request, if any.
   * Certain providers (Google, for example) need the complete session config
   * at the moment the token is created.
   */
  sessionConfig?: RealtimeModelV4SessionConfig;
};
/**
 * Outcome of creating an ephemeral client secret.
 */
export type RealtimeModelV4ClientSecretResult = {
  /**
   * Ephemeral token value. Authentication uses it either as a Bearer token
   * or inside the WebSocket subprotocol header.
   */
  token: string;

  /**
   * WebSocket URL to connect to, including any provider-specific query
   * parameters (the model ID, for instance).
   */
  url: string;

  /** Expiry of this client secret as a Unix timestamp in seconds. */
  expiresAt?: number;
};
/**
* A conversation item that the client can create and send to the model
* using the normalized `conversation-item-create` client event
* (see `RealtimeModelV4ClientEvent`). It is one of: a user text message,
* a user audio message, or a function call output.
*/
export type RealtimeModelV4ConversationItem =
| RealtimeModelV4TextMessage
| RealtimeModelV4AudioMessage
| RealtimeModelV4FunctionCallOutput;
/**
 * Text message authored by the user.
 */
export type RealtimeModelV4TextMessage = {
  /** Discriminator for this conversation item variant. */
  type: 'text-message';

  /** Always `'user'`. */
  role: 'user';

  /** The message text. */
  text: string;
};
/**
 * Audio message authored by the user. The audio is complete, not streamed.
 */
export type RealtimeModelV4AudioMessage = {
  /** Discriminator for this conversation item variant. */
  type: 'audio-message';

  /** Always `'user'`. */
  role: 'user';

  /** The audio data, base64-encoded. */
  audio: string;
};
/**
 * Result of a function call. It is sent back to the model so that the model
 * can keep generating its response with the tool result available.
 */
export type RealtimeModelV4FunctionCallOutput = {
  /** Discriminator for this conversation item variant. */
  type: 'function-call-output';

  /**
   * Call ID taken from the function-call-arguments-done event. It has to
   * match, because that is how the model associates this result with the
   * right function call.
   */
  callId: string;

  /**
   * Name of the function that was called. Some providers (Google, for
   * example) require it for routing the tool response.
   */
  name?: string;

  /** The function call result as a JSON string. */
  output: string;
};
/**
 * Normalized model → browser events.
 *
 * A discriminated union keyed on `type`; providers convert their native
 * event format into one of these variants.
 *
 * All variants carry a `raw` field holding the original provider-specific
 * event data, which is useful for debugging and for provider-specific access.
 */
export type RealtimeModelV4ServerEvent =
  // Session lifecycle
  | { type: 'session-created'; sessionId?: string; raw: unknown }
  | { type: 'session-updated'; raw: unknown }

  // Input audio buffer
  | { type: 'speech-started'; itemId?: string; raw: unknown }
  | { type: 'speech-stopped'; itemId?: string; raw: unknown }
  | {
      type: 'audio-committed';
      itemId?: string;
      previousItemId?: string;
      raw: unknown;
    }

  // Conversation items
  | {
      type: 'conversation-item-added';
      itemId: string;
      item: unknown;
      raw: unknown;
    }
  | {
      type: 'input-transcription-completed';
      itemId: string;
      transcript: string;
      raw: unknown;
    }

  // Response lifecycle
  | { type: 'response-created'; responseId: string; raw: unknown }
  | {
      type: 'response-done';
      responseId: string;
      status: string;
      raw: unknown;
    }

  // Output item lifecycle
  | {
      type: 'output-item-added';
      responseId: string;
      itemId: string;
      raw: unknown;
    }
  | {
      type: 'output-item-done';
      responseId: string;
      itemId: string;
      raw: unknown;
    }
  | {
      type: 'content-part-added';
      responseId: string;
      itemId: string;
      raw: unknown;
    }
  | {
      type: 'content-part-done';
      responseId: string;
      itemId: string;
      raw: unknown;
    }

  // Audio output
  | {
      type: 'audio-delta';
      responseId: string;
      itemId: string;

      /** Audio chunk, base64-encoded. */
      delta: string;
      raw: unknown;
    }
  | {
      type: 'audio-done';
      responseId: string;
      itemId: string;
      raw: unknown;
    }

  // Audio transcript output
  | {
      type: 'audio-transcript-delta';
      responseId: string;
      itemId: string;

      /** Chunk of audio transcript text. */
      delta: string;
      raw: unknown;
    }
  | {
      type: 'audio-transcript-done';
      responseId: string;
      itemId: string;
      transcript?: string;
      raw: unknown;
    }

  // Text output
  | {
      type: 'text-delta';
      responseId: string;
      itemId: string;

      /** Chunk of the model's text response. */
      delta: string;
      raw: unknown;
    }
  | {
      type: 'text-done';
      responseId: string;
      itemId: string;
      text?: string;
      raw: unknown;
    }

  // Function calling
  | {
      type: 'function-call-arguments-delta';
      responseId: string;
      itemId: string;
      callId: string;

      /** Partial JSON string of the function call arguments. */
      delta: string;
      raw: unknown;
    }
  | {
      type: 'function-call-arguments-done';
      responseId: string;
      itemId: string;
      callId: string;

      /** Name of the function to call. */
      name: string;

      /** Complete JSON string of the function call arguments. */
      arguments: string;
      raw: unknown;
    }

  // Error
  | { type: 'error'; message: string; code?: string; raw: unknown }

  // Custom / provider-specific
  | {
      type: 'custom';

      /** Original event type string as sent by the provider. */
      rawType: string;
      raw: unknown;
    };
import type { RealtimeModelV4ToolDefinition } from './realtime-model-v4-tool-definition';
/**
 * Realtime session settings, expressed in a provider-neutral form.
 * Providers translate this object into their own session.update payload.
 */
export type RealtimeModelV4SessionConfig = {
  /** System instructions given to the model. */
  instructions?: string;

  /** Voice used for audio output. */
  voice?: string;

  /** Output modalities that the model should produce. */
  outputModalities?: ('text' | 'audio')[];

  /** Format of the input audio. */
  inputAudioFormat?: {
    /** Audio format type, e.g. "audio/pcm", "audio/pcmu", "audio/pcma". */
    type: string;

    /** Sample rate in Hz; applies to the PCM format only. */
    rate?: number;
  };

  /**
   * Transcription settings for input audio.
   *
   * With this enabled, providers that support input transcription emit
   * normalized `input-transcription-completed` events, which can be rendered
   * as user messages.
   */
  inputAudioTranscription?: {
    /** Transcription model (provider-specific). */
    model?: string;

    /** Language hint for the input audio (optional). */
    language?: string;

    /** Prompt that guides the transcription (optional). */
    prompt?: string;
  };

  /**
   * Transcription settings for output audio.
   *
   * With this enabled, providers that support output transcription emit
   * normalized `audio-transcript-delta` / `audio-transcript-done` events for
   * the model's spoken response. Some providers transcribe output by default;
   * setting this field makes that behavior explicit instead of relying on
   * the default.
   */
  outputAudioTranscription?: {
    /** Transcription model (provider-specific). */
    model?: string;

    /** Language hint for the output audio (optional). */
    language?: string;

    /** Prompt that guides the transcription (optional). */
    prompt?: string;
  };

  /** Format of the output audio. */
  outputAudioFormat?: {
    /** Audio format type, e.g. "audio/pcm", "audio/pcmu", "audio/pcma". */
    type: string;

    /** Sample rate in Hz; applies to the PCM format only. */
    rate?: number;
  };

  /**
   * Voice activity detection (VAD) settings.
   * Use `null` or `type: 'disabled'` to switch VAD off (push-to-talk mode).
   */
  turnDetection?: null | {
    /**
     * VAD mode: 'server-vad' for automatic detection, 'semantic-vad' for
     * OpenAI's semantic detection, 'disabled' to switch VAD off.
     */
    type: 'server-vad' | 'semantic-vad' | 'disabled';

    /**
     * Activation threshold for VAD (0.0-1.0). With higher values, louder
     * audio is needed to trigger detection.
     */
    threshold?: number;

    /**
     * Length of user silence (in ms) after which the server ends the turn.
     */
    silenceDurationMs?: number;

    /**
     * How much audio (in ms) to include ahead of the detected start of
     * speech.
     */
    prefixPaddingMs?: number;
  };

  /** Tools the model may use during this session. */
  tools?: Array<RealtimeModelV4ToolDefinition>;

  /** Provider-specific options, passed through to the provider. */
  providerOptions?: Record<string, unknown>;
};
import type { JSONSchema7 } from 'json-schema';
/**
 * Tool definition for realtime models. It travels with the session
 * configuration and tells the model which functions it is able to call.
 */
export type RealtimeModelV4ToolDefinition = {
  /** Tool type; always 'function'. */
  type: 'function';

  /** Tool name, unique within the session. */
  name: string;

  /**
   * Description of what the tool does. The model relies on it when deciding
   * whether to call the tool.
   */
  description?: string;

  /** JSON Schema that describes the parameters the tool expects. */
  parameters: JSONSchema7;
};
import type {
RealtimeModelV4ClientSecretOptions,
RealtimeModelV4ClientSecretResult,
} from './realtime-model-v4-client-secret';
import type { RealtimeModelV4ClientEvent } from './realtime-model-v4-client-event';
import type { RealtimeModelV4ServerEvent } from './realtime-model-v4-server-event';
import type { RealtimeModelV4SessionConfig } from './realtime-model-v4-session-config';
/**
 * Specification of a realtime model: bidirectional audio/text communication
 * over a WebSocket.
 *
 * A provider implements this type to make realtime voice conversations
 * available through the AI SDK.
 */
export type RealtimeModelV4 = {
  /**
   * Interface version implemented by the realtime model. Must be specified.
   */
  readonly specificationVersion: 'v4';

  /** ID of the provider, e.g. 'openai' or 'xai'. */
  readonly provider: string;

  /** Provider-specific ID of the model, e.g. 'gpt-4o-realtime' or 'grok-3'. */
  readonly modelId: string;

  /**
   * Server-side. Creates an ephemeral client secret that authenticates
   * browser-side WebSocket connections. The secret is short-lived and safe
   * to expose to client code.
   *
   * The "do" prefix in the name is there to prevent accidental direct usage
   * by the user.
   *
   * @param options - Expiry and optional session configuration.
   * @returns A promise-like for the client secret result.
   */
  doCreateClientSecret(
    options: RealtimeModelV4ClientSecretOptions,
  ): PromiseLike<RealtimeModelV4ClientSecretResult>;

  /**
   * Browser-side. Produces the WebSocket URL and subprotocols to connect
   * with. Authentication differs per provider: OpenAI, for example, uses
   * subprotocol headers, while xAI may use query params.
   *
   * @param options - The token and URL to build the connection config from.
   * @returns The URL to connect to and, optionally, the subprotocols.
   */
  getWebSocketConfig(options: { token: string; url: string }): {
    url: string;
    protocols?: string[];
  };

  /**
   * Browser-side. Parses a raw JSON event that arrived over the WebSocket
   * into one or more normalized events, mapping the provider's native event
   * format onto the common `RealtimeModelV4ServerEvent` union.
   *
   * An array is returned when one provider message corresponds to several
   * normalized events (Google's serverContent, for example, can hold audio,
   * text, and turn-complete data in a single message).
   *
   * @param raw - The raw event received from the provider.
   * @returns A single normalized event or an array of them.
   */
  parseServerEvent(
    raw: unknown,
  ): RealtimeModelV4ServerEvent | RealtimeModelV4ServerEvent[];

  /**
   * Browser-side. Serializes a normalized client event into the provider's
   * native JSON format so it can be sent over the WebSocket.
   *
   * @param event - The normalized client event.
   * @returns The provider-native payload, or a promise-like resolving to it.
   */
  serializeClientEvent(
    event: RealtimeModelV4ClientEvent,
  ): unknown | PromiseLike<unknown>;

  /**
   * Browser-side. Converts a normalized session config into the
   * provider-specific session configuration payload. The result is used to
   * construct the session.update event that is sent once the WebSocket is
   * connected.
   *
   * @param config - The normalized session configuration.
   * @returns The provider-specific payload.
   */
  buildSessionConfig(config: RealtimeModelV4SessionConfig): unknown;

  /**
   * Browser-side, optional. Given a raw incoming message, returns a message
   * that should automatically be sent back over the WebSocket, or null when
   * no response is needed.
   *
   * Intended for provider-specific keepalive protocols such as ping/pong.
   * The session layer calls it before `parseServerEvent`.
   *
   * @param raw - The raw incoming message.
   * @returns The message to send back, or null.
   */
  getHealthCheckResponse?(raw: unknown): unknown | null;
};
+40
-0
# @ai-sdk/provider
## 4.0.0-beta.19
### Patch Changes
- b8396f0: trigger initial beta release
## 4.0.0-canary.18
### Patch Changes
- ce769dd: feat(provider): add experimental Realtime API support for voice conversations
Adds first-class support for realtime (speech-to-speech) APIs:
- `Experimental_RealtimeModelV4` spec in `@ai-sdk/provider` with normalized event types and factory
- OpenAI, Google, and xAI realtime provider implementations
- `openai.experimental_realtime()` / `google.experimental_realtime()` / `xai.experimental_realtime()` work in both server and browser
- `.getToken()` static method on each provider for server-side ephemeral token creation
- `experimental_getRealtimeToolDefinitions` helper for provider session tool definitions
- `experimental_useRealtime` hook in `@ai-sdk/react` returning `UIMessage[]` (aligned with `useChat`), with `onToolCall` and `addToolOutput` for client-driven tool execution
- `inputAudioTranscription` session config for showing transcribed user audio messages when supported by the provider
## 4.0.0-canary.17
### Patch Changes
- 7fc6bd6: Raise minimum supported Node.js version to 22. Supported versions: 22, 24, and 26.
## 4.0.0-canary.16
### Major Changes
- 5463d0d: feat(provider): align tool result output content file part types with top-level message file part types
## 4.0.0-canary.15
### Patch Changes
- 0c4c275: trigger initial canary release
## 4.0.0-beta.14

@@ -4,0 +44,0 @@

+4
-4
{
"name": "@ai-sdk/provider",
"version": "4.0.0-beta.14",
"version": "4.0.0-beta.19",
"type": "module",

@@ -33,4 +33,4 @@ "license": "Apache-2.0",

"@types/json-schema": "7.0.15",
"@types/node": "20.17.24",
"tsup": "^8",
"@types/node": "22.19.19",
"tsup": "^8.5.1",
"typescript": "5.8.3",

@@ -40,3 +40,3 @@ "@vercel/ai-tsconfig": "0.0.0"

"engines": {
"node": ">=18"
"node": ">=22"
},

@@ -43,0 +43,0 @@ "publishConfig": {

@@ -11,2 +11,3 @@ export * from './embedding-model/index';

export * from './provider/index';
export * from './realtime-model/v4/index';
export * from './reranking-model/index';

@@ -13,0 +14,0 @@ export * from './shared/index';

@@ -8,3 +8,2 @@ import type { JSONValue } from '../../json-value/json-value';

import type { SharedV4ProviderOptions } from '../../shared/v4/shared-v4-provider-options';
import type { SharedV4ProviderReference } from '../../shared/v4/shared-v4-provider-reference';

@@ -363,11 +362,24 @@ /**

| {
type: 'file-data';
type: 'file';
/**
* Base-64 encoded media data.
* File data as a tagged discriminated union:
*
* - `{ type: 'data', data }`: raw bytes (Uint8Array) or base64-encoded string.
* - `{ type: 'url', url }`: a URL that points to the file.
* - `{ type: 'reference', reference }`: a provider reference (`{ [provider]: id }`).
* - `{ type: 'text', text }`: inline text content (e.g. an inline text document).
*/
data: string;
data: SharedV4FileData;
/**
* IANA media type.
* Either a full IANA media type (`type/subtype`, e.g. `image/png`) or just
* the top-level IANA segment (e.g. `image`, `audio`, `video`, `text`).
*
* `*`-subtype wildcards (e.g. `image/*`) are normalized as equivalent to the
* top-level segment alone (e.g. `image`). Providers can use the helpers in
* `@ai-sdk/provider-utils` (`isFullMediaType`, `getTopLevelMediaType`,
* `detectMediaType`) to resolve the field according to their API
* requirements.
*
* @see https://www.iana.org/assignments/media-types/media-types.xhtml

@@ -388,36 +400,3 @@ */

| {
type: 'file-url';
/**
* URL of the file.
*/
url: string;
/**
* IANA media type.
* @see https://www.iana.org/assignments/media-types/media-types.xhtml
*/
mediaType: string;
/**
* Provider-specific options.
*/
providerOptions?: SharedV4ProviderOptions;
}
| {
type: 'file-reference';
/**
* Provider-specific references for the file.
* The key is the provider name, e.g. 'openai' or 'anthropic'.
*/
providerReference: SharedV4ProviderReference;
/**
* Provider-specific options.
*/
providerOptions?: SharedV4ProviderOptions;
}
| {
/**
* Custom content part. This can be used to implement

@@ -424,0 +403,0 @@ * provider-specific content parts.

Sorry, the diff of this file is too big to display