stepfun-realtime-api
Advanced tools
Comparing version
@@ -14,3 +14,3 @@ import { RealtimeEventHandler } from "./event-handler"; | ||
connect({ model }?: { | ||
model: string; | ||
model: "step-1o-audio" | "step-audio-2" | "step-audio-2-mini"; | ||
}): Promise<void>; | ||
@@ -17,0 +17,0 @@ disconnect(): void; |
@@ -54,3 +54,3 @@ "use strict"; | ||
} | ||
async connect({ model } = { model: "step-1o-audio" }) { | ||
async connect({ model } = { "model": "step-1o-audio" }) { | ||
if (this.isConnected()) { | ||
@@ -57,0 +57,0 @@ console.warn("Already connected to WebSocket."); |
@@ -30,3 +30,3 @@ import { ClientRealtimeEvent, ServerConversationItemCreated, ServerConversationItemDeleted, ServerConversationItemInputAudioTranscriptionCompleted, ServerConversationItemInputAudioTranscriptionDelta, ServerConversationItemTruncated, ServerInputAudioBufferCleared, ServerInputAudioBufferCommitted, ServerInputAudioBufferSpeechStarted, ServerInputAudioBufferSpeechStopped, ServerRealtimeEvent, ServerResponseAudioDelta, ServerResponseAudioDone, ServerResponseAudioTranscriptDelta, ServerResponseAudioTranscriptDone, ServerResponseContentPartAdded, ServerResponseContentPartDone, ServerResponseCreated, ServerResponseDone, ServerResponseError, ServerResponseFunctionCallArgumentsDelta, ServerResponseFunctionCallArgumentsDone, ServerResponseOutputItemAdded, ServerResponseOutputItemDone, ServerSessionCreated, ServerSessionUpdated, Session } from "./event"; | ||
getSessionID(): string; | ||
connect(): Promise<void>; | ||
connect(model?: "step-1o-audio" | "step-audio-2" | "step-audio-2-mini"): Promise<void>; | ||
private registerAPIHandler; | ||
@@ -33,0 +33,0 @@ /** |
@@ -79,3 +79,6 @@ "use strict"; | ||
deleteItem(id) { | ||
this.api.send({ type: event_1.ClientEventType.ConversationItemDelete, item_id: id }); | ||
this.api.send({ | ||
type: event_1.ClientEventType.ConversationItemDelete, | ||
item_id: id, | ||
}); | ||
} | ||
@@ -98,4 +101,4 @@ async waitForSessionCreated() { | ||
} | ||
async connect() { | ||
await this.api.connect(); | ||
async connect(model = "step-1o-audio") { | ||
await this.api.connect({ model }); | ||
this.updateSession(this.session); | ||
@@ -218,3 +221,5 @@ } | ||
const item = this.conversation.itemMap.get(itemId); | ||
if (item && item.formatted.audio && item.formatted.audio.length > 0) { | ||
if (item && | ||
item.formatted.audio && | ||
item.formatted.audio.length > 0) { | ||
// 计算该音频项目的播放时长(PCM16 格式,24kHz 采样率) | ||
@@ -221,0 +226,0 @@ const audioBytes = item.formatted.audio.length; |
@@ -37,3 +37,3 @@ # Stepfun Realtime API 阶跃星辰实时语音 | ||
// 连接到服务器 | ||
await client.connect(); | ||
await client.connect("step-1o-audio"); // 可选 step-audio-2-mini / step-audio-2 | ||
console.log("✅ 已连接到 Stepfun Realtime API"); | ||
@@ -40,0 +40,0 @@ |
@@ -5,3 +5,3 @@ { | ||
"name": "stepfun-realtime-api", | ||
"version": "0.1.0", | ||
"version": "0.1.1", | ||
"scripts": { | ||
@@ -8,0 +8,0 @@ "build": "tsc && cp README.md dist/", |
@@ -37,3 +37,3 @@ # Stepfun Realtime API 阶跃星辰实时语音 | ||
// 连接到服务器 | ||
await client.connect(); | ||
await client.connect("step-1o-audio"); // 可选 step-audio-2-mini / step-audio-2 | ||
console.log("✅ 已连接到 Stepfun Realtime API"); | ||
@@ -40,0 +40,0 @@ |
@@ -24,3 +24,3 @@ import { RealtimeEventHandler } from "./event-handler"; | ||
async connect({ model }: { model: string } = { model: "step-1o-audio" }): Promise<void> { | ||
async connect({ model }: { model: "step-1o-audio" | "step-audio-2" | "step-audio-2-mini" } = { "model": "step-1o-audio" }): Promise<void> { | ||
if (this.isConnected()) { | ||
@@ -27,0 +27,0 @@ console.warn("Already connected to WebSocket."); |
@@ -73,6 +73,9 @@ import { RealtimeAPI } from "./api"; | ||
}; | ||
this.api.on("server." + ServerEventType.SessionCreated, (ev: ServerRealtimeEvent<{ session: Session }>) => { | ||
this.session.id = ev.session.id; | ||
this.sessionCreated = true; | ||
}); | ||
this.api.on( | ||
"server." + ServerEventType.SessionCreated, | ||
(ev: ServerRealtimeEvent<{ session: Session }>) => { | ||
this.session.id = ev.session.id; | ||
this.sessionCreated = true; | ||
}, | ||
); | ||
this.registerAPIHandler(); | ||
@@ -115,7 +118,15 @@ } | ||
sendUserMessage(contents: ({ type: "text"; text: string } | { type: "input_audio"; audio: string })[]) { | ||
sendUserMessage( | ||
contents: ( | ||
| { type: "text"; text: string } | ||
| { type: "input_audio"; audio: string } | ||
)[], | ||
) { | ||
if (contents.length > 0) { | ||
const event: ClientRealtimeEvent<{ | ||
item: { | ||
content: ({ type: "text"; text: string } | { type: "input_audio"; audio: string })[]; | ||
content: ( | ||
| { type: "text"; text: string } | ||
| { type: "input_audio"; audio: string } | ||
)[]; | ||
type: "message"; | ||
@@ -134,3 +145,6 @@ role: "user"; | ||
deleteItem(id: string) { | ||
this.api.send({ type: ClientEventType.ConversationItemDelete, item_id: id }); | ||
this.api.send({ | ||
type: ClientEventType.ConversationItemDelete, | ||
item_id: id, | ||
}); | ||
} | ||
@@ -142,5 +156,4 @@ | ||
} | ||
const event: ServerRealtimeEvent<{ session: Session }> = await this.api.waitForNext( | ||
"server." + ServerEventType.SessionCreated | ||
); | ||
const event: ServerRealtimeEvent<{ session: Session }> = | ||
await this.api.waitForNext("server." + ServerEventType.SessionCreated); | ||
return event.session; | ||
@@ -160,4 +173,4 @@ } | ||
async connect() { | ||
await this.api.connect(); | ||
async connect(model: "step-1o-audio" | "step-audio-2" | "step-audio-2-mini" = "step-1o-audio") { | ||
await this.api.connect({ model }); | ||
this.updateSession(this.session); | ||
@@ -186,18 +199,24 @@ } | ||
// todo | ||
this.api.on("server." + ServerEventType.InputAudioBufferSpeechStopped, (event: ServerEventType) => | ||
this.conversation.processEvent(event, this.audioBuffer) | ||
this.api.on( | ||
"server." + ServerEventType.InputAudioBufferSpeechStopped, | ||
(event: ServerEventType) => | ||
this.conversation.processEvent(event, this.audioBuffer), | ||
); | ||
this.api.on("server." + ServerEventType.InputAudioBufferSpeechStarted, (event: ServerEventType) => | ||
this.api.dispatch("interrupted", {}) | ||
this.api.on( | ||
"server." + ServerEventType.InputAudioBufferSpeechStarted, | ||
(event: ServerEventType) => this.api.dispatch("interrupted", {}), | ||
); | ||
this.api.on("server." + ServerEventType.ConversationItemCreated, (event: ServerEventType) => { | ||
const ret = this.conversation.processEvent(event); | ||
if (ret && ret.item) { | ||
this.api.dispatch("conversation.item.appended", event); | ||
if (ret.item.status === "completed") { | ||
this.api.dispatch("conversation.item.completed", event); | ||
this.api.on( | ||
"server." + ServerEventType.ConversationItemCreated, | ||
(event: ServerEventType) => { | ||
const ret = this.conversation.processEvent(event); | ||
if (ret && ret.item) { | ||
this.api.dispatch("conversation.item.appended", event); | ||
if (ret.item.status === "completed") { | ||
this.api.dispatch("conversation.item.completed", event); | ||
} | ||
} | ||
} | ||
}); | ||
}, | ||
); | ||
@@ -210,53 +229,71 @@ handleAndDispatch(ServerEventType.ConversationItemDeleted); | ||
handleAndDispatch(ServerEventType.ResponseAudioTranscriptDone); | ||
handleAndDispatch(ServerEventType.ConversationItemInputAudioTranscriptionDelta); | ||
handleAndDispatch(ServerEventType.ConversationItemInputAudioTranscriptionCompleted); | ||
handleAndDispatch( | ||
ServerEventType.ConversationItemInputAudioTranscriptionDelta, | ||
); | ||
handleAndDispatch( | ||
ServerEventType.ConversationItemInputAudioTranscriptionCompleted, | ||
); | ||
handleAndDispatch(ServerEventType.ResponseFunctionCallArgumentsDelta); | ||
handleAndDispatch(ServerEventType.ResponseFunctionCallArgumentsDone); | ||
this.api.on("server." + ServerEventType.ResponseOutputItemDone, (event: ServerEventType) => { | ||
const ret = this.conversation.processEvent(event); | ||
if (ret && ret.item) { | ||
if (ret.item.status === "completed") { | ||
this.api.dispatch(LocalEventType.ConversationItemCompleted, event); | ||
this.api.on( | ||
"server." + ServerEventType.ResponseOutputItemDone, | ||
(event: ServerEventType) => { | ||
const ret = this.conversation.processEvent(event); | ||
if (ret && ret.item) { | ||
if (ret.item.status === "completed") { | ||
this.api.dispatch(LocalEventType.ConversationItemCompleted, event); | ||
} | ||
} | ||
} | ||
}); | ||
}, | ||
); | ||
// 处理音频播放完成事件 | ||
this.api.on("server." + ServerEventType.ResponseCreated, (event: ServerResponseCreated) => { | ||
// 标记所有正在播放的响应为中断状态 | ||
for (const [responseId, state] of this.audioPlaybackState.entries()) { | ||
if (!state.isResponseDone) { | ||
state.isInterrupted = true; | ||
this.api.on( | ||
"server." + ServerEventType.ResponseCreated, | ||
(event: ServerResponseCreated) => { | ||
// 标记所有正在播放的响应为中断状态 | ||
for (const [responseId, state] of this.audioPlaybackState.entries()) { | ||
if (!state.isResponseDone) { | ||
state.isInterrupted = true; | ||
} | ||
} | ||
} | ||
// 初始化新响应的音频播放状态 | ||
this.audioPlaybackState.set(event.response.id, { | ||
responseId: event.response.id, | ||
isResponseDone: false, | ||
isInterrupted: false, | ||
}); | ||
}); | ||
// 初始化新响应的音频播放状态 | ||
this.audioPlaybackState.set(event.response.id, { | ||
responseId: event.response.id, | ||
isResponseDone: false, | ||
isInterrupted: false, | ||
}); | ||
}, | ||
); | ||
this.api.on("server." + ServerEventType.ResponseDone, (event: ServerResponseDone) => { | ||
// 标记响应完成 | ||
const playbackState = this.audioPlaybackState.get(event.response.id); | ||
if (playbackState) { | ||
playbackState.isResponseDone = true; | ||
this.checkAndEmitAudioPlaybackCompleted(event.response.id); | ||
} | ||
}); | ||
this.api.on( | ||
"server." + ServerEventType.ResponseDone, | ||
(event: ServerResponseDone) => { | ||
// 标记响应完成 | ||
const playbackState = this.audioPlaybackState.get(event.response.id); | ||
if (playbackState) { | ||
playbackState.isResponseDone = true; | ||
this.checkAndEmitAudioPlaybackCompleted(event.response.id); | ||
} | ||
}, | ||
); | ||
this.api.on("server." + ServerEventType.ResponseAudioDelta, (event: ServerResponseAudioDelta) => { | ||
const playbackState = this.audioPlaybackState.get(event.response_id); | ||
if (playbackState) { | ||
// 记录第一个音频 delta 的时间 | ||
if (!playbackState.firstDeltaTime) { | ||
playbackState.firstDeltaTime = Date.now(); | ||
this.api.on( | ||
"server." + ServerEventType.ResponseAudioDelta, | ||
(event: ServerResponseAudioDelta) => { | ||
const playbackState = this.audioPlaybackState.get(event.response_id); | ||
if (playbackState) { | ||
// 记录第一个音频 delta 的时间 | ||
if (!playbackState.firstDeltaTime) { | ||
playbackState.firstDeltaTime = Date.now(); | ||
} | ||
} | ||
} | ||
}); | ||
}, | ||
); | ||
this.api.on("server." + ServerEventType.ResponseDone, (event: ServerResponseDone) => { | ||
this.api.on( | ||
"server." + ServerEventType.ResponseDone, | ||
(event: ServerResponseDone) => { | ||
// 检查是否有音频播放状态 | ||
@@ -268,6 +305,10 @@ const playbackState = this.audioPlaybackState.get(event.response.id); | ||
// 计算播放时长(从第一个音频delta到现在的时间 + 缓冲时间) | ||
const playbackDuration = Date.now() - playbackState.firstDeltaTime + 50; // 添加50ms的缓冲时间 | ||
const playbackDuration = | ||
Date.now() - playbackState.firstDeltaTime + 50; // 添加50ms的缓冲时间 | ||
playbackState.playbackTimeoutId = setTimeout(() => { | ||
this.checkAndEmitAudioPlaybackCompleted(event.response.id, playbackDuration); | ||
this.checkAndEmitAudioPlaybackCompleted( | ||
event.response.id, | ||
playbackDuration, | ||
); | ||
}, playbackDuration); | ||
@@ -279,3 +320,4 @@ } else { | ||
} | ||
}); | ||
}, | ||
); | ||
} | ||
@@ -287,3 +329,6 @@ | ||
*/ | ||
private checkAndEmitAudioPlaybackCompleted(responseId: string, playbackDurationMs?: number) { | ||
private checkAndEmitAudioPlaybackCompleted( | ||
responseId: string, | ||
playbackDurationMs?: number, | ||
) { | ||
const playbackState = this.audioPlaybackState.get(responseId); | ||
@@ -300,3 +345,7 @@ if (playbackState && playbackState.isResponseDone) { | ||
const item = this.conversation.itemMap.get(itemId); | ||
if (item && item.formatted.audio && item.formatted.audio.length > 0) { | ||
if ( | ||
item && | ||
item.formatted.audio && | ||
item.formatted.audio.length > 0 | ||
) { | ||
// 计算该音频项目的播放时长(PCM16 格式,24kHz 采样率) | ||
@@ -363,15 +412,27 @@ const audioBytes = item.formatted.audio.length; | ||
on(event: "server.*", callback: (event: ServerRealtimeEvent<any>) => void): void; | ||
on(event: "client.*", callback: (event: ClientRealtimeEvent<any>) => void): void; | ||
on( | ||
event: "server.*", | ||
callback: (event: ServerRealtimeEvent<any>) => void, | ||
): void; | ||
on( | ||
event: "client.*", | ||
callback: (event: ClientRealtimeEvent<any>) => void, | ||
): void; | ||
on( | ||
event: LocalEventType.ConversationItemAppended, | ||
callback: (event: ServerRealtimeEvent<{ item: ServerItemType<`realtime.item`> }>) => void | ||
callback: ( | ||
event: ServerRealtimeEvent<{ item: ServerItemType<`realtime.item`> }>, | ||
) => void, | ||
): void; | ||
on( | ||
event: LocalEventType.ConversationItemCompleted, | ||
callback: (event: ServerRealtimeEvent<{ item: ServerItemType<`realtime.item`> }>) => void | ||
callback: ( | ||
event: ServerRealtimeEvent<{ item: ServerItemType<`realtime.item`> }>, | ||
) => void, | ||
): void; | ||
on( | ||
event: LocalEventType.ConversationUpdated, | ||
callback: (event: ServerRealtimeEvent<{ item: ServerItemType<`realtime.item`> }>) => void | ||
callback: ( | ||
event: ServerRealtimeEvent<{ item: ServerItemType<`realtime.item`> }>, | ||
) => void, | ||
): void; | ||
@@ -386,55 +447,115 @@ on( | ||
is_interrupted: boolean; | ||
}) => void | ||
}) => void, | ||
): void; | ||
on(event: ServerEventType.SessionCreated, callback: (event: ServerSessionCreated) => void): void; | ||
on(event: ServerEventType.SessionUpdated, callback: (event: ServerSessionUpdated) => void): void; | ||
on(event: ServerEventType.ConversationItemCreated, callback: (event: ServerConversationItemCreated) => void): void; | ||
on( | ||
event: ServerEventType.SessionCreated, | ||
callback: (event: ServerSessionCreated) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.SessionUpdated, | ||
callback: (event: ServerSessionUpdated) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ConversationItemCreated, | ||
callback: (event: ServerConversationItemCreated) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ConversationItemInputAudioTranscriptionCompleted, | ||
callback: (event: ServerConversationItemInputAudioTranscriptionCompleted) => void | ||
callback: ( | ||
event: ServerConversationItemInputAudioTranscriptionCompleted, | ||
) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ConversationItemInputAudioTranscriptionDelta, | ||
callback: (event: ServerConversationItemInputAudioTranscriptionDelta) => void | ||
callback: ( | ||
event: ServerConversationItemInputAudioTranscriptionDelta, | ||
) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.InputAudioBufferSpeechStarted, | ||
callback: (event: ServerInputAudioBufferSpeechStarted) => void | ||
callback: (event: ServerInputAudioBufferSpeechStarted) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.InputAudioBufferSpeechStopped, | ||
callback: (event: ServerInputAudioBufferSpeechStopped, inputBuffer: Buffer) => void | ||
callback: ( | ||
event: ServerInputAudioBufferSpeechStopped, | ||
inputBuffer: Buffer, | ||
) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.InputAudioBufferCommitted, | ||
callback: (event: ServerInputAudioBufferCommitted) => void | ||
callback: (event: ServerInputAudioBufferCommitted) => void, | ||
): void; | ||
on(event: ServerEventType.InputAudioBufferCleared, callback: (event: ServerInputAudioBufferCleared) => void): void; | ||
on(event: ServerEventType.ConversationItemDeleted, callback: (event: ServerConversationItemDeleted) => void): void; | ||
on( | ||
event: ServerEventType.InputAudioBufferCleared, | ||
callback: (event: ServerInputAudioBufferCleared) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ConversationItemDeleted, | ||
callback: (event: ServerConversationItemDeleted) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ConversationItemTruncated, | ||
callback: (event: ServerConversationItemTruncated) => void | ||
callback: (event: ServerConversationItemTruncated) => void, | ||
): void; | ||
on(event: ServerEventType.ResponseAudioDelta, callback: (event: ServerResponseAudioDelta) => void): void; | ||
on(event: ServerEventType.ResponseAudioDone, callback: (event: ServerResponseAudioDone) => void): void; | ||
on(event: ServerEventType.ResponseContentPartAdded, callback: (event: ServerResponseContentPartAdded) => void): void; | ||
on(event: ServerEventType.ResponseContentPartDone, callback: (event: ServerResponseContentPartDone) => void): void; | ||
on(event: ServerEventType.ResponseFunctionCallArgumentsDone, callback: (event: ServerResponseFunctionCallArgumentsDone) => void): void; | ||
on(event: ServerEventType.ResponseFunctionCallArgumentsDelta, callback: (event: ServerResponseFunctionCallArgumentsDelta) => void): void; | ||
on( | ||
event: ServerEventType.ResponseAudioDelta, | ||
callback: (event: ServerResponseAudioDelta) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseAudioDone, | ||
callback: (event: ServerResponseAudioDone) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseContentPartAdded, | ||
callback: (event: ServerResponseContentPartAdded) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseContentPartDone, | ||
callback: (event: ServerResponseContentPartDone) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseFunctionCallArgumentsDone, | ||
callback: (event: ServerResponseFunctionCallArgumentsDone) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseFunctionCallArgumentsDelta, | ||
callback: (event: ServerResponseFunctionCallArgumentsDelta) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseAudioTranscriptDelta, | ||
callback: (event: ServerResponseAudioTranscriptDelta) => void | ||
callback: (event: ServerResponseAudioTranscriptDelta) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseAudioTranscriptDone, | ||
callback: (event: ServerResponseAudioTranscriptDone) => void | ||
callback: (event: ServerResponseAudioTranscriptDone) => void, | ||
): void; | ||
on(event: ServerEventType.ResponseOutputItemAdded, callback: (event: ServerResponseOutputItemAdded) => void): void; | ||
on(event: ServerEventType.ResponseOutputItemDone, callback: (event: ServerResponseOutputItemDone) => void): void; | ||
on(event: ServerEventType.ResponseCreated, callback: (event: ServerResponseCreated) => void): void; | ||
on(event: ServerEventType.ResponseDone, callback: (event: ServerResponseDone) => void): void; | ||
on(event: ServerEventType.Error, callback: (event: ServerResponseError) => void): void; | ||
on( | ||
event: ServerEventType | LocalEventType | ClientEventType | "server.*" | "client.*", | ||
callback: (event: any, extra?: any) => void | ||
event: ServerEventType.ResponseOutputItemAdded, | ||
callback: (event: ServerResponseOutputItemAdded) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseOutputItemDone, | ||
callback: (event: ServerResponseOutputItemDone) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseCreated, | ||
callback: (event: ServerResponseCreated) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.ResponseDone, | ||
callback: (event: ServerResponseDone) => void, | ||
): void; | ||
on( | ||
event: ServerEventType.Error, | ||
callback: (event: ServerResponseError) => void, | ||
): void; | ||
on( | ||
event: | ||
| ServerEventType | ||
| LocalEventType | ||
| ClientEventType | ||
| "server.*" | ||
| "client.*", | ||
callback: (event: any, extra?: any) => void, | ||
): void { | ||
@@ -452,3 +573,11 @@ if (Object.values(ServerEventType).includes(event as any)) { | ||
off(event: LocalEventType | ServerEventType | ClientEventType | "server.*" | "client.*", callback: Function): void { | ||
off( | ||
event: | ||
| LocalEventType | ||
| ServerEventType | ||
| ClientEventType | ||
| "server.*" | ||
| "client.*", | ||
callback: Function, | ||
): void { | ||
if (Object.values(ServerEventType).includes(event as any)) { | ||
@@ -455,0 +584,0 @@ this.api.off("server." + event, callback); |
129435
1.07%2830
4.97%