feat(speech): add realtime prompt dictation support

Add server-backed realtime transcription for prompt voice input and expose speech settings to choose realtime mode and models.
This commit is contained in:
Shantur Rathore
2026-03-19 11:32:45 +00:00
parent cc2f6976f6
commit f9b5e2b529
29 changed files with 1572 additions and 263 deletions

View File

@@ -8,6 +8,8 @@ import type {
FileSystemListResponse,
InstanceData,
SpeechCapabilitiesResponse,
SpeechRealtimeEvent,
SpeechRealtimeSessionResponse,
SpeechSynthesisResponse,
SpeechTranscriptionResponse,
ServerMeta,
@@ -39,6 +41,10 @@ export function buildBackgroundProcessStreamUrl(instanceId: string, processId: s
return buildAbsoluteUrl(`/workspaces/${encodedInstanceId}/plugin/background-processes/${encodedProcessId}/stream`)
}
export function buildRealtimeSpeechEventsUrl(sessionId: string): string {
return buildAbsoluteUrl(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/events`)
}
function buildEventsUrl(base: string | undefined, path: string): string {
if (path.startsWith("http://") || path.startsWith("https://")) {
return path
@@ -241,6 +247,29 @@ export const serverApi = {
fetchSpeechCapabilities(): Promise<SpeechCapabilitiesResponse> {
return request<SpeechCapabilitiesResponse>("/api/speech/capabilities")
},
createRealtimeSpeechSession(payload?: { language?: string; prompt?: string }): Promise<SpeechRealtimeSessionResponse> {
return request<SpeechRealtimeSessionResponse>("/api/speech/realtime/sessions", {
method: "POST",
body: JSON.stringify(payload ?? {}),
})
},
appendRealtimeSpeechAudio(sessionId: string, payload: { audioBase64: string }): Promise<void> {
return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/audio`, {
method: "POST",
body: JSON.stringify(payload),
})
},
finalizeRealtimeSpeechSession(sessionId: string): Promise<void> {
return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/finalize`, {
method: "POST",
body: JSON.stringify({}),
})
},
closeRealtimeSpeechSession(sessionId: string): Promise<void> {
return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}`, {
method: "DELETE",
})
},
transcribeAudio(payload: {
audioBase64: string
mimeType: string
@@ -332,21 +361,34 @@ export const serverApi = {
},
connectEvents(onEvent: (event: WorkspaceEventPayload) => void, onError?: () => void) {
sseLogger.info(`Connecting to ${EVENTS_URL}`)
const source = new EventSource(EVENTS_URL, { withCredentials: true } as any)
source.onmessage = (event) => {
try {
const payload = JSON.parse(event.data) as WorkspaceEventPayload
onEvent(payload)
} catch (error) {
sseLogger.error("Failed to parse event", error)
}
}
source.onerror = () => {
sseLogger.warn("EventSource error, closing stream")
onError?.()
}
return source
return connectEventSource(EVENTS_URL, onEvent, onError)
},
connectRealtimeSpeechEvents(
sessionId: string,
onEvent: (event: SpeechRealtimeEvent) => void,
onError?: () => void,
) {
const url = buildRealtimeSpeechEventsUrl(sessionId)
sseLogger.info(`Connecting to ${url}`)
return connectEventSource(url, onEvent, onError)
},
}
export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType }
function connectEventSource<T>(url: string, onEvent: (event: T) => void, onError?: () => void) {
const source = new EventSource(url, { withCredentials: true } as any)
source.onmessage = (event) => {
try {
const payload = JSON.parse(event.data) as T
onEvent(payload)
} catch (error) {
sseLogger.error("Failed to parse event", error)
}
}
source.onerror = () => {
sseLogger.warn("EventSource error, closing stream")
onError?.()
}
return source
}
export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType, SpeechRealtimeEvent }