feat(speech): add realtime prompt dictation support

Add server-backed realtime transcription for prompt voice input and expose speech settings to choose realtime mode and models.
2026-03-19 11:32:45 +00:00
parent 5948e25b97
commit 2354051297
28 changed files with 1571 additions and 262 deletions
--- a/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts
@@ -0,0 +1,325 @@
+import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
+import { showAlertDialog } from "../../stores/alerts"
+import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client"
+import { useI18n } from "../../lib/i18n"
+import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
+import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream"
+import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion"
+
+/** Inputs the realtime dictation hook needs from the prompt editor. */
+interface UsePromptRealtimeVoiceInputOptions {
+  /** Current prompt text; read when the insertion anchor is captured at the start of a recording. */
+  prompt: Accessor<string>
+  /** Writes the prompt text; live (partial) transcripts are written with `{ persistDraft: false }`. */
+  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
+  /** Returns the prompt textarea, or null; used to read the selection and to restore the cursor. */
+  getTextarea: () => HTMLTextAreaElement | null
+  /** One of the conditions checked by `canUseVoiceInput`. */
+  enabled: Accessor<boolean>
+  /** While true, `toggleRecording` will not start a new recording. */
+  disabled: Accessor<boolean>
+}
+
+/** Phase of the dictation session as exposed through the `state` signal. */
+type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing"
+
+/**
+ * Delay (ms) handed to `scheduleFinalizeClose` once finalization has been requested,
+ * i.e. how long the session is kept open before it is cleaned up.
+ */
+const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000
+
+export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) {
+  const { t } = useI18n()
+  // Reactive values exposed to the caller: current phase and elapsed recording time (ms).
+  const [state, setState] = createSignal<RealtimeVoiceState>("idle")
+  const [elapsedMs, setElapsedMs] = createSignal(0)
+
+  // Non-reactive bookkeeping for the session in flight.
+  // Server session id; assigned in startRecording, cleared in cleanupSession.
+  let activeSessionId: string | null = null
+  // Event stream returned by serverApi.connectRealtimeSpeechEvents.
+  let eventSource: EventSource | null = null
+  // Handle returned by createRealtimePcmStream; stopped when recording ends.
+  let pcmStream: RealtimePcmStreamHandle | null = null
+  // Promise chain through which audio chunks are sent one after another.
+  let audioQueue: Promise<void> = Promise.resolve()
+  // Interval handle of the elapsed-time ticker (see startTimer / stopTimer).
+  let timerId: number | undefined
+  // Date.now() captured right before the ticker is started.
+  let recordingStartedAt = 0
+  // Timeout handle armed by scheduleFinalizeClose.
+  let finalizeTimerId: number | undefined
+  // Prompt text and selection captured by captureAnchor when a recording starts.
+  let anchor = createPromptVoiceAnchor("", 0, 0)
+  // Text accumulated from transcript.final events.
+  let finalTranscript = ""
+  // Text of the most recent transcript.partial event.
+  let liveTranscript = ""
+  // itemId of the most recent transcript.partial event.
+  let activeLiveItemId: string | null = null
+  // True while cleanupSession is running; cleanup, error handling and audio chunks are no-ops meanwhile.
+  let closing = false
+
+  // Trigger loading of the speech capabilities; the returned promise is deliberately not awaited.
+  createEffect(() => {
+    void loadSpeechCapabilities()
+  })
+
+  // Cancel any recording when the owning reactive scope is disposed.
+  onCleanup(() => {
+    cancelRecording()
+  })
+
+  /**
+   * Reports whether the environment exposes the browser APIs this hook relies on:
+   * an AudioContext (standard or webkit-prefixed), microphone capture and EventSource.
+   */
+  const isSupported = () => {
+    if (typeof window === "undefined") return false
+    const audioContextCtor = window.AudioContext || (window as any).webkitAudioContext
+    if (!audioContextCtor) return false
+    if (!navigator.mediaDevices?.getUserMedia) return false
+    return typeof EventSource !== "undefined"
+  }
+
+  /**
+   * True only when the feature is enabled, the browser APIs are present and the
+   * loaded speech capabilities report every flag needed for realtime transcription.
+   */
+  const canUseVoiceInput = () => {
+    const caps = speechCapabilities()
+    if (!options.enabled()) return false
+    if (!isSupported()) return false
+    const serverReady =
+      caps?.available && caps?.configured && caps?.supportsStt && caps?.supportsRealtimeTranscription
+    return Boolean(serverReady)
+  }
+
+  /**
+   * Button handler: stops the recording when one is connecting or listening,
+   * otherwise starts one if voice input is usable, not disabled and not finalizing.
+   * A failed start is cleaned up (prompt reverted) and reported in an alert dialog.
+   */
+  async function toggleRecording(): Promise<void> {
+    const phase = state()
+    const isActive = phase === "listening" || phase === "connecting"
+    if (isActive) {
+      await stopRecording()
+      return
+    }
+
+    const blocked = !canUseVoiceInput() || options.disabled() || phase === "finalizing"
+    if (blocked) return
+
+    try {
+      await startRecording()
+    } catch (failure) {
+      await cleanupSession({ revertPrompt: true, closeRemote: true })
+      showAlertDialog(t("promptInput.voiceInput.error.connection"), {
+        title: t("promptInput.voiceInput.error.title"),
+        detail: failure instanceof Error ? failure.message : String(failure),
+        variant: "error",
+      })
+    }
+  }
+
+  /**
+   * Opens a realtime transcription session and starts streaming microphone audio.
+   *
+   * The function awaits twice (session creation, PCM stream creation). Cancellation,
+   * stopRecording or an error handler can run during either await, so ownership is
+   * re-checked afterwards: a start that is no longer current releases what it just
+   * acquired and returns without touching the shared session state. Without these
+   * checks a stale continuation would leave the microphone and the remote session
+   * open and force the state back to "listening".
+   *
+   * Rejects when session creation or PCM stream setup fails; toggleRecording
+   * catches that, cleans up and shows the error.
+   */
+  async function startRecording() {
+    if (!isSupported()) {
+      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
+        title: t("promptInput.voiceInput.error.title"),
+        variant: "error",
+      })
+      return
+    }
+
+    resetTranscriptState()
+    captureAnchor()
+    setState("connecting")
+    setElapsedMs(0)
+
+    const created = await serverApi.createRealtimeSpeechSession({
+      language: detectLanguage(),
+    })
+    if (closing || state() !== "connecting" || activeSessionId !== null) {
+      // Cancelled or superseded while the request was in flight: nobody will ever
+      // own this session, so close it here (best effort) instead of leaking it.
+      await serverApi.closeRealtimeSpeechSession(created.sessionId).catch(() => undefined)
+      return
+    }
+    activeSessionId = created.sessionId
+    connectEventStream(created.sessionId)
+
+    const stream = await createRealtimePcmStream({
+      onChunk: (audioBase64) => {
+        const sessionId = activeSessionId
+        if (!sessionId || closing) return
+        // Chain the uploads so chunks are sent strictly one after another.
+        audioQueue = audioQueue
+          .then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 }))
+          .catch((error) => {
+            handleRealtimeError(error)
+          })
+      },
+    })
+    if (closing || state() !== "connecting" || activeSessionId !== created.sessionId) {
+      // The session was stopped or torn down while the microphone was being opened.
+      // Whoever did that owns the rest of the teardown; only release the stream.
+      await stream.stop().catch(() => undefined)
+      return
+    }
+    pcmStream = stream
+
+    recordingStartedAt = Date.now()
+    startTimer()
+    setState("listening")
+  }
+
+  /**
+   * Ends audio capture and asks the server to finalize the transcript.
+   *
+   * Does nothing unless a session exists and the state is "listening" or
+   * "connecting". After finalization is requested, a close is scheduled so the
+   * session is cleaned up even if no further events arrive.
+   */
+  async function stopRecording() {
+    const sessionId = activeSessionId
+    if (!sessionId || (state() !== "listening" && state() !== "connecting")) return
+
+    setState("finalizing")
+    stopTimer()
+
+    if (pcmStream) {
+      const stream = pcmStream
+      pcmStream = null
+      // Best effort, same as in cleanupSession: a failure to stop the capture must
+      // not strand the hook in "finalizing" or prevent the transcript from finalizing.
+      await stream.stop().catch(() => undefined)
+    }
+
+    // The session may have been torn down while the stream was stopping.
+    if (activeSessionId !== sessionId) return
+
+    try {
+      await audioQueue.catch(() => undefined)
+      if (activeSessionId !== sessionId) return
+      await serverApi.finalizeRealtimeSpeechSession(sessionId)
+      // Only arm the close timer for the session that was finalized; otherwise a
+      // stale timer could later tear down a newer recording.
+      if (activeSessionId !== sessionId) return
+      scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS)
+    } catch (error) {
+      handleRealtimeError(error)
+    }
+  }
+
+  /**
+   * Aborts the current recording, closes the remote session and restores the
+   * prompt text captured when the recording started.
+   *
+   * This is also invoked from onCleanup, so it must be a no-op when nothing is in
+   * progress: otherwise the prompt would be overwritten with the anchor of an
+   * earlier recording (or the initial placeholder anchor), discarding user text.
+   */
+  function cancelRecording() {
+    const nothingActive = state() === "idle" && !activeSessionId && !pcmStream && !eventSource
+    if (nothingActive) return
+    void cleanupSession({ revertPrompt: true, closeRemote: true })
+  }
+
+  /**
+   * Replaces any existing event stream with one for `sessionId`. Events go to
+   * handleEvent; a stream error is reported as a connection error unless a
+   * cleanup is already running.
+   */
+  function connectEventStream(sessionId: string) {
+    if (eventSource) eventSource.close()
+
+    const onStreamError = () => {
+      if (!closing) {
+        handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection")))
+      }
+    }
+
+    eventSource = serverApi.connectRealtimeSpeechEvents(sessionId, (event) => handleEvent(event), onStreamError)
+  }
+
+  /**
+   * Dispatches one server event:
+   * - session.ready: ignored
+   * - session.error: reported through handleRealtimeError
+   * - transcript.partial: becomes the live text and is rendered without persisting the draft
+   * - transcript.final: appended to the final text, rendered and persisted; while
+   *   finalizing, the pending close is re-armed with a short delay
+   * - session.closed: local cleanup that keeps the prompt and skips the remote close
+   */
+  function handleEvent(event: SpeechRealtimeEvent) {
+    switch (event.type) {
+      case "session.ready":
+        return
+
+      case "session.error":
+        handleRealtimeError(new Error(event.message))
+        return
+
+      case "transcript.partial":
+        activeLiveItemId = event.itemId
+        liveTranscript = event.text
+        renderPrompt(false)
+        return
+
+      case "transcript.final": {
+        if (activeLiveItemId === event.itemId) {
+          activeLiveItemId = null
+        }
+        liveTranscript = ""
+        finalTranscript = appendVoiceTranscript(finalTranscript, event.text)
+        renderPrompt(true)
+        if (state() === "finalizing") {
+          scheduleFinalizeClose(250)
+        }
+        return
+      }
+
+      case "session.closed":
+        void cleanupSession({ revertPrompt: false, closeRemote: false })
+        return
+
+      default:
+        return
+    }
+  }
+
+  /**
+   * Records the current prompt text and selection as the insertion anchor.
+   * Without a textarea, both selection bounds fall back to the end of the prompt.
+   */
+  function captureAnchor() {
+    const field = options.getTextarea()
+    const text = options.prompt()
+    let selectionStart = text.length
+    let selectionEnd = text.length
+    if (field) {
+      selectionStart = field.selectionStart
+      selectionEnd = field.selectionEnd
+    }
+    anchor = createPromptVoiceAnchor(text, selectionStart, selectionEnd)
+  }
+
+  /**
+   * Writes the prompt with the transcript inserted at the anchor: final text
+   * followed by the trimmed live text, separated by one space when both are
+   * non-empty. `persistDraft` false suppresses draft persistence for this write.
+   */
+  function renderPrompt(persistDraft: boolean) {
+    const live = liveTranscript.trim()
+    const inserted = finalTranscript && live ? `${finalTranscript} ${live}` : finalTranscript || live
+    const result = buildPromptWithInsertedTranscript(anchor, inserted)
+    const writeOptions = persistDraft ? undefined : { persistDraft: false }
+    options.setPrompt(result.value, writeOptions)
+    syncTextareaCursor(result.cursor)
+  }
+
+  /**
+   * Focuses the textarea and collapses its selection to `cursor`. The work is
+   * deferred to a microtask and the textarea is looked up again at that point.
+   */
+  function syncTextareaCursor(cursor: number) {
+    if (!options.getTextarea()) return
+
+    queueMicrotask(() => {
+      const field = options.getTextarea()
+      if (field) {
+        field.focus()
+        field.setSelectionRange(cursor, cursor)
+      }
+    })
+  }
+
+  /**
+   * (Re)arms the timer that closes the session after `delayMs` milliseconds,
+   * keeping the prompt and closing the remote session. A pending timer is replaced.
+   */
+  function scheduleFinalizeClose(delayMs: number) {
+    const pending = finalizeTimerId
+    if (pending !== undefined) window.clearTimeout(pending)
+
+    const closeSession = () => {
+      void cleanupSession({ revertPrompt: false, closeRemote: true })
+    }
+    finalizeTimerId = window.setTimeout(closeSession, delayMs)
+  }
+
+  /**
+   * Tears down the current session: timers, event stream, microphone stream,
+   * pending audio uploads and (optionally) the remote session, then settles the
+   * prompt and returns the hook to "idle".
+   *
+   * @param cleanupOptions.revertPrompt restore the prompt captured at recording start
+   *   instead of keeping the transcript
+   * @param cleanupOptions.closeRemote also ask the server to close the session
+   *
+   * Calls made while a cleanup is already running return immediately. The state
+   * reset lives in `finally` so that an exception during teardown (for example
+   * from the prompt setter) cannot leave `closing` stuck at true, which would
+   * silently disable every later cleanup, error report and audio chunk.
+   */
+  async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) {
+    if (closing) return
+    closing = true
+
+    try {
+      if (finalizeTimerId !== undefined) {
+        window.clearTimeout(finalizeTimerId)
+        finalizeTimerId = undefined
+      }
+
+      stopTimer()
+
+      const sessionId = activeSessionId
+      activeSessionId = null
+
+      eventSource?.close()
+      eventSource = null
+
+      if (pcmStream) {
+        const stream = pcmStream
+        pcmStream = null
+        await stream.stop().catch(() => undefined)
+      }
+
+      // Let in-flight uploads settle before the remote session is closed.
+      await audioQueue.catch(() => undefined)
+      audioQueue = Promise.resolve()
+
+      if (cleanupOptions.closeRemote && sessionId) {
+        await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined)
+      }
+
+      // When keeping the text and no final transcript arrived, promote the live text.
+      if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) {
+        finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript)
+        liveTranscript = ""
+      }
+
+      if (cleanupOptions.revertPrompt) {
+        options.setPrompt(anchor.prompt)
+      } else if (finalTranscript.trim()) {
+        renderPrompt(true)
+      }
+    } finally {
+      resetTranscriptState()
+      setState("idle")
+      setElapsedMs(0)
+      closing = false
+    }
+  }
+
+  /** Clears the accumulated final text, the live text and the live item id. */
+  function resetTranscriptState() {
+    activeLiveItemId = null
+    finalTranscript = liveTranscript = ""
+  }
+
+  /**
+   * Aborts the session (prompt reverted, remote session closed) and shows the
+   * connection error dialog with the error's message as detail. Ignored while a
+   * cleanup is already running.
+   */
+  function handleRealtimeError(error: unknown) {
+    if (!closing) {
+      void cleanupSession({ revertPrompt: true, closeRemote: true })
+      showAlertDialog(t("promptInput.voiceInput.error.connection"), {
+        title: t("promptInput.voiceInput.error.title"),
+        detail: error instanceof Error ? error.message : String(error),
+        variant: "error",
+      })
+    }
+  }
+
+  /** Restarts the ticker that refreshes `elapsedMs` every 250 ms from `recordingStartedAt`. */
+  function startTimer() {
+    stopTimer()
+    const tick = () => {
+      setElapsedMs(Date.now() - recordingStartedAt)
+    }
+    timerId = window.setInterval(tick, 250)
+  }
+
+  /** Stops the elapsed-time ticker if it is running. */
+  function stopTimer() {
+    if (timerId === undefined) return
+    window.clearInterval(timerId)
+    timerId = undefined
+  }
+
+  // Public surface of the hook: reactive state, actions and derived UI helpers.
+  return {
+    state,
+    elapsedMs,
+    canUseVoiceInput,
+    toggleRecording,
+    cancelRecording,
+    isRecording: () => {
+      const phase = state()
+      return phase === "connecting" || phase === "listening"
+    },
+    isTranscribing: () => state() === "finalizing",
+    buttonTitle: () => {
+      switch (state()) {
+        case "connecting":
+          return t("promptInput.voiceInput.connecting.title")
+        case "listening":
+          return t("promptInput.voiceInput.stop.title")
+        case "finalizing":
+          return t("promptInput.voiceInput.transcribing.title")
+        default:
+          return t("promptInput.voiceInput.start.title")
+      }
+    },
+  }
+}
+
+/**
+ * Derives the primary language subtag (the part before the first "-") from
+ * `navigator.language`.
+ *
+ * @returns the trimmed subtag, or undefined when there is no navigator, when it
+ *   exposes no string `language` (partial navigator shims), or when the subtag is empty
+ */
+function detectLanguage(): string | undefined {
+  if (typeof navigator === "undefined") return undefined
+  // Typed as string by the DOM lib, but not guaranteed by every runtime shim.
+  const locale: unknown = navigator.language
+  if (typeof locale !== "string") return undefined
+  const [primary] = locale.split("-")
+  return primary?.trim() || undefined
+}