feat(speech): add realtime prompt dictation support

Add server-backed realtime transcription for prompt voice input and expose speech settings to choose realtime mode and models.
2026-03-19 11:32:45 +00:00
parent cc2f6976f6
commit f9b5e2b529
29 changed files with 1572 additions and 263 deletions
--- a/packages/ui/src/components/prompt-input.tsx
+++ b/packages/ui/src/components/prompt-input.tsx
@@ -11,7 +11,7 @@ import { getCommands } from "../stores/commands"
 import { showAlertDialog } from "../stores/alerts"
 import { useI18n } from "../lib/i18n"
 import { getLogger } from "../lib/logger"
-import { preferences } from "../stores/preferences"
+import { preferences, useConfig } from "../stores/preferences"
 import type { ExpandState, PromptInputApi, PromptInputProps, PromptInsertMode, PromptMode } from "./prompt-input/types"
 import { usePromptState } from "./prompt-input/usePromptState"
 import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
@@ -22,6 +22,7 @@ const log = getLogger("actions")

 export default function PromptInput(props: PromptInputProps) {
  const { t } = useI18n()
+  const { serverSettings } = useConfig()
  const [, setIsFocused] = createSignal(false)
  const [mode, setMode] = createSignal<PromptMode>("normal")
  const [expandState, setExpandState] = createSignal<ExpandState>("normal")
@@ -418,6 +419,7 @@ export default function PromptInput(props: PromptInputProps) {
    getTextarea: () => textareaRef ?? null,
    enabled: () => preferences().showPromptVoiceInput,
    disabled: () => Boolean(props.disabled),
+    useRealtime: () => serverSettings().speech.useRealtime,
  })
  const showVoiceInput = () =>
    preferences().showPromptVoiceInput &&
--- a/packages/ui/src/components/prompt-input/createRealtimePcmStream.ts
+++ b/packages/ui/src/components/prompt-input/createRealtimePcmStream.ts
@@ -0,0 +1,110 @@
+/** Handle returned by `createRealtimePcmStream` for ending an active capture. */
+export interface RealtimePcmStreamHandle {
+  /** Tears down the audio graph, stops the microphone tracks, and closes the audio context. */
+  stop(): Promise<void>
+}
+
+/** Options accepted by `createRealtimePcmStream`. */
+interface CreateRealtimePcmStreamOptions {
+  /** Receives each captured buffer as base64-encoded 16-bit PCM; a returned promise is not awaited. */
+  onChunk: (audioBase64: string) => void | Promise<void>
+}
+
+// Sample rate (Hz) that captured audio is resampled to before it is encoded.
+const TARGET_SAMPLE_RATE = 24000
+// Buffer size, in sample frames, requested from createScriptProcessor.
+const PROCESSOR_BUFFER_SIZE = 4096
+
+/**
+ * Opens the microphone and streams its audio as base64-encoded 16-bit PCM chunks.
+ *
+ * Audio flows source -> ScriptProcessor -> muted gain -> destination. Each processed buffer
+ * (first channel only) is resampled to TARGET_SAMPLE_RATE, quantized to 16-bit, base64-encoded,
+ * and handed to `options.onChunk`.
+ *
+ * @param options - Supplies `onChunk`, invoked once per processed buffer; its result is not awaited.
+ * @returns A handle whose `stop()` releases the audio graph, microphone tracks, and audio context.
+ * @throws If microphone access fails, AudioContext is unavailable, or the audio graph cannot be set up.
+ *   In every failure case after the microphone was opened, its tracks are stopped before rethrowing.
+ */
+export async function createRealtimePcmStream(
+  options: CreateRealtimePcmStreamOptions,
+): Promise<RealtimePcmStreamHandle> {
+  const stream = await navigator.mediaDevices.getUserMedia({
+    audio: {
+      channelCount: 1,
+      echoCancellation: true,
+      noiseSuppression: true,
+      autoGainControl: true,
+    },
+  })
+  const releaseMicrophone = () => stream.getTracks().forEach((track) => track.stop())
+
+  const AudioContextCtor = window.AudioContext || (window as any).webkitAudioContext
+  if (!AudioContextCtor) {
+    releaseMicrophone()
+    throw new Error("AudioContext is not supported in this browser.")
+  }
+
+  let context: AudioContext
+  try {
+    context = new AudioContextCtor()
+  } catch (error) {
+    // The microphone is already open at this point; do not leave it running.
+    releaseMicrophone()
+    throw error
+  }
+
+  try {
+    await context.resume()
+
+    const source = context.createMediaStreamSource(stream)
+    const processor = context.createScriptProcessor(PROCESSOR_BUFFER_SIZE, 1, 1)
+    // Gain 0 keeps the microphone from being played back through the speakers.
+    // NOTE(review): presumably routed to the destination so the processor keeps firing — confirm.
+    const sink = context.createGain()
+    sink.gain.value = 0
+
+    source.connect(processor)
+    processor.connect(sink)
+    sink.connect(context.destination)
+
+    processor.onaudioprocess = (event) => {
+      const input = event.inputBuffer.getChannelData(0)
+      const resampled = downsampleBuffer(input, context.sampleRate, TARGET_SAMPLE_RATE)
+      if (resampled.length === 0) return
+      const pcm16 = floatTo16BitPcm(resampled)
+      void options.onChunk(base64EncodePcm16(pcm16))
+    }
+
+    let stopped = false
+    return {
+      async stop() {
+        // Idempotent: later calls are no-ops.
+        if (stopped) return
+        stopped = true
+        processor.onaudioprocess = null
+        source.disconnect()
+        processor.disconnect()
+        sink.disconnect()
+        releaseMicrophone()
+        await context.close()
+      },
+    }
+  } catch (error) {
+    // Graph setup failed after the microphone and context were created: release both.
+    releaseMicrophone()
+    await context.close().catch(() => undefined)
+    throw error
+  }
+}
+
+/**
+ * Resamples a block of float samples by averaging the input samples that map onto each output slot.
+ *
+ * When an output slot maps onto no input samples (output rate above input rate), the nearest
+ * preceding input sample is repeated instead.
+ *
+ * @param buffer - Input samples.
+ * @param inputSampleRate - Sample rate of `buffer`.
+ * @param outputSampleRate - Sample rate to convert to.
+ * @returns A new array: empty for empty input, a copy of `buffer` when the rates already match.
+ */
+function downsampleBuffer(buffer: Float32Array, inputSampleRate: number, outputSampleRate: number): Float32Array {
+  // Without this guard the Math.max(1, …) below would emit a single NaN sample for empty input.
+  if (buffer.length === 0) {
+    return new Float32Array(0)
+  }
+
+  if (inputSampleRate === outputSampleRate) {
+    return buffer.slice()
+  }
+
+  const sampleRateRatio = inputSampleRate / outputSampleRate
+  const outputLength = Math.max(1, Math.round(buffer.length / sampleRateRatio))
+  const output = new Float32Array(outputLength)
+  let inputIndex = 0
+
+  for (let outputIndex = 0; outputIndex < outputLength; outputIndex += 1) {
+    // Input range [inputIndex, nextInputIndex) is averaged into this output sample.
+    const nextInputIndex = Math.min(buffer.length, Math.round((outputIndex + 1) * sampleRateRatio))
+    let sum = 0
+    let count = 0
+    for (let i = inputIndex; i < nextInputIndex; i += 1) {
+      sum += buffer[i]
+      count += 1
+    }
+    output[outputIndex] = count > 0 ? sum / count : buffer[Math.min(buffer.length - 1, inputIndex)]
+    inputIndex = nextInputIndex
+  }
+
+  return output
+}
+
+/**
+ * Quantizes float samples to signed 16-bit PCM.
+ *
+ * Samples are clamped to [-1, 1]; negative values scale by 0x8000 and non-negative values by
+ * 0x7fff so both ends of the 16-bit range are reachable.
+ */
+function floatTo16BitPcm(buffer: Float32Array): Int16Array {
+  return Int16Array.from(buffer, (value) => {
+    const clamped = Math.min(1, Math.max(-1, value))
+    const scale = clamped < 0 ? 0x8000 : 0x7fff
+    return Math.round(clamped * scale)
+  })
+}
+
+/**
+ * Base64-encodes the raw bytes of a 16-bit PCM buffer.
+ *
+ * Only the bytes the view actually covers are encoded, so subarray views over a larger backing
+ * buffer are handled correctly. Bytes are emitted in the platform's native byte order.
+ *
+ * @param buffer - PCM samples to encode.
+ * @returns The base64 string, or an empty string for an empty buffer.
+ */
+function base64EncodePcm16(buffer: Int16Array): string {
+  const bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength)
+  let binary = ""
+  // Convert in slices so the spread below never exceeds the engine's argument-count limit.
+  const chunkSize = 0x8000
+  for (let offset = 0; offset < bytes.length; offset += chunkSize) {
+    const chunk = bytes.subarray(offset, offset + chunkSize)
+    binary += String.fromCharCode(...chunk)
+  }
+  return btoa(binary)
+}
--- a/packages/ui/src/components/prompt-input/promptVoiceInsertion.ts
+++ b/packages/ui/src/components/prompt-input/promptVoiceInsertion.ts
@@ -0,0 +1,36 @@
+/** Snapshot of the prompt and the range within it that a voice transcript replaces. */
+export interface PromptVoiceAnchor {
+  /** Prompt text at the time the anchor was captured. */
+  prompt: string
+  /** Offset where the replaced range begins; text before it is kept. */
+  start: number
+  /** Offset where the replaced range ends; text from here on is kept. */
+  end: number
+}
+
+/** Bundles a prompt snapshot with the `start`/`end` offsets of the range to replace. */
+export function createPromptVoiceAnchor(prompt: string, start: number, end: number): PromptVoiceAnchor {
+  const anchor: PromptVoiceAnchor = { prompt: prompt, start: start, end: end }
+  return anchor
+}
+
+/**
+ * Replaces the anchored range of the prompt with a transcript.
+ *
+ * The transcript is trimmed, and a single space is added on either side where the neighbouring
+ * text exists and does not already provide whitespace. A blank transcript simply removes the
+ * anchored range.
+ *
+ * @param anchor - Prompt snapshot and the range to replace.
+ * @param insertedText - Transcript text to insert.
+ * @returns The new prompt value and the caret offset just after the inserted transcript.
+ */
+export function buildPromptWithInsertedTranscript(anchor: PromptVoiceAnchor, insertedText: string): { value: string; cursor: number } {
+  const head = anchor.prompt.slice(0, anchor.start)
+  const tail = anchor.prompt.slice(anchor.end)
+  const transcript = insertedText.trim()
+
+  if (transcript.length === 0) {
+    return { value: head + tail, cursor: head.length }
+  }
+
+  const needsLeadingSpace = head.length > 0 && !/\s$/.test(head)
+  const needsTrailingSpace = tail.length > 0 && !/^\s/.test(tail)
+  const upToCursor = head + (needsLeadingSpace ? " " : "") + transcript
+
+  return {
+    value: upToCursor + (needsTrailingSpace ? " " : "") + tail,
+    cursor: upToCursor.length,
+  }
+}
+
+/**
+ * Appends a transcript segment to the text accumulated so far.
+ *
+ * Blank segments leave `current` untouched, a blank `current` is replaced by the trimmed segment,
+ * and otherwise the segment is joined with a single space unless `current` already ends in whitespace.
+ */
+export function appendVoiceTranscript(current: string, next: string): string {
+  const segment = next.trim()
+  if (segment.length === 0) {
+    return current
+  }
+  if (current.trim().length === 0) {
+    return segment
+  }
+  const separator = /\s$/.test(current) ? "" : " "
+  return current + separator + segment
+}
--- a/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts
@@ -0,0 +1,241 @@
+import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
+import { showAlertDialog } from "../../stores/alerts"
+import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
+import { serverApi } from "../../lib/api-client"
+import { useI18n } from "../../lib/i18n"
+import { createPromptVoiceAnchor, buildPromptWithInsertedTranscript } from "./promptVoiceInsertion"
+
+/** Inputs for `usePromptBufferedVoiceInput`. */
+interface UsePromptBufferedVoiceInputOptions {
+  /** Current prompt text. */
+  prompt: Accessor<string>
+  /** Replaces the prompt text. */
+  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
+  /** Returns the prompt textarea, or null; used to read the selection and restore the caret. */
+  getTextarea: () => HTMLTextAreaElement | null
+  /** Voice input is reported as usable only while this returns true. */
+  enabled: Accessor<boolean>
+  /** While this returns true, new recordings are not started. */
+  disabled: Accessor<boolean>
+}
+
+// States reported by the buffered voice input hook.
+type VoiceInputState = "idle" | "recording" | "transcribing"
+
+/**
+ * Voice input that records a whole clip with MediaRecorder and transcribes it once recording stops.
+ *
+ * Stopping a recording uploads the clip through `serverApi.transcribeAudio` and inserts any
+ * non-empty transcript at the textarea selection (or at the end of the prompt when there is no
+ * textarea). Cancelling, disposal, and start-up failures discard the clip without transcribing it.
+ *
+ * @param options - Prompt accessor/setter, textarea getter, and enabled/disabled accessors.
+ * @returns State accessors and recording controls for the voice-input button.
+ */
+export function usePromptBufferedVoiceInput(options: UsePromptBufferedVoiceInputOptions) {
+  const { t } = useI18n()
+  const [state, setState] = createSignal<VoiceInputState>("idle")
+  const [elapsedMs, setElapsedMs] = createSignal(0)
+
+  let mediaRecorder: MediaRecorder | null = null
+  let mediaStream: MediaStream | null = null
+  let timerId: number | undefined
+  let shouldTranscribe = true
+  let recordedChunks: Blob[] = []
+  let recordingStartedAt = 0
+  // True while startRecording is in flight, so a second toggle cannot open another microphone stream.
+  let starting = false
+  // Set on disposal so a microphone stream granted afterwards is released instead of recorded.
+  let disposed = false
+
+  createEffect(() => {
+    void loadSpeechCapabilities()
+  })
+
+  onCleanup(() => {
+    disposed = true
+    cleanupMedia(false)
+  })
+
+  // Browser capability check: MediaRecorder plus getUserMedia.
+  const isSupported = () => {
+    if (typeof window === "undefined") return false
+    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
+  }
+
+  // True when the feature is enabled, the browser supports it, and the server reports usable STT.
+  const canUseVoiceInput = () => {
+    const capabilities = speechCapabilities()
+    return Boolean(
+      options.enabled() &&
+        isSupported() &&
+        capabilities?.available &&
+        capabilities?.configured &&
+        capabilities?.supportsStt,
+    )
+  }
+
+  // Stops an active recording (which then gets transcribed) or starts a new one.
+  async function toggleRecording(): Promise<void> {
+    if (state() === "recording") {
+      stopRecording()
+      return
+    }
+
+    if (starting || !canUseVoiceInput() || options.disabled() || state() === "transcribing") return
+
+    starting = true
+    try {
+      await startRecording()
+    } catch (error) {
+      // Reset to idle as well: a failure after entering "recording" would otherwise leave the
+      // hook stuck there with no recorder left to stop.
+      cleanupMedia()
+      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
+        title: t("promptInput.voiceInput.error.title"),
+        detail: error instanceof Error ? error.message : String(error),
+        variant: "error",
+      })
+    } finally {
+      starting = false
+    }
+  }
+
+  // Ends the recording; the recorder's "stop" event then runs finalizeRecording.
+  function stopRecording() {
+    if (!mediaRecorder || state() !== "recording") return
+    shouldTranscribe = true
+    mediaRecorder.stop()
+    setState("transcribing")
+    stopTimer()
+  }
+
+  // Ends the recording and discards the clip.
+  function cancelRecording() {
+    if (!mediaRecorder || state() !== "recording") return
+    shouldTranscribe = false
+    mediaRecorder.stop()
+    cleanupMedia(false)
+  }
+
+  async function startRecording() {
+    if (!isSupported()) {
+      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
+        title: t("promptInput.voiceInput.error.title"),
+        variant: "error",
+      })
+      return
+    }
+
+    const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+    if (disposed) {
+      // The owner was disposed while the permission prompt was open: release the microphone.
+      stopTracks(stream)
+      return
+    }
+
+    recordedChunks = []
+    shouldTranscribe = true
+    mediaStream = stream
+    mediaRecorder = createRecorder(stream)
+
+    mediaRecorder.addEventListener("dataavailable", (event) => {
+      if (event.data.size > 0) {
+        recordedChunks.push(event.data)
+      }
+    })
+
+    mediaRecorder.addEventListener("stop", () => {
+      void finalizeRecording()
+    })
+
+    recordingStartedAt = Date.now()
+    setElapsedMs(0)
+    setState("recording")
+    startTimer()
+    mediaRecorder.start()
+  }
+
+  // Runs on the recorder's "stop" event: transcribes the clip unless it was discarded.
+  async function finalizeRecording() {
+    const recorder = mediaRecorder
+    const stream = mediaStream
+    mediaRecorder = null
+    mediaStream = null
+
+    if (!shouldTranscribe || recordedChunks.length === 0) {
+      recordedChunks = []
+      stopTracks(stream)
+      setState("idle")
+      setElapsedMs(0)
+      return
+    }
+
+    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
+
+    try {
+      const audioBlob = new Blob(recordedChunks, { type: mimeType })
+      const transcription = await serverApi.transcribeAudio({
+        audioBase64: await blobToBase64(audioBlob),
+        mimeType,
+      })
+      if (transcription.text.trim()) {
+        insertTranscript(transcription.text.trim())
+      }
+    } catch (error) {
+      showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
+        title: t("promptInput.voiceInput.error.title"),
+        detail: error instanceof Error ? error.message : String(error),
+        variant: "error",
+      })
+    } finally {
+      recordedChunks = []
+      stopTracks(stream)
+      setState("idle")
+      setElapsedMs(0)
+    }
+  }
+
+  // Inserts the transcript at the current selection and moves the caret just past it.
+  function insertTranscript(text: string) {
+    const current = options.prompt()
+    const textarea = options.getTextarea()
+    const start = textarea ? textarea.selectionStart : current.length
+    const end = textarea ? textarea.selectionEnd : current.length
+    const { value, cursor } = buildPromptWithInsertedTranscript(
+      createPromptVoiceAnchor(current, start, end),
+      text,
+    )
+
+    options.setPrompt(value)
+    if (textarea) {
+      setTimeout(() => {
+        textarea.focus()
+        textarea.setSelectionRange(cursor, cursor)
+      }, 0)
+    }
+  }
+
+  // Aborts any recording in progress and releases the microphone; the clip is never transcribed.
+  function cleanupMedia(resetState = true) {
+    stopTimer()
+    // The recorder's "stop" event still fires after this call and would otherwise transcribe
+    // the aborted clip and insert it into the prompt.
+    shouldTranscribe = false
+    if (mediaRecorder && mediaRecorder.state !== "inactive") {
+      mediaRecorder.stop()
+    }
+    mediaRecorder = null
+    stopTracks(mediaStream)
+    mediaStream = null
+    recordedChunks = []
+    if (resetState) {
+      setState("idle")
+      setElapsedMs(0)
+    }
+  }
+
+  function startTimer() {
+    stopTimer()
+    timerId = window.setInterval(() => {
+      setElapsedMs(Date.now() - recordingStartedAt)
+    }, 250)
+  }
+
+  function stopTimer() {
+    if (timerId !== undefined) {
+      window.clearInterval(timerId)
+      timerId = undefined
+    }
+  }
+
+  return {
+    state,
+    elapsedMs,
+    canUseVoiceInput,
+    toggleRecording,
+    cancelRecording,
+    isRecording: () => state() === "recording",
+    isTranscribing: () => state() === "transcribing",
+    buttonTitle: () => {
+      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
+      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
+      return t("promptInput.voiceInput.start.title")
+    },
+  }
+}
+
+/**
+ * Builds a MediaRecorder using the first preferred audio MIME type the browser accepts.
+ *
+ * When `MediaRecorder.isTypeSupported` is not available the first candidate is used as-is; when
+ * no candidate is supported the recorder is created with the browser's default format.
+ */
+function createRecorder(stream: MediaStream): MediaRecorder {
+  const canProbe = typeof MediaRecorder.isTypeSupported === "function"
+  for (const mimeType of ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]) {
+    if (!canProbe || MediaRecorder.isTypeSupported(mimeType)) {
+      return new MediaRecorder(stream, { mimeType })
+    }
+  }
+  return new MediaRecorder(stream)
+}
+
+/** Stops every track of the given stream; does nothing when there is no stream. */
+function stopTracks(stream: MediaStream | null) {
+  if (!stream) return
+  for (const track of stream.getTracks()) {
+    track.stop()
+  }
+}
+
+/** Reads a blob's bytes and returns them base64-encoded. */
+async function blobToBase64(blob: Blob): Promise<string> {
+  const bytes = new Uint8Array(await blob.arrayBuffer())
+  // Build the binary string in slices rather than one character at a time.
+  const sliceSize = 0x8000
+  const pieces: string[] = []
+  for (let offset = 0; offset < bytes.length; offset += sliceSize) {
+    pieces.push(String.fromCharCode(...bytes.subarray(offset, offset + sliceSize)))
+  }
+  return btoa(pieces.join(""))
+}
--- a/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts
@@ -0,0 +1,325 @@
+import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
+import { showAlertDialog } from "../../stores/alerts"
+import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client"
+import { useI18n } from "../../lib/i18n"
+import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
+import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream"
+import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion"
+
+/** Inputs for `usePromptRealtimeVoiceInput`. */
+interface UsePromptRealtimeVoiceInputOptions {
+  /** Current prompt text. */
+  prompt: Accessor<string>
+  /** Replaces the prompt text; partial transcripts are written with `persistDraft: false`. */
+  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
+  /** Returns the prompt textarea, or null; used to read the selection and restore the caret. */
+  getTextarea: () => HTMLTextAreaElement | null
+  /** Voice input is reported as usable only while this returns true. */
+  enabled: Accessor<boolean>
+  /** While this returns true, new recordings are not started. */
+  disabled: Accessor<boolean>
+}
+
+// States reported by the realtime voice input hook.
+type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing"
+
+// Delay (ms) after finalizing a session before it is closed if no final transcript arrives.
+const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000
+
+/**
+ * Voice input that streams microphone audio to a server-side realtime transcription session and
+ * writes partial and final transcripts into the prompt while the user is speaking.
+ *
+ * The prompt and selection are captured as an anchor when recording starts; every transcript
+ * update re-renders the prompt from that anchor. Cancelling or failing restores the anchored
+ * prompt, while a normal stop keeps the transcribed text.
+ *
+ * @param options - Prompt accessor/setter, textarea getter, and enabled/disabled accessors.
+ * @returns State accessors and recording controls for the voice-input button.
+ */
+export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) {
+  const { t } = useI18n()
+  const [state, setState] = createSignal<RealtimeVoiceState>("idle")
+  const [elapsedMs, setElapsedMs] = createSignal(0)
+
+  let activeSessionId: string | null = null
+  let eventSource: EventSource | null = null
+  let pcmStream: RealtimePcmStreamHandle | null = null
+  // Serializes audio uploads so chunks reach the server in capture order.
+  let audioQueue: Promise<void> = Promise.resolve()
+  let timerId: number | undefined
+  let recordingStartedAt = 0
+  let finalizeTimerId: number | undefined
+  let anchor = createPromptVoiceAnchor("", 0, 0)
+  let finalTranscript = ""
+  let liveTranscript = ""
+  let activeLiveItemId: string | null = null
+  let closing = false
+  // Bumped whenever a session is stopped or cleaned up; a pending startRecording compares its
+  // own value after each await and abandons the start when it no longer matches.
+  let startAttempt = 0
+
+  createEffect(() => {
+    void loadSpeechCapabilities()
+  })
+
+  onCleanup(() => {
+    cancelRecording()
+  })
+
+  // Browser capability check: AudioContext, getUserMedia, and EventSource.
+  const isSupported = () => {
+    if (typeof window === "undefined") return false
+    return Boolean(window.AudioContext || (window as any).webkitAudioContext) && Boolean(navigator.mediaDevices?.getUserMedia) && typeof EventSource !== "undefined"
+  }
+
+  // True when the feature is enabled, the browser supports it, and the server reports realtime STT.
+  const canUseVoiceInput = () => {
+    const capabilities = speechCapabilities()
+    return Boolean(
+      options.enabled() &&
+        isSupported() &&
+        capabilities?.available &&
+        capabilities?.configured &&
+        capabilities?.supportsStt &&
+        capabilities?.supportsRealtimeTranscription,
+    )
+  }
+
+  // Stops an active session (keeping its transcript) or starts a new one.
+  async function toggleRecording(): Promise<void> {
+    if (state() === "listening" || state() === "connecting") {
+      await stopRecording()
+      return
+    }
+
+    if (!canUseVoiceInput() || options.disabled() || state() === "finalizing") return
+
+    try {
+      await startRecording()
+    } catch (error) {
+      await cleanupSession({ revertPrompt: true, closeRemote: true })
+      showAlertDialog(t("promptInput.voiceInput.error.connection"), {
+        title: t("promptInput.voiceInput.error.title"),
+        detail: error instanceof Error ? error.message : String(error),
+        variant: "error",
+      })
+    }
+  }
+
+  async function startRecording() {
+    if (!isSupported()) {
+      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
+        title: t("promptInput.voiceInput.error.title"),
+        variant: "error",
+      })
+      return
+    }
+
+    resetTranscriptState()
+    captureAnchor()
+    setState("connecting")
+    setElapsedMs(0)
+    startAttempt += 1
+    const attempt = startAttempt
+
+    const created = await serverApi.createRealtimeSpeechSession({
+      language: detectLanguage(),
+    })
+    if (attempt !== startAttempt) {
+      // Cancelled while connecting: this session was never adopted, so close it here.
+      await serverApi.closeRealtimeSpeechSession(created.sessionId).catch(() => undefined)
+      return
+    }
+    activeSessionId = created.sessionId
+    connectEventStream(created.sessionId)
+
+    const stream = await createRealtimePcmStream({
+      onChunk: (audioBase64) => {
+        const sessionId = activeSessionId
+        if (!sessionId || closing) return
+        audioQueue = audioQueue
+          .then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 }))
+          .catch((error) => {
+            handleRealtimeError(error)
+          })
+      },
+    })
+    if (attempt !== startAttempt) {
+      // Stopped or cancelled while the microphone was being opened: release it again.
+      await stream.stop().catch(() => undefined)
+      return
+    }
+    pcmStream = stream
+
+    recordingStartedAt = Date.now()
+    startTimer()
+    setState("listening")
+  }
+
+  // Stops capturing, flushes queued audio, and asks the server for the final transcript.
+  async function stopRecording() {
+    const sessionId = activeSessionId
+    if (!sessionId || (state() !== "listening" && state() !== "connecting")) return
+
+    // Invalidate a start that is still opening the microphone so it cannot resume listening.
+    startAttempt += 1
+    setState("finalizing")
+    stopTimer()
+
+    if (pcmStream) {
+      const stream = pcmStream
+      pcmStream = null
+      await stream.stop()
+    }
+
+    try {
+      await audioQueue.catch(() => undefined)
+      await serverApi.finalizeRealtimeSpeechSession(sessionId)
+      scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS)
+    } catch (error) {
+      handleRealtimeError(error)
+    }
+  }
+
+  // Aborts the session and restores the prompt captured when recording started.
+  function cancelRecording() {
+    void cleanupSession({ revertPrompt: true, closeRemote: true })
+  }
+
+  function connectEventStream(sessionId: string) {
+    eventSource?.close()
+    eventSource = serverApi.connectRealtimeSpeechEvents(
+      sessionId,
+      (event) => handleEvent(event),
+      () => {
+        if (closing) return
+        handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection")))
+      },
+    )
+  }
+
+  function handleEvent(event: SpeechRealtimeEvent) {
+    if (event.type === "session.ready") {
+      return
+    }
+
+    if (event.type === "session.error") {
+      handleRealtimeError(new Error(event.message))
+      return
+    }
+
+    if (event.type === "transcript.partial") {
+      activeLiveItemId = event.itemId
+      liveTranscript = event.text
+      // Partial text is provisional, so it is rendered without persisting the draft.
+      renderPrompt(false)
+      return
+    }
+
+    if (event.type === "transcript.final") {
+      activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId
+      liveTranscript = ""
+      finalTranscript = appendVoiceTranscript(finalTranscript, event.text)
+      renderPrompt(true)
+      if (state() === "finalizing") {
+        // A final transcript arrived after stop: close shortly instead of waiting out the timeout.
+        scheduleFinalizeClose(250)
+      }
+      return
+    }
+
+    if (event.type === "session.closed") {
+      void cleanupSession({ revertPrompt: false, closeRemote: false })
+    }
+  }
+
+  // Records the prompt and selection that transcripts are inserted into for this session.
+  function captureAnchor() {
+    const textarea = options.getTextarea()
+    const current = options.prompt()
+    const start = textarea ? textarea.selectionStart : current.length
+    const end = textarea ? textarea.selectionEnd : current.length
+    anchor = createPromptVoiceAnchor(current, start, end)
+  }
+
+  // Rebuilds the prompt from the anchor plus the final and live transcript text.
+  function renderPrompt(persistDraft: boolean) {
+    const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(finalTranscript && liveTranscript.trim() ? " " : "")
+    const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted)
+    options.setPrompt(value, persistDraft ? undefined : { persistDraft: false })
+    syncTextareaCursor(cursor)
+  }
+
+  function syncTextareaCursor(cursor: number) {
+    const textarea = options.getTextarea()
+    if (!textarea) return
+    queueMicrotask(() => {
+      const next = options.getTextarea()
+      if (!next) return
+      next.focus()
+      next.setSelectionRange(cursor, cursor)
+    })
+  }
+
+  // (Re)arms the timer that closes the session once finalizing has had `delayMs` to finish.
+  function scheduleFinalizeClose(delayMs: number) {
+    if (finalizeTimerId !== undefined) {
+      window.clearTimeout(finalizeTimerId)
+    }
+    finalizeTimerId = window.setTimeout(() => {
+      void cleanupSession({ revertPrompt: false, closeRemote: true })
+    }, delayMs)
+  }
+
+  // Tears down the session: timers, event stream, microphone, queued uploads, remote session.
+  // Then either restores the anchored prompt or commits the transcript.
+  async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) {
+    if (closing) return
+    // Nothing is active (e.g. disposal or cancel while idle): leave the prompt alone. Reverting
+    // here would overwrite the prompt and its persisted draft with a stale or empty anchor.
+    if (state() === "idle" && !activeSessionId && !pcmStream && !eventSource) return
+    closing = true
+    startAttempt += 1
+
+    if (finalizeTimerId !== undefined) {
+      window.clearTimeout(finalizeTimerId)
+      finalizeTimerId = undefined
+    }
+
+    stopTimer()
+
+    const sessionId = activeSessionId
+    activeSessionId = null
+
+    eventSource?.close()
+    eventSource = null
+
+    if (pcmStream) {
+      const stream = pcmStream
+      pcmStream = null
+      await stream.stop().catch(() => undefined)
+    }
+
+    await audioQueue.catch(() => undefined)
+    audioQueue = Promise.resolve()
+
+    if (cleanupOptions.closeRemote && sessionId) {
+      await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined)
+    }
+
+    // No final transcript arrived: keep the last partial text rather than losing it.
+    if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) {
+      finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript)
+      liveTranscript = ""
+    }
+
+    if (cleanupOptions.revertPrompt) {
+      options.setPrompt(anchor.prompt)
+    } else if (finalTranscript.trim()) {
+      renderPrompt(true)
+    }
+
+    resetTranscriptState()
+    setState("idle")
+    setElapsedMs(0)
+    closing = false
+  }
+
+  function resetTranscriptState() {
+    finalTranscript = ""
+    liveTranscript = ""
+    activeLiveItemId = null
+  }
+
+  // Aborts the session, restores the prompt, and reports the error to the user.
+  function handleRealtimeError(error: unknown) {
+    if (closing) return
+    void cleanupSession({ revertPrompt: true, closeRemote: true })
+    showAlertDialog(t("promptInput.voiceInput.error.connection"), {
+      title: t("promptInput.voiceInput.error.title"),
+      detail: error instanceof Error ? error.message : String(error),
+      variant: "error",
+    })
+  }
+
+  function startTimer() {
+    stopTimer()
+    timerId = window.setInterval(() => {
+      setElapsedMs(Date.now() - recordingStartedAt)
+    }, 250)
+  }
+
+  function stopTimer() {
+    if (timerId !== undefined) {
+      window.clearInterval(timerId)
+      timerId = undefined
+    }
+  }
+
+  return {
+    state,
+    elapsedMs,
+    canUseVoiceInput,
+    toggleRecording,
+    cancelRecording,
+    isRecording: () => state() === "connecting" || state() === "listening",
+    isTranscribing: () => state() === "finalizing",
+    buttonTitle: () => {
+      if (state() === "connecting") return t("promptInput.voiceInput.connecting.title")
+      if (state() === "listening") return t("promptInput.voiceInput.stop.title")
+      if (state() === "finalizing") return t("promptInput.voiceInput.transcribing.title")
+      return t("promptInput.voiceInput.start.title")
+    },
+  }
+}
+
+/**
+ * Returns the primary language subtag of the browser locale (e.g. "en" for "en-US").
+ *
+ * @returns The subtag, or undefined when there is no `navigator`, it exposes no usable
+ *   `language` string, or the subtag is blank.
+ */
+function detectLanguage(): string | undefined {
+  if (typeof navigator === "undefined") return undefined
+  // Some non-browser runtimes define `navigator` without `language`; do not throw on those.
+  const locale: unknown = navigator.language
+  if (typeof locale !== "string") return undefined
+  const [language] = locale.split("-")
+  return language?.trim() || undefined
+}
--- a/packages/ui/src/components/prompt-input/usePromptState.ts
+++ b/packages/ui/src/components/prompt-input/usePromptState.ts
@@ -22,7 +22,7 @@ type HistorySelectOptions = {

 type PromptState = {
  prompt: Accessor<string>
-  setPrompt: (value: string) => void
+  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  clearPrompt: () => void

  draftLoadedNonce: Accessor<number>
@@ -48,11 +48,11 @@ export function usePromptState(options: PromptStateOptions): PromptState {
  const [historyDraft, setHistoryDraft] = createSignal<string | null>(null)
  const [draftLoadedNonce, setDraftLoadedNonce] = createSignal(0)

-  const setPrompt = (value: string) => {
+  const setPrompt = (value: string, setOptions?: { persistDraft?: boolean }) => {
    setPromptInternal(value)
    // Persist drafts only when the user is at the "fresh" position (not browsing history).
    // This keeps the bottom-of-history draft stable even if the user edits recalled history entries.
-    if (historyIndex() === -1) {
+    if (setOptions?.persistDraft !== false && historyIndex() === -1) {
      setSessionDraftPrompt(options.instanceId(), options.sessionId(), value)
    }
  }
--- a/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts
@@ -1,242 +1,30 @@
-import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
-import { showAlertDialog } from "../../stores/alerts"
-import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
-import { serverApi } from "../../lib/api-client"
-import { useI18n } from "../../lib/i18n"
+import type { Accessor } from "solid-js"
+import { usePromptBufferedVoiceInput } from "./usePromptBufferedVoiceInput"
+import { usePromptRealtimeVoiceInput } from "./usePromptRealtimeVoiceInput"

 interface UsePromptVoiceInputOptions {
  prompt: Accessor<string>
-  setPrompt: (value: string) => void
+  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  getTextarea: () => HTMLTextAreaElement | null
  enabled: Accessor<boolean>
  disabled: Accessor<boolean>
+  useRealtime: Accessor<boolean>
 }

-type VoiceInputState = "idle" | "recording" | "transcribing"
-
 export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) {
-  const { t } = useI18n()
-  const [state, setState] = createSignal<VoiceInputState>("idle")
-  const [elapsedMs, setElapsedMs] = createSignal(0)
+  const buffered = usePromptBufferedVoiceInput(options)
+  const realtime = usePromptRealtimeVoiceInput(options)

-  let mediaRecorder: MediaRecorder | null = null
-  let mediaStream: MediaStream | null = null
-  let timerId: number | undefined
-  let shouldTranscribe = true
-  let recordedChunks: Blob[] = []
-  let recordingStartedAt = 0
-
-  createEffect(() => {
-    void loadSpeechCapabilities()
-  })
-
-  onCleanup(() => {
-    cleanupMedia(false)
-  })
-
-  const isSupported = () => {
-    if (typeof window === "undefined") return false
-    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
-  }
-
-  const canUseVoiceInput = () => {
-    const capabilities = speechCapabilities()
-    return Boolean(
-      options.enabled() &&
-        isSupported() &&
-        capabilities?.available &&
-        capabilities?.configured &&
-        capabilities?.supportsStt,
-    )
-  }
-
-  async function toggleRecording(): Promise<void> {
-    if (state() === "recording") {
-      stopRecording()
-      return
-    }
-
-    if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return
-
-    try {
-      await startRecording()
-    } catch (error) {
-      cleanupMedia(false)
-      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
-        title: t("promptInput.voiceInput.error.title"),
-        detail: error instanceof Error ? error.message : String(error),
-        variant: "error",
-      })
-    }
-  }
-
-  function stopRecording() {
-    if (!mediaRecorder || state() !== "recording") return
-    shouldTranscribe = true
-    mediaRecorder.stop()
-    setState("transcribing")
-    stopTimer()
-  }
-
-  function cancelRecording() {
-    if (!mediaRecorder || state() !== "recording") return
-    shouldTranscribe = false
-    mediaRecorder.stop()
-    cleanupMedia(false)
-  }
-
-  async function startRecording() {
-    if (!isSupported()) {
-      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
-        title: t("promptInput.voiceInput.error.title"),
-        variant: "error",
-      })
-      return
-    }
-
-    recordedChunks = []
-    shouldTranscribe = true
-    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
-    mediaRecorder = createRecorder(mediaStream)
-
-    mediaRecorder.addEventListener("dataavailable", (event) => {
-      if (event.data.size > 0) {
-        recordedChunks.push(event.data)
-      }
-    })
-
-    mediaRecorder.addEventListener("stop", () => {
-      void finalizeRecording()
-    })
-
-    recordingStartedAt = Date.now()
-    setElapsedMs(0)
-    setState("recording")
-    startTimer()
-    mediaRecorder.start()
-  }
-
-  async function finalizeRecording() {
-    const recorder = mediaRecorder
-    const stream = mediaStream
-    mediaRecorder = null
-    mediaStream = null
-
-    if (!shouldTranscribe || recordedChunks.length === 0) {
-      recordedChunks = []
-      stopTracks(stream)
-      setState("idle")
-      setElapsedMs(0)
-      return
-    }
-
-    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
-
-    try {
-      const audioBlob = new Blob(recordedChunks, { type: mimeType })
-      const transcription = await serverApi.transcribeAudio({
-        audioBase64: await blobToBase64(audioBlob),
-        mimeType,
-      })
-      if (transcription.text.trim()) {
-        insertTranscript(transcription.text.trim())
-      }
-    } catch (error) {
-      showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
-        title: t("promptInput.voiceInput.error.title"),
-        detail: error instanceof Error ? error.message : String(error),
-        variant: "error",
-      })
-    } finally {
-      recordedChunks = []
-      stopTracks(stream)
-      setState("idle")
-      setElapsedMs(0)
-    }
-  }
-
-  function insertTranscript(text: string) {
-    const current = options.prompt()
-    const textarea = options.getTextarea()
-    const start = textarea ? textarea.selectionStart : current.length
-    const end = textarea ? textarea.selectionEnd : current.length
-    const before = current.slice(0, start)
-    const after = current.slice(end)
-    const prefix = before.length > 0 && !/\s$/.test(before) ? " " : ""
-    const suffix = after.length > 0 && !/^\s/.test(after) ? " " : ""
-    const nextValue = `${before}${prefix}${text}${suffix}${after}`
-    const cursor = before.length + prefix.length + text.length
-
-    options.setPrompt(nextValue)
-    if (textarea) {
-      setTimeout(() => {
-        textarea.focus()
-        textarea.setSelectionRange(cursor, cursor)
-      }, 0)
-    }
-  }
-
-  function cleanupMedia(resetState = true) {
-    stopTimer()
-    if (mediaRecorder && mediaRecorder.state !== "inactive") {
-      mediaRecorder.stop()
-    }
-    mediaRecorder = null
-    stopTracks(mediaStream)
-    mediaStream = null
-    recordedChunks = []
-    if (resetState) {
-      setState("idle")
-      setElapsedMs(0)
-    }
-  }
-
-  function startTimer() {
-    stopTimer()
-    timerId = window.setInterval(() => {
-      setElapsedMs(Date.now() - recordingStartedAt)
-    }, 250)
-  }
-
-  function stopTimer() {
-    if (timerId !== undefined) {
-      window.clearInterval(timerId)
-      timerId = undefined
-    }
-  }
+  const active = () => (options.useRealtime() ? realtime : buffered)

  return {
-    state,
-    elapsedMs,
-    canUseVoiceInput,
-    toggleRecording,
-    cancelRecording,
-    isRecording: () => state() === "recording",
-    isTranscribing: () => state() === "transcribing",
-    buttonTitle: () => {
-      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
-      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
-      return t("promptInput.voiceInput.start.title")
-    },
+    state: () => active().state(),
+    elapsedMs: () => active().elapsedMs(),
+    canUseVoiceInput: () => active().canUseVoiceInput(),
+    toggleRecording: () => active().toggleRecording(),
+    cancelRecording: () => active().cancelRecording(),
+    isRecording: () => active().isRecording(),
+    isTranscribing: () => active().isTranscribing(),
+    buttonTitle: () => active().buttonTitle(),
  }
 }
-
-function createRecorder(stream: MediaStream): MediaRecorder {
-  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
-  const supported = candidates.find((candidate) => typeof MediaRecorder.isTypeSupported !== "function" || MediaRecorder.isTypeSupported(candidate))
-  return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
-}
-
-function stopTracks(stream: MediaStream | null) {
-  stream?.getTracks().forEach((track) => track.stop())
-}
-
-async function blobToBase64(blob: Blob): Promise<string> {
-  const buffer = await blob.arrayBuffer()
-  const bytes = new Uint8Array(buffer)
-  let binary = ""
-  for (const byte of bytes) {
-    binary += String.fromCharCode(byte)
-  }
-  return btoa(binary)
-}
--- a/packages/ui/src/components/settings/speech-settings-card.tsx
+++ b/packages/ui/src/components/settings/speech-settings-card.tsx
@@ -10,6 +10,8 @@ const log = getLogger("actions")
 type DraftFields = {
  apiKey: string
  baseUrl: string
+  useRealtime: boolean
+  realtimeModel: string
  sttModel: string
  ttsModel: string
  ttsVoice: string
@@ -19,6 +21,8 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
  return {
    apiKey: speech.apiKey ?? "",
    baseUrl: speech.baseUrl ?? "",
+    useRealtime: speech.useRealtime,
+    realtimeModel: speech.realtimeModel,
    sttModel: speech.sttModel,
    ttsModel: speech.ttsModel,
    ttsVoice: speech.ttsVoice,
@@ -26,7 +30,7 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
 }

 function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
-  return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
+  return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.useRealtime === b.useRealtime && a.realtimeModel === b.realtimeModel && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
 }

 export const SpeechSettingsCard: Component = () => {
@@ -57,7 +61,7 @@ export const SpeechSettingsCard: Component = () => {
    return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
  }

-  const updateDraft = (key: keyof DraftFields, value: string) => {
+  const updateDraft = <K extends keyof DraftFields>(key: K, value: DraftFields[K]) => {
    setSaveStatus("idle")
    setDrafts((current) => ({ ...current, [key]: value }))
  }
@@ -65,12 +69,14 @@ export const SpeechSettingsCard: Component = () => {
  const isDirty = createMemo(() => {
    const speech = serverSettings().speech
    const current = drafts()
-    return (
-      (current.apiKey || "") !== (speech.apiKey || "") ||
-      (current.baseUrl || "") !== (speech.baseUrl || "") ||
-      current.sttModel !== speech.sttModel ||
-      current.ttsModel !== speech.ttsModel ||
-      current.ttsVoice !== speech.ttsVoice
+    return ( // dirty as soon as any draft field differs from the saved speech settings
+      (current.apiKey || "") !== (speech.apiKey || "") ||
+      (current.baseUrl || "") !== (speech.baseUrl || "") ||
+      current.useRealtime !== speech.useRealtime ||
+      current.realtimeModel !== speech.realtimeModel ||
+      current.sttModel !== speech.sttModel ||
+      current.ttsModel !== speech.ttsModel ||
+      current.ttsVoice !== speech.ttsVoice
    )
  })

@@ -90,6 +96,8 @@ export const SpeechSettingsCard: Component = () => {
      await updateSpeechSettings({
        apiKey: current.apiKey.trim() || undefined,
        baseUrl: current.baseUrl.trim() || undefined,
+        useRealtime: current.useRealtime,
+        realtimeModel: current.realtimeModel.trim() || undefined,
        sttModel: current.sttModel.trim() || undefined,
        ttsModel: current.ttsModel.trim() || undefined,
        ttsVoice: current.ttsVoice.trim() || undefined,
@@ -98,6 +106,8 @@ export const SpeechSettingsCard: Component = () => {
      setDrafts({
        apiKey: current.apiKey.trim(),
        baseUrl: current.baseUrl.trim(),
+        useRealtime: current.useRealtime,
+        realtimeModel: current.realtimeModel.trim() || serverSettings().speech.realtimeModel,
        sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
        ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
        ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
@@ -159,6 +169,27 @@ export const SpeechSettingsCard: Component = () => {
          onInput={(value) => updateDraft("baseUrl", value)}
          placeholder={t("settings.speech.baseUrl.placeholder")}
        />
+        <div class="settings-toggle-row">
+          <div>
+            <div class="settings-toggle-title">{t("settings.speech.realtime.title")}</div>
+            <div class="settings-toggle-caption">{t("settings.speech.realtime.subtitle")}</div>
+          </div>
+          <label class="settings-checkbox-toggle">
+            <input
+              type="checkbox"
+              checked={drafts().useRealtime}
+              onChange={(event) => updateDraft("useRealtime", event.currentTarget.checked)}
+            />
+            <span>{t("settings.common.enabled")}</span>
+          </label>
+        </div>
+        <Field
+          label={t("settings.speech.realtimeModel.title")}
+          caption={t("settings.speech.realtimeModel.subtitle")}
+          value={drafts().realtimeModel}
+          onInput={(value) => updateDraft("realtimeModel", value)}
+          placeholder={t("settings.speech.realtimeModel.placeholder")}
+        />
        <Field
          label={t("settings.speech.sttModel.title")}
          caption={t("settings.speech.sttModel.subtitle")}
--- a/packages/ui/src/lib/api-client.ts
+++ b/packages/ui/src/lib/api-client.ts
@@ -8,6 +8,8 @@ import type {
  FileSystemListResponse,
  InstanceData,
  SpeechCapabilitiesResponse,
+  SpeechRealtimeEvent,
+  SpeechRealtimeSessionResponse,
  SpeechSynthesisResponse,
  SpeechTranscriptionResponse,
  ServerMeta,
@@ -39,6 +41,10 @@ export function buildBackgroundProcessStreamUrl(instanceId: string, processId: s
  return buildAbsoluteUrl(`/workspaces/${encodedInstanceId}/plugin/background-processes/${encodedProcessId}/stream`)
 }

+export function buildRealtimeSpeechEventsUrl(sessionId: string): string {
+  return buildAbsoluteUrl(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/events`)
+}
+
 function buildEventsUrl(base: string | undefined, path: string): string {
  if (path.startsWith("http://") || path.startsWith("https://")) {
    return path
@@ -241,6 +247,29 @@ export const serverApi = {
  fetchSpeechCapabilities(): Promise<SpeechCapabilitiesResponse> {
    return request<SpeechCapabilitiesResponse>("/api/speech/capabilities")
  },
+  createRealtimeSpeechSession(payload?: { language?: string; prompt?: string }): Promise<SpeechRealtimeSessionResponse> {
+    return request<SpeechRealtimeSessionResponse>("/api/speech/realtime/sessions", {
+      method: "POST",
+      body: JSON.stringify(payload ?? {}),
+    })
+  },
+  appendRealtimeSpeechAudio(sessionId: string, payload: { audioBase64: string }): Promise<void> {
+    return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/audio`, {
+      method: "POST",
+      body: JSON.stringify(payload),
+    })
+  },
+  finalizeRealtimeSpeechSession(sessionId: string): Promise<void> {
+    return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/finalize`, {
+      method: "POST",
+      body: JSON.stringify({}),
+    })
+  },
+  closeRealtimeSpeechSession(sessionId: string): Promise<void> {
+    return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}`, {
+      method: "DELETE",
+    })
+  },
  transcribeAudio(payload: {
    audioBase64: string
    mimeType: string
@@ -332,21 +361,34 @@ export const serverApi = {
  },
  connectEvents(onEvent: (event: WorkspaceEventPayload) => void, onError?: () => void) {
    sseLogger.info(`Connecting to ${EVENTS_URL}`)
-    const source = new EventSource(EVENTS_URL, { withCredentials: true } as any)
-    source.onmessage = (event) => {
-      try {
-        const payload = JSON.parse(event.data) as WorkspaceEventPayload
-        onEvent(payload)
-      } catch (error) {
-        sseLogger.error("Failed to parse event", error)
-      }
-    }
-    source.onerror = () => {
-      sseLogger.warn("EventSource error, closing stream")
-      onError?.()
-    }
-    return source
+    return connectEventSource(EVENTS_URL, onEvent, onError)
+  },
+  connectRealtimeSpeechEvents(
+    sessionId: string,
+    onEvent: (event: SpeechRealtimeEvent) => void,
+    onError?: () => void,
+  ) {
+    const url = buildRealtimeSpeechEventsUrl(sessionId)
+    sseLogger.info(`Connecting to ${url}`)
+    return connectEventSource(url, onEvent, onError)
  },
 }

-export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType }
+/** Opens a credentialed EventSource on `url` and hands each JSON-parsed message to `onEvent`; stream errors are logged and reported via `onError`, and the source itself is not closed here. */
+function connectEventSource<T>(url: string, onEvent: (event: T) => void, onError?: () => void): EventSource {
+  const source = new EventSource(url, { withCredentials: true })
+  source.onmessage = (event) => {
+    try {
+      onEvent(JSON.parse(event.data) as T)
+    } catch (error) {
+      sseLogger.error("Failed to parse event", error)
+    }
+  }
+  source.onerror = () => {
+    sseLogger.warn("EventSource error, closing stream")
+    onError?.()
+  }
+  return source
+}
+
+export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType, SpeechRealtimeEvent }
--- a/packages/ui/src/lib/i18n/messages/en/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/en/messaging.ts
@@ -140,8 +140,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "Send failed",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
+  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
+  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/en/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/en/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
+  "settings.speech.realtime.title": "Realtime dictation",
+  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
+  "settings.speech.realtimeModel.title": "Realtime model",
+  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
+  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/es/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/es/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "Error al enviar",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
+  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
+  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/es/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/es/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
+  "settings.speech.realtime.title": "Realtime dictation",
+  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
+  "settings.speech.realtimeModel.title": "Realtime model",
+  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
+  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/fr/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/fr/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "Échec de l'envoi",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
+  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
+  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/fr/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/fr/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
+  "settings.speech.realtime.title": "Realtime dictation",
+  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
+  "settings.speech.realtimeModel.title": "Realtime model",
+  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
+  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/ja/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/ja/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "送信に失敗",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
+  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
+  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/ja/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/ja/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
+  "settings.speech.realtime.title": "Realtime dictation",
+  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
+  "settings.speech.realtimeModel.title": "Realtime model",
+  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
+  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/ru/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/ru/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "Не удалось отправить",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
+  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
+  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/ru/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/ru/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
+  "settings.speech.realtime.title": "Realtime dictation",
+  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
+  "settings.speech.realtimeModel.title": "Realtime model",
+  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
+  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "发送失败",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
+  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
+  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
+  "settings.speech.realtime.title": "Realtime dictation",
+  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
+  "settings.speech.realtimeModel.title": "Realtime model",
+  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
+  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/stores/preferences.tsx
+++ b/packages/ui/src/stores/preferences.tsx
@@ -34,6 +34,8 @@ export interface SpeechSettings {
  provider: SpeechProviderPreference
  apiKey?: string
  baseUrl?: string
+  useRealtime: boolean
+  realtimeModel: string
  sttModel: string
  ttsModel: string
  ttsVoice: string
@@ -136,6 +138,8 @@ const defaultUiSettings: UiSettings = {

 const defaultSpeechSettings: SpeechSettings = {
  provider: "openai-compatible",
+  useRealtime: true,
+  realtimeModel: "gpt-realtime",
  sttModel: "gpt-4o-mini-transcribe",
  ttsModel: "gpt-4o-mini-tts",
  ttsVoice: "alloy",
@@ -184,6 +188,11 @@ function normalizeSpeechSettings(input?: Partial<SpeechSettings> | null): Speech
    provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider,
    apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined,
    baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined,
+    useRealtime: sanitized.useRealtime ?? defaultSpeechSettings.useRealtime,
+    realtimeModel:
+      typeof sanitized.realtimeModel === "string" && sanitized.realtimeModel.trim()
+        ? sanitized.realtimeModel.trim()
+        : defaultSpeechSettings.realtimeModel,
    sttModel:
      typeof sanitized.sttModel === "string" && sanitized.sttModel.trim()
        ? sanitized.sttModel.trim()