feat(speech): add realtime prompt dictation support

Add server-backed realtime transcription for prompt voice input and expose speech settings to choose realtime mode and models.
2026-03-19 11:32:45 +00:00
parent cc2f6976f6
commit f9b5e2b529
29 changed files with 1572 additions and 263 deletions
--- a/packages/opencode-config/package.json
+++ b/packages/opencode-config/package.json
@@ -4,6 +4,6 @@
  "private": true,
  "license": "MIT",
  "dependencies": {
-    "@opencode-ai/plugin": "1.2.14"
+    "@opencode-ai/plugin": "1.2.24"
  }
 }
--- a/packages/server/src/api-types.ts
+++ b/packages/server/src/api-types.ts
@@ -219,12 +219,35 @@ export interface SpeechCapabilitiesResponse {
  provider: string
  supportsStt: boolean
  supportsTts: boolean
  supportsRealtimeTranscription?: boolean
  realtimeInputFormat?: {
    type: "audio/pcm"
    rate: 24000
  }
  realtimeModel?: string
  baseUrl?: string
  sttModel: string
  ttsModel: string
  ttsVoice: string
 }
 /** Response body returned after a realtime transcription session has been created. */
 export interface SpeechRealtimeSessionResponse {
  /** Identifier used to address the session in subsequent requests. */
  sessionId: string
  /** Audio format descriptor reported for the session (PCM, rate 24000). */
  inputFormat: {
    type: "audio/pcm"
    rate: 24000
  }
 }
 /**
  * Events emitted to subscribers of a realtime speech session, discriminated by `type`.
  *
  * - `session.ready` / `session.error` / `session.closed` describe the session lifecycle.
  * - `input.speech_started` / `input.speech_stopped` relay speech start/stop notifications.
  * - `transcript.partial` carries the text accumulated so far for an item.
  * - `transcript.final` carries the final text for an item.
  */
 export type SpeechRealtimeEvent =
  | { type: "session.ready"; sessionId: string }
  | { type: "session.error"; message: string }
  | { type: "input.speech_started"; itemId?: string }
  | { type: "input.speech_stopped"; itemId?: string }
  | { type: "transcript.partial"; itemId: string; text: string }
  | { type: "transcript.final"; itemId: string; previousItemId?: string; text: string }
  | { type: "session.closed"; reason?: string }
 export interface SpeechTranscriptionResponse {
  text: string
  language?: string
--- a/packages/server/src/server/http-server.ts
+++ b/packages/server/src/server/http-server.ts
@@ -255,7 +255,7 @@ export function createHttpServer(deps: HttpServerDeps) {
    eventBus: deps.eventBus,
    workspaceManager: deps.workspaceManager,
  })
-  registerSpeechRoutes(app, { speechService: deps.speechService })
+  registerSpeechRoutes(app, { speechService: deps.speechService, logger: apiLogger })
  registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger })
  registerBackgroundProcessRoutes(app, { backgroundProcessManager })
  registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger })
--- a/packages/server/src/server/routes/speech.ts
+++ b/packages/server/src/server/routes/speech.ts
@@ -1,9 +1,12 @@
 import type { FastifyInstance } from "fastify"
 import { z } from "zod"
 import type { SpeechService } from "../../speech/service"
 import type { Logger } from "../../logger"
 import { SpeechRealtimeSessionManager } from "../../speech/realtime-session-manager"
 /** Dependencies needed to register the speech HTTP routes. */
 interface RouteDeps {
  /** Speech service backing the capability, transcription, synthesis and realtime routes. */
  speechService: SpeechService
  /** Parent logger; a child logger is derived from it for the realtime session manager. */
  logger: Logger
 }
 const TranscribeBodySchema = z.object({
@@ -19,9 +22,99 @@ const SynthesizeBodySchema = z.object({
  format: z.enum(["mp3", "wav", "opus"]).optional(),
 })
 /**
  * Request body accepted when creating a realtime session.
  * Both fields are optional; when present they are trimmed and must be non-empty.
  */
 const RealtimeSessionBodySchema = z.object({
  language: z.string().trim().min(1).optional(),
  prompt: z.string().trim().min(1).optional(),
 })
 /** Request body for appending audio to a realtime session: a non-empty base64 string. */
 const RealtimeAudioBodySchema = z.object({
  audioBase64: z.string().min(1, "Audio payload is required"),
 })
 export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) {
  const realtimeSessions = new SpeechRealtimeSessionManager(
    deps.speechService,
    deps.logger.child({ component: "speech-realtime" }),
  )
  app.addHook("onClose", async () => {
    await realtimeSessions.dispose()
  })
  app.get("/api/speech/capabilities", async () => deps.speechService.getCapabilities())
  app.post("/api/speech/realtime/sessions", async (request, reply) => {
    try {
      const body = RealtimeSessionBodySchema.parse(request.body ?? {})
      return await realtimeSessions.createSession(body)
    } catch (error) {
      request.log.error({ err: error }, "Failed to create realtime speech session")
      reply.code(400)
      return { error: error instanceof Error ? error.message : "Failed to create realtime speech session" }
    }
  })
  // Server-sent event stream for one realtime session.
  //
  // The subscription is established BEFORE any header is flushed or the reply is hijacked, so an
  // unknown session can still be answered with a regular 404 JSON response. Once the reply has been
  // hijacked Fastify no longer sends anything on our behalf, which previously left the connection
  // hanging with a 200 status and no events when the session did not exist.
  app.get<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/events", (request, reply) => {
    // `subscribe` emits `session.ready` synchronously, i.e. before the stream is open; such frames
    // are queued here and written as soon as the SSE headers have been sent.
    const queuedFrames: string[] = []
    let streamOpen = false
    let unsubscribe: () => void

    try {
      unsubscribe = realtimeSessions.subscribe(request.params.sessionId, (event) => {
        const frame = `data: ${JSON.stringify(event)}\n\n`
        if (streamOpen) {
          reply.raw.write(frame)
        } else {
          queuedFrames.push(frame)
        }
      })
    } catch (error) {
      request.log.error({ err: error }, "Failed to open realtime speech event stream")
      reply.code(404).send({ error: error instanceof Error ? error.message : "Realtime speech session not found" })
      return
    }

    reply.raw.setHeader("Content-Type", "text/event-stream")
    reply.raw.setHeader("Cache-Control", "no-cache")
    reply.raw.setHeader("Connection", "keep-alive")
    reply.raw.flushHeaders?.()
    reply.hijack()

    streamOpen = true
    for (const frame of queuedFrames) {
      reply.raw.write(frame)
    }
    queuedFrames.length = 0

    // SSE comment lines keep intermediaries from closing an otherwise silent connection.
    const heartbeat = setInterval(() => {
      reply.raw.write(`:hb ${Date.now()}\n\n`)
    }, 15000)

    // Registered for both "close" and "error"; every step is safe to repeat.
    const close = () => {
      clearInterval(heartbeat)
      unsubscribe()
      reply.raw.end?.()
    }
    request.raw.on("close", close)
    request.raw.on("error", close)
  })
  app.post<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/audio", async (request, reply) => {
    try {
      const body = RealtimeAudioBodySchema.parse(request.body ?? {})
      realtimeSessions.appendAudio(request.params.sessionId, body.audioBase64)
      reply.code(204)
      return undefined
    } catch (error) {
      request.log.error({ err: error }, "Failed to append realtime speech audio")
      reply.code(400)
      return { error: error instanceof Error ? error.message : "Failed to append realtime speech audio" }
    }
  })
  app.post<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/finalize", async (request, reply) => {
    try {
      realtimeSessions.finalize(request.params.sessionId)
      reply.code(204)
      return undefined
    } catch (error) {
      request.log.error({ err: error }, "Failed to finalize realtime speech session")
      reply.code(400)
      return { error: error instanceof Error ? error.message : "Failed to finalize realtime speech session" }
    }
  })
  app.delete<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId", async (request, reply) => {
    realtimeSessions.closeSession(request.params.sessionId, "client_closed")
    reply.code(204)
    return undefined
  })
  app.post("/api/speech/transcribe", async (request, reply) => {
    try {
      const body = TranscribeBodySchema.parse(request.body ?? {})
--- a/packages/server/src/speech/providers/openai-compatible.ts
+++ b/packages/server/src/speech/providers/openai-compatible.ts
@@ -20,7 +20,13 @@ export class OpenAICompatibleSpeechProvider {
      provider: settings.provider,
      supportsStt: true,
      supportsTts: true,
      supportsRealtimeTranscription: true,
      realtimeInputFormat: {
        type: "audio/pcm" as const,
        rate: 24000 as const,
      },
      baseUrl: settings.baseUrl,
      realtimeModel: settings.realtimeModel,
      sttModel: settings.sttModel,
      ttsModel: settings.ttsModel,
      ttsVoice: settings.ttsVoice,
--- a/packages/server/src/speech/realtime-session-manager.ts
+++ b/packages/server/src/speech/realtime-session-manager.ts
@@ -0,0 +1,525 @@
 import { randomUUID } from "node:crypto"
 import { WebSocket } from "undici"
 import type { SpeechRealtimeEvent, SpeechRealtimeSessionResponse } from "../api-types"
 import type { Logger } from "../logger"
 import type { SpeechService } from "./service"
 /** Optional hints that are copied into the transcription settings of the `session.update` event. */
 interface CreateRealtimeSessionOptions {
  language?: string
  prompt?: string
 }
 /** Transcription progress tracked for a single provider item. */
 interface TranscriptItemState {
  /** Id of the preceding item, as reported by the provider (when known). */
  previousItemId?: string
  /** Concatenation of all transcription deltas received so far. */
  partialText: string
  /** Final transcript; stays undefined until the provider has delivered one. */
  finalText?: string
 }
 /** In-memory state of one realtime session and its upstream WebSocket. */
 interface ManagedRealtimeSession {
  id: string
  /** Upstream provider connection. */
  ws: WebSocket
  /** Callbacks that receive every event emitted for this session. */
  subscribers: Set<(event: SpeechRealtimeEvent) => void>
  /** Per-item transcription state keyed by item id. */
  items: Map<string, TranscriptItemState>
  /** Item ids in the order they were registered; final transcripts are released in this order. */
  orderedItemIds: string[]
  /** Index into `orderedItemIds` of the next item whose final transcript has not been emitted yet. */
  nextFinalIndex: number
  createdAt: number
  /** Timestamp (ms) of the last activity; used by the idle sweep. */
  lastActivityAt: number
  /** Set once the session has been closed; closed sessions are treated as not found. */
  closed: boolean
 }
 // Maximum time to wait for the upstream WebSocket to open.
 const OPEN_TIMEOUT_MS = 10_000
 // Sessions without activity for this long are closed by the sweep.
 const IDLE_TIMEOUT_MS = 2 * 60 * 1000
 // Interval between idle-session sweeps.
 const SWEEP_INTERVAL_MS = 30_000
 /**
  * Tracks realtime transcription sessions, each backed by one upstream provider WebSocket.
  *
  * Provider events are translated into `SpeechRealtimeEvent`s and fanned out to the session's
  * subscribers. Final transcripts are released strictly in the order in which items were
  * registered in `orderedItemIds`, and sessions without activity for `IDLE_TIMEOUT_MS` are closed
  * by a periodic sweep.
  */
 export class SpeechRealtimeSessionManager {
  private readonly sessions = new Map<string, ManagedRealtimeSession>()
  private readonly sweepTimer: NodeJS.Timeout

  constructor(
    private readonly speechService: SpeechService,
    private readonly logger: Logger,
  ) {
    this.sweepTimer = setInterval(() => {
      this.sweepIdleSessions()
    }, SWEEP_INTERVAL_MS)
    // Do not let the sweep timer keep the process alive on its own.
    this.sweepTimer.unref?.()
  }

  /**
   * Opens an upstream WebSocket, waits for it to connect and sends the initial `session.update`.
   *
   * @param options Optional language/prompt hints for the transcription.
   * @returns The new session id and the expected input audio format.
   * @throws When the speech service is not configured, or the socket fails to open in time; the
   *   half-created session is closed before the error is rethrown.
   */
  async createSession(options: CreateRealtimeSessionOptions = {}): Promise<SpeechRealtimeSessionResponse> {
    const config = this.speechService.getRealtimeTranscriptionConfig()
    const id = randomUUID()
    const wsUrl = buildRealtimeWebSocketUrl(config.baseUrl, config.realtimeModel)
    const sessionUpdateEvent = buildSessionUpdateEvent(config, options)

    this.logger.info(
      {
        sessionId: id,
        wsUrl,
        realtimeModel: config.realtimeModel,
        sttModel: config.sttModel,
        payload: sessionUpdateEvent,
      },
      "Opening realtime speech websocket",
    )

    const ws = new WebSocket(wsUrl, {
      headers: {
        Authorization: `Bearer ${config.apiKey}`,
        ...(requiresRealtimeBetaHeader(config.baseUrl) ? { "OpenAI-Beta": "realtime=v1" } : {}),
      },
    })

    const session: ManagedRealtimeSession = {
      id,
      ws,
      subscribers: new Set(),
      items: new Map(),
      orderedItemIds: [],
      nextFinalIndex: 0,
      createdAt: Date.now(),
      lastActivityAt: Date.now(),
      closed: false,
    }

    // Register and wire the session before awaiting so no early socket event is missed.
    this.sessions.set(id, session)
    this.attachSocketHandlers(session)

    try {
      await waitForSocketOpen(ws)
      this.send(session, sessionUpdateEvent)
      return {
        sessionId: id,
        inputFormat: config.inputFormat,
      }
    } catch (error) {
      this.logger.error({ sessionId: id, err: error }, "Failed to create realtime speech session")
      this.closeSession(id, error instanceof Error ? error.message : "Failed to create realtime speech session")
      throw error
    }
  }

  /**
   * Registers a subscriber for a session and immediately (synchronously) sends it `session.ready`.
   *
   * @returns A function that removes the subscriber again.
   * @throws When the session does not exist or is already closed.
   */
  subscribe(sessionId: string, send: (event: SpeechRealtimeEvent) => void): () => void {
    const session = this.getSession(sessionId)
    if (!session) {
      throw new Error("Realtime speech session not found")
    }

    session.subscribers.add(send)
    this.touch(session)
    send({ type: "session.ready", sessionId })

    return () => {
      session.subscribers.delete(send)
      this.touch(session)
    }
  }

  /**
   * Forwards a base64 audio chunk to the provider as `input_audio_buffer.append`.
   *
   * @throws When the session is unknown/closed or its socket is not open.
   */
  appendAudio(sessionId: string, audioBase64: string): void {
    const session = this.requireSession(sessionId)
    this.send(session, {
      type: "input_audio_buffer.append",
      audio: audioBase64,
    })
  }

  /**
   * Sends `input_audio_buffer.commit` to the provider for the given session.
   *
   * @throws When the session is unknown/closed or its socket is not open.
   */
  finalize(sessionId: string): void {
    const session = this.requireSession(sessionId)
    this.send(session, {
      type: "input_audio_buffer.commit",
    })
  }

  /**
   * Closes a session: removes it from the registry, notifies subscribers with `session.closed`,
   * closes the socket if it is still open or connecting, and drops all subscribers.
   * Unknown or already closed sessions are ignored.
   */
  closeSession(sessionId: string, reason?: string): void {
    const session = this.sessions.get(sessionId)
    if (!session || session.closed) return

    session.closed = true
    this.sessions.delete(sessionId)
    this.emit(session, { type: "session.closed", reason })

    try {
      if (session.ws.readyState === WebSocket.OPEN || session.ws.readyState === WebSocket.CONNECTING) {
        session.ws.close(1000, this.toCloseReason(reason))
      }
    } catch (error) {
      this.logger.warn({ sessionId, err: error }, "Failed to close realtime speech websocket")
    }

    session.subscribers.clear()
  }

  /** Stops the idle sweep and closes every remaining session with reason `server_shutdown`. */
  async dispose(): Promise<void> {
    clearInterval(this.sweepTimer)
    for (const sessionId of Array.from(this.sessions.keys())) {
      this.closeSession(sessionId, "server_shutdown")
    }
  }

  /**
   * Builds a close reason that is safe to pass to `WebSocket.close`.
   *
   * The reason of a close frame is limited to 123 bytes of UTF-8. Truncating by UTF-16 code units
   * alone is not enough for non-ASCII text, and an oversized reason makes `close` throw, which
   * would leave the socket open. The text is therefore trimmed until it fits the byte limit.
   */
  private toCloseReason(reason: string | undefined): string {
    let text = (reason ?? "client_closed").slice(0, 120)
    while (Buffer.byteLength(text, "utf8") > 123) {
      text = text.slice(0, -1)
    }
    return text
  }

  /** Wires message, error and close listeners of the upstream socket to this manager. */
  private attachSocketHandlers(session: ManagedRealtimeSession) {
    session.ws.addEventListener("message", (event) => {
      void this.handleSocketMessage(session, event.data)
    })

    session.ws.addEventListener("error", (event) => {
      const message = event.error instanceof Error ? event.error.message : event.message || "Realtime speech connection failed"
      this.logger.warn({ sessionId: session.id, err: event.error ?? event.message }, "Realtime speech websocket error")
      this.emit(session, { type: "session.error", message })
    })

    session.ws.addEventListener("close", (event) => {
      const reason = event.reason || (event.wasClean ? "socket_closed" : "socket_terminated")
      this.logger.info(
        {
          sessionId: session.id,
          code: event.code,
          reason,
          orderedItemIds: session.orderedItemIds,
          pendingItems: Array.from(session.items.entries()).map(([itemId, item]) => ({
            itemId,
            previousItemId: item.previousItemId,
            partialText: item.partialText,
            finalText: item.finalText,
          })),
        },
        "Realtime speech websocket closed",
      )
      this.closeSession(session.id, reason)
    })
  }

  /** Decodes one raw socket message as JSON and dispatches it; failures are logged, not thrown. */
  private async handleSocketMessage(session: ManagedRealtimeSession, raw: unknown) {
    if (session.closed) return
    try {
      const payload = await toText(raw)
      const event = JSON.parse(payload) as Record<string, unknown>
      this.touch(session)
      this.handleServerEvent(session, event)
    } catch (error) {
      this.logger.warn({ sessionId: session.id, err: error }, "Failed to process realtime speech event")
    }
  }

  /** Translates a provider event into session state updates and subscriber events. */
  private handleServerEvent(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    const type = typeof event.type === "string" ? event.type : ""
    if (!type) return

    this.logger.debug({ sessionId: session.id, type }, "Realtime speech event received")
    if (type.startsWith("conversation.item") || type.startsWith("input_audio_buffer") || type.startsWith("session.")) {
      this.logger.debug({ sessionId: session.id, event }, "Realtime speech event payload")
    }

    if (type === "error") {
      const message = extractErrorMessage(event)
      this.logger.warn({ sessionId: session.id, event }, "Realtime speech provider error event")
      this.emit(session, { type: "session.error", message })
      return
    }

    if (type === "input_audio_buffer.speech_started") {
      this.emit(session, {
        type: "input.speech_started",
        itemId: readString(event.item_id),
      })
      return
    }

    if (type === "input_audio_buffer.speech_stopped") {
      this.emit(session, {
        type: "input.speech_stopped",
        itemId: readString(event.item_id),
      })
      return
    }

    if (type === "input_audio_buffer.committed") {
      const itemId = readString(event.item_id)
      if (!itemId) return
      const item = this.getOrCreateItem(session, itemId)
      item.previousItemId = readString(event.previous_item_id)
      if (!session.orderedItemIds.includes(itemId)) {
        session.orderedItemIds.push(itemId)
      }
      this.flushFinalizedItems(session)
      return
    }

    if (type === "conversation.item.created" || type === "conversation.item.added" || type === "conversation.item.done") {
      this.handleConversationItemEvent(session, event)
      return
    }

    if (type === "conversation.item.input_audio_transcription.delta") {
      const itemId = readString(event.item_id)
      const delta = readString(event.delta)
      if (!itemId || !delta) return
      const item = this.getOrCreateItem(session, itemId)
      item.partialText += delta
      this.emit(session, {
        type: "transcript.partial",
        itemId,
        text: item.partialText,
      })
      return
    }

    if (type === "conversation.item.input_audio_transcription.failed") {
      // A failed item never receives a `completed` event. Without a final text it would sit at the
      // head of `orderedItemIds` forever and block the final transcript of every later item, so it
      // is finalized with whatever partial text was collected (possibly empty).
      const itemId = readString(event.item_id)
      if (!itemId) return
      this.logger.warn({ sessionId: session.id, itemId, event }, "Realtime speech transcription failed")
      const item = this.getOrCreateItem(session, itemId)
      if (item.finalText === undefined) {
        item.finalText = item.partialText
      }
      this.flushFinalizedItems(session)
      return
    }

    if (type === "conversation.item.input_audio_transcription.completed") {
      const itemId = readString(event.item_id)
      if (!itemId) return
      const item = this.getOrCreateItem(session, itemId)
      // Fall back to the accumulated deltas when the event carries no transcript.
      item.finalText = readString(event.transcript) ?? item.partialText
      this.flushFinalizedItems(session)
    }
  }

  /** Registers the item of a `conversation.item.*` event and records its transcript if it has one. */
  private handleConversationItemEvent(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    const itemRecord = asRecord(event.item)
    if (!itemRecord) return

    const itemId = readString(itemRecord.id) ?? readString(event.item_id)
    if (!itemId) return

    const item = this.getOrCreateItem(session, itemId)
    item.previousItemId = readString(event.previous_item_id) ?? item.previousItemId
    if (!session.orderedItemIds.includes(itemId)) {
      session.orderedItemIds.push(itemId)
    }

    const transcript = extractTranscriptFromConversationItem(itemRecord)
    if (transcript) {
      item.finalText = transcript
      this.flushFinalizedItems(session)
    }
  }

  /**
   * Emits `transcript.final` for consecutive finalized items, starting at `nextFinalIndex`, and
   * stops at the first item that has no final text yet so that ordering is preserved.
   */
  private flushFinalizedItems(session: ManagedRealtimeSession) {
    while (session.nextFinalIndex < session.orderedItemIds.length) {
      const itemId = session.orderedItemIds[session.nextFinalIndex]
      const item = session.items.get(itemId)
      if (!item || item.finalText === undefined) {
        return
      }

      this.emit(session, {
        type: "transcript.final",
        itemId,
        previousItemId: item.previousItemId,
        text: item.finalText,
      })
      session.nextFinalIndex += 1
    }
  }

  /** Returns the state for an item, creating an empty one on first use. */
  private getOrCreateItem(session: ManagedRealtimeSession, itemId: string): TranscriptItemState {
    const existing = session.items.get(itemId)
    if (existing) return existing
    const created: TranscriptItemState = { partialText: "" }
    session.items.set(itemId, created)
    return created
  }

  /** Delivers an event to every subscriber; a throwing subscriber is logged and does not affect the others. */
  private emit(session: ManagedRealtimeSession, event: SpeechRealtimeEvent) {
    for (const subscriber of session.subscribers) {
      try {
        subscriber(event)
      } catch (error) {
        this.logger.warn({ sessionId: session.id, err: error, type: event.type }, "Failed to emit realtime speech event")
      }
    }
  }

  /** Like `getSession`, but throws when the session is missing or closed. */
  private requireSession(sessionId: string): ManagedRealtimeSession {
    const session = this.getSession(sessionId)
    if (!session) {
      throw new Error("Realtime speech session not found")
    }
    return session
  }

  /** Looks up an open session; returns null for unknown or closed sessions. */
  private getSession(sessionId: string): ManagedRealtimeSession | null {
    const session = this.sessions.get(sessionId) ?? null
    if (!session || session.closed) return null
    return session
  }

  /**
   * Sends a JSON-encoded event over the session socket and records the activity.
   *
   * @throws When the session is closed or the socket is not open.
   */
  private send(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    if (session.closed || session.ws.readyState !== WebSocket.OPEN) {
      throw new Error("Realtime speech session is not connected")
    }
    session.ws.send(JSON.stringify(event))
    this.touch(session)
  }

  /** Marks the session as active now. */
  private touch(session: ManagedRealtimeSession) {
    session.lastActivityAt = Date.now()
  }

  /** Closes every session whose last activity is at least `IDLE_TIMEOUT_MS` ago. */
  private sweepIdleSessions() {
    const now = Date.now()
    for (const [sessionId, session] of this.sessions) {
      if (session.closed) continue
      if (now - session.lastActivityAt < IDLE_TIMEOUT_MS) continue
      this.logger.info({ sessionId }, "Closing idle realtime speech session")
      this.closeSession(sessionId, "idle_timeout")
    }
  }
 }
 /**
  * Derives the realtime WebSocket URL from the configured HTTP(S) base URL.
  *
  * - `http:` maps to `ws:` and `https:` maps to `wss:`.
  * - A base URL that already uses `ws:` or `wss:` keeps its scheme (a plain `ws:` endpoint is no
  *   longer silently upgraded to `wss:`).
  * - `/realtime` is appended to the path unless the path already ends with it.
  * - The `model` query parameter is added only when the base URL does not already define one.
  *
  * @param baseUrl Provider base URL; blank or undefined falls back to the OpenAI API.
  * @param model Realtime model used for the `model` query parameter.
  * @returns The absolute WebSocket URL as a string.
  * @throws TypeError when `baseUrl` is not a parseable absolute URL.
  */
 function buildRealtimeWebSocketUrl(baseUrl: string | undefined, model: string): string {
  const target = new URL(baseUrl?.trim() || "https://api.openai.com/v1")

  if (target.protocol === "http:") {
    target.protocol = "ws:"
  } else if (target.protocol !== "ws:" && target.protocol !== "wss:") {
    target.protocol = "wss:"
  }

  const normalizedPath = target.pathname.replace(/\/+$/, "")
  target.pathname = normalizedPath.endsWith("/realtime") ? normalizedPath : `${normalizedPath}/realtime`
  target.hash = ""

  if (!target.searchParams.has("model")) {
    target.searchParams.set("model", model)
  }
  return target.toString()
 }
 /**
  * Reports whether the base URL points at a host other than `api.openai.com`.
  * A missing, blank or unparseable base URL yields `false`.
  */
 function requiresRealtimeBetaHeader(baseUrl?: string): boolean {
  const candidate = baseUrl ?? ""
  if (candidate.trim().length === 0) {
    return false
  }

  let hostname: string
  try {
    hostname = new URL(candidate).hostname
  } catch {
    return false
  }
  return hostname.toLowerCase() !== "api.openai.com"
 }
 /**
  * Builds the `session.update` event sent right after the socket opens.
  *
  * Two payload shapes exist: a flat one (`input_audio_transcription` / `turn_detection`) used when
  * `requiresRealtimeBetaHeader` is true for the base URL, and a nested `audio.input` one otherwise.
  * Both carry the same transcription settings and server-VAD parameters.
  */
 function buildSessionUpdateEvent(
  config: { baseUrl?: string; sttModel: string; realtimeModel: string; inputFormat: { type: "audio/pcm"; rate: 24000 } },
  options: CreateRealtimeSessionOptions,
 ): Record<string, unknown> {
  // Optional hints are only included when they are non-empty.
  const transcription: Record<string, unknown> = { model: config.sttModel }
  if (options.language) {
    transcription.language = options.language
  }
  if (options.prompt) {
    transcription.prompt = options.prompt
  }

  const turnDetection = {
    type: "server_vad",
    threshold: 0.45,
    prefix_padding_ms: 250,
    silence_duration_ms: 400,
  }

  const useFlatShape = requiresRealtimeBetaHeader(config.baseUrl)
  const session: Record<string, unknown> = useFlatShape
    ? {
        input_audio_transcription: transcription,
        turn_detection: turnDetection,
      }
    : {
        type: "transcription",
        audio: {
          input: {
            format: config.inputFormat,
            noise_reduction: { type: "near_field" },
            transcription,
            turn_detection: turnDetection,
          },
        },
      }

  return { type: "session.update", session }
 }
 /**
  * Resolves once the socket is open. Rejects when the socket reports an error, closes first, or
  * does not open within `OPEN_TIMEOUT_MS`. All temporary listeners and the timer are removed as
  * soon as the promise settles.
  */
 function waitForSocketOpen(ws: WebSocket): Promise<void> {
  if (ws.readyState === WebSocket.OPEN) {
    return Promise.resolve()
  }

  return new Promise<void>((resolve, reject) => {
    let done = false

    const onOpen = () => {
      settle(resolve)
    }
    const onError = (event: { error?: unknown; message?: string }) => {
      const failure = event.error instanceof Error ? event.error : new Error(event.message || "Failed to connect")
      settle(() => reject(failure))
    }
    const onClose = () => {
      settle(() => reject(new Error("Realtime speech connection closed before initialization")))
    }

    const timer = setTimeout(() => {
      settle(() => reject(new Error("Timed out connecting to realtime speech provider")))
    }, OPEN_TIMEOUT_MS)

    // Runs the outcome exactly once, after detaching everything this function installed.
    function settle(outcome: () => void) {
      if (done) return
      done = true
      clearTimeout(timer)
      ws.removeEventListener("open", onOpen)
      ws.removeEventListener("error", onError)
      ws.removeEventListener("close", onClose)
      outcome()
    }

    ws.addEventListener("open", onOpen)
    ws.addEventListener("error", onError as any)
    ws.addEventListener("close", onClose)
  })
 }
 /**
  * Converts a socket message payload to a string.
  * Strings pass through; ArrayBuffer, typed-array/DataView and Blob payloads are decoded as UTF-8;
  * anything else is stringified, with null/undefined becoming the empty string.
  */
 async function toText(data: unknown): Promise<string> {
  if (typeof data === "string") {
    return data
  }

  let bytes: Buffer | undefined
  if (data instanceof ArrayBuffer) {
    bytes = Buffer.from(data)
  } else if (ArrayBuffer.isView(data)) {
    // Respect the view window instead of decoding the whole backing buffer.
    bytes = Buffer.from(data.buffer, data.byteOffset, data.byteLength)
  } else if (typeof Blob !== "undefined" && data instanceof Blob) {
    bytes = Buffer.from(await data.arrayBuffer())
  }

  return bytes !== undefined ? bytes.toString("utf-8") : String(data ?? "")
 }
 /**
  * Picks a human-readable message from a provider error event: `event.error.message` first, then
  * `event.message`, then a generic fallback.
  */
 function extractErrorMessage(event: Record<string, unknown>): string {
  const details = event.error
  const nested = details && typeof details === "object" ? readString((details as Record<string, unknown>).message) : undefined
  if (nested) {
    return nested
  }
  return readString(event.message) ?? "Realtime speech request failed"
 }
 /** Returns the value when it is a non-empty string, otherwise undefined. */
 function readString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined
  }
  return value === "" ? undefined : value
 }
 /** Returns the value typed as a record when it is a non-null, non-array object; otherwise null. */
 function asRecord(value: unknown): Record<string, unknown> | null {
  if (!value || typeof value !== "object") {
    return null
  }
  if (Array.isArray(value)) {
    return null
  }
  return value as Record<string, unknown>
 }
 /**
  * Finds a transcript inside a conversation item.
  * Checks `item.transcript` and `item.text` first, then each entry of `item.content` in order,
  * looking at `transcript`, `text` and `audio.transcript`. Returns undefined when nothing is found.
  */
 function extractTranscriptFromConversationItem(item: Record<string, unknown>): string | undefined {
  const topLevel = readString(item.transcript) ?? readString(item.text)
  if (topLevel) {
    return topLevel
  }

  if (!Array.isArray(item.content)) {
    return undefined
  }

  for (const entry of item.content) {
    const part = asRecord(entry)
    if (!part) continue

    const candidates = [part.transcript, part.text, asRecord(part.audio)?.transcript]
    for (const candidate of candidates) {
      const text = readString(candidate)
      if (text) {
        return text
      }
    }
  }
  return undefined
 }
--- a/packages/server/src/speech/service.ts
+++ b/packages/server/src/speech/service.ts
@@ -10,6 +10,8 @@ const ServerSpeechSettingsSchema = z.object({
      provider: z.string().optional(),
      apiKey: z.string().optional(),
      baseUrl: z.string().optional(),
      useRealtime: z.boolean().optional(),
      realtimeModel: z.string().optional(),
      sttModel: z.string().optional(),
      ttsModel: z.string().optional(),
      ttsVoice: z.string().optional(),
@@ -40,12 +42,26 @@ export interface NormalizedSpeechSettings {
  provider: string
  apiKey?: string
  baseUrl?: string
  realtimeModel: string
  sttModel: string
  ttsModel: string
  ttsVoice: string
 }
 /**
  * Resolved settings needed to open a realtime transcription connection.
  * Unlike `NormalizedSpeechSettings`, `apiKey` is required here.
  */
 export interface RealtimeTranscriptionConfig {
  provider: string
  apiKey: string
  baseUrl?: string
  realtimeModel: string
  sttModel: string
  /** Audio format descriptor for the realtime input (PCM, rate 24000). */
  inputFormat: {
    type: "audio/pcm"
    rate: 24000
  }
 }
 const DEFAULT_PROVIDER = "openai-compatible"
 const DEFAULT_REALTIME_MODEL = "gpt-realtime"
 const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
 const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
 const DEFAULT_TTS_VOICE = "alloy"
@@ -67,6 +83,25 @@ export class SpeechService {
    return this.createProvider().synthesize(input)
  }
  /**
   * Resolves the settings required for realtime transcription.
   *
   * @throws When no API key is configured.
   */
  getRealtimeTranscriptionConfig(): RealtimeTranscriptionConfig {
    const { provider, apiKey, baseUrl, realtimeModel, sttModel } = this.resolveSettings()
    if (!apiKey) {
      throw new Error("Speech provider is not configured. Add an API key in Speech settings.")
    }

    return {
      provider,
      apiKey,
      baseUrl,
      realtimeModel,
      sttModel,
      inputFormat: { type: "audio/pcm", rate: 24000 },
    }
  }
  private createProvider(): SpeechProvider {
    const settings = this.resolveSettings()
    return new OpenAICompatibleSpeechProvider({
@@ -83,6 +118,7 @@ export class SpeechService {
      provider: speech.provider?.trim() || DEFAULT_PROVIDER,
      apiKey: speech.apiKey?.trim() || process.env.OPENAI_API_KEY,
      baseUrl: speech.baseUrl?.trim() || process.env.OPENAI_BASE_URL || undefined,
      realtimeModel: speech.realtimeModel?.trim() || DEFAULT_REALTIME_MODEL,
      sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
      ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
      ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
--- a/packages/ui/src/components/prompt-input.tsx
+++ b/packages/ui/src/components/prompt-input.tsx
@@ -11,7 +11,7 @@ import { getCommands } from "../stores/commands"
 import { showAlertDialog } from "../stores/alerts"
 import { useI18n } from "../lib/i18n"
 import { getLogger } from "../lib/logger"
-import { preferences } from "../stores/preferences"
+import { preferences, useConfig } from "../stores/preferences"
 import type { ExpandState, PromptInputApi, PromptInputProps, PromptInsertMode, PromptMode } from "./prompt-input/types"
 import { usePromptState } from "./prompt-input/usePromptState"
 import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
@@ -22,6 +22,7 @@ const log = getLogger("actions")
 export default function PromptInput(props: PromptInputProps) {
  const { t } = useI18n()
  const { serverSettings } = useConfig()
  const [, setIsFocused] = createSignal(false)
  const [mode, setMode] = createSignal<PromptMode>("normal")
  const [expandState, setExpandState] = createSignal<ExpandState>("normal")
@@ -418,6 +419,7 @@ export default function PromptInput(props: PromptInputProps) {
    getTextarea: () => textareaRef ?? null,
    enabled: () => preferences().showPromptVoiceInput,
    disabled: () => Boolean(props.disabled),
    useRealtime: () => serverSettings().speech.useRealtime,
  })
  const showVoiceInput = () =>
    preferences().showPromptVoiceInput &&
--- a/packages/ui/src/components/prompt-input/createRealtimePcmStream.ts
+++ b/packages/ui/src/components/prompt-input/createRealtimePcmStream.ts
@@ -0,0 +1,110 @@
 /** Handle for a running microphone capture. */
 export interface RealtimePcmStreamHandle {
  /** Stops the capture and releases the audio resources; repeated calls are no-ops. */
  stop(): Promise<void>
 }
 /** Options for `createRealtimePcmStream`. */
 interface CreateRealtimePcmStreamOptions {
  /** Called with each captured chunk, encoded as base64 16-bit PCM. */
  onChunk: (audioBase64: string) => void | Promise<void>
 }
 // Sample rate the captured audio is converted to before it is encoded.
 const TARGET_SAMPLE_RATE = 24000
 // Buffer size passed to createScriptProcessor.
 const PROCESSOR_BUFFER_SIZE = 4096
 /**
  * Captures microphone audio and delivers it as base64-encoded 16-bit PCM chunks at
  * `TARGET_SAMPLE_RATE`.
  *
  * If anything fails after microphone access was granted (no AudioContext support, the context
  * cannot be created or resumed, the audio graph cannot be built), the microphone tracks are
  * stopped and the context is closed before the error is rethrown, so a failed start never leaves
  * the microphone active.
  *
  * @param options.onChunk Receives every encoded chunk.
  * @returns A handle whose `stop()` tears the capture down.
  * @throws When microphone access is rejected or the audio pipeline cannot be set up.
  */
 export async function createRealtimePcmStream(
  options: CreateRealtimePcmStreamOptions,
 ): Promise<RealtimePcmStreamHandle> {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  })

  let audioContext: AudioContext | undefined
  try {
    const AudioContextCtor = window.AudioContext || (window as any).webkitAudioContext
    if (!AudioContextCtor) {
      throw new Error("AudioContext is not supported in this browser.")
    }

    const context: AudioContext = new AudioContextCtor()
    audioContext = context
    await context.resume()

    const source = context.createMediaStreamSource(stream)
    const processor = context.createScriptProcessor(PROCESSOR_BUFFER_SIZE, 1, 1)
    // The processor is routed through a muted gain node into the destination so the graph is
    // connected end to end without playing the microphone back.
    const sink = context.createGain()
    sink.gain.value = 0

    source.connect(processor)
    processor.connect(sink)
    sink.connect(context.destination)

    processor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0)
      const resampled = downsampleBuffer(input, context.sampleRate, TARGET_SAMPLE_RATE)
      if (resampled.length === 0) return
      const pcm16 = floatTo16BitPcm(resampled)
      void options.onChunk(base64EncodePcm16(pcm16))
    }

    let stopped = false

    return {
      async stop() {
        if (stopped) return
        stopped = true
        processor.onaudioprocess = null
        source.disconnect()
        processor.disconnect()
        sink.disconnect()
        stream.getTracks().forEach((track) => track.stop())
        await context.close()
      },
    }
  } catch (error) {
    stream.getTracks().forEach((track) => track.stop())
    if (audioContext && audioContext.state !== "closed") {
      // Best-effort cleanup: the setup error below is the one the caller needs to see.
      await audioContext.close().catch(() => undefined)
    }
    throw error
  }
 }
 /**
  * Converts a mono sample buffer from `inputSampleRate` to `outputSampleRate`.
  *
  * Each output sample is the average of the input samples that fall into its window. When a window
  * contains no input sample (output rate higher than input rate) the nearest earlier input sample
  * is repeated. Equal rates return a copy of the input.
  *
  * An empty input yields an empty output. Previously a single NaN sample was produced, because the
  * output length was forced to at least one and the fallback read past the empty buffer.
  *
  * @returns A newly allocated buffer; the input is never modified.
  */
 function downsampleBuffer(buffer: Float32Array, inputSampleRate: number, outputSampleRate: number): Float32Array {
  if (buffer.length === 0) {
    return new Float32Array(0)
  }
  if (inputSampleRate === outputSampleRate) {
    return buffer.slice()
  }

  const sampleRateRatio = inputSampleRate / outputSampleRate
  const outputLength = Math.max(1, Math.round(buffer.length / sampleRateRatio))
  const output = new Float32Array(outputLength)

  let inputIndex = 0
  for (let outputIndex = 0; outputIndex < outputLength; outputIndex += 1) {
    const nextInputIndex = Math.min(buffer.length, Math.round((outputIndex + 1) * sampleRateRatio))

    let sum = 0
    let count = 0
    for (let i = inputIndex; i < nextInputIndex; i += 1) {
      sum += buffer[i]
      count += 1
    }

    output[outputIndex] = count > 0 ? sum / count : buffer[Math.min(buffer.length - 1, inputIndex)]
    inputIndex = nextInputIndex
  }
  return output
 }
 /**
  * Converts float samples to signed 16-bit PCM.
  * Samples are clamped to [-1, 1]; negative values scale by 0x8000 and non-negative ones by 0x7fff.
  */
 function floatTo16BitPcm(buffer: Float32Array): Int16Array {
  return Int16Array.from(buffer, (value) => {
    const clamped = Math.max(-1, Math.min(1, value))
    const scale = clamped < 0 ? 0x8000 : 0x7fff
    return Math.round(clamped * scale)
  })
 }
 /**
  * Base64-encodes the bytes of a 16-bit PCM buffer.
  *
  * Only the bytes covered by the given array are encoded. The byte view honors `byteOffset` and
  * `byteLength`, so a subarray/view over a larger backing buffer no longer leaks the rest of it.
  */
 function base64EncodePcm16(buffer: Int16Array): string {
  const bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength)
  let binary = ""
  // Convert in slices so the spread below never exceeds the engine's argument limit.
  const chunkSize = 0x8000
  for (let offset = 0; offset < bytes.length; offset += chunkSize) {
    const chunk = bytes.subarray(offset, offset + chunkSize)
    binary += String.fromCharCode(...chunk)
  }
  return btoa(binary)
 }
--- a/packages/ui/src/components/prompt-input/promptVoiceInsertion.ts
+++ b/packages/ui/src/components/prompt-input/promptVoiceInsertion.ts
@@ -0,0 +1,36 @@
 /** Snapshot of a prompt and the character range that a voice transcript will replace. */
 export interface PromptVoiceAnchor {
  /** Prompt text as it was when the anchor was created. */
  prompt: string
  /** Offset where the replaced range begins; text before it is kept. */
  start: number
  /** Offset where the replaced range ends; text from here onward is kept. */
  end: number
 }
 /** Bundles a prompt snapshot and a selection range into a {@link PromptVoiceAnchor}. */
 export function createPromptVoiceAnchor(prompt: string, start: number, end: number): PromptVoiceAnchor {
  const anchor: PromptVoiceAnchor = { prompt: prompt, start: start, end: end }
  return anchor
 }
 /**
  * Replaces the anchored range of the anchored prompt with `insertedText`.
  *
  * The transcript is trimmed; a single space is added on either side when the
  * neighbouring text is non-empty and does not already provide whitespace. A blank
  * transcript simply removes the anchored range.
  *
  * @returns The new prompt value and the cursor offset just after the inserted transcript.
  */
 export function buildPromptWithInsertedTranscript(anchor: PromptVoiceAnchor, insertedText: string): { value: string; cursor: number } {
  const { prompt, start, end } = anchor
  const head = prompt.slice(0, start)
  const tail = prompt.slice(end)
  const transcript = insertedText.trim()
  if (transcript.length === 0) {
    return { value: head + tail, cursor: head.length }
  }
  const needsLeadingSpace = head !== "" && !/\s$/.test(head)
  const needsTrailingSpace = tail !== "" && !/^\s/.test(tail)
  const leading = needsLeadingSpace ? `${head} ` : head
  const trailing = needsTrailingSpace ? ` ${tail}` : tail
  return {
    value: leading + transcript + trailing,
    cursor: leading.length + transcript.length,
  }
 }
 /**
  * Appends a transcript segment to the text accumulated so far.
  *
  * The segment is trimmed first. A blank segment leaves `current` untouched, a blank
  * `current` is replaced by the segment, and otherwise the two are joined with a
  * single space unless `current` already ends in whitespace.
  */
 export function appendVoiceTranscript(current: string, next: string): string {
  const addition = next.trim()
  if (addition === "") {
    return current
  }
  if (current.trim() === "") {
    return addition
  }
  const separator = /\s$/.test(current) ? "" : " "
  return current + separator + addition
 }
--- a/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts
@@ -0,0 +1,241 @@
 import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
 import { showAlertDialog } from "../../stores/alerts"
 import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
 import { serverApi } from "../../lib/api-client"
 import { useI18n } from "../../lib/i18n"
 import { createPromptVoiceAnchor, buildPromptWithInsertedTranscript } from "./promptVoiceInsertion"
 /** Dependencies the buffered voice-input hook needs from the prompt component. */
 interface UsePromptBufferedVoiceInputOptions {
  /** Current prompt text; read when a transcript is inserted. */
  prompt: Accessor<string>
  /** Writes the prompt text after a transcript has been inserted. */
  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  /** Returns the prompt textarea (for selection and focus handling), or null when unavailable. */
  getTextarea: () => HTMLTextAreaElement | null
  /** Feature gate: voice input is only offered while this is true. */
  enabled: Accessor<boolean>
  /** While true, a new recording cannot be started. */
  disabled: Accessor<boolean>
 }
 /** Lifecycle of a buffered recording: idle, capturing audio, or waiting for the transcription. */
 type VoiceInputState = "idle" | "recording" | "transcribing"
 /**
  * Voice input for the prompt in "buffered" mode: audio is recorded with MediaRecorder,
  * sent to the server as one clip when recording stops, and the returned transcript is
  * inserted at the textarea selection.
  *
  * @param options Prompt accessors and gating flags supplied by the prompt component.
  * @returns Reactive state plus the actions/labels the voice button needs.
  */
 export function usePromptBufferedVoiceInput(options: UsePromptBufferedVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<VoiceInputState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)
  let mediaRecorder: MediaRecorder | null = null
  let mediaStream: MediaStream | null = null
  let timerId: number | undefined
  // Whether the clip handled by the recorder's "stop" event should be transcribed.
  // Set to false whenever the recording is cancelled or torn down.
  let shouldTranscribe = true
  let recordedChunks: Blob[] = []
  let recordingStartedAt = 0
  createEffect(() => {
    void loadSpeechCapabilities()
  })
  onCleanup(() => {
    cleanupMedia(false)
  })
  /** True when the browser exposes MediaRecorder and getUserMedia. */
  const isSupported = () => {
    if (typeof window === "undefined") return false
    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
  }
  /** True when the feature is enabled, the browser supports it, and the server reports usable speech-to-text. */
  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt,
    )
  }
  /** Stops an active recording (triggering transcription) or starts a new one when allowed. */
  async function toggleRecording(): Promise<void> {
    if (state() === "recording") {
      stopRecording()
      return
    }
    if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return
    try {
      await startRecording()
    } catch (error) {
      // Reset the state as well: a failure after setState("recording") (for example
      // MediaRecorder.start() throwing) would otherwise leave the hook stuck in
      // "recording" with no recorder left to stop.
      cleanupMedia()
      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }
  /** Ends the recording and moves to "transcribing"; the recorder's "stop" event does the rest. */
  function stopRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = true
    mediaRecorder.stop()
    setState("transcribing")
    stopTimer()
  }
  /** Ends the recording and discards the audio without transcribing it. */
  function cancelRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = false
    mediaRecorder.stop()
    cleanupMedia(false)
  }
  /** Requests microphone access and starts capturing audio chunks. */
  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }
    recordedChunks = []
    shouldTranscribe = true
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
    mediaRecorder = createRecorder(mediaStream)
    mediaRecorder.addEventListener("dataavailable", (event) => {
      if (event.data.size > 0) {
        recordedChunks.push(event.data)
      }
    })
    mediaRecorder.addEventListener("stop", () => {
      void finalizeRecording()
    })
    recordingStartedAt = Date.now()
    setElapsedMs(0)
    setState("recording")
    startTimer()
    mediaRecorder.start()
  }
  /** Runs on the recorder's "stop" event: transcribes the clip (unless discarded) and returns to idle. */
  async function finalizeRecording() {
    const recorder = mediaRecorder
    const stream = mediaStream
    mediaRecorder = null
    mediaStream = null
    if (!shouldTranscribe || recordedChunks.length === 0) {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
      return
    }
    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
    try {
      const audioBlob = new Blob(recordedChunks, { type: mimeType })
      const transcription = await serverApi.transcribeAudio({
        audioBase64: await blobToBase64(audioBlob),
        mimeType,
      })
      if (transcription.text.trim()) {
        insertTranscript(transcription.text.trim())
      }
    } catch (error) {
      showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    } finally {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
    }
  }
  /** Inserts the transcript at the textarea selection (or at the end when there is no textarea). */
  function insertTranscript(text: string) {
    const current = options.prompt()
    const textarea = options.getTextarea()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    const { value, cursor } = buildPromptWithInsertedTranscript(
      createPromptVoiceAnchor(current, start, end),
      text,
    )
    options.setPrompt(value)
    if (textarea) {
      // Deferred so the selection is applied after the new prompt value has been set.
      setTimeout(() => {
        textarea.focus()
        textarea.setSelectionRange(cursor, cursor)
      }, 0)
    }
  }
  /**
   * Abandons any in-flight recording and releases the microphone.
   *
   * @param resetState When true (default) the hook is also returned to "idle".
   */
  function cleanupMedia(resetState = true) {
    stopTimer()
    // Every caller is abandoning the clip. Stopping the recorder still fires its
    // "dataavailable" and "stop" events afterwards, so make sure that late "stop"
    // handler discards the audio instead of transcribing it (e.g. after unmount).
    shouldTranscribe = false
    if (mediaRecorder && mediaRecorder.state !== "inactive") {
      mediaRecorder.stop()
    }
    mediaRecorder = null
    stopTracks(mediaStream)
    mediaStream = null
    recordedChunks = []
    if (resetState) {
      setState("idle")
      setElapsedMs(0)
    }
  }
  /** Starts (or restarts) the ticker that refreshes the elapsed recording time. */
  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }
  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }
  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "recording",
    isTranscribing: () => state() === "transcribing",
    buttonTitle: () => {
      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
 }
 /**
  * Creates a MediaRecorder for `stream`, preferring the first candidate MIME type the
  * browser confirms it supports.
  *
  * When support cannot be probed (`MediaRecorder.isTypeSupported` is missing) or no
  * candidate is supported, the recorder is created without a MIME type so the browser
  * picks its own default, rather than forcing an unverified type that the constructor
  * may reject.
  */
 function createRecorder(stream: MediaStream): MediaRecorder {
  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
  const canProbe = typeof MediaRecorder.isTypeSupported === "function"
  const supported = canProbe ? candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) : undefined
  return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
 }
 /** Stops every track of `stream`; does nothing when there is no stream. */
 function stopTracks(stream: MediaStream | null) {
  if (!stream) return
  for (const track of stream.getTracks()) {
    track.stop()
  }
 }
 /** Reads the blob's bytes and returns them base64-encoded (no data-URL prefix). */
 async function blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer())
  // btoa expects a "binary string": one character per byte.
  const binary = Array.from(bytes, (byte) => String.fromCharCode(byte)).join("")
  return btoa(binary)
 }
--- a/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts
@@ -0,0 +1,325 @@
 import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
 import { showAlertDialog } from "../../stores/alerts"
 import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client"
 import { useI18n } from "../../lib/i18n"
 import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
 import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream"
 import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion"
 /** Dependencies the realtime voice-input hook needs from the prompt component. */
 interface UsePromptRealtimeVoiceInputOptions {
  /** Current prompt text; snapshotted when a dictation session starts. */
  prompt: Accessor<string>
  /** Writes the prompt; live partial transcripts pass `persistDraft: false`. */
  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  /** Returns the prompt textarea (for selection and focus handling), or null when unavailable. */
  getTextarea: () => HTMLTextAreaElement | null
  /** Feature gate: voice input is only offered while this is true. */
  enabled: Accessor<boolean>
  /** While true, a new dictation session cannot be started. */
  disabled: Accessor<boolean>
 }
 /** Lifecycle of a realtime dictation: idle, setting up, streaming audio, or waiting for final transcripts. */
 type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing"
 /** Delay (ms) scheduled after a finalize request before the session is closed; shortened when a final transcript arrives. */
 const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000
 /**
  * Voice input for the prompt in "realtime" mode: microphone PCM chunks are streamed to
  * a server-side transcription session, and partial/final transcripts received over the
  * session's event stream are rendered into the prompt while the user speaks.
  *
  * The prompt and selection are snapshotted (the "anchor") when a session starts; every
  * render rebuilds the prompt from that snapshot plus the transcript gathered so far.
  *
  * @param options Prompt accessors and gating flags supplied by the prompt component.
  * @returns Reactive state plus the actions/labels the voice button needs.
  */
 export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<RealtimeVoiceState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)
  let activeSessionId: string | null = null
  let eventSource: EventSource | null = null
  let pcmStream: RealtimePcmStreamHandle | null = null
  // Serializes audio uploads so chunks are appended in capture order.
  let audioQueue: Promise<void> = Promise.resolve()
  let timerId: number | undefined
  let recordingStartedAt = 0
  let finalizeTimerId: number | undefined
  let anchor = createPromptVoiceAnchor("", 0, 0)
  let finalTranscript = ""
  let liveTranscript = ""
  let activeLiveItemId: string | null = null
  // True while cleanupSession is running; suppresses re-entrant cleanup and error handling.
  let closing = false
  // Bumped by every start attempt and every cleanup, so a start attempt can detect
  // after an await that it has been cancelled or superseded.
  let sessionGeneration = 0
  createEffect(() => {
    void loadSpeechCapabilities()
  })
  onCleanup(() => {
    cancelRecording()
  })
  /** True when the browser exposes an AudioContext, getUserMedia and EventSource. */
  const isSupported = () => {
    if (typeof window === "undefined") return false
    return Boolean(window.AudioContext || (window as any).webkitAudioContext) && Boolean(navigator.mediaDevices?.getUserMedia) && typeof EventSource !== "undefined"
  }
  /** True when the feature is enabled, the browser supports it, and the server reports realtime transcription. */
  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt &&
        capabilities?.supportsRealtimeTranscription,
    )
  }
  /** Stops an active/connecting session, or starts a new one when allowed. */
  async function toggleRecording(): Promise<void> {
    if (state() === "listening" || state() === "connecting") {
      await stopRecording()
      return
    }
    if (!canUseVoiceInput() || options.disabled() || state() === "finalizing") return
    try {
      await startRecording()
    } catch (error) {
      await cleanupSession({ revertPrompt: true, closeRemote: true })
      showAlertDialog(t("promptInput.voiceInput.error.connection"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }
  /** Creates the remote session, opens its event stream and starts streaming microphone audio. */
  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }
    const generation = ++sessionGeneration
    resetTranscriptState()
    captureAnchor()
    setState("connecting")
    setElapsedMs(0)
    const created = await serverApi.createRealtimeSpeechSession({
      language: detectLanguage(),
    })
    if (generation !== sessionGeneration) {
      // Cancelled or torn down while the session was being created: cleanup never saw
      // this session id, so release the remote session here instead of adopting it.
      void serverApi.closeRealtimeSpeechSession(created.sessionId).catch(() => undefined)
      return
    }
    activeSessionId = created.sessionId
    connectEventStream(created.sessionId)
    const stream = await createRealtimePcmStream({
      onChunk: (audioBase64) => {
        const sessionId = activeSessionId
        if (!sessionId || closing || generation !== sessionGeneration) return
        audioQueue = audioQueue
          .then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 }))
          .catch((error) => {
            handleRealtimeError(error)
          })
      },
    })
    if (generation !== sessionGeneration || state() !== "connecting") {
      // The session was stopped or cleaned up while the microphone was opening. Release
      // the stream right away (otherwise nothing would ever stop it) and leave the
      // current state untouched.
      await stream.stop().catch(() => undefined)
      return
    }
    pcmStream = stream
    recordingStartedAt = Date.now()
    startTimer()
    setState("listening")
  }
  /** Stops capturing, flushes queued audio, asks the server to finalize and schedules the session close. */
  async function stopRecording() {
    const sessionId = activeSessionId
    if (!sessionId || (state() !== "listening" && state() !== "connecting")) return
    setState("finalizing")
    stopTimer()
    if (pcmStream) {
      const stream = pcmStream
      pcmStream = null
      await stream.stop()
    }
    try {
      await audioQueue.catch(() => undefined)
      await serverApi.finalizeRealtimeSpeechSession(sessionId)
      scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS)
    } catch (error) {
      handleRealtimeError(error)
    }
  }
  /** Aborts the current session (if any) and restores the prompt captured when it started. */
  function cancelRecording() {
    void cleanupSession({ revertPrompt: true, closeRemote: true })
  }
  /** Opens the event stream for `sessionId`, replacing any previous one. */
  function connectEventStream(sessionId: string) {
    eventSource?.close()
    eventSource = serverApi.connectRealtimeSpeechEvents(
      sessionId,
      (event) => handleEvent(event),
      () => {
        if (closing) return
        handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection")))
      },
    )
  }
  /** Applies one realtime event to the transcript state and the prompt. */
  function handleEvent(event: SpeechRealtimeEvent) {
    if (event.type === "session.ready") {
      return
    }
    if (event.type === "session.error") {
      handleRealtimeError(new Error(event.message))
      return
    }
    if (event.type === "transcript.partial") {
      activeLiveItemId = event.itemId
      liveTranscript = event.text
      // Partial text is provisional, so it is shown without persisting the draft.
      renderPrompt(false)
      return
    }
    if (event.type === "transcript.final") {
      activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId
      liveTranscript = ""
      finalTranscript = appendVoiceTranscript(finalTranscript, event.text)
      renderPrompt(true)
      if (state() === "finalizing") {
        // A final transcript arrived after stop: close soon instead of waiting out the full timeout.
        scheduleFinalizeClose(250)
      }
      return
    }
    if (event.type === "session.closed") {
      void cleanupSession({ revertPrompt: false, closeRemote: false })
    }
  }
  /** Snapshots the prompt and the textarea selection (or the end of the prompt when there is no textarea). */
  function captureAnchor() {
    const textarea = options.getTextarea()
    const current = options.prompt()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    anchor = createPromptVoiceAnchor(current, start, end)
  }
  /** Rebuilds the prompt from the anchor plus final and live transcript text. */
  function renderPrompt(persistDraft: boolean) {
    const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(finalTranscript && liveTranscript.trim() ? " " : "")
    const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted)
    options.setPrompt(value, persistDraft ? undefined : { persistDraft: false })
    syncTextareaCursor(cursor)
  }
  /** Focuses the textarea and places the caret at `cursor` in a microtask. */
  function syncTextareaCursor(cursor: number) {
    const textarea = options.getTextarea()
    if (!textarea) return
    queueMicrotask(() => {
      const next = options.getTextarea()
      if (!next) return
      next.focus()
      next.setSelectionRange(cursor, cursor)
    })
  }
  /** (Re)schedules closing the session after `delayMs`, keeping the transcript gathered so far. */
  function scheduleFinalizeClose(delayMs: number) {
    if (finalizeTimerId !== undefined) {
      window.clearTimeout(finalizeTimerId)
    }
    finalizeTimerId = window.setTimeout(() => {
      void cleanupSession({ revertPrompt: false, closeRemote: true })
    }, delayMs)
  }
  /**
   * Tears down the current session and returns the hook to "idle".
   *
   * @param cleanupOptions.revertPrompt Restore the prompt captured at session start instead of keeping the transcript.
   * @param cleanupOptions.closeRemote Also ask the server to close the session.
   */
  async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) {
    // Nothing to tear down while idle. Without this guard a cancel issued while idle
    // (for example from onCleanup) would overwrite the prompt with the empty or stale
    // anchor snapshot.
    if (closing || state() === "idle") return
    closing = true
    // Invalidate any start attempt that is still awaiting its session or microphone.
    sessionGeneration += 1
    if (finalizeTimerId !== undefined) {
      window.clearTimeout(finalizeTimerId)
      finalizeTimerId = undefined
    }
    stopTimer()
    const sessionId = activeSessionId
    activeSessionId = null
    eventSource?.close()
    eventSource = null
    if (pcmStream) {
      const stream = pcmStream
      pcmStream = null
      await stream.stop().catch(() => undefined)
    }
    await audioQueue.catch(() => undefined)
    audioQueue = Promise.resolve()
    if (cleanupOptions.closeRemote && sessionId) {
      await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined)
    }
    if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) {
      // No final transcript arrived: keep the last partial text rather than dropping it.
      finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript)
      liveTranscript = ""
    }
    if (cleanupOptions.revertPrompt) {
      options.setPrompt(anchor.prompt)
    } else if (finalTranscript.trim()) {
      renderPrompt(true)
    }
    resetTranscriptState()
    setState("idle")
    setElapsedMs(0)
    closing = false
  }
  function resetTranscriptState() {
    finalTranscript = ""
    liveTranscript = ""
    activeLiveItemId = null
  }
  /** Aborts the session, reverts the prompt and shows the connection error dialog. */
  function handleRealtimeError(error: unknown) {
    if (closing) return
    void cleanupSession({ revertPrompt: true, closeRemote: true })
    showAlertDialog(t("promptInput.voiceInput.error.connection"), {
      title: t("promptInput.voiceInput.error.title"),
      detail: error instanceof Error ? error.message : String(error),
      variant: "error",
    })
  }
  /** Starts (or restarts) the ticker that refreshes the elapsed listening time. */
  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }
  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }
  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "connecting" || state() === "listening",
    isTranscribing: () => state() === "finalizing",
    buttonTitle: () => {
      if (state() === "connecting") return t("promptInput.voiceInput.connecting.title")
      if (state() === "listening") return t("promptInput.voiceInput.stop.title")
      if (state() === "finalizing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
 }
 /**
  * Returns the primary language subtag of `navigator.language` (the part before the
  * first "-"), or undefined when there is no navigator or the subtag is blank.
  */
 function detectLanguage(): string | undefined {
  if (typeof navigator === "undefined") {
    return undefined
  }
  const primarySubtag = navigator.language.split("-")[0]
  const trimmed = primarySubtag?.trim()
  return trimmed ? trimmed : undefined
 }
--- a/packages/ui/src/components/prompt-input/usePromptState.ts
+++ b/packages/ui/src/components/prompt-input/usePromptState.ts
@@ -22,7 +22,7 @@ type HistorySelectOptions = {
 type PromptState = {
  prompt: Accessor<string>
-  setPrompt: (value: string) => void
+  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  clearPrompt: () => void
  draftLoadedNonce: Accessor<number>
@@ -48,11 +48,11 @@ export function usePromptState(options: PromptStateOptions): PromptState {
  const [historyDraft, setHistoryDraft] = createSignal<string | null>(null)
  const [draftLoadedNonce, setDraftLoadedNonce] = createSignal(0)
-  const setPrompt = (value: string) => {
+  const setPrompt = (value: string, setOptions?: { persistDraft?: boolean }) => {
    setPromptInternal(value)
    // Persist drafts only when the user is at the "fresh" position (not browsing history).
    // This keeps the bottom-of-history draft stable even if the user edits recalled history entries.
-    if (historyIndex() === -1) {
+    if (setOptions?.persistDraft !== false && historyIndex() === -1) {
      setSessionDraftPrompt(options.instanceId(), options.sessionId(), value)
    }
  }
--- a/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts
@@ -1,242 +1,30 @@
-import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
+import type { Accessor } from "solid-js"
-import { showAlertDialog } from "../../stores/alerts"
+import { usePromptBufferedVoiceInput } from "./usePromptBufferedVoiceInput"
-import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
+import { usePromptRealtimeVoiceInput } from "./usePromptRealtimeVoiceInput"
 import { serverApi } from "../../lib/api-client"
 import { useI18n } from "../../lib/i18n"
 interface UsePromptVoiceInputOptions {
  prompt: Accessor<string>
-  setPrompt: (value: string) => void
+  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  getTextarea: () => HTMLTextAreaElement | null
  enabled: Accessor<boolean>
  disabled: Accessor<boolean>
  useRealtime: Accessor<boolean>
 }
 type VoiceInputState = "idle" | "recording" | "transcribing"
 export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) {
-  const { t } = useI18n()
+  const buffered = usePromptBufferedVoiceInput(options)
-  const [state, setState] = createSignal<VoiceInputState>("idle")
+  const realtime = usePromptRealtimeVoiceInput(options)
  const [elapsedMs, setElapsedMs] = createSignal(0)
-  let mediaRecorder: MediaRecorder | null = null
+  const active = () => (options.useRealtime() ? realtime : buffered)
  let mediaStream: MediaStream | null = null
  let timerId: number | undefined
  let shouldTranscribe = true
  let recordedChunks: Blob[] = []
  let recordingStartedAt = 0
  createEffect(() => {
    void loadSpeechCapabilities()
  })
  onCleanup(() => {
    cleanupMedia(false)
  })
  const isSupported = () => {
    if (typeof window === "undefined") return false
    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
  }
  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt,
    )
  }
  async function toggleRecording(): Promise<void> {
    if (state() === "recording") {
      stopRecording()
      return
    }
    if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return
    try {
      await startRecording()
    } catch (error) {
      cleanupMedia(false)
      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }
  function stopRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = true
    mediaRecorder.stop()
    setState("transcribing")
    stopTimer()
  }
  function cancelRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = false
    mediaRecorder.stop()
    cleanupMedia(false)
  }
  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }
    recordedChunks = []
    shouldTranscribe = true
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
    mediaRecorder = createRecorder(mediaStream)
    mediaRecorder.addEventListener("dataavailable", (event) => {
      if (event.data.size > 0) {
        recordedChunks.push(event.data)
      }
    })
    mediaRecorder.addEventListener("stop", () => {
      void finalizeRecording()
    })
    recordingStartedAt = Date.now()
    setElapsedMs(0)
    setState("recording")
    startTimer()
    mediaRecorder.start()
  }
  async function finalizeRecording() {
    const recorder = mediaRecorder
    const stream = mediaStream
    mediaRecorder = null
    mediaStream = null
    if (!shouldTranscribe || recordedChunks.length === 0) {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
      return
    }
    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
    try {
      const audioBlob = new Blob(recordedChunks, { type: mimeType })
      const transcription = await serverApi.transcribeAudio({
        audioBase64: await blobToBase64(audioBlob),
        mimeType,
      })
      if (transcription.text.trim()) {
        insertTranscript(transcription.text.trim())
      }
    } catch (error) {
      showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    } finally {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
    }
  }
  function insertTranscript(text: string) {
    const current = options.prompt()
    const textarea = options.getTextarea()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    const before = current.slice(0, start)
    const after = current.slice(end)
    const prefix = before.length > 0 && !/\s$/.test(before) ? " " : ""
    const suffix = after.length > 0 && !/^\s/.test(after) ? " " : ""
    const nextValue = `${before}${prefix}${text}${suffix}${after}`
    const cursor = before.length + prefix.length + text.length
    options.setPrompt(nextValue)
    if (textarea) {
      setTimeout(() => {
        textarea.focus()
        textarea.setSelectionRange(cursor, cursor)
      }, 0)
    }
  }
  function cleanupMedia(resetState = true) {
    stopTimer()
    if (mediaRecorder && mediaRecorder.state !== "inactive") {
      mediaRecorder.stop()
    }
    mediaRecorder = null
    stopTracks(mediaStream)
    mediaStream = null
    recordedChunks = []
    if (resetState) {
      setState("idle")
      setElapsedMs(0)
    }
  }
  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }
  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }
  return {
-    state,
+    state: () => active().state(),
-    elapsedMs,
+    elapsedMs: () => active().elapsedMs(),
-    canUseVoiceInput,
+    canUseVoiceInput: () => active().canUseVoiceInput(),
-    toggleRecording,
+    toggleRecording: () => active().toggleRecording(),
-    cancelRecording,
+    cancelRecording: () => active().cancelRecording(),
-    isRecording: () => state() === "recording",
+    isRecording: () => active().isRecording(),
-    isTranscribing: () => state() === "transcribing",
+    isTranscribing: () => active().isTranscribing(),
-    buttonTitle: () => {
+    buttonTitle: () => active().buttonTitle(),
      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
 }
 /**
  * Creates a MediaRecorder for `stream`, preferring the first candidate MIME type the
  * browser confirms it supports.
  *
  * When support cannot be probed (`MediaRecorder.isTypeSupported` is missing) or no
  * candidate is supported, the recorder is created without a MIME type so the browser
  * picks its own default, rather than forcing an unverified type that the constructor
  * may reject.
  */
 function createRecorder(stream: MediaStream): MediaRecorder {
  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
  const canProbe = typeof MediaRecorder.isTypeSupported === "function"
  const supported = canProbe ? candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) : undefined
  return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
 }
 /** Stops every track of `stream`; does nothing when there is no stream. */
 function stopTracks(stream: MediaStream | null) {
  if (!stream) return
  for (const track of stream.getTracks()) {
    track.stop()
  }
 }
 /** Reads the blob's bytes and returns them base64-encoded (no data-URL prefix). */
 async function blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer())
  // btoa expects a "binary string": one character per byte.
  const binary = Array.from(bytes, (byte) => String.fromCharCode(byte)).join("")
  return btoa(binary)
 }
--- a/packages/ui/src/components/settings/speech-settings-card.tsx
+++ b/packages/ui/src/components/settings/speech-settings-card.tsx
@@ -10,6 +10,8 @@ const log = getLogger("actions")
 type DraftFields = {
  apiKey: string
  baseUrl: string
  useRealtime: boolean
  realtimeModel: string
  sttModel: string
  ttsModel: string
  ttsVoice: string
@@ -19,6 +21,8 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
  return {
    apiKey: speech.apiKey ?? "",
    baseUrl: speech.baseUrl ?? "",
    useRealtime: speech.useRealtime,
    realtimeModel: speech.realtimeModel,
    sttModel: speech.sttModel,
    ttsModel: speech.ttsModel,
    ttsVoice: speech.ttsVoice,
@@ -26,7 +30,7 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
 }
 function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
-  return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
+  return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.useRealtime === b.useRealtime && a.realtimeModel === b.realtimeModel && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
 }
 export const SpeechSettingsCard: Component = () => {
@@ -57,7 +61,7 @@ export const SpeechSettingsCard: Component = () => {
    return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
  }
-  const updateDraft = (key: keyof DraftFields, value: string) => {
+  const updateDraft = <K extends keyof DraftFields>(key: K, value: DraftFields[K]) => {
    setSaveStatus("idle")
    setDrafts((current) => ({ ...current, [key]: value }))
  }
@@ -65,12 +69,14 @@ export const SpeechSettingsCard: Component = () => {
  const isDirty = createMemo(() => {
    const speech = serverSettings().speech
    const current = drafts()
-    return (
+      return (
-      (current.apiKey || "") !== (speech.apiKey || "") ||
+        (current.apiKey || "") !== (speech.apiKey || "") ||
-      (current.baseUrl || "") !== (speech.baseUrl || "") ||
+        (current.baseUrl || "") !== (speech.baseUrl || "") ||
-      current.sttModel !== speech.sttModel ||
+        current.useRealtime !== speech.useRealtime ||
-      current.ttsModel !== speech.ttsModel ||
+        current.realtimeModel !== speech.realtimeModel ||
-      current.ttsVoice !== speech.ttsVoice
+        current.sttModel !== speech.sttModel ||
        current.ttsModel !== speech.ttsModel ||
        current.ttsVoice !== speech.ttsVoice
    )
  })
@@ -90,6 +96,8 @@ export const SpeechSettingsCard: Component = () => {
      await updateSpeechSettings({
        apiKey: current.apiKey.trim() || undefined,
        baseUrl: current.baseUrl.trim() || undefined,
        useRealtime: current.useRealtime,
        realtimeModel: current.realtimeModel.trim() || undefined,
        sttModel: current.sttModel.trim() || undefined,
        ttsModel: current.ttsModel.trim() || undefined,
        ttsVoice: current.ttsVoice.trim() || undefined,
@@ -98,6 +106,8 @@ export const SpeechSettingsCard: Component = () => {
      setDrafts({
        apiKey: current.apiKey.trim(),
        baseUrl: current.baseUrl.trim(),
        useRealtime: current.useRealtime,
        realtimeModel: current.realtimeModel.trim() || serverSettings().speech.realtimeModel,
        sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
        ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
        ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
@@ -159,6 +169,27 @@ export const SpeechSettingsCard: Component = () => {
          onInput={(value) => updateDraft("baseUrl", value)}
          placeholder={t("settings.speech.baseUrl.placeholder")}
        />
        <div class="settings-toggle-row">
          <div>
            <div class="settings-toggle-title">{t("settings.speech.realtime.title")}</div>
            <div class="settings-toggle-caption">{t("settings.speech.realtime.subtitle")}</div>
          </div>
          <label class="settings-checkbox-toggle">
            <input
              type="checkbox"
              checked={drafts().useRealtime}
              onChange={(event) => updateDraft("useRealtime", event.currentTarget.checked)}
            />
            <span>{t("settings.common.enabled")}</span>
          </label>
        </div>
        <Field
          label={t("settings.speech.realtimeModel.title")}
          caption={t("settings.speech.realtimeModel.subtitle")}
          value={drafts().realtimeModel}
          onInput={(value) => updateDraft("realtimeModel", value)}
          placeholder={t("settings.speech.realtimeModel.placeholder")}
        />
        <Field
          label={t("settings.speech.sttModel.title")}
          caption={t("settings.speech.sttModel.subtitle")}
--- a/packages/ui/src/lib/api-client.ts
+++ b/packages/ui/src/lib/api-client.ts
@@ -8,6 +8,8 @@ import type {
  FileSystemListResponse,
  InstanceData,
  SpeechCapabilitiesResponse,
  SpeechRealtimeEvent,
  SpeechRealtimeSessionResponse,
  SpeechSynthesisResponse,
  SpeechTranscriptionResponse,
  ServerMeta,
@@ -39,6 +41,10 @@ export function buildBackgroundProcessStreamUrl(instanceId: string, processId: s
  return buildAbsoluteUrl(`/workspaces/${encodedInstanceId}/plugin/background-processes/${encodedProcessId}/stream`)
 }
 /**
  * Builds the events URL for a realtime speech session.
  *
  * The session id is percent-encoded before being placed in the path, and the
  * resulting path is resolved through `buildAbsoluteUrl`.
  *
  * @param sessionId Identifier of the realtime speech session.
  * @returns The URL produced by `buildAbsoluteUrl` for the session's `/events` path.
  */
 export function buildRealtimeSpeechEventsUrl(sessionId: string): string {
  const encodedSessionId = encodeURIComponent(sessionId)
  return buildAbsoluteUrl(`/api/speech/realtime/sessions/${encodedSessionId}/events`)
 }
 function buildEventsUrl(base: string | undefined, path: string): string {
  if (path.startsWith("http://") || path.startsWith("https://")) {
    return path
@@ -241,6 +247,29 @@ export const serverApi = {
  fetchSpeechCapabilities(): Promise<SpeechCapabilitiesResponse> {
    return request<SpeechCapabilitiesResponse>("/api/speech/capabilities")
  },
  /**
   * Requests a new realtime speech session.
   *
   * POSTs the optional payload as JSON to `/api/speech/realtime/sessions`;
   * when no payload is given an empty object is sent instead.
   *
   * @param payload Optional `language` / `prompt` values forwarded in the request body.
   * @returns The session response resolved by `request`.
   */
  createRealtimeSpeechSession(payload?: { language?: string; prompt?: string }): Promise<SpeechRealtimeSessionResponse> {
    const body = JSON.stringify(payload ?? {})
    return request<SpeechRealtimeSessionResponse>("/api/speech/realtime/sessions", { method: "POST", body })
  },
  /**
   * Posts an audio payload to a realtime speech session.
   *
   * The session id is percent-encoded into the `/audio` path and the payload
   * is sent as the JSON request body.
   *
   * @param sessionId Identifier of the target session.
   * @param payload Object carrying the audio as `audioBase64`.
   */
  appendRealtimeSpeechAudio(sessionId: string, payload: { audioBase64: string }): Promise<void> {
    const encodedSessionId = encodeURIComponent(sessionId)
    const body = JSON.stringify(payload)
    return request(`/api/speech/realtime/sessions/${encodedSessionId}/audio`, { method: "POST", body })
  },
  /**
   * Calls the `/finalize` endpoint of a realtime speech session.
   *
   * Sends a POST with an empty JSON object as the body; the session id is
   * percent-encoded into the path.
   *
   * @param sessionId Identifier of the session to finalize.
   */
  finalizeRealtimeSpeechSession(sessionId: string): Promise<void> {
    const encodedSessionId = encodeURIComponent(sessionId)
    const body = JSON.stringify({})
    return request(`/api/speech/realtime/sessions/${encodedSessionId}/finalize`, { method: "POST", body })
  },
  /**
   * Issues a DELETE request for a realtime speech session resource.
   *
   * @param sessionId Identifier of the session; percent-encoded into the path.
   */
  closeRealtimeSpeechSession(sessionId: string): Promise<void> {
    const encodedSessionId = encodeURIComponent(sessionId)
    return request(`/api/speech/realtime/sessions/${encodedSessionId}`, { method: "DELETE" })
  },
  transcribeAudio(payload: {
    audioBase64: string
    mimeType: string
@@ -332,21 +361,34 @@ export const serverApi = {
  },
  connectEvents(onEvent: (event: WorkspaceEventPayload) => void, onError?: () => void) {
    sseLogger.info(`Connecting to ${EVENTS_URL}`)
-    const source = new EventSource(EVENTS_URL, { withCredentials: true } as any)
+    return connectEventSource(EVENTS_URL, onEvent, onError)
-    source.onmessage = (event) => {
+  },
-      try {
+  connectRealtimeSpeechEvents(
-        const payload = JSON.parse(event.data) as WorkspaceEventPayload
+    sessionId: string,
-        onEvent(payload)
+    onEvent: (event: SpeechRealtimeEvent) => void,
-      } catch (error) {
+    onError?: () => void,
-        sseLogger.error("Failed to parse event", error)
+  ) {
-      }
+    const url = buildRealtimeSpeechEventsUrl(sessionId)
-    }
+    sseLogger.info(`Connecting to ${url}`)
-    source.onerror = () => {
+    return connectEventSource(url, onEvent, onError)
      sseLogger.warn("EventSource error, closing stream")
      onError?.()
    }
    return source
  },
 }
-export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType }
+function connectEventSource<T>(url: string, onEvent: (event: T) => void, onError?: () => void) {
  // Shared helper: opens an EventSource on `url`, forwards each message
  // (JSON-decoded) to `onEvent`, reports stream errors through `onError`, and
  // returns the EventSource to the caller.
  // The stream is opened with credentials enabled.
  const source = new EventSource(url, { withCredentials: true } as any)
  source.onmessage = (event) => {
    try {
      // The decoded value is cast to T; no runtime validation is performed here.
      const payload = JSON.parse(event.data) as T
      onEvent(payload)
    } catch (error) {
      // Both JSON.parse and the onEvent call sit inside the try, so an exception
      // thrown by the handler is also logged under this "parse" message.
      sseLogger.error("Failed to parse event", error)
    }
  }
  source.onerror = () => {
    // NOTE(review): the message says "closing stream" but this handler does not
    // call source.close(); presumably the onError callback or the caller closes
    // the source — confirm.
    sseLogger.warn("EventSource error, closing stream")
    onError?.()
  }
  return source
 }
 export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType, SpeechRealtimeEvent }
--- a/packages/ui/src/lib/i18n/messages/en/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/en/messaging.ts
@@ -140,8 +140,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "Send failed",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/en/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/en/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
  "settings.speech.realtime.title": "Realtime dictation",
  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
  "settings.speech.realtimeModel.title": "Realtime model",
  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/es/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/es/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "Error al enviar",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/es/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/es/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
  "settings.speech.realtime.title": "Realtime dictation",
  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
  "settings.speech.realtimeModel.title": "Realtime model",
  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/fr/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/fr/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "Échec de l'envoi",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/fr/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/fr/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
  "settings.speech.realtime.title": "Realtime dictation",
  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
  "settings.speech.realtimeModel.title": "Realtime model",
  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/ja/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/ja/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "送信に失敗",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/ja/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/ja/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
  "settings.speech.realtime.title": "Realtime dictation",
  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
  "settings.speech.realtimeModel.title": "Realtime model",
  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/ru/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/ru/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "Не удалось отправить",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/ru/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/ru/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
  "settings.speech.realtime.title": "Realtime dictation",
  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
  "settings.speech.realtimeModel.title": "Realtime model",
  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts
+++ b/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts
@@ -142,8 +142,10 @@ export const messagingMessages = {
  "promptInput.send.errorTitle": "发送失败",
  "promptInput.voiceInput.start.title": "Start voice input",
  "promptInput.voiceInput.stop.title": "Stop recording and transcribe",
  "promptInput.voiceInput.connecting.title": "Connecting microphone",
  "promptInput.voiceInput.transcribing.title": "Transcribing audio",
  "promptInput.voiceInput.error.title": "Voice input failed",
  "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
  "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
  "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
  "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
--- a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts
+++ b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts
@@ -156,13 +156,18 @@ export const settingsMessages = {
  "settings.speech.baseUrl.title": "Base URL",
  "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
  "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
  "settings.speech.realtime.title": "Realtime dictation",
  "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
  "settings.speech.realtimeModel.title": "Realtime model",
  "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
  "settings.speech.realtimeModel.placeholder": "gpt-realtime",
  "settings.speech.sttModel.title": "Transcription model",
  "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
  "settings.speech.ttsModel.title": "Speech model",
  "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
  "settings.speech.ttsVoice.title": "Default voice",
  "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
-  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
+  "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
  "settings.speech.save.action": "Save",
  "settings.speech.save.saving": "Saving...",
  "settings.speech.save.saved": "Saved",
--- a/packages/ui/src/stores/preferences.tsx
+++ b/packages/ui/src/stores/preferences.tsx
@@ -34,6 +34,8 @@ export interface SpeechSettings {
  provider: SpeechProviderPreference
  apiKey?: string
  baseUrl?: string
  useRealtime: boolean
  realtimeModel: string
  sttModel: string
  ttsModel: string
  ttsVoice: string
@@ -136,6 +138,8 @@ const defaultUiSettings: UiSettings = {
 const defaultSpeechSettings: SpeechSettings = {
  provider: "openai-compatible",
  useRealtime: true,
  realtimeModel: "gpt-realtime",
  sttModel: "gpt-4o-mini-transcribe",
  ttsModel: "gpt-4o-mini-tts",
  ttsVoice: "alloy",
@@ -184,6 +188,11 @@ function normalizeSpeechSettings(input?: Partial<SpeechSettings> | null): Speech
    provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider,
    apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined,
    baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined,
    useRealtime: sanitized.useRealtime ?? defaultSpeechSettings.useRealtime,
    realtimeModel:
      typeof sanitized.realtimeModel === "string" && sanitized.realtimeModel.trim()
        ? sanitized.realtimeModel.trim()
        : defaultSpeechSettings.realtimeModel,
    sttModel:
      typeof sanitized.sttModel === "string" && sanitized.sttModel.trim()
        ? sanitized.sttModel.trim()