feat(speech): add prompt voice input (#249)

## Summary

- Add server-backed speech capabilities and transcription endpoints, plus UI settings for speech configuration.
- Add push-to-talk prompt voice input with microphone controls, transcription insertion, and browser capability gating.
- Keep prompt controls aligned by restoring right-side nav placement and moving the mic beside the expand control.
2026-03-25 14:08:11 +00:00
parent a950d47df0
commit 1233121a13
40 changed files with 1545 additions and 27 deletions
--- a/packages/ui/src/components/prompt-input.tsx
+++ b/packages/ui/src/components/prompt-input.tsx
@@ -1,5 +1,5 @@
-import { Suspense, createEffect, createSignal, lazy, on, onCleanup, onMount, Show } from "solid-js"
-import { ArrowBigUp, ArrowBigDown } from "lucide-solid"
+import { Suspense, createEffect, createSignal, lazy, on, onCleanup, Show } from "solid-js"
+import { ArrowBigUp, ArrowBigDown, Loader2, Mic } from "lucide-solid"
 import ExpandButton from "./expand-button"
 import { clearAttachments, removeAttachment } from "../stores/attachments"
 import { resolvePastedPlaceholders } from "../lib/prompt-placeholders"
@@ -18,6 +18,7 @@ import { usePromptState } from "./prompt-input/usePromptState"
 import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
 import { usePromptPicker } from "./prompt-input/usePromptPicker"
 import { usePromptKeyDown } from "./prompt-input/usePromptKeyDown"
+import { usePromptVoiceInput } from "./prompt-input/usePromptVoiceInput"
 const log = getLogger("actions")
 const LazyUnifiedPicker = lazy(() => import("./unified-picker"))

@@ -450,9 +451,45 @@ export default function PromptInput(props: PromptInputProps) {
  })

  const shouldShowOverlay = () => prompt().length === 0
+  const voiceInput = usePromptVoiceInput({
+    prompt,
+    setPrompt,
+    getTextarea: () => textareaRef ?? null,
+    enabled: () => preferences().showPromptVoiceInput,
+    disabled: () => Boolean(props.disabled),
+  })
+  // The mic button is rendered only when the preference is on and voice input is either
+  // usable right now or already in the middle of recording/transcribing.
+  const showVoiceInput = () => {
+    const preferenceEnabled = preferences().showPromptVoiceInput
+    if (!preferenceEnabled) return preferenceEnabled
+    return voiceInput.canUseVoiceInput() || voiceInput.isRecording() || voiceInput.isTranscribing()
+  }

  const instance = () => getActiveInstance()

+  let voiceButtonPressed = false
+
+  // Starts a push-to-talk press from a pointer or keyboard event. Does nothing when a press
+  // is already active, the prompt is disabled, a transcription is running, or voice input
+  // is unavailable.
+  const beginVoicePress = (event?: PointerEvent | KeyboardEvent) => {
+    const blocked =
+      voiceButtonPressed || props.disabled || voiceInput.isTranscribing() || !voiceInput.canUseVoiceInput()
+    if (blocked) return
+    voiceButtonPressed = true
+
+    if (event instanceof PointerEvent) {
+      const { currentTarget, pointerId } = event
+      if (currentTarget instanceof HTMLElement) {
+        // Best effort: capture the pointer on the button; a failure here is ignored.
+        try {
+          currentTarget.setPointerCapture(pointerId)
+        } catch {
+          // no-op
+        }
+      }
+    }
+
+    void voiceInput.startRecording()
+  }
+
+  // Ends the active push-to-talk press and stops the recording; ignored when no press is active.
+  const endVoicePress = () => {
+    if (voiceButtonPressed) {
+      voiceButtonPressed = false
+      voiceInput.stopRecording()
+    }
+  }
+
  return (
    <div class="prompt-input-container">
      <div
@@ -506,10 +543,54 @@ export default function PromptInput(props: PromptInputProps) {
                autocomplete="off"
              />
              <div class="prompt-nav-buttons">
-                <ExpandButton
-                  expandState={expandState}
-                  onToggleExpand={handleExpandToggle}
-                />
+                <div class="prompt-nav-top-row">
+                  <Show when={showVoiceInput()}>
+                    <button
+                      type="button"
+                      class={`prompt-voice-button prompt-nav-voice-button ${voiceInput.isRecording() ? "is-recording" : ""}`}
+                      onPointerDown={(event) => {
+                        event.preventDefault()
+                        beginVoicePress(event)
+                      }}
+                      onPointerUp={(event) => {
+                        event.preventDefault()
+                        endVoicePress()
+                      }}
+                      onPointerCancel={() => endVoicePress()}
+                      onLostPointerCapture={() => endVoicePress()}
+                      onKeyDown={(event) => {
+                        if (event.repeat) return
+                        if (event.key !== " " && event.key !== "Enter") return
+                        event.preventDefault()
+                        beginVoicePress(event)
+                      }}
+                      onKeyUp={(event) => {
+                        if (event.key !== " " && event.key !== "Enter") return
+                        event.preventDefault()
+                        endVoicePress()
+                      }}
+                      onBlur={() => endVoicePress()}
+                      disabled={!voiceInput.isRecording() && (props.disabled || voiceInput.isTranscribing() || !voiceInput.canUseVoiceInput())}
+                      aria-label={voiceInput.buttonTitle()}
+                      title={voiceInput.buttonTitle()}
+                    >
+                      <Show
+                        when={voiceInput.isRecording()}
+                        fallback={
+                          <Show when={voiceInput.isTranscribing()} fallback={<Mic class="h-4 w-4" aria-hidden="true" />}>
+                            <Loader2 class="h-4 w-4 animate-spin" aria-hidden="true" />
+                          </Show>
+                        }
+                      >
+                        <span class="prompt-voice-timer">{formatVoiceTimer(voiceInput.elapsedMs())}</span>
+                      </Show>
+                    </button>
+                  </Show>
+                  <ExpandButton
+                    expandState={expandState}
+                    onToggleExpand={handleExpandToggle}
+                  />
+                </div>
                <Show when={hasHistory()}>
                  <button
                    type="button"
@@ -631,3 +712,10 @@ export default function PromptInput(props: PromptInputProps) {
    </div>
  )
 }
+
+/** Formats an elapsed duration in milliseconds as a zero-padded `MM:SS` string; negative values clamp to `00:00`. */
+function formatVoiceTimer(elapsedMs: number): string {
+  const wholeSeconds = Math.max(0, Math.floor(elapsedMs / 1000))
+  const pad = (value: number) => String(value).padStart(2, "0")
+  return `${pad(Math.floor(wholeSeconds / 60))}:${pad(wholeSeconds % 60)}`
+}
--- a/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts
@@ -0,0 +1,244 @@
+import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
+import { showAlertDialog } from "../../stores/alerts"
+import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
+import { serverApi } from "../../lib/api-client"
+import { useI18n } from "../../lib/i18n"
+
+interface UsePromptVoiceInputOptions { // Accessors and callbacks the voice-input hook needs from the prompt component.
+  prompt: Accessor<string> // Current prompt text; read when a transcript is inserted.
+  setPrompt: (value: string) => void // Receives the full prompt text after a transcript has been inserted.
+  getTextarea: () => HTMLTextAreaElement | null // Textarea whose selection decides the insert position; may be null.
+  enabled: Accessor<boolean> // Feature switch; voice input is reported as unavailable while this is false.
+  disabled: Accessor<boolean> // While true, startRecording() returns without recording.
+}
+
+type VoiceInputState = "idle" | "recording" | "transcribing" // Lifecycle phases of a single voice capture.
+
+/**
+ * Voice input controller for the prompt textarea: records microphone audio, sends it to the
+ * server for transcription, and inserts the returned text at the textarea selection.
+ *
+ * The state moves `idle` -> `recording` -> `transcribing` -> `idle`.
+ *
+ * @param options Accessors and callbacks supplied by the owning prompt component.
+ * @returns Reactive state accessors plus start/stop/toggle/cancel controls.
+ */
+export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) {
+  const { t } = useI18n()
+  const [state, setState] = createSignal<VoiceInputState>("idle")
+  const [elapsedMs, setElapsedMs] = createSignal(0)
+
+  let mediaRecorder: MediaRecorder | null = null
+  let mediaStream: MediaStream | null = null
+  let timerId: number | undefined
+  // When false, the recorder's "stop" handler discards the captured audio instead of transcribing it.
+  let shouldTranscribe = true
+  let recordedChunks: Blob[] = []
+  let recordingStartedAt = 0
+  // True while startRecording() is waiting for getUserMedia() to resolve.
+  let startPending = false
+  // Set when a stop/cancel arrives during that wait, so the start is abandoned once the stream arrives.
+  let abortPendingStart = false
+  // Set by onCleanup; prevents a recording from starting after the owner has been disposed.
+  let disposed = false
+
+  createEffect(() => {
+    void loadSpeechCapabilities()
+  })
+
+  onCleanup(() => {
+    disposed = true
+    cleanupMedia(false)
+  })
+
+  /** True when the browser exposes both MediaRecorder and getUserMedia. */
+  const isSupported = () => {
+    if (typeof window === "undefined") return false
+    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
+  }
+
+  /** True when the feature is enabled, the browser is capable, and the server reports usable speech-to-text. */
+  const canUseVoiceInput = () => {
+    const capabilities = speechCapabilities()
+    return Boolean(
+      options.enabled() &&
+        isSupported() &&
+        capabilities?.available &&
+        capabilities?.configured &&
+        capabilities?.supportsStt,
+    )
+  }
+
+  /** Stops the recording when one is active, otherwise starts one. */
+  async function toggleRecording(): Promise<void> {
+    if (state() === "recording") {
+      stopRecording()
+      return
+    }
+
+    await startRecording()
+  }
+
+  /** Stops the active recording and hands the captured audio over for transcription. */
+  function stopRecording() {
+    if (startPending) {
+      // Released before the microphone was acquired: abandon the pending start instead of
+      // letting a recording begin with nothing left to stop it.
+      abortPendingStart = true
+      return
+    }
+    if (!mediaRecorder || state() !== "recording") return
+    shouldTranscribe = true
+    mediaRecorder.stop()
+    setState("transcribing")
+    stopTimer()
+  }
+
+  /** Stops the active recording and discards the captured audio. */
+  function cancelRecording() {
+    if (startPending) {
+      abortPendingStart = true
+      return
+    }
+    if (!mediaRecorder || state() !== "recording") return
+    shouldTranscribe = false
+    mediaRecorder.stop()
+    // The state is returned to "idle" by finalizeRecording() once the recorder's "stop" event fires.
+    cleanupMedia(false)
+  }
+
+  /** Requests microphone access and begins recording; shows an alert dialog when that fails. */
+  async function startRecording() {
+    if (startPending) {
+      // A new press while the microphone is still being acquired re-arms the pending start.
+      abortPendingStart = false
+      return
+    }
+    if (disposed || !canUseVoiceInput() || options.disabled() || state() === "transcribing" || state() === "recording") return
+
+    // Defensive: canUseVoiceInput() already requires isSupported().
+    if (!isSupported()) {
+      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
+        title: t("promptInput.voiceInput.error.title"),
+        variant: "error",
+      })
+      return
+    }
+
+    startPending = true
+    abortPendingStart = false
+
+    try {
+      recordedChunks = []
+      shouldTranscribe = true
+      const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+
+      if (abortPendingStart || disposed) {
+        // The press ended (or the owner was disposed) while waiting: release the microphone.
+        stopTracks(stream)
+        return
+      }
+
+      mediaStream = stream
+      const recorder = createRecorder(stream)
+      mediaRecorder = recorder
+
+      recorder.addEventListener("dataavailable", (event) => {
+        if (event.data.size > 0) {
+          recordedChunks.push(event.data)
+        }
+      })
+
+      recorder.addEventListener("stop", () => {
+        void finalizeRecording()
+      })
+
+      // Start the recorder before flipping the state so a throwing start() cannot leave the
+      // hook reporting "recording" with no recorder behind it.
+      recorder.start()
+      recordingStartedAt = Date.now()
+      setElapsedMs(0)
+      setState("recording")
+      startTimer()
+    } catch (error) {
+      cleanupMedia()
+      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
+        title: t("promptInput.voiceInput.error.title"),
+        detail: error instanceof Error ? error.message : String(error),
+        variant: "error",
+      })
+    } finally {
+      startPending = false
+    }
+  }
+
+  /** Runs on the recorder's "stop" event: transcribes the captured audio (unless discarded) and resets to idle. */
+  async function finalizeRecording() {
+    const recorder = mediaRecorder
+    const stream = mediaStream
+    mediaRecorder = null
+    mediaStream = null
+
+    if (!shouldTranscribe || recordedChunks.length === 0) {
+      recordedChunks = []
+      stopTracks(stream)
+      setState("idle")
+      setElapsedMs(0)
+      return
+    }
+
+    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
+
+    try {
+      const audioBlob = new Blob(recordedChunks, { type: mimeType })
+      const transcription = await serverApi.transcribeAudio({
+        audioBase64: await blobToBase64(audioBlob),
+        mimeType,
+      })
+      if (transcription.text.trim()) {
+        insertTranscript(transcription.text.trim())
+      }
+    } catch (error) {
+      showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
+        title: t("promptInput.voiceInput.error.title"),
+        detail: error instanceof Error ? error.message : String(error),
+        variant: "error",
+      })
+    } finally {
+      recordedChunks = []
+      stopTracks(stream)
+      setState("idle")
+      setElapsedMs(0)
+    }
+  }
+
+  /**
+   * Replaces the textarea selection (or appends when there is no textarea) with `text`,
+   * padding with a space on either side when the neighbouring character is not whitespace.
+   */
+  function insertTranscript(text: string) {
+    const current = options.prompt()
+    const textarea = options.getTextarea()
+    const start = textarea ? textarea.selectionStart : current.length
+    const end = textarea ? textarea.selectionEnd : current.length
+    const before = current.slice(0, start)
+    const after = current.slice(end)
+    const prefix = before.length > 0 && !/\s$/.test(before) ? " " : ""
+    const suffix = after.length > 0 && !/^\s/.test(after) ? " " : ""
+    const nextValue = `${before}${prefix}${text}${suffix}${after}`
+    const cursor = before.length + prefix.length + text.length
+
+    options.setPrompt(nextValue)
+    if (textarea) {
+      // Deferred so the caret is placed after the new prompt value has been applied.
+      setTimeout(() => {
+        textarea.focus()
+        textarea.setSelectionRange(cursor, cursor)
+      }, 0)
+    }
+  }
+
+  /** Stops the timer, recorder, and microphone tracks; optionally resets the reactive state. */
+  function cleanupMedia(resetState = true) {
+    stopTimer()
+    if (mediaRecorder && mediaRecorder.state !== "inactive") {
+      // stop() still delivers "dataavailable" and "stop" afterwards; make sure the stop
+      // handler discards that audio rather than transcribing and inserting it.
+      shouldTranscribe = false
+      mediaRecorder.stop()
+    }
+    mediaRecorder = null
+    stopTracks(mediaStream)
+    mediaStream = null
+    recordedChunks = []
+    if (resetState) {
+      setState("idle")
+      setElapsedMs(0)
+    }
+  }
+
+  /** Starts the interval that refreshes `elapsedMs` every 250 ms. */
+  function startTimer() {
+    stopTimer()
+    timerId = window.setInterval(() => {
+      setElapsedMs(Date.now() - recordingStartedAt)
+    }, 250)
+  }
+
+  /** Clears the elapsed-time interval if one is running. */
+  function stopTimer() {
+    if (timerId !== undefined) {
+      window.clearInterval(timerId)
+      timerId = undefined
+    }
+  }
+
+  return {
+    state,
+    elapsedMs,
+    canUseVoiceInput,
+    startRecording,
+    stopRecording,
+    toggleRecording,
+    cancelRecording,
+    isRecording: () => state() === "recording",
+    isTranscribing: () => state() === "transcribing",
+    buttonTitle: () => {
+      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
+      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
+      return t("promptInput.voiceInput.start.title")
+    },
+  }
+}
+
+/**
+ * Creates a MediaRecorder for `stream`, using the first candidate MIME type the browser
+ * reports as supported and otherwise the browser's default recording format.
+ */
+function createRecorder(stream: MediaStream): MediaRecorder {
+  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
+  // Without isTypeSupported() the candidates cannot be probed; let the browser pick its
+  // default instead of guessing a type the constructor may reject.
+  const canProbe = typeof MediaRecorder.isTypeSupported === "function"
+  const supported = canProbe ? candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) : undefined
+  return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
+}
+
+/** Stops every track of `stream`; a null stream is ignored. */
+function stopTracks(stream: MediaStream | null) {
+  if (!stream) return
+  for (const track of stream.getTracks()) {
+    track.stop()
+  }
+}
+
+/** Encodes the blob's bytes as a base64 string (no data-URL prefix). */
+async function blobToBase64(blob: Blob): Promise<string> {
+  const bytes = new Uint8Array(await blob.arrayBuffer())
+  // Convert in slices so each String.fromCharCode call receives a bounded number of arguments.
+  const sliceSize = 0x8000
+  let binary = ""
+  for (let offset = 0; offset < bytes.length; offset += sliceSize) {
+    binary += String.fromCharCode(...bytes.subarray(offset, offset + sliceSize))
+  }
+  return btoa(binary)
+}
--- a/packages/ui/src/components/settings-screen.tsx
+++ b/packages/ui/src/components/settings-screen.tsx
@@ -1,5 +1,5 @@
 import { Dialog } from "@kobalte/core/dialog"
-import { Settings, Bell, MonitorUp, Paintbrush, Terminal, X } from "lucide-solid"
+import { Settings, Bell, MonitorUp, Paintbrush, Terminal, Volume2, X } from "lucide-solid"
 import { createMemo, For, type Component } from "solid-js"
 import { useI18n } from "../lib/i18n"
 import {
@@ -13,6 +13,7 @@ import { AppearanceSettingsSection } from "./settings/appearance-settings-sectio
 import { NotificationsSettingsSection } from "./settings/notifications-settings-section"
 import { OpenCodeSettingsSection } from "./settings/opencode-settings-section"
 import { RemoteAccessSettingsSection } from "./settings/remote-access-settings-section"
+import { SpeechSettingsSection } from "./settings/speech-settings-section"

 export const SettingsScreen: Component = () => {
  const { t } = useI18n()
@@ -21,6 +22,7 @@ export const SettingsScreen: Component = () => {
    { id: "appearance" as SettingsSectionId, icon: Paintbrush, label: t("settings.nav.appearance") },
    { id: "notifications" as SettingsSectionId, icon: Bell, label: t("settings.nav.notifications") },
    { id: "remote" as SettingsSectionId, icon: MonitorUp, label: t("settings.nav.remote") },
+    { id: "speech" as SettingsSectionId, icon: Volume2, label: t("settings.nav.speech") },
    { id: "opencode" as SettingsSectionId, icon: Terminal, label: t("settings.nav.opencode") },
  ])

@@ -30,6 +32,8 @@ export const SettingsScreen: Component = () => {
        return <NotificationsSettingsSection />
      case "remote":
        return <RemoteAccessSettingsSection />
+      case "speech":
+        return <SpeechSettingsSection />
      case "opencode":
        return <OpenCodeSettingsSection />
      case "appearance":
--- a/packages/ui/src/components/settings/appearance-settings-section.tsx
+++ b/packages/ui/src/components/settings/appearance-settings-section.tsx
@@ -24,6 +24,7 @@ export const AppearanceSettingsSection: Component = () => {
    toggleUsageMetrics,
    toggleAutoCleanupBlankSessions,
    togglePromptSubmitOnEnter,
+    toggleShowPromptVoiceInput,
    setDiffViewMode,
    setToolOutputExpansion,
    setDiagnosticsExpansion,
@@ -38,10 +39,11 @@ export const AppearanceSettingsSection: Component = () => {
      toggleShowThinkingBlocks,
      toggleKeyboardShortcutHints,
      toggleShowTimelineTools,
-      toggleUsageMetrics,
-      toggleAutoCleanupBlankSessions,
-      togglePromptSubmitOnEnter,
-      setDiffViewMode,
+        toggleUsageMetrics,
+        toggleAutoCleanupBlankSessions,
+        togglePromptSubmitOnEnter,
+        toggleShowPromptVoiceInput,
+        setDiffViewMode,
      setToolOutputExpansion,
      setDiagnosticsExpansion,
      setThinkingBlocksExpansion,
--- a/packages/ui/src/components/settings/speech-settings-card.tsx
+++ b/packages/ui/src/components/settings/speech-settings-card.tsx
@@ -0,0 +1,252 @@
+import { Show, createEffect, createMemo, createSignal, type Component } from "solid-js"
+import { Mic, Volume2 } from "lucide-solid"
+import { useConfig, type SpeechSettings } from "../../stores/preferences"
+import { useI18n } from "../../lib/i18n"
+import { loadSpeechCapabilities, speechCapabilities, speechCapabilitiesError, speechCapabilitiesLoading } from "../../stores/speech"
+import { getLogger } from "../../lib/logger"
+
+const log = getLogger("actions")
+
+type DraftFields = { // Editable form values backing the speech settings card.
+  apiKey: string // Newly typed key only; starts empty and is emptied again after a successful save.
+  baseUrl: string // Empty string when no base URL is stored.
+  sttModel: string
+  ttsModel: string
+  ttsVoice: string
+}
+
+/** Builds draft form values from stored speech settings; the API key draft always starts empty. */
+function createDraftFields(speech: SpeechSettings): DraftFields {
+  const { baseUrl, sttModel, ttsModel, ttsVoice } = speech
+  return { apiKey: "", baseUrl: baseUrl ?? "", sttModel, ttsModel, ttsVoice }
+}
+
+/** Returns true when every draft field holds the same value in `a` and `b`. */
+function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
+  const fields: (keyof DraftFields)[] = ["apiKey", "baseUrl", "sttModel", "ttsModel", "ttsVoice"]
+  return fields.every((field) => a[field] === b[field])
+}
+
+/**
+ * Settings card for the server-side speech provider. Edits are held in a local draft and
+ * only sent to the server when the save button is pressed; the stored API key is never
+ * shown, it can only be replaced or cleared.
+ */
+export const SpeechSettingsCard: Component = () => {
+  const { t } = useI18n()
+  const { serverSettings, updateSpeechSettings } = useConfig()
+  const initialDrafts = createDraftFields(serverSettings().speech)
+  const [isSaving, setIsSaving] = createSignal(false)
+  const [saveStatus, setSaveStatus] = createSignal<"idle" | "saved" | "error">("saved")
+  const [drafts, setDrafts] = createSignal<DraftFields>(initialDrafts)
+  const [apiKeyTouched, setApiKeyTouched] = createSignal(false)
+  const [clearStoredApiKey, setClearStoredApiKey] = createSignal(false)
+
+  // Declared before the sync effect below, which reads isDirty(); a const memo must exist
+  // before anything can call it.
+  const apiKeyDirty = createMemo(() => clearStoredApiKey() || drafts().apiKey.trim().length > 0)
+
+  // True when the draft differs from the stored settings or an API key change is pending.
+  const isDirty = createMemo(() => {
+    const speech = serverSettings().speech
+    const current = drafts()
+    return (
+      apiKeyDirty() ||
+      (current.baseUrl || "") !== (speech.baseUrl || "") ||
+      current.sttModel !== speech.sttModel ||
+      current.ttsModel !== speech.ttsModel ||
+      current.ttsVoice !== speech.ttsVoice
+    )
+  })
+
+  // Re-sync the draft from the stored settings whenever there are no local edits and no
+  // save in flight, and drop any leftover API-key flags.
+  createEffect(() => {
+    const speech = serverSettings().speech
+    const nextDrafts = createDraftFields(speech)
+    if (!isSaving() && !isDirty()) {
+      if (!isDraftEqual(drafts(), nextDrafts)) {
+        setDrafts(nextDrafts)
+      }
+      if (apiKeyTouched()) {
+        setApiKeyTouched(false)
+      }
+      if (clearStoredApiKey()) {
+        setClearStoredApiKey(false)
+      }
+    }
+  })
+
+  createEffect(() => {
+    void loadSpeechCapabilities()
+  })
+
+  /** Status text for the server's speech capabilities: loading, error, configured, or missing. */
+  const capabilityLabel = () => {
+    if (speechCapabilitiesLoading()) return t("settings.speech.status.loading")
+    if (speechCapabilitiesError()) return t("settings.speech.status.error")
+    return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
+  }
+
+  /** Applies one field edit to the draft; typing an API key withdraws a pending "clear stored key" request. */
+  const updateDraft = (key: keyof DraftFields, value: string) => {
+    setSaveStatus("idle")
+    if (key === "apiKey") {
+      setApiKeyTouched(true)
+      setClearStoredApiKey(false)
+    }
+    setDrafts((current) => ({ ...current, [key]: value }))
+  }
+
+  /** Status text for the save state: saving, saved, error, or unsaved. */
+  const saveStatusLabel = () => {
+    if (isSaving()) return t("settings.speech.save.saving")
+    if (saveStatus() === "saved") return t("settings.speech.save.saved")
+    if (saveStatus() === "error") return t("settings.speech.save.error")
+    return t("settings.speech.save.unsaved")
+  }
+
+  /** Sends the draft to the server, refreshes the capabilities, and resets the draft on success. */
+  async function handleSave() {
+    if (!isDirty() || isSaving()) return
+    const current = drafts()
+    setIsSaving(true)
+    setSaveStatus("idle")
+    try {
+      const trimmedApiKey = current.apiKey.trim()
+      // apiKey is sent as null to clear the stored key, as a string to replace it, and is
+      // omitted when it should stay untouched.
+      // NOTE(review): an emptied text field is sent as undefined; confirm updateSpeechSettings
+      // treats that as "clear the value" rather than "leave unchanged".
+      await updateSpeechSettings({
+        ...(clearStoredApiKey() ? { apiKey: null } : trimmedApiKey ? { apiKey: trimmedApiKey } : {}),
+        baseUrl: current.baseUrl.trim() || undefined,
+        sttModel: current.sttModel.trim() || undefined,
+        ttsModel: current.ttsModel.trim() || undefined,
+        ttsVoice: current.ttsVoice.trim() || undefined,
+      })
+      await loadSpeechCapabilities(true)
+      setDrafts({
+        apiKey: "",
+        baseUrl: current.baseUrl.trim(),
+        sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
+        ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
+        ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
+      })
+      setApiKeyTouched(false)
+      setClearStoredApiKey(false)
+      setSaveStatus("saved")
+    } catch (error) {
+      log.error("Failed to save speech settings", error)
+      setSaveStatus("error")
+    } finally {
+      setIsSaving(false)
+    }
+  }
+
+  return (
+    <div class="settings-card">
+      <div class="settings-card-header">
+        <div class="settings-card-heading-with-icon">
+          <Volume2 class="settings-card-heading-icon" />
+          <div>
+            <h3 class="settings-card-title">{t("settings.speech.title")}</h3>
+            <p class="settings-card-subtitle">{t("settings.speech.subtitle")}</p>
+          </div>
+        </div>
+        <span class="settings-scope-badge settings-scope-badge-server">{t("settings.scope.server")}</span>
+      </div>
+
+      <div class="settings-stack">
+        <div class="settings-toggle-row settings-toggle-row-compact">
+          <div>
+            <div class="settings-toggle-title">{t("settings.speech.provider.title")}</div>
+            <div class="settings-toggle-caption">{t("settings.speech.provider.subtitle")}</div>
+          </div>
+          <div class="settings-toolbar-inline">
+            <span class="settings-inline-note">{t("settings.speech.provider.openaiCompatible")}</span>
+            <span class="settings-inline-note">{capabilityLabel()}</span>
+            <span class="settings-inline-note">{saveStatusLabel()}</span>
+            <button
+              type="button"
+              class="selector-button selector-button-primary w-auto whitespace-nowrap"
+              onClick={() => void handleSave()}
+              disabled={!isDirty() || isSaving()}
+            >
+              {isSaving() ? t("settings.speech.save.saving") : t("settings.speech.save.action")}
+            </button>
+          </div>
+        </div>
+
+        <Field
+          label={t("settings.speech.apiKey.title")}
+          caption={t("settings.speech.apiKey.subtitle")}
+          value={drafts().apiKey}
+          onInput={(value) => updateDraft("apiKey", value)}
+          type="password"
+          placeholder={serverSettings().speech.hasApiKey ? t("settings.speech.apiKey.placeholder") : undefined}
+        />
+        <Show when={serverSettings().speech.hasApiKey && !apiKeyTouched() && drafts().apiKey.length === 0}>
+          <div class="settings-inline-note">
+            {clearStoredApiKey() ? t("settings.speech.apiKey.clearPending") : t("settings.speech.apiKey.storedNote")}{" "}
+            <Show when={!clearStoredApiKey()}>
+              <button
+                type="button"
+                class="selector-button selector-button-secondary w-auto whitespace-nowrap"
+                onClick={() => {
+                  setClearStoredApiKey(true)
+                  setSaveStatus("idle")
+                }}
+              >
+                {t("settings.speech.apiKey.clearAction")}
+              </button>
+            </Show>
+          </div>
+        </Show>
+        <Field
+          label={t("settings.speech.baseUrl.title")}
+          caption={t("settings.speech.baseUrl.subtitle")}
+          value={drafts().baseUrl}
+          onInput={(value) => updateDraft("baseUrl", value)}
+          placeholder={t("settings.speech.baseUrl.placeholder")}
+        />
+        <Field
+          label={t("settings.speech.sttModel.title")}
+          caption={t("settings.speech.sttModel.subtitle")}
+          value={drafts().sttModel}
+          onInput={(value) => updateDraft("sttModel", value)}
+        />
+        <Field
+          label={t("settings.speech.ttsModel.title")}
+          caption={t("settings.speech.ttsModel.subtitle")}
+          value={drafts().ttsModel}
+          onInput={(value) => updateDraft("ttsModel", value)}
+        />
+        <Field
+          label={t("settings.speech.ttsVoice.title")}
+          caption={t("settings.speech.ttsVoice.subtitle")}
+          value={drafts().ttsVoice}
+          onInput={(value) => updateDraft("ttsVoice", value)}
+          icon={<Mic class="w-3.5 h-3.5 icon-muted flex-shrink-0" />}
+        />
+
+        <div class="settings-inline-note">{t("settings.speech.help")}</div>
+      </div>
+    </div>
+  )
+}
+
+type FieldProps = {
+  label: string
+  caption: string
+  value: string
+  type?: string
+  placeholder?: string
+  onInput: (value: string) => void
+  icon?: any
+}
+
+/** Labelled single-line input row: title and caption on the left, optional icon and input on the right. */
+const Field: Component<FieldProps> = (props) => (
+  <div class="settings-toggle-row settings-toggle-row-compact">
+    <div>
+      <div class="settings-toggle-title">{props.label}</div>
+      <div class="settings-toggle-caption">{props.caption}</div>
+    </div>
+    <div class="flex items-center gap-2 min-w-[18rem] max-w-[24rem] w-full">
+      {props.icon}
+      <input
+        type={props.type ?? "text"}
+        value={props.value}
+        onInput={(event) => props.onInput(event.currentTarget.value)}
+        class="selector-input w-full"
+        placeholder={props.placeholder}
+      />
+    </div>
+  </div>
+)
+
+export default SpeechSettingsCard
--- a/packages/ui/src/components/settings/speech-settings-section.tsx
+++ b/packages/ui/src/components/settings/speech-settings-section.tsx
@@ -0,0 +1,10 @@
+import type { Component } from "solid-js"
+import SpeechSettingsCard from "./speech-settings-card"
+
+/** Settings section that wraps the speech settings card in the standard section stack. */
+export const SpeechSettingsSection: Component = () => (
+  <div class="settings-section-stack">
+    <SpeechSettingsCard />
+  </div>
+)