feat(speech): add prompt voice input groundwork

2026-03-12 22:04:57 +00:00
parent a950d47df0
commit f3a1ccd8b8
34 changed files with 1285 additions and 10 deletions
--- a/packages/ui/src/components/prompt-input.tsx
+++ b/packages/ui/src/components/prompt-input.tsx
@@ -1,5 +1,5 @@
-import { Suspense, createEffect, createSignal, lazy, on, onCleanup, onMount, Show } from "solid-js"
-import { ArrowBigUp, ArrowBigDown } from "lucide-solid"
+import { Suspense, createEffect, createSignal, lazy, on, onCleanup, Show } from "solid-js"
+import { ArrowBigUp, ArrowBigDown, Loader2, Mic, Square } from "lucide-solid"
 import ExpandButton from "./expand-button"
 import { clearAttachments, removeAttachment } from "../stores/attachments"
 import { resolvePastedPlaceholders } from "../lib/prompt-placeholders"
@@ -18,6 +18,7 @@ import { usePromptState } from "./prompt-input/usePromptState"
 import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
 import { usePromptPicker } from "./prompt-input/usePromptPicker"
 import { usePromptKeyDown } from "./prompt-input/usePromptKeyDown"
+import { usePromptVoiceInput } from "./prompt-input/usePromptVoiceInput"
 const log = getLogger("actions")
 const LazyUnifiedPicker = lazy(() => import("./unified-picker"))

@@ -450,6 +451,16 @@ export default function PromptInput(props: PromptInputProps) {
  })

  const shouldShowOverlay = () => prompt().length === 0
+  const voiceInput = usePromptVoiceInput({
+    prompt,
+    setPrompt,
+    getTextarea: () => textareaRef ?? null,
+    enabled: () => preferences().showPromptVoiceInput,
+    disabled: () => Boolean(props.disabled),
+  })
+  const showVoiceInput = () =>
+    preferences().showPromptVoiceInput &&
+    (voiceInput.canUseVoiceInput() || voiceInput.isRecording() || voiceInput.isTranscribing())

  const instance = () => getActiveInstance()

@@ -597,6 +608,30 @@ export default function PromptInput(props: PromptInputProps) {
        </div>

        <div class="prompt-input-actions">
+          <Show when={showVoiceInput()}>
+            <button
+              type="button"
+              class={`prompt-voice-button ${voiceInput.isRecording() ? "is-recording" : ""}`}
+              onClick={() => void voiceInput.toggleRecording()}
+              disabled={!voiceInput.isRecording() && (props.disabled || voiceInput.isTranscribing() || !voiceInput.canUseVoiceInput())}
+              aria-label={voiceInput.buttonTitle()}
+              title={voiceInput.buttonTitle()}
+            >
+              <Show
+                when={voiceInput.isTranscribing()}
+                fallback={
+                  <Show when={voiceInput.isRecording()} fallback={<Mic class="h-4 w-4" aria-hidden="true" />}>
+                    <Square class="h-4 w-4" aria-hidden="true" />
+                  </Show>
+                }
+              >
+                <Loader2 class="h-4 w-4 animate-spin" aria-hidden="true" />
+              </Show>
+            </button>
+            <Show when={voiceInput.isRecording()}>
+              <span class="prompt-voice-timer">{formatVoiceTimer(voiceInput.elapsedMs())}</span>
+            </Show>
+          </Show>
          <button
            type="button"
            class="stop-button"
@@ -631,3 +666,10 @@ export default function PromptInput(props: PromptInputProps) {
    </div>
  )
 }
+
+/**
+ * Formats a recording duration as a zero-padded `MM:SS` string.
+ *
+ * Negative durations clamp to `00:00`. Minutes are not wrapped at 60, so a
+ * one-hour duration renders as `60:00`.
+ *
+ * @param elapsedMs - Elapsed time in milliseconds.
+ * @returns The duration formatted as `MM:SS`.
+ */
+function formatVoiceTimer(elapsedMs: number): string {
+  const wholeSeconds = Math.max(0, Math.floor(elapsedMs / 1000))
+  const twoDigits = (value: number) => value.toString().padStart(2, "0")
+  return [Math.floor(wholeSeconds / 60), wholeSeconds % 60].map(twoDigits).join(":")
+}
--- a/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts
+++ b/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts
@@ -0,0 +1,242 @@
+import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
+import { showAlertDialog } from "../../stores/alerts"
+import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
+import { serverApi } from "../../lib/api-client"
+import { useI18n } from "../../lib/i18n"
+
+/** Dependencies the prompt component passes to the voice-input hook. */
+interface UsePromptVoiceInputOptions {
+  /** Reads the current prompt text. */
+  prompt: Accessor<string>
+  /** Replaces the prompt text; called with the full new value once a transcript has been inserted. */
+  setPrompt: (value: string) => void
+  /** Returns the prompt textarea, or null when there is none; its selection decides where the transcript goes. */
+  getTextarea: () => HTMLTextAreaElement | null
+  /** Feature toggle; voice input is never reported as usable while this returns false. */
+  enabled: Accessor<boolean>
+  /** While this returns true a new recording cannot be started. */
+  disabled: Accessor<boolean>
+}
+
+/** Phase of a voice capture: "idle" -> "recording" -> "transcribing" -> "idle". */
+type VoiceInputState = "idle" | "recording" | "transcribing"
+
+/**
+ * Solid hook that records microphone audio, sends it to the server for
+ * transcription and inserts the resulting text into the prompt at the
+ * textarea's current selection.
+ *
+ * Must be called while a Solid owner is active (it registers an effect and a
+ * cleanup). On disposal any active capture is abandoned: the microphone is
+ * released and no transcript or error dialog is produced afterwards.
+ *
+ * @param options - Prompt accessors and feature/disabled flags.
+ * @returns Reactive accessors (`state`, `elapsedMs`, `isRecording`,
+ *   `isTranscribing`, `canUseVoiceInput`, `buttonTitle`) plus the
+ *   `toggleRecording` and `cancelRecording` actions.
+ */
+export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) {
+  const { t } = useI18n()
+  const [state, setState] = createSignal<VoiceInputState>("idle")
+  const [elapsedMs, setElapsedMs] = createSignal(0)
+
+  let mediaRecorder: MediaRecorder | null = null
+  let mediaStream: MediaStream | null = null
+  let timerId: number | undefined
+  let shouldTranscribe = true
+  let recordedChunks: Blob[] = []
+  let recordingStartedAt = 0
+  // True while getUserMedia is pending. state() is still "idle" during that
+  // wait, so this flag is what stops a second click from opening (and leaking)
+  // a second microphone stream.
+  let isStarting = false
+  // Set once the owning component is disposed; async continuations check it
+  // before touching the prompt or opening dialogs.
+  let isDisposed = false
+
+  createEffect(() => {
+    void loadSpeechCapabilities()
+  })
+
+  onCleanup(() => {
+    isDisposed = true
+    cleanupMedia(false)
+  })
+
+  /** True when the browser exposes both MediaRecorder and getUserMedia. */
+  const isSupported = () => {
+    if (typeof window === "undefined") return false
+    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
+  }
+
+  /** True when the feature is enabled, the browser can record, and the server reports usable speech-to-text. */
+  const canUseVoiceInput = () => {
+    const capabilities = speechCapabilities()
+    return Boolean(
+      options.enabled() &&
+        isSupported() &&
+        capabilities?.available &&
+        capabilities?.configured &&
+        capabilities?.supportsStt,
+    )
+  }
+
+  /**
+   * Stops the active recording (which triggers transcription) or, when idle,
+   * starts a new one. Does nothing while a start is pending, while
+   * transcribing, or when voice input is unavailable or disabled.
+   */
+  async function toggleRecording(): Promise<void> {
+    if (state() === "recording") {
+      stopRecording()
+      return
+    }
+
+    if (isStarting || !canUseVoiceInput() || options.disabled() || state() === "transcribing") return
+
+    isStarting = true
+    try {
+      await startRecording()
+    } catch (error) {
+      // Reset the state as well: a failure after the UI switched to
+      // "recording" would otherwise leave it stuck there with no recorder.
+      cleanupMedia()
+      if (isDisposed) return
+      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
+        title: t("promptInput.voiceInput.error.title"),
+        detail: error instanceof Error ? error.message : String(error),
+        variant: "error",
+      })
+    } finally {
+      isStarting = false
+    }
+  }
+
+  /** Ends the recording and moves to "transcribing"; the recorder's stop event does the rest. */
+  function stopRecording() {
+    if (!mediaRecorder || state() !== "recording") return
+    shouldTranscribe = true
+    mediaRecorder.stop()
+    setState("transcribing")
+    stopTimer()
+  }
+
+  /** Ends the recording and discards the audio; the recorder's stop event returns the state to "idle". */
+  function cancelRecording() {
+    if (!mediaRecorder || state() !== "recording") return
+    shouldTranscribe = false
+    mediaRecorder.stop()
+    cleanupMedia(false)
+  }
+
+  /**
+   * Requests the microphone and starts capturing. Rejects when permission is
+   * denied or the recorder cannot be created or started; the caller reports that.
+   */
+  async function startRecording() {
+    if (!isSupported()) {
+      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
+        title: t("promptInput.voiceInput.error.title"),
+        variant: "error",
+      })
+      return
+    }
+
+    recordedChunks = []
+    shouldTranscribe = true
+    const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+    if (isDisposed) {
+      // The component went away while the permission prompt was open; release
+      // the microphone instead of holding a stream nobody will stop.
+      stopTracks(stream)
+      return
+    }
+    mediaStream = stream
+    const recorder = createRecorder(stream)
+    mediaRecorder = recorder
+
+    recorder.addEventListener("dataavailable", (event) => {
+      if (event.data.size > 0) {
+        recordedChunks.push(event.data)
+      }
+    })
+
+    recorder.addEventListener("stop", () => {
+      void finalizeRecording()
+    })
+
+    // Start capturing before switching the UI so a throwing start() never
+    // leaves the button showing a recording that does not exist.
+    recorder.start()
+    recordingStartedAt = Date.now()
+    setElapsedMs(0)
+    setState("recording")
+    startTimer()
+  }
+
+  /**
+   * Runs on the recorder's stop event: either discards the capture or uploads
+   * it for transcription, then releases the stream and returns to "idle".
+   */
+  async function finalizeRecording() {
+    const recorder = mediaRecorder
+    const stream = mediaStream
+    mediaRecorder = null
+    mediaStream = null
+
+    if (!shouldTranscribe || recordedChunks.length === 0) {
+      recordedChunks = []
+      stopTracks(stream)
+      setState("idle")
+      setElapsedMs(0)
+      return
+    }
+
+    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
+
+    try {
+      const audioBlob = new Blob(recordedChunks, { type: mimeType })
+      const transcription = await serverApi.transcribeAudio({
+        audioBase64: await blobToBase64(audioBlob),
+        mimeType,
+      })
+      const text = transcription.text.trim()
+      // Skip the insert when the component was disposed during the request.
+      if (text && !isDisposed) {
+        insertTranscript(text)
+      }
+    } catch (error) {
+      if (!isDisposed) {
+        showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
+          title: t("promptInput.voiceInput.error.title"),
+          detail: error instanceof Error ? error.message : String(error),
+          variant: "error",
+        })
+      }
+    } finally {
+      recordedChunks = []
+      stopTracks(stream)
+      setState("idle")
+      setElapsedMs(0)
+    }
+  }
+
+  /**
+   * Replaces the textarea selection (or appends when there is no textarea)
+   * with `text`, padding with a space on either side when the neighbouring
+   * character is not whitespace, then moves the caret to the end of the
+   * inserted text.
+   */
+  function insertTranscript(text: string) {
+    const current = options.prompt()
+    const textarea = options.getTextarea()
+    const start = textarea ? textarea.selectionStart : current.length
+    const end = textarea ? textarea.selectionEnd : current.length
+    const before = current.slice(0, start)
+    const after = current.slice(end)
+    const prefix = before.length > 0 && !/\s$/.test(before) ? " " : ""
+    const suffix = after.length > 0 && !/^\s/.test(after) ? " " : ""
+    const nextValue = `${before}${prefix}${text}${suffix}${after}`
+    const cursor = before.length + prefix.length + text.length
+
+    options.setPrompt(nextValue)
+    if (textarea) {
+      // Deferred so the selection is applied after the new value has been written.
+      setTimeout(() => {
+        textarea.focus()
+        textarea.setSelectionRange(cursor, cursor)
+      }, 0)
+    }
+  }
+
+  /**
+   * Abandons the current capture: stops the timer, the recorder and the
+   * stream's tracks, and drops buffered audio.
+   *
+   * @param resetState - When true (default) also returns to "idle" and zeroes the timer.
+   */
+  function cleanupMedia(resetState = true) {
+    stopTimer()
+    // Everything torn down through here is being abandoned. Without this, the
+    // stop event queued by recorder.stop() below would still upload the audio.
+    shouldTranscribe = false
+    if (mediaRecorder && mediaRecorder.state !== "inactive") {
+      mediaRecorder.stop()
+    }
+    mediaRecorder = null
+    stopTracks(mediaStream)
+    mediaStream = null
+    recordedChunks = []
+    if (resetState) {
+      setState("idle")
+      setElapsedMs(0)
+    }
+  }
+
+  /** Starts (or restarts) the interval that refreshes `elapsedMs` every 250 ms. */
+  function startTimer() {
+    stopTimer()
+    timerId = window.setInterval(() => {
+      setElapsedMs(Date.now() - recordingStartedAt)
+    }, 250)
+  }
+
+  /** Clears the elapsed-time interval if one is running. */
+  function stopTimer() {
+    if (timerId !== undefined) {
+      window.clearInterval(timerId)
+      timerId = undefined
+    }
+  }
+
+  return {
+    state,
+    elapsedMs,
+    canUseVoiceInput,
+    toggleRecording,
+    cancelRecording,
+    isRecording: () => state() === "recording",
+    isTranscribing: () => state() === "transcribing",
+    buttonTitle: () => {
+      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
+      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
+      return t("promptInput.voiceInput.start.title")
+    },
+  }
+}
+
+/**
+ * Creates a MediaRecorder for `stream`, using the first MIME type in the
+ * candidate list that the browser confirms it can record.
+ *
+ * When no candidate is confirmed, or the browser offers no
+ * `MediaRecorder.isTypeSupported` to ask, the recorder is created without a
+ * `mimeType` so the browser picks its own default.
+ *
+ * @param stream - The audio stream to record.
+ * @returns A recorder bound to `stream`; it has not been started.
+ */
+function createRecorder(stream: MediaStream): MediaRecorder {
+  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
+  // Only force a mimeType that was positively confirmed: passing an unverified
+  // type to the constructor can make it throw, while omitting it cannot.
+  const canProbe = typeof MediaRecorder.isTypeSupported === "function"
+  const supported = canProbe ? candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) : undefined
+  return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
+}
+
+/** Stops every track of `stream`; does nothing when `stream` is null. */
+function stopTracks(stream: MediaStream | null) {
+  if (stream == null) return
+  for (const track of stream.getTracks()) {
+    track.stop()
+  }
+}
+
+/**
+ * Encodes the bytes of `blob` as a base64 string (no data-URL prefix).
+ *
+ * @param blob - The binary payload to encode.
+ * @returns The base64 text of the blob's contents.
+ */
+async function blobToBase64(blob: Blob): Promise<string> {
+  const bytes = new Uint8Array(await blob.arrayBuffer())
+  // btoa needs a "binary string" (one char per byte). Build it in slices so
+  // fromCharCode is never handed more arguments than the engine accepts.
+  const sliceSize = 0x8000
+  let binary = ""
+  for (let offset = 0; offset < bytes.length; offset += sliceSize) {
+    binary += String.fromCharCode(...bytes.subarray(offset, offset + sliceSize))
+  }
+  return btoa(binary)
+}
--- a/packages/ui/src/components/settings-screen.tsx
+++ b/packages/ui/src/components/settings-screen.tsx
@@ -1,5 +1,5 @@
 import { Dialog } from "@kobalte/core/dialog"
-import { Settings, Bell, MonitorUp, Paintbrush, Terminal, X } from "lucide-solid"
+import { Settings, Bell, MonitorUp, Paintbrush, Terminal, Volume2, X } from "lucide-solid"
 import { createMemo, For, type Component } from "solid-js"
 import { useI18n } from "../lib/i18n"
 import {
@@ -13,6 +13,7 @@ import { AppearanceSettingsSection } from "./settings/appearance-settings-sectio
 import { NotificationsSettingsSection } from "./settings/notifications-settings-section"
 import { OpenCodeSettingsSection } from "./settings/opencode-settings-section"
 import { RemoteAccessSettingsSection } from "./settings/remote-access-settings-section"
+import { SpeechSettingsSection } from "./settings/speech-settings-section"

 export const SettingsScreen: Component = () => {
  const { t } = useI18n()
@@ -21,6 +22,7 @@ export const SettingsScreen: Component = () => {
    { id: "appearance" as SettingsSectionId, icon: Paintbrush, label: t("settings.nav.appearance") },
    { id: "notifications" as SettingsSectionId, icon: Bell, label: t("settings.nav.notifications") },
    { id: "remote" as SettingsSectionId, icon: MonitorUp, label: t("settings.nav.remote") },
+    { id: "speech" as SettingsSectionId, icon: Volume2, label: t("settings.nav.speech") },
    { id: "opencode" as SettingsSectionId, icon: Terminal, label: t("settings.nav.opencode") },
  ])

@@ -30,6 +32,8 @@ export const SettingsScreen: Component = () => {
        return <NotificationsSettingsSection />
      case "remote":
        return <RemoteAccessSettingsSection />
+      case "speech":
+        return <SpeechSettingsSection />
      case "opencode":
        return <OpenCodeSettingsSection />
      case "appearance":
--- a/packages/ui/src/components/settings/appearance-settings-section.tsx
+++ b/packages/ui/src/components/settings/appearance-settings-section.tsx
@@ -24,6 +24,7 @@ export const AppearanceSettingsSection: Component = () => {
    toggleUsageMetrics,
    toggleAutoCleanupBlankSessions,
    togglePromptSubmitOnEnter,
+    toggleShowPromptVoiceInput,
    setDiffViewMode,
    setToolOutputExpansion,
    setDiagnosticsExpansion,
@@ -38,10 +39,11 @@ export const AppearanceSettingsSection: Component = () => {
      toggleShowThinkingBlocks,
      toggleKeyboardShortcutHints,
      toggleShowTimelineTools,
-      toggleUsageMetrics,
-      toggleAutoCleanupBlankSessions,
-      togglePromptSubmitOnEnter,
-      setDiffViewMode,
+        toggleUsageMetrics,
+        toggleAutoCleanupBlankSessions,
+        togglePromptSubmitOnEnter,
+        toggleShowPromptVoiceInput,
+        setDiffViewMode,
      setToolOutputExpansion,
      setDiagnosticsExpansion,
      setThinkingBlocksExpansion,
--- a/packages/ui/src/components/settings/speech-settings-card.tsx
+++ b/packages/ui/src/components/settings/speech-settings-card.tsx
@@ -0,0 +1,220 @@
+import { createEffect, createMemo, createSignal, type Component } from "solid-js"
+import { Mic, Volume2 } from "lucide-solid"
+import { useConfig, type SpeechSettings } from "../../stores/preferences"
+import { useI18n } from "../../lib/i18n"
+import { loadSpeechCapabilities, speechCapabilities, speechCapabilitiesError, speechCapabilitiesLoading } from "../../stores/speech"
+import { getLogger } from "../../lib/logger"
+
+// Logger for this module, obtained under the "actions" name.
+const log = getLogger("actions")
+
+/**
+ * Editable values backing the speech settings form. Every field is a plain
+ * string so it can be bound directly to a text input.
+ */
+type DraftFields = {
+  apiKey: string
+  baseUrl: string
+  sttModel: string
+  ttsModel: string
+  ttsVoice: string
+}
+
+/**
+ * Builds form drafts from stored speech settings.
+ *
+ * A null or undefined `apiKey` / `baseUrl` becomes an empty string; the model
+ * and voice values are copied through unchanged.
+ *
+ * @param speech - The stored speech settings.
+ * @returns Draft values mirroring `speech`.
+ */
+function createDraftFields(speech: SpeechSettings): DraftFields {
+  const { apiKey, baseUrl, sttModel, ttsModel, ttsVoice } = speech
+  return { apiKey: apiKey ?? "", baseUrl: baseUrl ?? "", sttModel, ttsModel, ttsVoice }
+}
+
+/**
+ * Reports whether two drafts hold identical values in every field.
+ *
+ * @param a - First draft.
+ * @param b - Second draft.
+ * @returns True when all five fields are strictly equal.
+ */
+function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
+  const fieldNames: Array<keyof DraftFields> = ["apiKey", "baseUrl", "sttModel", "ttsModel", "ttsVoice"]
+  return fieldNames.every((fieldName) => a[fieldName] === b[fieldName])
+}
+
+/**
+ * Settings card for the server-side speech provider: API key, base URL,
+ * speech-to-text model, text-to-speech model and voice.
+ *
+ * Edits are held in a local draft and only sent when the save button is
+ * pressed. While there are no unsaved edits the draft follows the server
+ * settings.
+ */
+export const SpeechSettingsCard: Component = () => {
+  const { t } = useI18n()
+  const { serverSettings, updateSpeechSettings } = useConfig()
+  const initialDrafts = createDraftFields(serverSettings().speech)
+  const [isSaving, setIsSaving] = createSignal(false)
+  const [saveStatus, setSaveStatus] = createSignal<"idle" | "saved" | "error">("idle")
+  const [drafts, setDrafts] = createSignal<DraftFields>(initialDrafts)
+
+  // Declared before the effect that reads it, so that effect never depends on
+  // being deferred past this declaration. True when the draft differs from
+  // what the server currently holds.
+  const isDirty = createMemo(() => !isDraftEqual(drafts(), createDraftFields(serverSettings().speech)))
+
+  createEffect(() => {
+    const speech = serverSettings().speech
+    const nextDrafts = createDraftFields(speech)
+    // Follow the server values unless the user has unsaved edits; while a save
+    // is in flight the server values are applied regardless.
+    if (!isDirty() || isSaving()) {
+      if (!isDraftEqual(drafts(), nextDrafts)) {
+        setDrafts(nextDrafts)
+      }
+    }
+    // A form that matches the server with no save running is shown as saved.
+    if (!isSaving() && !isDirty()) {
+      setSaveStatus("saved")
+    }
+  })
+
+  createEffect(() => {
+    void loadSpeechCapabilities()
+  })
+
+  /** Status text for the speech capability lookup: loading, error, configured or missing. */
+  const capabilityLabel = () => {
+    if (speechCapabilitiesLoading()) return t("settings.speech.status.loading")
+    if (speechCapabilitiesError()) return t("settings.speech.status.error")
+    return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
+  }
+
+  /** Stores one edited field in the draft and clears any previous save status. */
+  const updateDraft = (key: keyof DraftFields, value: string) => {
+    setSaveStatus("idle")
+    setDrafts((current) => ({ ...current, [key]: value }))
+  }
+
+  /** Status text for the save state: saving, saved, error or unsaved. */
+  const saveStatusLabel = () => {
+    if (isSaving()) return t("settings.speech.save.saving")
+    if (saveStatus() === "saved") return t("settings.speech.save.saved")
+    if (saveStatus() === "error") return t("settings.speech.save.error")
+    return t("settings.speech.save.unsaved")
+  }
+
+  /**
+   * Sends the trimmed draft to the server (blank fields are sent as
+   * undefined), reloads the speech capabilities and normalises the draft.
+   * Failures are logged and surfaced through the save status.
+   */
+  async function handleSave() {
+    if (!isDirty() || isSaving()) return
+    setIsSaving(true)
+    setSaveStatus("idle")
+    const current = drafts()
+    try {
+      await updateSpeechSettings({
+        apiKey: current.apiKey.trim() || undefined,
+        baseUrl: current.baseUrl.trim() || undefined,
+        sttModel: current.sttModel.trim() || undefined,
+        ttsModel: current.ttsModel.trim() || undefined,
+        ttsVoice: current.ttsVoice.trim() || undefined,
+      })
+      // NOTE(review): the `true` argument presumably forces a refresh - confirm in stores/speech.
+      await loadSpeechCapabilities(true)
+      // A model or voice that was left blank falls back to the value the server holds.
+      setDrafts({
+        apiKey: current.apiKey.trim(),
+        baseUrl: current.baseUrl.trim(),
+        sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
+        ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
+        ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
+      })
+      setSaveStatus("saved")
+    } catch (error) {
+      log.error("Failed to save speech settings", error)
+      setSaveStatus("error")
+    } finally {
+      setIsSaving(false)
+    }
+  }
+
+  return (
+    <div class="settings-card">
+      <div class="settings-card-header">
+        <div class="settings-card-heading-with-icon">
+          <Volume2 class="settings-card-heading-icon" />
+          <div>
+            <h3 class="settings-card-title">{t("settings.speech.title")}</h3>
+            <p class="settings-card-subtitle">{t("settings.speech.subtitle")}</p>
+          </div>
+        </div>
+        <span class="settings-scope-badge settings-scope-badge-server">{t("settings.scope.server")}</span>
+      </div>
+
+      <div class="settings-stack">
+        <div class="settings-toggle-row settings-toggle-row-compact">
+          <div>
+            <div class="settings-toggle-title">{t("settings.speech.provider.title")}</div>
+            <div class="settings-toggle-caption">{t("settings.speech.provider.subtitle")}</div>
+          </div>
+          <div class="settings-toolbar-inline">
+            <span class="settings-inline-note">{t("settings.speech.provider.openaiCompatible")}</span>
+            <span class="settings-inline-note">{capabilityLabel()}</span>
+            <span class="settings-inline-note">{saveStatusLabel()}</span>
+            <button
+              type="button"
+              class="selector-button selector-button-primary w-auto whitespace-nowrap"
+              onClick={() => void handleSave()}
+              disabled={!isDirty() || isSaving()}
+            >
+              {isSaving() ? t("settings.speech.save.saving") : t("settings.speech.save.action")}
+            </button>
+          </div>
+        </div>
+
+        <Field
+          label={t("settings.speech.apiKey.title")}
+          caption={t("settings.speech.apiKey.subtitle")}
+          value={drafts().apiKey}
+          onInput={(value) => updateDraft("apiKey", value)}
+          type="password"
+        />
+        <Field
+          label={t("settings.speech.baseUrl.title")}
+          caption={t("settings.speech.baseUrl.subtitle")}
+          value={drafts().baseUrl}
+          onInput={(value) => updateDraft("baseUrl", value)}
+          placeholder={t("settings.speech.baseUrl.placeholder")}
+        />
+        <Field
+          label={t("settings.speech.sttModel.title")}
+          caption={t("settings.speech.sttModel.subtitle")}
+          value={drafts().sttModel}
+          onInput={(value) => updateDraft("sttModel", value)}
+        />
+        <Field
+          label={t("settings.speech.ttsModel.title")}
+          caption={t("settings.speech.ttsModel.subtitle")}
+          value={drafts().ttsModel}
+          onInput={(value) => updateDraft("ttsModel", value)}
+        />
+        <Field
+          label={t("settings.speech.ttsVoice.title")}
+          caption={t("settings.speech.ttsVoice.subtitle")}
+          value={drafts().ttsVoice}
+          onInput={(value) => updateDraft("ttsVoice", value)}
+          icon={<Mic class="w-3.5 h-3.5 icon-muted flex-shrink-0" />}
+        />
+
+        <div class="settings-inline-note">{t("settings.speech.help")}</div>
+      </div>
+    </div>
+  )
+}
+
+/**
+ * One settings row: a title and caption on the left, a text input (with an
+ * optional leading icon) on the right.
+ *
+ * `props` is read inside the JSX rather than destructured so the row stays
+ * reactive to prop changes.
+ */
+const Field: Component<{
+  /** Row title; also used as the input's accessible name. */
+  label: string
+  /** Secondary text shown under the title. */
+  caption: string
+  /** Current input value. */
+  value: string
+  /** Input `type` attribute; defaults to "text". */
+  type?: string
+  placeholder?: string
+  /** Called with the input's current value on every input event. */
+  onInput: (value: string) => void
+  /** Element rendered before the input. Typed as a component's return value, i.e. a JSX element. */
+  icon?: ReturnType<Component>
+}> = (props) => {
+  return (
+    <div class="settings-toggle-row settings-toggle-row-compact">
+      <div>
+        <div class="settings-toggle-title">{props.label}</div>
+        <div class="settings-toggle-caption">{props.caption}</div>
+      </div>
+      <div class="flex items-center gap-2 min-w-[18rem] max-w-[24rem] w-full">
+        {props.icon}
+        <input
+          type={props.type ?? "text"}
+          value={props.value}
+          onInput={(event) => props.onInput(event.currentTarget.value)}
+          class="selector-input w-full"
+          placeholder={props.placeholder}
+          // The visible title is a plain <div>, not a <label>, so name the input explicitly.
+          aria-label={props.label}
+        />
+      </div>
+    </div>
+  )
+}
+
+export default SpeechSettingsCard
--- a/packages/ui/src/components/settings/speech-settings-section.tsx
+++ b/packages/ui/src/components/settings/speech-settings-section.tsx
@@ -0,0 +1,10 @@
+import type { Component } from "solid-js"
+import SpeechSettingsCard from "./speech-settings-card"
+
+/** Settings-screen section that wraps the speech settings card in the section stack container. */
+export const SpeechSettingsSection: Component = () => (
+  <div class="settings-section-stack">
+    <SpeechSettingsCard />
+  </div>
+)