feat(speech): add prompt voice input groundwork

2026-03-12 22:04:57 +00:00
parent d9068ac8c6
commit 0ed19aeefb
34 changed files with 1284 additions and 9 deletions
--- a/packages/ui/src/stores/preferences.tsx
+++ b/packages/ui/src/stores/preferences.tsx
@@ -7,6 +7,7 @@ import {
  updateInstanceConfig as updateInstanceData,
 } from "./instance-config"
 import { getLogger } from "../lib/logger"
+import { loadSpeechCapabilities, resetSpeechCapabilities } from "./speech"

 const log = getLogger("actions")

@@ -27,6 +28,16 @@ export type DiffViewMode = "split" | "unified"
 export type ExpansionPreference = "expanded" | "collapsed"
 export type ToolInputsVisibilityPreference = "hidden" | "collapsed" | "expanded"
 export type ListeningMode = "local" | "all"
+export type SpeechProviderPreference = "openai-compatible" // the only provider value accepted so far
+
+export interface SpeechSettings {
+  provider: SpeechProviderPreference
+  apiKey?: string // optional; normalizeSpeechSettings trims it and drops blank values
+  baseUrl?: string // optional; normalizeSpeechSettings trims it and drops blank values
+  sttModel: string // required; a blank value is replaced by the defaultSpeechSettings entry
+  ttsModel: string // required; a blank value is replaced by the defaultSpeechSettings entry
+  ttsVoice: string // required; a blank value is replaced by the defaultSpeechSettings entry
+}

 export interface UiSettings {
  showThinkingBlocks: boolean
@@ -34,6 +45,7 @@ export interface UiSettings {
  thinkingBlocksExpansion: ExpansionPreference
  showTimelineTools: boolean
  promptSubmitOnEnter: boolean
+  showPromptVoiceInput: boolean
  locale?: string
  diffViewMode: DiffViewMode
  toolOutputExpansion: ExpansionPreference
@@ -75,6 +87,7 @@ interface ServerConfigBucket {
  listeningMode?: ListeningMode
  environmentVariables?: Record<string, string>
  opencodeBinary?: string
+  speech?: Partial<SpeechSettings>
 }

 interface UiStateBucket {
@@ -107,6 +120,7 @@ const defaultUiSettings: UiSettings = {
  thinkingBlocksExpansion: "expanded",
  showTimelineTools: true,
  promptSubmitOnEnter: false,
+  showPromptVoiceInput: true,
  diffViewMode: "split",
  toolOutputExpansion: "expanded",
  diagnosticsExpansion: "expanded",
@@ -120,6 +134,13 @@ const defaultUiSettings: UiSettings = {
  notifyOnIdle: true,
 }

+const defaultSpeechSettings: SpeechSettings = { // fallback values applied by normalizeSpeechSettings
+  provider: "openai-compatible",
+  sttModel: "gpt-4o-mini-transcribe",
+  ttsModel: "gpt-4o-mini-tts",
+  ttsVoice: "alloy",
+} // apiKey and baseUrl have no default here and stay undefined unless configured
+
 function normalizeUiSettings(input?: Partial<UiSettings> | null): UiSettings {
  const sanitized = input ?? {}
  return {
@@ -129,6 +150,7 @@ function normalizeUiSettings(input?: Partial<UiSettings> | null): UiSettings {
    thinkingBlocksExpansion: sanitized.thinkingBlocksExpansion ?? defaultUiSettings.thinkingBlocksExpansion,
    showTimelineTools: sanitized.showTimelineTools ?? defaultUiSettings.showTimelineTools,
    promptSubmitOnEnter: sanitized.promptSubmitOnEnter ?? defaultUiSettings.promptSubmitOnEnter,
+    showPromptVoiceInput: sanitized.showPromptVoiceInput ?? defaultUiSettings.showPromptVoiceInput,
    locale: sanitized.locale ?? defaultUiSettings.locale,
    diffViewMode: sanitized.diffViewMode ?? defaultUiSettings.diffViewMode,
    toolOutputExpansion: sanitized.toolOutputExpansion ?? defaultUiSettings.toolOutputExpansion,
@@ -156,6 +178,27 @@ function normalizeRecord(value: unknown): Record<string, string> {
  return out
 }

+/** Builds complete SpeechSettings from untrusted input: strings are trimmed, blanks count as missing. */
+function normalizeSpeechSettings(input?: Partial<SpeechSettings> | null): SpeechSettings {
+  // Trimmed text when `value` is a non-blank string, otherwise undefined.
+  const clean = (value: unknown): string | undefined => {
+    if (typeof value !== "string") return undefined
+    return value.trim() || undefined
+  }
+  const raw = input ?? {}
+  return {
+    // Any unrecognized provider value falls back to the default one.
+    provider: raw.provider === "openai-compatible" ? raw.provider : defaultSpeechSettings.provider,
+    // Optional fields stay undefined when absent or blank.
+    apiKey: clean(raw.apiKey),
+    baseUrl: clean(raw.baseUrl),
+    // Required fields fall back to their defaults instead.
+    sttModel: clean(raw.sttModel) ?? defaultSpeechSettings.sttModel,
+    ttsModel: clean(raw.ttsModel) ?? defaultSpeechSettings.ttsModel,
+    ttsVoice: clean(raw.ttsVoice) ?? defaultSpeechSettings.ttsVoice,
+  }
+}
+
 function cloneArray<T>(value: unknown, mapper: (item: any) => T | null): T[] {
  if (!Array.isArray(value)) return []
  const out: T[] = []
@@ -206,12 +249,15 @@ function normalizeUiState(input?: UiStateBucket | null): NormalizedUiState {
  }
 }

-function normalizeServerConfig(input?: ServerConfigBucket | null): Required<Pick<ServerConfigBucket, "listeningMode" | "environmentVariables" | "opencodeBinary">> {
+function normalizeServerConfig(
+  input?: ServerConfigBucket | null,
+): Required<Pick<ServerConfigBucket, "listeningMode" | "environmentVariables" | "opencodeBinary">> & { speech: SpeechSettings } { // speech is always fully populated
  const source = input ?? {}
  const listeningMode = source.listeningMode === "all" ? "all" : "local"
  const opencodeBinary = typeof source.opencodeBinary === "string" && source.opencodeBinary.trim() ? source.opencodeBinary : "opencode"
  const environmentVariables = normalizeRecord(source.environmentVariables)
-  return { listeningMode, opencodeBinary, environmentVariables }
+  const speech = normalizeSpeechSettings(source.speech) // a missing or partial bucket value still yields complete settings
+  return { listeningMode, opencodeBinary, environmentVariables, speech }
 }

 function getModelKey(model: { providerId: string; modelId: string }): string {
@@ -342,6 +388,16 @@ function updateLastUsedBinary(path: string): void {
  void patchStateOwner("ui", { opencodeBinaries: nextList }).catch((error) => log.error("Failed to update binary list", error))
 }

+async function updateSpeechSettings(updates: Partial<SpeechSettings>): Promise<void> { // merge over current, normalize, patch
+  const payload = { speech: normalizeSpeechSettings({ ...serverSettings().speech, ...updates }) }
+  try {
+    await patchConfigOwner("server", payload)
+  } catch (failure) {
+    log.error("Failed to update speech settings", failure) // logged here, then rethrown so the caller can react
+    throw failure
+  }
+}
+
 function addOpenCodeBinary(path: string, version?: string): void {
  const nextList = buildBinaryList(path, version, opencodeBinaries())
  void patchStateOwner("ui", { opencodeBinaries: nextList }).catch((error) => log.error("Failed to add binary", error))
@@ -476,6 +532,10 @@ function togglePromptSubmitOnEnter(): void {
  updateUiSettings({ promptSubmitOnEnter: !preferences().promptSubmitOnEnter })
 }

+function toggleShowPromptVoiceInput(): void {
+  updateUiSettings({ showPromptVoiceInput: !preferences().showPromptVoiceInput }) // writes the negation of the current preference
+}
+
 function toggleAutoCleanupBlankSessions(): void {
  const nextValue = !preferences().autoCleanupBlankSessions
  log.info("toggle auto cleanup", { value: nextValue })
@@ -521,6 +581,7 @@ interface ConfigContextValue {
  addEnvironmentVariable: typeof addEnvironmentVariable
  removeEnvironmentVariable: typeof removeEnvironmentVariable
  updateLastUsedBinary: typeof updateLastUsedBinary
+  updateSpeechSettings: typeof updateSpeechSettings

  // ui-owned state
  recentFolders: typeof recentFolders
@@ -544,6 +605,7 @@ interface ConfigContextValue {
  toggleUsageMetrics: typeof toggleUsageMetrics
  toggleAutoCleanupBlankSessions: typeof toggleAutoCleanupBlankSessions
  togglePromptSubmitOnEnter: typeof togglePromptSubmitOnEnter
+  toggleShowPromptVoiceInput: typeof toggleShowPromptVoiceInput
  setDiffViewMode: typeof setDiffViewMode
  setToolOutputExpansion: typeof setToolOutputExpansion
  setDiagnosticsExpansion: typeof setDiagnosticsExpansion
@@ -569,6 +631,7 @@ const configContextValue: ConfigContextValue = {
  addEnvironmentVariable,
  removeEnvironmentVariable,
  updateLastUsedBinary,
+  updateSpeechSettings,
  recentFolders,
  opencodeBinaries,
  uiState,
@@ -588,6 +651,7 @@ const configContextValue: ConfigContextValue = {
  toggleUsageMetrics,
  toggleAutoCleanupBlankSessions,
  togglePromptSubmitOnEnter,
+  toggleShowPromptVoiceInput,
  setDiffViewMode,
  setToolOutputExpansion,
  setDiagnosticsExpansion,
@@ -610,6 +674,8 @@ export const ConfigProvider: ParentComponent = (props) => {
    const unsubServer = storage.onConfigOwnerChanged("server", (bucket) => {
      setServerConfigBucket(bucket as any)
      setIsLoaded(true)
+      resetSpeechCapabilities()
+      void loadSpeechCapabilities(true)
    })
    const unsubStateUi = storage.onStateOwnerChanged("ui", (bucket) => {
      setUiStateBucket(bucket as any)
@@ -648,6 +714,7 @@ export {
  addEnvironmentVariable,
  removeEnvironmentVariable,
  updateLastUsedBinary,
+  updateSpeechSettings,
  addRecentFolder,
  removeRecentFolder,
  addOpenCodeBinary,
@@ -664,6 +731,7 @@ export {
  toggleUsageMetrics,
  toggleAutoCleanupBlankSessions,
  togglePromptSubmitOnEnter,
+  toggleShowPromptVoiceInput,
  setDiffViewMode,
  setToolOutputExpansion,
  setDiagnosticsExpansion,
--- a/packages/ui/src/stores/settings-screen.ts
+++ b/packages/ui/src/stores/settings-screen.ts
@@ -1,6 +1,6 @@
 import { createSignal } from "solid-js"

-export type SettingsSectionId = "appearance" | "notifications" | "remote" | "opencode"
+export type SettingsSectionId = "appearance" | "notifications" | "remote" | "speech" | "opencode" // allowed values of the activeSettingsSection signal

 const [settingsOpen, setSettingsOpen] = createSignal(false)
 const [activeSettingsSection, setActiveSettingsSection] = createSignal<SettingsSectionId>("appearance")
--- a/packages/ui/src/stores/speech.ts
+++ b/packages/ui/src/stores/speech.ts
@@ -0,0 +1,84 @@
+import { createSignal } from "solid-js"
+import type { SpeechCapabilitiesResponse } from "../../../server/src/api-types"
+import { serverApi } from "../lib/api-client"
+import { getLogger } from "../lib/logger"
+
+const log = getLogger("api")
+
+const [speechCapabilities, setSpeechCapabilities] = createSignal<SpeechCapabilitiesResponse | null>(null)
+const [speechCapabilitiesLoading, setSpeechCapabilitiesLoading] = createSignal(false)
+const [speechCapabilitiesError, setSpeechCapabilitiesError] = createSignal<string | null>(null)
+
+/** Request currently allowed to publish its outcome; shared by non-forced callers. Null when idle. */
+let speechCapabilitiesPromise: Promise<SpeechCapabilitiesResponse | null> | null = null
+
+/**
+ * Incremented by every new request and every reset. A request only writes to
+ * the signals while its captured value still matches, so a response that was
+ * already in flight when the configuration changed cannot overwrite newer state.
+ */
+let speechCapabilitiesGeneration = 0
+
+/**
+ * Fetches the speech capabilities from the server and publishes them through
+ * the `speechCapabilities`, `speechCapabilitiesLoading` and
+ * `speechCapabilitiesError` signals.
+ *
+ * @param force - When false, a cached value or an in-flight request is reused.
+ *   When true, a new request is always issued and supersedes any in-flight one.
+ * @returns The fetched capabilities; a failed request resolves to null instead of rejecting.
+ */
+async function loadSpeechCapabilities(force = false): Promise<SpeechCapabilitiesResponse | null> {
+  if (!force) {
+    const cached = speechCapabilities()
+    if (cached) return cached
+    if (speechCapabilitiesPromise) return speechCapabilitiesPromise
+  }
+
+  const generation = ++speechCapabilitiesGeneration
+  const isCurrent = (): boolean => generation === speechCapabilitiesGeneration
+
+  setSpeechCapabilitiesLoading(true)
+  setSpeechCapabilitiesError(null)
+  const request = serverApi
+    .fetchSpeechCapabilities()
+    .then((result) => {
+      if (isCurrent()) {
+        setSpeechCapabilities(result)
+        setSpeechCapabilitiesError(null)
+      }
+      return result
+    })
+    .catch((error: unknown) => {
+      log.error("Failed to load speech capabilities", error)
+      if (isCurrent()) {
+        setSpeechCapabilities(null)
+        setSpeechCapabilitiesError(error instanceof Error ? error.message : String(error))
+      }
+      return null
+    })
+    .finally(() => {
+      // A superseded request must not clear the loading flag or the promise slot
+      // that now belong to its successor.
+      if (!isCurrent()) return
+      setSpeechCapabilitiesLoading(false)
+      speechCapabilitiesPromise = null
+    })
+
+  speechCapabilitiesPromise = request
+  return request
+}
+
+/**
+ * Clears the cached capabilities, error and loading state, and invalidates any
+ * in-flight request so its late response is ignored.
+ */
+function resetSpeechCapabilities(): void {
+  speechCapabilitiesGeneration += 1
+  speechCapabilitiesPromise = null
+  setSpeechCapabilities(null)
+  setSpeechCapabilitiesError(null)
+  setSpeechCapabilitiesLoading(false)
+}
+
+export { speechCapabilities, speechCapabilitiesLoading, speechCapabilitiesError, loadSpeechCapabilities, resetSpeechCapabilities }