feat(speech): add prompt voice input (#249)

## Summary - add server-backed speech capabilities and transcription endpoints plus UI settings for speech configuration - add push-to-talk prompt voice input with microphone controls, transcription insertion, and browser capability gating - keep prompt controls aligned by restoring right-side nav placement and moving the mic beside the expand control
2026-03-25 14:08:11 +00:00
parent a950d47df0
commit 1233121a13
40 changed files with 1545 additions and 27 deletions
--- a/packages/ui/src/stores/preferences.tsx
+++ b/packages/ui/src/stores/preferences.tsx
@@ -7,6 +7,7 @@ import {
  updateInstanceConfig as updateInstanceData,
 } from "./instance-config"
 import { getLogger } from "../lib/logger"
+import { loadSpeechCapabilities, resetSpeechCapabilities } from "./speech"

 const log = getLogger("actions")

@@ -27,6 +28,21 @@ export type DiffViewMode = "split" | "unified"
 export type ExpansionPreference = "expanded" | "collapsed"
 export type ToolInputsVisibilityPreference = "hidden" | "collapsed" | "expanded"
 export type ListeningMode = "local" | "all"
+export type SpeechProviderPreference = "openai-compatible"
+
+export interface SpeechSettings {
+  provider: SpeechProviderPreference
+  apiKey?: string
+  hasApiKey: boolean
+  baseUrl?: string
+  sttModel: string
+  ttsModel: string
+  ttsVoice: string
+}
+
+export type SpeechSettingsUpdate = Partial<Omit<SpeechSettings, "apiKey">> & {
+  apiKey?: string | null
+}

 export interface UiSettings {
  showThinkingBlocks: boolean
@@ -34,6 +50,7 @@ export interface UiSettings {
  thinkingBlocksExpansion: ExpansionPreference
  showTimelineTools: boolean
  promptSubmitOnEnter: boolean
+  showPromptVoiceInput: boolean
  locale?: string
  diffViewMode: DiffViewMode
  toolOutputExpansion: ExpansionPreference
@@ -75,6 +92,7 @@ interface ServerConfigBucket {
  listeningMode?: ListeningMode
  environmentVariables?: Record<string, string>
  opencodeBinary?: string
+  speech?: Partial<SpeechSettings>
 }

 interface UiStateBucket {
@@ -107,6 +125,7 @@ const defaultUiSettings: UiSettings = {
  thinkingBlocksExpansion: "expanded",
  showTimelineTools: true,
  promptSubmitOnEnter: false,
+  showPromptVoiceInput: true,
  diffViewMode: "split",
  toolOutputExpansion: "expanded",
  diagnosticsExpansion: "expanded",
@@ -120,6 +139,14 @@ const defaultUiSettings: UiSettings = {
  notifyOnIdle: true,
 }

+const defaultSpeechSettings: SpeechSettings = {
+  provider: "openai-compatible",
+  hasApiKey: false,
+  sttModel: "gpt-4o-mini-transcribe",
+  ttsModel: "gpt-4o-mini-tts",
+  ttsVoice: "alloy",
+}
+
 function normalizeUiSettings(input?: Partial<UiSettings> | null): UiSettings {
  const sanitized = input ?? {}
  return {
@@ -129,6 +156,7 @@ function normalizeUiSettings(input?: Partial<UiSettings> | null): UiSettings {
    thinkingBlocksExpansion: sanitized.thinkingBlocksExpansion ?? defaultUiSettings.thinkingBlocksExpansion,
    showTimelineTools: sanitized.showTimelineTools ?? defaultUiSettings.showTimelineTools,
    promptSubmitOnEnter: sanitized.promptSubmitOnEnter ?? defaultUiSettings.promptSubmitOnEnter,
+    showPromptVoiceInput: sanitized.showPromptVoiceInput ?? defaultUiSettings.showPromptVoiceInput,
    locale: sanitized.locale ?? defaultUiSettings.locale,
    diffViewMode: sanitized.diffViewMode ?? defaultUiSettings.diffViewMode,
    toolOutputExpansion: sanitized.toolOutputExpansion ?? defaultUiSettings.toolOutputExpansion,
@@ -156,6 +184,28 @@ function normalizeRecord(value: unknown): Record<string, string> {
  return out
 }

+function normalizeSpeechSettings(input?: Partial<SpeechSettings> | null): SpeechSettings {
+  const sanitized = input ?? {}
+  return {
+    provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider,
+    apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined,
+    hasApiKey: sanitized.hasApiKey === true || (typeof sanitized.apiKey === "string" && sanitized.apiKey.trim().length > 0),
+    baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined,
+    sttModel:
+      typeof sanitized.sttModel === "string" && sanitized.sttModel.trim()
+        ? sanitized.sttModel.trim()
+        : defaultSpeechSettings.sttModel,
+    ttsModel:
+      typeof sanitized.ttsModel === "string" && sanitized.ttsModel.trim()
+        ? sanitized.ttsModel.trim()
+        : defaultSpeechSettings.ttsModel,
+    ttsVoice:
+      typeof sanitized.ttsVoice === "string" && sanitized.ttsVoice.trim()
+        ? sanitized.ttsVoice.trim()
+        : defaultSpeechSettings.ttsVoice,
+  }
+}
+
 function cloneArray<T>(value: unknown, mapper: (item: any) => T | null): T[] {
  if (!Array.isArray(value)) return []
  const out: T[] = []
@@ -206,12 +256,15 @@ function normalizeUiState(input?: UiStateBucket | null): NormalizedUiState {
  }
 }

-function normalizeServerConfig(input?: ServerConfigBucket | null): Required<Pick<ServerConfigBucket, "listeningMode" | "environmentVariables" | "opencodeBinary">> {
+function normalizeServerConfig(
+  input?: ServerConfigBucket | null,
+): Required<Pick<ServerConfigBucket, "listeningMode" | "environmentVariables" | "opencodeBinary">> & { speech: SpeechSettings } {
  const source = input ?? {}
  const listeningMode = source.listeningMode === "all" ? "all" : "local"
  const opencodeBinary = typeof source.opencodeBinary === "string" && source.opencodeBinary.trim() ? source.opencodeBinary : "opencode"
  const environmentVariables = normalizeRecord(source.environmentVariables)
-  return { listeningMode, opencodeBinary, environmentVariables }
+  const speech = normalizeSpeechSettings(source.speech)
+  return { listeningMode, opencodeBinary, environmentVariables, speech }
 }

 function getModelKey(model: { providerId: string; modelId: string }): string {
@@ -342,6 +395,27 @@ function updateLastUsedBinary(path: string): void {
  void patchStateOwner("ui", { opencodeBinaries: nextList }).catch((error) => log.error("Failed to update binary list", error))
 }

+async function updateSpeechSettings(updates: SpeechSettingsUpdate): Promise<void> {
+  const apiKeyPatch = updates.apiKey
+  const { apiKey: _apiKey, ...restUpdates } = updates
+  const next = normalizeSpeechSettings({
+    ...serverSettings().speech,
+    ...restUpdates,
+    ...(apiKeyPatch === null ? {} : { apiKey: apiKeyPatch }),
+  })
+  const { hasApiKey: _hasApiKey, ...persistedSpeech } = next
+  const patch = {
+    ...persistedSpeech,
+    ...(apiKeyPatch === null ? { apiKey: null } : {}),
+  }
+  try {
+    await patchConfigOwner("server", { speech: patch })
+  } catch (error) {
+    log.error("Failed to update speech settings", error)
+    throw error
+  }
+}
+
 function addOpenCodeBinary(path: string, version?: string): void {
  const nextList = buildBinaryList(path, version, opencodeBinaries())
  void patchStateOwner("ui", { opencodeBinaries: nextList }).catch((error) => log.error("Failed to add binary", error))
@@ -476,6 +550,10 @@ function togglePromptSubmitOnEnter(): void {
  updateUiSettings({ promptSubmitOnEnter: !preferences().promptSubmitOnEnter })
 }

+function toggleShowPromptVoiceInput(): void {
+  updateUiSettings({ showPromptVoiceInput: !preferences().showPromptVoiceInput })
+}
+
 function toggleAutoCleanupBlankSessions(): void {
  const nextValue = !preferences().autoCleanupBlankSessions
  log.info("toggle auto cleanup", { value: nextValue })
@@ -521,6 +599,7 @@ interface ConfigContextValue {
  addEnvironmentVariable: typeof addEnvironmentVariable
  removeEnvironmentVariable: typeof removeEnvironmentVariable
  updateLastUsedBinary: typeof updateLastUsedBinary
+  updateSpeechSettings: typeof updateSpeechSettings

  // ui-owned state
  recentFolders: typeof recentFolders
@@ -544,6 +623,7 @@ interface ConfigContextValue {
  toggleUsageMetrics: typeof toggleUsageMetrics
  toggleAutoCleanupBlankSessions: typeof toggleAutoCleanupBlankSessions
  togglePromptSubmitOnEnter: typeof togglePromptSubmitOnEnter
+  toggleShowPromptVoiceInput: typeof toggleShowPromptVoiceInput
  setDiffViewMode: typeof setDiffViewMode
  setToolOutputExpansion: typeof setToolOutputExpansion
  setDiagnosticsExpansion: typeof setDiagnosticsExpansion
@@ -569,6 +649,7 @@ const configContextValue: ConfigContextValue = {
  addEnvironmentVariable,
  removeEnvironmentVariable,
  updateLastUsedBinary,
+  updateSpeechSettings,
  recentFolders,
  opencodeBinaries,
  uiState,
@@ -588,6 +669,7 @@ const configContextValue: ConfigContextValue = {
  toggleUsageMetrics,
  toggleAutoCleanupBlankSessions,
  togglePromptSubmitOnEnter,
+  toggleShowPromptVoiceInput,
  setDiffViewMode,
  setToolOutputExpansion,
  setDiagnosticsExpansion,
@@ -610,6 +692,8 @@ export const ConfigProvider: ParentComponent = (props) => {
    const unsubServer = storage.onConfigOwnerChanged("server", (bucket) => {
      setServerConfigBucket(bucket as any)
      setIsLoaded(true)
+      resetSpeechCapabilities()
+      void loadSpeechCapabilities(true)
    })
    const unsubStateUi = storage.onStateOwnerChanged("ui", (bucket) => {
      setUiStateBucket(bucket as any)
@@ -648,6 +732,7 @@ export {
  addEnvironmentVariable,
  removeEnvironmentVariable,
  updateLastUsedBinary,
+  updateSpeechSettings,
  addRecentFolder,
  removeRecentFolder,
  addOpenCodeBinary,
@@ -664,6 +749,7 @@ export {
  toggleUsageMetrics,
  toggleAutoCleanupBlankSessions,
  togglePromptSubmitOnEnter,
+  toggleShowPromptVoiceInput,
  setDiffViewMode,
  setToolOutputExpansion,
  setDiagnosticsExpansion,