diff --git a/package-lock.json b/package-lock.json index afc757c1..8af51fd0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8231,6 +8231,27 @@ "regex-recursion": "^6.0.2" } }, + "node_modules/openai": { + "version": "6.27.0", + "resolved": "https://registry.npmjs.org/openai/-/openai-6.27.0.tgz", + "integrity": "sha512-osTKySlrdYrLYTt0zjhY8yp0JUBmWDCN+Q+QxsV4xMQnnoVFpylgKGgxwN8sSdTNw0G4y+WUXs4eCMWpyDNWZQ==", + "license": "Apache-2.0", + "bin": { + "openai": "bin/cli" + }, + "peerDependencies": { + "ws": "^8.18.0", + "zod": "^3.25 || ^4.0" + }, + "peerDependenciesMeta": { + "ws": { + "optional": true + }, + "zod": { + "optional": true + } + } + }, "node_modules/own-keys": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/own-keys/-/own-keys-1.0.1.tgz", @@ -11988,6 +12009,7 @@ "node_modules/zod": { "version": "3.25.76", "license": "MIT", + "peer": true, "funding": { "url": "https://github.com/sponsors/colinhacks" } @@ -12049,6 +12071,7 @@ "fastify": "^4.28.1", "fuzzysort": "^2.0.4", "node-forge": "^1.3.3", + "openai": "^6.27.0", "pino": "^9.4.0", "undici": "^6.19.8", "yaml": "^2.4.2", diff --git a/packages/server/package.json b/packages/server/package.json index caa4de40..9646af1a 100644 --- a/packages/server/package.json +++ b/packages/server/package.json @@ -32,6 +32,7 @@ "fastify": "^4.28.1", "fuzzysort": "^2.0.4", "node-forge": "^1.3.3", + "openai": "^6.27.0", "pino": "^9.4.0", "undici": "^6.19.8", "yaml": "^2.4.2", diff --git a/packages/server/src/api-types.ts b/packages/server/src/api-types.ts index c3dea831..8eb7c928 100644 --- a/packages/server/src/api-types.ts +++ b/packages/server/src/api-types.ts @@ -207,6 +207,36 @@ export interface BinaryValidationResult { error?: string } +export interface SpeechSegment { + startMs: number + endMs: number + text: string +} + +export interface SpeechCapabilitiesResponse { + available: boolean + configured: boolean + provider: string + supportsStt: boolean + supportsTts: boolean + baseUrl?: string + sttModel: string + ttsModel: string + ttsVoice: string +} + +export interface SpeechTranscriptionResponse { + text: string + language?: string + durationMs?: number + segments?: SpeechSegment[] +} + +export interface SpeechSynthesisResponse { + audioBase64: string + mimeType: string +} + export type WorkspaceEventType = | "workspace.created" | "workspace.started" diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index a317a5d3..1d83ec72 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -23,6 +23,7 @@ import { AuthManager, BOOTSTRAP_TOKEN_STDOUT_PREFIX, DEFAULT_AUTH_USERNAME } fro import { resolveHttpsOptions } from "./server/tls" import { resolveNetworkAddresses } from "./server/network-addresses" import { startDevReleaseMonitor } from "./releases/dev-release-monitor" +import { SpeechService } from "./speech/service" const require = createRequire(import.meta.url) @@ -304,6 +305,7 @@ async function main() { }) const fileSystemBrowser = new FileSystemBrowser({ rootDir: options.rootDir, unrestricted: options.unrestrictedRoot }) const instanceStore = new InstanceStore(configLocation.instancesDir) + const speechService = new SpeechService(settings, logger.child({ component: "speech" })) const instanceEventBridge = new InstanceEventBridge({ workspaceManager, eventBus, @@ -388,6 +390,7 @@ async function main() { eventBus, serverMeta, instanceStore, + speechService, authManager, uiStaticDir: uiResolution.uiStaticDir ?? DEFAULT_UI_STATIC_DIR, uiDevServerUrl: uiResolution.uiDevServerUrl, @@ -408,6 +411,7 @@ async function main() { eventBus, serverMeta, instanceStore, + speechService, authManager, uiStaticDir: uiResolution.uiStaticDir ?? DEFAULT_UI_STATIC_DIR, uiDevServerUrl: undefined, diff --git a/packages/server/src/server/http-server.ts b/packages/server/src/server/http-server.ts index dd36f882..3f558cb8 100644 --- a/packages/server/src/server/http-server.ts +++ b/packages/server/src/server/http-server.ts @@ -21,12 +21,14 @@ import { registerStorageRoutes } from "./routes/storage" import { registerPluginRoutes } from "./routes/plugin" import { registerBackgroundProcessRoutes } from "./routes/background-processes" import { registerWorktreeRoutes } from "./routes/worktrees" +import { registerSpeechRoutes } from "./routes/speech" import { ServerMeta } from "../api-types" import { InstanceStore } from "../storage/instance-store" import { BackgroundProcessManager } from "../background-processes/manager" import type { AuthManager } from "../auth/manager" import { registerAuthRoutes } from "./routes/auth" import { sendUnauthorized, wantsHtml } from "../auth/http-auth" +import type { SpeechService } from "../speech/service" interface HttpServerDeps { bindHost: string @@ -41,6 +43,7 @@ interface HttpServerDeps { eventBus: EventBus serverMeta: ServerMeta instanceStore: InstanceStore + speechService: SpeechService authManager: AuthManager uiStaticDir: string uiDevServerUrl?: string @@ -252,6 +255,7 @@ export function createHttpServer(deps: HttpServerDeps) { eventBus: deps.eventBus, workspaceManager: deps.workspaceManager, }) + registerSpeechRoutes(app, { speechService: deps.speechService }) registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger }) registerBackgroundProcessRoutes(app, { backgroundProcessManager }) registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger }) diff --git a/packages/server/src/server/routes/speech.ts b/packages/server/src/server/routes/speech.ts new file mode 100644 index 00000000..3eab4ad6 --- /dev/null +++ b/packages/server/src/server/routes/speech.ts @@ -0,0 +1,46 @@ +import type { FastifyInstance } from "fastify" +import { z } from "zod" +import type { SpeechService } from "../../speech/service" + +interface RouteDeps { + speechService: SpeechService +} + +const TranscribeBodySchema = z.object({ + audioBase64: z.string().min(1, "Audio payload is required"), + mimeType: z.string().min(1, "Audio MIME type is required"), + filename: z.string().optional(), + language: z.string().optional(), + prompt: z.string().optional(), +}) + +const SynthesizeBodySchema = z.object({ + text: z.string().trim().min(1, "Text is required"), + format: z.enum(["mp3", "wav", "opus"]).optional(), +}) + +export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) { + app.get("/api/speech/capabilities", async () => deps.speechService.getCapabilities()) + + app.post("/api/speech/transcribe", async (request, reply) => { + try { + const body = TranscribeBodySchema.parse(request.body ?? {}) + return await deps.speechService.transcribe(body) + } catch (error) { + request.log.error({ err: error }, "Failed to transcribe audio") + reply.code(400) + return { error: error instanceof Error ? error.message : "Failed to transcribe audio" } + } + }) + + app.post("/api/speech/synthesize", async (request, reply) => { + try { + const body = SynthesizeBodySchema.parse(request.body ?? {}) + return await deps.speechService.synthesize(body) + } catch (error) { + request.log.error({ err: error }, "Failed to synthesize audio") + reply.code(400) + return { error: error instanceof Error ? error.message : "Failed to synthesize audio" } + } + }) +} diff --git a/packages/server/src/speech/providers/openai-compatible.ts b/packages/server/src/speech/providers/openai-compatible.ts new file mode 100644 index 00000000..4c426d72 --- /dev/null +++ b/packages/server/src/speech/providers/openai-compatible.ts @@ -0,0 +1,148 @@ +import OpenAI from "openai" +import { toFile } from "openai/uploads" +import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types" +import type { Logger } from "../../logger" +import type { NormalizedSpeechSettings, SynthesizeSpeechInput, TranscribeAudioInput } from "../service" + +interface OpenAICompatibleSpeechProviderOptions { + settings: NormalizedSpeechSettings + logger: Logger +} + +export class OpenAICompatibleSpeechProvider { + constructor(private readonly options: OpenAICompatibleSpeechProviderOptions) {} + + getCapabilities() { + const { settings } = this.options + return { + available: true, + configured: Boolean(settings.apiKey), + provider: settings.provider, + supportsStt: true, + supportsTts: true, + baseUrl: settings.baseUrl, + sttModel: settings.sttModel, + ttsModel: settings.ttsModel, + ttsVoice: settings.ttsVoice, + } + } + + async transcribe(input: TranscribeAudioInput): Promise { + const client = this.createClient() + const startedAt = Date.now() + const extension = extensionForMime(input.mimeType) + const buffer = Buffer.from(input.audioBase64, "base64") + const filename = input.filename?.trim() || `prompt-input.${extension}` + + this.options.logger.info( + { + mimeType: input.mimeType, + bytes: buffer.byteLength, + language: input.language, + model: this.options.settings.sttModel, + }, + "speech.transcribe", + ) + + const response = await this.requestTranscription(client, buffer, filename, input) + + return { + text: typeof response?.text === "string" ? response.text : "", + language: typeof response?.language === "string" ? response.language : input.language, + durationMs: Number.isFinite(response?.duration) ? Math.round(Number(response.duration) * 1000) : Date.now() - startedAt, + segments: Array.isArray(response?.segments) + ? response.segments + .filter((segment: any) => typeof segment?.text === "string") + .map((segment: any) => ({ + startMs: Math.max(0, Math.round(Number(segment.start ?? 0) * 1000)), + endMs: Math.max(0, Math.round(Number(segment.end ?? 0) * 1000)), + text: String(segment.text), + })) + : undefined, + } + } + + private async requestTranscription( + client: OpenAI, + buffer: Buffer, + filename: string, + input: TranscribeAudioInput, + ): Promise { + const baseRequest = { + model: this.options.settings.sttModel, + ...(input.language ? { language: input.language } : {}), + ...(input.prompt ? { prompt: input.prompt } : {}), + } + + try { + const file = await toFile(buffer, filename, { type: input.mimeType }) + return (await client.audio.transcriptions.create({ + ...baseRequest, + file, + response_format: "verbose_json" as any, + } as any)) as any + } catch (error) { + this.options.logger.warn({ err: error }, "speech.transcribe verbose_json failed; retrying default format") + const retryFile = await toFile(buffer, filename, { type: input.mimeType }) + return (await client.audio.transcriptions.create({ + ...baseRequest, + file: retryFile, + } as any)) as any + } + } + + async synthesize(input: SynthesizeSpeechInput): Promise { + const client = this.createClient() + const format = input.format ?? "mp3" + + this.options.logger.info( + { + model: this.options.settings.ttsModel, + voice: this.options.settings.ttsVoice, + format, + }, + "speech.synthesize", + ) + + const response = await client.audio.speech.create({ + model: this.options.settings.ttsModel, + voice: this.options.settings.ttsVoice as any, + input: input.text, + response_format: format as any, + }) + + const audioBuffer = Buffer.from(await response.arrayBuffer()) + return { + audioBase64: audioBuffer.toString("base64"), + mimeType: mimeTypeForFormat(format), + } + } + + private createClient(): OpenAI { + const { settings } = this.options + if (!settings.apiKey) { + throw new Error("Speech provider is not configured. Add an API key in Speech settings.") + } + + return new OpenAI({ + apiKey: settings.apiKey, + baseURL: settings.baseUrl, + }) + } +} + +function extensionForMime(mimeType: string): string { + const normalized = mimeType.toLowerCase() + if (normalized.includes("webm")) return "webm" + if (normalized.includes("ogg")) return "ogg" + if (normalized.includes("wav")) return "wav" + if (normalized.includes("mpeg") || normalized.includes("mp3")) return "mp3" + if (normalized.includes("mp4") || normalized.includes("aac")) return "m4a" + return "webm" +} + +function mimeTypeForFormat(format: "mp3" | "wav" | "opus"): string { + if (format === "wav") return "audio/wav" + if (format === "opus") return "audio/opus" + return "audio/mpeg" +} diff --git a/packages/server/src/speech/service.ts b/packages/server/src/speech/service.ts new file mode 100644 index 00000000..14f37a15 --- /dev/null +++ b/packages/server/src/speech/service.ts @@ -0,0 +1,91 @@ +import { z } from "zod" +import type { Logger } from "../logger" +import type { SettingsService } from "../settings/service" +import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types" +import { OpenAICompatibleSpeechProvider } from "./providers/openai-compatible" + +const ServerSpeechSettingsSchema = z.object({ + speech: z + .object({ + provider: z.string().optional(), + apiKey: z.string().optional(), + baseUrl: z.string().optional(), + sttModel: z.string().optional(), + ttsModel: z.string().optional(), + ttsVoice: z.string().optional(), + }) + .optional(), +}) + +export interface TranscribeAudioInput { + audioBase64: string + mimeType: string + filename?: string + language?: string + prompt?: string +} + +export interface SynthesizeSpeechInput { + text: string + format?: "mp3" | "wav" | "opus" +} + +export interface SpeechProvider { + getCapabilities(): SpeechCapabilitiesResponse + transcribe(input: TranscribeAudioInput): Promise + synthesize(input: SynthesizeSpeechInput): Promise +} + +export interface NormalizedSpeechSettings { + provider: string + apiKey?: string + baseUrl?: string + sttModel: string + ttsModel: string + ttsVoice: string +} + +const DEFAULT_PROVIDER = "openai-compatible" +const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe" +const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts" +const DEFAULT_TTS_VOICE = "alloy" +export class SpeechService { + constructor( + private readonly settings: SettingsService, + private readonly logger: Logger, + ) {} + + getCapabilities(): SpeechCapabilitiesResponse { + return this.createProvider().getCapabilities() + } + + async transcribe(input: TranscribeAudioInput): Promise { + return this.createProvider().transcribe(input) + } + + async synthesize(input: SynthesizeSpeechInput): Promise { + return this.createProvider().synthesize(input) + } + + private createProvider(): SpeechProvider { + const settings = this.resolveSettings() + return new OpenAICompatibleSpeechProvider({ + settings, + logger: this.logger.child({ provider: settings.provider }), + }) + } + + private resolveSettings(): NormalizedSpeechSettings { + const parsed = ServerSpeechSettingsSchema.parse(this.settings.getOwner("config", "server") ?? {}) + const speech = parsed.speech ?? {} + + return { + provider: speech.provider?.trim() || DEFAULT_PROVIDER, + apiKey: speech.apiKey?.trim() || process.env.OPENAI_API_KEY, + baseUrl: speech.baseUrl?.trim() || process.env.OPENAI_BASE_URL || undefined, + sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL, + ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL, + ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE, + } + } +} diff --git a/packages/ui/src/App.tsx b/packages/ui/src/App.tsx index 4420af92..81ac56ea 100644 --- a/packages/ui/src/App.tsx +++ b/packages/ui/src/App.tsx @@ -71,6 +71,7 @@ const App: Component = () => { toggleAutoCleanupBlankSessions, toggleUsageMetrics, togglePromptSubmitOnEnter, + toggleShowPromptVoiceInput, setDiffViewMode, setToolOutputExpansion, setDiagnosticsExpansion, @@ -360,6 +361,7 @@ const App: Component = () => { toggleShowTimelineTools, toggleUsageMetrics, togglePromptSubmitOnEnter, + toggleShowPromptVoiceInput, setDiffViewMode, setToolOutputExpansion, setDiagnosticsExpansion, diff --git a/packages/ui/src/components/prompt-input.tsx b/packages/ui/src/components/prompt-input.tsx index 63c80fa1..1ae41690 100644 --- a/packages/ui/src/components/prompt-input.tsx +++ b/packages/ui/src/components/prompt-input.tsx @@ -1,5 +1,5 @@ import { createSignal, Show, onMount, onCleanup, createEffect, on } from "solid-js" -import { ArrowBigUp, ArrowBigDown } from "lucide-solid" +import { ArrowBigUp, ArrowBigDown, Loader2, Mic, Square } from "lucide-solid" import UnifiedPicker from "./unified-picker" import ExpandButton from "./expand-button" import { clearAttachments, removeAttachment } from "../stores/attachments" @@ -17,6 +17,7 @@ import { usePromptState } from "./prompt-input/usePromptState" import { usePromptAttachments } from "./prompt-input/usePromptAttachments" import { usePromptPicker } from "./prompt-input/usePromptPicker" import { usePromptKeyDown } from "./prompt-input/usePromptKeyDown" +import { usePromptVoiceInput } from "./prompt-input/usePromptVoiceInput" const log = getLogger("actions") export default function PromptInput(props: PromptInputProps) { @@ -411,6 +412,16 @@ export default function PromptInput(props: PromptInputProps) { }) const shouldShowOverlay = () => prompt().length === 0 + const voiceInput = usePromptVoiceInput({ + prompt, + setPrompt, + getTextarea: () => textareaRef ?? null, + enabled: () => preferences().showPromptVoiceInput, + disabled: () => Boolean(props.disabled), + }) + const showVoiceInput = () => + preferences().showPromptVoiceInput && + (voiceInput.canUseVoiceInput() || voiceInput.isRecording() || voiceInput.isTranscribing()) const instance = () => getActiveInstance() @@ -555,6 +566,30 @@ export default function PromptInput(props: PromptInputProps) {
+ + + + {formatVoiceTimer(voiceInput.elapsedMs())} + + +
+ + + updateDraft("apiKey", value)} + type="password" + /> + updateDraft("baseUrl", value)} + placeholder={t("settings.speech.baseUrl.placeholder")} + /> + updateDraft("sttModel", value)} + /> + updateDraft("ttsModel", value)} + /> + updateDraft("ttsVoice", value)} + icon={} + /> + +
{t("settings.speech.help")}
+ + + ) +} + +const Field: Component<{ + label: string + caption: string + value: string + type?: string + placeholder?: string + onInput: (value: string) => void + icon?: any +}> = (props) => { + return ( +
+
+
{props.label}
+
{props.caption}
+
+
+ {props.icon} + props.onInput(event.currentTarget.value)} + class="selector-input w-full" + placeholder={props.placeholder} + /> +
+
+ ) +} + +export default SpeechSettingsCard diff --git a/packages/ui/src/components/settings/speech-settings-section.tsx b/packages/ui/src/components/settings/speech-settings-section.tsx new file mode 100644 index 00000000..939b279c --- /dev/null +++ b/packages/ui/src/components/settings/speech-settings-section.tsx @@ -0,0 +1,10 @@ +import type { Component } from "solid-js" +import SpeechSettingsCard from "./speech-settings-card" + +export const SpeechSettingsSection: Component = () => { + return ( +
+ +
+ ) +} diff --git a/packages/ui/src/lib/api-client.ts b/packages/ui/src/lib/api-client.ts index 96971e00..d6e0868a 100644 --- a/packages/ui/src/lib/api-client.ts +++ b/packages/ui/src/lib/api-client.ts @@ -7,6 +7,9 @@ import type { FileSystemCreateFolderResponse, FileSystemListResponse, InstanceData, + SpeechCapabilitiesResponse, + SpeechSynthesisResponse, + SpeechTranscriptionResponse, ServerMeta, WorkspaceCreateRequest, WorkspaceDescriptor, @@ -235,6 +238,27 @@ export const serverApi = { body: JSON.stringify({ path }), }) }, + fetchSpeechCapabilities(): Promise { + return request("/api/speech/capabilities") + }, + transcribeAudio(payload: { + audioBase64: string + mimeType: string + filename?: string + language?: string + prompt?: string + }): Promise { + return request("/api/speech/transcribe", { + method: "POST", + body: JSON.stringify(payload), + }) + }, + synthesizeSpeech(payload: { text: string; format?: "mp3" | "wav" | "opus" }): Promise { + return request("/api/speech/synthesize", { + method: "POST", + body: JSON.stringify(payload), + }) + }, listFileSystem(path?: string, options?: { includeFiles?: boolean }): Promise { const params = new URLSearchParams() if (path && path !== ".") { diff --git a/packages/ui/src/lib/hooks/use-commands.ts b/packages/ui/src/lib/hooks/use-commands.ts index a74fc840..d73a47fe 100644 --- a/packages/ui/src/lib/hooks/use-commands.ts +++ b/packages/ui/src/lib/hooks/use-commands.ts @@ -34,6 +34,7 @@ export interface UseCommandsOptions { toggleUsageMetrics: () => void toggleAutoCleanupBlankSessions: () => void togglePromptSubmitOnEnter: () => void + toggleShowPromptVoiceInput: () => void setDiffViewMode: (mode: "split" | "unified") => void setToolOutputExpansion: (mode: ExpansionPreference) => void setDiagnosticsExpansion: (mode: ExpansionPreference) => void @@ -435,6 +436,7 @@ export function useCommands(options: UseCommandsOptions) { toggleUsageMetrics: options.toggleUsageMetrics, toggleAutoCleanupBlankSessions: options.toggleAutoCleanupBlankSessions, togglePromptSubmitOnEnter: options.togglePromptSubmitOnEnter, + toggleShowPromptVoiceInput: options.toggleShowPromptVoiceInput, setDiffViewMode: options.setDiffViewMode, setToolOutputExpansion: options.setToolOutputExpansion, setDiagnosticsExpansion: options.setDiagnosticsExpansion, diff --git a/packages/ui/src/lib/i18n/messages/en/messaging.ts b/packages/ui/src/lib/i18n/messages/en/messaging.ts index c3723a99..7b8a574a 100644 --- a/packages/ui/src/lib/i18n/messages/en/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/en/messaging.ts @@ -138,4 +138,11 @@ export const messagingMessages = { "promptInput.send.ariaLabel": "Send message", "promptInput.send.errorFallback": "Failed to send message", "promptInput.send.errorTitle": "Send failed", + "promptInput.voiceInput.start.title": "Start voice input", + "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.transcribing.title": "Transcribing audio", + "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", + "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", + "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", } as const diff --git a/packages/ui/src/lib/i18n/messages/en/settings.ts b/packages/ui/src/lib/i18n/messages/en/settings.ts index cbf95000..318f1dcb 100644 --- a/packages/ui/src/lib/i18n/messages/en/settings.ts +++ b/packages/ui/src/lib/i18n/messages/en/settings.ts @@ -65,6 +65,7 @@ export const settingsMessages = { "settings.nav.appearance": "Appearance", "settings.nav.notifications": "Notifications", "settings.nav.remote": "Remote Access", + "settings.nav.speech": "Speech", "settings.nav.opencode": "OpenCode", "settings.scope.device": "This device", "settings.scope.server": "Server setting", @@ -137,6 +138,34 @@ export const settingsMessages = { "settings.behavior.usageMetrics.subtitle": "Show or hide token and cost stats for assistant messages.", "settings.behavior.autoCleanup.title": "Auto-cleanup blank sessions", "settings.behavior.autoCleanup.subtitle": "Automatically clean up blank sessions when creating new ones.", + "settings.behavior.promptVoiceInput.title": "Prompt voice input", + "settings.behavior.promptVoiceInput.subtitle": "Show the microphone control for speech-to-text prompt input when speech is configured.", "settings.behavior.promptSubmit.title": "Enter to submit", "settings.behavior.promptSubmit.subtitle": "Use Enter to submit prompts; Cmd/Ctrl+Enter inserts a new line.", + "settings.speech.title": "Speech", + "settings.speech.subtitle": "Configure speech-to-text now and text-to-speech groundwork for later features.", + "settings.speech.provider.title": "Provider", + "settings.speech.provider.subtitle": "Speech requests use the server-side speech adapter.", + "settings.speech.provider.openaiCompatible": "OpenAI-compatible", + "settings.speech.status.loading": "Checking configuration...", + "settings.speech.status.configured": "Configured", + "settings.speech.status.missing": "Missing API key", + "settings.speech.status.error": "Speech service unavailable", + "settings.speech.apiKey.title": "API key", + "settings.speech.apiKey.subtitle": "Used for CodeNomad-managed speech requests.", + "settings.speech.baseUrl.title": "Base URL", + "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", + "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.sttModel.title": "Transcription model", + "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", + "settings.speech.ttsModel.title": "Speech model", + "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", + "settings.speech.ttsVoice.title": "Default voice", + "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.save.action": "Save", + "settings.speech.save.saving": "Saving...", + "settings.speech.save.saved": "Saved", + "settings.speech.save.unsaved": "Unsaved changes", + "settings.speech.save.error": "Save failed", } as const diff --git a/packages/ui/src/lib/i18n/messages/es/messaging.ts b/packages/ui/src/lib/i18n/messages/es/messaging.ts index 1c2eb7bd..850ab6b4 100644 --- a/packages/ui/src/lib/i18n/messages/es/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/es/messaging.ts @@ -140,4 +140,11 @@ export const messagingMessages = { "promptInput.send.ariaLabel": "Enviar mensaje", "promptInput.send.errorFallback": "No se pudo enviar el mensaje", "promptInput.send.errorTitle": "Error al enviar", + "promptInput.voiceInput.start.title": "Start voice input", + "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.transcribing.title": "Transcribing audio", + "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", + "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", + "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", } as const diff --git a/packages/ui/src/lib/i18n/messages/es/settings.ts b/packages/ui/src/lib/i18n/messages/es/settings.ts index 8a9cbe6c..1c39e405 100644 --- a/packages/ui/src/lib/i18n/messages/es/settings.ts +++ b/packages/ui/src/lib/i18n/messages/es/settings.ts @@ -65,6 +65,7 @@ export const settingsMessages = { "settings.nav.appearance": "Appearance", "settings.nav.notifications": "Notifications", "settings.nav.remote": "Remote Access", + "settings.nav.speech": "Speech", "settings.nav.opencode": "OpenCode", "settings.scope.device": "This device", "settings.scope.server": "Server setting", @@ -137,6 +138,34 @@ export const settingsMessages = { "settings.behavior.usageMetrics.subtitle": "Muestra u oculta estadisticas de tokens y costo en mensajes del asistente.", "settings.behavior.autoCleanup.title": "Limpieza automatica de sesiones en blanco", "settings.behavior.autoCleanup.subtitle": "Limpia automaticamente las sesiones en blanco al crear nuevas.", + "settings.behavior.promptVoiceInput.title": "Prompt voice input", + "settings.behavior.promptVoiceInput.subtitle": "Show the microphone control for speech-to-text prompt input when speech is configured.", "settings.behavior.promptSubmit.title": "Enter para enviar", "settings.behavior.promptSubmit.subtitle": "Usa Enter para enviar; Cmd/Ctrl+Enter inserta una nueva linea.", + "settings.speech.title": "Speech", + "settings.speech.subtitle": "Configure speech-to-text now and text-to-speech groundwork for later features.", + "settings.speech.provider.title": "Provider", + "settings.speech.provider.subtitle": "Speech requests use the server-side speech adapter.", + "settings.speech.provider.openaiCompatible": "OpenAI-compatible", + "settings.speech.status.loading": "Checking configuration...", + "settings.speech.status.configured": "Configured", + "settings.speech.status.missing": "Missing API key", + "settings.speech.status.error": "Speech service unavailable", + "settings.speech.apiKey.title": "API key", + "settings.speech.apiKey.subtitle": "Used for CodeNomad-managed speech requests.", + "settings.speech.baseUrl.title": "Base URL", + "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", + "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.sttModel.title": "Transcription model", + "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", + "settings.speech.ttsModel.title": "Speech model", + "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", + "settings.speech.ttsVoice.title": "Default voice", + "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.save.action": "Save", + "settings.speech.save.saving": "Saving...", + "settings.speech.save.saved": "Saved", + "settings.speech.save.unsaved": "Unsaved changes", + "settings.speech.save.error": "Save failed", } as const diff --git a/packages/ui/src/lib/i18n/messages/fr/messaging.ts b/packages/ui/src/lib/i18n/messages/fr/messaging.ts index abf3ebab..0a742efa 100644 --- a/packages/ui/src/lib/i18n/messages/fr/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/fr/messaging.ts @@ -140,4 +140,11 @@ export const messagingMessages = { "promptInput.send.ariaLabel": "Envoyer le message", "promptInput.send.errorFallback": "Impossible d'envoyer le message", "promptInput.send.errorTitle": "Échec de l'envoi", + "promptInput.voiceInput.start.title": "Start voice input", + "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.transcribing.title": "Transcribing audio", + "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", + "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", + "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", } as const diff --git a/packages/ui/src/lib/i18n/messages/fr/settings.ts b/packages/ui/src/lib/i18n/messages/fr/settings.ts index 9a7009ba..710d2c77 100644 --- a/packages/ui/src/lib/i18n/messages/fr/settings.ts +++ b/packages/ui/src/lib/i18n/messages/fr/settings.ts @@ -65,6 +65,7 @@ export const settingsMessages = { "settings.nav.appearance": "Appearance", "settings.nav.notifications": "Notifications", "settings.nav.remote": "Remote Access", + "settings.nav.speech": "Speech", "settings.nav.opencode": "OpenCode", "settings.scope.device": "This device", "settings.scope.server": "Server setting", @@ -137,6 +138,34 @@ export const settingsMessages = { "settings.behavior.usageMetrics.subtitle": "Afficher ou masquer les stats de tokens et de cout pour les messages de l'assistant.", "settings.behavior.autoCleanup.title": "Nettoyage auto des sessions vides", "settings.behavior.autoCleanup.subtitle": "Nettoyer automatiquement les sessions vides lors de la creation de nouvelles.", + "settings.behavior.promptVoiceInput.title": "Prompt voice input", + "settings.behavior.promptVoiceInput.subtitle": "Show the microphone control for speech-to-text prompt input when speech is configured.", "settings.behavior.promptSubmit.title": "Entrer pour envoyer", "settings.behavior.promptSubmit.subtitle": "Utiliser Entrer pour envoyer; Cmd/Ctrl+Entrer insere une nouvelle ligne.", + "settings.speech.title": "Speech", + "settings.speech.subtitle": "Configure speech-to-text now and text-to-speech groundwork for later features.", + "settings.speech.provider.title": "Provider", + "settings.speech.provider.subtitle": "Speech requests use the server-side speech adapter.", + "settings.speech.provider.openaiCompatible": "OpenAI-compatible", + "settings.speech.status.loading": "Checking configuration...", + "settings.speech.status.configured": "Configured", + "settings.speech.status.missing": "Missing API key", + "settings.speech.status.error": "Speech service unavailable", + "settings.speech.apiKey.title": "API key", + "settings.speech.apiKey.subtitle": "Used for CodeNomad-managed speech requests.", + "settings.speech.baseUrl.title": "Base URL", + "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", + "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.sttModel.title": "Transcription model", + "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", + "settings.speech.ttsModel.title": "Speech model", + "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", + "settings.speech.ttsVoice.title": "Default voice", + "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.save.action": "Save", + "settings.speech.save.saving": "Saving...", + "settings.speech.save.saved": "Saved", + "settings.speech.save.unsaved": "Unsaved changes", + "settings.speech.save.error": "Save failed", } as const diff --git a/packages/ui/src/lib/i18n/messages/ja/messaging.ts b/packages/ui/src/lib/i18n/messages/ja/messaging.ts index fe8cb52b..60cd8881 100644 --- a/packages/ui/src/lib/i18n/messages/ja/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/ja/messaging.ts @@ -140,4 +140,11 @@ export const messagingMessages = { "promptInput.send.ariaLabel": "メッセージを送信", "promptInput.send.errorFallback": "メッセージの送信に失敗しました", "promptInput.send.errorTitle": "送信に失敗", + "promptInput.voiceInput.start.title": "Start voice input", + "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.transcribing.title": "Transcribing audio", + "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", + "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", + "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", } as const diff --git a/packages/ui/src/lib/i18n/messages/ja/settings.ts b/packages/ui/src/lib/i18n/messages/ja/settings.ts index 6bc70aab..8feaa819 100644 --- a/packages/ui/src/lib/i18n/messages/ja/settings.ts +++ b/packages/ui/src/lib/i18n/messages/ja/settings.ts @@ -65,6 +65,7 @@ export const settingsMessages = { "settings.nav.appearance": "Appearance", "settings.nav.notifications": "Notifications", "settings.nav.remote": "Remote Access", + "settings.nav.speech": "Speech", "settings.nav.opencode": "OpenCode", "settings.scope.device": "This device", "settings.scope.server": "Server setting", @@ -137,6 +138,34 @@ export const settingsMessages = { "settings.behavior.usageMetrics.subtitle": "アシスタントのメッセージにトークン数とコストの統計を表示/非表示にします。", "settings.behavior.autoCleanup.title": "空のセッションを自動クリーンアップ", "settings.behavior.autoCleanup.subtitle": "新しいセッション作成時に空のセッションを自動的にクリーンアップします。", + "settings.behavior.promptVoiceInput.title": "Prompt voice input", + "settings.behavior.promptVoiceInput.subtitle": "Show the microphone control for speech-to-text prompt input when speech is configured.", "settings.behavior.promptSubmit.title": "Enterで送信", "settings.behavior.promptSubmit.subtitle": "Enterで送信し、Cmd/Ctrl+Enterで改行します。", + "settings.speech.title": "Speech", + "settings.speech.subtitle": "Configure speech-to-text now and text-to-speech groundwork for later features.", + "settings.speech.provider.title": "Provider", + "settings.speech.provider.subtitle": "Speech requests use the server-side speech adapter.", + "settings.speech.provider.openaiCompatible": "OpenAI-compatible", + "settings.speech.status.loading": "Checking configuration...", + "settings.speech.status.configured": "Configured", + "settings.speech.status.missing": "Missing API key", + "settings.speech.status.error": "Speech service unavailable", + "settings.speech.apiKey.title": "API key", + "settings.speech.apiKey.subtitle": "Used for CodeNomad-managed speech requests.", + "settings.speech.baseUrl.title": "Base URL", + "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", + "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.sttModel.title": "Transcription model", + "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", + "settings.speech.ttsModel.title": "Speech model", + "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", + "settings.speech.ttsVoice.title": "Default voice", + "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.save.action": "Save", + "settings.speech.save.saving": "Saving...", + "settings.speech.save.saved": "Saved", + "settings.speech.save.unsaved": "Unsaved changes", + "settings.speech.save.error": "Save failed", } as const diff --git a/packages/ui/src/lib/i18n/messages/ru/messaging.ts b/packages/ui/src/lib/i18n/messages/ru/messaging.ts index b7457d11..a833b25e 100644 --- a/packages/ui/src/lib/i18n/messages/ru/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/ru/messaging.ts @@ -140,4 +140,11 @@ export const messagingMessages = { "promptInput.send.ariaLabel": "Отправить сообщение", "promptInput.send.errorFallback": "Не удалось отправить сообщение", "promptInput.send.errorTitle": "Не удалось отправить", + "promptInput.voiceInput.start.title": "Start voice input", + "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.transcribing.title": "Transcribing audio", + "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", + "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", + "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", } as const diff --git a/packages/ui/src/lib/i18n/messages/ru/settings.ts b/packages/ui/src/lib/i18n/messages/ru/settings.ts index ce52f835..494ff447 100644 --- a/packages/ui/src/lib/i18n/messages/ru/settings.ts +++ b/packages/ui/src/lib/i18n/messages/ru/settings.ts @@ -65,6 +65,7 @@ export const settingsMessages = { "settings.nav.appearance": "Appearance", "settings.nav.notifications": "Notifications", "settings.nav.remote": "Remote Access", + "settings.nav.speech": "Speech", "settings.nav.opencode": "OpenCode", "settings.scope.device": "This device", "settings.scope.server": "Server setting", @@ -137,6 +138,34 @@ export const settingsMessages = { "settings.behavior.usageMetrics.subtitle": "Показывать или скрывать статистику токенов и стоимости в сообщениях ассистента.", "settings.behavior.autoCleanup.title": "Автоочистка пустых сессий", "settings.behavior.autoCleanup.subtitle": "Автоматически очищать пустые сессии при создании новых.", + "settings.behavior.promptVoiceInput.title": "Prompt voice input", + "settings.behavior.promptVoiceInput.subtitle": "Show the microphone control for speech-to-text prompt input when speech is configured.", "settings.behavior.promptSubmit.title": "Enter для отправки", "settings.behavior.promptSubmit.subtitle": "Enter отправляет; Cmd/Ctrl+Enter вставляет новую строку.", + "settings.speech.title": "Speech", + "settings.speech.subtitle": "Configure speech-to-text now and text-to-speech groundwork for later features.", + "settings.speech.provider.title": "Provider", + "settings.speech.provider.subtitle": "Speech requests use the server-side speech adapter.", + "settings.speech.provider.openaiCompatible": "OpenAI-compatible", + "settings.speech.status.loading": "Checking configuration...", + "settings.speech.status.configured": "Configured", + "settings.speech.status.missing": "Missing API key", + "settings.speech.status.error": "Speech service unavailable", + "settings.speech.apiKey.title": "API key", + "settings.speech.apiKey.subtitle": "Used for CodeNomad-managed speech requests.", + "settings.speech.baseUrl.title": "Base URL", + "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", + "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.sttModel.title": "Transcription model", + "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", + "settings.speech.ttsModel.title": "Speech model", + "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", + "settings.speech.ttsVoice.title": "Default voice", + "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.save.action": "Save", + "settings.speech.save.saving": "Saving...", + "settings.speech.save.saved": "Saved", + "settings.speech.save.unsaved": "Unsaved changes", + "settings.speech.save.error": "Save failed", } as const diff --git a/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts b/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts index 9c7bc232..aeabd954 100644 --- a/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts @@ -140,4 +140,11 @@ export const messagingMessages = { "promptInput.send.ariaLabel": "发送消息", "promptInput.send.errorFallback": "发送消息失败", "promptInput.send.errorTitle": "发送失败", + "promptInput.voiceInput.start.title": "Start voice input", + "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.transcribing.title": "Transcribing audio", + "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", + "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", + "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", } as const diff --git a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts index 8451aeae..aff1063f 100644 --- a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts +++ b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts @@ -65,6 +65,7 @@ export const settingsMessages = { "settings.nav.appearance": "Appearance", "settings.nav.notifications": "Notifications", "settings.nav.remote": "Remote Access", + "settings.nav.speech": "Speech", "settings.nav.opencode": "OpenCode", "settings.scope.device": "This device", "settings.scope.server": "Server setting", @@ -137,6 +138,34 @@ export const settingsMessages = { "settings.behavior.usageMetrics.subtitle": "显示或隐藏助手消息的令牌与成本统计。", "settings.behavior.autoCleanup.title": "自动清理空会话", "settings.behavior.autoCleanup.subtitle": "创建新会话时自动清理空会话。", + "settings.behavior.promptVoiceInput.title": "Prompt voice input", + "settings.behavior.promptVoiceInput.subtitle": "Show the microphone control for speech-to-text prompt input when speech is configured.", "settings.behavior.promptSubmit.title": "回车发送", "settings.behavior.promptSubmit.subtitle": "使用回车发送;Cmd/Ctrl+回车插入新行。", + "settings.speech.title": "Speech", + "settings.speech.subtitle": "Configure speech-to-text now and text-to-speech groundwork for later features.", + "settings.speech.provider.title": "Provider", + "settings.speech.provider.subtitle": "Speech requests use the server-side speech adapter.", + "settings.speech.provider.openaiCompatible": "OpenAI-compatible", + "settings.speech.status.loading": "Checking configuration...", + "settings.speech.status.configured": "Configured", + "settings.speech.status.missing": "Missing API key", + "settings.speech.status.error": "Speech service unavailable", + "settings.speech.apiKey.title": "API key", + "settings.speech.apiKey.subtitle": "Used for CodeNomad-managed speech requests.", + "settings.speech.baseUrl.title": "Base URL", + "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", + "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.sttModel.title": "Transcription model", + "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", + "settings.speech.ttsModel.title": "Speech model", + "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", + "settings.speech.ttsVoice.title": "Default voice", + "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.save.action": "Save", + "settings.speech.save.saving": "Saving...", + "settings.speech.save.saved": "Saved", + "settings.speech.save.unsaved": "Unsaved changes", + "settings.speech.save.error": "Save failed", } as const diff --git a/packages/ui/src/lib/settings/behavior-registry.ts b/packages/ui/src/lib/settings/behavior-registry.ts index b7ab1b98..573b97f3 100644 --- a/packages/ui/src/lib/settings/behavior-registry.ts +++ b/packages/ui/src/lib/settings/behavior-registry.ts @@ -42,6 +42,7 @@ export type BehaviorRegistryActions = { toggleUsageMetrics: () => void toggleAutoCleanupBlankSessions: () => void togglePromptSubmitOnEnter: () => void + toggleShowPromptVoiceInput: () => void setDiffViewMode: (mode: "split" | "unified") => void setToolOutputExpansion: (mode: ExpansionPreference) => void setDiagnosticsExpansion: (mode: ExpansionPreference) => void @@ -248,6 +249,24 @@ export function getBehaviorSettings(actions: BehaviorRegistryActions): BehaviorS ) }, }, + { + kind: "toggle", + id: "behavior.promptVoiceInput", + titleKey: "settings.behavior.promptVoiceInput.title", + subtitleKey: "settings.behavior.promptVoiceInput.subtitle", + get: (p) => Boolean(p.showPromptVoiceInput ?? true), + set: (next) => { + if (updatePreferences) { + updatePreferences({ showPromptVoiceInput: next }) + return + } + setBooleanByToggle( + () => Boolean(prefs().showPromptVoiceInput ?? true), + actions.toggleShowPromptVoiceInput, + next, + ) + }, + }, { kind: "toggle", id: "behavior.promptSubmitOnEnter", diff --git a/packages/ui/src/stores/preferences.tsx b/packages/ui/src/stores/preferences.tsx index 8ac2ead0..a387b754 100644 --- a/packages/ui/src/stores/preferences.tsx +++ b/packages/ui/src/stores/preferences.tsx @@ -7,6 +7,7 @@ import { updateInstanceConfig as updateInstanceData, } from "./instance-config" import { getLogger } from "../lib/logger" +import { loadSpeechCapabilities, resetSpeechCapabilities } from "./speech" const log = getLogger("actions") @@ -27,6 +28,16 @@ export type DiffViewMode = "split" | "unified" export type ExpansionPreference = "expanded" | "collapsed" export type ToolInputsVisibilityPreference = "hidden" | "collapsed" | "expanded" export type ListeningMode = "local" | "all" +export type SpeechProviderPreference = "openai-compatible" + +export interface SpeechSettings { + provider: SpeechProviderPreference + apiKey?: string + baseUrl?: string + sttModel: string + ttsModel: string + ttsVoice: string +} export interface UiSettings { showThinkingBlocks: boolean @@ -34,6 +45,7 @@ export interface UiSettings { thinkingBlocksExpansion: ExpansionPreference showTimelineTools: boolean promptSubmitOnEnter: boolean + showPromptVoiceInput: boolean locale?: string diffViewMode: DiffViewMode toolOutputExpansion: ExpansionPreference @@ -75,6 +87,7 @@ interface ServerConfigBucket { listeningMode?: ListeningMode environmentVariables?: Record opencodeBinary?: string + speech?: Partial } interface UiStateBucket { @@ -107,6 +120,7 @@ const defaultUiSettings: UiSettings = { thinkingBlocksExpansion: "expanded", showTimelineTools: true, promptSubmitOnEnter: false, + showPromptVoiceInput: true, diffViewMode: "split", toolOutputExpansion: "expanded", diagnosticsExpansion: "expanded", @@ -120,6 +134,13 @@ const defaultUiSettings: UiSettings = { notifyOnIdle: true, } +const defaultSpeechSettings: SpeechSettings = { + provider: "openai-compatible", + sttModel: "gpt-4o-mini-transcribe", + ttsModel: "gpt-4o-mini-tts", + ttsVoice: "alloy", +} + function normalizeUiSettings(input?: Partial | null): UiSettings { const sanitized = input ?? {} return { @@ -129,6 +150,7 @@ function normalizeUiSettings(input?: Partial | null): UiSettings { thinkingBlocksExpansion: sanitized.thinkingBlocksExpansion ?? defaultUiSettings.thinkingBlocksExpansion, showTimelineTools: sanitized.showTimelineTools ?? defaultUiSettings.showTimelineTools, promptSubmitOnEnter: sanitized.promptSubmitOnEnter ?? defaultUiSettings.promptSubmitOnEnter, + showPromptVoiceInput: sanitized.showPromptVoiceInput ?? defaultUiSettings.showPromptVoiceInput, locale: sanitized.locale ?? defaultUiSettings.locale, diffViewMode: sanitized.diffViewMode ?? defaultUiSettings.diffViewMode, toolOutputExpansion: sanitized.toolOutputExpansion ?? defaultUiSettings.toolOutputExpansion, @@ -156,6 +178,27 @@ function normalizeRecord(value: unknown): Record { return out } +function normalizeSpeechSettings(input?: Partial | null): SpeechSettings { + const sanitized = input ?? {} + return { + provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider, + apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined, + baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined, + sttModel: + typeof sanitized.sttModel === "string" && sanitized.sttModel.trim() + ? sanitized.sttModel.trim() + : defaultSpeechSettings.sttModel, + ttsModel: + typeof sanitized.ttsModel === "string" && sanitized.ttsModel.trim() + ? sanitized.ttsModel.trim() + : defaultSpeechSettings.ttsModel, + ttsVoice: + typeof sanitized.ttsVoice === "string" && sanitized.ttsVoice.trim() + ? sanitized.ttsVoice.trim() + : defaultSpeechSettings.ttsVoice, + } +} + function cloneArray(value: unknown, mapper: (item: any) => T | null): T[] { if (!Array.isArray(value)) return [] const out: T[] = [] @@ -206,12 +249,15 @@ function normalizeUiState(input?: UiStateBucket | null): NormalizedUiState { } } -function normalizeServerConfig(input?: ServerConfigBucket | null): Required> { +function normalizeServerConfig( + input?: ServerConfigBucket | null, +): Required> & { speech: SpeechSettings } { const source = input ?? {} const listeningMode = source.listeningMode === "all" ? "all" : "local" const opencodeBinary = typeof source.opencodeBinary === "string" && source.opencodeBinary.trim() ? source.opencodeBinary : "opencode" const environmentVariables = normalizeRecord(source.environmentVariables) - return { listeningMode, opencodeBinary, environmentVariables } + const speech = normalizeSpeechSettings(source.speech) + return { listeningMode, opencodeBinary, environmentVariables, speech } } function getModelKey(model: { providerId: string; modelId: string }): string { @@ -342,6 +388,16 @@ function updateLastUsedBinary(path: string): void { void patchStateOwner("ui", { opencodeBinaries: nextList }).catch((error) => log.error("Failed to update binary list", error)) } +async function updateSpeechSettings(updates: Partial): Promise { + const next = normalizeSpeechSettings({ ...serverSettings().speech, ...updates }) + try { + await patchConfigOwner("server", { speech: next }) + } catch (error) { + log.error("Failed to update speech settings", error) + throw error + } +} + function addOpenCodeBinary(path: string, version?: string): void { const nextList = buildBinaryList(path, version, opencodeBinaries()) void patchStateOwner("ui", { opencodeBinaries: nextList }).catch((error) => log.error("Failed to add binary", error)) @@ -476,6 +532,10 @@ function togglePromptSubmitOnEnter(): void { updateUiSettings({ promptSubmitOnEnter: !preferences().promptSubmitOnEnter }) } +function toggleShowPromptVoiceInput(): void { + updateUiSettings({ showPromptVoiceInput: !preferences().showPromptVoiceInput }) +} + function toggleAutoCleanupBlankSessions(): void { const nextValue = !preferences().autoCleanupBlankSessions log.info("toggle auto cleanup", { value: nextValue }) @@ -521,6 +581,7 @@ interface ConfigContextValue { addEnvironmentVariable: typeof addEnvironmentVariable removeEnvironmentVariable: typeof removeEnvironmentVariable updateLastUsedBinary: typeof updateLastUsedBinary + updateSpeechSettings: typeof updateSpeechSettings // ui-owned state recentFolders: typeof recentFolders @@ -544,6 +605,7 @@ interface ConfigContextValue { toggleUsageMetrics: typeof toggleUsageMetrics toggleAutoCleanupBlankSessions: typeof toggleAutoCleanupBlankSessions togglePromptSubmitOnEnter: typeof togglePromptSubmitOnEnter + toggleShowPromptVoiceInput: typeof toggleShowPromptVoiceInput setDiffViewMode: typeof setDiffViewMode setToolOutputExpansion: typeof setToolOutputExpansion setDiagnosticsExpansion: typeof setDiagnosticsExpansion @@ -569,6 +631,7 @@ const configContextValue: ConfigContextValue = { addEnvironmentVariable, removeEnvironmentVariable, updateLastUsedBinary, + updateSpeechSettings, recentFolders, opencodeBinaries, uiState, @@ -588,6 +651,7 @@ const configContextValue: ConfigContextValue = { toggleUsageMetrics, toggleAutoCleanupBlankSessions, togglePromptSubmitOnEnter, + toggleShowPromptVoiceInput, setDiffViewMode, setToolOutputExpansion, setDiagnosticsExpansion, @@ -610,6 +674,8 @@ export const ConfigProvider: ParentComponent = (props) => { const unsubServer = storage.onConfigOwnerChanged("server", (bucket) => { setServerConfigBucket(bucket as any) setIsLoaded(true) + resetSpeechCapabilities() + void loadSpeechCapabilities(true) }) const unsubStateUi = storage.onStateOwnerChanged("ui", (bucket) => { setUiStateBucket(bucket as any) @@ -648,6 +714,7 @@ export { addEnvironmentVariable, removeEnvironmentVariable, updateLastUsedBinary, + updateSpeechSettings, addRecentFolder, removeRecentFolder, addOpenCodeBinary, @@ -664,6 +731,7 @@ export { toggleUsageMetrics, toggleAutoCleanupBlankSessions, togglePromptSubmitOnEnter, + toggleShowPromptVoiceInput, setDiffViewMode, setToolOutputExpansion, setDiagnosticsExpansion, diff --git a/packages/ui/src/stores/settings-screen.ts b/packages/ui/src/stores/settings-screen.ts index 3de9eb13..f411f073 100644 --- a/packages/ui/src/stores/settings-screen.ts +++ b/packages/ui/src/stores/settings-screen.ts @@ -1,6 +1,6 @@ import { createSignal } from "solid-js" -export type SettingsSectionId = "appearance" | "notifications" | "remote" | "opencode" +export type SettingsSectionId = "appearance" | "notifications" | "remote" | "speech" | "opencode" const [settingsOpen, setSettingsOpen] = createSignal(false) const [activeSettingsSection, setActiveSettingsSection] = createSignal("appearance") diff --git a/packages/ui/src/stores/speech.ts b/packages/ui/src/stores/speech.ts new file mode 100644 index 00000000..c7440bdc --- /dev/null +++ b/packages/ui/src/stores/speech.ts @@ -0,0 +1,46 @@ +import { createSignal } from "solid-js" +import type { SpeechCapabilitiesResponse } from "../../../server/src/api-types" +import { serverApi } from "../lib/api-client" +import { getLogger } from "../lib/logger" + +const log = getLogger("api") + +const [speechCapabilities, setSpeechCapabilities] = createSignal(null) +const [speechCapabilitiesLoading, setSpeechCapabilitiesLoading] = createSignal(false) +const [speechCapabilitiesError, setSpeechCapabilitiesError] = createSignal(null) + +let speechCapabilitiesPromise: Promise | null = null + +async function loadSpeechCapabilities(force = false): Promise { + if (!force && speechCapabilities()) return speechCapabilities() + if (speechCapabilitiesPromise) return speechCapabilitiesPromise + + setSpeechCapabilitiesLoading(true) + setSpeechCapabilitiesError(null) + speechCapabilitiesPromise = serverApi + .fetchSpeechCapabilities() + .then((result) => { + setSpeechCapabilities(result) + setSpeechCapabilitiesError(null) + return result + }) + .catch((error) => { + log.error("Failed to load speech capabilities", error) + setSpeechCapabilities(null) + setSpeechCapabilitiesError(error instanceof Error ? error.message : String(error)) + return null + }) + .finally(() => { + setSpeechCapabilitiesLoading(false) + speechCapabilitiesPromise = null + }) + + return speechCapabilitiesPromise +} + +function resetSpeechCapabilities(): void { + setSpeechCapabilities(null) + setSpeechCapabilitiesError(null) +} + +export { speechCapabilities, speechCapabilitiesLoading, speechCapabilitiesError, loadSpeechCapabilities, resetSpeechCapabilities } diff --git a/packages/ui/src/styles/messaging/prompt-input.css b/packages/ui/src/styles/messaging/prompt-input.css index 8cbfb796..bec804f2 100644 --- a/packages/ui/src/styles/messaging/prompt-input.css +++ b/packages/ui/src/styles/messaging/prompt-input.css @@ -170,6 +170,37 @@ color: var(--button-danger-text, var(--text-inverted, #ffffff)); } +.prompt-voice-button { + @apply w-10 h-10 rounded-md border-none cursor-pointer flex items-center justify-center transition-all flex-shrink-0; + background-color: color-mix(in oklab, var(--surface-secondary) 82%, var(--surface-base)); + color: var(--text-secondary); +} + +.prompt-voice-button:hover:not(:disabled) { + color: var(--text-primary); + background-color: color-mix(in oklab, var(--accent-primary) 12%, var(--surface-secondary)); + @apply scale-105; +} + +.prompt-voice-button:active:not(:disabled) { + @apply scale-95; +} + +.prompt-voice-button.is-recording { + background-color: color-mix(in oklab, var(--button-danger-bg, rgba(239, 68, 68, 0.85)) 88%, white 12%); + color: var(--button-danger-text, var(--text-inverted, #ffffff)); +} + +.prompt-voice-button:disabled { + @apply opacity-50 cursor-not-allowed; +} + +.prompt-voice-timer { + font-size: 0.68rem; + line-height: 1; + color: var(--text-muted); +} + .stop-button:hover:not(:disabled) { background-color: var(--button-danger-hover-bg, rgba(239, 68, 68, 0.9)); @apply opacity-95 scale-105;