feat(speech): add prompt voice input groundwork

This commit is contained in:
Shantur Rathore
2026-03-12 22:04:57 +00:00
parent a950d47df0
commit f3a1ccd8b8
34 changed files with 1285 additions and 10 deletions

View File

@@ -1,5 +1,5 @@
import { Suspense, createEffect, createSignal, lazy, on, onCleanup, onMount, Show } from "solid-js"
import { ArrowBigUp, ArrowBigDown } from "lucide-solid"
import { Suspense, createEffect, createSignal, lazy, on, onCleanup, Show } from "solid-js"
import { ArrowBigUp, ArrowBigDown, Loader2, Mic, Square } from "lucide-solid"
import ExpandButton from "./expand-button"
import { clearAttachments, removeAttachment } from "../stores/attachments"
import { resolvePastedPlaceholders } from "../lib/prompt-placeholders"
@@ -18,6 +18,7 @@ import { usePromptState } from "./prompt-input/usePromptState"
import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
import { usePromptPicker } from "./prompt-input/usePromptPicker"
import { usePromptKeyDown } from "./prompt-input/usePromptKeyDown"
import { usePromptVoiceInput } from "./prompt-input/usePromptVoiceInput"
const log = getLogger("actions")
const LazyUnifiedPicker = lazy(() => import("./unified-picker"))
@@ -450,6 +451,16 @@ export default function PromptInput(props: PromptInputProps) {
})
const shouldShowOverlay = () => prompt().length === 0
const voiceInput = usePromptVoiceInput({
prompt,
setPrompt,
getTextarea: () => textareaRef ?? null,
enabled: () => preferences().showPromptVoiceInput,
disabled: () => Boolean(props.disabled),
})
const showVoiceInput = () =>
preferences().showPromptVoiceInput &&
(voiceInput.canUseVoiceInput() || voiceInput.isRecording() || voiceInput.isTranscribing())
const instance = () => getActiveInstance()
@@ -597,6 +608,30 @@ export default function PromptInput(props: PromptInputProps) {
</div>
<div class="prompt-input-actions">
<Show when={showVoiceInput()}>
<button
type="button"
class={`prompt-voice-button ${voiceInput.isRecording() ? "is-recording" : ""}`}
onClick={() => void voiceInput.toggleRecording()}
disabled={!voiceInput.isRecording() && (props.disabled || voiceInput.isTranscribing() || !voiceInput.canUseVoiceInput())}
aria-label={voiceInput.buttonTitle()}
title={voiceInput.buttonTitle()}
>
<Show
when={voiceInput.isTranscribing()}
fallback={
<Show when={voiceInput.isRecording()} fallback={<Mic class="h-4 w-4" aria-hidden="true" />}>
<Square class="h-4 w-4" aria-hidden="true" />
</Show>
}
>
<Loader2 class="h-4 w-4 animate-spin" aria-hidden="true" />
</Show>
</button>
<Show when={voiceInput.isRecording()}>
<span class="prompt-voice-timer">{formatVoiceTimer(voiceInput.elapsedMs())}</span>
</Show>
</Show>
<button
type="button"
class="stop-button"
@@ -631,3 +666,10 @@ export default function PromptInput(props: PromptInputProps) {
</div>
)
}
function formatVoiceTimer(elapsedMs: number): string {
const totalSeconds = Math.max(0, Math.floor(elapsedMs / 1000))
const minutes = Math.floor(totalSeconds / 60)
const seconds = totalSeconds % 60
return `${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")}`
}

View File

@@ -0,0 +1,242 @@
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
import { showAlertDialog } from "../../stores/alerts"
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
import { serverApi } from "../../lib/api-client"
import { useI18n } from "../../lib/i18n"
interface UsePromptVoiceInputOptions {
prompt: Accessor<string>
setPrompt: (value: string) => void
getTextarea: () => HTMLTextAreaElement | null
enabled: Accessor<boolean>
disabled: Accessor<boolean>
}
type VoiceInputState = "idle" | "recording" | "transcribing"
export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) {
const { t } = useI18n()
const [state, setState] = createSignal<VoiceInputState>("idle")
const [elapsedMs, setElapsedMs] = createSignal(0)
let mediaRecorder: MediaRecorder | null = null
let mediaStream: MediaStream | null = null
let timerId: number | undefined
let shouldTranscribe = true
let recordedChunks: Blob[] = []
let recordingStartedAt = 0
createEffect(() => {
void loadSpeechCapabilities()
})
onCleanup(() => {
cleanupMedia(false)
})
const isSupported = () => {
if (typeof window === "undefined") return false
return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
}
const canUseVoiceInput = () => {
const capabilities = speechCapabilities()
return Boolean(
options.enabled() &&
isSupported() &&
capabilities?.available &&
capabilities?.configured &&
capabilities?.supportsStt,
)
}
async function toggleRecording(): Promise<void> {
if (state() === "recording") {
stopRecording()
return
}
if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return
try {
await startRecording()
} catch (error) {
cleanupMedia(false)
showAlertDialog(t("promptInput.voiceInput.error.permission"), {
title: t("promptInput.voiceInput.error.title"),
detail: error instanceof Error ? error.message : String(error),
variant: "error",
})
}
}
function stopRecording() {
if (!mediaRecorder || state() !== "recording") return
shouldTranscribe = true
mediaRecorder.stop()
setState("transcribing")
stopTimer()
}
function cancelRecording() {
if (!mediaRecorder || state() !== "recording") return
shouldTranscribe = false
mediaRecorder.stop()
cleanupMedia(false)
}
async function startRecording() {
if (!isSupported()) {
showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
title: t("promptInput.voiceInput.error.title"),
variant: "error",
})
return
}
recordedChunks = []
shouldTranscribe = true
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
mediaRecorder = createRecorder(mediaStream)
mediaRecorder.addEventListener("dataavailable", (event) => {
if (event.data.size > 0) {
recordedChunks.push(event.data)
}
})
mediaRecorder.addEventListener("stop", () => {
void finalizeRecording()
})
recordingStartedAt = Date.now()
setElapsedMs(0)
setState("recording")
startTimer()
mediaRecorder.start()
}
async function finalizeRecording() {
const recorder = mediaRecorder
const stream = mediaStream
mediaRecorder = null
mediaStream = null
if (!shouldTranscribe || recordedChunks.length === 0) {
recordedChunks = []
stopTracks(stream)
setState("idle")
setElapsedMs(0)
return
}
const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
try {
const audioBlob = new Blob(recordedChunks, { type: mimeType })
const transcription = await serverApi.transcribeAudio({
audioBase64: await blobToBase64(audioBlob),
mimeType,
})
if (transcription.text.trim()) {
insertTranscript(transcription.text.trim())
}
} catch (error) {
showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
title: t("promptInput.voiceInput.error.title"),
detail: error instanceof Error ? error.message : String(error),
variant: "error",
})
} finally {
recordedChunks = []
stopTracks(stream)
setState("idle")
setElapsedMs(0)
}
}
function insertTranscript(text: string) {
const current = options.prompt()
const textarea = options.getTextarea()
const start = textarea ? textarea.selectionStart : current.length
const end = textarea ? textarea.selectionEnd : current.length
const before = current.slice(0, start)
const after = current.slice(end)
const prefix = before.length > 0 && !/\s$/.test(before) ? " " : ""
const suffix = after.length > 0 && !/^\s/.test(after) ? " " : ""
const nextValue = `${before}${prefix}${text}${suffix}${after}`
const cursor = before.length + prefix.length + text.length
options.setPrompt(nextValue)
if (textarea) {
setTimeout(() => {
textarea.focus()
textarea.setSelectionRange(cursor, cursor)
}, 0)
}
}
function cleanupMedia(resetState = true) {
stopTimer()
if (mediaRecorder && mediaRecorder.state !== "inactive") {
mediaRecorder.stop()
}
mediaRecorder = null
stopTracks(mediaStream)
mediaStream = null
recordedChunks = []
if (resetState) {
setState("idle")
setElapsedMs(0)
}
}
function startTimer() {
stopTimer()
timerId = window.setInterval(() => {
setElapsedMs(Date.now() - recordingStartedAt)
}, 250)
}
function stopTimer() {
if (timerId !== undefined) {
window.clearInterval(timerId)
timerId = undefined
}
}
return {
state,
elapsedMs,
canUseVoiceInput,
toggleRecording,
cancelRecording,
isRecording: () => state() === "recording",
isTranscribing: () => state() === "transcribing",
buttonTitle: () => {
if (state() === "recording") return t("promptInput.voiceInput.stop.title")
if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
return t("promptInput.voiceInput.start.title")
},
}
}
function createRecorder(stream: MediaStream): MediaRecorder {
const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
const supported = candidates.find((candidate) => typeof MediaRecorder.isTypeSupported !== "function" || MediaRecorder.isTypeSupported(candidate))
return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
}
function stopTracks(stream: MediaStream | null) {
stream?.getTracks().forEach((track) => track.stop())
}
async function blobToBase64(blob: Blob): Promise<string> {
const buffer = await blob.arrayBuffer()
const bytes = new Uint8Array(buffer)
let binary = ""
for (const byte of bytes) {
binary += String.fromCharCode(byte)
}
return btoa(binary)
}

View File

@@ -1,5 +1,5 @@
import { Dialog } from "@kobalte/core/dialog"
import { Settings, Bell, MonitorUp, Paintbrush, Terminal, X } from "lucide-solid"
import { Settings, Bell, MonitorUp, Paintbrush, Terminal, Volume2, X } from "lucide-solid"
import { createMemo, For, type Component } from "solid-js"
import { useI18n } from "../lib/i18n"
import {
@@ -13,6 +13,7 @@ import { AppearanceSettingsSection } from "./settings/appearance-settings-sectio
import { NotificationsSettingsSection } from "./settings/notifications-settings-section"
import { OpenCodeSettingsSection } from "./settings/opencode-settings-section"
import { RemoteAccessSettingsSection } from "./settings/remote-access-settings-section"
import { SpeechSettingsSection } from "./settings/speech-settings-section"
export const SettingsScreen: Component = () => {
const { t } = useI18n()
@@ -21,6 +22,7 @@ export const SettingsScreen: Component = () => {
{ id: "appearance" as SettingsSectionId, icon: Paintbrush, label: t("settings.nav.appearance") },
{ id: "notifications" as SettingsSectionId, icon: Bell, label: t("settings.nav.notifications") },
{ id: "remote" as SettingsSectionId, icon: MonitorUp, label: t("settings.nav.remote") },
{ id: "speech" as SettingsSectionId, icon: Volume2, label: t("settings.nav.speech") },
{ id: "opencode" as SettingsSectionId, icon: Terminal, label: t("settings.nav.opencode") },
])
@@ -30,6 +32,8 @@ export const SettingsScreen: Component = () => {
return <NotificationsSettingsSection />
case "remote":
return <RemoteAccessSettingsSection />
case "speech":
return <SpeechSettingsSection />
case "opencode":
return <OpenCodeSettingsSection />
case "appearance":

View File

@@ -24,6 +24,7 @@ export const AppearanceSettingsSection: Component = () => {
toggleUsageMetrics,
toggleAutoCleanupBlankSessions,
togglePromptSubmitOnEnter,
toggleShowPromptVoiceInput,
setDiffViewMode,
setToolOutputExpansion,
setDiagnosticsExpansion,
@@ -38,10 +39,11 @@ export const AppearanceSettingsSection: Component = () => {
toggleShowThinkingBlocks,
toggleKeyboardShortcutHints,
toggleShowTimelineTools,
toggleUsageMetrics,
toggleAutoCleanupBlankSessions,
togglePromptSubmitOnEnter,
setDiffViewMode,
toggleUsageMetrics,
toggleAutoCleanupBlankSessions,
togglePromptSubmitOnEnter,
toggleShowPromptVoiceInput,
setDiffViewMode,
setToolOutputExpansion,
setDiagnosticsExpansion,
setThinkingBlocksExpansion,

View File

@@ -0,0 +1,220 @@
import { createEffect, createMemo, createSignal, type Component } from "solid-js"
import { Mic, Volume2 } from "lucide-solid"
import { useConfig, type SpeechSettings } from "../../stores/preferences"
import { useI18n } from "../../lib/i18n"
import { loadSpeechCapabilities, speechCapabilities, speechCapabilitiesError, speechCapabilitiesLoading } from "../../stores/speech"
import { getLogger } from "../../lib/logger"
const log = getLogger("actions")
type DraftFields = {
apiKey: string
baseUrl: string
sttModel: string
ttsModel: string
ttsVoice: string
}
function createDraftFields(speech: SpeechSettings): DraftFields {
return {
apiKey: speech.apiKey ?? "",
baseUrl: speech.baseUrl ?? "",
sttModel: speech.sttModel,
ttsModel: speech.ttsModel,
ttsVoice: speech.ttsVoice,
}
}
function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
}
export const SpeechSettingsCard: Component = () => {
const { t } = useI18n()
const { serverSettings, updateSpeechSettings } = useConfig()
const initialDrafts = createDraftFields(serverSettings().speech)
const [isSaving, setIsSaving] = createSignal(false)
const [saveStatus, setSaveStatus] = createSignal<"idle" | "saved" | "error">("idle")
const [drafts, setDrafts] = createSignal<DraftFields>(initialDrafts)
createEffect(() => {
const speech = serverSettings().speech
const nextDrafts = createDraftFields(speech)
if (!isDirty() || isSaving()) {
if (!isDraftEqual(drafts(), nextDrafts)) {
setDrafts(nextDrafts)
}
}
if (!isSaving() && !isDirty()) {
setSaveStatus("saved")
}
})
createEffect(() => {
void loadSpeechCapabilities()
})
const capabilityLabel = () => {
if (speechCapabilitiesLoading()) return t("settings.speech.status.loading")
if (speechCapabilitiesError()) return t("settings.speech.status.error")
return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
}
const updateDraft = (key: keyof DraftFields, value: string) => {
setSaveStatus("idle")
setDrafts((current) => ({ ...current, [key]: value }))
}
const isDirty = createMemo(() => {
const speech = serverSettings().speech
const current = drafts()
return (
(current.apiKey || "") !== (speech.apiKey || "") ||
(current.baseUrl || "") !== (speech.baseUrl || "") ||
current.sttModel !== speech.sttModel ||
current.ttsModel !== speech.ttsModel ||
current.ttsVoice !== speech.ttsVoice
)
})
const saveStatusLabel = () => {
if (isSaving()) return t("settings.speech.save.saving")
if (saveStatus() === "saved") return t("settings.speech.save.saved")
if (saveStatus() === "error") return t("settings.speech.save.error")
return t("settings.speech.save.unsaved")
}
async function handleSave() {
if (!isDirty() || isSaving()) return
setIsSaving(true)
setSaveStatus("idle")
const current = drafts()
try {
await updateSpeechSettings({
apiKey: current.apiKey.trim() || undefined,
baseUrl: current.baseUrl.trim() || undefined,
sttModel: current.sttModel.trim() || undefined,
ttsModel: current.ttsModel.trim() || undefined,
ttsVoice: current.ttsVoice.trim() || undefined,
})
await loadSpeechCapabilities(true)
setDrafts({
apiKey: current.apiKey.trim(),
baseUrl: current.baseUrl.trim(),
sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
})
setSaveStatus("saved")
} catch (error) {
log.error("Failed to save speech settings", error)
setSaveStatus("error")
} finally {
setIsSaving(false)
}
}
return (
<div class="settings-card">
<div class="settings-card-header">
<div class="settings-card-heading-with-icon">
<Volume2 class="settings-card-heading-icon" />
<div>
<h3 class="settings-card-title">{t("settings.speech.title")}</h3>
<p class="settings-card-subtitle">{t("settings.speech.subtitle")}</p>
</div>
</div>
<span class="settings-scope-badge settings-scope-badge-server">{t("settings.scope.server")}</span>
</div>
<div class="settings-stack">
<div class="settings-toggle-row settings-toggle-row-compact">
<div>
<div class="settings-toggle-title">{t("settings.speech.provider.title")}</div>
<div class="settings-toggle-caption">{t("settings.speech.provider.subtitle")}</div>
</div>
<div class="settings-toolbar-inline">
<span class="settings-inline-note">{t("settings.speech.provider.openaiCompatible")}</span>
<span class="settings-inline-note">{capabilityLabel()}</span>
<span class="settings-inline-note">{saveStatusLabel()}</span>
<button
type="button"
class="selector-button selector-button-primary w-auto whitespace-nowrap"
onClick={() => void handleSave()}
disabled={!isDirty() || isSaving()}
>
{isSaving() ? t("settings.speech.save.saving") : t("settings.speech.save.action")}
</button>
</div>
</div>
<Field
label={t("settings.speech.apiKey.title")}
caption={t("settings.speech.apiKey.subtitle")}
value={drafts().apiKey}
onInput={(value) => updateDraft("apiKey", value)}
type="password"
/>
<Field
label={t("settings.speech.baseUrl.title")}
caption={t("settings.speech.baseUrl.subtitle")}
value={drafts().baseUrl}
onInput={(value) => updateDraft("baseUrl", value)}
placeholder={t("settings.speech.baseUrl.placeholder")}
/>
<Field
label={t("settings.speech.sttModel.title")}
caption={t("settings.speech.sttModel.subtitle")}
value={drafts().sttModel}
onInput={(value) => updateDraft("sttModel", value)}
/>
<Field
label={t("settings.speech.ttsModel.title")}
caption={t("settings.speech.ttsModel.subtitle")}
value={drafts().ttsModel}
onInput={(value) => updateDraft("ttsModel", value)}
/>
<Field
label={t("settings.speech.ttsVoice.title")}
caption={t("settings.speech.ttsVoice.subtitle")}
value={drafts().ttsVoice}
onInput={(value) => updateDraft("ttsVoice", value)}
icon={<Mic class="w-3.5 h-3.5 icon-muted flex-shrink-0" />}
/>
<div class="settings-inline-note">{t("settings.speech.help")}</div>
</div>
</div>
)
}
const Field: Component<{
label: string
caption: string
value: string
type?: string
placeholder?: string
onInput: (value: string) => void
icon?: any
}> = (props) => {
return (
<div class="settings-toggle-row settings-toggle-row-compact">
<div>
<div class="settings-toggle-title">{props.label}</div>
<div class="settings-toggle-caption">{props.caption}</div>
</div>
<div class="flex items-center gap-2 min-w-[18rem] max-w-[24rem] w-full">
{props.icon}
<input
type={props.type ?? "text"}
value={props.value}
onInput={(event) => props.onInput(event.currentTarget.value)}
class="selector-input w-full"
placeholder={props.placeholder}
/>
</div>
</div>
)
}
export default SpeechSettingsCard

View File

@@ -0,0 +1,10 @@
import type { Component } from "solid-js"
import SpeechSettingsCard from "./speech-settings-card"
export const SpeechSettingsSection: Component = () => {
return (
<div class="settings-section-stack">
<SpeechSettingsCard />
</div>
)
}