feat(speech): add realtime prompt dictation support

Add server-backed realtime transcription for prompt voice input and expose speech settings to choose realtime mode and models.
This commit is contained in:
Shantur Rathore
2026-03-19 11:32:45 +00:00
parent cc2f6976f6
commit f9b5e2b529
29 changed files with 1572 additions and 263 deletions

View File

@@ -11,7 +11,7 @@ import { getCommands } from "../stores/commands"
import { showAlertDialog } from "../stores/alerts"
import { useI18n } from "../lib/i18n"
import { getLogger } from "../lib/logger"
import { preferences } from "../stores/preferences"
import { preferences, useConfig } from "../stores/preferences"
import type { ExpandState, PromptInputApi, PromptInputProps, PromptInsertMode, PromptMode } from "./prompt-input/types"
import { usePromptState } from "./prompt-input/usePromptState"
import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
@@ -22,6 +22,7 @@ const log = getLogger("actions")
export default function PromptInput(props: PromptInputProps) {
const { t } = useI18n()
const { serverSettings } = useConfig()
const [, setIsFocused] = createSignal(false)
const [mode, setMode] = createSignal<PromptMode>("normal")
const [expandState, setExpandState] = createSignal<ExpandState>("normal")
@@ -418,6 +419,7 @@ export default function PromptInput(props: PromptInputProps) {
getTextarea: () => textareaRef ?? null,
enabled: () => preferences().showPromptVoiceInput,
disabled: () => Boolean(props.disabled),
useRealtime: () => serverSettings().speech.useRealtime,
})
const showVoiceInput = () =>
preferences().showPromptVoiceInput &&

View File

@@ -0,0 +1,110 @@
/** Handle returned by `createRealtimePcmStream` for a running microphone capture. */
export interface RealtimePcmStreamHandle {
/** Tears down the capture pipeline; calls after the first one return immediately. */
stop(): Promise<void>
}
/** Options accepted by `createRealtimePcmStream`. */
interface CreateRealtimePcmStreamOptions {
/**
 * Invoked once per processed audio buffer with the base64-encoded 16-bit PCM data.
 * The capture loop does not await the returned promise.
 */
onChunk: (audioBase64: string) => void | Promise<void>
}
/** Sample rate (Hz) that captured audio is resampled to before it is encoded. */
const TARGET_SAMPLE_RATE = 24000
/** Buffer size, in sample frames, requested from the ScriptProcessorNode. */
const PROCESSOR_BUFFER_SIZE = 4096
/**
 * Starts capturing microphone audio and forwards it to `options.onChunk` as
 * base64-encoded 16-bit PCM resampled to `TARGET_SAMPLE_RATE`.
 *
 * If anything fails after microphone access was granted (missing AudioContext
 * support, context creation, `resume()`, graph wiring), the microphone tracks
 * and the audio context are released before the error is rethrown, so a failed
 * start never leaves the microphone open.
 *
 * @param options Receives each encoded audio chunk through `onChunk`.
 * @returns A handle whose `stop()` tears the capture pipeline down (idempotent).
 * @throws Error when the browser exposes no AudioContext implementation; any
 *   rejection from `getUserMedia` or the Web Audio setup is propagated as-is.
 */
export async function createRealtimePcmStream(
  options: CreateRealtimePcmStreamOptions,
): Promise<RealtimePcmStreamHandle> {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  })
  const releaseMicrophone = () => {
    stream.getTracks().forEach((track) => track.stop())
  }

  const AudioContextCtor = window.AudioContext || (window as any).webkitAudioContext
  if (!AudioContextCtor) {
    releaseMicrophone()
    throw new Error("AudioContext is not supported in this browser.")
  }

  let audioContext: AudioContext | undefined
  try {
    const context: AudioContext = new AudioContextCtor()
    audioContext = context
    await context.resume()

    const source = context.createMediaStreamSource(stream)
    const processor = context.createScriptProcessor(PROCESSOR_BUFFER_SIZE, 1, 1)
    // The processor output is routed through a muted gain node so the graph ends
    // at the destination without playing the microphone back.
    // NOTE(review): presumably needed for `onaudioprocess` to fire in some browsers - confirm.
    const sink = context.createGain()
    sink.gain.value = 0
    source.connect(processor)
    processor.connect(sink)
    sink.connect(context.destination)

    processor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0)
      const resampled = downsampleBuffer(input, context.sampleRate, TARGET_SAMPLE_RATE)
      if (resampled.length === 0) return
      const pcm16 = floatTo16BitPcm(resampled)
      // Fire-and-forget: delivery ordering/back-pressure is the callback's concern.
      void options.onChunk(base64EncodePcm16(pcm16))
    }

    let stopped = false
    return {
      async stop() {
        if (stopped) return
        stopped = true
        processor.onaudioprocess = null
        source.disconnect()
        processor.disconnect()
        sink.disconnect()
        releaseMicrophone()
        await context.close()
      },
    }
  } catch (error) {
    // Setup failed after the microphone was acquired: release everything we hold.
    releaseMicrophone()
    await audioContext?.close().catch(() => undefined)
    throw error
  }
}
/**
 * Resamples `buffer` from `inputSampleRate` to `outputSampleRate`.
 *
 * Each output sample is the mean of the input samples that fall into its
 * window; when a window is empty (the output rate is higher than the input
 * rate) the nearest input sample is repeated instead.
 *
 * @param buffer Input samples; never mutated.
 * @param inputSampleRate Sample rate of `buffer` in Hz.
 * @param outputSampleRate Desired sample rate in Hz.
 * @returns A new array: a copy when the rates match, an empty array for empty
 *   input, otherwise the resampled signal.
 */
function downsampleBuffer(buffer: Float32Array, inputSampleRate: number, outputSampleRate: number): Float32Array {
  if (inputSampleRate === outputSampleRate) {
    return buffer.slice()
  }
  // Without this guard an empty input produced a single NaN sample, because the
  // fallback below would read `buffer[-1]`.
  if (buffer.length === 0) {
    return new Float32Array(0)
  }
  const sampleRateRatio = inputSampleRate / outputSampleRate
  const outputLength = Math.max(1, Math.round(buffer.length / sampleRateRatio))
  const output = new Float32Array(outputLength)
  let inputIndex = 0
  for (let outputIndex = 0; outputIndex < outputLength; outputIndex += 1) {
    const nextInputIndex = Math.min(buffer.length, Math.round((outputIndex + 1) * sampleRateRatio))
    let sum = 0
    let count = 0
    for (let i = inputIndex; i < nextInputIndex; i += 1) {
      sum += buffer[i]
      count += 1
    }
    // Empty window: reuse the closest available input sample.
    output[outputIndex] = count > 0 ? sum / count : buffer[Math.min(buffer.length - 1, inputIndex)]
    inputIndex = nextInputIndex
  }
  return output
}
/**
 * Converts floating-point samples to signed 16-bit PCM.
 *
 * Samples are clamped to [-1, 1]; negative values are scaled by 0x8000 and
 * non-negative values by 0x7fff, then rounded.
 *
 * @param buffer Floating-point samples.
 * @returns A new Int16Array with one entry per input sample.
 */
function floatTo16BitPcm(buffer: Float32Array): Int16Array {
  return Int16Array.from(buffer, (value) => {
    const clamped = Math.max(-1, Math.min(1, value))
    const scale = clamped < 0 ? 0x8000 : 0x7fff
    return Math.round(clamped * scale)
  })
}
/**
 * Base64-encodes the raw bytes of 16-bit PCM samples (platform byte order).
 *
 * Only the bytes covered by `buffer` are encoded, so a view onto a larger
 * ArrayBuffer (for example the result of `subarray`) is handled correctly.
 *
 * @param buffer PCM samples to encode.
 * @returns The base64 representation of the samples' bytes.
 */
function base64EncodePcm16(buffer: Int16Array): string {
  // Honour byteOffset/byteLength instead of wrapping the whole backing buffer.
  const bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength)
  let binary = ""
  // Convert in slices to stay well below the engine's argument-count limit.
  const chunkSize = 0x8000
  for (let offset = 0; offset < bytes.length; offset += chunkSize) {
    const chunk = bytes.subarray(offset, offset + chunkSize)
    binary += String.fromCharCode(...chunk)
  }
  return btoa(binary)
}

View File

@@ -0,0 +1,36 @@
/** Snapshot of a prompt and the character range that dictated text replaces. */
export interface PromptVoiceAnchor {
/** Full prompt text that `start` and `end` index into. */
prompt: string
/** Offset where the replaced range begins; the text before it is kept. */
start: number
/** Offset where the replaced range ends; the text from here onward is kept. */
end: number
}
/**
 * Bundles a prompt and a selection range into a `PromptVoiceAnchor`.
 *
 * @param prompt Prompt text the range refers to.
 * @param start Offset where the range begins.
 * @param end Offset where the range ends.
 * @returns A new anchor holding the three values unchanged.
 */
export function createPromptVoiceAnchor(prompt: string, start: number, end: number): PromptVoiceAnchor {
  const anchor: PromptVoiceAnchor = { prompt: prompt, start: start, end: end }
  return anchor
}
/**
 * Builds the prompt that results from replacing the anchored range with a transcript.
 *
 * The transcript is trimmed. When it is non-empty, a single space is added on
 * either side where the neighbouring text exists and does not already provide
 * whitespace. When it is empty, the anchored range is simply removed.
 *
 * @param anchor Prompt snapshot and the range to replace.
 * @param insertedText Transcript to insert.
 * @returns The new prompt `value` and the `cursor` offset just after the inserted transcript.
 */
export function buildPromptWithInsertedTranscript(anchor: PromptVoiceAnchor, insertedText: string): { value: string; cursor: number } {
  const head = anchor.prompt.slice(0, anchor.start)
  const tail = anchor.prompt.slice(anchor.end)
  const transcript = insertedText.trim()

  if (transcript.length === 0) {
    return { value: head + tail, cursor: head.length }
  }

  const needsLeadingSpace = head.length > 0 && !/\s$/.test(head)
  const needsTrailingSpace = tail.length > 0 && !/^\s/.test(tail)
  const upToCursor = head + (needsLeadingSpace ? " " : "") + transcript
  return {
    value: upToCursor + (needsTrailingSpace ? " " : "") + tail,
    cursor: upToCursor.length,
  }
}
/**
 * Appends a transcript fragment to the text accumulated so far.
 *
 * @param current Text accumulated so far.
 * @param next Fragment to append; surrounding whitespace is trimmed.
 * @returns `current` when the fragment is blank, the trimmed fragment when
 *   `current` is blank, otherwise both joined with exactly one separating
 *   space unless `current` already ends in whitespace.
 */
export function appendVoiceTranscript(current: string, next: string): string {
  const addition = next.trim()
  if (addition === "") return current
  if (current.trim() === "") return addition
  const separator = /\s$/.test(current) ? "" : " "
  return current + separator + addition
}

View File

@@ -0,0 +1,241 @@
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
import { showAlertDialog } from "../../stores/alerts"
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
import { serverApi } from "../../lib/api-client"
import { useI18n } from "../../lib/i18n"
import { createPromptVoiceAnchor, buildPromptWithInsertedTranscript } from "./promptVoiceInsertion"
/** Inputs for `usePromptBufferedVoiceInput`. */
interface UsePromptBufferedVoiceInputOptions {
/** Reads the current prompt text. */
prompt: Accessor<string>
/** Replaces the prompt text; called with the prompt that includes the transcript. */
setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
/** Returns the prompt textarea, or null; used to read the selection and restore the cursor. */
getTextarea: () => HTMLTextAreaElement | null
/** Feature switch checked by `canUseVoiceInput`. */
enabled: Accessor<boolean>
/** While true, `toggleRecording` does not start a new recording. */
disabled: Accessor<boolean>
}
/** Recorder lifecycle: "idle" -> "recording" -> "transcribing" -> "idle". */
type VoiceInputState = "idle" | "recording" | "transcribing"
/**
 * Buffered ("record, then transcribe") voice input for the prompt textarea.
 *
 * A recording is captured with MediaRecorder; when it is stopped, the whole
 * clip is sent to `serverApi.transcribeAudio` and the returned text is
 * inserted at the textarea selection.
 *
 * @param options Prompt accessors, textarea getter and enable/disable flags.
 * @returns Reactive state plus the toggle/cancel controls used by the UI.
 */
export function usePromptBufferedVoiceInput(options: UsePromptBufferedVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<VoiceInputState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)
  let mediaRecorder: MediaRecorder | null = null
  let mediaStream: MediaStream | null = null
  let timerId: number | undefined
  // Read by the recorder's "stop" handler to decide whether the clip is uploaded.
  let shouldTranscribe = true
  let recordedChunks: Blob[] = []
  let recordingStartedAt = 0

  createEffect(() => {
    void loadSpeechCapabilities()
  })

  onCleanup(() => {
    cleanupMedia(false)
  })

  /** True when the browser exposes MediaRecorder and getUserMedia. */
  const isSupported = () => {
    if (typeof window === "undefined") return false
    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
  }

  /** True when the feature is enabled, supported and the server reports usable STT. */
  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt,
    )
  }

  /** Stops an active recording (which triggers transcription) or starts a new one. */
  async function toggleRecording(): Promise<void> {
    if (state() === "recording") {
      stopRecording()
      return
    }
    if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return
    try {
      await startRecording()
    } catch (error) {
      // Reset the state as well: a failure after `setState("recording")` (for
      // example `MediaRecorder.start()` throwing) would otherwise leave the UI
      // stuck in "recording" with no recorder to stop.
      cleanupMedia()
      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }

  /** Ends the recording and lets the "stop" handler transcribe it. */
  function stopRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = true
    mediaRecorder.stop()
    setState("transcribing")
    stopTimer()
  }

  /** Ends the recording and discards the captured audio. */
  function cancelRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = false
    mediaRecorder.stop()
    cleanupMedia(false)
  }

  /** Acquires the microphone and starts a MediaRecorder on it. */
  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }
    recordedChunks = []
    shouldTranscribe = true
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
    mediaRecorder = createRecorder(mediaStream)
    mediaRecorder.addEventListener("dataavailable", (event) => {
      if (event.data.size > 0) {
        recordedChunks.push(event.data)
      }
    })
    mediaRecorder.addEventListener("stop", () => {
      void finalizeRecording()
    })
    recordingStartedAt = Date.now()
    setElapsedMs(0)
    setState("recording")
    startTimer()
    mediaRecorder.start()
  }

  /** Runs on the recorder's "stop" event: uploads the clip (unless discarded) and resets state. */
  async function finalizeRecording() {
    const recorder = mediaRecorder
    const stream = mediaStream
    mediaRecorder = null
    mediaStream = null
    if (!shouldTranscribe || recordedChunks.length === 0) {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
      return
    }
    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
    try {
      const audioBlob = new Blob(recordedChunks, { type: mimeType })
      const transcription = await serverApi.transcribeAudio({
        audioBase64: await blobToBase64(audioBlob),
        mimeType,
      })
      if (transcription.text.trim()) {
        insertTranscript(transcription.text.trim())
      }
    } catch (error) {
      showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    } finally {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
    }
  }

  /** Inserts `text` at the current textarea selection (or at the end of the prompt). */
  function insertTranscript(text: string) {
    const current = options.prompt()
    const textarea = options.getTextarea()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    const { value, cursor } = buildPromptWithInsertedTranscript(
      createPromptVoiceAnchor(current, start, end),
      text,
    )
    options.setPrompt(value)
    if (textarea) {
      // Deferred so the cursor is placed after the new value has been applied.
      setTimeout(() => {
        textarea.focus()
        textarea.setSelectionRange(cursor, cursor)
      }, 0)
    }
  }

  /**
   * Releases the recorder, microphone and timer.
   *
   * @param resetState When true (default) the signals are reset to idle as well.
   */
  function cleanupMedia(resetState = true) {
    stopTimer()
    if (mediaRecorder && mediaRecorder.state !== "inactive") {
      // Tearing down is never a request to transcribe: without this, disposing
      // the component mid-recording uploaded the clip and wrote to the prompt.
      shouldTranscribe = false
      mediaRecorder.stop()
    }
    mediaRecorder = null
    stopTracks(mediaStream)
    mediaStream = null
    recordedChunks = []
    if (resetState) {
      setState("idle")
      setElapsedMs(0)
    }
  }

  /** Starts the elapsed-time ticker (updates every 250 ms). */
  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }

  /** Stops the elapsed-time ticker if it is running. */
  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }

  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "recording",
    isTranscribing: () => state() === "transcribing",
    buttonTitle: () => {
      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
}
/**
 * Creates a MediaRecorder for `stream`, preferring the first container/codec
 * from a fixed candidate list that the browser reports as supported.
 *
 * When `MediaRecorder.isTypeSupported` is unavailable, or no candidate is
 * supported, the recorder is created without a mime type so the browser picks
 * its own default; an unverified mime type is never passed to the constructor.
 *
 * @param stream Audio stream to record.
 * @returns A MediaRecorder that has not been started.
 */
function createRecorder(stream: MediaStream): MediaRecorder {
  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
  const canProbe = typeof MediaRecorder.isTypeSupported === "function"
  const supported = canProbe ? candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate)) : undefined
  return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
}
/**
 * Stops every track of `stream`; does nothing when no stream is given.
 *
 * @param stream Stream whose tracks should be stopped, or null.
 */
function stopTracks(stream: MediaStream | null) {
  if (!stream) return
  for (const track of stream.getTracks()) {
    track.stop()
  }
}
/**
 * Reads a Blob fully into memory and returns its bytes as a base64 string.
 *
 * @param blob Data to encode.
 * @returns Base64 representation of the blob's bytes.
 */
async function blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer())
  let binary = ""
  for (let index = 0; index < bytes.length; index += 1) {
    binary += String.fromCharCode(bytes[index])
  }
  return btoa(binary)
}

View File

@@ -0,0 +1,325 @@
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
import { showAlertDialog } from "../../stores/alerts"
import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client"
import { useI18n } from "../../lib/i18n"
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream"
import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion"
/** Inputs for `usePromptRealtimeVoiceInput`. */
interface UsePromptRealtimeVoiceInputOptions {
/** Reads the current prompt text. */
prompt: Accessor<string>
/** Replaces the prompt text; partial transcripts are written with `persistDraft: false`. */
setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
/** Returns the prompt textarea, or null; used to read the selection and move the cursor. */
getTextarea: () => HTMLTextAreaElement | null
/** Feature switch checked by `canUseVoiceInput`. */
enabled: Accessor<boolean>
/** While true, `toggleRecording` does not start a new session. */
disabled: Accessor<boolean>
}
/** Session lifecycle: "idle" -> "connecting" -> "listening" -> "finalizing" -> "idle". */
type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing"
/** Delay (ms) after a successful finalize request before the session is closed. */
const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000
/**
 * Realtime (streaming) voice input for the prompt textarea.
 *
 * Starting a recording creates a server-side speech session, subscribes to its
 * event stream and streams microphone PCM chunks to it. Partial and final
 * transcripts are rendered into the prompt at the selection captured when the
 * recording started.
 *
 * @param options Prompt accessors, textarea getter and enable/disable flags.
 * @returns Reactive state plus the toggle/cancel controls used by the UI.
 */
export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<RealtimeVoiceState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)
  let activeSessionId: string | null = null
  let eventSource: EventSource | null = null
  let pcmStream: RealtimePcmStreamHandle | null = null
  // Serialises audio uploads so chunks reach the server in capture order.
  let audioQueue: Promise<void> = Promise.resolve()
  let timerId: number | undefined
  let recordingStartedAt = 0
  let finalizeTimerId: number | undefined
  let anchor = createPromptVoiceAnchor("", 0, 0)
  let finalTranscript = ""
  let liveTranscript = ""
  let activeLiveItemId: string | null = null
  let closing = false
  // Bumped by every start attempt and every cleanup so that an in-flight start
  // can detect that it was cancelled while it was awaiting.
  let startGeneration = 0

  createEffect(() => {
    void loadSpeechCapabilities()
  })

  onCleanup(() => {
    cancelRecording()
  })

  /** True when AudioContext, getUserMedia and EventSource are all available. */
  const isSupported = () => {
    if (typeof window === "undefined") return false
    return Boolean(window.AudioContext || (window as any).webkitAudioContext) && Boolean(navigator.mediaDevices?.getUserMedia) && typeof EventSource !== "undefined"
  }

  /** True when the feature is enabled, supported and the server reports realtime STT. */
  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt &&
        capabilities?.supportsRealtimeTranscription,
    )
  }

  /** Stops an active/connecting session or starts a new one. */
  async function toggleRecording(): Promise<void> {
    if (state() === "listening" || state() === "connecting") {
      await stopRecording()
      return
    }
    if (!canUseVoiceInput() || options.disabled() || state() === "finalizing") return
    try {
      await startRecording()
    } catch (error) {
      await cleanupSession({ revertPrompt: true, closeRemote: true })
      showAlertDialog(t("promptInput.voiceInput.error.connection"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }

  /** Creates the remote session, opens the event stream and starts microphone capture. */
  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }
    startGeneration += 1
    const generation = startGeneration
    resetTranscriptState()
    captureAnchor()
    setState("connecting")
    setElapsedMs(0)
    try {
      const created = await serverApi.createRealtimeSpeechSession({
        language: detectLanguage(),
      })
      if (generation !== startGeneration) {
        // Cancelled while the session was being created: nothing else knows
        // about this session, so close it here instead of adopting it.
        await serverApi.closeRealtimeSpeechSession(created.sessionId).catch(() => undefined)
        return
      }
      activeSessionId = created.sessionId
      connectEventStream(created.sessionId)
      const stream = await createRealtimePcmStream({
        onChunk: (audioBase64) => {
          const sessionId = activeSessionId
          if (!sessionId || closing) return
          audioQueue = audioQueue
            .then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 }))
            .catch((error) => {
              handleRealtimeError(error)
            })
        },
      })
      if (generation !== startGeneration || state() !== "connecting") {
        // Cancelled or stopped while waiting for the microphone: release it
        // rather than flipping the UI back to "listening".
        await stream.stop().catch(() => undefined)
        return
      }
      pcmStream = stream
      recordingStartedAt = Date.now()
      startTimer()
      setState("listening")
    } catch (error) {
      // A cancelled attempt has already been cleaned up; its late failure must
      // not tear down (or raise an alert over) whatever happened afterwards.
      if (generation !== startGeneration) return
      throw error
    }
  }

  /** Stops capture, flushes queued audio and asks the server to finalize the transcript. */
  async function stopRecording() {
    const sessionId = activeSessionId
    if (!sessionId || (state() !== "listening" && state() !== "connecting")) return
    setState("finalizing")
    stopTimer()
    if (pcmStream) {
      const stream = pcmStream
      pcmStream = null
      // Best-effort, as in cleanupSession: a failing teardown must not leave
      // the state stuck in "finalizing".
      await stream.stop().catch(() => undefined)
    }
    try {
      await audioQueue.catch(() => undefined)
      // The session may have been cancelled while the queue was draining.
      if (activeSessionId !== sessionId) return
      await serverApi.finalizeRealtimeSpeechSession(sessionId)
      scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS)
    } catch (error) {
      handleRealtimeError(error)
    }
  }

  /** Aborts the session and restores the prompt captured when recording started. */
  function cancelRecording() {
    void cleanupSession({ revertPrompt: true, closeRemote: true })
  }

  /** Subscribes to the session's server events, replacing any previous subscription. */
  function connectEventStream(sessionId: string) {
    eventSource?.close()
    eventSource = serverApi.connectRealtimeSpeechEvents(
      sessionId,
      (event) => handleEvent(event),
      () => {
        if (closing) return
        handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection")))
      },
    )
  }

  /** Applies one server event to the transcript state and the prompt. */
  function handleEvent(event: SpeechRealtimeEvent) {
    if (event.type === "session.ready") {
      return
    }
    if (event.type === "session.error") {
      handleRealtimeError(new Error(event.message))
      return
    }
    if (event.type === "transcript.partial") {
      activeLiveItemId = event.itemId
      liveTranscript = event.text
      renderPrompt(false)
      return
    }
    if (event.type === "transcript.final") {
      activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId
      liveTranscript = ""
      finalTranscript = appendVoiceTranscript(finalTranscript, event.text)
      renderPrompt(true)
      if (state() === "finalizing") {
        // A final transcript arrived after stop: close shortly instead of
        // waiting for the full timeout.
        scheduleFinalizeClose(250)
      }
      return
    }
    if (event.type === "session.closed") {
      void cleanupSession({ revertPrompt: false, closeRemote: false })
    }
  }

  /** Records the prompt and selection that transcripts will be inserted into. */
  function captureAnchor() {
    const textarea = options.getTextarea()
    const current = options.prompt()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    anchor = createPromptVoiceAnchor(current, start, end)
  }

  /** Writes anchor + final + live transcript into the prompt and moves the cursor. */
  function renderPrompt(persistDraft: boolean) {
    const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(finalTranscript && liveTranscript.trim() ? " " : "")
    const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted)
    options.setPrompt(value, persistDraft ? undefined : { persistDraft: false })
    syncTextareaCursor(cursor)
  }

  /** Focuses the textarea and places the cursor at `cursor` in a microtask. */
  function syncTextareaCursor(cursor: number) {
    const textarea = options.getTextarea()
    if (!textarea) return
    queueMicrotask(() => {
      const next = options.getTextarea()
      if (!next) return
      next.focus()
      next.setSelectionRange(cursor, cursor)
    })
  }

  /** (Re)schedules the cleanup that commits the transcript and closes the session. */
  function scheduleFinalizeClose(delayMs: number) {
    if (finalizeTimerId !== undefined) {
      window.clearTimeout(finalizeTimerId)
    }
    finalizeTimerId = window.setTimeout(() => {
      void cleanupSession({ revertPrompt: false, closeRemote: true })
    }, delayMs)
  }

  /**
   * Releases every session resource and returns to "idle".
   *
   * @param cleanupOptions `revertPrompt` restores the prompt captured at start
   *   instead of committing the transcript; `closeRemote` also closes the
   *   server-side session.
   */
  async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) {
    if (closing) return
    closing = true
    // Only an actual session may touch the prompt. Without this, disposing the
    // component (or cancelling) while idle overwrote the prompt and its draft
    // with the initial empty anchor or a stale snapshot from an earlier recording.
    const hadSession = state() !== "idle"
    // Invalidate any start attempt that is still awaiting.
    startGeneration += 1
    try {
      if (finalizeTimerId !== undefined) {
        window.clearTimeout(finalizeTimerId)
        finalizeTimerId = undefined
      }
      stopTimer()
      const sessionId = activeSessionId
      activeSessionId = null
      eventSource?.close()
      eventSource = null
      if (pcmStream) {
        const stream = pcmStream
        pcmStream = null
        await stream.stop().catch(() => undefined)
      }
      await audioQueue.catch(() => undefined)
      audioQueue = Promise.resolve()
      if (cleanupOptions.closeRemote && sessionId) {
        await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined)
      }
      if (hadSession) {
        if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) {
          // No final transcript arrived: keep the last partial one.
          finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript)
          liveTranscript = ""
        }
        if (cleanupOptions.revertPrompt) {
          options.setPrompt(anchor.prompt)
        } else if (finalTranscript.trim()) {
          renderPrompt(true)
        }
      }
    } finally {
      // Always leave a usable state, even if a prompt update threw.
      resetTranscriptState()
      setState("idle")
      setElapsedMs(0)
      closing = false
    }
  }

  /** Clears the accumulated transcript bookkeeping. */
  function resetTranscriptState() {
    finalTranscript = ""
    liveTranscript = ""
    activeLiveItemId = null
  }

  /** Aborts the session, reverts the prompt and reports the error to the user. */
  function handleRealtimeError(error: unknown) {
    if (closing) return
    void cleanupSession({ revertPrompt: true, closeRemote: true })
    showAlertDialog(t("promptInput.voiceInput.error.connection"), {
      title: t("promptInput.voiceInput.error.title"),
      detail: error instanceof Error ? error.message : String(error),
      variant: "error",
    })
  }

  /** Starts the elapsed-time ticker (updates every 250 ms). */
  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }

  /** Stops the elapsed-time ticker if it is running. */
  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }

  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "connecting" || state() === "listening",
    isTranscribing: () => state() === "finalizing",
    buttonTitle: () => {
      if (state() === "connecting") return t("promptInput.voiceInput.connecting.title")
      if (state() === "listening") return t("promptInput.voiceInput.stop.title")
      if (state() === "finalizing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
}
/**
 * Derives a language hint from the browser locale.
 *
 * @returns The part of `navigator.language` before the first "-", trimmed, or
 *   undefined when `navigator` does not exist or that part is empty.
 */
function detectLanguage(): string | undefined {
  if (typeof navigator === "undefined") {
    return undefined
  }
  const primarySubtag = navigator.language.split("-")[0]?.trim()
  return primarySubtag ? primarySubtag : undefined
}

View File

@@ -22,7 +22,7 @@ type HistorySelectOptions = {
type PromptState = {
prompt: Accessor<string>
setPrompt: (value: string) => void
setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
clearPrompt: () => void
draftLoadedNonce: Accessor<number>
@@ -48,11 +48,11 @@ export function usePromptState(options: PromptStateOptions): PromptState {
const [historyDraft, setHistoryDraft] = createSignal<string | null>(null)
const [draftLoadedNonce, setDraftLoadedNonce] = createSignal(0)
const setPrompt = (value: string) => {
const setPrompt = (value: string, setOptions?: { persistDraft?: boolean }) => {
setPromptInternal(value)
// Persist drafts only when the user is at the "fresh" position (not browsing history).
// This keeps the bottom-of-history draft stable even if the user edits recalled history entries.
if (historyIndex() === -1) {
if (setOptions?.persistDraft !== false && historyIndex() === -1) {
setSessionDraftPrompt(options.instanceId(), options.sessionId(), value)
}
}

View File

@@ -1,242 +1,30 @@
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
import { showAlertDialog } from "../../stores/alerts"
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
import { serverApi } from "../../lib/api-client"
import { useI18n } from "../../lib/i18n"
import type { Accessor } from "solid-js"
import { usePromptBufferedVoiceInput } from "./usePromptBufferedVoiceInput"
import { usePromptRealtimeVoiceInput } from "./usePromptRealtimeVoiceInput"
interface UsePromptVoiceInputOptions {
prompt: Accessor<string>
setPrompt: (value: string) => void
setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
getTextarea: () => HTMLTextAreaElement | null
enabled: Accessor<boolean>
disabled: Accessor<boolean>
useRealtime: Accessor<boolean>
}
type VoiceInputState = "idle" | "recording" | "transcribing"
export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) {
const { t } = useI18n()
const [state, setState] = createSignal<VoiceInputState>("idle")
const [elapsedMs, setElapsedMs] = createSignal(0)
const buffered = usePromptBufferedVoiceInput(options)
const realtime = usePromptRealtimeVoiceInput(options)
let mediaRecorder: MediaRecorder | null = null
let mediaStream: MediaStream | null = null
let timerId: number | undefined
let shouldTranscribe = true
let recordedChunks: Blob[] = []
let recordingStartedAt = 0
createEffect(() => {
void loadSpeechCapabilities()
})
onCleanup(() => {
cleanupMedia(false)
})
const isSupported = () => {
if (typeof window === "undefined") return false
return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
}
const canUseVoiceInput = () => {
const capabilities = speechCapabilities()
return Boolean(
options.enabled() &&
isSupported() &&
capabilities?.available &&
capabilities?.configured &&
capabilities?.supportsStt,
)
}
async function toggleRecording(): Promise<void> {
if (state() === "recording") {
stopRecording()
return
}
if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return
try {
await startRecording()
} catch (error) {
cleanupMedia(false)
showAlertDialog(t("promptInput.voiceInput.error.permission"), {
title: t("promptInput.voiceInput.error.title"),
detail: error instanceof Error ? error.message : String(error),
variant: "error",
})
}
}
function stopRecording() {
if (!mediaRecorder || state() !== "recording") return
shouldTranscribe = true
mediaRecorder.stop()
setState("transcribing")
stopTimer()
}
function cancelRecording() {
if (!mediaRecorder || state() !== "recording") return
shouldTranscribe = false
mediaRecorder.stop()
cleanupMedia(false)
}
async function startRecording() {
if (!isSupported()) {
showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
title: t("promptInput.voiceInput.error.title"),
variant: "error",
})
return
}
recordedChunks = []
shouldTranscribe = true
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
mediaRecorder = createRecorder(mediaStream)
mediaRecorder.addEventListener("dataavailable", (event) => {
if (event.data.size > 0) {
recordedChunks.push(event.data)
}
})
mediaRecorder.addEventListener("stop", () => {
void finalizeRecording()
})
recordingStartedAt = Date.now()
setElapsedMs(0)
setState("recording")
startTimer()
mediaRecorder.start()
}
async function finalizeRecording() {
const recorder = mediaRecorder
const stream = mediaStream
mediaRecorder = null
mediaStream = null
if (!shouldTranscribe || recordedChunks.length === 0) {
recordedChunks = []
stopTracks(stream)
setState("idle")
setElapsedMs(0)
return
}
const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"
try {
const audioBlob = new Blob(recordedChunks, { type: mimeType })
const transcription = await serverApi.transcribeAudio({
audioBase64: await blobToBase64(audioBlob),
mimeType,
})
if (transcription.text.trim()) {
insertTranscript(transcription.text.trim())
}
} catch (error) {
showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
title: t("promptInput.voiceInput.error.title"),
detail: error instanceof Error ? error.message : String(error),
variant: "error",
})
} finally {
recordedChunks = []
stopTracks(stream)
setState("idle")
setElapsedMs(0)
}
}
function insertTranscript(text: string) {
const current = options.prompt()
const textarea = options.getTextarea()
const start = textarea ? textarea.selectionStart : current.length
const end = textarea ? textarea.selectionEnd : current.length
const before = current.slice(0, start)
const after = current.slice(end)
const prefix = before.length > 0 && !/\s$/.test(before) ? " " : ""
const suffix = after.length > 0 && !/^\s/.test(after) ? " " : ""
const nextValue = `${before}${prefix}${text}${suffix}${after}`
const cursor = before.length + prefix.length + text.length
options.setPrompt(nextValue)
if (textarea) {
setTimeout(() => {
textarea.focus()
textarea.setSelectionRange(cursor, cursor)
}, 0)
}
}
function cleanupMedia(resetState = true) {
stopTimer()
if (mediaRecorder && mediaRecorder.state !== "inactive") {
mediaRecorder.stop()
}
mediaRecorder = null
stopTracks(mediaStream)
mediaStream = null
recordedChunks = []
if (resetState) {
setState("idle")
setElapsedMs(0)
}
}
function startTimer() {
stopTimer()
timerId = window.setInterval(() => {
setElapsedMs(Date.now() - recordingStartedAt)
}, 250)
}
function stopTimer() {
if (timerId !== undefined) {
window.clearInterval(timerId)
timerId = undefined
}
}
const active = () => (options.useRealtime() ? realtime : buffered)
return {
state,
elapsedMs,
canUseVoiceInput,
toggleRecording,
cancelRecording,
isRecording: () => state() === "recording",
isTranscribing: () => state() === "transcribing",
buttonTitle: () => {
if (state() === "recording") return t("promptInput.voiceInput.stop.title")
if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
return t("promptInput.voiceInput.start.title")
},
state: () => active().state(),
elapsedMs: () => active().elapsedMs(),
canUseVoiceInput: () => active().canUseVoiceInput(),
toggleRecording: () => active().toggleRecording(),
cancelRecording: () => active().cancelRecording(),
isRecording: () => active().isRecording(),
isTranscribing: () => active().isTranscribing(),
buttonTitle: () => active().buttonTitle(),
}
}
function createRecorder(stream: MediaStream): MediaRecorder {
const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
const supported = candidates.find((candidate) => typeof MediaRecorder.isTypeSupported !== "function" || MediaRecorder.isTypeSupported(candidate))
return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
}
function stopTracks(stream: MediaStream | null) {
stream?.getTracks().forEach((track) => track.stop())
}
async function blobToBase64(blob: Blob): Promise<string> {
const buffer = await blob.arrayBuffer()
const bytes = new Uint8Array(buffer)
let binary = ""
for (const byte of bytes) {
binary += String.fromCharCode(byte)
}
return btoa(binary)
}

View File

@@ -10,6 +10,8 @@ const log = getLogger("actions")
type DraftFields = {
apiKey: string
baseUrl: string
useRealtime: boolean
realtimeModel: string
sttModel: string
ttsModel: string
ttsVoice: string
@@ -19,6 +21,8 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
return {
apiKey: speech.apiKey ?? "",
baseUrl: speech.baseUrl ?? "",
useRealtime: speech.useRealtime,
realtimeModel: speech.realtimeModel,
sttModel: speech.sttModel,
ttsModel: speech.ttsModel,
ttsVoice: speech.ttsVoice,
@@ -26,7 +30,7 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
}
function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.useRealtime === b.useRealtime && a.realtimeModel === b.realtimeModel && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
}
export const SpeechSettingsCard: Component = () => {
@@ -57,7 +61,7 @@ export const SpeechSettingsCard: Component = () => {
return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
}
const updateDraft = (key: keyof DraftFields, value: string) => {
const updateDraft = <K extends keyof DraftFields>(key: K, value: DraftFields[K]) => {
setSaveStatus("idle")
setDrafts((current) => ({ ...current, [key]: value }))
}
@@ -65,12 +69,14 @@ export const SpeechSettingsCard: Component = () => {
const isDirty = createMemo(() => {
const speech = serverSettings().speech
const current = drafts()
return (
(current.apiKey || "") !== (speech.apiKey || "") ||
(current.baseUrl || "") !== (speech.baseUrl || "") ||
current.sttModel !== speech.sttModel ||
current.ttsModel !== speech.ttsModel ||
current.ttsVoice !== speech.ttsVoice
return (
(current.apiKey || "") !== (speech.apiKey || "") ||
(current.baseUrl || "") !== (speech.baseUrl || "") ||
current.useRealtime !== speech.useRealtime ||
current.realtimeModel !== speech.realtimeModel ||
current.sttModel !== speech.sttModel ||
current.ttsModel !== speech.ttsModel ||
current.ttsVoice !== speech.ttsVoice
)
})
@@ -90,6 +96,8 @@ export const SpeechSettingsCard: Component = () => {
await updateSpeechSettings({
apiKey: current.apiKey.trim() || undefined,
baseUrl: current.baseUrl.trim() || undefined,
useRealtime: current.useRealtime,
realtimeModel: current.realtimeModel.trim() || undefined,
sttModel: current.sttModel.trim() || undefined,
ttsModel: current.ttsModel.trim() || undefined,
ttsVoice: current.ttsVoice.trim() || undefined,
@@ -98,6 +106,8 @@ export const SpeechSettingsCard: Component = () => {
setDrafts({
apiKey: current.apiKey.trim(),
baseUrl: current.baseUrl.trim(),
useRealtime: current.useRealtime,
realtimeModel: current.realtimeModel.trim() || serverSettings().speech.realtimeModel,
sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
@@ -159,6 +169,27 @@ export const SpeechSettingsCard: Component = () => {
onInput={(value) => updateDraft("baseUrl", value)}
placeholder={t("settings.speech.baseUrl.placeholder")}
/>
<div class="settings-toggle-row">
<div>
<div class="settings-toggle-title">{t("settings.speech.realtime.title")}</div>
<div class="settings-toggle-caption">{t("settings.speech.realtime.subtitle")}</div>
</div>
<label class="settings-checkbox-toggle">
<input
type="checkbox"
checked={drafts().useRealtime}
onChange={(event) => updateDraft("useRealtime", event.currentTarget.checked)}
/>
<span>{t("settings.common.enabled")}</span>
</label>
</div>
<Field
label={t("settings.speech.realtimeModel.title")}
caption={t("settings.speech.realtimeModel.subtitle")}
value={drafts().realtimeModel}
onInput={(value) => updateDraft("realtimeModel", value)}
placeholder={t("settings.speech.realtimeModel.placeholder")}
/>
<Field
label={t("settings.speech.sttModel.title")}
caption={t("settings.speech.sttModel.subtitle")}

View File

@@ -8,6 +8,8 @@ import type {
FileSystemListResponse,
InstanceData,
SpeechCapabilitiesResponse,
SpeechRealtimeEvent,
SpeechRealtimeSessionResponse,
SpeechSynthesisResponse,
SpeechTranscriptionResponse,
ServerMeta,
@@ -39,6 +41,10 @@ export function buildBackgroundProcessStreamUrl(instanceId: string, processId: s
return buildAbsoluteUrl(`/workspaces/${encodedInstanceId}/plugin/background-processes/${encodedProcessId}/stream`)
}
/**
 * Builds the events URL for a realtime speech session.
 *
 * The session id is URI-encoded before being placed in the path, and the
 * resulting path is passed through `buildAbsoluteUrl`.
 */
export function buildRealtimeSpeechEventsUrl(sessionId: string): string {
  const encodedSessionId = encodeURIComponent(sessionId)
  const path = `/api/speech/realtime/sessions/${encodedSessionId}/events`
  return buildAbsoluteUrl(path)
}
function buildEventsUrl(base: string | undefined, path: string): string {
if (path.startsWith("http://") || path.startsWith("https://")) {
return path
@@ -241,6 +247,29 @@ export const serverApi = {
fetchSpeechCapabilities(): Promise<SpeechCapabilitiesResponse> {
return request<SpeechCapabilitiesResponse>("/api/speech/capabilities")
},
/**
 * POSTs to /api/speech/realtime/sessions to create a realtime speech session.
 * When no payload is given, an empty JSON object is sent as the body.
 */
createRealtimeSpeechSession(payload?: { language?: string; prompt?: string }): Promise<SpeechRealtimeSessionResponse> {
  const body = JSON.stringify(payload ?? {})
  return request<SpeechRealtimeSessionResponse>("/api/speech/realtime/sessions", { method: "POST", body })
},
/**
 * POSTs a base64 audio payload to the `/audio` endpoint of the given realtime
 * speech session. The session id is URI-encoded into the path.
 */
appendRealtimeSpeechAudio(sessionId: string, payload: { audioBase64: string }): Promise<void> {
  const encodedSessionId = encodeURIComponent(sessionId)
  const body = JSON.stringify(payload)
  return request(`/api/speech/realtime/sessions/${encodedSessionId}/audio`, { method: "POST", body })
},
/**
 * POSTs an empty JSON object to the `/finalize` endpoint of the given realtime
 * speech session. The session id is URI-encoded into the path.
 */
finalizeRealtimeSpeechSession(sessionId: string): Promise<void> {
  const encodedSessionId = encodeURIComponent(sessionId)
  const body = JSON.stringify({})
  return request(`/api/speech/realtime/sessions/${encodedSessionId}/finalize`, { method: "POST", body })
},
/**
 * Issues a DELETE request for the given realtime speech session.
 * The session id is URI-encoded into the path.
 */
closeRealtimeSpeechSession(sessionId: string): Promise<void> {
  const encodedSessionId = encodeURIComponent(sessionId)
  return request(`/api/speech/realtime/sessions/${encodedSessionId}`, { method: "DELETE" })
},
transcribeAudio(payload: {
audioBase64: string
mimeType: string
@@ -332,21 +361,34 @@ export const serverApi = {
},
/**
 * Opens the event stream at EVENTS_URL.
 *
 * Logs the connection attempt, then delegates the EventSource setup (message
 * parsing, error logging, `onError` forwarding) to `connectEventSource` so the
 * wiring lives in one place. Returns whatever `connectEventSource` returns.
 */
connectEvents(onEvent: (event: WorkspaceEventPayload) => void, onError?: () => void) {
  sseLogger.info(`Connecting to ${EVENTS_URL}`)
  return connectEventSource(EVENTS_URL, onEvent, onError)
},
/**
 * Opens the event stream for one realtime speech session.
 *
 * Resolves the session's events URL, logs the connection attempt, and hands
 * the URL and callbacks to `connectEventSource`, returning its result.
 */
connectRealtimeSpeechEvents(
  sessionId: string,
  onEvent: (event: SpeechRealtimeEvent) => void,
  onError?: () => void,
) {
  const eventsUrl = buildRealtimeSpeechEventsUrl(sessionId)
  sseLogger.info(`Connecting to ${eventsUrl}`)
  return connectEventSource(eventsUrl, onEvent, onError)
},
}
export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType }
/**
 * Opens an EventSource on `url` (created with `withCredentials: true`) and
 * attaches its message and error handlers.
 *
 * Each message's `data` is JSON-parsed and passed to `onEvent`. Anything thrown
 * while parsing or inside `onEvent` is logged and the message is otherwise
 * ignored. Stream errors are logged and forwarded to `onError` when provided.
 *
 * @returns the EventSource instance; it is not closed by this function.
 */
function connectEventSource<T>(url: string, onEvent: (event: T) => void, onError?: () => void) {
  const stream = new EventSource(url, { withCredentials: true } as any)

  const reportStreamError = () => {
    sseLogger.warn("EventSource error, closing stream")
    onError?.()
  }

  stream.onmessage = (message) => {
    try {
      onEvent(JSON.parse(message.data) as T)
    } catch (error) {
      sseLogger.error("Failed to parse event", error)
    }
  }
  stream.onerror = reportStreamError

  return stream
}
export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType, SpeechRealtimeEvent }

View File

@@ -140,8 +140,10 @@ export const messagingMessages = {
"promptInput.send.errorTitle": "Send failed",
"promptInput.voiceInput.start.title": "Start voice input",
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
"promptInput.voiceInput.connecting.title": "Connecting microphone",
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
"promptInput.voiceInput.error.title": "Voice input failed",
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",

View File

@@ -156,13 +156,18 @@ export const settingsMessages = {
"settings.speech.baseUrl.title": "Base URL",
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
"settings.speech.realtime.title": "Realtime dictation",
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
"settings.speech.realtimeModel.title": "Realtime model",
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
"settings.speech.sttModel.title": "Transcription model",
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
"settings.speech.ttsModel.title": "Speech model",
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
"settings.speech.ttsVoice.title": "Default voice",
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
"settings.speech.save.action": "Save",
"settings.speech.save.saving": "Saving...",
"settings.speech.save.saved": "Saved",

View File

@@ -142,8 +142,10 @@ export const messagingMessages = {
"promptInput.send.errorTitle": "Error al enviar",
"promptInput.voiceInput.start.title": "Start voice input",
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
"promptInput.voiceInput.connecting.title": "Connecting microphone",
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
"promptInput.voiceInput.error.title": "Voice input failed",
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",

View File

@@ -156,13 +156,18 @@ export const settingsMessages = {
"settings.speech.baseUrl.title": "Base URL",
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
"settings.speech.realtime.title": "Realtime dictation",
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
"settings.speech.realtimeModel.title": "Realtime model",
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
"settings.speech.sttModel.title": "Transcription model",
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
"settings.speech.ttsModel.title": "Speech model",
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
"settings.speech.ttsVoice.title": "Default voice",
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
"settings.speech.save.action": "Save",
"settings.speech.save.saving": "Saving...",
"settings.speech.save.saved": "Saved",

View File

@@ -142,8 +142,10 @@ export const messagingMessages = {
"promptInput.send.errorTitle": "Échec de l'envoi",
"promptInput.voiceInput.start.title": "Start voice input",
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
"promptInput.voiceInput.connecting.title": "Connecting microphone",
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
"promptInput.voiceInput.error.title": "Voice input failed",
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",

View File

@@ -156,13 +156,18 @@ export const settingsMessages = {
"settings.speech.baseUrl.title": "Base URL",
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
"settings.speech.realtime.title": "Realtime dictation",
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
"settings.speech.realtimeModel.title": "Realtime model",
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
"settings.speech.sttModel.title": "Transcription model",
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
"settings.speech.ttsModel.title": "Speech model",
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
"settings.speech.ttsVoice.title": "Default voice",
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
"settings.speech.save.action": "Save",
"settings.speech.save.saving": "Saving...",
"settings.speech.save.saved": "Saved",

View File

@@ -142,8 +142,10 @@ export const messagingMessages = {
"promptInput.send.errorTitle": "送信に失敗",
"promptInput.voiceInput.start.title": "Start voice input",
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
"promptInput.voiceInput.connecting.title": "Connecting microphone",
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
"promptInput.voiceInput.error.title": "Voice input failed",
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",

View File

@@ -156,13 +156,18 @@ export const settingsMessages = {
"settings.speech.baseUrl.title": "Base URL",
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
"settings.speech.realtime.title": "Realtime dictation",
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
"settings.speech.realtimeModel.title": "Realtime model",
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
"settings.speech.sttModel.title": "Transcription model",
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
"settings.speech.ttsModel.title": "Speech model",
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
"settings.speech.ttsVoice.title": "Default voice",
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
"settings.speech.save.action": "Save",
"settings.speech.save.saving": "Saving...",
"settings.speech.save.saved": "Saved",

View File

@@ -142,8 +142,10 @@ export const messagingMessages = {
"promptInput.send.errorTitle": "Не удалось отправить",
"promptInput.voiceInput.start.title": "Start voice input",
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
"promptInput.voiceInput.connecting.title": "Connecting microphone",
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
"promptInput.voiceInput.error.title": "Voice input failed",
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",

View File

@@ -156,13 +156,18 @@ export const settingsMessages = {
"settings.speech.baseUrl.title": "Base URL",
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
"settings.speech.realtime.title": "Realtime dictation",
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
"settings.speech.realtimeModel.title": "Realtime model",
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
"settings.speech.sttModel.title": "Transcription model",
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
"settings.speech.ttsModel.title": "Speech model",
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
"settings.speech.ttsVoice.title": "Default voice",
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
"settings.speech.save.action": "Save",
"settings.speech.save.saving": "Saving...",
"settings.speech.save.saved": "Saved",

View File

@@ -142,8 +142,10 @@ export const messagingMessages = {
"promptInput.send.errorTitle": "发送失败",
"promptInput.voiceInput.start.title": "Start voice input",
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
"promptInput.voiceInput.connecting.title": "Connecting microphone",
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
"promptInput.voiceInput.error.title": "Voice input failed",
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",

View File

@@ -156,13 +156,18 @@ export const settingsMessages = {
"settings.speech.baseUrl.title": "Base URL",
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
"settings.speech.realtime.title": "Realtime dictation",
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
"settings.speech.realtimeModel.title": "Realtime model",
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
"settings.speech.sttModel.title": "Transcription model",
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
"settings.speech.ttsModel.title": "Speech model",
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
"settings.speech.ttsVoice.title": "Default voice",
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
"settings.speech.save.action": "Save",
"settings.speech.save.saving": "Saving...",
"settings.speech.save.saved": "Saved",

View File

@@ -34,6 +34,8 @@ export interface SpeechSettings {
provider: SpeechProviderPreference
apiKey?: string
baseUrl?: string
useRealtime: boolean
realtimeModel: string
sttModel: string
ttsModel: string
ttsVoice: string
@@ -136,6 +138,8 @@ const defaultUiSettings: UiSettings = {
const defaultSpeechSettings: SpeechSettings = {
provider: "openai-compatible",
useRealtime: true,
realtimeModel: "gpt-realtime",
sttModel: "gpt-4o-mini-transcribe",
ttsModel: "gpt-4o-mini-tts",
ttsVoice: "alloy",
@@ -184,6 +188,11 @@ function normalizeSpeechSettings(input?: Partial<SpeechSettings> | null): Speech
provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider,
apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined,
baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined,
useRealtime: sanitized.useRealtime ?? defaultSpeechSettings.useRealtime,
realtimeModel:
typeof sanitized.realtimeModel === "string" && sanitized.realtimeModel.trim()
? sanitized.realtimeModel.trim()
: defaultSpeechSettings.realtimeModel,
sttModel:
typeof sanitized.sttModel === "string" && sanitized.sttModel.trim()
? sanitized.sttModel.trim()