feat(speech): add realtime prompt dictation support
Add server-backed realtime transcription for prompt voice input and expose speech settings to choose realtime mode and models.
This commit is contained in:
@@ -0,0 +1,325 @@
|
||||
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
|
||||
import { showAlertDialog } from "../../stores/alerts"
|
||||
import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client"
|
||||
import { useI18n } from "../../lib/i18n"
|
||||
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
||||
import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream"
|
||||
import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion"
|
||||
|
||||
interface UsePromptRealtimeVoiceInputOptions {
|
||||
prompt: Accessor<string>
|
||||
setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
|
||||
getTextarea: () => HTMLTextAreaElement | null
|
||||
enabled: Accessor<boolean>
|
||||
disabled: Accessor<boolean>
|
||||
}
|
||||
|
||||
type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing"
|
||||
|
||||
const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000
|
||||
|
||||
export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) {
|
||||
const { t } = useI18n()
|
||||
const [state, setState] = createSignal<RealtimeVoiceState>("idle")
|
||||
const [elapsedMs, setElapsedMs] = createSignal(0)
|
||||
|
||||
let activeSessionId: string | null = null
|
||||
let eventSource: EventSource | null = null
|
||||
let pcmStream: RealtimePcmStreamHandle | null = null
|
||||
let audioQueue: Promise<void> = Promise.resolve()
|
||||
let timerId: number | undefined
|
||||
let recordingStartedAt = 0
|
||||
let finalizeTimerId: number | undefined
|
||||
let anchor = createPromptVoiceAnchor("", 0, 0)
|
||||
let finalTranscript = ""
|
||||
let liveTranscript = ""
|
||||
let activeLiveItemId: string | null = null
|
||||
let closing = false
|
||||
|
||||
createEffect(() => {
|
||||
void loadSpeechCapabilities()
|
||||
})
|
||||
|
||||
onCleanup(() => {
|
||||
cancelRecording()
|
||||
})
|
||||
|
||||
const isSupported = () => {
|
||||
if (typeof window === "undefined") return false
|
||||
return Boolean(window.AudioContext || (window as any).webkitAudioContext) && Boolean(navigator.mediaDevices?.getUserMedia) && typeof EventSource !== "undefined"
|
||||
}
|
||||
|
||||
const canUseVoiceInput = () => {
|
||||
const capabilities = speechCapabilities()
|
||||
return Boolean(
|
||||
options.enabled() &&
|
||||
isSupported() &&
|
||||
capabilities?.available &&
|
||||
capabilities?.configured &&
|
||||
capabilities?.supportsStt &&
|
||||
capabilities?.supportsRealtimeTranscription,
|
||||
)
|
||||
}
|
||||
|
||||
async function toggleRecording(): Promise<void> {
|
||||
if (state() === "listening" || state() === "connecting") {
|
||||
await stopRecording()
|
||||
return
|
||||
}
|
||||
|
||||
if (!canUseVoiceInput() || options.disabled() || state() === "finalizing") return
|
||||
|
||||
try {
|
||||
await startRecording()
|
||||
} catch (error) {
|
||||
await cleanupSession({ revertPrompt: true, closeRemote: true })
|
||||
showAlertDialog(t("promptInput.voiceInput.error.connection"), {
|
||||
title: t("promptInput.voiceInput.error.title"),
|
||||
detail: error instanceof Error ? error.message : String(error),
|
||||
variant: "error",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
async function startRecording() {
|
||||
if (!isSupported()) {
|
||||
showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
|
||||
title: t("promptInput.voiceInput.error.title"),
|
||||
variant: "error",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
resetTranscriptState()
|
||||
captureAnchor()
|
||||
setState("connecting")
|
||||
setElapsedMs(0)
|
||||
|
||||
const created = await serverApi.createRealtimeSpeechSession({
|
||||
language: detectLanguage(),
|
||||
})
|
||||
activeSessionId = created.sessionId
|
||||
connectEventStream(created.sessionId)
|
||||
|
||||
pcmStream = await createRealtimePcmStream({
|
||||
onChunk: (audioBase64) => {
|
||||
const sessionId = activeSessionId
|
||||
if (!sessionId || closing) return
|
||||
audioQueue = audioQueue
|
||||
.then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 }))
|
||||
.catch((error) => {
|
||||
handleRealtimeError(error)
|
||||
})
|
||||
},
|
||||
})
|
||||
|
||||
recordingStartedAt = Date.now()
|
||||
startTimer()
|
||||
setState("listening")
|
||||
}
|
||||
|
||||
async function stopRecording() {
|
||||
const sessionId = activeSessionId
|
||||
if (!sessionId || (state() !== "listening" && state() !== "connecting")) return
|
||||
|
||||
setState("finalizing")
|
||||
stopTimer()
|
||||
|
||||
if (pcmStream) {
|
||||
const stream = pcmStream
|
||||
pcmStream = null
|
||||
await stream.stop()
|
||||
}
|
||||
|
||||
try {
|
||||
await audioQueue.catch(() => undefined)
|
||||
await serverApi.finalizeRealtimeSpeechSession(sessionId)
|
||||
scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS)
|
||||
} catch (error) {
|
||||
handleRealtimeError(error)
|
||||
}
|
||||
}
|
||||
|
||||
function cancelRecording() {
|
||||
void cleanupSession({ revertPrompt: true, closeRemote: true })
|
||||
}
|
||||
|
||||
function connectEventStream(sessionId: string) {
|
||||
eventSource?.close()
|
||||
eventSource = serverApi.connectRealtimeSpeechEvents(
|
||||
sessionId,
|
||||
(event) => handleEvent(event),
|
||||
() => {
|
||||
if (closing) return
|
||||
handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection")))
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
function handleEvent(event: SpeechRealtimeEvent) {
|
||||
if (event.type === "session.ready") {
|
||||
return
|
||||
}
|
||||
|
||||
if (event.type === "session.error") {
|
||||
handleRealtimeError(new Error(event.message))
|
||||
return
|
||||
}
|
||||
|
||||
if (event.type === "transcript.partial") {
|
||||
activeLiveItemId = event.itemId
|
||||
liveTranscript = event.text
|
||||
renderPrompt(false)
|
||||
return
|
||||
}
|
||||
|
||||
if (event.type === "transcript.final") {
|
||||
activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId
|
||||
liveTranscript = ""
|
||||
finalTranscript = appendVoiceTranscript(finalTranscript, event.text)
|
||||
renderPrompt(true)
|
||||
if (state() === "finalizing") {
|
||||
scheduleFinalizeClose(250)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if (event.type === "session.closed") {
|
||||
void cleanupSession({ revertPrompt: false, closeRemote: false })
|
||||
}
|
||||
}
|
||||
|
||||
function captureAnchor() {
|
||||
const textarea = options.getTextarea()
|
||||
const current = options.prompt()
|
||||
const start = textarea ? textarea.selectionStart : current.length
|
||||
const end = textarea ? textarea.selectionEnd : current.length
|
||||
anchor = createPromptVoiceAnchor(current, start, end)
|
||||
}
|
||||
|
||||
function renderPrompt(persistDraft: boolean) {
|
||||
const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(finalTranscript && liveTranscript.trim() ? " " : "")
|
||||
const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted)
|
||||
options.setPrompt(value, persistDraft ? undefined : { persistDraft: false })
|
||||
syncTextareaCursor(cursor)
|
||||
}
|
||||
|
||||
function syncTextareaCursor(cursor: number) {
|
||||
const textarea = options.getTextarea()
|
||||
if (!textarea) return
|
||||
queueMicrotask(() => {
|
||||
const next = options.getTextarea()
|
||||
if (!next) return
|
||||
next.focus()
|
||||
next.setSelectionRange(cursor, cursor)
|
||||
})
|
||||
}
|
||||
|
||||
function scheduleFinalizeClose(delayMs: number) {
|
||||
if (finalizeTimerId !== undefined) {
|
||||
window.clearTimeout(finalizeTimerId)
|
||||
}
|
||||
finalizeTimerId = window.setTimeout(() => {
|
||||
void cleanupSession({ revertPrompt: false, closeRemote: true })
|
||||
}, delayMs)
|
||||
}
|
||||
|
||||
async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) {
|
||||
if (closing) return
|
||||
closing = true
|
||||
|
||||
if (finalizeTimerId !== undefined) {
|
||||
window.clearTimeout(finalizeTimerId)
|
||||
finalizeTimerId = undefined
|
||||
}
|
||||
|
||||
stopTimer()
|
||||
|
||||
const sessionId = activeSessionId
|
||||
activeSessionId = null
|
||||
|
||||
eventSource?.close()
|
||||
eventSource = null
|
||||
|
||||
if (pcmStream) {
|
||||
const stream = pcmStream
|
||||
pcmStream = null
|
||||
await stream.stop().catch(() => undefined)
|
||||
}
|
||||
|
||||
await audioQueue.catch(() => undefined)
|
||||
audioQueue = Promise.resolve()
|
||||
|
||||
if (cleanupOptions.closeRemote && sessionId) {
|
||||
await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined)
|
||||
}
|
||||
|
||||
if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) {
|
||||
finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript)
|
||||
liveTranscript = ""
|
||||
}
|
||||
|
||||
if (cleanupOptions.revertPrompt) {
|
||||
options.setPrompt(anchor.prompt)
|
||||
} else if (finalTranscript.trim()) {
|
||||
renderPrompt(true)
|
||||
}
|
||||
|
||||
resetTranscriptState()
|
||||
setState("idle")
|
||||
setElapsedMs(0)
|
||||
closing = false
|
||||
}
|
||||
|
||||
function resetTranscriptState() {
|
||||
finalTranscript = ""
|
||||
liveTranscript = ""
|
||||
activeLiveItemId = null
|
||||
}
|
||||
|
||||
function handleRealtimeError(error: unknown) {
|
||||
if (closing) return
|
||||
void cleanupSession({ revertPrompt: true, closeRemote: true })
|
||||
showAlertDialog(t("promptInput.voiceInput.error.connection"), {
|
||||
title: t("promptInput.voiceInput.error.title"),
|
||||
detail: error instanceof Error ? error.message : String(error),
|
||||
variant: "error",
|
||||
})
|
||||
}
|
||||
|
||||
function startTimer() {
|
||||
stopTimer()
|
||||
timerId = window.setInterval(() => {
|
||||
setElapsedMs(Date.now() - recordingStartedAt)
|
||||
}, 250)
|
||||
}
|
||||
|
||||
function stopTimer() {
|
||||
if (timerId !== undefined) {
|
||||
window.clearInterval(timerId)
|
||||
timerId = undefined
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
state,
|
||||
elapsedMs,
|
||||
canUseVoiceInput,
|
||||
toggleRecording,
|
||||
cancelRecording,
|
||||
isRecording: () => state() === "connecting" || state() === "listening",
|
||||
isTranscribing: () => state() === "finalizing",
|
||||
buttonTitle: () => {
|
||||
if (state() === "connecting") return t("promptInput.voiceInput.connecting.title")
|
||||
if (state() === "listening") return t("promptInput.voiceInput.stop.title")
|
||||
if (state() === "finalizing") return t("promptInput.voiceInput.transcribing.title")
|
||||
return t("promptInput.voiceInput.start.title")
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
function detectLanguage(): string | undefined {
|
||||
if (typeof navigator === "undefined") return undefined
|
||||
const [language] = navigator.language.split("-")
|
||||
return language?.trim() || undefined
|
||||
}
|
||||
Reference in New Issue
Block a user