Add server-backed realtime transcription for prompt voice input and expose speech settings to choose realtime mode and models.
326 lines
9.6 KiB
TypeScript
326 lines
9.6 KiB
TypeScript
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
|
|
import { showAlertDialog } from "../../stores/alerts"
|
|
import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client"
|
|
import { useI18n } from "../../lib/i18n"
|
|
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
|
import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream"
|
|
import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion"
|
|
|
|
/**
 * Dependencies the realtime voice-input hook needs from the prompt component.
 */
interface UsePromptRealtimeVoiceInputOptions {
  /** Reads the current prompt text; sampled when a recording starts. */
  prompt: Accessor<string>
  /**
   * Replaces the prompt text. The hook passes `{ persistDraft: false }` while
   * rendering partial (not yet final) transcripts and omits the options otherwise.
   */
  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  /** Returns the prompt textarea, or `null` when it is not available. */
  getTextarea: () => HTMLTextAreaElement | null
  /** Whether voice input is enabled; required for `canUseVoiceInput()` to be true. */
  enabled: Accessor<boolean>
  /** When true, `toggleRecording()` refuses to start a new recording. */
  disabled: Accessor<boolean>
}
|
|
|
|
/**
 * Phases of a realtime voice capture:
 * - `idle`: nothing in progress (initial state and state after cleanup)
 * - `connecting`: a start was requested; session and audio stream are being set up
 * - `listening`: audio is being captured and streamed
 * - `finalizing`: capture stopped; waiting for the last transcript before closing
 */
type RealtimeVoiceState =
  | "idle"
  | "connecting"
  | "listening"
  | "finalizing"
|
|
|
|
/**
 * Upper bound, in milliseconds, on how long the hook waits after finalizing a
 * session before it closes the session on its own.
 */
const FINAL_TRANSCRIPT_TIMEOUT_MS = 10 * 1000
|
|
|
|
/**
 * SolidJS hook that drives server-backed realtime speech-to-text for the prompt
 * input.
 *
 * A recording creates a realtime speech session on the server, subscribes to
 * its event stream, streams microphone audio chunks to it and writes partial
 * and final transcripts into the prompt at the selection captured when the
 * recording started.
 *
 * @param options Prompt accessors and flags supplied by the prompt component.
 * @returns Reactive `state` and `elapsedMs` accessors, the `canUseVoiceInput`
 *   check, `toggleRecording` / `cancelRecording` actions, and the
 *   `isRecording`, `isTranscribing` and `buttonTitle` helpers for the UI.
 */
export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<RealtimeVoiceState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)

  let activeSessionId: string | null = null
  let eventSource: EventSource | null = null
  let pcmStream: RealtimePcmStreamHandle | null = null
  // Serializes audio uploads so chunks reach the server in capture order.
  let audioQueue: Promise<void> = Promise.resolve()
  let timerId: number | undefined
  let recordingStartedAt = 0
  let finalizeTimerId: number | undefined
  let anchor = createPromptVoiceAnchor("", 0, 0)
  let finalTranscript = ""
  let liveTranscript = ""
  let activeLiveItemId: string | null = null
  let closing = false
  // Bumped by every start and every teardown. Async continuations compare the
  // value they captured against it to detect that they have been superseded.
  let attemptId = 0

  createEffect(() => {
    void loadSpeechCapabilities()
  })

  onCleanup(() => {
    cancelRecording()
  })

  /** True when the runtime offers Web Audio, microphone capture and EventSource. */
  const isSupported = () => {
    if (typeof window === "undefined") return false
    const legacyWindow = window as Window & { webkitAudioContext?: typeof AudioContext }
    const hasAudioContext = Boolean(window.AudioContext || legacyWindow.webkitAudioContext)
    const hasMicrophone = Boolean(navigator.mediaDevices?.getUserMedia)
    return hasAudioContext && hasMicrophone && typeof EventSource !== "undefined"
  }

  /** True when voice input is enabled, supported here and offered by the server. */
  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt &&
        capabilities?.supportsRealtimeTranscription,
    )
  }

  /**
   * Starts a recording when idle, or stops the one in progress. Does nothing
   * while finalizing, while `options.disabled()` is true, or when voice input
   * is unavailable. Start failures revert the prompt and show an error dialog.
   */
  async function toggleRecording(): Promise<void> {
    const phase = state()
    if (phase === "listening" || phase === "connecting") {
      await stopRecording()
      return
    }

    if (!canUseVoiceInput() || options.disabled() || phase === "finalizing") return

    try {
      await startRecording()
    } catch (error) {
      await cleanupSession({ revertPrompt: true, closeRemote: true })
      showAlertDialog(t("promptInput.voiceInput.error.connection"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }

  /**
   * Creates the server session, connects its event stream and starts streaming
   * microphone audio. Rejects when setup fails, unless the attempt was
   * cancelled in the meantime; then it releases what it created and resolves.
   */
  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }

    const attempt = ++attemptId
    const isStale = () => attempt !== attemptId

    resetTranscriptState()
    captureAnchor()
    setState("connecting")
    setElapsedMs(0)

    try {
      const created = await serverApi.createRealtimeSpeechSession({
        language: detectLanguage(),
      })
      if (isStale()) {
        // Cancelled while the request was in flight: no other code knows this
        // session id, so it has to be closed here.
        void serverApi.closeRealtimeSpeechSession(created.sessionId).catch(() => undefined)
        return
      }
      activeSessionId = created.sessionId
      connectEventStream(created.sessionId)

      const stream = await createRealtimePcmStream({
        onChunk: (audioBase64) => {
          const sessionId = activeSessionId
          if (!sessionId || closing || isStale()) return
          audioQueue = audioQueue
            .then(async () => {
              // Skip chunks that were queued for a session torn down since.
              if (activeSessionId !== sessionId) return
              await serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 })
            })
            .catch((error: unknown) => {
              if (!isStale()) handleRealtimeError(error)
            })
        },
      })
      if (isStale()) {
        // Cancelled while the audio stream was being set up: release it.
        await stream.stop().catch(() => undefined)
        return
      }

      pcmStream = stream
      recordingStartedAt = Date.now()
      startTimer()
      setState("listening")
    } catch (error) {
      // A failure of a superseded attempt must not disturb the current state.
      if (isStale()) return
      throw error
    }
  }

  /**
   * Stops capturing. While still connecting this cancels the attempt. While
   * listening it flushes queued audio, asks the server to finalize and arms a
   * fallback timer that closes the session if no final transcript arrives.
   */
  async function stopRecording() {
    const phase = state()
    if (phase === "connecting") {
      // Nothing has been recorded yet, so stopping is a cancel.
      await cleanupSession({ revertPrompt: true, closeRemote: true })
      return
    }

    const sessionId = activeSessionId
    if (!sessionId || phase !== "listening") return

    setState("finalizing")
    stopTimer()

    try {
      const stream = pcmStream
      pcmStream = null
      if (stream) await stream.stop()

      await audioQueue.catch(() => undefined)
      await serverApi.finalizeRealtimeSpeechSession(sessionId)

      // Arm the fallback only for the session that is still current, and do not
      // lengthen a shorter close already scheduled by an early final transcript.
      if (activeSessionId === sessionId && finalizeTimerId === undefined) {
        scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS)
      }
    } catch (error) {
      if (activeSessionId === sessionId) handleRealtimeError(error)
    }
  }

  /** Aborts any recording in progress and restores the prompt captured at its start. */
  function cancelRecording() {
    void cleanupSession({ revertPrompt: true, closeRemote: true })
  }

  /** Subscribes to the session's event stream, replacing any previous subscription. */
  function connectEventStream(sessionId: string) {
    eventSource?.close()
    eventSource = serverApi.connectRealtimeSpeechEvents(
      sessionId,
      (event) => {
        if (activeSessionId === sessionId) handleEvent(event)
      },
      () => {
        if (closing || activeSessionId !== sessionId) return
        handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection")))
      },
    )
  }

  /** Applies one event from the realtime session to the transcript and prompt. */
  function handleEvent(event: SpeechRealtimeEvent) {
    switch (event.type) {
      case "session.ready":
        return

      case "session.error":
        handleRealtimeError(new Error(event.message))
        return

      case "transcript.partial":
        activeLiveItemId = event.itemId
        liveTranscript = event.text
        renderPrompt(false)
        return

      case "transcript.final":
        activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId
        liveTranscript = ""
        finalTranscript = appendVoiceTranscript(finalTranscript, event.text)
        renderPrompt(true)
        if (state() === "finalizing") {
          // A final transcript arrived after stop: close shortly afterwards.
          scheduleFinalizeClose(250)
        }
        return

      case "session.closed":
        void cleanupSession({ revertPrompt: false, closeRemote: false })
        return
    }
  }

  /** Records the prompt text and textarea selection the transcript is inserted at. */
  function captureAnchor() {
    const textarea = options.getTextarea()
    const current = options.prompt()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    anchor = createPromptVoiceAnchor(current, start, end)
  }

  /**
   * Writes the final and live transcript into the prompt at the anchor.
   *
   * @param persistDraft When false, `setPrompt` is told not to persist the draft.
   */
  function renderPrompt(persistDraft: boolean) {
    const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(" ")
    const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted)
    options.setPrompt(value, persistDraft ? undefined : { persistDraft: false })
    syncTextareaCursor(cursor)
  }

  /** Focuses the textarea and moves the caret to `cursor` in a microtask. */
  function syncTextareaCursor(cursor: number) {
    if (!options.getTextarea()) return
    queueMicrotask(() => {
      // Look the element up again: it may be gone by the time the microtask runs.
      const next = options.getTextarea()
      if (!next) return
      next.focus()
      next.setSelectionRange(cursor, cursor)
    })
  }

  /** (Re)arms the timer that closes the session `delayMs` from now. */
  function scheduleFinalizeClose(delayMs: number) {
    if (finalizeTimerId !== undefined) {
      window.clearTimeout(finalizeTimerId)
    }
    finalizeTimerId = window.setTimeout(() => {
      void cleanupSession({ revertPrompt: false, closeRemote: true })
    }, delayMs)
  }

  /**
   * Tears down the current session and returns to `idle`.
   *
   * Does nothing when idle, so cancelling with no recording in progress (for
   * example on unmount) never writes to the prompt, and nothing when another
   * teardown is already running.
   *
   * @param cleanupOptions.revertPrompt Restore the prompt captured at start
   *   instead of keeping the transcript.
   * @param cleanupOptions.closeRemote Also ask the server to close the session.
   */
  async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) {
    if (closing || state() === "idle") return
    closing = true
    // Invalidate any start attempt that is still awaiting its setup.
    attemptId += 1

    try {
      if (finalizeTimerId !== undefined) {
        window.clearTimeout(finalizeTimerId)
        finalizeTimerId = undefined
      }
      stopTimer()

      const sessionId = activeSessionId
      activeSessionId = null

      eventSource?.close()
      eventSource = null

      const stream = pcmStream
      pcmStream = null
      if (stream) await stream.stop().catch(() => undefined)

      await audioQueue.catch(() => undefined)
      audioQueue = Promise.resolve()

      if (cleanupOptions.closeRemote && sessionId) {
        await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined)
      }

      if (cleanupOptions.revertPrompt) {
        options.setPrompt(anchor.prompt)
      } else {
        // Keep a pending partial when no final transcript ever arrived.
        if (!finalTranscript.trim() && liveTranscript.trim()) {
          finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript)
          liveTranscript = ""
        }
        if (finalTranscript.trim()) renderPrompt(true)
      }
    } finally {
      // Always leave the hook usable, even if updating the prompt threw.
      resetTranscriptState()
      setState("idle")
      setElapsedMs(0)
      closing = false
    }
  }

  /** Clears the accumulated transcript state. */
  function resetTranscriptState() {
    finalTranscript = ""
    liveTranscript = ""
    activeLiveItemId = null
  }

  /** Aborts the session, reverts the prompt and shows the error to the user. */
  function handleRealtimeError(error: unknown) {
    if (closing) return
    void cleanupSession({ revertPrompt: true, closeRemote: true })
    showAlertDialog(t("promptInput.voiceInput.error.connection"), {
      title: t("promptInput.voiceInput.error.title"),
      detail: error instanceof Error ? error.message : String(error),
      variant: "error",
    })
  }

  /** Starts updating `elapsedMs` every 250 ms from `recordingStartedAt`. */
  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }

  /** Stops the elapsed-time updates, if running. */
  function stopTimer() {
    if (timerId === undefined) return
    window.clearInterval(timerId)
    timerId = undefined
  }

  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "connecting" || state() === "listening",
    isTranscribing: () => state() === "finalizing",
    buttonTitle: () => {
      switch (state()) {
        case "connecting":
          return t("promptInput.voiceInput.connecting.title")
        case "listening":
          return t("promptInput.voiceInput.stop.title")
        case "finalizing":
          return t("promptInput.voiceInput.transcribing.title")
        default:
          return t("promptInput.voiceInput.start.title")
      }
    },
  }
}
|
|
|
|
/**
 * Derives the primary language subtag from the runtime locale.
 *
 * @returns The part of `navigator.language` before the first `-` (for example
 *   `"en"` for `"en-US"`), or `undefined` when there is no `navigator`, it
 *   exposes no string `language`, or the subtag is empty.
 */
function detectLanguage(): string | undefined {
  if (typeof navigator === "undefined") return undefined

  // Some runtimes expose a partial `navigator` without `language`; treat that
  // as "unknown" instead of throwing on `.split`.
  const locale: unknown = navigator.language
  if (typeof locale !== "string") return undefined

  const primary = locale.split("-")[0]?.trim()
  return primary ? primary : undefined
}
|