Files
CodeNomad/packages/ui/src/lib/hooks/use-speech.ts
2026-03-26 22:59:30 +00:00

417 lines
12 KiB
TypeScript

import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
import { showAlertDialog } from "../../stores/alerts"
import { serverApi } from "../api-client"
import { useI18n } from "../i18n"
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
import { useConfig, type SpeechSettings } from "../../stores/preferences"
import { formatToMimeType, getSpeechPlaybackSupport } from "../speech-playback-support"
type SpeechPlaybackState = "idle" | "loading" | "playing"
interface UseSpeechOptions {
id: Accessor<string>
text: Accessor<string>
settingsOverride?: Accessor<Partial<Pick<SpeechSettings, "playbackMode" | "ttsFormat">>>
}
interface ActivePlaybackEntry {
ownerId: string
stop: () => void
}
const stateResetters = new Map<string, () => void>()
let activePlayback: ActivePlaybackEntry | null = null
function resetOwnerState(ownerId: string) {
stateResetters.get(ownerId)?.()
}
function stopActivePlayback(ownerId?: string) {
if (!activePlayback) return
if (ownerId && activePlayback.ownerId !== ownerId) return
const current = activePlayback
activePlayback = null
current.stop()
}
function setActivePlayback(ownerId: string, stop: () => void) {
if (activePlayback?.ownerId === ownerId) {
activePlayback = { ownerId, stop }
return
}
stopActivePlayback()
activePlayback = { ownerId, stop }
}
export function useSpeech(options: UseSpeechOptions) {
const { t } = useI18n()
const { serverSettings } = useConfig()
const [state, setState] = createSignal<SpeechPlaybackState>("idle")
let requestVersion = 0
let audio: HTMLAudioElement | null = null
let objectUrl: string | null = null
let mediaSource: MediaSource | null = null
let abortController: AbortController | null = null
createEffect(() => {
void loadSpeechCapabilities()
})
const cleanupAudio = () => {
if (abortController) {
abortController.abort()
abortController = null
}
if (audio) {
audio.pause()
audio.currentTime = 0
audio.src = ""
audio.load()
audio = null
}
mediaSource = null
if (objectUrl) {
URL.revokeObjectURL(objectUrl)
objectUrl = null
}
}
const resetState = () => {
requestVersion += 1
cleanupAudio()
setState("idle")
}
stateResetters.set(options.id(), resetState)
onCleanup(() => {
stateResetters.delete(options.id())
stopActivePlayback(options.id())
resetState()
})
const isSupported = () => typeof window !== "undefined" && typeof window.Audio !== "undefined"
const resolvedSettings = () => ({
...serverSettings().speech,
...(options.settingsOverride?.() ?? {}),
})
const canUseSpeech = () => {
const capabilities = speechCapabilities()
if (!isSupported() || !capabilities?.available || !capabilities?.configured || !capabilities?.supportsTts) {
return false
}
return getSpeechPlaybackSupport({
playbackMode: resolvedSettings().playbackMode,
ttsFormat: resolvedSettings().ttsFormat,
capabilities,
}).available
}
const stop = () => {
if (activePlayback?.ownerId === options.id()) {
activePlayback = null
}
resetState()
}
const start = async () => {
const ownerId = options.id()
const text = options.text().trim()
if (!text || state() === "loading" || state() === "playing") return
if (!isSupported()) {
showAlertDialog(t("messageItem.actions.speak.error.unsupported"), {
title: t("messageItem.actions.speak.error.title"),
variant: "error",
})
return
}
const capabilities = (await loadSpeechCapabilities()) ?? speechCapabilities()
if (!capabilities?.available || !capabilities?.configured || !capabilities?.supportsTts) {
showAlertDialog(t("messageItem.actions.speak.error.unavailable"), {
title: t("messageItem.actions.speak.error.title"),
variant: "error",
})
return
}
const support = getSpeechPlaybackSupport({
playbackMode: resolvedSettings().playbackMode,
ttsFormat: resolvedSettings().ttsFormat,
capabilities,
})
if (!support.available) {
const detailKey =
support.reason === "provider-streaming-unavailable"
? "settings.speech.compatibility.streamingUnavailable"
: support.reason === "browser-streaming-unavailable"
? "settings.speech.compatibility.browserStreamingUnavailable"
: "messageItem.actions.speak.error.unsupported"
showAlertDialog(t("messageItem.actions.speak.error.unavailable"), {
title: t("messageItem.actions.speak.error.title"),
detail: t(detailKey),
variant: "error",
})
return
}
requestVersion += 1
const currentRequest = requestVersion
stopActivePlayback()
cleanupAudio()
setState("loading")
const settings = resolvedSettings()
const format = settings.ttsFormat
try {
if (settings.playbackMode === "streaming") {
await startStreamingPlayback(ownerId, currentRequest, text, format)
} else {
await startBufferedPlayback(ownerId, currentRequest, text, format)
}
} catch (error) {
if (currentRequest !== requestVersion) {
return
}
resetState()
showAlertDialog(t("messageItem.actions.speak.error.generate"), {
title: t("messageItem.actions.speak.error.title"),
detail: error instanceof Error ? error.message : String(error),
variant: "error",
})
}
}
async function startBufferedPlayback(
ownerId: string,
currentRequest: number,
text: string,
format: "mp3" | "wav" | "opus" | "aac",
) {
const response = await serverApi.synthesizeSpeech({ text, format })
if (currentRequest !== requestVersion) {
return
}
const nextUrl = createObjectUrlFromBase64(response.audioBase64, response.mimeType)
const nextAudio = new Audio(nextUrl)
objectUrl = nextUrl
audio = nextAudio
attachPlaybackLifecycle(ownerId, nextAudio)
setActivePlayback(ownerId, () => {
cleanupAudio()
setState("idle")
})
setState("playing")
await nextAudio.play()
}
async function startStreamingPlayback(
ownerId: string,
currentRequest: number,
text: string,
format: "mp3" | "wav" | "opus" | "aac",
) {
if (typeof MediaSource === "undefined") {
throw new Error("MediaSource is not available in this browser.")
}
const controller = new AbortController()
abortController = controller
const response = await serverApi.synthesizeSpeechStream({ text, format }, controller.signal)
const mimeType = response.headers.get("content-type") || formatToMimeType(format)
if (!MediaSource.isTypeSupported(mimeType)) {
throw new Error(`Streaming playback is not supported for ${mimeType}.`)
}
const stream = response.body
if (!stream) {
throw new Error("Speech stream did not include a response body.")
}
const nextMediaSource = new MediaSource()
const nextObjectUrl = URL.createObjectURL(nextMediaSource)
const nextAudio = new Audio(nextObjectUrl)
mediaSource = nextMediaSource
objectUrl = nextObjectUrl
audio = nextAudio
attachPlaybackLifecycle(ownerId, nextAudio)
setActivePlayback(ownerId, () => {
cleanupAudio()
setState("idle")
})
await new Promise<void>((resolve, reject) => {
const handleSourceOpen = () => {
nextMediaSource.removeEventListener("sourceopen", handleSourceOpen)
void streamToMediaSource({
mediaSource: nextMediaSource,
stream,
mimeType,
audioElement: nextAudio,
onPlayable: async () => {
if (currentRequest !== requestVersion) return
if (state() !== "playing") {
setState("playing")
}
try {
await nextAudio.play()
} catch (error) {
reject(error)
}
},
onComplete: resolve,
onError: reject,
})
}
nextMediaSource.addEventListener("sourceopen", handleSourceOpen, { once: true })
nextAudio.addEventListener(
"error",
() => reject(new Error("Unable to play streamed speech.")),
{ once: true },
)
})
}
const toggle = async () => {
if (state() === "idle") {
await start()
return
}
stop()
}
return {
state,
canUseSpeech,
isLoading: () => state() === "loading",
isPlaying: () => state() === "playing",
toggle,
stop,
buttonTitle: () => {
if (state() === "loading") return t("messageItem.actions.generatingSpeech")
if (state() === "playing") return t("messageItem.actions.stopSpeech")
return t("messageItem.actions.speak")
},
}
}
function attachPlaybackLifecycle(ownerId: string, audio: HTMLAudioElement) {
const finish = () => {
if (activePlayback?.ownerId === ownerId) {
activePlayback = null
}
resetOwnerState(ownerId)
}
audio.addEventListener("ended", finish, { once: true })
audio.addEventListener("error", finish, { once: true })
}
async function streamToMediaSource(options: {
mediaSource: MediaSource
stream: ReadableStream<Uint8Array>
mimeType: string
audioElement: HTMLAudioElement
onPlayable: () => Promise<void>
onComplete: () => void
onError: (error: unknown) => void
}) {
try {
const sourceBuffer = options.mediaSource.addSourceBuffer(options.mimeType)
const reader = options.stream.getReader()
let startedPlayback = false
let queue: Uint8Array[] = []
let processing = false
const flushQueue = async () => {
if (processing || sourceBuffer.updating || queue.length === 0) return
processing = true
const chunk = queue.shift()!
await appendChunk(sourceBuffer, chunk)
if (!startedPlayback) {
startedPlayback = true
await options.onPlayable()
}
processing = false
await flushQueue()
}
while (true) {
const { done, value } = await reader.read()
if (done) break
if (value && value.byteLength > 0) {
queue.push(value)
await flushQueue()
}
}
while (queue.length > 0 || sourceBuffer.updating) {
if (queue.length > 0) {
await flushQueue()
} else {
await waitForUpdateEnd(sourceBuffer)
}
}
if (options.mediaSource.readyState === "open") {
options.mediaSource.endOfStream()
}
options.onComplete()
} catch (error) {
options.onError(error)
}
}
function appendChunk(sourceBuffer: SourceBuffer, chunk: Uint8Array): Promise<void> {
return new Promise((resolve, reject) => {
const handleUpdateEnd = () => {
cleanup()
resolve()
}
const handleError = () => {
cleanup()
reject(new Error("Failed to append audio stream chunk."))
}
const cleanup = () => {
sourceBuffer.removeEventListener("updateend", handleUpdateEnd)
sourceBuffer.removeEventListener("error", handleError)
}
sourceBuffer.addEventListener("updateend", handleUpdateEnd, { once: true })
sourceBuffer.addEventListener("error", handleError, { once: true })
sourceBuffer.appendBuffer(new Uint8Array(chunk).buffer)
})
}
function waitForUpdateEnd(sourceBuffer: SourceBuffer): Promise<void> {
return new Promise((resolve) => {
sourceBuffer.addEventListener("updateend", () => resolve(), { once: true })
})
}
function createObjectUrlFromBase64(audioBase64: string, mimeType: string): string {
const binary = atob(audioBase64)
const bytes = new Uint8Array(binary.length)
for (let index = 0; index < binary.length; index += 1) {
bytes[index] = binary.charCodeAt(index)
}
return URL.createObjectURL(new Blob([bytes], { type: mimeType || "audio/mpeg" }))
}