import { createSignal } from "solid-js"
import { tGlobal } from "../lib/i18n"
import { showToastNotification } from "../lib/notifications"
import { serverApi } from "../lib/api-client"
import { getLogger } from "../lib/logger"
import { formatToMimeType, getSpeechPlaybackSupport } from "../lib/speech-playback-support"
import { serverSettings } from "./preferences"
import { loadSpeechCapabilities, speechCapabilities } from "./speech"
import { getActiveSession, sessions } from "./session-state"
import type { ClientPart, MessageInfo } from "../types/message"
import { messageStoreBus } from "./message-v2/bus"
import { activeInstanceId } from "./instances"

type SpeechPlaybackMode = "streaming" | "buffered"
type SpeechTtsFormat = "mp3" | "wav" | "opus" | "aac"

/** One text part waiting to be spoken. `key` is the value produced by `getEntryKey`. */
interface ConversationQueueEntry {
  key: string
  instanceId: string
  sessionId: string
  messageId: string
  partId: string
  text: string
}

/**
 * Handle to a playback that has already been started.
 * `stop` ends playback early and resolves `done`; `done` resolves when playback
 * ends and rejects when the audio element (or the audio stream) fails.
 */
interface PlaybackHandle {
  stop: () => void
  done: Promise<void>
}

const log = getLogger("actions")

// Instance ids that currently have conversation mode switched on.
const [conversationModeInstances, setConversationModeInstances] = createSignal<Map<string, boolean>>(new Map())
// Keys of entries that are sitting in `queue` right now.
const queuedKeys = new Set<string>()
// Keys that were already handed to playback, grouped by `${instanceId}:${sessionId}`.
const spokenKeysBySession = new Map<string, Set<string>>()

let queue: ConversationQueueEntry[] = []
let currentPlayback:
  | {
      entry: ConversationQueueEntry
      handle: PlaybackHandle
    }
  | null = null
let queueRunner: Promise<void> | null = null
// Limits the error toast to one per queue drain; reset once the queue is empty.
let playbackErrorShown = false

/** Builds the de-duplication key for a single message part. */
function getEntryKey(instanceId: string, sessionId: string, messageId: string, partId: string): string {
  return `${instanceId}:${sessionId}:${messageId}:${partId}`
}

/** Returns the set of already-spoken keys for a session, creating it on first use. */
function getSpokenKeySet(instanceId: string, sessionId: string): Set<string> {
  const sessionKey = `${instanceId}:${sessionId}`
  const existing = spokenKeysBySession.get(sessionKey)
  if (existing) return existing
  const next = new Set<string>()
  spokenKeysBySession.set(sessionKey, next)
  return next
}

/**
 * Extracts the plain text of a text part.
 *
 * `part.text` may be a string or an object carrying `text`, `value` and/or a
 * `content` array of strings / `{ text, value }` objects; every string found is
 * collected and joined with newlines. Returns "" for non-text parts or when no
 * text can be found.
 */
function resolveTextPartContent(part: ClientPart): string {
  if (part.type !== "text") return ""
  if (typeof part.text === "string") {
    return part.text
  }
  if (part.text && typeof part.text === "object") {
    const value = part.text as { text?: unknown; value?: unknown; content?: unknown[] }
    const segments: string[] = []
    if (typeof value.text === "string") {
      segments.push(value.text)
    }
    if (typeof value.value === "string") {
      segments.push(value.value)
    }
    if (Array.isArray(value.content)) {
      for (const segment of value.content) {
        if (typeof segment === "string") {
          segments.push(segment)
        } else if (segment && typeof segment === "object") {
          const typedSegment = segment as { text?: unknown; value?: unknown }
          if (typeof typedSegment.text === "string") segments.push(typedSegment.text)
          if (typeof typedSegment.value === "string") segments.push(typedSegment.value)
        }
      }
    }
    return segments.join("\n")
  }
  return ""
}

/** Whether conversation mode is switched on for the given instance. */
export function isConversationModeEnabled(instanceId: string): boolean {
  return conversationModeInstances().get(instanceId) === true
}

/**
 * Whether conversation mode can be offered: speech must be available,
 * configured and TTS-capable, and the configured playback mode / format must be
 * reported as available by `getSpeechPlaybackSupport`.
 */
export function canUseConversationMode(): boolean {
  const capabilities = speechCapabilities()
  if (!capabilities?.available || !capabilities.configured || !capabilities.supportsTts) {
    return false
  }

  const settings = serverSettings().speech
  return getSpeechPlaybackSupport({
    playbackMode: settings.playbackMode,
    ttsFormat: settings.ttsFormat,
    capabilities,
  }).available
}

/**
 * Switches conversation mode on or off for an instance. Switching it off also
 * drops that instance's queued entries and stops its current playback.
 */
export function setConversationModeEnabled(instanceId: string, enabled: boolean): void {
  setConversationModeInstances((prev) => {
    // Copy so the signal receives a new Map instance.
    const next = new Map(prev)
    if (enabled) {
      next.set(instanceId, true)
    } else {
      next.delete(instanceId)
    }
    return next
  })

  if (!enabled) {
    clearConversationPlaybackForInstance(instanceId)
  }
}

/** Flips conversation mode for an instance. */
export function toggleConversationMode(instanceId: string): void {
  setConversationModeEnabled(instanceId, !isConversationModeEnabled(instanceId))
}

/** Removes queued entries for one session and stops its playback if it is the one playing. */
export function clearConversationPlaybackForSession(instanceId: string, sessionId: string): void {
  const sessionKey = `${instanceId}:${sessionId}`
  queue = queue.filter((entry) => {
    if (`${entry.instanceId}:${entry.sessionId}` === sessionKey) {
      queuedKeys.delete(entry.key)
      return false
    }
    return true
  })

  if (currentPlayback && `${currentPlayback.entry.instanceId}:${currentPlayback.entry.sessionId}` === sessionKey) {
    currentPlayback.handle.stop()
    currentPlayback = null
  }
}

/** Removes queued entries for a whole instance and stops its playback if it is the one playing. */
export function clearConversationPlaybackForInstance(instanceId: string): void {
  queue = queue.filter((entry) => {
    if (entry.instanceId === instanceId) {
      queuedKeys.delete(entry.key)
      return false
    }
    return true
  })

  if (currentPlayback?.entry.instanceId === instanceId) {
    currentPlayback.handle.stop()
    currentPlayback = null
  }
}

/**
 * A session is speakable only when its instance is the active instance, it is
 * that instance's active session, and it has no `parentId`.
 */
function isSpeakableSession(instanceId: string, sessionId: string): boolean {
  if (activeInstanceId() !== instanceId) {
    return false
  }

  const activeSession = getActiveSession(instanceId)
  if (!activeSession || activeSession.id !== sessionId) {
    return false
  }

  const session = sessions().get(instanceId)?.get(sessionId) ?? activeSession
  return !session?.parentId
}

/**
 * Queues an assistant text part for speech playback.
 *
 * The part is ignored unless it is a text part with resolvable session, message
 * and part ids, belongs to an assistant message, conversation mode is on for
 * the instance, the session is speakable and the trimmed text is non-empty.
 * Entries are keyed per part: once a part has been queued, is playing, or has
 * been spoken, later updates for the same part are ignored.
 *
 * @param instanceId Instance the part belongs to.
 * @param part The updated message part.
 * @param messageInfo Optional message metadata; used as a fallback for the
 *   session id, message id and role.
 */
export function handleConversationAssistantPartUpdated(
  instanceId: string,
  part: ClientPart,
  messageInfo?: MessageInfo,
): void {
  if (part.type !== "text") return

  const sessionId = typeof part.sessionID === "string" ? part.sessionID : messageInfo?.sessionID
  const messageId = typeof part.messageID === "string" ? part.messageID : messageInfo?.id
  const partId = typeof part.id === "string" ? part.id : undefined
  if (!sessionId || !messageId || !partId) return

  // Fall back to the message store when the caller did not pass message info.
  const messageRole =
    messageInfo?.role ?? messageStoreBus.getOrCreate(instanceId).getMessage(messageId)?.role ?? null
  if (messageRole !== "assistant") return
  if (!isConversationModeEnabled(instanceId)) return
  if (!isSpeakableSession(instanceId, sessionId)) return

  const text = resolveTextPartContent(part).trim()
  if (!text) return

  const key = getEntryKey(instanceId, sessionId, messageId, partId)
  const spokenKeys = getSpokenKeySet(instanceId, sessionId)
  if (spokenKeys.has(key) || queuedKeys.has(key) || currentPlayback?.entry.key === key) {
    return
  }

  queuedKeys.add(key)
  queue.push({ key, instanceId, sessionId, messageId, partId, text })
  void runConversationQueue()
}

/**
 * Drains the queue one entry at a time. Only one drain loop runs at once;
 * concurrent callers simply wait for the running one.
 *
 * On a playback failure the entry is un-marked as spoken, the instance's queue
 * and playback are cleared, an error toast is shown (once per drain) and the
 * loop stops.
 */
async function runConversationQueue(): Promise<void> {
  if (queueRunner) {
    await queueRunner
    return
  }

  queueRunner = (async () => {
    while (queue.length > 0) {
      const entry = queue.shift()
      if (!entry) break
      queuedKeys.delete(entry.key)

      if (!isConversationModeEnabled(entry.instanceId)) {
        continue
      }
      if (!isSpeakableSession(entry.instanceId, entry.sessionId)) {
        continue
      }

      const spokenKeys = getSpokenKeySet(entry.instanceId, entry.sessionId)
      spokenKeys.add(entry.key)

      try {
        const handle = await createPlaybackHandle(entry.text)

        // Creating the handle is asynchronous and `currentPlayback` is not set
        // while it is pending, so the clear* functions cannot stop this entry
        // during that window. Re-check here so audio does not keep playing after
        // conversation mode was switched off or the active session changed.
        if (
          !isConversationModeEnabled(entry.instanceId) ||
          !isSpeakableSession(entry.instanceId, entry.sessionId)
        ) {
          handle.stop()
          // Treat it like an entry skipped before synthesis: not spoken.
          spokenKeys.delete(entry.key)
          continue
        }

        currentPlayback = { entry, handle }
        await handle.done
      } catch (error) {
        spokenKeys.delete(entry.key)
        clearConversationPlaybackForInstance(entry.instanceId)
        if (!playbackErrorShown) {
          playbackErrorShown = true
          showToastNotification({
            title: tGlobal("promptInput.conversationMode.error.title"),
            message:
              error instanceof Error && error.message
                ? error.message
                : tGlobal("promptInput.conversationMode.error.message"),
            variant: "error",
          })
        }
        log.error("Conversation playback failed", error)
        break
      } finally {
        if (currentPlayback?.entry.key === entry.key) {
          currentPlayback = null
        }
      }
    }
  })()

  try {
    await queueRunner
  } finally {
    queueRunner = null
    if (queue.length === 0) {
      playbackErrorShown = false
    }
  }
}

/**
 * Validates speech capabilities against the current settings and starts
 * playback of `text` in the configured mode (streaming or buffered).
 *
 * @throws Error with a localized message when speech is unavailable or the
 *   configured playback mode / format is not supported.
 */
async function createPlaybackHandle(text: string): Promise<PlaybackHandle> {
  const capabilities = (await loadSpeechCapabilities()) ?? speechCapabilities()
  const settings = serverSettings().speech

  if (!capabilities?.available || !capabilities.configured || !capabilities.supportsTts) {
    throw new Error(tGlobal("messageItem.actions.speak.error.unavailable"))
  }

  const support = getSpeechPlaybackSupport({
    playbackMode: settings.playbackMode,
    ttsFormat: settings.ttsFormat,
    capabilities,
  })
  if (!support.available) {
    if (support.reason === "provider-streaming-unavailable") {
      throw new Error(tGlobal("settings.speech.compatibility.streamingUnavailable"))
    }
    if (support.reason === "browser-streaming-unavailable") {
      throw new Error(tGlobal("settings.speech.compatibility.browserStreamingUnavailable"))
    }
    throw new Error(tGlobal("messageItem.actions.speak.error.unsupported"))
  }

  return settings.playbackMode === "streaming"
    ? createStreamingPlaybackHandle(text, settings.ttsFormat)
    : createBufferedPlaybackHandle(text, settings.ttsFormat)
}

/**
 * Synthesizes the whole clip up front, then plays it from an object URL.
 * Resolves once playback has started.
 *
 * @throws Whatever `serverApi.synthesizeSpeech` or `audio.play()` rejects with.
 */
async function createBufferedPlaybackHandle(text: string, format: SpeechTtsFormat): Promise<PlaybackHandle> {
  const response = await serverApi.synthesizeSpeech({ text, format })
  const objectUrl = createObjectUrlFromBase64(response.audioBase64, response.mimeType)
  const audio = new Audio(objectUrl)
  let settled = false
  let resolveDone!: () => void
  let rejectDone!: (error: unknown) => void

  const cleanup = () => {
    audio.pause()
    audio.src = ""
    audio.load()
    URL.revokeObjectURL(objectUrl)
  }

  const done = new Promise<void>((resolve, reject) => {
    resolveDone = () => {
      if (settled) return
      settled = true
      cleanup()
      resolve()
    }
    rejectDone = (error) => {
      if (settled) return
      settled = true
      cleanup()
      reject(error)
    }
  })
  // `done` can reject before the handle reaches a caller (an "error" event while
  // play() is still pending). Marking it handled avoids an unhandled-rejection
  // report; anyone awaiting `done` still observes the rejection.
  done.catch(() => undefined)

  audio.addEventListener("ended", () => resolveDone(), { once: true })
  audio.addEventListener("error", () => rejectDone(new Error(tGlobal("messageItem.actions.speak.error.generate"))), {
    once: true,
  })

  try {
    await audio.play()
  } catch (error) {
    // No handle is returned on this path, so nobody else could release the
    // object URL and the audio element. Settling also runs cleanup().
    resolveDone()
    throw error
  }

  return {
    stop: () => resolveDone(),
    done,
  }
}

/**
 * Streams synthesized audio into a MediaSource and starts playing as soon as
 * the first chunk has been appended. Resolves once playback has started.
 *
 * @throws Error with a localized message when MediaSource is missing, the
 *   response has no body or the MIME type is unsupported; otherwise whatever
 *   the request, the stream or `audio.play()` fails with.
 */
async function createStreamingPlaybackHandle(text: string, format: SpeechTtsFormat): Promise<PlaybackHandle> {
  if (typeof MediaSource === "undefined") {
    throw new Error(tGlobal("messageItem.actions.speak.error.unsupported"))
  }

  const abortController = new AbortController()
  const response = await serverApi.synthesizeSpeechStream({ text, format }, abortController.signal)
  const mimeType = response.headers.get("content-type") || formatToMimeType(format)
  const stream = response.body
  if (!stream) {
    throw new Error(tGlobal("messageItem.actions.speak.error.generate"))
  }
  if (!MediaSource.isTypeSupported(mimeType)) {
    // The response body will never be read; abort so the request is not left
    // open (assumes the API client forwards the signal to the underlying fetch).
    abortController.abort()
    throw new Error(tGlobal("settings.speech.compatibility.browserStreamingUnavailable"))
  }

  const mediaSource = new MediaSource()
  const objectUrl = URL.createObjectURL(mediaSource)
  const audio = new Audio(objectUrl)
  let settled = false
  let startedPlayback = false
  let resolveDone!: () => void
  let rejectDone!: (error: unknown) => void

  const cleanup = () => {
    abortController.abort()
    audio.pause()
    audio.src = ""
    audio.load()
    URL.revokeObjectURL(objectUrl)
  }

  const done = new Promise<void>((resolve, reject) => {
    resolveDone = () => {
      if (settled) return
      settled = true
      cleanup()
      resolve()
    }
    rejectDone = (error) => {
      if (settled) return
      settled = true
      cleanup()
      reject(error)
    }
  })
  // See createBufferedPlaybackHandle: `done` may reject before any caller holds it.
  done.catch(() => undefined)

  audio.addEventListener("ended", () => resolveDone(), { once: true })
  audio.addEventListener("error", () => rejectDone(new Error(tGlobal("messageItem.actions.speak.error.generate"))), {
    once: true,
  })

  try {
    await new Promise<void>((resolve, reject) => {
      mediaSource.addEventListener(
        "sourceopen",
        () => {
          void streamToMediaSource({
            mediaSource,
            stream,
            mimeType,
            onPlayable: async () => {
              if (startedPlayback) return
              startedPlayback = true
              try {
                await audio.play()
                resolve()
              } catch (error) {
                reject(error)
              }
            },
            onError: (error) => {
              // Before playback started this fails the start-up promise.
              reject(error)
              // After playback started the start-up promise is already settled,
              // so the failure must go to `done`; otherwise the audio stalls
              // without an "ended" event and the queue waits forever.
              // No-op once stop() or "ended" has settled the handle.
              rejectDone(error)
            },
          })
        },
        { once: true },
      )
    })
  } catch (error) {
    // Start-up failed and no handle is returned: abort the request and release
    // the object URL / audio element (no-op if `done` was already settled).
    resolveDone()
    throw error
  }

  return {
    stop: () => resolveDone(),
    done,
  }
}

/**
 * Pumps `stream` into a new SourceBuffer on `mediaSource`, one chunk at a time.
 *
 * `onPlayable` is awaited once, right after the first chunk was appended.
 * When the stream ends the media source is closed with `endOfStream()`.
 * Failures are reported through `onError`; the returned promise never rejects.
 */
async function streamToMediaSource(options: {
  mediaSource: MediaSource
  stream: ReadableStream<Uint8Array>
  mimeType: string
  onPlayable: () => Promise<void>
  onError: (error: unknown) => void
}): Promise<void> {
  let reader: ReadableStreamDefaultReader<Uint8Array> | undefined
  try {
    const sourceBuffer = options.mediaSource.addSourceBuffer(options.mimeType)
    reader = options.stream.getReader()
    let playbackStarted = false

    // Each append is awaited before the next read, so chunks are appended
    // strictly in order and the buffer is never mid-update when we append.
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      if (!value || value.byteLength === 0) continue

      await appendChunk(sourceBuffer, value)
      if (!playbackStarted) {
        playbackStarted = true
        await options.onPlayable()
      }
    }

    // endOfStream() must not be called while the buffer is still updating.
    if (sourceBuffer.updating) {
      await waitForUpdateEnd(sourceBuffer)
    }
    if (options.mediaSource.readyState === "open") {
      options.mediaSource.endOfStream()
    }
  } catch (error) {
    // Stop pulling data nobody will consume; cancel() may itself reject when
    // the stream is already errored or aborted, which is fine to ignore here.
    void reader?.cancel().catch(() => undefined)
    options.onError(error)
  }
}

/**
 * Appends one chunk to the source buffer and resolves on "updateend".
 * Rejects on the buffer's "error" event or when `appendBuffer` throws.
 */
function appendChunk(sourceBuffer: SourceBuffer, chunk: Uint8Array): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const cleanup = () => {
      sourceBuffer.removeEventListener("updateend", handleUpdateEnd)
      sourceBuffer.removeEventListener("error", handleError)
    }
    const handleUpdateEnd = () => {
      cleanup()
      resolve()
    }
    const handleError = () => {
      cleanup()
      reject(new Error(tGlobal("messageItem.actions.speak.error.generate")))
    }

    sourceBuffer.addEventListener("updateend", handleUpdateEnd, { once: true })
    sourceBuffer.addEventListener("error", handleError, { once: true })
    try {
      // Copy the chunk into a fresh buffer before handing it to appendBuffer.
      sourceBuffer.appendBuffer(new Uint8Array(chunk).buffer)
    } catch (error) {
      // appendBuffer can throw synchronously; without this the two listeners
      // would stay attached to the buffer.
      cleanup()
      reject(error)
    }
  })
}

/** Resolves on the buffer's next "updateend" event. Only call while `sourceBuffer.updating` is true. */
function waitForUpdateEnd(sourceBuffer: SourceBuffer): Promise<void> {
  return new Promise<void>((resolve) => {
    sourceBuffer.addEventListener("updateend", () => resolve(), { once: true })
  })
}

/**
 * Decodes base64 audio into a Blob and returns an object URL for it.
 * Falls back to "audio/mpeg" when `mimeType` is empty. The caller owns the URL
 * and must revoke it.
 */
function createObjectUrlFromBase64(audioBase64: string, mimeType: string): string {
  const binary = atob(audioBase64)
  const bytes = new Uint8Array(binary.length)
  for (let index = 0; index < binary.length; index += 1) {
    bytes[index] = binary.charCodeAt(index)
  }
  return URL.createObjectURL(new Blob([bytes], { type: mimeType || "audio/mpeg" }))
}