diff --git a/packages/opencode-config/package.json b/packages/opencode-config/package.json index 82ed1e6d..61266886 100644 --- a/packages/opencode-config/package.json +++ b/packages/opencode-config/package.json @@ -4,6 +4,6 @@ "private": true, "license": "MIT", "dependencies": { - "@opencode-ai/plugin": "1.2.14" + "@opencode-ai/plugin": "1.2.24" } } \ No newline at end of file diff --git a/packages/server/src/api-types.ts b/packages/server/src/api-types.ts index 8eb7c928..c1fbefb3 100644 --- a/packages/server/src/api-types.ts +++ b/packages/server/src/api-types.ts @@ -219,12 +219,35 @@ export interface SpeechCapabilitiesResponse { provider: string supportsStt: boolean supportsTts: boolean + supportsRealtimeTranscription?: boolean + realtimeInputFormat?: { + type: "audio/pcm" + rate: 24000 + } + realtimeModel?: string baseUrl?: string sttModel: string ttsModel: string ttsVoice: string } +export interface SpeechRealtimeSessionResponse { + sessionId: string + inputFormat: { + type: "audio/pcm" + rate: 24000 + } +} + +export type SpeechRealtimeEvent = + | { type: "session.ready"; sessionId: string } + | { type: "session.error"; message: string } + | { type: "input.speech_started"; itemId?: string } + | { type: "input.speech_stopped"; itemId?: string } + | { type: "transcript.partial"; itemId: string; text: string } + | { type: "transcript.final"; itemId: string; previousItemId?: string; text: string } + | { type: "session.closed"; reason?: string } + export interface SpeechTranscriptionResponse { text: string language?: string diff --git a/packages/server/src/server/http-server.ts b/packages/server/src/server/http-server.ts index 3f558cb8..e8252e5b 100644 --- a/packages/server/src/server/http-server.ts +++ b/packages/server/src/server/http-server.ts @@ -255,7 +255,7 @@ export function createHttpServer(deps: HttpServerDeps) { eventBus: deps.eventBus, workspaceManager: deps.workspaceManager, }) - registerSpeechRoutes(app, { speechService: deps.speechService }) + 
registerSpeechRoutes(app, { speechService: deps.speechService, logger: apiLogger }) registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger }) registerBackgroundProcessRoutes(app, { backgroundProcessManager }) registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger }) diff --git a/packages/server/src/server/routes/speech.ts b/packages/server/src/server/routes/speech.ts index 3eab4ad6..e9421092 100644 --- a/packages/server/src/server/routes/speech.ts +++ b/packages/server/src/server/routes/speech.ts @@ -1,9 +1,12 @@ import type { FastifyInstance } from "fastify" import { z } from "zod" import type { SpeechService } from "../../speech/service" +import type { Logger } from "../../logger" +import { SpeechRealtimeSessionManager } from "../../speech/realtime-session-manager" interface RouteDeps { speechService: SpeechService + logger: Logger } const TranscribeBodySchema = z.object({ @@ -19,9 +22,99 @@ const SynthesizeBodySchema = z.object({ format: z.enum(["mp3", "wav", "opus"]).optional(), }) +const RealtimeSessionBodySchema = z.object({ + language: z.string().trim().min(1).optional(), + prompt: z.string().trim().min(1).optional(), +}) + +const RealtimeAudioBodySchema = z.object({ + audioBase64: z.string().min(1, "Audio payload is required"), +}) + export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) { + const realtimeSessions = new SpeechRealtimeSessionManager( + deps.speechService, + deps.logger.child({ component: "speech-realtime" }), + ) + + app.addHook("onClose", async () => { + await realtimeSessions.dispose() + }) + app.get("/api/speech/capabilities", async () => deps.speechService.getCapabilities()) + app.post("/api/speech/realtime/sessions", async (request, reply) => { + try { + const body = RealtimeSessionBodySchema.parse(request.body ?? 
{}) + return await realtimeSessions.createSession(body) + } catch (error) { + request.log.error({ err: error }, "Failed to create realtime speech session") + reply.code(400) + return { error: error instanceof Error ? error.message : "Failed to create realtime speech session" } + } + }) + + app.get<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/events", (request, reply) => { + try { + reply.raw.setHeader("Content-Type", "text/event-stream") + reply.raw.setHeader("Cache-Control", "no-cache") + reply.raw.setHeader("Connection", "keep-alive") + reply.raw.flushHeaders?.() + reply.hijack() + + const unsubscribe = realtimeSessions.subscribe(request.params.sessionId, (event) => { + reply.raw.write(`data: ${JSON.stringify(event)}\n\n`) + }) + + const heartbeat = setInterval(() => { + reply.raw.write(`:hb ${Date.now()}\n\n`) + }, 15000) + + const close = () => { + clearInterval(heartbeat) + unsubscribe() + reply.raw.end?.() + } + + request.raw.on("close", close) + request.raw.on("error", close) + } catch (error) { + request.log.error({ err: error }, "Failed to open realtime speech event stream") + reply.code(404).send({ error: error instanceof Error ? error.message : "Realtime speech session not found" }) + } + }) + + app.post<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/audio", async (request, reply) => { + try { + const body = RealtimeAudioBodySchema.parse(request.body ?? {}) + realtimeSessions.appendAudio(request.params.sessionId, body.audioBase64) + reply.code(204) + return undefined + } catch (error) { + request.log.error({ err: error }, "Failed to append realtime speech audio") + reply.code(400) + return { error: error instanceof Error ? 
error.message : "Failed to append realtime speech audio" } + } + }) + + app.post<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/finalize", async (request, reply) => { + try { + realtimeSessions.finalize(request.params.sessionId) + reply.code(204) + return undefined + } catch (error) { + request.log.error({ err: error }, "Failed to finalize realtime speech session") + reply.code(400) + return { error: error instanceof Error ? error.message : "Failed to finalize realtime speech session" } + } + }) + + app.delete<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId", async (request, reply) => { + realtimeSessions.closeSession(request.params.sessionId, "client_closed") + reply.code(204) + return undefined + }) + app.post("/api/speech/transcribe", async (request, reply) => { try { const body = TranscribeBodySchema.parse(request.body ?? {}) diff --git a/packages/server/src/speech/providers/openai-compatible.ts b/packages/server/src/speech/providers/openai-compatible.ts index 4c426d72..436bfc36 100644 --- a/packages/server/src/speech/providers/openai-compatible.ts +++ b/packages/server/src/speech/providers/openai-compatible.ts @@ -20,7 +20,13 @@ export class OpenAICompatibleSpeechProvider { provider: settings.provider, supportsStt: true, supportsTts: true, + supportsRealtimeTranscription: true, + realtimeInputFormat: { + type: "audio/pcm" as const, + rate: 24000 as const, + }, baseUrl: settings.baseUrl, + realtimeModel: settings.realtimeModel, sttModel: settings.sttModel, ttsModel: settings.ttsModel, ttsVoice: settings.ttsVoice, diff --git a/packages/server/src/speech/realtime-session-manager.ts b/packages/server/src/speech/realtime-session-manager.ts new file mode 100644 index 00000000..99aa395a --- /dev/null +++ b/packages/server/src/speech/realtime-session-manager.ts @@ -0,0 +1,525 @@ +import { randomUUID } from "node:crypto" +import { WebSocket } from "undici" +import type { SpeechRealtimeEvent, 
import type { SpeechRealtimeEvent, SpeechRealtimeSessionResponse } from "../api-types"
import type { Logger } from "../logger"
import type { SpeechService } from "./service"

/** Optional hints that are copied into the provider's transcription settings. */
interface CreateRealtimeSessionOptions {
  language?: string
  prompt?: string
}

/** Transcript state tracked for one provider item id. */
interface TranscriptItemState {
  previousItemId?: string
  /** Concatenation of every transcription delta received so far. */
  partialText: string
  /** Set once a final transcript is known; `undefined` means still pending. */
  finalText?: string
}

/** Bookkeeping for one upstream websocket and the local subscribers attached to it. */
interface ManagedRealtimeSession {
  id: string
  ws: WebSocket
  subscribers: Set<(event: SpeechRealtimeEvent) => void>
  items: Map<string, TranscriptItemState>
  /** Item ids in the order they were first committed/created. */
  orderedItemIds: string[]
  /** Index into `orderedItemIds` of the next item whose final transcript has not been emitted. */
  nextFinalIndex: number
  createdAt: number
  lastActivityAt: number
  closed: boolean
}

const OPEN_TIMEOUT_MS = 10_000
const IDLE_TIMEOUT_MS = 2 * 60 * 1000
const SWEEP_INTERVAL_MS = 30_000
/** Byte budget for websocket close reasons; the protocol rejects reasons longer than 123 UTF-8 bytes. */
const MAX_CLOSE_REASON_BYTES = 120

/**
 * Owns the realtime transcription websockets opened on behalf of API clients.
 *
 * Each session wraps one upstream websocket, translates provider events into
 * `SpeechRealtimeEvent`s for local subscribers, and is closed when the client
 * asks, when the socket closes, or after `IDLE_TIMEOUT_MS` without activity.
 */
export class SpeechRealtimeSessionManager {
  private readonly sessions = new Map<string, ManagedRealtimeSession>()
  private readonly sweepTimer: NodeJS.Timeout

  constructor(
    private readonly speechService: SpeechService,
    private readonly logger: Logger,
  ) {
    this.sweepTimer = setInterval(() => {
      this.sweepIdleSessions()
    }, SWEEP_INTERVAL_MS)
    // Do not let the sweep timer alone keep the process alive.
    this.sweepTimer.unref?.()
  }

  /**
   * Opens an upstream websocket, waits for it to connect and sends the initial
   * `session.update` event.
   *
   * @returns the new session id and the audio input format taken from the speech config.
   * @throws whatever `getRealtimeTranscriptionConfig()` throws, or the connection
   *   error/timeout; in the latter case the half-created session is closed first.
   */
  async createSession(options: CreateRealtimeSessionOptions = {}): Promise<SpeechRealtimeSessionResponse> {
    const config = this.speechService.getRealtimeTranscriptionConfig()
    const id = randomUUID()
    const wsUrl = buildRealtimeWebSocketUrl(config.baseUrl, config.realtimeModel)
    const sessionUpdateEvent = buildSessionUpdateEvent(config, options)
    this.logger.info(
      {
        sessionId: id,
        wsUrl,
        realtimeModel: config.realtimeModel,
        sttModel: config.sttModel,
        payload: sessionUpdateEvent,
      },
      "Opening realtime speech websocket",
    )
    const ws = new WebSocket(wsUrl, {
      headers: {
        Authorization: `Bearer ${config.apiKey}`,
        ...(requiresRealtimeBetaHeader(config.baseUrl) ? { "OpenAI-Beta": "realtime=v1" } : {}),
      },
    })

    const session: ManagedRealtimeSession = {
      id,
      ws,
      subscribers: new Set(),
      items: new Map(),
      orderedItemIds: [],
      nextFinalIndex: 0,
      createdAt: Date.now(),
      lastActivityAt: Date.now(),
      closed: false,
    }

    // Register before awaiting so socket events that arrive during connect are handled.
    this.sessions.set(id, session)
    this.attachSocketHandlers(session)

    try {
      await waitForSocketOpen(ws)
      this.send(session, sessionUpdateEvent)
      return {
        sessionId: id,
        inputFormat: config.inputFormat,
      }
    } catch (error) {
      this.logger.error({ sessionId: id, err: error }, "Failed to create realtime speech session")
      this.closeSession(id, error instanceof Error ? error.message : "Failed to create realtime speech session")
      throw error
    }
  }

  /**
   * Registers a listener for a session and immediately sends it `session.ready`.
   *
   * @returns a function that removes the listener again.
   * @throws Error when the session does not exist or is already closed.
   */
  subscribe(sessionId: string, send: (event: SpeechRealtimeEvent) => void): () => void {
    const session = this.getSession(sessionId)
    if (!session) {
      throw new Error("Realtime speech session not found")
    }

    session.subscribers.add(send)
    this.touch(session)
    send({ type: "session.ready", sessionId })

    return () => {
      session.subscribers.delete(send)
      this.touch(session)
    }
  }

  /**
   * Forwards one base64 audio chunk upstream as `input_audio_buffer.append`.
   *
   * @throws Error when the session is unknown or its socket is not open.
   */
  appendAudio(sessionId: string, audioBase64: string): void {
    const session = this.requireSession(sessionId)
    this.send(session, {
      type: "input_audio_buffer.append",
      audio: audioBase64,
    })
  }

  /**
   * Sends `input_audio_buffer.commit` upstream for the session.
   *
   * @throws Error when the session is unknown or its socket is not open.
   */
  finalize(sessionId: string): void {
    const session = this.requireSession(sessionId)
    this.send(session, {
      type: "input_audio_buffer.commit",
    })
  }

  /**
   * Closes a session: removes it from the registry, notifies subscribers with
   * `session.closed`, closes the websocket if it is open or connecting, and
   * drops all subscribers. Unknown or already-closed sessions are ignored.
   */
  closeSession(sessionId: string, reason?: string): void {
    const session = this.sessions.get(sessionId)
    if (!session || session.closed) return

    session.closed = true
    this.sessions.delete(sessionId)
    this.emit(session, { type: "session.closed", reason })

    try {
      if (session.ws.readyState === WebSocket.OPEN || session.ws.readyState === WebSocket.CONNECTING) {
        // Truncate by UTF-8 bytes, not characters: an over-long reason makes close()
        // throw, which would leave the socket open.
        const closeReason = reason === undefined ? "client_closed" : truncateUtf8Bytes(reason, MAX_CLOSE_REASON_BYTES)
        session.ws.close(1000, closeReason)
      }
    } catch (error) {
      this.logger.warn({ sessionId, err: error }, "Failed to close realtime speech websocket")
    }

    session.subscribers.clear()
  }

  /** Stops the idle sweep and closes every remaining session with reason `server_shutdown`. */
  async dispose(): Promise<void> {
    clearInterval(this.sweepTimer)
    // Snapshot the keys because closeSession() mutates the map.
    for (const sessionId of Array.from(this.sessions.keys())) {
      this.closeSession(sessionId, "server_shutdown")
    }
  }

  /** Wires the websocket's message/error/close events to this manager. */
  private attachSocketHandlers(session: ManagedRealtimeSession) {
    session.ws.addEventListener("message", (event) => {
      void this.handleSocketMessage(session, event.data)
    })

    session.ws.addEventListener("error", (event) => {
      const message = event.error instanceof Error ? event.error.message : event.message || "Realtime speech connection failed"
      this.logger.warn({ sessionId: session.id, err: event.error ?? event.message }, "Realtime speech websocket error")
      this.emit(session, { type: "session.error", message })
    })

    session.ws.addEventListener("close", (event) => {
      const reason = event.reason || (event.wasClean ? "socket_closed" : "socket_terminated")
      // NOTE(review): this info-level log includes partial/final transcript text.
      this.logger.info(
        {
          sessionId: session.id,
          code: event.code,
          reason,
          orderedItemIds: session.orderedItemIds,
          pendingItems: Array.from(session.items.entries()).map(([itemId, item]) => ({
            itemId,
            previousItemId: item.previousItemId,
            partialText: item.partialText,
            finalText: item.finalText,
          })),
        },
        "Realtime speech websocket closed",
      )
      this.closeSession(session.id, reason)
    })
  }

  /** Decodes and parses one raw websocket message; failures are logged, never thrown. */
  private async handleSocketMessage(session: ManagedRealtimeSession, raw: unknown) {
    if (session.closed) return

    try {
      const payload = await toText(raw)
      const event = JSON.parse(payload) as Record<string, unknown>
      this.touch(session)
      this.handleServerEvent(session, event)
    } catch (error) {
      this.logger.warn({ sessionId: session.id, err: error }, "Failed to process realtime speech event")
    }
  }

  /** Translates one provider event into zero or more `SpeechRealtimeEvent`s. */
  private handleServerEvent(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    const type = typeof event.type === "string" ? event.type : ""
    if (!type) return

    this.logger.debug({ sessionId: session.id, type }, "Realtime speech event received")
    if (type.startsWith("conversation.item") || type.startsWith("input_audio_buffer") || type.startsWith("session.")) {
      this.logger.debug({ sessionId: session.id, event }, "Realtime speech event payload")
    }

    if (type === "error") {
      const message = extractErrorMessage(event)
      this.logger.warn({ sessionId: session.id, event }, "Realtime speech provider error event")
      this.emit(session, { type: "session.error", message })
      return
    }

    if (type === "input_audio_buffer.speech_started") {
      this.emit(session, {
        type: "input.speech_started",
        itemId: readString(event.item_id),
      })
      return
    }

    if (type === "input_audio_buffer.speech_stopped") {
      this.emit(session, {
        type: "input.speech_stopped",
        itemId: readString(event.item_id),
      })
      return
    }

    if (type === "input_audio_buffer.committed") {
      const itemId = readString(event.item_id)
      if (!itemId) return
      const item = this.getOrCreateItem(session, itemId)
      // Keep an already-known predecessor when this event omits it, matching
      // handleConversationItemEvent.
      item.previousItemId = readString(event.previous_item_id) ?? item.previousItemId
      if (!session.orderedItemIds.includes(itemId)) {
        session.orderedItemIds.push(itemId)
      }
      this.flushFinalizedItems(session)
      return
    }

    if (type === "conversation.item.created" || type === "conversation.item.added" || type === "conversation.item.done") {
      this.handleConversationItemEvent(session, event)
      return
    }

    if (type === "conversation.item.input_audio_transcription.delta") {
      const itemId = readString(event.item_id)
      const delta = readString(event.delta)
      if (!itemId || !delta) return
      const item = this.getOrCreateItem(session, itemId)
      item.partialText += delta
      this.emit(session, {
        type: "transcript.partial",
        itemId,
        text: item.partialText,
      })
      return
    }

    if (type === "conversation.item.input_audio_transcription.completed") {
      const itemId = readString(event.item_id)
      if (!itemId) return
      const item = this.getOrCreateItem(session, itemId)
      // Fall back to the accumulated deltas when the event carries no transcript.
      item.finalText = readString(event.transcript) ?? item.partialText
      this.flushFinalizedItems(session)
    }
  }

  /** Records ordering for a conversation item and adopts any transcript embedded in it. */
  private handleConversationItemEvent(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    const itemRecord = asRecord(event.item)
    if (!itemRecord) return

    const itemId = readString(itemRecord.id) ?? readString(event.item_id)
    if (!itemId) return

    const item = this.getOrCreateItem(session, itemId)
    item.previousItemId = readString(event.previous_item_id) ?? item.previousItemId
    if (!session.orderedItemIds.includes(itemId)) {
      session.orderedItemIds.push(itemId)
    }

    const transcript = extractTranscriptFromConversationItem(itemRecord)
    if (transcript) {
      item.finalText = transcript
      this.flushFinalizedItems(session)
    }
  }

  /**
   * Emits `transcript.final` for items in `orderedItemIds` order, stopping at the
   * first item that has no final text yet so finals are never delivered out of order.
   */
  private flushFinalizedItems(session: ManagedRealtimeSession) {
    while (session.nextFinalIndex < session.orderedItemIds.length) {
      const itemId = session.orderedItemIds[session.nextFinalIndex]
      if (itemId === undefined) return
      const item = session.items.get(itemId)
      if (!item || item.finalText === undefined) {
        return
      }

      this.emit(session, {
        type: "transcript.final",
        itemId,
        previousItemId: item.previousItemId,
        text: item.finalText,
      })
      session.nextFinalIndex += 1
    }
  }

  /** Returns the state for `itemId`, creating an empty entry on first use. */
  private getOrCreateItem(session: ManagedRealtimeSession, itemId: string): TranscriptItemState {
    const existing = session.items.get(itemId)
    if (existing) return existing
    const created: TranscriptItemState = { partialText: "" }
    session.items.set(itemId, created)
    return created
  }

  /** Delivers an event to every subscriber; a throwing subscriber is logged and skipped. */
  private emit(session: ManagedRealtimeSession, event: SpeechRealtimeEvent) {
    for (const subscriber of session.subscribers) {
      try {
        subscriber(event)
      } catch (error) {
        this.logger.warn({ sessionId: session.id, err: error, type: event.type }, "Failed to emit realtime speech event")
      }
    }
  }

  /** Like `getSession`, but throws when the session is missing or closed. */
  private requireSession(sessionId: string): ManagedRealtimeSession {
    const session = this.getSession(sessionId)
    if (!session) {
      throw new Error("Realtime speech session not found")
    }
    return session
  }

  /** Returns the open session for `sessionId`, or `null` when missing or closed. */
  private getSession(sessionId: string): ManagedRealtimeSession | null {
    const session = this.sessions.get(sessionId) ?? null
    if (!session || session.closed) return null
    return session
  }

  /**
   * JSON-encodes and sends one event upstream, then marks the session active.
   *
   * @throws Error when the session is closed or the socket is not in the OPEN state.
   */
  private send(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    if (session.closed || session.ws.readyState !== WebSocket.OPEN) {
      throw new Error("Realtime speech session is not connected")
    }

    session.ws.send(JSON.stringify(event))
    this.touch(session)
  }

  /** Records activity so the idle sweep leaves the session alone. */
  private touch(session: ManagedRealtimeSession) {
    session.lastActivityAt = Date.now()
  }

  /** Closes every session whose last activity is at least `IDLE_TIMEOUT_MS` old. */
  private sweepIdleSessions() {
    const now = Date.now()
    for (const [sessionId, session] of this.sessions) {
      if (session.closed) continue
      if (now - session.lastActivityAt < IDLE_TIMEOUT_MS) continue
      this.logger.info({ sessionId }, "Closing idle realtime speech session")
      this.closeSession(sessionId, "idle_timeout")
    }
  }
}

/**
 * Returns the longest prefix of `value` whose UTF-8 encoding fits in `maxBytes`,
 * never splitting a code point.
 */
function truncateUtf8Bytes(value: string, maxBytes: number): string {
  let usedBytes = 0
  let endIndex = 0
  // Iterating a string yields whole code points, so surrogate pairs stay together.
  for (const char of value) {
    const codePoint = char.codePointAt(0) ?? 0
    const size = codePoint < 0x80 ? 1 : codePoint < 0x800 ? 2 : codePoint < 0x10000 ? 3 : 4
    if (usedBytes + size > maxBytes) break
    usedBytes += size
    endIndex += char.length
  }
  return value.slice(0, endIndex)
}
/**
 * Derives the realtime websocket URL from an HTTP(S) API base URL.
 *
 * - Falls back to `https://api.openai.com/v1` when `baseUrl` is empty.
 * - Maps `http:` to `ws:` and keeps an explicit `ws:`; everything else becomes `wss:`.
 * - Appends `/realtime` unless the path already ends with it, drops any fragment,
 *   and adds `model` as a query parameter unless one is already present.
 *
 * @throws TypeError when `baseUrl` is not a parseable URL.
 */
function buildRealtimeWebSocketUrl(baseUrl: string | undefined, model: string): string {
  const target = new URL(baseUrl?.trim() || "https://api.openai.com/v1")
  // Preserve plaintext endpoints: a base URL that is already ws:// must not be upgraded to wss://.
  const plaintext = target.protocol === "http:" || target.protocol === "ws:"
  target.protocol = plaintext ? "ws:" : "wss:"
  const normalizedPath = target.pathname.replace(/\/+$/, "")
  target.pathname = normalizedPath.endsWith("/realtime") ? normalizedPath : `${normalizedPath}/realtime`
  target.hash = ""
  if (!target.searchParams.has("model")) {
    target.searchParams.set("model", model)
  }
  return target.toString()
}

/**
 * Returns `true` when a custom base URL points at a host other than
 * `api.openai.com`. An empty, missing or unparseable base URL yields `false`.
 */
function requiresRealtimeBetaHeader(baseUrl?: string): boolean {
  if (!baseUrl || !baseUrl.trim()) return false
  try {
    return new URL(baseUrl).hostname.toLowerCase() !== "api.openai.com"
  } catch {
    return false
  }
}

/**
 * Builds the initial `session.update` event.
 *
 * Hosts for which `requiresRealtimeBetaHeader` is true get the flat
 * `input_audio_transcription` / `turn_detection` session shape; all others get
 * the nested `audio.input` transcription shape. `language` and `prompt` are only
 * included when they are non-empty.
 */
function buildSessionUpdateEvent(
  config: { baseUrl?: string; sttModel: string; realtimeModel: string; inputFormat: { type: "audio/pcm"; rate: 24000 } },
  options: CreateRealtimeSessionOptions,
): Record<string, unknown> {
  const transcription = {
    model: config.sttModel,
    ...(options.language ? { language: options.language } : {}),
    ...(options.prompt ? { prompt: options.prompt } : {}),
  }
  const turnDetection = {
    type: "server_vad",
    threshold: 0.45,
    prefix_padding_ms: 250,
    silence_duration_ms: 400,
  }

  if (requiresRealtimeBetaHeader(config.baseUrl)) {
    return {
      type: "session.update",
      session: {
        input_audio_transcription: transcription,
        turn_detection: turnDetection,
      },
    }
  }

  return {
    type: "session.update",
    session: {
      type: "transcription",
      audio: {
        input: {
          format: config.inputFormat,
          noise_reduction: { type: "near_field" },
          transcription,
          turn_detection: turnDetection,
        },
      },
    },
  }
}

/**
 * Resolves once the websocket is open.
 *
 * Rejects when the socket errors or closes first, or when it has not opened
 * within `OPEN_TIMEOUT_MS`. Listeners and the timer are removed on every outcome.
 */
function waitForSocketOpen(ws: WebSocket): Promise<void> {
  if (ws.readyState === WebSocket.OPEN) {
    return Promise.resolve()
  }

  return new Promise<void>((resolve, reject) => {
    let settled = false

    const cleanup = () => {
      clearTimeout(timeout)
      ws.removeEventListener("open", handleOpen)
      ws.removeEventListener("error", handleError as any)
      ws.removeEventListener("close", handleClose)
    }

    // Single exit point so exactly one of resolve/reject ever runs.
    const finish = (callback: () => void) => {
      if (settled) return
      settled = true
      cleanup()
      callback()
    }

    const handleOpen = () => {
      finish(resolve)
    }

    const handleError = (event: { error?: unknown; message?: string }) => {
      finish(() => reject(event.error instanceof Error ? event.error : new Error(event.message || "Failed to connect")))
    }

    const handleClose = () => {
      finish(() => reject(new Error("Realtime speech connection closed before initialization")))
    }

    const timeout = setTimeout(() => {
      finish(() => reject(new Error("Timed out connecting to realtime speech provider")))
    }, OPEN_TIMEOUT_MS)

    ws.addEventListener("open", handleOpen)
    ws.addEventListener("error", handleError as any)
    ws.addEventListener("close", handleClose)
  })
}

/**
 * Converts a websocket message payload to text.
 *
 * Strings pass through; `ArrayBuffer`, typed-array views and `Blob`s are decoded
 * as UTF-8; anything else is stringified (`null`/`undefined` become `""`).
 */
async function toText(data: unknown): Promise<string> {
  if (typeof data === "string") return data
  if (data instanceof ArrayBuffer) return Buffer.from(data).toString("utf-8")
  // Respect the view's window instead of decoding the whole backing buffer.
  if (ArrayBuffer.isView(data)) return Buffer.from(data.buffer, data.byteOffset, data.byteLength).toString("utf-8")
  if (typeof Blob !== "undefined" && data instanceof Blob) {
    return Buffer.from(await data.arrayBuffer()).toString("utf-8")
  }
  return String(data ?? "")
}

/**
 * Picks a human-readable message out of a provider error event: `error.message`
 * first, then the top-level `message`, then a generic fallback.
 */
function extractErrorMessage(event: Record<string, unknown>): string {
  const error = event.error
  if (error && typeof error === "object") {
    const message = readString((error as Record<string, unknown>).message)
    if (message) return message
  }
  return readString(event.message) ?? "Realtime speech request failed"
}

/** Returns `value` when it is a non-empty string, otherwise `undefined`. */
function readString(value: unknown): string | undefined {
  return typeof value === "string" && value.length > 0 ? value : undefined
}

/** Returns `value` when it is a non-null, non-array object, otherwise `null`. */
function asRecord(value: unknown): Record<string, unknown> | null {
  return value && typeof value === "object" && !Array.isArray(value) ? (value as Record<string, unknown>) : null
}

/**
 * Finds a transcript inside a conversation item.
 *
 * Checks the item's own `transcript`/`text` first, then each entry of its
 * `content` array (`transcript`, `text`, then `audio.transcript`), returning the
 * first non-empty string or `undefined` when there is none.
 */
function extractTranscriptFromConversationItem(item: Record<string, unknown>): string | undefined {
  const directTranscript = readString(item.transcript) ?? readString(item.text)
  if (directTranscript) return directTranscript

  const content = Array.isArray(item.content) ? item.content : []
  for (const part of content) {
    const record = asRecord(part)
    if (!record) continue
    const transcript =
      readString(record.transcript) ??
      readString(record.text) ??
      readString(asRecord(record.audio)?.transcript)
    if (transcript) {
      return transcript
    }
  }

  return undefined
}
+ readString(asRecord(record.audio)?.transcript) + if (transcript) { + return transcript + } + } + + return undefined +} diff --git a/packages/server/src/speech/service.ts b/packages/server/src/speech/service.ts index 14f37a15..26cf6682 100644 --- a/packages/server/src/speech/service.ts +++ b/packages/server/src/speech/service.ts @@ -10,6 +10,8 @@ const ServerSpeechSettingsSchema = z.object({ provider: z.string().optional(), apiKey: z.string().optional(), baseUrl: z.string().optional(), + useRealtime: z.boolean().optional(), + realtimeModel: z.string().optional(), sttModel: z.string().optional(), ttsModel: z.string().optional(), ttsVoice: z.string().optional(), @@ -40,12 +42,26 @@ export interface NormalizedSpeechSettings { provider: string apiKey?: string baseUrl?: string + realtimeModel: string sttModel: string ttsModel: string ttsVoice: string } +export interface RealtimeTranscriptionConfig { + provider: string + apiKey: string + baseUrl?: string + realtimeModel: string + sttModel: string + inputFormat: { + type: "audio/pcm" + rate: 24000 + } +} + const DEFAULT_PROVIDER = "openai-compatible" +const DEFAULT_REALTIME_MODEL = "gpt-realtime" const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe" const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts" const DEFAULT_TTS_VOICE = "alloy" @@ -67,6 +83,25 @@ export class SpeechService { return this.createProvider().synthesize(input) } + getRealtimeTranscriptionConfig(): RealtimeTranscriptionConfig { + const settings = this.resolveSettings() + if (!settings.apiKey) { + throw new Error("Speech provider is not configured. 
Add an API key in Speech settings.") + } + + return { + provider: settings.provider, + apiKey: settings.apiKey, + baseUrl: settings.baseUrl, + realtimeModel: settings.realtimeModel, + sttModel: settings.sttModel, + inputFormat: { + type: "audio/pcm", + rate: 24000, + }, + } + } + private createProvider(): SpeechProvider { const settings = this.resolveSettings() return new OpenAICompatibleSpeechProvider({ @@ -83,6 +118,7 @@ export class SpeechService { provider: speech.provider?.trim() || DEFAULT_PROVIDER, apiKey: speech.apiKey?.trim() || process.env.OPENAI_API_KEY, baseUrl: speech.baseUrl?.trim() || process.env.OPENAI_BASE_URL || undefined, + realtimeModel: speech.realtimeModel?.trim() || DEFAULT_REALTIME_MODEL, sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL, ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL, ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE, diff --git a/packages/ui/src/components/prompt-input.tsx b/packages/ui/src/components/prompt-input.tsx index 1ae41690..45db8b44 100644 --- a/packages/ui/src/components/prompt-input.tsx +++ b/packages/ui/src/components/prompt-input.tsx @@ -11,7 +11,7 @@ import { getCommands } from "../stores/commands" import { showAlertDialog } from "../stores/alerts" import { useI18n } from "../lib/i18n" import { getLogger } from "../lib/logger" -import { preferences } from "../stores/preferences" +import { preferences, useConfig } from "../stores/preferences" import type { ExpandState, PromptInputApi, PromptInputProps, PromptInsertMode, PromptMode } from "./prompt-input/types" import { usePromptState } from "./prompt-input/usePromptState" import { usePromptAttachments } from "./prompt-input/usePromptAttachments" @@ -22,6 +22,7 @@ const log = getLogger("actions") export default function PromptInput(props: PromptInputProps) { const { t } = useI18n() + const { serverSettings } = useConfig() const [, setIsFocused] = createSignal(false) const [mode, setMode] = createSignal("normal") const [expandState, 
// packages/ui/src/components/prompt-input/createRealtimePcmStream.ts

/** Handle returned to the caller for tearing down a running capture. */
export interface RealtimePcmStreamHandle {
  stop(): Promise<void>
}

interface CreateRealtimePcmStreamOptions {
  /** Called with each chunk of base64-encoded 16-bit PCM produced from the microphone. */
  onChunk: (audioBase64: string) => void | Promise<void>
}

const TARGET_SAMPLE_RATE = 24000
const PROCESSOR_BUFFER_SIZE = 4096

/**
 * Starts capturing microphone audio and streams it to `options.onChunk` as
 * base64-encoded 16-bit PCM resampled to `TARGET_SAMPLE_RATE`.
 *
 * @returns a handle whose `stop()` disconnects the audio graph, stops the
 *   microphone tracks and closes the audio context; calling it twice is a no-op.
 * @throws when microphone access is rejected, when no `AudioContext`
 *   implementation exists, or when building the audio graph fails. In every
 *   failure case the microphone tracks acquired so far are stopped first.
 */
export async function createRealtimePcmStream(
  options: CreateRealtimePcmStreamOptions,
): Promise<RealtimePcmStreamHandle> {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  })
  const releaseMicrophone = () => stream.getTracks().forEach((track) => track.stop())

  const AudioContextCtor = window.AudioContext || (window as any).webkitAudioContext
  if (!AudioContextCtor) {
    releaseMicrophone()
    throw new Error("AudioContext is not supported in this browser.")
  }

  const audioContext: AudioContext = new AudioContextCtor()
  try {
    await audioContext.resume()

    const source = audioContext.createMediaStreamSource(stream)
    const processor = audioContext.createScriptProcessor(PROCESSOR_BUFFER_SIZE, 1, 1)
    // The processor is routed to the destination through a zero-gain node so nothing is audible.
    const sink = audioContext.createGain()
    sink.gain.value = 0

    source.connect(processor)
    processor.connect(sink)
    sink.connect(audioContext.destination)

    processor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0)
      const resampled = downsampleBuffer(input, audioContext.sampleRate, TARGET_SAMPLE_RATE)
      if (resampled.length === 0) return
      const pcm16 = floatTo16BitPcm(resampled)
      void options.onChunk(base64EncodePcm16(pcm16))
    }

    let stopped = false
    return {
      async stop() {
        if (stopped) return
        stopped = true
        processor.onaudioprocess = null
        source.disconnect()
        processor.disconnect()
        sink.disconnect()
        releaseMicrophone()
        await audioContext.close()
      },
    }
  } catch (error) {
    // Setup failed after the microphone was acquired: release it so the
    // capture does not stay active, then surface the original error.
    releaseMicrophone()
    await audioContext.close().catch(() => undefined)
    throw error
  }
}

/**
 * Resamples `buffer` from `inputSampleRate` to `outputSampleRate` by averaging
 * the input samples that fall into each output slot.
 *
 * Equal rates return a copy. An empty input returns an empty array. When a slot
 * receives no input samples (output rate higher than input rate) the nearest
 * input sample at the current read position is repeated.
 */
function downsampleBuffer(buffer: Float32Array, inputSampleRate: number, outputSampleRate: number): Float32Array {
  // Without this guard the loop below would emit a single NaN sample for empty input.
  if (buffer.length === 0) {
    return new Float32Array(0)
  }
  if (inputSampleRate === outputSampleRate) {
    return buffer.slice()
  }

  const sampleRateRatio = inputSampleRate / outputSampleRate
  const outputLength = Math.max(1, Math.round(buffer.length / sampleRateRatio))
  const output = new Float32Array(outputLength)
  let inputIndex = 0

  for (let outputIndex = 0; outputIndex < outputLength; outputIndex += 1) {
    const nextInputIndex = Math.min(buffer.length, Math.round((outputIndex + 1) * sampleRateRatio))
    let sum = 0
    let count = 0
    for (let i = inputIndex; i < nextInputIndex; i += 1) {
      sum += buffer[i]
      count += 1
    }
    output[outputIndex] = count > 0 ? sum / count : buffer[Math.min(buffer.length - 1, inputIndex)]
    inputIndex = nextInputIndex
  }

  return output
}
// packages/ui/src/components/prompt-input/createRealtimePcmStream.ts (continued)

/**
 * Converts float samples to signed 16-bit PCM.
 *
 * Samples are clamped to [-1, 1]; negative values scale by 0x8000 and
 * non-negative values by 0x7fff so both ends of the int16 range are reachable.
 */
function floatTo16BitPcm(buffer: Float32Array): Int16Array {
  const pcm16 = new Int16Array(buffer.length)
  for (let i = 0; i < buffer.length; i += 1) {
    const sample = Math.max(-1, Math.min(1, buffer[i]))
    pcm16[i] = sample < 0 ? Math.round(sample * 0x8000) : Math.round(sample * 0x7fff)
  }
  return pcm16
}

/**
 * Base64-encodes the bytes of an `Int16Array` in platform byte order.
 *
 * Only the bytes covered by the view are encoded, so subarrays of a larger
 * backing buffer are handled correctly.
 */
function base64EncodePcm16(buffer: Int16Array): string {
  // Honour byteOffset/byteLength: `buffer.buffer` alone would expose the whole backing store.
  const bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength)
  let binary = ""
  // Convert in slices to keep the argument count passed to fromCharCode bounded.
  const chunkSize = 0x8000
  for (let offset = 0; offset < bytes.length; offset += chunkSize) {
    const chunk = bytes.subarray(offset, offset + chunkSize)
    binary += String.fromCharCode(...chunk)
  }
  return btoa(binary)
}

// packages/ui/src/components/prompt-input/promptVoiceInsertion.ts

/** Snapshot of the prompt text and the selection range that dictated text will replace. */
export interface PromptVoiceAnchor {
  prompt: string
  start: number
  end: number
}

/** Bundles the prompt text and selection offsets into a `PromptVoiceAnchor`. */
export function createPromptVoiceAnchor(prompt: string, start: number, end: number): PromptVoiceAnchor {
  return { prompt, start, end }
}

/**
 * Replaces the anchored range of the prompt with `insertedText`.
 *
 * The inserted text is trimmed, and a single space is added on either side when
 * the neighbouring prompt text does not already provide whitespace. A blank
 * insertion just removes the anchored range. Offsets are clamped to the prompt
 * and reordered if `start` is greater than `end`.
 *
 * @returns the new prompt value and the cursor position just after the inserted text.
 */
export function buildPromptWithInsertedTranscript(anchor: PromptVoiceAnchor, insertedText: string): { value: string; cursor: number } {
  const clamp = (offset: number) => Math.min(anchor.prompt.length, Math.max(0, offset))
  const rangeStart = clamp(Math.min(anchor.start, anchor.end))
  const rangeEnd = clamp(Math.max(anchor.start, anchor.end))

  const before = anchor.prompt.slice(0, rangeStart)
  const after = anchor.prompt.slice(rangeEnd)
  const normalized = insertedText.trim()

  if (!normalized) {
    return {
      value: before + after,
      cursor: before.length,
    }
  }

  const prefix = before.length > 0 && !/\s$/.test(before) ? " " : ""
  const suffix = after.length > 0 && !/^\s/.test(after) ? " " : ""
  return {
    value: `${before}${prefix}${normalized}${suffix}${after}`,
    cursor: before.length + prefix.length + normalized.length,
  }
}

/**
 * Appends a trimmed transcript fragment to the text collected so far, inserting
 * one space unless `current` already ends in whitespace. A blank `next` leaves
 * `current` unchanged; a blank `current` is replaced by the fragment.
 */
export function appendVoiceTranscript(current: string, next: string): string {
  const normalized = next.trim()
  if (!normalized) return current
  if (!current.trim()) return normalized
  return /\s$/.test(current) ? `${current}${normalized}` : `${current} ${normalized}`
}
`${current}${normalized}` : `${current} ${normalized}` +} diff --git a/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts b/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts new file mode 100644 index 00000000..51f6e5cf --- /dev/null +++ b/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts @@ -0,0 +1,241 @@ +import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js" +import { showAlertDialog } from "../../stores/alerts" +import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech" +import { serverApi } from "../../lib/api-client" +import { useI18n } from "../../lib/i18n" +import { createPromptVoiceAnchor, buildPromptWithInsertedTranscript } from "./promptVoiceInsertion" + +interface UsePromptBufferedVoiceInputOptions { + prompt: Accessor + setPrompt: (value: string, options?: { persistDraft?: boolean }) => void + getTextarea: () => HTMLTextAreaElement | null + enabled: Accessor + disabled: Accessor +} + +type VoiceInputState = "idle" | "recording" | "transcribing" + +export function usePromptBufferedVoiceInput(options: UsePromptBufferedVoiceInputOptions) { + const { t } = useI18n() + const [state, setState] = createSignal("idle") + const [elapsedMs, setElapsedMs] = createSignal(0) + + let mediaRecorder: MediaRecorder | null = null + let mediaStream: MediaStream | null = null + let timerId: number | undefined + let shouldTranscribe = true + let recordedChunks: Blob[] = [] + let recordingStartedAt = 0 + + createEffect(() => { + void loadSpeechCapabilities() + }) + + onCleanup(() => { + cleanupMedia(false) + }) + + const isSupported = () => { + if (typeof window === "undefined") return false + return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia) + } + + const canUseVoiceInput = () => { + const capabilities = speechCapabilities() + return Boolean( + options.enabled() && + isSupported() && + capabilities?.available && + 
capabilities?.configured && + capabilities?.supportsStt, + ) + } + + async function toggleRecording(): Promise { + if (state() === "recording") { + stopRecording() + return + } + + if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return + + try { + await startRecording() + } catch (error) { + cleanupMedia(false) + showAlertDialog(t("promptInput.voiceInput.error.permission"), { + title: t("promptInput.voiceInput.error.title"), + detail: error instanceof Error ? error.message : String(error), + variant: "error", + }) + } + } + + function stopRecording() { + if (!mediaRecorder || state() !== "recording") return + shouldTranscribe = true + mediaRecorder.stop() + setState("transcribing") + stopTimer() + } + + function cancelRecording() { + if (!mediaRecorder || state() !== "recording") return + shouldTranscribe = false + mediaRecorder.stop() + cleanupMedia(false) + } + + async function startRecording() { + if (!isSupported()) { + showAlertDialog(t("promptInput.voiceInput.error.unsupported"), { + title: t("promptInput.voiceInput.error.title"), + variant: "error", + }) + return + } + + recordedChunks = [] + shouldTranscribe = true + mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + mediaRecorder = createRecorder(mediaStream) + + mediaRecorder.addEventListener("dataavailable", (event) => { + if (event.data.size > 0) { + recordedChunks.push(event.data) + } + }) + + mediaRecorder.addEventListener("stop", () => { + void finalizeRecording() + }) + + recordingStartedAt = Date.now() + setElapsedMs(0) + setState("recording") + startTimer() + mediaRecorder.start() + } + + async function finalizeRecording() { + const recorder = mediaRecorder + const stream = mediaStream + mediaRecorder = null + mediaStream = null + + if (!shouldTranscribe || recordedChunks.length === 0) { + recordedChunks = [] + stopTracks(stream) + setState("idle") + setElapsedMs(0) + return + } + + const mimeType = recorder?.mimeType || recordedChunks[0]?.type 
|| "audio/webm" + + try { + const audioBlob = new Blob(recordedChunks, { type: mimeType }) + const transcription = await serverApi.transcribeAudio({ + audioBase64: await blobToBase64(audioBlob), + mimeType, + }) + if (transcription.text.trim()) { + insertTranscript(transcription.text.trim()) + } + } catch (error) { + showAlertDialog(t("promptInput.voiceInput.error.transcribe"), { + title: t("promptInput.voiceInput.error.title"), + detail: error instanceof Error ? error.message : String(error), + variant: "error", + }) + } finally { + recordedChunks = [] + stopTracks(stream) + setState("idle") + setElapsedMs(0) + } + } + + function insertTranscript(text: string) { + const current = options.prompt() + const textarea = options.getTextarea() + const start = textarea ? textarea.selectionStart : current.length + const end = textarea ? textarea.selectionEnd : current.length + const { value, cursor } = buildPromptWithInsertedTranscript( + createPromptVoiceAnchor(current, start, end), + text, + ) + + options.setPrompt(value) + if (textarea) { + setTimeout(() => { + textarea.focus() + textarea.setSelectionRange(cursor, cursor) + }, 0) + } + } + + function cleanupMedia(resetState = true) { + stopTimer() + if (mediaRecorder && mediaRecorder.state !== "inactive") { + mediaRecorder.stop() + } + mediaRecorder = null + stopTracks(mediaStream) + mediaStream = null + recordedChunks = [] + if (resetState) { + setState("idle") + setElapsedMs(0) + } + } + + function startTimer() { + stopTimer() + timerId = window.setInterval(() => { + setElapsedMs(Date.now() - recordingStartedAt) + }, 250) + } + + function stopTimer() { + if (timerId !== undefined) { + window.clearInterval(timerId) + timerId = undefined + } + } + + return { + state, + elapsedMs, + canUseVoiceInput, + toggleRecording, + cancelRecording, + isRecording: () => state() === "recording", + isTranscribing: () => state() === "transcribing", + buttonTitle: () => { + if (state() === "recording") return 
t("promptInput.voiceInput.stop.title") + if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title") + return t("promptInput.voiceInput.start.title") + }, + } +} + +function createRecorder(stream: MediaStream): MediaRecorder { + const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"] + const supported = candidates.find((candidate) => typeof MediaRecorder.isTypeSupported !== "function" || MediaRecorder.isTypeSupported(candidate)) + return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream) +} + +function stopTracks(stream: MediaStream | null) { + stream?.getTracks().forEach((track) => track.stop()) +} + +async function blobToBase64(blob: Blob): Promise { + const buffer = await blob.arrayBuffer() + const bytes = new Uint8Array(buffer) + let binary = "" + for (const byte of bytes) { + binary += String.fromCharCode(byte) + } + return btoa(binary) +} diff --git a/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts b/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts new file mode 100644 index 00000000..186ec3b7 --- /dev/null +++ b/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts @@ -0,0 +1,325 @@ +import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js" +import { showAlertDialog } from "../../stores/alerts" +import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client" +import { useI18n } from "../../lib/i18n" +import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech" +import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream" +import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion" + +interface UsePromptRealtimeVoiceInputOptions { + prompt: Accessor + setPrompt: (value: string, options?: { persistDraft?: boolean }) => void + getTextarea: () 
=> HTMLTextAreaElement | null + enabled: Accessor + disabled: Accessor +} + +type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing" + +const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000 + +export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) { + const { t } = useI18n() + const [state, setState] = createSignal("idle") + const [elapsedMs, setElapsedMs] = createSignal(0) + + let activeSessionId: string | null = null + let eventSource: EventSource | null = null + let pcmStream: RealtimePcmStreamHandle | null = null + let audioQueue: Promise = Promise.resolve() + let timerId: number | undefined + let recordingStartedAt = 0 + let finalizeTimerId: number | undefined + let anchor = createPromptVoiceAnchor("", 0, 0) + let finalTranscript = "" + let liveTranscript = "" + let activeLiveItemId: string | null = null + let closing = false + + createEffect(() => { + void loadSpeechCapabilities() + }) + + onCleanup(() => { + cancelRecording() + }) + + const isSupported = () => { + if (typeof window === "undefined") return false + return Boolean(window.AudioContext || (window as any).webkitAudioContext) && Boolean(navigator.mediaDevices?.getUserMedia) && typeof EventSource !== "undefined" + } + + const canUseVoiceInput = () => { + const capabilities = speechCapabilities() + return Boolean( + options.enabled() && + isSupported() && + capabilities?.available && + capabilities?.configured && + capabilities?.supportsStt && + capabilities?.supportsRealtimeTranscription, + ) + } + + async function toggleRecording(): Promise { + if (state() === "listening" || state() === "connecting") { + await stopRecording() + return + } + + if (!canUseVoiceInput() || options.disabled() || state() === "finalizing") return + + try { + await startRecording() + } catch (error) { + await cleanupSession({ revertPrompt: true, closeRemote: true }) + showAlertDialog(t("promptInput.voiceInput.error.connection"), { + title: t("promptInput.voiceInput.error.title"), 
+ detail: error instanceof Error ? error.message : String(error), + variant: "error", + }) + } + } + + async function startRecording() { + if (!isSupported()) { + showAlertDialog(t("promptInput.voiceInput.error.unsupported"), { + title: t("promptInput.voiceInput.error.title"), + variant: "error", + }) + return + } + + resetTranscriptState() + captureAnchor() + setState("connecting") + setElapsedMs(0) + + const created = await serverApi.createRealtimeSpeechSession({ + language: detectLanguage(), + }) + activeSessionId = created.sessionId + connectEventStream(created.sessionId) + + pcmStream = await createRealtimePcmStream({ + onChunk: (audioBase64) => { + const sessionId = activeSessionId + if (!sessionId || closing) return + audioQueue = audioQueue + .then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 })) + .catch((error) => { + handleRealtimeError(error) + }) + }, + }) + + recordingStartedAt = Date.now() + startTimer() + setState("listening") + } + + async function stopRecording() { + const sessionId = activeSessionId + if (!sessionId || (state() !== "listening" && state() !== "connecting")) return + + setState("finalizing") + stopTimer() + + if (pcmStream) { + const stream = pcmStream + pcmStream = null + await stream.stop() + } + + try { + await audioQueue.catch(() => undefined) + await serverApi.finalizeRealtimeSpeechSession(sessionId) + scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS) + } catch (error) { + handleRealtimeError(error) + } + } + + function cancelRecording() { + void cleanupSession({ revertPrompt: true, closeRemote: true }) + } + + function connectEventStream(sessionId: string) { + eventSource?.close() + eventSource = serverApi.connectRealtimeSpeechEvents( + sessionId, + (event) => handleEvent(event), + () => { + if (closing) return + handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection"))) + }, + ) + } + + function handleEvent(event: SpeechRealtimeEvent) { + if (event.type === "session.ready") { + return 
+ } + + if (event.type === "session.error") { + handleRealtimeError(new Error(event.message)) + return + } + + if (event.type === "transcript.partial") { + activeLiveItemId = event.itemId + liveTranscript = event.text + renderPrompt(false) + return + } + + if (event.type === "transcript.final") { + activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId + liveTranscript = "" + finalTranscript = appendVoiceTranscript(finalTranscript, event.text) + renderPrompt(true) + if (state() === "finalizing") { + scheduleFinalizeClose(250) + } + return + } + + if (event.type === "session.closed") { + void cleanupSession({ revertPrompt: false, closeRemote: false }) + } + } + + function captureAnchor() { + const textarea = options.getTextarea() + const current = options.prompt() + const start = textarea ? textarea.selectionStart : current.length + const end = textarea ? textarea.selectionEnd : current.length + anchor = createPromptVoiceAnchor(current, start, end) + } + + function renderPrompt(persistDraft: boolean) { + const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(finalTranscript && liveTranscript.trim() ? " " : "") + const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted) + options.setPrompt(value, persistDraft ? 
undefined : { persistDraft: false }) + syncTextareaCursor(cursor) + } + + function syncTextareaCursor(cursor: number) { + const textarea = options.getTextarea() + if (!textarea) return + queueMicrotask(() => { + const next = options.getTextarea() + if (!next) return + next.focus() + next.setSelectionRange(cursor, cursor) + }) + } + + function scheduleFinalizeClose(delayMs: number) { + if (finalizeTimerId !== undefined) { + window.clearTimeout(finalizeTimerId) + } + finalizeTimerId = window.setTimeout(() => { + void cleanupSession({ revertPrompt: false, closeRemote: true }) + }, delayMs) + } + + async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) { + if (closing) return + closing = true + + if (finalizeTimerId !== undefined) { + window.clearTimeout(finalizeTimerId) + finalizeTimerId = undefined + } + + stopTimer() + + const sessionId = activeSessionId + activeSessionId = null + + eventSource?.close() + eventSource = null + + if (pcmStream) { + const stream = pcmStream + pcmStream = null + await stream.stop().catch(() => undefined) + } + + await audioQueue.catch(() => undefined) + audioQueue = Promise.resolve() + + if (cleanupOptions.closeRemote && sessionId) { + await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined) + } + + if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) { + finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript) + liveTranscript = "" + } + + if (cleanupOptions.revertPrompt) { + options.setPrompt(anchor.prompt) + } else if (finalTranscript.trim()) { + renderPrompt(true) + } + + resetTranscriptState() + setState("idle") + setElapsedMs(0) + closing = false + } + + function resetTranscriptState() { + finalTranscript = "" + liveTranscript = "" + activeLiveItemId = null + } + + function handleRealtimeError(error: unknown) { + if (closing) return + void cleanupSession({ revertPrompt: true, closeRemote: true }) + 
showAlertDialog(t("promptInput.voiceInput.error.connection"), { + title: t("promptInput.voiceInput.error.title"), + detail: error instanceof Error ? error.message : String(error), + variant: "error", + }) + } + + function startTimer() { + stopTimer() + timerId = window.setInterval(() => { + setElapsedMs(Date.now() - recordingStartedAt) + }, 250) + } + + function stopTimer() { + if (timerId !== undefined) { + window.clearInterval(timerId) + timerId = undefined + } + } + + return { + state, + elapsedMs, + canUseVoiceInput, + toggleRecording, + cancelRecording, + isRecording: () => state() === "connecting" || state() === "listening", + isTranscribing: () => state() === "finalizing", + buttonTitle: () => { + if (state() === "connecting") return t("promptInput.voiceInput.connecting.title") + if (state() === "listening") return t("promptInput.voiceInput.stop.title") + if (state() === "finalizing") return t("promptInput.voiceInput.transcribing.title") + return t("promptInput.voiceInput.start.title") + }, + } +} + +function detectLanguage(): string | undefined { + if (typeof navigator === "undefined") return undefined + const [language] = navigator.language.split("-") + return language?.trim() || undefined +} diff --git a/packages/ui/src/components/prompt-input/usePromptState.ts b/packages/ui/src/components/prompt-input/usePromptState.ts index 3b326f2c..a2e252a4 100644 --- a/packages/ui/src/components/prompt-input/usePromptState.ts +++ b/packages/ui/src/components/prompt-input/usePromptState.ts @@ -22,7 +22,7 @@ type HistorySelectOptions = { type PromptState = { prompt: Accessor - setPrompt: (value: string) => void + setPrompt: (value: string, options?: { persistDraft?: boolean }) => void clearPrompt: () => void draftLoadedNonce: Accessor @@ -48,11 +48,11 @@ export function usePromptState(options: PromptStateOptions): PromptState { const [historyDraft, setHistoryDraft] = createSignal(null) const [draftLoadedNonce, setDraftLoadedNonce] = createSignal(0) - const setPrompt = 
(value: string) => { + const setPrompt = (value: string, setOptions?: { persistDraft?: boolean }) => { setPromptInternal(value) // Persist drafts only when the user is at the "fresh" position (not browsing history). // This keeps the bottom-of-history draft stable even if the user edits recalled history entries. - if (historyIndex() === -1) { + if (setOptions?.persistDraft !== false && historyIndex() === -1) { setSessionDraftPrompt(options.instanceId(), options.sessionId(), value) } } diff --git a/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts b/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts index f05c4e05..0a13a6ff 100644 --- a/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts +++ b/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts @@ -1,242 +1,30 @@ -import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js" -import { showAlertDialog } from "../../stores/alerts" -import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech" -import { serverApi } from "../../lib/api-client" -import { useI18n } from "../../lib/i18n" +import type { Accessor } from "solid-js" +import { usePromptBufferedVoiceInput } from "./usePromptBufferedVoiceInput" +import { usePromptRealtimeVoiceInput } from "./usePromptRealtimeVoiceInput" interface UsePromptVoiceInputOptions { prompt: Accessor - setPrompt: (value: string) => void + setPrompt: (value: string, options?: { persistDraft?: boolean }) => void getTextarea: () => HTMLTextAreaElement | null enabled: Accessor disabled: Accessor + useRealtime: Accessor } -type VoiceInputState = "idle" | "recording" | "transcribing" - export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) { - const { t } = useI18n() - const [state, setState] = createSignal("idle") - const [elapsedMs, setElapsedMs] = createSignal(0) + const buffered = usePromptBufferedVoiceInput(options) + const realtime = usePromptRealtimeVoiceInput(options) - let 
mediaRecorder: MediaRecorder | null = null - let mediaStream: MediaStream | null = null - let timerId: number | undefined - let shouldTranscribe = true - let recordedChunks: Blob[] = [] - let recordingStartedAt = 0 - - createEffect(() => { - void loadSpeechCapabilities() - }) - - onCleanup(() => { - cleanupMedia(false) - }) - - const isSupported = () => { - if (typeof window === "undefined") return false - return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia) - } - - const canUseVoiceInput = () => { - const capabilities = speechCapabilities() - return Boolean( - options.enabled() && - isSupported() && - capabilities?.available && - capabilities?.configured && - capabilities?.supportsStt, - ) - } - - async function toggleRecording(): Promise { - if (state() === "recording") { - stopRecording() - return - } - - if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return - - try { - await startRecording() - } catch (error) { - cleanupMedia(false) - showAlertDialog(t("promptInput.voiceInput.error.permission"), { - title: t("promptInput.voiceInput.error.title"), - detail: error instanceof Error ? 
error.message : String(error), - variant: "error", - }) - } - } - - function stopRecording() { - if (!mediaRecorder || state() !== "recording") return - shouldTranscribe = true - mediaRecorder.stop() - setState("transcribing") - stopTimer() - } - - function cancelRecording() { - if (!mediaRecorder || state() !== "recording") return - shouldTranscribe = false - mediaRecorder.stop() - cleanupMedia(false) - } - - async function startRecording() { - if (!isSupported()) { - showAlertDialog(t("promptInput.voiceInput.error.unsupported"), { - title: t("promptInput.voiceInput.error.title"), - variant: "error", - }) - return - } - - recordedChunks = [] - shouldTranscribe = true - mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true }) - mediaRecorder = createRecorder(mediaStream) - - mediaRecorder.addEventListener("dataavailable", (event) => { - if (event.data.size > 0) { - recordedChunks.push(event.data) - } - }) - - mediaRecorder.addEventListener("stop", () => { - void finalizeRecording() - }) - - recordingStartedAt = Date.now() - setElapsedMs(0) - setState("recording") - startTimer() - mediaRecorder.start() - } - - async function finalizeRecording() { - const recorder = mediaRecorder - const stream = mediaStream - mediaRecorder = null - mediaStream = null - - if (!shouldTranscribe || recordedChunks.length === 0) { - recordedChunks = [] - stopTracks(stream) - setState("idle") - setElapsedMs(0) - return - } - - const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm" - - try { - const audioBlob = new Blob(recordedChunks, { type: mimeType }) - const transcription = await serverApi.transcribeAudio({ - audioBase64: await blobToBase64(audioBlob), - mimeType, - }) - if (transcription.text.trim()) { - insertTranscript(transcription.text.trim()) - } - } catch (error) { - showAlertDialog(t("promptInput.voiceInput.error.transcribe"), { - title: t("promptInput.voiceInput.error.title"), - detail: error instanceof Error ? 
error.message : String(error), - variant: "error", - }) - } finally { - recordedChunks = [] - stopTracks(stream) - setState("idle") - setElapsedMs(0) - } - } - - function insertTranscript(text: string) { - const current = options.prompt() - const textarea = options.getTextarea() - const start = textarea ? textarea.selectionStart : current.length - const end = textarea ? textarea.selectionEnd : current.length - const before = current.slice(0, start) - const after = current.slice(end) - const prefix = before.length > 0 && !/\s$/.test(before) ? " " : "" - const suffix = after.length > 0 && !/^\s/.test(after) ? " " : "" - const nextValue = `${before}${prefix}${text}${suffix}${after}` - const cursor = before.length + prefix.length + text.length - - options.setPrompt(nextValue) - if (textarea) { - setTimeout(() => { - textarea.focus() - textarea.setSelectionRange(cursor, cursor) - }, 0) - } - } - - function cleanupMedia(resetState = true) { - stopTimer() - if (mediaRecorder && mediaRecorder.state !== "inactive") { - mediaRecorder.stop() - } - mediaRecorder = null - stopTracks(mediaStream) - mediaStream = null - recordedChunks = [] - if (resetState) { - setState("idle") - setElapsedMs(0) - } - } - - function startTimer() { - stopTimer() - timerId = window.setInterval(() => { - setElapsedMs(Date.now() - recordingStartedAt) - }, 250) - } - - function stopTimer() { - if (timerId !== undefined) { - window.clearInterval(timerId) - timerId = undefined - } - } + const active = () => (options.useRealtime() ? 
realtime : buffered) return { - state, - elapsedMs, - canUseVoiceInput, - toggleRecording, - cancelRecording, - isRecording: () => state() === "recording", - isTranscribing: () => state() === "transcribing", - buttonTitle: () => { - if (state() === "recording") return t("promptInput.voiceInput.stop.title") - if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title") - return t("promptInput.voiceInput.start.title") - }, + state: () => active().state(), + elapsedMs: () => active().elapsedMs(), + canUseVoiceInput: () => active().canUseVoiceInput(), + toggleRecording: () => active().toggleRecording(), + cancelRecording: () => active().cancelRecording(), + isRecording: () => active().isRecording(), + isTranscribing: () => active().isTranscribing(), + buttonTitle: () => active().buttonTitle(), } } - -function createRecorder(stream: MediaStream): MediaRecorder { - const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"] - const supported = candidates.find((candidate) => typeof MediaRecorder.isTypeSupported !== "function" || MediaRecorder.isTypeSupported(candidate)) - return supported ? 
new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream) -} - -function stopTracks(stream: MediaStream | null) { - stream?.getTracks().forEach((track) => track.stop()) -} - -async function blobToBase64(blob: Blob): Promise { - const buffer = await blob.arrayBuffer() - const bytes = new Uint8Array(buffer) - let binary = "" - for (const byte of bytes) { - binary += String.fromCharCode(byte) - } - return btoa(binary) -} diff --git a/packages/ui/src/components/settings/speech-settings-card.tsx b/packages/ui/src/components/settings/speech-settings-card.tsx index 464fb4ba..a1ca1169 100644 --- a/packages/ui/src/components/settings/speech-settings-card.tsx +++ b/packages/ui/src/components/settings/speech-settings-card.tsx @@ -10,6 +10,8 @@ const log = getLogger("actions") type DraftFields = { apiKey: string baseUrl: string + useRealtime: boolean + realtimeModel: string sttModel: string ttsModel: string ttsVoice: string @@ -19,6 +21,8 @@ function createDraftFields(speech: SpeechSettings): DraftFields { return { apiKey: speech.apiKey ?? "", baseUrl: speech.baseUrl ?? "", + useRealtime: speech.useRealtime, + realtimeModel: speech.realtimeModel, sttModel: speech.sttModel, ttsModel: speech.ttsModel, ttsVoice: speech.ttsVoice, @@ -26,7 +30,7 @@ function createDraftFields(speech: SpeechSettings): DraftFields { } function isDraftEqual(a: DraftFields, b: DraftFields): boolean { - return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice + return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.useRealtime === b.useRealtime && a.realtimeModel === b.realtimeModel && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice } export const SpeechSettingsCard: Component = () => { @@ -57,7 +61,7 @@ export const SpeechSettingsCard: Component = () => { return speechCapabilities()?.configured ? 
t("settings.speech.status.configured") : t("settings.speech.status.missing") } - const updateDraft = (key: keyof DraftFields, value: string) => { + const updateDraft = (key: K, value: DraftFields[K]) => { setSaveStatus("idle") setDrafts((current) => ({ ...current, [key]: value })) } @@ -65,12 +69,14 @@ export const SpeechSettingsCard: Component = () => { const isDirty = createMemo(() => { const speech = serverSettings().speech const current = drafts() - return ( - (current.apiKey || "") !== (speech.apiKey || "") || - (current.baseUrl || "") !== (speech.baseUrl || "") || - current.sttModel !== speech.sttModel || - current.ttsModel !== speech.ttsModel || - current.ttsVoice !== speech.ttsVoice + return ( + (current.apiKey || "") !== (speech.apiKey || "") || + (current.baseUrl || "") !== (speech.baseUrl || "") || + current.useRealtime !== speech.useRealtime || + current.realtimeModel !== speech.realtimeModel || + current.sttModel !== speech.sttModel || + current.ttsModel !== speech.ttsModel || + current.ttsVoice !== speech.ttsVoice ) }) @@ -90,6 +96,8 @@ export const SpeechSettingsCard: Component = () => { await updateSpeechSettings({ apiKey: current.apiKey.trim() || undefined, baseUrl: current.baseUrl.trim() || undefined, + useRealtime: current.useRealtime, + realtimeModel: current.realtimeModel.trim() || undefined, sttModel: current.sttModel.trim() || undefined, ttsModel: current.ttsModel.trim() || undefined, ttsVoice: current.ttsVoice.trim() || undefined, @@ -98,6 +106,8 @@ export const SpeechSettingsCard: Component = () => { setDrafts({ apiKey: current.apiKey.trim(), baseUrl: current.baseUrl.trim(), + useRealtime: current.useRealtime, + realtimeModel: current.realtimeModel.trim() || serverSettings().speech.realtimeModel, sttModel: current.sttModel.trim() || serverSettings().speech.sttModel, ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel, ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice, @@ -159,6 +169,27 @@ export const 
SpeechSettingsCard: Component = () => { onInput={(value) => updateDraft("baseUrl", value)} placeholder={t("settings.speech.baseUrl.placeholder")} /> +
+
+
{t("settings.speech.realtime.title")}
+
{t("settings.speech.realtime.subtitle")}
+
+ +
+ updateDraft("realtimeModel", value)} + placeholder={t("settings.speech.realtimeModel.placeholder")} + /> { return request("/api/speech/capabilities") }, + createRealtimeSpeechSession(payload?: { language?: string; prompt?: string }): Promise { + return request("/api/speech/realtime/sessions", { + method: "POST", + body: JSON.stringify(payload ?? {}), + }) + }, + appendRealtimeSpeechAudio(sessionId: string, payload: { audioBase64: string }): Promise { + return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/audio`, { + method: "POST", + body: JSON.stringify(payload), + }) + }, + finalizeRealtimeSpeechSession(sessionId: string): Promise { + return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/finalize`, { + method: "POST", + body: JSON.stringify({}), + }) + }, + closeRealtimeSpeechSession(sessionId: string): Promise { + return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}`, { + method: "DELETE", + }) + }, transcribeAudio(payload: { audioBase64: string mimeType: string @@ -332,21 +361,34 @@ export const serverApi = { }, connectEvents(onEvent: (event: WorkspaceEventPayload) => void, onError?: () => void) { sseLogger.info(`Connecting to ${EVENTS_URL}`) - const source = new EventSource(EVENTS_URL, { withCredentials: true } as any) - source.onmessage = (event) => { - try { - const payload = JSON.parse(event.data) as WorkspaceEventPayload - onEvent(payload) - } catch (error) { - sseLogger.error("Failed to parse event", error) - } - } - source.onerror = () => { - sseLogger.warn("EventSource error, closing stream") - onError?.() - } - return source + return connectEventSource(EVENTS_URL, onEvent, onError) + }, + connectRealtimeSpeechEvents( + sessionId: string, + onEvent: (event: SpeechRealtimeEvent) => void, + onError?: () => void, + ) { + const url = buildRealtimeSpeechEventsUrl(sessionId) + sseLogger.info(`Connecting to ${url}`) + return connectEventSource(url, onEvent, onError) }, } -export 
type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType } +function connectEventSource(url: string, onEvent: (event: T) => void, onError?: () => void) { + const source = new EventSource(url, { withCredentials: true } as any) + source.onmessage = (event) => { + try { + const payload = JSON.parse(event.data) as T + onEvent(payload) + } catch (error) { + sseLogger.error("Failed to parse event", error) + } + } + source.onerror = () => { + sseLogger.warn("EventSource error, closing stream") + onError?.() + } + return source +} + +export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType, SpeechRealtimeEvent } diff --git a/packages/ui/src/lib/i18n/messages/en/messaging.ts b/packages/ui/src/lib/i18n/messages/en/messaging.ts index 7b8a574a..08cb1848 100644 --- a/packages/ui/src/lib/i18n/messages/en/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/en/messaging.ts @@ -140,8 +140,10 @@ export const messagingMessages = { "promptInput.send.errorTitle": "Send failed", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/en/settings.ts b/packages/ui/src/lib/i18n/messages/en/settings.ts index 318f1dcb..eb068eac 100644 --- a/packages/ui/src/lib/i18n/messages/en/settings.ts +++ b/packages/ui/src/lib/i18n/messages/en/settings.ts @@ 
-156,13 +156,18 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.realtime.title": "Realtime dictation", + "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", + "settings.speech.realtimeModel.title": "Realtime model", + "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", + "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/es/messaging.ts b/packages/ui/src/lib/i18n/messages/es/messaging.ts index 850ab6b4..23dcd94e 100644 --- a/packages/ui/src/lib/i18n/messages/es/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/es/messaging.ts @@ -142,8 +142,10 @@ export const messagingMessages = { 
"promptInput.send.errorTitle": "Error al enviar", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/es/settings.ts b/packages/ui/src/lib/i18n/messages/es/settings.ts index 1c39e405..48fb8e4e 100644 --- a/packages/ui/src/lib/i18n/messages/es/settings.ts +++ b/packages/ui/src/lib/i18n/messages/es/settings.ts @@ -156,13 +156,18 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.realtime.title": "Realtime dictation", + "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", + "settings.speech.realtimeModel.title": "Realtime model", + "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. 
This is separate from the speech-to-text transcription model.", + "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/fr/messaging.ts b/packages/ui/src/lib/i18n/messages/fr/messaging.ts index 0a742efa..2347f740 100644 --- a/packages/ui/src/lib/i18n/messages/fr/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/fr/messaging.ts @@ -142,8 +142,10 @@ export const messagingMessages = { "promptInput.send.errorTitle": "Échec de l'envoi", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", 
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/fr/settings.ts b/packages/ui/src/lib/i18n/messages/fr/settings.ts index 710d2c77..e692e647 100644 --- a/packages/ui/src/lib/i18n/messages/fr/settings.ts +++ b/packages/ui/src/lib/i18n/messages/fr/settings.ts @@ -156,13 +156,18 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.realtime.title": "Realtime dictation", + "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", + "settings.speech.realtimeModel.title": "Realtime model", + "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", + "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", 
"settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/ja/messaging.ts b/packages/ui/src/lib/i18n/messages/ja/messaging.ts index 60cd8881..46dbbac4 100644 --- a/packages/ui/src/lib/i18n/messages/ja/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/ja/messaging.ts @@ -142,8 +142,10 @@ export const messagingMessages = { "promptInput.send.errorTitle": "送信に失敗", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/ja/settings.ts b/packages/ui/src/lib/i18n/messages/ja/settings.ts index 8feaa819..1cf7a834 100644 --- a/packages/ui/src/lib/i18n/messages/ja/settings.ts +++ b/packages/ui/src/lib/i18n/messages/ja/settings.ts @@ -156,13 +156,18 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.realtime.title": "Realtime dictation", + "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", + "settings.speech.realtimeModel.title": "Realtime model", + "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. 
This is separate from the speech-to-text transcription model.", + "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/ru/messaging.ts b/packages/ui/src/lib/i18n/messages/ru/messaging.ts index a833b25e..109d5e2e 100644 --- a/packages/ui/src/lib/i18n/messages/ru/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/ru/messaging.ts @@ -142,8 +142,10 @@ export const messagingMessages = { "promptInput.send.errorTitle": "Не удалось отправить", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", 
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/ru/settings.ts b/packages/ui/src/lib/i18n/messages/ru/settings.ts index 494ff447..6694eaa0 100644 --- a/packages/ui/src/lib/i18n/messages/ru/settings.ts +++ b/packages/ui/src/lib/i18n/messages/ru/settings.ts @@ -156,13 +156,18 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.realtime.title": "Realtime dictation", + "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", + "settings.speech.realtimeModel.title": "Realtime model", + "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", + "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", 
"settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts b/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts index aeabd954..13d36824 100644 --- a/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts @@ -142,8 +142,10 @@ export const messagingMessages = { "promptInput.send.errorTitle": "发送失败", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", + "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", + "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts index aff1063f..68b3a11c 100644 --- a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts +++ b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts @@ -156,13 +156,18 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", + "settings.speech.realtime.title": "Realtime dictation", + "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. 
Disable this if your speech server only supports standard transcription uploads.", + "settings.speech.realtimeModel.title": "Realtime model", + "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", + "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/stores/preferences.tsx b/packages/ui/src/stores/preferences.tsx index a387b754..6ea73763 100644 --- a/packages/ui/src/stores/preferences.tsx +++ b/packages/ui/src/stores/preferences.tsx @@ -34,6 +34,8 @@ export interface SpeechSettings { provider: SpeechProviderPreference apiKey?: string baseUrl?: string + useRealtime: boolean + realtimeModel: string sttModel: string ttsModel: string ttsVoice: string @@ -136,6 +138,8 @@ const defaultUiSettings: UiSettings = { const defaultSpeechSettings: SpeechSettings = { provider: "openai-compatible", + useRealtime: true, + realtimeModel: "gpt-realtime", sttModel: "gpt-4o-mini-transcribe", ttsModel: "gpt-4o-mini-tts", ttsVoice: "alloy", @@ -184,6 +188,11 @@ 
function normalizeSpeechSettings(input?: Partial | null): Speech provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider, apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined, baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined, + useRealtime: sanitized.useRealtime ?? defaultSpeechSettings.useRealtime, + realtimeModel: + typeof sanitized.realtimeModel === "string" && sanitized.realtimeModel.trim() + ? sanitized.realtimeModel.trim() + : defaultSpeechSettings.realtimeModel, sttModel: typeof sanitized.sttModel === "string" && sanitized.sttModel.trim() ? sanitized.sttModel.trim()