diff --git a/packages/opencode-config/package.json b/packages/opencode-config/package.json index 61266886..82ed1e6d 100644 --- a/packages/opencode-config/package.json +++ b/packages/opencode-config/package.json @@ -4,6 +4,6 @@ "private": true, "license": "MIT", "dependencies": { - "@opencode-ai/plugin": "1.2.24" + "@opencode-ai/plugin": "1.2.14" } } \ No newline at end of file diff --git a/packages/server/src/api-types.ts b/packages/server/src/api-types.ts index c1fbefb3..8eb7c928 100644 --- a/packages/server/src/api-types.ts +++ b/packages/server/src/api-types.ts @@ -219,35 +219,12 @@ export interface SpeechCapabilitiesResponse { provider: string supportsStt: boolean supportsTts: boolean - supportsRealtimeTranscription?: boolean - realtimeInputFormat?: { - type: "audio/pcm" - rate: 24000 - } - realtimeModel?: string baseUrl?: string sttModel: string ttsModel: string ttsVoice: string } -export interface SpeechRealtimeSessionResponse { - sessionId: string - inputFormat: { - type: "audio/pcm" - rate: 24000 - } -} - -export type SpeechRealtimeEvent = - | { type: "session.ready"; sessionId: string } - | { type: "session.error"; message: string } - | { type: "input.speech_started"; itemId?: string } - | { type: "input.speech_stopped"; itemId?: string } - | { type: "transcript.partial"; itemId: string; text: string } - | { type: "transcript.final"; itemId: string; previousItemId?: string; text: string } - | { type: "session.closed"; reason?: string } - export interface SpeechTranscriptionResponse { text: string language?: string diff --git a/packages/server/src/server/http-server.ts b/packages/server/src/server/http-server.ts index e8252e5b..3f558cb8 100644 --- a/packages/server/src/server/http-server.ts +++ b/packages/server/src/server/http-server.ts @@ -255,7 +255,7 @@ export function createHttpServer(deps: HttpServerDeps) { eventBus: deps.eventBus, workspaceManager: deps.workspaceManager, }) - registerSpeechRoutes(app, { speechService: deps.speechService, logger: apiLogger }) + registerSpeechRoutes(app, { speechService: deps.speechService }) registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger }) registerBackgroundProcessRoutes(app, { backgroundProcessManager }) registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger }) diff --git a/packages/server/src/server/routes/speech.ts b/packages/server/src/server/routes/speech.ts index e9421092..3eab4ad6 100644 --- a/packages/server/src/server/routes/speech.ts +++ b/packages/server/src/server/routes/speech.ts @@ -1,12 +1,9 @@ import type { FastifyInstance } from "fastify" import { z } from "zod" import type { SpeechService } from "../../speech/service" -import type { Logger } from "../../logger" -import { SpeechRealtimeSessionManager } from "../../speech/realtime-session-manager" interface RouteDeps { speechService: SpeechService - logger: Logger } const TranscribeBodySchema = z.object({ @@ -22,99 +19,9 @@ const SynthesizeBodySchema = z.object({ format: z.enum(["mp3", "wav", "opus"]).optional(), }) -const RealtimeSessionBodySchema = z.object({ - language: z.string().trim().min(1).optional(), - prompt: z.string().trim().min(1).optional(), -}) - -const RealtimeAudioBodySchema = z.object({ - audioBase64: z.string().min(1, "Audio payload is required"), -}) - export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) { - const realtimeSessions = new SpeechRealtimeSessionManager( - deps.speechService, - deps.logger.child({ component: "speech-realtime" }), - ) - - app.addHook("onClose", async () => { - await realtimeSessions.dispose() - }) - app.get("/api/speech/capabilities", async () => deps.speechService.getCapabilities()) - app.post("/api/speech/realtime/sessions", async (request, reply) => { - try { - const body = RealtimeSessionBodySchema.parse(request.body ?? {}) - return await realtimeSessions.createSession(body) - } catch (error) { - request.log.error({ err: error }, "Failed to create realtime speech session") - reply.code(400) - return { error: error instanceof Error ? error.message : "Failed to create realtime speech session" } - } - }) - - app.get<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/events", (request, reply) => { - try { - reply.raw.setHeader("Content-Type", "text/event-stream") - reply.raw.setHeader("Cache-Control", "no-cache") - reply.raw.setHeader("Connection", "keep-alive") - reply.raw.flushHeaders?.() - reply.hijack() - - const unsubscribe = realtimeSessions.subscribe(request.params.sessionId, (event) => { - reply.raw.write(`data: ${JSON.stringify(event)}\n\n`) - }) - - const heartbeat = setInterval(() => { - reply.raw.write(`:hb ${Date.now()}\n\n`) - }, 15000) - - const close = () => { - clearInterval(heartbeat) - unsubscribe() - reply.raw.end?.() - } - - request.raw.on("close", close) - request.raw.on("error", close) - } catch (error) { - request.log.error({ err: error }, "Failed to open realtime speech event stream") - reply.code(404).send({ error: error instanceof Error ? error.message : "Realtime speech session not found" }) - } - }) - - app.post<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/audio", async (request, reply) => { - try { - const body = RealtimeAudioBodySchema.parse(request.body ?? {}) - realtimeSessions.appendAudio(request.params.sessionId, body.audioBase64) - reply.code(204) - return undefined - } catch (error) { - request.log.error({ err: error }, "Failed to append realtime speech audio") - reply.code(400) - return { error: error instanceof Error ? error.message : "Failed to append realtime speech audio" } - } - }) - - app.post<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/finalize", async (request, reply) => { - try { - realtimeSessions.finalize(request.params.sessionId) - reply.code(204) - return undefined - } catch (error) { - request.log.error({ err: error }, "Failed to finalize realtime speech session") - reply.code(400) - return { error: error instanceof Error ? error.message : "Failed to finalize realtime speech session" } - } - }) - - app.delete<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId", async (request, reply) => { - realtimeSessions.closeSession(request.params.sessionId, "client_closed") - reply.code(204) - return undefined - }) - app.post("/api/speech/transcribe", async (request, reply) => { try { const body = TranscribeBodySchema.parse(request.body ?? {}) diff --git a/packages/server/src/speech/providers/openai-compatible.ts b/packages/server/src/speech/providers/openai-compatible.ts index 436bfc36..4c426d72 100644 --- a/packages/server/src/speech/providers/openai-compatible.ts +++ b/packages/server/src/speech/providers/openai-compatible.ts @@ -20,13 +20,7 @@ export class OpenAICompatibleSpeechProvider { provider: settings.provider, supportsStt: true, supportsTts: true, - supportsRealtimeTranscription: true, - realtimeInputFormat: { - type: "audio/pcm" as const, - rate: 24000 as const, - }, baseUrl: settings.baseUrl, - realtimeModel: settings.realtimeModel, sttModel: settings.sttModel, ttsModel: settings.ttsModel, ttsVoice: settings.ttsVoice, diff --git a/packages/server/src/speech/realtime-session-manager.ts b/packages/server/src/speech/realtime-session-manager.ts deleted file mode 100644 index 99aa395a..00000000 --- a/packages/server/src/speech/realtime-session-manager.ts +++ /dev/null @@ -1,525 +0,0 @@ -import { randomUUID } from "node:crypto" -import { WebSocket } from "undici" -import type { SpeechRealtimeEvent, SpeechRealtimeSessionResponse } from "../api-types" -import type { Logger } from "../logger" -import type { SpeechService } from "./service" - -interface CreateRealtimeSessionOptions { - language?: string - prompt?: string -} - -interface TranscriptItemState { - previousItemId?: string - partialText: string - finalText?: string -} - -interface ManagedRealtimeSession { - id: string - ws: WebSocket - subscribers: Set<(event: SpeechRealtimeEvent) => void> - items: Map - orderedItemIds: string[] - nextFinalIndex: number - createdAt: number - lastActivityAt: number - closed: boolean -} - -const OPEN_TIMEOUT_MS = 10_000 -const IDLE_TIMEOUT_MS = 2 * 60 * 1000 -const SWEEP_INTERVAL_MS = 30_000 - -export class SpeechRealtimeSessionManager { - private readonly sessions = new Map() - private readonly sweepTimer: NodeJS.Timeout - - constructor( - private readonly speechService: SpeechService, - private readonly logger: Logger, - ) { - this.sweepTimer = setInterval(() => { - this.sweepIdleSessions() - }, SWEEP_INTERVAL_MS) - this.sweepTimer.unref?.() - } - - async createSession(options: CreateRealtimeSessionOptions = {}): Promise { - const config = this.speechService.getRealtimeTranscriptionConfig() - const id = randomUUID() - const wsUrl = buildRealtimeWebSocketUrl(config.baseUrl, config.realtimeModel) - const sessionUpdateEvent = buildSessionUpdateEvent(config, options) - this.logger.info( - { - sessionId: id, - wsUrl, - realtimeModel: config.realtimeModel, - sttModel: config.sttModel, - payload: sessionUpdateEvent, - }, - "Opening realtime speech websocket", - ) - const ws = new WebSocket(wsUrl, { - headers: { - Authorization: `Bearer ${config.apiKey}`, - ...(requiresRealtimeBetaHeader(config.baseUrl) ? { "OpenAI-Beta": "realtime=v1" } : {}), - }, - }) - - const session: ManagedRealtimeSession = { - id, - ws, - subscribers: new Set(), - items: new Map(), - orderedItemIds: [], - nextFinalIndex: 0, - createdAt: Date.now(), - lastActivityAt: Date.now(), - closed: false, - } - - this.sessions.set(id, session) - this.attachSocketHandlers(session) - - try { - await waitForSocketOpen(ws) - this.send(session, sessionUpdateEvent) - return { - sessionId: id, - inputFormat: config.inputFormat, - } - } catch (error) { - this.logger.error({ sessionId: id, err: error }, "Failed to create realtime speech session") - this.closeSession(id, error instanceof Error ? error.message : "Failed to create realtime speech session") - throw error - } - } - - subscribe(sessionId: string, send: (event: SpeechRealtimeEvent) => void): () => void { - const session = this.getSession(sessionId) - if (!session) { - throw new Error("Realtime speech session not found") - } - - session.subscribers.add(send) - this.touch(session) - send({ type: "session.ready", sessionId }) - - return () => { - session.subscribers.delete(send) - this.touch(session) - } - } - - appendAudio(sessionId: string, audioBase64: string): void { - const session = this.requireSession(sessionId) - this.send(session, { - type: "input_audio_buffer.append", - audio: audioBase64, - }) - } - - finalize(sessionId: string): void { - const session = this.requireSession(sessionId) - this.send(session, { - type: "input_audio_buffer.commit", - }) - } - - closeSession(sessionId: string, reason?: string): void { - const session = this.sessions.get(sessionId) - if (!session || session.closed) return - - session.closed = true - this.sessions.delete(sessionId) - this.emit(session, { type: "session.closed", reason }) - - try { - if (session.ws.readyState === WebSocket.OPEN || session.ws.readyState === WebSocket.CONNECTING) { - session.ws.close(1000, reason?.slice(0, 120) ?? "client_closed") - } - } catch (error) { - this.logger.warn({ sessionId, err: error }, "Failed to close realtime speech websocket") - } - - session.subscribers.clear() - } - - async dispose(): Promise { - clearInterval(this.sweepTimer) - for (const sessionId of Array.from(this.sessions.keys())) { - this.closeSession(sessionId, "server_shutdown") - } - } - - private attachSocketHandlers(session: ManagedRealtimeSession) { - session.ws.addEventListener("message", (event) => { - void this.handleSocketMessage(session, event.data) - }) - - session.ws.addEventListener("error", (event) => { - const message = event.error instanceof Error ? event.error.message : event.message || "Realtime speech connection failed" - this.logger.warn({ sessionId: session.id, err: event.error ?? event.message }, "Realtime speech websocket error") - this.emit(session, { type: "session.error", message }) - }) - - session.ws.addEventListener("close", (event) => { - const reason = event.reason || (event.wasClean ? "socket_closed" : "socket_terminated") - this.logger.info( - { - sessionId: session.id, - code: event.code, - reason, - orderedItemIds: session.orderedItemIds, - pendingItems: Array.from(session.items.entries()).map(([itemId, item]) => ({ - itemId, - previousItemId: item.previousItemId, - partialText: item.partialText, - finalText: item.finalText, - })), - }, - "Realtime speech websocket closed", - ) - this.closeSession(session.id, reason) - }) - } - - private async handleSocketMessage(session: ManagedRealtimeSession, raw: unknown) { - if (session.closed) return - - try { - const payload = await toText(raw) - const event = JSON.parse(payload) as Record - this.touch(session) - this.handleServerEvent(session, event) - } catch (error) { - this.logger.warn({ sessionId: session.id, err: error }, "Failed to process realtime speech event") - } - } - - private handleServerEvent(session: ManagedRealtimeSession, event: Record) { - const type = typeof event.type === "string" ? event.type : "" - if (!type) return - - this.logger.debug({ sessionId: session.id, type }, "Realtime speech event received") - if (type.startsWith("conversation.item") || type.startsWith("input_audio_buffer") || type.startsWith("session.")) { - this.logger.debug({ sessionId: session.id, event }, "Realtime speech event payload") - } - - if (type === "error") { - const message = extractErrorMessage(event) - this.logger.warn({ sessionId: session.id, event }, "Realtime speech provider error event") - this.emit(session, { type: "session.error", message }) - return - } - - if (type === "input_audio_buffer.speech_started") { - this.emit(session, { - type: "input.speech_started", - itemId: readString(event.item_id), - }) - return - } - - if (type === "input_audio_buffer.speech_stopped") { - this.emit(session, { - type: "input.speech_stopped", - itemId: readString(event.item_id), - }) - return - } - - if (type === "input_audio_buffer.committed") { - const itemId = readString(event.item_id) - if (!itemId) return - const item = this.getOrCreateItem(session, itemId) - item.previousItemId = readString(event.previous_item_id) - if (!session.orderedItemIds.includes(itemId)) { - session.orderedItemIds.push(itemId) - } - this.flushFinalizedItems(session) - return - } - - if (type === "conversation.item.created" || type === "conversation.item.added" || type === "conversation.item.done") { - this.handleConversationItemEvent(session, event) - return - } - - if (type === "conversation.item.input_audio_transcription.delta") { - const itemId = readString(event.item_id) - const delta = readString(event.delta) - if (!itemId || !delta) return - const item = this.getOrCreateItem(session, itemId) - item.partialText += delta - this.emit(session, { - type: "transcript.partial", - itemId, - text: item.partialText, - }) - return - } - - if (type === "conversation.item.input_audio_transcription.completed") { - const itemId = readString(event.item_id) - if (!itemId) return - const item = this.getOrCreateItem(session, itemId) - item.finalText = readString(event.transcript) ?? item.partialText - this.flushFinalizedItems(session) - } - } - - private handleConversationItemEvent(session: ManagedRealtimeSession, event: Record) { - const itemRecord = asRecord(event.item) - if (!itemRecord) return - - const itemId = readString(itemRecord.id) ?? readString(event.item_id) - if (!itemId) return - - const item = this.getOrCreateItem(session, itemId) - item.previousItemId = readString(event.previous_item_id) ?? item.previousItemId - if (!session.orderedItemIds.includes(itemId)) { - session.orderedItemIds.push(itemId) - } - - const transcript = extractTranscriptFromConversationItem(itemRecord) - if (transcript) { - item.finalText = transcript - this.flushFinalizedItems(session) - } - } - - private flushFinalizedItems(session: ManagedRealtimeSession) { - while (session.nextFinalIndex < session.orderedItemIds.length) { - const itemId = session.orderedItemIds[session.nextFinalIndex] - const item = session.items.get(itemId) - if (!item || item.finalText === undefined) { - return - } - - this.emit(session, { - type: "transcript.final", - itemId, - previousItemId: item.previousItemId, - text: item.finalText, - }) - session.nextFinalIndex += 1 - } - } - - private getOrCreateItem(session: ManagedRealtimeSession, itemId: string): TranscriptItemState { - const existing = session.items.get(itemId) - if (existing) return existing - const created: TranscriptItemState = { partialText: "" } - session.items.set(itemId, created) - return created - } - - private emit(session: ManagedRealtimeSession, event: SpeechRealtimeEvent) { - for (const subscriber of session.subscribers) { - try { - subscriber(event) - } catch (error) { - this.logger.warn({ sessionId: session.id, err: error, type: event.type }, "Failed to emit realtime speech event") - } - } - } - - private requireSession(sessionId: string): ManagedRealtimeSession { - const session = this.getSession(sessionId) - if (!session) { - throw new Error("Realtime speech session not found") - } - return session - } - - private getSession(sessionId: string): ManagedRealtimeSession | null { - const session = this.sessions.get(sessionId) ?? null - if (!session || session.closed) return null - return session - } - - private send(session: ManagedRealtimeSession, event: Record) { - if (session.closed || session.ws.readyState !== WebSocket.OPEN) { - throw new Error("Realtime speech session is not connected") - } - - session.ws.send(JSON.stringify(event)) - this.touch(session) - } - - private touch(session: ManagedRealtimeSession) { - session.lastActivityAt = Date.now() - } - - private sweepIdleSessions() { - const now = Date.now() - for (const [sessionId, session] of this.sessions) { - if (session.closed) continue - if (now - session.lastActivityAt < IDLE_TIMEOUT_MS) continue - this.logger.info({ sessionId }, "Closing idle realtime speech session") - this.closeSession(sessionId, "idle_timeout") - } - } -} - -function buildRealtimeWebSocketUrl(baseUrl: string | undefined, model: string): string { - const target = new URL(baseUrl?.trim() || "https://api.openai.com/v1") - target.protocol = target.protocol === "http:" ? "ws:" : "wss:" - const normalizedPath = target.pathname.replace(/\/+$/, "") - target.pathname = normalizedPath.endsWith("/realtime") ? normalizedPath : `${normalizedPath}/realtime` - target.hash = "" - if (!target.searchParams.has("model")) { - target.searchParams.set("model", model) - } - return target.toString() -} - -function requiresRealtimeBetaHeader(baseUrl?: string): boolean { - if (!baseUrl || !baseUrl.trim()) return false - try { - return new URL(baseUrl).hostname.toLowerCase() !== "api.openai.com" - } catch { - return false - } -} - -function buildSessionUpdateEvent( - config: { baseUrl?: string; sttModel: string; realtimeModel: string; inputFormat: { type: "audio/pcm"; rate: 24000 } }, - options: CreateRealtimeSessionOptions, -): Record { - if (requiresRealtimeBetaHeader(config.baseUrl)) { - return { - type: "session.update", - session: { - input_audio_transcription: { - model: config.sttModel, - ...(options.language ? { language: options.language } : {}), - ...(options.prompt ? { prompt: options.prompt } : {}), - }, - turn_detection: { - type: "server_vad", - threshold: 0.45, - prefix_padding_ms: 250, - silence_duration_ms: 400, - }, - }, - } - } - - return { - type: "session.update", - session: { - type: "transcription", - audio: { - input: { - format: config.inputFormat, - noise_reduction: { type: "near_field" }, - transcription: { - model: config.sttModel, - ...(options.language ? { language: options.language } : {}), - ...(options.prompt ? { prompt: options.prompt } : {}), - }, - turn_detection: { - type: "server_vad", - threshold: 0.45, - prefix_padding_ms: 250, - silence_duration_ms: 400, - }, - }, - }, - }, - } -} - -function waitForSocketOpen(ws: WebSocket): Promise { - if (ws.readyState === WebSocket.OPEN) { - return Promise.resolve() - } - - return new Promise((resolve, reject) => { - let settled = false - const timeout = setTimeout(() => { - cleanup() - reject(new Error("Timed out connecting to realtime speech provider")) - }, OPEN_TIMEOUT_MS) - - const cleanup = () => { - clearTimeout(timeout) - ws.removeEventListener("open", handleOpen) - ws.removeEventListener("error", handleError) - ws.removeEventListener("close", handleClose) - } - - const finish = (callback: () => void) => { - if (settled) return - settled = true - cleanup() - callback() - } - - const handleOpen = () => { - finish(resolve) - } - - const handleError = (event: { error?: unknown; message?: string }) => { - finish(() => reject(event.error instanceof Error ? event.error : new Error(event.message || "Failed to connect"))) - } - - const handleClose = () => { - finish(() => reject(new Error("Realtime speech connection closed before initialization"))) - } - - ws.addEventListener("open", handleOpen) - ws.addEventListener("error", handleError as any) - ws.addEventListener("close", handleClose) - }) -} - -async function toText(data: unknown): Promise { - if (typeof data === "string") return data - if (data instanceof ArrayBuffer) return Buffer.from(data).toString("utf-8") - if (ArrayBuffer.isView(data)) return Buffer.from(data.buffer, data.byteOffset, data.byteLength).toString("utf-8") - if (typeof Blob !== "undefined" && data instanceof Blob) { - return Buffer.from(await data.arrayBuffer()).toString("utf-8") - } - return String(data ?? "") -} - -function extractErrorMessage(event: Record): string { - const error = event.error - if (error && typeof error === "object") { - const message = readString((error as Record).message) - if (message) return message - } - return readString(event.message) ?? "Realtime speech request failed" -} - -function readString(value: unknown): string | undefined { - return typeof value === "string" && value.length > 0 ? value : undefined -} - -function asRecord(value: unknown): Record | null { - return value && typeof value === "object" && !Array.isArray(value) ? (value as Record) : null -} - -function extractTranscriptFromConversationItem(item: Record): string | undefined { - const directTranscript = readString(item.transcript) ?? readString(item.text) - if (directTranscript) return directTranscript - - const content = Array.isArray(item.content) ? item.content : [] - for (const part of content) { - const record = asRecord(part) - if (!record) continue - const transcript = - readString(record.transcript) ?? - readString(record.text) ?? - readString(asRecord(record.audio)?.transcript) - if (transcript) { - return transcript - } - } - - return undefined -} diff --git a/packages/server/src/speech/service.ts b/packages/server/src/speech/service.ts index 26cf6682..14f37a15 100644 --- a/packages/server/src/speech/service.ts +++ b/packages/server/src/speech/service.ts @@ -10,8 +10,6 @@ const ServerSpeechSettingsSchema = z.object({ provider: z.string().optional(), apiKey: z.string().optional(), baseUrl: z.string().optional(), - useRealtime: z.boolean().optional(), - realtimeModel: z.string().optional(), sttModel: z.string().optional(), ttsModel: z.string().optional(), ttsVoice: z.string().optional(), @@ -42,26 +40,12 @@ export interface NormalizedSpeechSettings { provider: string apiKey?: string baseUrl?: string - realtimeModel: string sttModel: string ttsModel: string ttsVoice: string } -export interface RealtimeTranscriptionConfig { - provider: string - apiKey: string - baseUrl?: string - realtimeModel: string - sttModel: string - inputFormat: { - type: "audio/pcm" - rate: 24000 - } -} - const DEFAULT_PROVIDER = "openai-compatible" -const DEFAULT_REALTIME_MODEL = "gpt-realtime" const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe" const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts" const DEFAULT_TTS_VOICE = "alloy" @@ -83,25 +67,6 @@ export class SpeechService { return this.createProvider().synthesize(input) } - getRealtimeTranscriptionConfig(): RealtimeTranscriptionConfig { - const settings = this.resolveSettings() - if (!settings.apiKey) { - throw new Error("Speech provider is not configured. Add an API key in Speech settings.") - } - - return { - provider: settings.provider, - apiKey: settings.apiKey, - baseUrl: settings.baseUrl, - realtimeModel: settings.realtimeModel, - sttModel: settings.sttModel, - inputFormat: { - type: "audio/pcm", - rate: 24000, - }, - } - } - private createProvider(): SpeechProvider { const settings = this.resolveSettings() return new OpenAICompatibleSpeechProvider({ @@ -118,7 +83,6 @@ export class SpeechService { provider: speech.provider?.trim() || DEFAULT_PROVIDER, apiKey: speech.apiKey?.trim() || process.env.OPENAI_API_KEY, baseUrl: speech.baseUrl?.trim() || process.env.OPENAI_BASE_URL || undefined, - realtimeModel: speech.realtimeModel?.trim() || DEFAULT_REALTIME_MODEL, sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL, ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL, ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE, diff --git a/packages/ui/src/components/prompt-input.tsx b/packages/ui/src/components/prompt-input.tsx index 45db8b44..1ae41690 100644 --- a/packages/ui/src/components/prompt-input.tsx +++ b/packages/ui/src/components/prompt-input.tsx @@ -11,7 +11,7 @@ import { getCommands } from "../stores/commands" import { showAlertDialog } from "../stores/alerts" import { useI18n } from "../lib/i18n" import { getLogger } from "../lib/logger" -import { preferences, useConfig } from "../stores/preferences" +import { preferences } from "../stores/preferences" import type { ExpandState, PromptInputApi, PromptInputProps, PromptInsertMode, PromptMode } from "./prompt-input/types" import { usePromptState } from "./prompt-input/usePromptState" import { usePromptAttachments } from "./prompt-input/usePromptAttachments" @@ -22,7 +22,6 @@ const log = getLogger("actions") export default function PromptInput(props: PromptInputProps) { const { t } = useI18n() - const { serverSettings } = useConfig() const [, setIsFocused] = createSignal(false) const [mode, setMode] = createSignal("normal") const [expandState, setExpandState] = createSignal("normal") @@ -419,7 +418,6 @@ export default function PromptInput(props: PromptInputProps) { getTextarea: () => textareaRef ?? null, enabled: () => preferences().showPromptVoiceInput, disabled: () => Boolean(props.disabled), - useRealtime: () => serverSettings().speech.useRealtime, }) const showVoiceInput = () => preferences().showPromptVoiceInput && diff --git a/packages/ui/src/components/prompt-input/createRealtimePcmStream.ts b/packages/ui/src/components/prompt-input/createRealtimePcmStream.ts deleted file mode 100644 index 7d3cea34..00000000 --- a/packages/ui/src/components/prompt-input/createRealtimePcmStream.ts +++ /dev/null @@ -1,110 +0,0 @@ -export interface RealtimePcmStreamHandle { - stop(): Promise -} - -interface CreateRealtimePcmStreamOptions { - onChunk: (audioBase64: string) => void | Promise -} - -const TARGET_SAMPLE_RATE = 24000 -const PROCESSOR_BUFFER_SIZE = 4096 - -export async function createRealtimePcmStream( - options: CreateRealtimePcmStreamOptions, -): Promise { - const stream = await navigator.mediaDevices.getUserMedia({ - audio: { - channelCount: 1, - echoCancellation: true, - noiseSuppression: true, - autoGainControl: true, - }, - }) - - const AudioContextCtor = window.AudioContext || (window as any).webkitAudioContext - if (!AudioContextCtor) { - stream.getTracks().forEach((track) => track.stop()) - throw new Error("AudioContext is not supported in this browser.") - } - - const audioContext = new AudioContextCtor() - await audioContext.resume() - - const source = audioContext.createMediaStreamSource(stream) - const processor = audioContext.createScriptProcessor(PROCESSOR_BUFFER_SIZE, 1, 1) - const sink = audioContext.createGain() - sink.gain.value = 0 - - source.connect(processor) - processor.connect(sink) - sink.connect(audioContext.destination) - - processor.onaudioprocess = (event) => { - const input = event.inputBuffer.getChannelData(0) - const resampled = downsampleBuffer(input, audioContext.sampleRate, TARGET_SAMPLE_RATE) - if (resampled.length === 0) return - const pcm16 = floatTo16BitPcm(resampled) - void options.onChunk(base64EncodePcm16(pcm16)) - } - - let stopped = false - return { - async stop() { - if (stopped) return - stopped = true - processor.onaudioprocess = null - source.disconnect() - processor.disconnect() - sink.disconnect() - stream.getTracks().forEach((track) => track.stop()) - await audioContext.close() - }, - } -} - -function downsampleBuffer(buffer: Float32Array, inputSampleRate: number, outputSampleRate: number): Float32Array { - if (inputSampleRate === outputSampleRate) { - return buffer.slice() - } - - const sampleRateRatio = inputSampleRate / outputSampleRate - const outputLength = Math.max(1, Math.round(buffer.length / sampleRateRatio)) - const output = new Float32Array(outputLength) - let outputIndex = 0 - let inputIndex = 0 - - while (outputIndex < outputLength) { - const nextInputIndex = Math.min(buffer.length, Math.round((outputIndex + 1) * sampleRateRatio)) - let sum = 0 - let count = 0 - for (let i = inputIndex; i < nextInputIndex; i += 1) { - sum += buffer[i] - count += 1 - } - output[outputIndex] = count > 0 ? sum / count : buffer[Math.min(buffer.length - 1, inputIndex)] - outputIndex += 1 - inputIndex = nextInputIndex - } - - return output -} - -function floatTo16BitPcm(buffer: Float32Array): Int16Array { - const pcm16 = new Int16Array(buffer.length) - for (let i = 0; i < buffer.length; i += 1) { - const sample = Math.max(-1, Math.min(1, buffer[i])) - pcm16[i] = sample < 0 ? Math.round(sample * 0x8000) : Math.round(sample * 0x7fff) - } - return pcm16 -} - -function base64EncodePcm16(buffer: Int16Array): string { - const bytes = new Uint8Array(buffer.buffer) - let binary = "" - const chunkSize = 0x8000 - for (let offset = 0; offset < bytes.length; offset += chunkSize) { - const chunk = bytes.subarray(offset, offset + chunkSize) - binary += String.fromCharCode(...chunk) - } - return btoa(binary) -} diff --git a/packages/ui/src/components/prompt-input/promptVoiceInsertion.ts b/packages/ui/src/components/prompt-input/promptVoiceInsertion.ts deleted file mode 100644 index 3fcc8e78..00000000 --- a/packages/ui/src/components/prompt-input/promptVoiceInsertion.ts +++ /dev/null @@ -1,36 +0,0 @@ -export interface PromptVoiceAnchor { - prompt: string - start: number - end: number -} - -export function createPromptVoiceAnchor(prompt: string, start: number, end: number): PromptVoiceAnchor { - return { prompt, start, end } -} - -export function buildPromptWithInsertedTranscript(anchor: PromptVoiceAnchor, insertedText: string): { value: string; cursor: number } { - const before = anchor.prompt.slice(0, anchor.start) - const after = anchor.prompt.slice(anchor.end) - const normalized = insertedText.trim() - - if (!normalized) { - return { - value: before + after, - cursor: before.length, - } - } - - const prefix = before.length > 0 && !/\s$/.test(before) ? " " : "" - const suffix = after.length > 0 && !/^\s/.test(after) ? " " : "" - return { - value: `${before}${prefix}${normalized}${suffix}${after}`, - cursor: before.length + prefix.length + normalized.length, - } -} - -export function appendVoiceTranscript(current: string, next: string): string { - const normalized = next.trim() - if (!normalized) return current - if (!current.trim()) return normalized - return /\s$/.test(current) ? `${current}${normalized}` : `${current} ${normalized}` -} diff --git a/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts b/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts deleted file mode 100644 index 51f6e5cf..00000000 --- a/packages/ui/src/components/prompt-input/usePromptBufferedVoiceInput.ts +++ /dev/null @@ -1,241 +0,0 @@ -import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js" -import { showAlertDialog } from "../../stores/alerts" -import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech" -import { serverApi } from "../../lib/api-client" -import { useI18n } from "../../lib/i18n" -import { createPromptVoiceAnchor, buildPromptWithInsertedTranscript } from "./promptVoiceInsertion" - -interface UsePromptBufferedVoiceInputOptions { - prompt: Accessor - setPrompt: (value: string, options?: { persistDraft?: boolean }) => void - getTextarea: () => HTMLTextAreaElement | null - enabled: Accessor - disabled: Accessor -} - -type VoiceInputState = "idle" | "recording" | "transcribing" - -export function usePromptBufferedVoiceInput(options: UsePromptBufferedVoiceInputOptions) { - const { t } = useI18n() - const [state, setState] = createSignal("idle") - const [elapsedMs, setElapsedMs] = createSignal(0) - - let mediaRecorder: MediaRecorder | null = null - let mediaStream: MediaStream | null = null - let timerId: number | undefined - let shouldTranscribe = true - let recordedChunks: Blob[] = [] - let recordingStartedAt = 0 - - createEffect(() => { - void loadSpeechCapabilities() - }) - - onCleanup(() => { - cleanupMedia(false) - }) - - const isSupported = () => { - if (typeof window === "undefined") return false - return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia) - } - - const canUseVoiceInput = () => { - const capabilities = speechCapabilities() - return Boolean( - options.enabled() && - isSupported() && - capabilities?.available && - capabilities?.configured && - capabilities?.supportsStt, - ) - } - - async function toggleRecording(): Promise { - if (state() === "recording") { - stopRecording() - return - } - - if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return - - try { - await startRecording() - } catch (error) { - cleanupMedia(false) - showAlertDialog(t("promptInput.voiceInput.error.permission"), { - title: t("promptInput.voiceInput.error.title"), - detail: error instanceof Error ? error.message : String(error), - variant: "error", - }) - } - } - - function stopRecording() { - if (!mediaRecorder || state() !== "recording") return - shouldTranscribe = true - mediaRecorder.stop() - setState("transcribing") - stopTimer() - } - - function cancelRecording() { - if (!mediaRecorder || state() !== "recording") return - shouldTranscribe = false - mediaRecorder.stop() - cleanupMedia(false) - } - - async function startRecording() { - if (!isSupported()) { - showAlertDialog(t("promptInput.voiceInput.error.unsupported"), { - title: t("promptInput.voiceInput.error.title"), - variant: "error", - }) - return - } - - recordedChunks = [] - shouldTranscribe = true - mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true }) - mediaRecorder = createRecorder(mediaStream) - - mediaRecorder.addEventListener("dataavailable", (event) => { - if (event.data.size > 0) { - recordedChunks.push(event.data) - } - }) - - mediaRecorder.addEventListener("stop", () => { - void finalizeRecording() - }) - - recordingStartedAt = Date.now() - setElapsedMs(0) - setState("recording") - startTimer() - mediaRecorder.start() - } - - async function finalizeRecording() { - const recorder = mediaRecorder - const stream = mediaStream - mediaRecorder = null - mediaStream = null - - if (!shouldTranscribe || recordedChunks.length === 0) { - recordedChunks = [] - stopTracks(stream) - setState("idle") - setElapsedMs(0) - return - } - - const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm" - - try { - const audioBlob = new Blob(recordedChunks, { type: mimeType }) - const transcription = await serverApi.transcribeAudio({ - audioBase64: await blobToBase64(audioBlob), - mimeType, - }) - if (transcription.text.trim()) { - insertTranscript(transcription.text.trim()) - } - } catch (error) { - showAlertDialog(t("promptInput.voiceInput.error.transcribe"), { - title: t("promptInput.voiceInput.error.title"), - detail: error instanceof Error ? error.message : String(error), - variant: "error", - }) - } finally { - recordedChunks = [] - stopTracks(stream) - setState("idle") - setElapsedMs(0) - } - } - - function insertTranscript(text: string) { - const current = options.prompt() - const textarea = options.getTextarea() - const start = textarea ? textarea.selectionStart : current.length - const end = textarea ? textarea.selectionEnd : current.length - const { value, cursor } = buildPromptWithInsertedTranscript( - createPromptVoiceAnchor(current, start, end), - text, - ) - - options.setPrompt(value) - if (textarea) { - setTimeout(() => { - textarea.focus() - textarea.setSelectionRange(cursor, cursor) - }, 0) - } - } - - function cleanupMedia(resetState = true) { - stopTimer() - if (mediaRecorder && mediaRecorder.state !== "inactive") { - mediaRecorder.stop() - } - mediaRecorder = null - stopTracks(mediaStream) - mediaStream = null - recordedChunks = [] - if (resetState) { - setState("idle") - setElapsedMs(0) - } - } - - function startTimer() { - stopTimer() - timerId = window.setInterval(() => { - setElapsedMs(Date.now() - recordingStartedAt) - }, 250) - } - - function stopTimer() { - if (timerId !== undefined) { - window.clearInterval(timerId) - timerId = undefined - } - } - - return { - state, - elapsedMs, - canUseVoiceInput, - toggleRecording, - cancelRecording, - isRecording: () => state() === "recording", - isTranscribing: () => state() === "transcribing", - buttonTitle: () => { - if (state() === "recording") return t("promptInput.voiceInput.stop.title") - if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title") - return t("promptInput.voiceInput.start.title") - }, - } -} - -function createRecorder(stream: MediaStream): MediaRecorder { - const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"] - const supported = candidates.find((candidate) => typeof MediaRecorder.isTypeSupported !== "function" || MediaRecorder.isTypeSupported(candidate)) - return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream) -} - -function stopTracks(stream: MediaStream | null) { - stream?.getTracks().forEach((track) => track.stop()) -} - -async function blobToBase64(blob: Blob): Promise { - const buffer = await blob.arrayBuffer() - const bytes = new Uint8Array(buffer) - let binary = "" - for (const byte of bytes) { - binary += String.fromCharCode(byte) - } - return btoa(binary) -} diff --git a/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts b/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts deleted file mode 100644 index 186ec3b7..00000000 --- a/packages/ui/src/components/prompt-input/usePromptRealtimeVoiceInput.ts +++ /dev/null @@ -1,325 +0,0 @@ -import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js" -import { showAlertDialog } from "../../stores/alerts" -import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client" -import { useI18n } from "../../lib/i18n" -import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech" -import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream" -import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion" - -interface UsePromptRealtimeVoiceInputOptions { - prompt: Accessor - setPrompt: (value: string, options?: { persistDraft?: boolean }) => void - getTextarea: () => HTMLTextAreaElement | null - enabled: Accessor - disabled: Accessor -} - -type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing" - -const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000 - -export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) { - const { t } = useI18n() - const [state, setState] = createSignal("idle") - const [elapsedMs, setElapsedMs] = createSignal(0) - - let activeSessionId: string | null = null - let eventSource: EventSource | null = null - let pcmStream: RealtimePcmStreamHandle | null = null - let audioQueue: Promise = Promise.resolve() - let timerId: number | undefined - let recordingStartedAt = 0 - let finalizeTimerId: number | undefined - let anchor = createPromptVoiceAnchor("", 0, 0) - let finalTranscript = "" - let liveTranscript = "" - let activeLiveItemId: string | null = null - let closing = false - - createEffect(() => { - void loadSpeechCapabilities() - }) - - onCleanup(() => { - cancelRecording() - }) - - const isSupported = () => { - if (typeof window === "undefined") return false - return Boolean(window.AudioContext || (window as any).webkitAudioContext) && Boolean(navigator.mediaDevices?.getUserMedia) && typeof EventSource !== "undefined" - } - - const canUseVoiceInput = () => { - const capabilities = speechCapabilities() - return Boolean( - options.enabled() && - isSupported() && - capabilities?.available && - capabilities?.configured && - capabilities?.supportsStt && - capabilities?.supportsRealtimeTranscription, - ) - } - - async function toggleRecording(): Promise { - if (state() === "listening" || state() === "connecting") { - await stopRecording() - return - } - - if (!canUseVoiceInput() || options.disabled() || state() === "finalizing") return - - try { - await startRecording() - } catch (error) { - await cleanupSession({ revertPrompt: true, closeRemote: true }) - showAlertDialog(t("promptInput.voiceInput.error.connection"), { - title: t("promptInput.voiceInput.error.title"), - detail: error instanceof Error ? error.message : String(error), - variant: "error", - }) - } - } - - async function startRecording() { - if (!isSupported()) { - showAlertDialog(t("promptInput.voiceInput.error.unsupported"), { - title: t("promptInput.voiceInput.error.title"), - variant: "error", - }) - return - } - - resetTranscriptState() - captureAnchor() - setState("connecting") - setElapsedMs(0) - - const created = await serverApi.createRealtimeSpeechSession({ - language: detectLanguage(), - }) - activeSessionId = created.sessionId - connectEventStream(created.sessionId) - - pcmStream = await createRealtimePcmStream({ - onChunk: (audioBase64) => { - const sessionId = activeSessionId - if (!sessionId || closing) return - audioQueue = audioQueue - .then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 })) - .catch((error) => { - handleRealtimeError(error) - }) - }, - }) - - recordingStartedAt = Date.now() - startTimer() - setState("listening") - } - - async function stopRecording() { - const sessionId = activeSessionId - if (!sessionId || (state() !== "listening" && state() !== "connecting")) return - - setState("finalizing") - stopTimer() - - if (pcmStream) { - const stream = pcmStream - pcmStream = null - await stream.stop() - } - - try { - await audioQueue.catch(() => undefined) - await serverApi.finalizeRealtimeSpeechSession(sessionId) - scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS) - } catch (error) { - handleRealtimeError(error) - } - } - - function cancelRecording() { - void cleanupSession({ revertPrompt: true, closeRemote: true }) - } - - function connectEventStream(sessionId: string) { - eventSource?.close() - eventSource = serverApi.connectRealtimeSpeechEvents( - sessionId, - (event) => handleEvent(event), - () => { - if (closing) return - handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection"))) - }, - ) - } - - function handleEvent(event: SpeechRealtimeEvent) { - if (event.type === "session.ready") { - return - } - - if (event.type === "session.error") { - handleRealtimeError(new Error(event.message)) - return - } - - if (event.type === "transcript.partial") { - activeLiveItemId = event.itemId - liveTranscript = event.text - renderPrompt(false) - return - } - - if (event.type === "transcript.final") { - activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId - liveTranscript = "" - finalTranscript = appendVoiceTranscript(finalTranscript, event.text) - renderPrompt(true) - if (state() === "finalizing") { - scheduleFinalizeClose(250) - } - return - } - - if (event.type === "session.closed") { - void cleanupSession({ revertPrompt: false, closeRemote: false }) - } - } - - function captureAnchor() { - const textarea = options.getTextarea() - const current = options.prompt() - const start = textarea ? textarea.selectionStart : current.length - const end = textarea ? textarea.selectionEnd : current.length - anchor = createPromptVoiceAnchor(current, start, end) - } - - function renderPrompt(persistDraft: boolean) { - const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(finalTranscript && liveTranscript.trim() ? " " : "") - const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted) - options.setPrompt(value, persistDraft ? undefined : { persistDraft: false }) - syncTextareaCursor(cursor) - } - - function syncTextareaCursor(cursor: number) { - const textarea = options.getTextarea() - if (!textarea) return - queueMicrotask(() => { - const next = options.getTextarea() - if (!next) return - next.focus() - next.setSelectionRange(cursor, cursor) - }) - } - - function scheduleFinalizeClose(delayMs: number) { - if (finalizeTimerId !== undefined) { - window.clearTimeout(finalizeTimerId) - } - finalizeTimerId = window.setTimeout(() => { - void cleanupSession({ revertPrompt: false, closeRemote: true }) - }, delayMs) - } - - async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) { - if (closing) return - closing = true - - if (finalizeTimerId !== undefined) { - window.clearTimeout(finalizeTimerId) - finalizeTimerId = undefined - } - - stopTimer() - - const sessionId = activeSessionId - activeSessionId = null - - eventSource?.close() - eventSource = null - - if (pcmStream) { - const stream = pcmStream - pcmStream = null - await stream.stop().catch(() => undefined) - } - - await audioQueue.catch(() => undefined) - audioQueue = Promise.resolve() - - if (cleanupOptions.closeRemote && sessionId) { - await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined) - } - - if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) { - finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript) - liveTranscript = "" - } - - if (cleanupOptions.revertPrompt) { - options.setPrompt(anchor.prompt) - } else if (finalTranscript.trim()) { - renderPrompt(true) - } - - resetTranscriptState() - setState("idle") - setElapsedMs(0) - closing = false - } - - function resetTranscriptState() { - finalTranscript = "" - liveTranscript = "" - activeLiveItemId = null - } - - function handleRealtimeError(error: unknown) { - if (closing) return - void cleanupSession({ revertPrompt: true, closeRemote: true }) - showAlertDialog(t("promptInput.voiceInput.error.connection"), { - title: t("promptInput.voiceInput.error.title"), - detail: error instanceof Error ? error.message : String(error), - variant: "error", - }) - } - - function startTimer() { - stopTimer() - timerId = window.setInterval(() => { - setElapsedMs(Date.now() - recordingStartedAt) - }, 250) - } - - function stopTimer() { - if (timerId !== undefined) { - window.clearInterval(timerId) - timerId = undefined - } - } - - return { - state, - elapsedMs, - canUseVoiceInput, - toggleRecording, - cancelRecording, - isRecording: () => state() === "connecting" || state() === "listening", - isTranscribing: () => state() === "finalizing", - buttonTitle: () => { - if (state() === "connecting") return t("promptInput.voiceInput.connecting.title") - if (state() === "listening") return t("promptInput.voiceInput.stop.title") - if (state() === "finalizing") return t("promptInput.voiceInput.transcribing.title") - return t("promptInput.voiceInput.start.title") - }, - } -} - -function detectLanguage(): string | undefined { - if (typeof navigator === "undefined") return undefined - const [language] = navigator.language.split("-") - return language?.trim() || undefined -} diff --git a/packages/ui/src/components/prompt-input/usePromptState.ts b/packages/ui/src/components/prompt-input/usePromptState.ts index a2e252a4..3b326f2c 100644 --- a/packages/ui/src/components/prompt-input/usePromptState.ts +++ b/packages/ui/src/components/prompt-input/usePromptState.ts @@ -22,7 +22,7 @@ type HistorySelectOptions = { type PromptState = { prompt: Accessor - setPrompt: (value: string, options?: { persistDraft?: boolean }) => void + setPrompt: (value: string) => void clearPrompt: () => void draftLoadedNonce: Accessor @@ -48,11 +48,11 @@ export function usePromptState(options: PromptStateOptions): PromptState { const [historyDraft, setHistoryDraft] = createSignal(null) const [draftLoadedNonce, setDraftLoadedNonce] = createSignal(0) - const setPrompt = (value: string, setOptions?: { persistDraft?: boolean }) => { + const setPrompt = (value: string) => { setPromptInternal(value) // Persist drafts only when the user is at the "fresh" position (not browsing history). // This keeps the bottom-of-history draft stable even if the user edits recalled history entries. - if (setOptions?.persistDraft !== false && historyIndex() === -1) { + if (historyIndex() === -1) { setSessionDraftPrompt(options.instanceId(), options.sessionId(), value) } } diff --git a/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts b/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts index 0a13a6ff..f05c4e05 100644 --- a/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts +++ b/packages/ui/src/components/prompt-input/usePromptVoiceInput.ts @@ -1,30 +1,242 @@ -import type { Accessor } from "solid-js" -import { usePromptBufferedVoiceInput } from "./usePromptBufferedVoiceInput" -import { usePromptRealtimeVoiceInput } from "./usePromptRealtimeVoiceInput" +import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js" +import { showAlertDialog } from "../../stores/alerts" +import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech" +import { serverApi } from "../../lib/api-client" +import { useI18n } from "../../lib/i18n" interface UsePromptVoiceInputOptions { prompt: Accessor - setPrompt: (value: string, options?: { persistDraft?: boolean }) => void + setPrompt: (value: string) => void getTextarea: () => HTMLTextAreaElement | null enabled: Accessor disabled: Accessor - useRealtime: Accessor } +type VoiceInputState = "idle" | "recording" | "transcribing" + export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) { - const buffered = usePromptBufferedVoiceInput(options) - const realtime = usePromptRealtimeVoiceInput(options) + const { t } = useI18n() + const [state, setState] = createSignal("idle") + const [elapsedMs, setElapsedMs] = createSignal(0) - const active = () => (options.useRealtime() ? realtime : buffered) + let mediaRecorder: MediaRecorder | null = null + let mediaStream: MediaStream | null = null + let timerId: number | undefined + let shouldTranscribe = true + let recordedChunks: Blob[] = [] + let recordingStartedAt = 0 + + createEffect(() => { + void loadSpeechCapabilities() + }) + + onCleanup(() => { + cleanupMedia(false) + }) + + const isSupported = () => { + if (typeof window === "undefined") return false + return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia) + } + + const canUseVoiceInput = () => { + const capabilities = speechCapabilities() + return Boolean( + options.enabled() && + isSupported() && + capabilities?.available && + capabilities?.configured && + capabilities?.supportsStt, + ) + } + + async function toggleRecording(): Promise { + if (state() === "recording") { + stopRecording() + return + } + + if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return + + try { + await startRecording() + } catch (error) { + cleanupMedia(false) + showAlertDialog(t("promptInput.voiceInput.error.permission"), { + title: t("promptInput.voiceInput.error.title"), + detail: error instanceof Error ? error.message : String(error), + variant: "error", + }) + } + } + + function stopRecording() { + if (!mediaRecorder || state() !== "recording") return + shouldTranscribe = true + mediaRecorder.stop() + setState("transcribing") + stopTimer() + } + + function cancelRecording() { + if (!mediaRecorder || state() !== "recording") return + shouldTranscribe = false + mediaRecorder.stop() + cleanupMedia(false) + } + + async function startRecording() { + if (!isSupported()) { + showAlertDialog(t("promptInput.voiceInput.error.unsupported"), { + title: t("promptInput.voiceInput.error.title"), + variant: "error", + }) + return + } + + recordedChunks = [] + shouldTranscribe = true + mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true }) + mediaRecorder = createRecorder(mediaStream) + + mediaRecorder.addEventListener("dataavailable", (event) => { + if (event.data.size > 0) { + recordedChunks.push(event.data) + } + }) + + mediaRecorder.addEventListener("stop", () => { + void finalizeRecording() + }) + + recordingStartedAt = Date.now() + setElapsedMs(0) + setState("recording") + startTimer() + mediaRecorder.start() + } + + async function finalizeRecording() { + const recorder = mediaRecorder + const stream = mediaStream + mediaRecorder = null + mediaStream = null + + if (!shouldTranscribe || recordedChunks.length === 0) { + recordedChunks = [] + stopTracks(stream) + setState("idle") + setElapsedMs(0) + return + } + + const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm" + + try { + const audioBlob = new Blob(recordedChunks, { type: mimeType }) + const transcription = await serverApi.transcribeAudio({ + audioBase64: await blobToBase64(audioBlob), + mimeType, + }) + if (transcription.text.trim()) { + insertTranscript(transcription.text.trim()) + } + } catch (error) { + showAlertDialog(t("promptInput.voiceInput.error.transcribe"), { + title: t("promptInput.voiceInput.error.title"), + detail: error instanceof Error ? error.message : String(error), + variant: "error", + }) + } finally { + recordedChunks = [] + stopTracks(stream) + setState("idle") + setElapsedMs(0) + } + } + + function insertTranscript(text: string) { + const current = options.prompt() + const textarea = options.getTextarea() + const start = textarea ? textarea.selectionStart : current.length + const end = textarea ? textarea.selectionEnd : current.length + const before = current.slice(0, start) + const after = current.slice(end) + const prefix = before.length > 0 && !/\s$/.test(before) ? " " : "" + const suffix = after.length > 0 && !/^\s/.test(after) ? " " : "" + const nextValue = `${before}${prefix}${text}${suffix}${after}` + const cursor = before.length + prefix.length + text.length + + options.setPrompt(nextValue) + if (textarea) { + setTimeout(() => { + textarea.focus() + textarea.setSelectionRange(cursor, cursor) + }, 0) + } + } + + function cleanupMedia(resetState = true) { + stopTimer() + if (mediaRecorder && mediaRecorder.state !== "inactive") { + mediaRecorder.stop() + } + mediaRecorder = null + stopTracks(mediaStream) + mediaStream = null + recordedChunks = [] + if (resetState) { + setState("idle") + setElapsedMs(0) + } + } + + function startTimer() { + stopTimer() + timerId = window.setInterval(() => { + setElapsedMs(Date.now() - recordingStartedAt) + }, 250) + } + + function stopTimer() { + if (timerId !== undefined) { + window.clearInterval(timerId) + timerId = undefined + } + } return { - state: () => active().state(), - elapsedMs: () => active().elapsedMs(), - canUseVoiceInput: () => active().canUseVoiceInput(), - toggleRecording: () => active().toggleRecording(), - cancelRecording: () => active().cancelRecording(), - isRecording: () => active().isRecording(), - isTranscribing: () => active().isTranscribing(), - buttonTitle: () => active().buttonTitle(), + state, + elapsedMs, + canUseVoiceInput, + toggleRecording, + cancelRecording, + isRecording: () => state() === "recording", + isTranscribing: () => state() === "transcribing", + buttonTitle: () => { + if (state() === "recording") return t("promptInput.voiceInput.stop.title") + if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title") + return t("promptInput.voiceInput.start.title") + }, } } + +function createRecorder(stream: MediaStream): MediaRecorder { + const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"] + const supported = candidates.find((candidate) => typeof MediaRecorder.isTypeSupported !== "function" || MediaRecorder.isTypeSupported(candidate)) + return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream) +} + +function stopTracks(stream: MediaStream | null) { + stream?.getTracks().forEach((track) => track.stop()) +} + +async function blobToBase64(blob: Blob): Promise { + const buffer = await blob.arrayBuffer() + const bytes = new Uint8Array(buffer) + let binary = "" + for (const byte of bytes) { + binary += String.fromCharCode(byte) + } + return btoa(binary) +} diff --git a/packages/ui/src/components/settings/speech-settings-card.tsx b/packages/ui/src/components/settings/speech-settings-card.tsx index a1ca1169..464fb4ba 100644 --- a/packages/ui/src/components/settings/speech-settings-card.tsx +++ b/packages/ui/src/components/settings/speech-settings-card.tsx @@ -10,8 +10,6 @@ const log = getLogger("actions") type DraftFields = { apiKey: string baseUrl: string - useRealtime: boolean - realtimeModel: string sttModel: string ttsModel: string ttsVoice: string @@ -21,8 +19,6 @@ function createDraftFields(speech: SpeechSettings): DraftFields { return { apiKey: speech.apiKey ?? "", baseUrl: speech.baseUrl ?? "", - useRealtime: speech.useRealtime, - realtimeModel: speech.realtimeModel, sttModel: speech.sttModel, ttsModel: speech.ttsModel, ttsVoice: speech.ttsVoice, @@ -30,7 +26,7 @@ function createDraftFields(speech: SpeechSettings): DraftFields { } function isDraftEqual(a: DraftFields, b: DraftFields): boolean { - return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.useRealtime === b.useRealtime && a.realtimeModel === b.realtimeModel && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice + return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice } export const SpeechSettingsCard: Component = () => { @@ -61,7 +57,7 @@ export const SpeechSettingsCard: Component = () => { return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing") } - const updateDraft = (key: K, value: DraftFields[K]) => { + const updateDraft = (key: keyof DraftFields, value: string) => { setSaveStatus("idle") setDrafts((current) => ({ ...current, [key]: value })) } @@ -69,14 +65,12 @@ export const SpeechSettingsCard: Component = () => { const isDirty = createMemo(() => { const speech = serverSettings().speech const current = drafts() - return ( - (current.apiKey || "") !== (speech.apiKey || "") || - (current.baseUrl || "") !== (speech.baseUrl || "") || - current.useRealtime !== speech.useRealtime || - current.realtimeModel !== speech.realtimeModel || - current.sttModel !== speech.sttModel || - current.ttsModel !== speech.ttsModel || - current.ttsVoice !== speech.ttsVoice + return ( + (current.apiKey || "") !== (speech.apiKey || "") || + (current.baseUrl || "") !== (speech.baseUrl || "") || + current.sttModel !== speech.sttModel || + current.ttsModel !== speech.ttsModel || + current.ttsVoice !== speech.ttsVoice ) }) @@ -96,8 +90,6 @@ export const SpeechSettingsCard: Component = () => { await updateSpeechSettings({ apiKey: current.apiKey.trim() || undefined, baseUrl: current.baseUrl.trim() || undefined, - useRealtime: current.useRealtime, - realtimeModel: current.realtimeModel.trim() || undefined, sttModel: current.sttModel.trim() || undefined, ttsModel: current.ttsModel.trim() || undefined, ttsVoice: current.ttsVoice.trim() || undefined, @@ -106,8 +98,6 @@ export const SpeechSettingsCard: Component = () => { setDrafts({ apiKey: current.apiKey.trim(), baseUrl: current.baseUrl.trim(), - useRealtime: current.useRealtime, - realtimeModel: current.realtimeModel.trim() || serverSettings().speech.realtimeModel, sttModel: current.sttModel.trim() || serverSettings().speech.sttModel, ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel, ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice, @@ -169,27 +159,6 @@ export const SpeechSettingsCard: Component = () => { onInput={(value) => updateDraft("baseUrl", value)} placeholder={t("settings.speech.baseUrl.placeholder")} /> -
-
-
{t("settings.speech.realtime.title")}
-
{t("settings.speech.realtime.subtitle")}
-
- -
- updateDraft("realtimeModel", value)} - placeholder={t("settings.speech.realtimeModel.placeholder")} - /> { return request("/api/speech/capabilities") }, - createRealtimeSpeechSession(payload?: { language?: string; prompt?: string }): Promise { - return request("/api/speech/realtime/sessions", { - method: "POST", - body: JSON.stringify(payload ?? {}), - }) - }, - appendRealtimeSpeechAudio(sessionId: string, payload: { audioBase64: string }): Promise { - return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/audio`, { - method: "POST", - body: JSON.stringify(payload), - }) - }, - finalizeRealtimeSpeechSession(sessionId: string): Promise { - return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/finalize`, { - method: "POST", - body: JSON.stringify({}), - }) - }, - closeRealtimeSpeechSession(sessionId: string): Promise { - return request(`/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}`, { - method: "DELETE", - }) - }, transcribeAudio(payload: { audioBase64: string mimeType: string @@ -361,34 +332,21 @@ export const serverApi = { }, connectEvents(onEvent: (event: WorkspaceEventPayload) => void, onError?: () => void) { sseLogger.info(`Connecting to ${EVENTS_URL}`) - return connectEventSource(EVENTS_URL, onEvent, onError) - }, - connectRealtimeSpeechEvents( - sessionId: string, - onEvent: (event: SpeechRealtimeEvent) => void, - onError?: () => void, - ) { - const url = buildRealtimeSpeechEventsUrl(sessionId) - sseLogger.info(`Connecting to ${url}`) - return connectEventSource(url, onEvent, onError) - }, -} - -function connectEventSource(url: string, onEvent: (event: T) => void, onError?: () => void) { - const source = new EventSource(url, { withCredentials: true } as any) - source.onmessage = (event) => { - try { - const payload = JSON.parse(event.data) as T - onEvent(payload) - } catch (error) { - sseLogger.error("Failed to parse event", error) + const source = new EventSource(EVENTS_URL, { withCredentials: true } as any) + source.onmessage = (event) => { + try { + const payload = JSON.parse(event.data) as WorkspaceEventPayload + onEvent(payload) + } catch (error) { + sseLogger.error("Failed to parse event", error) + } } - } - source.onerror = () => { - sseLogger.warn("EventSource error, closing stream") - onError?.() - } - return source + source.onerror = () => { + sseLogger.warn("EventSource error, closing stream") + onError?.() + } + return source + }, } -export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType, SpeechRealtimeEvent } +export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType } diff --git a/packages/ui/src/lib/i18n/messages/en/messaging.ts b/packages/ui/src/lib/i18n/messages/en/messaging.ts index 08cb1848..7b8a574a 100644 --- a/packages/ui/src/lib/i18n/messages/en/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/en/messaging.ts @@ -140,10 +140,8 @@ export const messagingMessages = { "promptInput.send.errorTitle": "Send failed", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", - "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", - "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/en/settings.ts b/packages/ui/src/lib/i18n/messages/en/settings.ts index eb068eac..318f1dcb 100644 --- a/packages/ui/src/lib/i18n/messages/en/settings.ts +++ b/packages/ui/src/lib/i18n/messages/en/settings.ts @@ -156,18 +156,13 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", - "settings.speech.realtime.title": "Realtime dictation", - "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", - "settings.speech.realtimeModel.title": "Realtime model", - "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", - "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/es/messaging.ts b/packages/ui/src/lib/i18n/messages/es/messaging.ts index 23dcd94e..850ab6b4 100644 --- a/packages/ui/src/lib/i18n/messages/es/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/es/messaging.ts @@ -142,10 +142,8 @@ export const messagingMessages = { "promptInput.send.errorTitle": "Error al enviar", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", - "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", - "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/es/settings.ts b/packages/ui/src/lib/i18n/messages/es/settings.ts index 48fb8e4e..1c39e405 100644 --- a/packages/ui/src/lib/i18n/messages/es/settings.ts +++ b/packages/ui/src/lib/i18n/messages/es/settings.ts @@ -156,18 +156,13 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", - "settings.speech.realtime.title": "Realtime dictation", - "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", - "settings.speech.realtimeModel.title": "Realtime model", - "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", - "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/fr/messaging.ts b/packages/ui/src/lib/i18n/messages/fr/messaging.ts index 2347f740..0a742efa 100644 --- a/packages/ui/src/lib/i18n/messages/fr/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/fr/messaging.ts @@ -142,10 +142,8 @@ export const messagingMessages = { "promptInput.send.errorTitle": "Échec de l'envoi", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", - "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", - "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/fr/settings.ts b/packages/ui/src/lib/i18n/messages/fr/settings.ts index e692e647..710d2c77 100644 --- a/packages/ui/src/lib/i18n/messages/fr/settings.ts +++ b/packages/ui/src/lib/i18n/messages/fr/settings.ts @@ -156,18 +156,13 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", - "settings.speech.realtime.title": "Realtime dictation", - "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", - "settings.speech.realtimeModel.title": "Realtime model", - "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", - "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/ja/messaging.ts b/packages/ui/src/lib/i18n/messages/ja/messaging.ts index 46dbbac4..60cd8881 100644 --- a/packages/ui/src/lib/i18n/messages/ja/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/ja/messaging.ts @@ -142,10 +142,8 @@ export const messagingMessages = { "promptInput.send.errorTitle": "送信に失敗", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", - "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", - "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/ja/settings.ts b/packages/ui/src/lib/i18n/messages/ja/settings.ts index 1cf7a834..8feaa819 100644 --- a/packages/ui/src/lib/i18n/messages/ja/settings.ts +++ b/packages/ui/src/lib/i18n/messages/ja/settings.ts @@ -156,18 +156,13 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", - "settings.speech.realtime.title": "Realtime dictation", - "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", - "settings.speech.realtimeModel.title": "Realtime model", - "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", - "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/ru/messaging.ts b/packages/ui/src/lib/i18n/messages/ru/messaging.ts index 109d5e2e..a833b25e 100644 --- a/packages/ui/src/lib/i18n/messages/ru/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/ru/messaging.ts @@ -142,10 +142,8 @@ export const messagingMessages = { "promptInput.send.errorTitle": "Не удалось отправить", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", - "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", - "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/ru/settings.ts b/packages/ui/src/lib/i18n/messages/ru/settings.ts index 6694eaa0..494ff447 100644 --- a/packages/ui/src/lib/i18n/messages/ru/settings.ts +++ b/packages/ui/src/lib/i18n/messages/ru/settings.ts @@ -156,18 +156,13 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", - "settings.speech.realtime.title": "Realtime dictation", - "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", - "settings.speech.realtimeModel.title": "Realtime model", - "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", - "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts b/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts index 13d36824..aeabd954 100644 --- a/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts +++ b/packages/ui/src/lib/i18n/messages/zh-Hans/messaging.ts @@ -142,10 +142,8 @@ export const messagingMessages = { "promptInput.send.errorTitle": "发送失败", "promptInput.voiceInput.start.title": "Start voice input", "promptInput.voiceInput.stop.title": "Stop recording and transcribe", - "promptInput.voiceInput.connecting.title": "Connecting microphone", "promptInput.voiceInput.transcribing.title": "Transcribing audio", "promptInput.voiceInput.error.title": "Voice input failed", - "promptInput.voiceInput.error.connection": "Unable to start realtime voice input.", "promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.", "promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.", "promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.", diff --git a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts index 68b3a11c..aff1063f 100644 --- a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts +++ b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts @@ -156,18 +156,13 @@ export const settingsMessages = { "settings.speech.baseUrl.title": "Base URL", "settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.", "settings.speech.baseUrl.placeholder": "https://api.openai.com/v1", - "settings.speech.realtime.title": "Realtime dictation", - "settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.", - "settings.speech.realtimeModel.title": "Realtime model", - "settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.", - "settings.speech.realtimeModel.placeholder": "gpt-realtime", "settings.speech.sttModel.title": "Transcription model", "settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.", "settings.speech.ttsModel.title": "Speech model", "settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.", "settings.speech.ttsVoice.title": "Default voice", "settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.", - "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.", + "settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.", "settings.speech.save.action": "Save", "settings.speech.save.saving": "Saving...", "settings.speech.save.saved": "Saved", diff --git a/packages/ui/src/stores/preferences.tsx b/packages/ui/src/stores/preferences.tsx index 6ea73763..a387b754 100644 --- a/packages/ui/src/stores/preferences.tsx +++ b/packages/ui/src/stores/preferences.tsx @@ -34,8 +34,6 @@ export interface SpeechSettings { provider: SpeechProviderPreference apiKey?: string baseUrl?: string - useRealtime: boolean - realtimeModel: string sttModel: string ttsModel: string ttsVoice: string @@ -138,8 +136,6 @@ const defaultUiSettings: UiSettings = { const defaultSpeechSettings: SpeechSettings = { provider: "openai-compatible", - useRealtime: true, - realtimeModel: "gpt-realtime", sttModel: "gpt-4o-mini-transcribe", ttsModel: "gpt-4o-mini-tts", ttsVoice: "alloy", @@ -188,11 +184,6 @@ function normalizeSpeechSettings(input?: Partial | null): Speech provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider, apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined, baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined, - useRealtime: sanitized.useRealtime ?? defaultSpeechSettings.useRealtime, - realtimeModel: - typeof sanitized.realtimeModel === "string" && sanitized.realtimeModel.trim() - ? sanitized.realtimeModel.trim() - : defaultSpeechSettings.realtimeModel, sttModel: typeof sanitized.sttModel === "string" && sanitized.sttModel.trim() ? sanitized.sttModel.trim()