feat(speech): add realtime prompt dictation support
Add server-backed realtime transcription for prompt voice input and expose speech settings to choose realtime mode and models.
This commit is contained in:
@@ -4,6 +4,6 @@
|
|||||||
"private": true,
|
"private": true,
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@opencode-ai/plugin": "1.2.14"
|
"@opencode-ai/plugin": "1.2.24"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -219,12 +219,35 @@ export interface SpeechCapabilitiesResponse {
|
|||||||
provider: string
|
provider: string
|
||||||
supportsStt: boolean
|
supportsStt: boolean
|
||||||
supportsTts: boolean
|
supportsTts: boolean
|
||||||
|
supportsRealtimeTranscription?: boolean
|
||||||
|
realtimeInputFormat?: {
|
||||||
|
type: "audio/pcm"
|
||||||
|
rate: 24000
|
||||||
|
}
|
||||||
|
realtimeModel?: string
|
||||||
baseUrl?: string
|
baseUrl?: string
|
||||||
sttModel: string
|
sttModel: string
|
||||||
ttsModel: string
|
ttsModel: string
|
||||||
ttsVoice: string
|
ttsVoice: string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Payload returned to the client after a realtime transcription session has been created. */
export interface SpeechRealtimeSessionResponse {
  /** Identifier the client uses on the per-session endpoints (events, audio, finalize, delete). */
  sessionId: string
  /** Audio encoding the client is expected to upload for this session. */
  inputFormat: { type: "audio/pcm"; rate: 24000 }
}
|
||||||
|
|
||||||
|
/**
 * Events streamed to subscribers of a realtime speech session.
 * The `type` field is the discriminant; switch on it to narrow the payload.
 */
export type SpeechRealtimeEvent =
  // Session lifecycle
  | { type: "session.ready"; sessionId: string }
  | { type: "session.error"; message: string }
  // Voice-activity notifications
  | { type: "input.speech_started"; itemId?: string }
  | { type: "input.speech_stopped"; itemId?: string }
  // Transcript updates: partial carries the text accumulated so far, final the completed text
  | { type: "transcript.partial"; itemId: string; text: string }
  | { type: "transcript.final"; itemId: string; previousItemId?: string; text: string }
  // Terminal event
  | { type: "session.closed"; reason?: string }
|
||||||
|
|
||||||
export interface SpeechTranscriptionResponse {
|
export interface SpeechTranscriptionResponse {
|
||||||
text: string
|
text: string
|
||||||
language?: string
|
language?: string
|
||||||
|
|||||||
@@ -255,7 +255,7 @@ export function createHttpServer(deps: HttpServerDeps) {
|
|||||||
eventBus: deps.eventBus,
|
eventBus: deps.eventBus,
|
||||||
workspaceManager: deps.workspaceManager,
|
workspaceManager: deps.workspaceManager,
|
||||||
})
|
})
|
||||||
registerSpeechRoutes(app, { speechService: deps.speechService })
|
registerSpeechRoutes(app, { speechService: deps.speechService, logger: apiLogger })
|
||||||
registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger })
|
registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger })
|
||||||
registerBackgroundProcessRoutes(app, { backgroundProcessManager })
|
registerBackgroundProcessRoutes(app, { backgroundProcessManager })
|
||||||
registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger })
|
registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger })
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
import type { FastifyInstance } from "fastify"
|
import type { FastifyInstance } from "fastify"
|
||||||
import { z } from "zod"
|
import { z } from "zod"
|
||||||
import type { SpeechService } from "../../speech/service"
|
import type { SpeechService } from "../../speech/service"
|
||||||
|
import type { Logger } from "../../logger"
|
||||||
|
import { SpeechRealtimeSessionManager } from "../../speech/realtime-session-manager"
|
||||||
|
|
||||||
/** Collaborators injected into the speech route registration. */
interface RouteDeps {
  /** Speech backend used by the speech endpoints and by the realtime session manager. */
  speechService: SpeechService
  /** Base logger; the realtime session manager derives a child logger from it. */
  logger: Logger
}
|
||||||
|
|
||||||
const TranscribeBodySchema = z.object({
|
const TranscribeBodySchema = z.object({
|
||||||
@@ -19,9 +22,99 @@ const SynthesizeBodySchema = z.object({
|
|||||||
format: z.enum(["mp3", "wav", "opus"]).optional(),
|
format: z.enum(["mp3", "wav", "opus"]).optional(),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Optional text field: trimmed, and rejected when empty after trimming.
const optionalTrimmedText = () => z.string().trim().min(1).optional()

// Body accepted when creating a realtime session; both hints are optional.
const RealtimeSessionBodySchema = z.object({
  language: optionalTrimmedText(),
  prompt: optionalTrimmedText(),
})

// Body accepted when appending an audio chunk to a session.
const RealtimeAudioBodySchema = z.object({
  audioBase64: z.string().min(1, "Audio payload is required"),
})
|
||||||
|
|
||||||
export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) {
|
export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) {
|
||||||
|
const realtimeSessions = new SpeechRealtimeSessionManager(
|
||||||
|
deps.speechService,
|
||||||
|
deps.logger.child({ component: "speech-realtime" }),
|
||||||
|
)
|
||||||
|
|
||||||
|
app.addHook("onClose", async () => {
|
||||||
|
await realtimeSessions.dispose()
|
||||||
|
})
|
||||||
|
|
||||||
app.get("/api/speech/capabilities", async () => deps.speechService.getCapabilities())
|
app.get("/api/speech/capabilities", async () => deps.speechService.getCapabilities())
|
||||||
|
|
||||||
|
app.post("/api/speech/realtime/sessions", async (request, reply) => {
|
||||||
|
try {
|
||||||
|
const body = RealtimeSessionBodySchema.parse(request.body ?? {})
|
||||||
|
return await realtimeSessions.createSession(body)
|
||||||
|
} catch (error) {
|
||||||
|
request.log.error({ err: error }, "Failed to create realtime speech session")
|
||||||
|
reply.code(400)
|
||||||
|
return { error: error instanceof Error ? error.message : "Failed to create realtime speech session" }
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
app.get<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/events", (request, reply) => {
|
||||||
|
try {
|
||||||
|
reply.raw.setHeader("Content-Type", "text/event-stream")
|
||||||
|
reply.raw.setHeader("Cache-Control", "no-cache")
|
||||||
|
reply.raw.setHeader("Connection", "keep-alive")
|
||||||
|
reply.raw.flushHeaders?.()
|
||||||
|
reply.hijack()
|
||||||
|
|
||||||
|
const unsubscribe = realtimeSessions.subscribe(request.params.sessionId, (event) => {
|
||||||
|
reply.raw.write(`data: ${JSON.stringify(event)}\n\n`)
|
||||||
|
})
|
||||||
|
|
||||||
|
const heartbeat = setInterval(() => {
|
||||||
|
reply.raw.write(`:hb ${Date.now()}\n\n`)
|
||||||
|
}, 15000)
|
||||||
|
|
||||||
|
const close = () => {
|
||||||
|
clearInterval(heartbeat)
|
||||||
|
unsubscribe()
|
||||||
|
reply.raw.end?.()
|
||||||
|
}
|
||||||
|
|
||||||
|
request.raw.on("close", close)
|
||||||
|
request.raw.on("error", close)
|
||||||
|
} catch (error) {
|
||||||
|
request.log.error({ err: error }, "Failed to open realtime speech event stream")
|
||||||
|
reply.code(404).send({ error: error instanceof Error ? error.message : "Realtime speech session not found" })
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
app.post<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/audio", async (request, reply) => {
|
||||||
|
try {
|
||||||
|
const body = RealtimeAudioBodySchema.parse(request.body ?? {})
|
||||||
|
realtimeSessions.appendAudio(request.params.sessionId, body.audioBase64)
|
||||||
|
reply.code(204)
|
||||||
|
return undefined
|
||||||
|
} catch (error) {
|
||||||
|
request.log.error({ err: error }, "Failed to append realtime speech audio")
|
||||||
|
reply.code(400)
|
||||||
|
return { error: error instanceof Error ? error.message : "Failed to append realtime speech audio" }
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
app.post<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId/finalize", async (request, reply) => {
|
||||||
|
try {
|
||||||
|
realtimeSessions.finalize(request.params.sessionId)
|
||||||
|
reply.code(204)
|
||||||
|
return undefined
|
||||||
|
} catch (error) {
|
||||||
|
request.log.error({ err: error }, "Failed to finalize realtime speech session")
|
||||||
|
reply.code(400)
|
||||||
|
return { error: error instanceof Error ? error.message : "Failed to finalize realtime speech session" }
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
app.delete<{ Params: { sessionId: string } }>("/api/speech/realtime/sessions/:sessionId", async (request, reply) => {
|
||||||
|
realtimeSessions.closeSession(request.params.sessionId, "client_closed")
|
||||||
|
reply.code(204)
|
||||||
|
return undefined
|
||||||
|
})
|
||||||
|
|
||||||
app.post("/api/speech/transcribe", async (request, reply) => {
|
app.post("/api/speech/transcribe", async (request, reply) => {
|
||||||
try {
|
try {
|
||||||
const body = TranscribeBodySchema.parse(request.body ?? {})
|
const body = TranscribeBodySchema.parse(request.body ?? {})
|
||||||
|
|||||||
@@ -20,7 +20,13 @@ export class OpenAICompatibleSpeechProvider {
|
|||||||
provider: settings.provider,
|
provider: settings.provider,
|
||||||
supportsStt: true,
|
supportsStt: true,
|
||||||
supportsTts: true,
|
supportsTts: true,
|
||||||
|
supportsRealtimeTranscription: true,
|
||||||
|
realtimeInputFormat: {
|
||||||
|
type: "audio/pcm" as const,
|
||||||
|
rate: 24000 as const,
|
||||||
|
},
|
||||||
baseUrl: settings.baseUrl,
|
baseUrl: settings.baseUrl,
|
||||||
|
realtimeModel: settings.realtimeModel,
|
||||||
sttModel: settings.sttModel,
|
sttModel: settings.sttModel,
|
||||||
ttsModel: settings.ttsModel,
|
ttsModel: settings.ttsModel,
|
||||||
ttsVoice: settings.ttsVoice,
|
ttsVoice: settings.ttsVoice,
|
||||||
|
|||||||
525
packages/server/src/speech/realtime-session-manager.ts
Normal file
525
packages/server/src/speech/realtime-session-manager.ts
Normal file
@@ -0,0 +1,525 @@
|
|||||||
|
import { randomUUID } from "node:crypto"
|
||||||
|
import { WebSocket } from "undici"
|
||||||
|
import type { SpeechRealtimeEvent, SpeechRealtimeSessionResponse } from "../api-types"
|
||||||
|
import type { Logger } from "../logger"
|
||||||
|
import type { SpeechService } from "./service"
|
||||||
|
|
||||||
|
/** Optional hints forwarded into the transcription settings of the session update event. */
interface CreateRealtimeSessionOptions {
  /** Language value copied into the transcription config when provided. */
  language?: string
  /** Prompt value copied into the transcription config when provided. */
  prompt?: string
}
|
||||||
|
|
||||||
|
/** Per-item transcript bookkeeping kept while a session is running. */
interface TranscriptItemState {
  /** Id of the item the provider reported as preceding this one, when known. */
  previousItemId?: string
  /** Concatenation of the transcription deltas received so far. */
  partialText: string
  /** Completed transcript; stays undefined until the item is finalized. */
  finalText?: string
}
|
||||||
|
|
||||||
|
/** In-memory record for one realtime session and its upstream websocket. */
interface ManagedRealtimeSession {
  /** Session identifier handed back to the client. */
  id: string
  /** Websocket connected to the realtime provider. */
  ws: WebSocket
  /** Callbacks that receive every event emitted for this session. */
  subscribers: Set<(event: SpeechRealtimeEvent) => void>
  /** Transcript state keyed by provider item id. */
  items: Map<string, TranscriptItemState>
  /** Item ids in the order they were first registered for final emission. */
  orderedItemIds: string[]
  /** Index into `orderedItemIds` of the next item whose final transcript has not been emitted. */
  nextFinalIndex: number
  /** Epoch milliseconds at which the record was created. */
  createdAt: number
  /** Epoch milliseconds of the most recent activity; compared against the idle timeout. */
  lastActivityAt: number
  /** Set once the session has been closed; closed sessions are treated as missing. */
  closed: boolean
}
|
||||||
|
|
||||||
|
// How long to wait for the provider websocket to open before giving up.
const OPEN_TIMEOUT_MS = 10 * 1000
// Sessions with no activity for this long are closed by the sweeper (two minutes).
const IDLE_TIMEOUT_MS = 120_000
// How often the sweeper looks for idle sessions.
const SWEEP_INTERVAL_MS = 30 * 1000
|
||||||
|
|
||||||
|
/**
 * Byte budget for the reason sent in a websocket close frame. The WebSocket API rejects
 * reasons longer than 123 UTF-8 bytes; 120 keeps the previous cap for ASCII reasons.
 */
const MAX_CLOSE_REASON_BYTES = 120

/**
 * Manages realtime transcription sessions.
 *
 * Each session owns one websocket to the realtime provider. Provider events are translated
 * into `SpeechRealtimeEvent`s and fanned out to the session's subscribers. A background
 * sweeper closes sessions that have been idle for longer than `IDLE_TIMEOUT_MS`.
 */
export class SpeechRealtimeSessionManager {
  private readonly sessions = new Map<string, ManagedRealtimeSession>()
  private readonly sweepTimer: NodeJS.Timeout

  constructor(
    private readonly speechService: SpeechService,
    private readonly logger: Logger,
  ) {
    this.sweepTimer = setInterval(() => {
      this.sweepIdleSessions()
    }, SWEEP_INTERVAL_MS)
    // Do not let the sweeper alone keep the process alive.
    this.sweepTimer.unref?.()
  }

  /**
   * Opens a websocket to the realtime provider, sends the initial session update and
   * registers the session.
   *
   * @param options Optional language / prompt hints for the transcription model.
   * @returns The new session id and the audio format the client must upload.
   * @throws When the speech service is not configured, or the socket fails to open / times out.
   */
  async createSession(options: CreateRealtimeSessionOptions = {}): Promise<SpeechRealtimeSessionResponse> {
    const config = this.speechService.getRealtimeTranscriptionConfig()
    const id = randomUUID()
    const wsUrl = buildRealtimeWebSocketUrl(config.baseUrl, config.realtimeModel)
    const sessionUpdateEvent = buildSessionUpdateEvent(config, options)

    this.logger.info(
      {
        sessionId: id,
        wsUrl,
        realtimeModel: config.realtimeModel,
        sttModel: config.sttModel,
        payload: sessionUpdateEvent,
      },
      "Opening realtime speech websocket",
    )

    const ws = new WebSocket(wsUrl, {
      headers: {
        Authorization: `Bearer ${config.apiKey}`,
        ...(requiresRealtimeBetaHeader(config.baseUrl) ? { "OpenAI-Beta": "realtime=v1" } : {}),
      },
    })

    const session: ManagedRealtimeSession = {
      id,
      ws,
      subscribers: new Set(),
      items: new Map(),
      orderedItemIds: [],
      nextFinalIndex: 0,
      createdAt: Date.now(),
      lastActivityAt: Date.now(),
      closed: false,
    }

    // Register and wire handlers before awaiting so nothing emitted during the handshake is missed.
    this.sessions.set(id, session)
    this.attachSocketHandlers(session)

    try {
      await waitForSocketOpen(ws)
      this.send(session, sessionUpdateEvent)
      return {
        sessionId: id,
        inputFormat: config.inputFormat,
      }
    } catch (error) {
      this.logger.error({ sessionId: id, err: error }, "Failed to create realtime speech session")
      this.closeSession(id, error instanceof Error ? error.message : "Failed to create realtime speech session")
      throw error
    }
  }

  /**
   * Registers a subscriber for a session's events and immediately sends it `session.ready`.
   *
   * @returns A function that removes the subscriber again.
   * @throws When the session does not exist or is already closed.
   */
  subscribe(sessionId: string, send: (event: SpeechRealtimeEvent) => void): () => void {
    const session = this.getSession(sessionId)
    if (!session) {
      throw new Error("Realtime speech session not found")
    }

    session.subscribers.add(send)
    this.touch(session)
    send({ type: "session.ready", sessionId })

    return () => {
      session.subscribers.delete(send)
      this.touch(session)
    }
  }

  /**
   * Forwards one base64-encoded audio chunk to the provider.
   *
   * @throws When the session is unknown or its socket is not open.
   */
  appendAudio(sessionId: string, audioBase64: string): void {
    const session = this.requireSession(sessionId)
    this.send(session, {
      type: "input_audio_buffer.append",
      audio: audioBase64,
    })
  }

  /**
   * Asks the provider to commit the audio buffered so far.
   *
   * @throws When the session is unknown or its socket is not open.
   */
  finalize(sessionId: string): void {
    const session = this.requireSession(sessionId)
    this.send(session, {
      type: "input_audio_buffer.commit",
    })
  }

  /**
   * Closes a session: marks it closed, removes it from the registry, notifies subscribers
   * with `session.closed` and closes the websocket. Unknown or already-closed sessions are
   * ignored.
   */
  closeSession(sessionId: string, reason?: string): void {
    const session = this.sessions.get(sessionId)
    if (!session || session.closed) return

    session.closed = true
    this.sessions.delete(sessionId)
    this.emit(session, { type: "session.closed", reason })

    try {
      if (session.ws.readyState === WebSocket.OPEN || session.ws.readyState === WebSocket.CONNECTING) {
        // Truncate by UTF-8 bytes, not UTF-16 code units: an over-long reason makes close()
        // throw, which would leave the upstream socket open.
        const closeReason = reason === undefined ? "client_closed" : truncateCloseReason(reason)
        session.ws.close(1000, closeReason)
      }
    } catch (error) {
      this.logger.warn({ sessionId, err: error }, "Failed to close realtime speech websocket")
    }

    session.subscribers.clear()
  }

  /** Stops the idle sweeper and closes every remaining session with reason `server_shutdown`. */
  async dispose(): Promise<void> {
    clearInterval(this.sweepTimer)
    // Snapshot the keys because closeSession mutates the map.
    for (const sessionId of Array.from(this.sessions.keys())) {
      this.closeSession(sessionId, "server_shutdown")
    }
  }

  /** Wires message / error / close listeners of the session's websocket. */
  private attachSocketHandlers(session: ManagedRealtimeSession) {
    session.ws.addEventListener("message", (event) => {
      void this.handleSocketMessage(session, event.data)
    })

    session.ws.addEventListener("error", (event) => {
      const message = event.error instanceof Error ? event.error.message : event.message || "Realtime speech connection failed"
      this.logger.warn({ sessionId: session.id, err: event.error ?? event.message }, "Realtime speech websocket error")
      this.emit(session, { type: "session.error", message })
    })

    session.ws.addEventListener("close", (event) => {
      const reason = event.reason || (event.wasClean ? "socket_closed" : "socket_terminated")
      this.logger.info(
        {
          sessionId: session.id,
          code: event.code,
          reason,
          orderedItemIds: session.orderedItemIds,
          pendingItems: Array.from(session.items.entries()).map(([itemId, item]) => ({
            itemId,
            previousItemId: item.previousItemId,
            partialText: item.partialText,
            finalText: item.finalText,
          })),
        },
        "Realtime speech websocket closed",
      )
      this.closeSession(session.id, reason)
    })
  }

  /**
   * Decodes one raw websocket message as JSON and dispatches it. Decoding, parsing and
   * handling failures are logged and otherwise ignored.
   */
  private async handleSocketMessage(session: ManagedRealtimeSession, raw: unknown) {
    if (session.closed) return

    try {
      const payload = await toText(raw)
      const event = JSON.parse(payload) as Record<string, unknown>
      this.touch(session)
      this.handleServerEvent(session, event)
    } catch (error) {
      this.logger.warn({ sessionId: session.id, err: error }, "Failed to process realtime speech event")
    }
  }

  /** Translates one provider event into session state changes and subscriber events. */
  private handleServerEvent(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    const type = typeof event.type === "string" ? event.type : ""
    if (!type) return

    this.logger.debug({ sessionId: session.id, type }, "Realtime speech event received")
    if (type.startsWith("conversation.item") || type.startsWith("input_audio_buffer") || type.startsWith("session.")) {
      this.logger.debug({ sessionId: session.id, event }, "Realtime speech event payload")
    }

    if (type === "error") {
      const message = extractErrorMessage(event)
      this.logger.warn({ sessionId: session.id, event }, "Realtime speech provider error event")
      this.emit(session, { type: "session.error", message })
      return
    }

    if (type === "input_audio_buffer.speech_started") {
      this.emit(session, {
        type: "input.speech_started",
        itemId: readString(event.item_id),
      })
      return
    }

    if (type === "input_audio_buffer.speech_stopped") {
      this.emit(session, {
        type: "input.speech_stopped",
        itemId: readString(event.item_id),
      })
      return
    }

    if (type === "input_audio_buffer.committed") {
      const itemId = readString(event.item_id)
      if (!itemId) return
      const item = this.getOrCreateItem(session, itemId)
      item.previousItemId = readString(event.previous_item_id)
      if (!session.orderedItemIds.includes(itemId)) {
        session.orderedItemIds.push(itemId)
      }
      this.flushFinalizedItems(session)
      return
    }

    if (type === "conversation.item.created" || type === "conversation.item.added" || type === "conversation.item.done") {
      this.handleConversationItemEvent(session, event)
      return
    }

    if (type === "conversation.item.input_audio_transcription.delta") {
      const itemId = readString(event.item_id)
      const delta = readString(event.delta)
      if (!itemId || !delta) return
      const item = this.getOrCreateItem(session, itemId)
      item.partialText += delta
      // Subscribers receive the accumulated text, not the individual delta.
      this.emit(session, {
        type: "transcript.partial",
        itemId,
        text: item.partialText,
      })
      return
    }

    if (type === "conversation.item.input_audio_transcription.completed") {
      const itemId = readString(event.item_id)
      if (!itemId) return
      const item = this.getOrCreateItem(session, itemId)
      // Fall back to the accumulated deltas when the event carries no transcript.
      item.finalText = readString(event.transcript) ?? item.partialText
      this.flushFinalizedItems(session)
    }
  }

  /**
   * Handles `conversation.item.*` lifecycle events: registers the item in the emission
   * order and, when the item already carries a transcript, records it as final.
   */
  private handleConversationItemEvent(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    const itemRecord = asRecord(event.item)
    if (!itemRecord) return

    const itemId = readString(itemRecord.id) ?? readString(event.item_id)
    if (!itemId) return

    const item = this.getOrCreateItem(session, itemId)
    item.previousItemId = readString(event.previous_item_id) ?? item.previousItemId
    if (!session.orderedItemIds.includes(itemId)) {
      session.orderedItemIds.push(itemId)
    }

    const transcript = extractTranscriptFromConversationItem(itemRecord)
    if (transcript) {
      item.finalText = transcript
      this.flushFinalizedItems(session)
    }
  }

  /**
   * Emits `transcript.final` for items in registration order, stopping at the first item
   * that has no final text yet so finals are never delivered out of order.
   */
  private flushFinalizedItems(session: ManagedRealtimeSession) {
    while (session.nextFinalIndex < session.orderedItemIds.length) {
      const itemId = session.orderedItemIds[session.nextFinalIndex]
      const item = session.items.get(itemId)
      if (!item || item.finalText === undefined) {
        return
      }

      this.emit(session, {
        type: "transcript.final",
        itemId,
        previousItemId: item.previousItemId,
        text: item.finalText,
      })
      session.nextFinalIndex += 1
    }
  }

  /** Returns the transcript state for an item, creating an empty one on first use. */
  private getOrCreateItem(session: ManagedRealtimeSession, itemId: string): TranscriptItemState {
    const existing = session.items.get(itemId)
    if (existing) return existing
    const created: TranscriptItemState = { partialText: "" }
    session.items.set(itemId, created)
    return created
  }

  /** Delivers an event to every subscriber; a throwing subscriber is logged and skipped. */
  private emit(session: ManagedRealtimeSession, event: SpeechRealtimeEvent) {
    for (const subscriber of session.subscribers) {
      try {
        subscriber(event)
      } catch (error) {
        this.logger.warn({ sessionId: session.id, err: error, type: event.type }, "Failed to emit realtime speech event")
      }
    }
  }

  /** Like `getSession`, but throws when the session is missing or closed. */
  private requireSession(sessionId: string): ManagedRealtimeSession {
    const session = this.getSession(sessionId)
    if (!session) {
      throw new Error("Realtime speech session not found")
    }
    return session
  }

  /** Looks up an open session; returns null for unknown or closed sessions. */
  private getSession(sessionId: string): ManagedRealtimeSession | null {
    const session = this.sessions.get(sessionId) ?? null
    if (!session || session.closed) return null
    return session
  }

  /**
   * Serializes an event to JSON and sends it to the provider.
   *
   * @throws When the session is closed or its socket is not in the OPEN state.
   */
  private send(session: ManagedRealtimeSession, event: Record<string, unknown>) {
    if (session.closed || session.ws.readyState !== WebSocket.OPEN) {
      throw new Error("Realtime speech session is not connected")
    }

    session.ws.send(JSON.stringify(event))
    this.touch(session)
  }

  /** Records activity so the idle sweeper leaves the session alone. */
  private touch(session: ManagedRealtimeSession) {
    session.lastActivityAt = Date.now()
  }

  /** Closes every open session whose last activity is at least `IDLE_TIMEOUT_MS` old. */
  private sweepIdleSessions() {
    const now = Date.now()
    for (const [sessionId, session] of this.sessions) {
      if (session.closed) continue
      if (now - session.lastActivityAt < IDLE_TIMEOUT_MS) continue
      this.logger.info({ sessionId }, "Closing idle realtime speech session")
      this.closeSession(sessionId, "idle_timeout")
    }
  }
}

/**
 * Shortens a close reason to at most `MAX_CLOSE_REASON_BYTES` UTF-8 bytes, cutting only
 * between code points so a multi-byte character is never split.
 */
function truncateCloseReason(reason: string): string {
  if (Buffer.byteLength(reason, "utf8") <= MAX_CLOSE_REASON_BYTES) return reason

  let truncated = ""
  let usedBytes = 0
  // String iteration yields whole code points, so surrogate pairs stay together.
  for (const char of reason) {
    const charBytes = Buffer.byteLength(char, "utf8")
    if (usedBytes + charBytes > MAX_CLOSE_REASON_BYTES) break
    truncated += char
    usedBytes += charBytes
  }
  return truncated
}
|
||||||
|
|
||||||
|
/**
 * Derives the realtime websocket URL from an HTTP(S) or WS(S) API base URL.
 *
 * - Falls back to `https://api.openai.com/v1` when no base URL is given.
 * - Maps `http:` to `ws:` and keeps an explicit `ws:`; every other scheme becomes `wss:`.
 * - Ensures the path ends in `/realtime` (trailing slashes are dropped) and removes any hash.
 * - Adds `model` as a query parameter unless the base URL already specifies one.
 *
 * @param baseUrl API base URL, possibly undefined or blank.
 * @param model Realtime model name used when the URL carries no `model` parameter.
 * @returns The websocket URL as a string.
 * @throws TypeError when `baseUrl` is not a parseable URL.
 */
function buildRealtimeWebSocketUrl(baseUrl: string | undefined, model: string): string {
  const target = new URL(baseUrl?.trim() || "https://api.openai.com/v1")

  // An explicitly insecure base URL (http: or ws:) must stay insecure; previously ws: was
  // silently upgraded to wss:, which breaks plain-websocket local endpoints.
  const insecure = target.protocol === "http:" || target.protocol === "ws:"
  target.protocol = insecure ? "ws:" : "wss:"

  const normalizedPath = target.pathname.replace(/\/+$/, "")
  target.pathname = normalizedPath.endsWith("/realtime") ? normalizedPath : `${normalizedPath}/realtime`
  target.hash = ""

  if (!target.searchParams.has("model")) {
    target.searchParams.set("model", model)
  }
  return target.toString()
}
|
||||||
|
|
||||||
|
/**
 * Decides whether the connection should use the `OpenAI-Beta` realtime header and the
 * matching session payload shape.
 *
 * @returns `true` only for a parseable base URL whose host is not `api.openai.com`;
 *          `false` for missing, blank or unparseable values.
 */
function requiresRealtimeBetaHeader(baseUrl?: string): boolean {
  const candidate = baseUrl ?? ""
  if (candidate.trim().length === 0) return false

  let host: string
  try {
    host = new URL(candidate).hostname
  } catch {
    // An unparseable base URL is treated like the default endpoint.
    return false
  }
  return host.toLowerCase() !== "api.openai.com"
}
|
||||||
|
|
||||||
|
/**
 * Builds the `session.update` event sent right after the websocket opens.
 *
 * Two payload shapes exist and are selected by `requiresRealtimeBetaHeader(config.baseUrl)`:
 * a flat one (`input_audio_transcription` + `turn_detection`) and a nested transcription
 * one (`type: "transcription"` with `audio.input`). Both carry the same transcription
 * settings and the same server-VAD parameters.
 *
 * @param config Resolved realtime configuration (models, base URL, input format).
 * @param options Optional language / prompt hints; omitted from the payload when empty.
 */
function buildSessionUpdateEvent(
  config: { baseUrl?: string; sttModel: string; realtimeModel: string; inputFormat: { type: "audio/pcm"; rate: 24000 } },
  options: CreateRealtimeSessionOptions,
): Record<string, unknown> {
  // Key order (model, language, prompt) is kept so the serialized payload is unchanged.
  const transcription: Record<string, unknown> = { model: config.sttModel }
  if (options.language) transcription.language = options.language
  if (options.prompt) transcription.prompt = options.prompt

  const turnDetection = {
    type: "server_vad",
    threshold: 0.45,
    prefix_padding_ms: 250,
    silence_duration_ms: 400,
  }

  const session = requiresRealtimeBetaHeader(config.baseUrl)
    ? {
        input_audio_transcription: transcription,
        turn_detection: turnDetection,
      }
    : {
        type: "transcription",
        audio: {
          input: {
            format: config.inputFormat,
            noise_reduction: { type: "near_field" },
            transcription,
            turn_detection: turnDetection,
          },
        },
      }

  return { type: "session.update", session }
}
|
||||||
|
|
||||||
|
/**
 * Resolves once the websocket is open.
 *
 * Rejects when the socket reports an error, closes before opening, or does not open
 * within `OPEN_TIMEOUT_MS`. Temporary listeners and the timer are always removed before
 * the promise settles.
 */
function waitForSocketOpen(ws: WebSocket): Promise<void> {
  if (ws.readyState === WebSocket.OPEN) {
    return Promise.resolve()
  }

  return new Promise((resolve, reject) => {
    let done = false

    // Removes everything this function installed on the socket.
    const detach = () => {
      clearTimeout(timer)
      ws.removeEventListener("open", onOpen)
      ws.removeEventListener("error", onError)
      ws.removeEventListener("close", onClose)
    }

    // Guarantees that only the first of open / error / close settles the promise.
    const settleOnce = (action: () => void) => {
      if (done) return
      done = true
      detach()
      action()
    }

    const onOpen = () => {
      settleOnce(resolve)
    }

    const onError = (event: { error?: unknown; message?: string }) => {
      settleOnce(() => reject(event.error instanceof Error ? event.error : new Error(event.message || "Failed to connect")))
    }

    const onClose = () => {
      settleOnce(() => reject(new Error("Realtime speech connection closed before initialization")))
    }

    const timer = setTimeout(() => {
      detach()
      reject(new Error("Timed out connecting to realtime speech provider"))
    }, OPEN_TIMEOUT_MS)

    ws.addEventListener("open", onOpen)
    ws.addEventListener("error", onError as any)
    ws.addEventListener("close", onClose)
  })
}
|
||||||
|
|
||||||
|
/**
 * Converts a websocket message payload to a string.
 *
 * Strings pass through; `ArrayBuffer`, typed-array / `DataView` and `Blob` payloads are
 * decoded as UTF-8; anything else is stringified, with null / undefined becoming "".
 */
async function toText(data: unknown): Promise<string> {
  if (typeof data === "string") return data

  if (data instanceof ArrayBuffer) {
    const bytes = Buffer.from(data)
    return bytes.toString("utf-8")
  }

  if (ArrayBuffer.isView(data)) {
    // Respect the view's window instead of decoding the whole underlying buffer.
    const bytes = Buffer.from(data.buffer, data.byteOffset, data.byteLength)
    return bytes.toString("utf-8")
  }

  // Blob is checked for existence first because it is not defined in every runtime.
  if (typeof Blob !== "undefined" && data instanceof Blob) {
    const contents = await data.arrayBuffer()
    return Buffer.from(contents).toString("utf-8")
  }

  return String(data ?? "")
}
|
||||||
|
|
||||||
|
/**
 * Picks a human-readable message out of a provider error event.
 *
 * Preference order: `event.error.message`, then `event.message`, then a generic fallback.
 * Empty strings and non-string values are skipped.
 */
function extractErrorMessage(event: Record<string, unknown>): string {
  const { error } = event
  const nested =
    error && typeof error === "object" ? readString((error as Record<string, unknown>).message) : undefined

  return nested ?? readString(event.message) ?? "Realtime speech request failed"
}
|
||||||
|
|
||||||
|
/** Returns `value` when it is a non-empty string, otherwise `undefined`. */
function readString(value: unknown): string | undefined {
  if (typeof value !== "string") return undefined
  return value.length === 0 ? undefined : value
}
|
||||||
|
|
||||||
|
/** Returns `value` typed as a plain record when it is a non-null, non-array object; otherwise `null`. */
function asRecord(value: unknown): Record<string, unknown> | null {
  if (!value) return null
  if (typeof value !== "object") return null
  if (Array.isArray(value)) return null
  return value as Record<string, unknown>
}
|
||||||
|
|
||||||
|
/**
 * Finds transcript text inside a conversation item.
 *
 * Checks the item's own `transcript` / `text` first, then walks `item.content` and returns
 * the first part that offers `transcript`, `text` or `audio.transcript`. Non-object parts
 * are ignored. Returns `undefined` when nothing usable is found.
 */
function extractTranscriptFromConversationItem(item: Record<string, unknown>): string | undefined {
  const pickText = (source: Record<string, unknown>): string | undefined =>
    readString(source.transcript) ?? readString(source.text)

  const topLevel = pickText(item)
  if (topLevel) return topLevel

  if (!Array.isArray(item.content)) return undefined

  for (const entry of item.content) {
    const part = asRecord(entry)
    if (!part) continue

    const found = pickText(part) ?? readString(asRecord(part.audio)?.transcript)
    if (found) return found
  }

  return undefined
}
|
||||||
@@ -10,6 +10,8 @@ const ServerSpeechSettingsSchema = z.object({
|
|||||||
provider: z.string().optional(),
|
provider: z.string().optional(),
|
||||||
apiKey: z.string().optional(),
|
apiKey: z.string().optional(),
|
||||||
baseUrl: z.string().optional(),
|
baseUrl: z.string().optional(),
|
||||||
|
useRealtime: z.boolean().optional(),
|
||||||
|
realtimeModel: z.string().optional(),
|
||||||
sttModel: z.string().optional(),
|
sttModel: z.string().optional(),
|
||||||
ttsModel: z.string().optional(),
|
ttsModel: z.string().optional(),
|
||||||
ttsVoice: z.string().optional(),
|
ttsVoice: z.string().optional(),
|
||||||
@@ -40,12 +42,26 @@ export interface NormalizedSpeechSettings {
|
|||||||
provider: string
|
provider: string
|
||||||
apiKey?: string
|
apiKey?: string
|
||||||
baseUrl?: string
|
baseUrl?: string
|
||||||
|
realtimeModel: string
|
||||||
sttModel: string
|
sttModel: string
|
||||||
ttsModel: string
|
ttsModel: string
|
||||||
ttsVoice: string
|
ttsVoice: string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Fully resolved settings needed to open a realtime transcription connection. */
export interface RealtimeTranscriptionConfig {
  /** Configured speech provider name. */
  provider: string
  /** API key; always present here, unlike in the raw settings. */
  apiKey: string
  /** Optional API base URL override. */
  baseUrl?: string
  /** Model name used for the realtime connection itself. */
  realtimeModel: string
  /** Model name used for transcription inside the session. */
  sttModel: string
  /** Audio format expected from the client. */
  inputFormat: { type: "audio/pcm"; rate: 24000 }
}
|
||||||
|
|
||||||
const DEFAULT_PROVIDER = "openai-compatible"
|
const DEFAULT_PROVIDER = "openai-compatible"
|
||||||
|
const DEFAULT_REALTIME_MODEL = "gpt-realtime"
|
||||||
const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
|
const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
|
||||||
const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
|
const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
|
||||||
const DEFAULT_TTS_VOICE = "alloy"
|
const DEFAULT_TTS_VOICE = "alloy"
|
||||||
@@ -67,6 +83,25 @@ export class SpeechService {
|
|||||||
return this.createProvider().synthesize(input)
|
return this.createProvider().synthesize(input)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolves the current speech settings into a realtime transcription config.
 *
 * @returns The provider, credentials and models, plus the fixed 24 kHz PCM input format.
 * @throws Error when no API key is configured.
 */
getRealtimeTranscriptionConfig(): RealtimeTranscriptionConfig {
  const { provider, apiKey, baseUrl, realtimeModel, sttModel } = this.resolveSettings()

  if (!apiKey) {
    throw new Error("Speech provider is not configured. Add an API key in Speech settings.")
  }

  return {
    provider,
    apiKey,
    baseUrl,
    realtimeModel,
    sttModel,
    inputFormat: { type: "audio/pcm", rate: 24000 },
  }
}
|
||||||
|
|
||||||
private createProvider(): SpeechProvider {
|
private createProvider(): SpeechProvider {
|
||||||
const settings = this.resolveSettings()
|
const settings = this.resolveSettings()
|
||||||
return new OpenAICompatibleSpeechProvider({
|
return new OpenAICompatibleSpeechProvider({
|
||||||
@@ -83,6 +118,7 @@ export class SpeechService {
|
|||||||
provider: speech.provider?.trim() || DEFAULT_PROVIDER,
|
provider: speech.provider?.trim() || DEFAULT_PROVIDER,
|
||||||
apiKey: speech.apiKey?.trim() || process.env.OPENAI_API_KEY,
|
apiKey: speech.apiKey?.trim() || process.env.OPENAI_API_KEY,
|
||||||
baseUrl: speech.baseUrl?.trim() || process.env.OPENAI_BASE_URL || undefined,
|
baseUrl: speech.baseUrl?.trim() || process.env.OPENAI_BASE_URL || undefined,
|
||||||
|
realtimeModel: speech.realtimeModel?.trim() || DEFAULT_REALTIME_MODEL,
|
||||||
sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
|
sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
|
||||||
ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
|
ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
|
||||||
ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
|
ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ import { getCommands } from "../stores/commands"
|
|||||||
import { showAlertDialog } from "../stores/alerts"
|
import { showAlertDialog } from "../stores/alerts"
|
||||||
import { useI18n } from "../lib/i18n"
|
import { useI18n } from "../lib/i18n"
|
||||||
import { getLogger } from "../lib/logger"
|
import { getLogger } from "../lib/logger"
|
||||||
import { preferences } from "../stores/preferences"
|
import { preferences, useConfig } from "../stores/preferences"
|
||||||
import type { ExpandState, PromptInputApi, PromptInputProps, PromptInsertMode, PromptMode } from "./prompt-input/types"
|
import type { ExpandState, PromptInputApi, PromptInputProps, PromptInsertMode, PromptMode } from "./prompt-input/types"
|
||||||
import { usePromptState } from "./prompt-input/usePromptState"
|
import { usePromptState } from "./prompt-input/usePromptState"
|
||||||
import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
|
import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
|
||||||
@@ -22,6 +22,7 @@ const log = getLogger("actions")
|
|||||||
|
|
||||||
export default function PromptInput(props: PromptInputProps) {
|
export default function PromptInput(props: PromptInputProps) {
|
||||||
const { t } = useI18n()
|
const { t } = useI18n()
|
||||||
|
const { serverSettings } = useConfig()
|
||||||
const [, setIsFocused] = createSignal(false)
|
const [, setIsFocused] = createSignal(false)
|
||||||
const [mode, setMode] = createSignal<PromptMode>("normal")
|
const [mode, setMode] = createSignal<PromptMode>("normal")
|
||||||
const [expandState, setExpandState] = createSignal<ExpandState>("normal")
|
const [expandState, setExpandState] = createSignal<ExpandState>("normal")
|
||||||
@@ -418,6 +419,7 @@ export default function PromptInput(props: PromptInputProps) {
|
|||||||
getTextarea: () => textareaRef ?? null,
|
getTextarea: () => textareaRef ?? null,
|
||||||
enabled: () => preferences().showPromptVoiceInput,
|
enabled: () => preferences().showPromptVoiceInput,
|
||||||
disabled: () => Boolean(props.disabled),
|
disabled: () => Boolean(props.disabled),
|
||||||
|
useRealtime: () => serverSettings().speech.useRealtime,
|
||||||
})
|
})
|
||||||
const showVoiceInput = () =>
|
const showVoiceInput = () =>
|
||||||
preferences().showPromptVoiceInput &&
|
preferences().showPromptVoiceInput &&
|
||||||
|
|||||||
@@ -0,0 +1,110 @@
|
|||||||
|
/** Handle for a microphone capture started by `createRealtimePcmStream`. */
export interface RealtimePcmStreamHandle {
  /** Ends the capture; the returned promise settles once teardown has finished. */
  stop(): Promise<void>
}
|
||||||
|
|
||||||
|
/** Options accepted by `createRealtimePcmStream`. */
interface CreateRealtimePcmStreamOptions {
  /** Receives each captured chunk as a base64 string; may be synchronous or return a promise. */
  onChunk: (audioBase64: string) => void | Promise<void>
}
|
||||||
|
|
||||||
|
// Sample rate (Hz) that captured audio is resampled to before it is encoded.
const TARGET_SAMPLE_RATE = 24000

// Buffer size passed to createScriptProcessor for each audio callback.
const PROCESSOR_BUFFER_SIZE = 4096
|
||||||
|
|
||||||
|
/**
 * Starts capturing microphone audio and forwards it to `options.onChunk` as
 * base64-encoded 16-bit PCM resampled to TARGET_SAMPLE_RATE.
 *
 * @param options - `onChunk` is invoked once per processed audio buffer.
 * @returns A handle whose `stop()` disconnects the audio graph, stops the
 *   microphone tracks and closes the AudioContext. Calling it twice is a no-op.
 * @throws If microphone access is rejected, AudioContext is unavailable, or the
 *   audio graph cannot be set up. The microphone is released before the error
 *   is rethrown so the capture indicator does not stay on.
 */
export async function createRealtimePcmStream(
  options: CreateRealtimePcmStreamOptions,
): Promise<RealtimePcmStreamHandle> {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  })

  let audioContext: AudioContext | null = null

  try {
    const AudioContextCtor = window.AudioContext || (window as any).webkitAudioContext
    if (!AudioContextCtor) {
      throw new Error("AudioContext is not supported in this browser.")
    }

    const context: AudioContext = new AudioContextCtor()
    audioContext = context
    await context.resume()

    const source = context.createMediaStreamSource(stream)
    const processor = context.createScriptProcessor(PROCESSOR_BUFFER_SIZE, 1, 1)
    // The processor is routed through a muted gain node to the destination so
    // the graph is fully connected without playing the microphone back.
    const sink = context.createGain()
    sink.gain.value = 0

    source.connect(processor)
    processor.connect(sink)
    sink.connect(context.destination)

    processor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0)
      const resampled = downsampleBuffer(input, context.sampleRate, TARGET_SAMPLE_RATE)
      if (resampled.length === 0) return
      const pcm16 = floatTo16BitPcm(resampled)
      void options.onChunk(base64EncodePcm16(pcm16))
    }

    let stopped = false

    return {
      async stop() {
        if (stopped) return
        stopped = true
        processor.onaudioprocess = null
        source.disconnect()
        processor.disconnect()
        sink.disconnect()
        stream.getTracks().forEach((track) => track.stop())
        await context.close()
      },
    }
  } catch (error) {
    // Setup failed after the microphone was opened: release it (and any
    // half-built context) instead of leaking a live capture.
    stream.getTracks().forEach((track) => track.stop())
    if (audioContext && audioContext.state !== "closed") {
      await audioContext.close().catch(() => undefined)
    }
    throw error
  }
}
|
||||||
|
|
||||||
|
/**
 * Resamples mono float samples from `inputSampleRate` to `outputSampleRate`.
 *
 * Each output sample is the mean of the input samples that fall into its
 * window. When a window is empty (output rate higher than input rate) the
 * nearest input sample is repeated.
 *
 * @param buffer - Input samples; never mutated.
 * @param inputSampleRate - Rate of `buffer` in Hz.
 * @param outputSampleRate - Desired rate in Hz.
 * @returns A new Float32Array. Empty input always yields an empty result.
 */
function downsampleBuffer(buffer: Float32Array, inputSampleRate: number, outputSampleRate: number): Float32Array {
  // Without this guard an empty input at differing rates produced a
  // one-element [NaN] buffer, which slipped past callers' length checks.
  if (buffer.length === 0) {
    return new Float32Array(0)
  }

  if (inputSampleRate === outputSampleRate) {
    return buffer.slice()
  }

  const sampleRateRatio = inputSampleRate / outputSampleRate
  const outputLength = Math.max(1, Math.round(buffer.length / sampleRateRatio))
  const output = new Float32Array(outputLength)
  let inputIndex = 0

  for (let outputIndex = 0; outputIndex < outputLength; outputIndex += 1) {
    const nextInputIndex = Math.min(buffer.length, Math.round((outputIndex + 1) * sampleRateRatio))
    const count = nextInputIndex - inputIndex

    let sum = 0
    for (let i = inputIndex; i < nextInputIndex; i += 1) {
      sum += buffer[i]
    }

    // Empty window: fall back to the closest available input sample.
    output[outputIndex] = count > 0 ? sum / count : buffer[Math.min(buffer.length - 1, inputIndex)]
    inputIndex = nextInputIndex
  }

  return output
}
|
||||||
|
|
||||||
|
/**
 * Converts float samples to signed 16-bit PCM.
 *
 * Samples are clamped to [-1, 1]; negative values scale by 0x8000 and
 * non-negative values by 0x7fff, then round to the nearest integer.
 */
function floatTo16BitPcm(buffer: Float32Array): Int16Array {
  return Int16Array.from(buffer, (value) => {
    const clamped = Math.min(1, Math.max(-1, value))
    const scale = clamped < 0 ? 0x8000 : 0x7fff
    return Math.round(clamped * scale)
  })
}
|
||||||
|
|
||||||
|
/**
 * Base64-encodes the bytes of a 16-bit PCM buffer.
 *
 * Only the bytes covered by the `buffer` view are encoded, so subarray views
 * over a larger ArrayBuffer are handled correctly. Bytes are taken in the
 * platform's in-memory order for Int16Array.
 *
 * @param buffer - PCM samples to encode.
 * @returns The base64 string; empty for an empty buffer.
 */
function base64EncodePcm16(buffer: Int16Array): string {
  // Respect byteOffset/byteLength: `buffer.buffer` alone is the whole backing store.
  const bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength)
  // Convert in slices so the spread below never exceeds argument-count limits.
  const chunkSize = 0x8000
  const parts: string[] = []
  for (let offset = 0; offset < bytes.length; offset += chunkSize) {
    parts.push(String.fromCharCode(...bytes.subarray(offset, offset + chunkSize)))
  }
  return btoa(parts.join(""))
}
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
/** Snapshot of a prompt together with the character range that dictated text replaces. */
export interface PromptVoiceAnchor {
  /** Prompt text that the offsets refer to. */
  prompt: string
  /** Offset where the replaced range begins; text before it is kept. */
  start: number
  /** Offset where the replaced range ends; text from here on is kept. */
  end: number
}
|
||||||
|
|
||||||
|
/** Bundles a prompt and a selection range into a PromptVoiceAnchor without validating them. */
export function createPromptVoiceAnchor(prompt: string, start: number, end: number): PromptVoiceAnchor {
  const anchor: PromptVoiceAnchor = {
    prompt: prompt,
    start: start,
    end: end,
  }
  return anchor
}
|
||||||
|
|
||||||
|
/**
 * Replaces the anchored range of the prompt with a transcript.
 *
 * The transcript is trimmed. A single space is added on either side when the
 * neighbouring text exists and does not already provide whitespace. A blank
 * transcript just removes the anchored range.
 *
 * @returns The new prompt text and the cursor offset right after the transcript.
 */
export function buildPromptWithInsertedTranscript(anchor: PromptVoiceAnchor, insertedText: string): { value: string; cursor: number } {
  const head = anchor.prompt.slice(0, anchor.start)
  const tail = anchor.prompt.slice(anchor.end)
  const transcript = insertedText.trim()

  if (transcript.length === 0) {
    return { value: head + tail, cursor: head.length }
  }

  const needsLeadingSpace = head.length > 0 && !/\s$/.test(head)
  const needsTrailingSpace = tail.length > 0 && !/^\s/.test(tail)
  const leading = needsLeadingSpace ? `${head} ` : head
  const trailing = needsTrailingSpace ? ` ${tail}` : tail

  return {
    value: leading + transcript + trailing,
    cursor: leading.length + transcript.length,
  }
}
|
||||||
|
|
||||||
|
/**
 * Appends a trimmed transcript segment to the text accumulated so far.
 *
 * A blank segment leaves `current` untouched; a blank `current` is replaced by
 * the segment; otherwise the two are joined with exactly one separating space
 * unless `current` already ends in whitespace.
 */
export function appendVoiceTranscript(current: string, next: string): string {
  const addition = next.trim()
  if (addition === "") return current
  if (current.trim() === "") return addition

  const separator = /\s$/.test(current) ? "" : " "
  return current + separator + addition
}
|
||||||
@@ -0,0 +1,241 @@
|
|||||||
|
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
|
||||||
|
import { showAlertDialog } from "../../stores/alerts"
|
||||||
|
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
||||||
|
import { serverApi } from "../../lib/api-client"
|
||||||
|
import { useI18n } from "../../lib/i18n"
|
||||||
|
import { createPromptVoiceAnchor, buildPromptWithInsertedTranscript } from "./promptVoiceInsertion"
|
||||||
|
|
||||||
|
/** Dependencies the buffered (record-then-transcribe) voice input hook needs from the prompt. */
interface UsePromptBufferedVoiceInputOptions {
  /** Current prompt text. */
  prompt: Accessor<string>
  /** Replaces the prompt text. */
  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  /** Returns the prompt textarea, or null when it is not available. */
  getTextarea: () => HTMLTextAreaElement | null
  /** Whether voice input is switched on. */
  enabled: Accessor<boolean>
  /** Whether the prompt is currently disabled. */
  disabled: Accessor<boolean>
}
|
||||||
|
|
||||||
|
/** Lifecycle of buffered voice input: waiting, capturing audio, or awaiting the transcription result. */
type VoiceInputState = "idle" | "recording" | "transcribing"
|
||||||
|
|
||||||
|
/**
 * Voice input that records a complete clip with MediaRecorder, sends it to the
 * server for transcription once recording stops, and inserts the resulting
 * text at the textarea selection.
 *
 * @param options - Prompt accessors and enable/disable flags.
 * @returns Reactive state plus `toggleRecording` / `cancelRecording` controls
 *   and helpers for the record button.
 */
export function usePromptBufferedVoiceInput(options: UsePromptBufferedVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<VoiceInputState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)

  let mediaRecorder: MediaRecorder | null = null
  let mediaStream: MediaStream | null = null
  let timerId: number | undefined
  // Whether the recorder's "stop" event should upload the clip and edit the prompt.
  let shouldTranscribe = true
  let recordedChunks: Blob[] = []
  let recordingStartedAt = 0

  createEffect(() => {
    void loadSpeechCapabilities()
  })

  onCleanup(() => {
    cleanupMedia(false)
  })

  const isSupported = () => {
    if (typeof window === "undefined") return false
    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
  }

  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt,
    )
  }

  /** Starts a recording, or stops the running one so it gets transcribed. */
  async function toggleRecording(): Promise<void> {
    if (state() === "recording") {
      stopRecording()
      return
    }

    if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return

    try {
      await startRecording()
    } catch (error) {
      // Reset the state too: a failure after the switch to "recording" would
      // otherwise leave the hook stuck there with no recorder to stop.
      cleanupMedia()
      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }

  /** Ends the recording; the recorder's "stop" event then triggers transcription. */
  function stopRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = true
    mediaRecorder.stop()
    setState("transcribing")
    stopTimer()
  }

  /** Ends the recording and discards the captured audio. */
  function cancelRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = false
    mediaRecorder.stop()
    cleanupMedia(false)
  }

  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }

    recordedChunks = []
    shouldTranscribe = true
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
    mediaRecorder = createRecorder(mediaStream)

    mediaRecorder.addEventListener("dataavailable", (event) => {
      if (event.data.size > 0) {
        recordedChunks.push(event.data)
      }
    })

    mediaRecorder.addEventListener("stop", () => {
      void finalizeRecording()
    })

    recordingStartedAt = Date.now()
    setElapsedMs(0)
    setState("recording")
    startTimer()
    mediaRecorder.start()
  }

  /** Runs on the recorder's "stop" event: transcribes the clip unless it was discarded. */
  async function finalizeRecording() {
    const recorder = mediaRecorder
    const stream = mediaStream
    mediaRecorder = null
    mediaStream = null

    if (!shouldTranscribe || recordedChunks.length === 0) {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
      return
    }

    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"

    try {
      const audioBlob = new Blob(recordedChunks, { type: mimeType })
      const transcription = await serverApi.transcribeAudio({
        audioBase64: await blobToBase64(audioBlob),
        mimeType,
      })
      const text = transcription.text.trim()
      // shouldTranscribe turns false if the hook was torn down while waiting;
      // in that case the prompt must not be touched any more.
      if (shouldTranscribe && text) {
        insertTranscript(text)
      }
    } catch (error) {
      showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    } finally {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
    }
  }

  /** Inserts the transcript at the textarea selection and moves the caret after it. */
  function insertTranscript(text: string) {
    const current = options.prompt()
    const textarea = options.getTextarea()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    const { value, cursor } = buildPromptWithInsertedTranscript(
      createPromptVoiceAnchor(current, start, end),
      text,
    )

    options.setPrompt(value)
    if (textarea) {
      setTimeout(() => {
        textarea.focus()
        textarea.setSelectionRange(cursor, cursor)
      }, 0)
    }
  }

  /**
   * Releases the recorder, microphone and timer and discards captured audio.
   *
   * @param resetState - When true, also returns the signals to their idle values.
   */
  function cleanupMedia(resetState = true) {
    stopTimer()
    // Teardown always discards: stopping the recorder below fires its "stop"
    // event, which must not upload the clip or modify the prompt.
    shouldTranscribe = false
    if (mediaRecorder && mediaRecorder.state !== "inactive") {
      mediaRecorder.stop()
    }
    mediaRecorder = null
    stopTracks(mediaStream)
    mediaStream = null
    recordedChunks = []
    if (resetState) {
      setState("idle")
      setElapsedMs(0)
    }
  }

  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }

  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }

  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "recording",
    isTranscribing: () => state() === "transcribing",
    buttonTitle: () => {
      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
}
|
||||||
|
|
||||||
|
/**
 * Creates a MediaRecorder for the stream using the first container/codec from
 * a fixed preference list that the browser reports as supported. When
 * `isTypeSupported` is unavailable the first candidate is used unchecked; when
 * no candidate is supported the browser default is used.
 */
function createRecorder(stream: MediaStream): MediaRecorder {
  for (const mimeType of ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]) {
    const cannotProbe = typeof MediaRecorder.isTypeSupported !== "function"
    if (cannotProbe || MediaRecorder.isTypeSupported(mimeType)) {
      return new MediaRecorder(stream, { mimeType })
    }
  }
  return new MediaRecorder(stream)
}
|
||||||
|
|
||||||
|
/** Stops every track of the stream; does nothing when there is no stream. */
function stopTracks(stream: MediaStream | null) {
  if (!stream) return
  for (const track of stream.getTracks()) {
    track.stop()
  }
}
|
||||||
|
|
||||||
|
/** Reads the blob's bytes and returns them base64-encoded. */
async function blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer())
  // btoa expects a "binary string": one character per byte.
  const chars: string[] = new Array(bytes.length)
  bytes.forEach((byte, index) => {
    chars[index] = String.fromCharCode(byte)
  })
  return btoa(chars.join(""))
}
|
||||||
@@ -0,0 +1,325 @@
|
|||||||
|
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
|
||||||
|
import { showAlertDialog } from "../../stores/alerts"
|
||||||
|
import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client"
|
||||||
|
import { useI18n } from "../../lib/i18n"
|
||||||
|
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
||||||
|
import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream"
|
||||||
|
import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion"
|
||||||
|
|
||||||
|
/** Dependencies the realtime (streaming) voice input hook needs from the prompt. */
interface UsePromptRealtimeVoiceInputOptions {
  /** Current prompt text. */
  prompt: Accessor<string>
  /** Replaces the prompt text; `persistDraft: false` is passed for interim transcripts. */
  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  /** Returns the prompt textarea, or null when it is not available. */
  getTextarea: () => HTMLTextAreaElement | null
  /** Whether voice input is switched on. */
  enabled: Accessor<boolean>
  /** Whether the prompt is currently disabled. */
  disabled: Accessor<boolean>
}
|
||||||
|
|
||||||
|
/** Lifecycle of realtime voice input: idle, opening the session, streaming audio, or waiting for the last transcript. */
type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing"
|
||||||
|
|
||||||
|
// Delay (ms) after finalizing a session before it is closed if no final transcript arrives sooner.
const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000
|
||||||
|
|
||||||
|
/**
 * Voice input that streams microphone audio to a server-side realtime
 * transcription session and renders partial and final transcripts into the
 * prompt while the user is speaking.
 *
 * The prompt and selection are captured as an anchor when recording starts.
 * Every render rebuilds the prompt from that anchor plus the transcript so
 * far, and cancelling or failing restores the anchored prompt.
 *
 * @param options - Prompt accessors and enable/disable flags.
 * @returns Reactive state plus `toggleRecording` / `cancelRecording` controls
 *   and helpers for the record button.
 */
export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<RealtimeVoiceState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)

  let activeSessionId: string | null = null
  let eventSource: EventSource | null = null
  let pcmStream: RealtimePcmStreamHandle | null = null
  // Serializes audio uploads so chunks reach the server in capture order.
  let audioQueue: Promise<void> = Promise.resolve()
  let timerId: number | undefined
  let recordingStartedAt = 0
  let finalizeTimerId: number | undefined
  let anchor = createPromptVoiceAnchor("", 0, 0)
  let finalTranscript = ""
  let liveTranscript = ""
  let activeLiveItemId: string | null = null
  let closing = false
  // Bumped by every start and every cleanup so a startRecording call that is
  // still awaiting can tell that it has been superseded.
  let startGeneration = 0

  createEffect(() => {
    void loadSpeechCapabilities()
  })

  onCleanup(() => {
    cancelRecording()
  })

  const isSupported = () => {
    if (typeof window === "undefined") return false
    return Boolean(window.AudioContext || (window as any).webkitAudioContext) && Boolean(navigator.mediaDevices?.getUserMedia) && typeof EventSource !== "undefined"
  }

  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt &&
        capabilities?.supportsRealtimeTranscription,
    )
  }

  /** Starts a session, or stops the running one so its transcript is finalized. */
  async function toggleRecording(): Promise<void> {
    if (state() === "listening" || state() === "connecting") {
      await stopRecording()
      return
    }

    if (!canUseVoiceInput() || options.disabled() || state() === "finalizing") return

    try {
      await startRecording()
    } catch (error) {
      await cleanupSession({ revertPrompt: true, closeRemote: true })
      showAlertDialog(t("promptInput.voiceInput.error.connection"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }

  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }

    startGeneration += 1
    const generation = startGeneration

    resetTranscriptState()
    captureAnchor()
    setState("connecting")
    setElapsedMs(0)

    const created = await serverApi.createRealtimeSpeechSession({
      language: detectLanguage(),
    })
    if (generation !== startGeneration) {
      // Cancelled while the session was being created. Cleanup never saw this
      // session id, so it has to be closed here.
      await serverApi.closeRealtimeSpeechSession(created.sessionId).catch(() => undefined)
      return
    }
    activeSessionId = created.sessionId
    connectEventStream(created.sessionId)

    const stream = await createRealtimePcmStream({
      onChunk: (audioBase64) => {
        const sessionId = activeSessionId
        if (!sessionId || closing) return
        audioQueue = audioQueue
          .then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 }))
          .catch((error) => {
            handleRealtimeError(error)
          })
      },
    })
    if (generation !== startGeneration || state() !== "connecting") {
      // Stopped or cancelled while the microphone was opening: release it
      // rather than switching an ended session back to "listening".
      await stream.stop().catch(() => undefined)
      return
    }
    pcmStream = stream

    recordingStartedAt = Date.now()
    startTimer()
    setState("listening")
  }

  /** Stops capturing, flushes queued audio and asks the server to finalize the transcript. */
  async function stopRecording() {
    const sessionId = activeSessionId
    if (!sessionId || (state() !== "listening" && state() !== "connecting")) return

    setState("finalizing")
    stopTimer()

    if (pcmStream) {
      const stream = pcmStream
      pcmStream = null
      await stream.stop()
    }

    try {
      await audioQueue.catch(() => undefined)
      await serverApi.finalizeRealtimeSpeechSession(sessionId)
      // Upper bound on the wait; a final transcript shortens it (see handleEvent).
      scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS)
    } catch (error) {
      handleRealtimeError(error)
    }
  }

  /** Aborts the current session, if any, and restores the prompt captured at start. */
  function cancelRecording() {
    void cleanupSession({ revertPrompt: true, closeRemote: true })
  }

  function connectEventStream(sessionId: string) {
    eventSource?.close()
    eventSource = serverApi.connectRealtimeSpeechEvents(
      sessionId,
      (event) => handleEvent(event),
      () => {
        if (closing) return
        handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection")))
      },
    )
  }

  function handleEvent(event: SpeechRealtimeEvent) {
    if (event.type === "session.ready") {
      return
    }

    if (event.type === "session.error") {
      handleRealtimeError(new Error(event.message))
      return
    }

    if (event.type === "transcript.partial") {
      activeLiveItemId = event.itemId
      liveTranscript = event.text
      // Interim text is shown but not persisted as a draft.
      renderPrompt(false)
      return
    }

    if (event.type === "transcript.final") {
      activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId
      liveTranscript = ""
      finalTranscript = appendVoiceTranscript(finalTranscript, event.text)
      renderPrompt(true)
      if (state() === "finalizing") {
        // The awaited transcript has arrived; close shortly instead of waiting
        // for the full timeout.
        scheduleFinalizeClose(250)
      }
      return
    }

    if (event.type === "session.closed") {
      void cleanupSession({ revertPrompt: false, closeRemote: false })
    }
  }

  /** Remembers the prompt and selection that transcripts are inserted into. */
  function captureAnchor() {
    const textarea = options.getTextarea()
    const current = options.prompt()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    anchor = createPromptVoiceAnchor(current, start, end)
  }

  /** Rebuilds the prompt from the anchor plus final and live transcript text. */
  function renderPrompt(persistDraft: boolean) {
    const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(finalTranscript && liveTranscript.trim() ? " " : "")
    const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted)
    options.setPrompt(value, persistDraft ? undefined : { persistDraft: false })
    syncTextareaCursor(cursor)
  }

  function syncTextareaCursor(cursor: number) {
    const textarea = options.getTextarea()
    if (!textarea) return
    queueMicrotask(() => {
      const next = options.getTextarea()
      if (!next) return
      next.focus()
      next.setSelectionRange(cursor, cursor)
    })
  }

  /** (Re)schedules the cleanup that closes a finalizing session after `delayMs`. */
  function scheduleFinalizeClose(delayMs: number) {
    if (finalizeTimerId !== undefined) {
      window.clearTimeout(finalizeTimerId)
    }
    finalizeTimerId = window.setTimeout(() => {
      void cleanupSession({ revertPrompt: false, closeRemote: true })
    }, delayMs)
  }

  /**
   * Tears down timers, the event stream and the microphone, optionally closes
   * the remote session, then either restores the anchored prompt or commits
   * the transcript. Calls made while a cleanup is already running are ignored.
   */
  async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) {
    if (closing) return
    closing = true

    // Invalidate any startRecording call that is still awaiting.
    startGeneration += 1
    // The anchor is only meaningful while a session is in progress. When idle
    // (for example cancel or unmount without recording) it is empty or stale,
    // and restoring it would wipe the user's prompt.
    const hadSession = state() !== "idle"

    if (finalizeTimerId !== undefined) {
      window.clearTimeout(finalizeTimerId)
      finalizeTimerId = undefined
    }

    stopTimer()

    const sessionId = activeSessionId
    activeSessionId = null

    eventSource?.close()
    eventSource = null

    if (pcmStream) {
      const stream = pcmStream
      pcmStream = null
      await stream.stop().catch(() => undefined)
    }

    await audioQueue.catch(() => undefined)
    audioQueue = Promise.resolve()

    if (cleanupOptions.closeRemote && sessionId) {
      await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined)
    }

    // No final transcript arrived: keep the last partial text instead of losing it.
    if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) {
      finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript)
      liveTranscript = ""
    }

    if (cleanupOptions.revertPrompt) {
      if (hadSession) {
        options.setPrompt(anchor.prompt)
      }
    } else if (finalTranscript.trim()) {
      renderPrompt(true)
    }

    resetTranscriptState()
    setState("idle")
    setElapsedMs(0)
    closing = false
  }

  function resetTranscriptState() {
    finalTranscript = ""
    liveTranscript = ""
    activeLiveItemId = null
  }

  /** Aborts the session, restores the prompt and reports the error to the user. */
  function handleRealtimeError(error: unknown) {
    if (closing) return
    void cleanupSession({ revertPrompt: true, closeRemote: true })
    showAlertDialog(t("promptInput.voiceInput.error.connection"), {
      title: t("promptInput.voiceInput.error.title"),
      detail: error instanceof Error ? error.message : String(error),
      variant: "error",
    })
  }

  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }

  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }

  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "connecting" || state() === "listening",
    isTranscribing: () => state() === "finalizing",
    buttonTitle: () => {
      if (state() === "connecting") return t("promptInput.voiceInput.connecting.title")
      if (state() === "listening") return t("promptInput.voiceInput.stop.title")
      if (state() === "finalizing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
}
|
||||||
|
|
||||||
|
/**
 * Returns the part of `navigator.language` before the first "-" (for example
 * "en" for "en-US"), or undefined when there is no navigator or that part is
 * blank.
 */
function detectLanguage(): string | undefined {
  if (typeof navigator === "undefined") return undefined
  const primarySubtag = navigator.language.split("-")[0]
  const trimmed = primarySubtag?.trim()
  return trimmed ? trimmed : undefined
}
|
||||||
@@ -22,7 +22,7 @@ type HistorySelectOptions = {
|
|||||||
|
|
||||||
type PromptState = {
|
type PromptState = {
|
||||||
prompt: Accessor<string>
|
prompt: Accessor<string>
|
||||||
setPrompt: (value: string) => void
|
setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
|
||||||
clearPrompt: () => void
|
clearPrompt: () => void
|
||||||
|
|
||||||
draftLoadedNonce: Accessor<number>
|
draftLoadedNonce: Accessor<number>
|
||||||
@@ -48,11 +48,11 @@ export function usePromptState(options: PromptStateOptions): PromptState {
|
|||||||
const [historyDraft, setHistoryDraft] = createSignal<string | null>(null)
|
const [historyDraft, setHistoryDraft] = createSignal<string | null>(null)
|
||||||
const [draftLoadedNonce, setDraftLoadedNonce] = createSignal(0)
|
const [draftLoadedNonce, setDraftLoadedNonce] = createSignal(0)
|
||||||
|
|
||||||
/**
 * Updates the prompt text and, unless the caller passes `persistDraft: false`,
 * stores it as the session draft.
 */
const setPrompt = (value: string, setOptions?: { persistDraft?: boolean }) => {
  setPromptInternal(value)

  const skipDraft = setOptions?.persistDraft === false
  // Persist drafts only when the user is at the "fresh" position (not browsing history).
  // This keeps the bottom-of-history draft stable even if the user edits recalled history entries.
  if (skipDraft || historyIndex() !== -1) return

  setSessionDraftPrompt(options.instanceId(), options.sessionId(), value)
}
|
||||||
|
|||||||
@@ -1,242 +1,30 @@
|
|||||||
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
|
import type { Accessor } from "solid-js"
|
||||||
import { showAlertDialog } from "../../stores/alerts"
|
import { usePromptBufferedVoiceInput } from "./usePromptBufferedVoiceInput"
|
||||||
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
import { usePromptRealtimeVoiceInput } from "./usePromptRealtimeVoiceInput"
|
||||||
import { serverApi } from "../../lib/api-client"
|
|
||||||
import { useI18n } from "../../lib/i18n"
|
|
||||||
|
|
||||||
/** Inputs accepted by `usePromptVoiceInput` and forwarded to the voice-input implementations. */
interface UsePromptVoiceInputOptions {
  /** Accessor for the current prompt text. */
  prompt: Accessor<string>
  /** Writes a new prompt value; `persistDraft` is passed through to the prompt setter. */
  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  /** Returns the prompt textarea element, or `null` when there is none. */
  getTextarea: () => HTMLTextAreaElement | null
  /** Whether voice input is enabled for this prompt. */
  enabled: Accessor<boolean>
  /** Whether the prompt is currently disabled. */
  disabled: Accessor<boolean>
  /** Selects the realtime implementation when true, the buffered one otherwise. */
  useRealtime: Accessor<boolean>
}
|
||||||
|
|
||||||
type VoiceInputState = "idle" | "recording" | "transcribing"
|
|
||||||
|
|
||||||
/**
 * Prompt voice-input facade that delegates to either the buffered or the
 * realtime implementation.
 *
 * Both implementations are instantiated; `options.useRealtime()` picks the
 * preferred one. If the preference flips while the other implementation is
 * still recording or transcribing, calls keep going to that busy implementation
 * until it finishes, so an in-flight session can still be stopped or cancelled
 * from the UI instead of being orphaned.
 *
 * @param options Shared voice-input options, passed unchanged to both implementations.
 * @returns State accessors and actions that forward to the active implementation.
 */
export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) {
  const buffered = usePromptBufferedVoiceInput(options)
  const realtime = usePromptRealtimeVoiceInput(options)

  // An implementation is busy while it is capturing audio or waiting on a transcript.
  const isBusy = (input: typeof buffered | typeof realtime) => input.isRecording() || input.isTranscribing()

  const active = () => {
    const wantsRealtime = options.useRealtime()
    const preferred = wantsRealtime ? realtime : buffered
    const fallback = wantsRealtime ? buffered : realtime
    // Stay on a session that is still running even if the setting changed underneath it.
    return !isBusy(preferred) && isBusy(fallback) ? fallback : preferred
  }

  return {
    state: () => active().state(),
    elapsedMs: () => active().elapsedMs(),
    canUseVoiceInput: () => active().canUseVoiceInput(),
    toggleRecording: () => active().toggleRecording(),
    cancelRecording: () => active().cancelRecording(),
    isRecording: () => active().isRecording(),
    isTranscribing: () => active().isTranscribing(),
    buttonTitle: () => active().buttonTitle(),
  }
}
|
||||||
|
|
||||||
/**
 * Creates a `MediaRecorder` for the stream using the first supported MIME type
 * from a fixed preference list, or the browser default when none matches.
 * If `MediaRecorder.isTypeSupported` is not a function, the first candidate is used.
 */
function createRecorder(stream: MediaStream): MediaRecorder {
  const canProbe = typeof MediaRecorder.isTypeSupported === "function"

  for (const mimeType of ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]) {
    if (!canProbe || MediaRecorder.isTypeSupported(mimeType)) {
      return new MediaRecorder(stream, { mimeType })
    }
  }

  return new MediaRecorder(stream)
}
|
|
||||||
|
|
||||||
/** Stops every track of the given stream; does nothing when the stream is `null`. */
function stopTracks(stream: MediaStream | null) {
  if (!stream) return

  for (const track of stream.getTracks()) {
    track.stop()
  }
}
|
|
||||||
|
|
||||||
/**
 * Encodes the bytes of a blob as a base64 string.
 *
 * The bytes are converted to a binary string in fixed-size chunks rather than
 * one character at a time; the chunk size keeps each `String.fromCharCode`
 * call well below engine argument-count limits.
 *
 * @param blob Blob whose contents should be encoded.
 * @returns Base64 text of the blob's bytes (empty string for an empty blob).
 */
async function blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer())
  const chunkSize = 0x8000
  const parts: string[] = []

  for (let offset = 0; offset < bytes.length; offset += chunkSize) {
    parts.push(String.fromCharCode(...bytes.subarray(offset, offset + chunkSize)))
  }

  return btoa(parts.join(""))
}
|
|
||||||
|
|||||||
@@ -10,6 +10,8 @@ const log = getLogger("actions")
|
|||||||
type DraftFields = {
|
type DraftFields = {
|
||||||
apiKey: string
|
apiKey: string
|
||||||
baseUrl: string
|
baseUrl: string
|
||||||
|
useRealtime: boolean
|
||||||
|
realtimeModel: string
|
||||||
sttModel: string
|
sttModel: string
|
||||||
ttsModel: string
|
ttsModel: string
|
||||||
ttsVoice: string
|
ttsVoice: string
|
||||||
@@ -19,6 +21,8 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
|
|||||||
return {
|
return {
|
||||||
apiKey: speech.apiKey ?? "",
|
apiKey: speech.apiKey ?? "",
|
||||||
baseUrl: speech.baseUrl ?? "",
|
baseUrl: speech.baseUrl ?? "",
|
||||||
|
useRealtime: speech.useRealtime,
|
||||||
|
realtimeModel: speech.realtimeModel,
|
||||||
sttModel: speech.sttModel,
|
sttModel: speech.sttModel,
|
||||||
ttsModel: speech.ttsModel,
|
ttsModel: speech.ttsModel,
|
||||||
ttsVoice: speech.ttsVoice,
|
ttsVoice: speech.ttsVoice,
|
||||||
@@ -26,7 +30,7 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Returns true when both drafts hold identical values for every compared speech field. */
function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
  const comparedKeys: (keyof DraftFields)[] = [
    "apiKey",
    "baseUrl",
    "useRealtime",
    "realtimeModel",
    "sttModel",
    "ttsModel",
    "ttsVoice",
  ]
  return comparedKeys.every((key) => a[key] === b[key])
}
|
||||||
|
|
||||||
export const SpeechSettingsCard: Component = () => {
|
export const SpeechSettingsCard: Component = () => {
|
||||||
@@ -57,7 +61,7 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
|
return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Writes one draft field (typed per key) and resets the save status to "idle". */
const updateDraft = <K extends keyof DraftFields>(key: K, value: DraftFields[K]) => {
  setSaveStatus("idle")
  setDrafts((previous) => {
    const next = { ...previous }
    next[key] = value
    return next
  })
}
|
||||||
@@ -65,12 +69,14 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
/**
 * Memo that is true when any draft field differs from the saved speech settings.
 * `apiKey` and `baseUrl` treat a missing value and an empty string as the same.
 */
const isDirty = createMemo(() => {
  const saved = serverSettings().speech
  const draft = drafts()

  if ((draft.apiKey || "") !== (saved.apiKey || "")) return true
  if ((draft.baseUrl || "") !== (saved.baseUrl || "")) return true
  if (draft.useRealtime !== saved.useRealtime) return true
  if (draft.realtimeModel !== saved.realtimeModel) return true
  if (draft.sttModel !== saved.sttModel) return true
  if (draft.ttsModel !== saved.ttsModel) return true
  return draft.ttsVoice !== saved.ttsVoice
})
|
||||||
|
|
||||||
@@ -90,6 +96,8 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
await updateSpeechSettings({
|
await updateSpeechSettings({
|
||||||
apiKey: current.apiKey.trim() || undefined,
|
apiKey: current.apiKey.trim() || undefined,
|
||||||
baseUrl: current.baseUrl.trim() || undefined,
|
baseUrl: current.baseUrl.trim() || undefined,
|
||||||
|
useRealtime: current.useRealtime,
|
||||||
|
realtimeModel: current.realtimeModel.trim() || undefined,
|
||||||
sttModel: current.sttModel.trim() || undefined,
|
sttModel: current.sttModel.trim() || undefined,
|
||||||
ttsModel: current.ttsModel.trim() || undefined,
|
ttsModel: current.ttsModel.trim() || undefined,
|
||||||
ttsVoice: current.ttsVoice.trim() || undefined,
|
ttsVoice: current.ttsVoice.trim() || undefined,
|
||||||
@@ -98,6 +106,8 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
setDrafts({
|
setDrafts({
|
||||||
apiKey: current.apiKey.trim(),
|
apiKey: current.apiKey.trim(),
|
||||||
baseUrl: current.baseUrl.trim(),
|
baseUrl: current.baseUrl.trim(),
|
||||||
|
useRealtime: current.useRealtime,
|
||||||
|
realtimeModel: current.realtimeModel.trim() || serverSettings().speech.realtimeModel,
|
||||||
sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
|
sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
|
||||||
ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
|
ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
|
||||||
ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
|
ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
|
||||||
@@ -159,6 +169,27 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
onInput={(value) => updateDraft("baseUrl", value)}
|
onInput={(value) => updateDraft("baseUrl", value)}
|
||||||
placeholder={t("settings.speech.baseUrl.placeholder")}
|
placeholder={t("settings.speech.baseUrl.placeholder")}
|
||||||
/>
|
/>
|
||||||
|
<div class="settings-toggle-row">
|
||||||
|
<div>
|
||||||
|
<div class="settings-toggle-title">{t("settings.speech.realtime.title")}</div>
|
||||||
|
<div class="settings-toggle-caption">{t("settings.speech.realtime.subtitle")}</div>
|
||||||
|
</div>
|
||||||
|
<label class="settings-checkbox-toggle">
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
checked={drafts().useRealtime}
|
||||||
|
onChange={(event) => updateDraft("useRealtime", event.currentTarget.checked)}
|
||||||
|
/>
|
||||||
|
<span>{t("settings.common.enabled")}</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
<Field
|
||||||
|
label={t("settings.speech.realtimeModel.title")}
|
||||||
|
caption={t("settings.speech.realtimeModel.subtitle")}
|
||||||
|
value={drafts().realtimeModel}
|
||||||
|
onInput={(value) => updateDraft("realtimeModel", value)}
|
||||||
|
placeholder={t("settings.speech.realtimeModel.placeholder")}
|
||||||
|
/>
|
||||||
<Field
|
<Field
|
||||||
label={t("settings.speech.sttModel.title")}
|
label={t("settings.speech.sttModel.title")}
|
||||||
caption={t("settings.speech.sttModel.subtitle")}
|
caption={t("settings.speech.sttModel.subtitle")}
|
||||||
|
|||||||
@@ -8,6 +8,8 @@ import type {
|
|||||||
FileSystemListResponse,
|
FileSystemListResponse,
|
||||||
InstanceData,
|
InstanceData,
|
||||||
SpeechCapabilitiesResponse,
|
SpeechCapabilitiesResponse,
|
||||||
|
SpeechRealtimeEvent,
|
||||||
|
SpeechRealtimeSessionResponse,
|
||||||
SpeechSynthesisResponse,
|
SpeechSynthesisResponse,
|
||||||
SpeechTranscriptionResponse,
|
SpeechTranscriptionResponse,
|
||||||
ServerMeta,
|
ServerMeta,
|
||||||
@@ -39,6 +41,10 @@ export function buildBackgroundProcessStreamUrl(instanceId: string, processId: s
|
|||||||
return buildAbsoluteUrl(`/workspaces/${encodedInstanceId}/plugin/background-processes/${encodedProcessId}/stream`)
|
return buildAbsoluteUrl(`/workspaces/${encodedInstanceId}/plugin/background-processes/${encodedProcessId}/stream`)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Builds the absolute events URL for a realtime speech session, URL-encoding the session id. */
export function buildRealtimeSpeechEventsUrl(sessionId: string): string {
  const encodedSessionId = encodeURIComponent(sessionId)
  return buildAbsoluteUrl(`/api/speech/realtime/sessions/${encodedSessionId}/events`)
}
|
||||||
|
|
||||||
function buildEventsUrl(base: string | undefined, path: string): string {
|
function buildEventsUrl(base: string | undefined, path: string): string {
|
||||||
if (path.startsWith("http://") || path.startsWith("https://")) {
|
if (path.startsWith("http://") || path.startsWith("https://")) {
|
||||||
return path
|
return path
|
||||||
@@ -241,6 +247,29 @@ export const serverApi = {
|
|||||||
fetchSpeechCapabilities(): Promise<SpeechCapabilitiesResponse> {
|
fetchSpeechCapabilities(): Promise<SpeechCapabilitiesResponse> {
|
||||||
return request<SpeechCapabilitiesResponse>("/api/speech/capabilities")
|
return request<SpeechCapabilitiesResponse>("/api/speech/capabilities")
|
||||||
},
|
},
|
||||||
|
/** Creates a realtime speech session by POSTing the optional payload (an empty object when omitted). */
createRealtimeSpeechSession(payload?: { language?: string; prompt?: string }): Promise<SpeechRealtimeSessionResponse> {
  const body = JSON.stringify(payload ?? {})
  return request<SpeechRealtimeSessionResponse>("/api/speech/realtime/sessions", { method: "POST", body })
},
|
||||||
|
/** POSTs a base64 audio payload to the given realtime speech session. */
appendRealtimeSpeechAudio(sessionId: string, payload: { audioBase64: string }): Promise<void> {
  const path = `/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/audio`
  return request(path, { method: "POST", body: JSON.stringify(payload) })
},
|
||||||
|
/** POSTs an empty body to the finalize endpoint of the given realtime speech session. */
finalizeRealtimeSpeechSession(sessionId: string): Promise<void> {
  const path = `/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}/finalize`
  return request(path, { method: "POST", body: JSON.stringify({}) })
},
|
||||||
|
/** Sends a DELETE request for the given realtime speech session. */
closeRealtimeSpeechSession(sessionId: string): Promise<void> {
  const path = `/api/speech/realtime/sessions/${encodeURIComponent(sessionId)}`
  return request(path, { method: "DELETE" })
},
|
||||||
transcribeAudio(payload: {
|
transcribeAudio(payload: {
|
||||||
audioBase64: string
|
audioBase64: string
|
||||||
mimeType: string
|
mimeType: string
|
||||||
@@ -332,21 +361,34 @@ export const serverApi = {
|
|||||||
},
|
},
|
||||||
/** Opens the workspace event stream at `EVENTS_URL` and forwards each parsed event to `onEvent`. */
connectEvents(onEvent: (event: WorkspaceEventPayload) => void, onError?: () => void) {
  sseLogger.info(`Connecting to ${EVENTS_URL}`)
  return connectEventSource<WorkspaceEventPayload>(EVENTS_URL, onEvent, onError)
},
/** Opens the event stream of one realtime speech session and forwards each parsed event to `onEvent`. */
connectRealtimeSpeechEvents(
  sessionId: string,
  onEvent: (event: SpeechRealtimeEvent) => void,
  onError?: () => void,
) {
  const eventsUrl = buildRealtimeSpeechEventsUrl(sessionId)
  sseLogger.info(`Connecting to ${eventsUrl}`)
  return connectEventSource<SpeechRealtimeEvent>(eventsUrl, onEvent, onError)
},
|
||||||
}
|
}
|
||||||
|
|
||||||
export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType }
|
/**
 * Opens a credentialed `EventSource` and delivers each message, parsed as JSON, to `onEvent`.
 *
 * Parsing and dispatch are guarded separately so that an exception thrown by
 * `onEvent` is not reported as a parse failure. Neither kind of error escapes
 * the message handler; both are logged.
 *
 * @param url Stream URL to connect to.
 * @param onEvent Receives each parsed payload, asserted (not validated) as `T`.
 * @param onError Optional callback invoked on every `EventSource` error event.
 * @returns The created `EventSource`; the caller owns closing it.
 */
function connectEventSource<T>(url: string, onEvent: (event: T) => void, onError?: () => void) {
  const source = new EventSource(url, { withCredentials: true })

  source.onmessage = (event) => {
    let payload: T
    try {
      payload = JSON.parse(event.data) as T
    } catch (error) {
      sseLogger.error("Failed to parse event", error)
      return
    }

    try {
      onEvent(payload)
    } catch (error) {
      sseLogger.error("Event handler failed", error)
    }
  }

  source.onerror = () => {
    // NOTE(review): the message says "closing stream" but nothing here calls
    // source.close(); presumably the caller closes it from onError — confirm.
    sseLogger.warn("EventSource error, closing stream")
    onError?.()
  }

  return source
}
|
||||||
|
|
||||||
|
export type { WorkspaceDescriptor, WorkspaceLogEntry, WorkspaceEventPayload, WorkspaceEventType, SpeechRealtimeEvent }
|
||||||
|
|||||||
@@ -140,8 +140,10 @@ export const messagingMessages = {
|
|||||||
"promptInput.send.errorTitle": "Send failed",
|
"promptInput.send.errorTitle": "Send failed",
|
||||||
"promptInput.voiceInput.start.title": "Start voice input",
|
"promptInput.voiceInput.start.title": "Start voice input",
|
||||||
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
||||||
|
"promptInput.voiceInput.connecting.title": "Connecting microphone",
|
||||||
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
||||||
"promptInput.voiceInput.error.title": "Voice input failed",
|
"promptInput.voiceInput.error.title": "Voice input failed",
|
||||||
|
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
|
||||||
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
||||||
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
||||||
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
||||||
|
|||||||
@@ -156,13 +156,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.baseUrl.title": "Base URL",
|
"settings.speech.baseUrl.title": "Base URL",
|
||||||
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
||||||
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
||||||
|
"settings.speech.realtime.title": "Realtime dictation",
|
||||||
|
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
|
||||||
|
"settings.speech.realtimeModel.title": "Realtime model",
|
||||||
|
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
|
||||||
|
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
|
||||||
"settings.speech.sttModel.title": "Transcription model",
|
"settings.speech.sttModel.title": "Transcription model",
|
||||||
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
||||||
"settings.speech.ttsModel.title": "Speech model",
|
"settings.speech.ttsModel.title": "Speech model",
|
||||||
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
||||||
"settings.speech.ttsVoice.title": "Default voice",
|
"settings.speech.ttsVoice.title": "Default voice",
|
||||||
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
||||||
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
|
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
|
||||||
"settings.speech.save.action": "Save",
|
"settings.speech.save.action": "Save",
|
||||||
"settings.speech.save.saving": "Saving...",
|
"settings.speech.save.saving": "Saving...",
|
||||||
"settings.speech.save.saved": "Saved",
|
"settings.speech.save.saved": "Saved",
|
||||||
|
|||||||
@@ -142,8 +142,10 @@ export const messagingMessages = {
|
|||||||
"promptInput.send.errorTitle": "Error al enviar",
|
"promptInput.send.errorTitle": "Error al enviar",
|
||||||
"promptInput.voiceInput.start.title": "Start voice input",
|
"promptInput.voiceInput.start.title": "Start voice input",
|
||||||
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
||||||
|
"promptInput.voiceInput.connecting.title": "Connecting microphone",
|
||||||
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
||||||
"promptInput.voiceInput.error.title": "Voice input failed",
|
"promptInput.voiceInput.error.title": "Voice input failed",
|
||||||
|
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
|
||||||
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
||||||
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
||||||
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
||||||
|
|||||||
@@ -156,13 +156,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.baseUrl.title": "Base URL",
|
"settings.speech.baseUrl.title": "Base URL",
|
||||||
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
||||||
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
||||||
|
"settings.speech.realtime.title": "Realtime dictation",
|
||||||
|
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
|
||||||
|
"settings.speech.realtimeModel.title": "Realtime model",
|
||||||
|
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
|
||||||
|
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
|
||||||
"settings.speech.sttModel.title": "Transcription model",
|
"settings.speech.sttModel.title": "Transcription model",
|
||||||
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
||||||
"settings.speech.ttsModel.title": "Speech model",
|
"settings.speech.ttsModel.title": "Speech model",
|
||||||
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
||||||
"settings.speech.ttsVoice.title": "Default voice",
|
"settings.speech.ttsVoice.title": "Default voice",
|
||||||
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
||||||
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
|
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
|
||||||
"settings.speech.save.action": "Save",
|
"settings.speech.save.action": "Save",
|
||||||
"settings.speech.save.saving": "Saving...",
|
"settings.speech.save.saving": "Saving...",
|
||||||
"settings.speech.save.saved": "Saved",
|
"settings.speech.save.saved": "Saved",
|
||||||
|
|||||||
@@ -142,8 +142,10 @@ export const messagingMessages = {
|
|||||||
"promptInput.send.errorTitle": "Échec de l'envoi",
|
"promptInput.send.errorTitle": "Échec de l'envoi",
|
||||||
"promptInput.voiceInput.start.title": "Start voice input",
|
"promptInput.voiceInput.start.title": "Start voice input",
|
||||||
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
||||||
|
"promptInput.voiceInput.connecting.title": "Connecting microphone",
|
||||||
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
||||||
"promptInput.voiceInput.error.title": "Voice input failed",
|
"promptInput.voiceInput.error.title": "Voice input failed",
|
||||||
|
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
|
||||||
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
||||||
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
||||||
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
||||||
|
|||||||
@@ -156,13 +156,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.baseUrl.title": "Base URL",
|
"settings.speech.baseUrl.title": "Base URL",
|
||||||
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
||||||
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
||||||
|
"settings.speech.realtime.title": "Realtime dictation",
|
||||||
|
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
|
||||||
|
"settings.speech.realtimeModel.title": "Realtime model",
|
||||||
|
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
|
||||||
|
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
|
||||||
"settings.speech.sttModel.title": "Transcription model",
|
"settings.speech.sttModel.title": "Transcription model",
|
||||||
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
||||||
"settings.speech.ttsModel.title": "Speech model",
|
"settings.speech.ttsModel.title": "Speech model",
|
||||||
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
||||||
"settings.speech.ttsVoice.title": "Default voice",
|
"settings.speech.ttsVoice.title": "Default voice",
|
||||||
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
||||||
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
|
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
|
||||||
"settings.speech.save.action": "Save",
|
"settings.speech.save.action": "Save",
|
||||||
"settings.speech.save.saving": "Saving...",
|
"settings.speech.save.saving": "Saving...",
|
||||||
"settings.speech.save.saved": "Saved",
|
"settings.speech.save.saved": "Saved",
|
||||||
|
|||||||
@@ -142,8 +142,10 @@ export const messagingMessages = {
|
|||||||
"promptInput.send.errorTitle": "送信に失敗",
|
"promptInput.send.errorTitle": "送信に失敗",
|
||||||
"promptInput.voiceInput.start.title": "Start voice input",
|
"promptInput.voiceInput.start.title": "Start voice input",
|
||||||
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
||||||
|
"promptInput.voiceInput.connecting.title": "Connecting microphone",
|
||||||
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
||||||
"promptInput.voiceInput.error.title": "Voice input failed",
|
"promptInput.voiceInput.error.title": "Voice input failed",
|
||||||
|
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
|
||||||
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
||||||
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
||||||
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
||||||
|
|||||||
@@ -156,13 +156,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.baseUrl.title": "Base URL",
|
"settings.speech.baseUrl.title": "Base URL",
|
||||||
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
||||||
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
||||||
|
"settings.speech.realtime.title": "Realtime dictation",
|
||||||
|
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
|
||||||
|
"settings.speech.realtimeModel.title": "Realtime model",
|
||||||
|
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
|
||||||
|
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
|
||||||
"settings.speech.sttModel.title": "Transcription model",
|
"settings.speech.sttModel.title": "Transcription model",
|
||||||
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
||||||
"settings.speech.ttsModel.title": "Speech model",
|
"settings.speech.ttsModel.title": "Speech model",
|
||||||
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
||||||
"settings.speech.ttsVoice.title": "Default voice",
|
"settings.speech.ttsVoice.title": "Default voice",
|
||||||
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
||||||
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
|
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
|
||||||
"settings.speech.save.action": "Save",
|
"settings.speech.save.action": "Save",
|
||||||
"settings.speech.save.saving": "Saving...",
|
"settings.speech.save.saving": "Saving...",
|
||||||
"settings.speech.save.saved": "Saved",
|
"settings.speech.save.saved": "Saved",
|
||||||
|
|||||||
@@ -142,8 +142,10 @@ export const messagingMessages = {
|
|||||||
"promptInput.send.errorTitle": "Не удалось отправить",
|
"promptInput.send.errorTitle": "Не удалось отправить",
|
||||||
"promptInput.voiceInput.start.title": "Start voice input",
|
"promptInput.voiceInput.start.title": "Start voice input",
|
||||||
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
||||||
|
"promptInput.voiceInput.connecting.title": "Connecting microphone",
|
||||||
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
||||||
"promptInput.voiceInput.error.title": "Voice input failed",
|
"promptInput.voiceInput.error.title": "Voice input failed",
|
||||||
|
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
|
||||||
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
||||||
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
||||||
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
||||||
|
|||||||
@@ -156,13 +156,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.baseUrl.title": "Base URL",
|
"settings.speech.baseUrl.title": "Base URL",
|
||||||
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
||||||
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
||||||
|
"settings.speech.realtime.title": "Realtime dictation",
|
||||||
|
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
|
||||||
|
"settings.speech.realtimeModel.title": "Realtime model",
|
||||||
|
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
|
||||||
|
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
|
||||||
"settings.speech.sttModel.title": "Transcription model",
|
"settings.speech.sttModel.title": "Transcription model",
|
||||||
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
||||||
"settings.speech.ttsModel.title": "Speech model",
|
"settings.speech.ttsModel.title": "Speech model",
|
||||||
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
||||||
"settings.speech.ttsVoice.title": "Default voice",
|
"settings.speech.ttsVoice.title": "Default voice",
|
||||||
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
||||||
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
|
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
|
||||||
"settings.speech.save.action": "Save",
|
"settings.speech.save.action": "Save",
|
||||||
"settings.speech.save.saving": "Saving...",
|
"settings.speech.save.saving": "Saving...",
|
||||||
"settings.speech.save.saved": "Saved",
|
"settings.speech.save.saved": "Saved",
|
||||||
|
|||||||
@@ -142,8 +142,10 @@ export const messagingMessages = {
|
|||||||
"promptInput.send.errorTitle": "发送失败",
|
"promptInput.send.errorTitle": "发送失败",
|
||||||
"promptInput.voiceInput.start.title": "Start voice input",
|
"promptInput.voiceInput.start.title": "Start voice input",
|
||||||
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
"promptInput.voiceInput.stop.title": "Stop recording and transcribe",
|
||||||
|
"promptInput.voiceInput.connecting.title": "Connecting microphone",
|
||||||
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
"promptInput.voiceInput.transcribing.title": "Transcribing audio",
|
||||||
"promptInput.voiceInput.error.title": "Voice input failed",
|
"promptInput.voiceInput.error.title": "Voice input failed",
|
||||||
|
"promptInput.voiceInput.error.connection": "Unable to start realtime voice input.",
|
||||||
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
"promptInput.voiceInput.error.permission": "Microphone access is required to record voice input.",
|
||||||
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
"promptInput.voiceInput.error.unsupported": "Voice input is not supported in this browser.",
|
||||||
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
"promptInput.voiceInput.error.transcribe": "Unable to transcribe the recorded audio.",
|
||||||
|
|||||||
@@ -156,13 +156,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.baseUrl.title": "Base URL",
|
"settings.speech.baseUrl.title": "Base URL",
|
||||||
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
"settings.speech.baseUrl.subtitle": "Optional override for OpenAI-compatible speech endpoints.",
|
||||||
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
"settings.speech.baseUrl.placeholder": "https://api.openai.com/v1",
|
||||||
|
"settings.speech.realtime.title": "Realtime dictation",
|
||||||
|
"settings.speech.realtime.subtitle": "Use the Realtime API for prompt voice input. Disable this if your speech server only supports standard transcription uploads.",
|
||||||
|
"settings.speech.realtimeModel.title": "Realtime model",
|
||||||
|
"settings.speech.realtimeModel.subtitle": "Model used when opening the Realtime WebSocket session. This is separate from the speech-to-text transcription model.",
|
||||||
|
"settings.speech.realtimeModel.placeholder": "gpt-realtime",
|
||||||
"settings.speech.sttModel.title": "Transcription model",
|
"settings.speech.sttModel.title": "Transcription model",
|
||||||
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
"settings.speech.sttModel.subtitle": "Model used for prompt speech-to-text requests.",
|
||||||
"settings.speech.ttsModel.title": "Speech model",
|
"settings.speech.ttsModel.title": "Speech model",
|
||||||
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
||||||
"settings.speech.ttsVoice.title": "Default voice",
|
"settings.speech.ttsVoice.title": "Default voice",
|
||||||
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
||||||
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
|
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and the selected dictation mode is supported by this browser and provider.",
|
||||||
"settings.speech.save.action": "Save",
|
"settings.speech.save.action": "Save",
|
||||||
"settings.speech.save.saving": "Saving...",
|
"settings.speech.save.saving": "Saving...",
|
||||||
"settings.speech.save.saved": "Saved",
|
"settings.speech.save.saved": "Saved",
|
||||||
|
|||||||
@@ -34,6 +34,8 @@ export interface SpeechSettings {
|
|||||||
provider: SpeechProviderPreference
|
provider: SpeechProviderPreference
|
||||||
apiKey?: string
|
apiKey?: string
|
||||||
baseUrl?: string
|
baseUrl?: string
|
||||||
|
useRealtime: boolean
|
||||||
|
realtimeModel: string
|
||||||
sttModel: string
|
sttModel: string
|
||||||
ttsModel: string
|
ttsModel: string
|
||||||
ttsVoice: string
|
ttsVoice: string
|
||||||
@@ -136,6 +138,8 @@ const defaultUiSettings: UiSettings = {
|
|||||||
|
|
||||||
const defaultSpeechSettings: SpeechSettings = {
|
const defaultSpeechSettings: SpeechSettings = {
|
||||||
provider: "openai-compatible",
|
provider: "openai-compatible",
|
||||||
|
useRealtime: true,
|
||||||
|
realtimeModel: "gpt-realtime",
|
||||||
sttModel: "gpt-4o-mini-transcribe",
|
sttModel: "gpt-4o-mini-transcribe",
|
||||||
ttsModel: "gpt-4o-mini-tts",
|
ttsModel: "gpt-4o-mini-tts",
|
||||||
ttsVoice: "alloy",
|
ttsVoice: "alloy",
|
||||||
@@ -184,6 +188,11 @@ function normalizeSpeechSettings(input?: Partial<SpeechSettings> | null): Speech
|
|||||||
provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider,
|
provider: sanitized.provider === "openai-compatible" ? sanitized.provider : defaultSpeechSettings.provider,
|
||||||
apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined,
|
apiKey: typeof sanitized.apiKey === "string" && sanitized.apiKey.trim() ? sanitized.apiKey.trim() : undefined,
|
||||||
baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined,
|
baseUrl: typeof sanitized.baseUrl === "string" && sanitized.baseUrl.trim() ? sanitized.baseUrl.trim() : undefined,
|
||||||
|
useRealtime: sanitized.useRealtime ?? defaultSpeechSettings.useRealtime,
|
||||||
|
realtimeModel:
|
||||||
|
typeof sanitized.realtimeModel === "string" && sanitized.realtimeModel.trim()
|
||||||
|
? sanitized.realtimeModel.trim()
|
||||||
|
: defaultSpeechSettings.realtimeModel,
|
||||||
sttModel:
|
sttModel:
|
||||||
typeof sanitized.sttModel === "string" && sanitized.sttModel.trim()
|
typeof sanitized.sttModel === "string" && sanitized.sttModel.trim()
|
||||||
? sanitized.sttModel.trim()
|
? sanitized.sttModel.trim()
|
||||||
|
|||||||
Reference in New Issue
Block a user