From 045d8da8b2d432d96db84716e357fc7535d276f3 Mon Sep 17 00:00:00 2001
From: Shantur
Date: Tue, 31 Mar 2026 00:20:26 +0100
Subject: [PATCH] feat(voice): add spoken summary mode for conversation replies

When conversation mode is toggled in the UI, the server relays a
"codenomad.voiceMode" event to the workspace plugin. While enabled, the plugin
appends instructions to the system prompt asking for a leading fenced `spoken`
block, and the UI speaks only that block instead of the whole reply.

The voice-mode route answers 400 for a malformed body, and the plugin only
treats a literal `true` as enabled.
---
Notes (ignored by git am):
 * Rebuilt from a copy whose line breaks and indentation had been collapsed and
   whose angle-bracket spans had been stripped. Indentation and blank-line
   placement in context lines are inferred (hunk line counts were used as a
   check). The type arguments on createSignal/Set/Map in conversation-speech.ts
   and on Promise in api-client.ts are inferred from usage. Apply with
   --ignore-whitespace if the context does not match byte-for-byte.
 * The author e-mail address was missing from that copy and is still missing.
 * Comments and two small code changes (safeParse + 400 in the voice-mode
   route, strict `=== true` in the plugin) were added on top of the original
   commit, so the post-image blob ids on the index lines are stale.

 packages/opencode-config/plugin/codenomad.ts  | 37 ++++++++++++++++++
 packages/server/src/api-types.ts              |  5 +++
 packages/server/src/server/http-server.ts     |  9 ++++-
 packages/server/src/server/routes/plugin.ts   | 39 +++++++++++++++++--
 packages/ui/src/lib/api-client.ts             |  8 ++++
 packages/ui/src/stores/conversation-speech.ts | 35 ++++++++++++++++-
 6 files changed, 127 insertions(+), 6 deletions(-)

diff --git a/packages/opencode-config/plugin/codenomad.ts b/packages/opencode-config/plugin/codenomad.ts
index b04322d0..08515dd8 100644
--- a/packages/opencode-config/plugin/codenomad.ts
+++ b/packages/opencode-config/plugin/codenomad.ts
@@ -2,6 +2,9 @@ import type { PluginInput } from "@opencode-ai/plugin"
 import { createCodeNomadClient, getCodeNomadConfig } from "./lib/client"
 import { createBackgroundProcessTools } from "./lib/background-process"
 
+// Last voice-mode flag received through a "codenomad.voiceMode" event; off until one arrives.
+let voiceModeEnabled = false
+
 export async function CodeNomadPlugin(input: PluginInput) {
   const config = getCodeNomadConfig()
   const client = createCodeNomadClient(config)
@@ -16,6 +19,12 @@ export async function CodeNomadPlugin(input: PluginInput) {
           pingTs: (event.properties as any)?.ts,
         },
       }).catch(() => {})
+      return
+    }
+
+    if (event.type === "codenomad.voiceMode") {
+      // Only a literal true enables voice mode; Boolean() would also accept strings such as "false".
+      voiceModeEnabled = (event.properties as { enabled?: unknown } | undefined)?.enabled === true
     }
   })
 
@@ -23,6 +32,14 @@ export async function CodeNomadPlugin(input: PluginInput) {
     tool: {
       ...backgroundProcessTools,
     },
+    async "chat.message"(_input: { sessionID: string }, output: { message: { system?: string } }) {
+      if (!voiceModeEnabled) {
+        return
+      }
+
+      // Append the voice-mode instructions after any system prompt that is already set.
+      output.message.system = [output.message.system, buildVoiceModePrompt()].filter(Boolean).join("\n\n")
+    },
     async event(input: { event: any }) {
       const opencodeEvent = input?.event
       if (!opencodeEvent || typeof opencodeEvent !== "object") return
@@ -30,3 +47,23 @@ export async function CodeNomadPlugin(input: PluginInput) {
     },
   }
 }
+
+/**
+ * Builds the system-prompt addendum used while voice mode is on: it asks the
+ * model to open its reply with a fenced `spoken` block, then answer as usual.
+ */
+function buildVoiceModePrompt(): string {
+  return [
+    "Voice conversation mode is enabled.",
+    "Prepend your reply with a fenced code block using language `spoken`.",
+    "The `spoken` block should be the natural conversational reply you would say out loud to the user. It should be a concise spoken gist of the full response in 2 to 4 natural sentences.",
+    "In the spoken block, summarize the main outcome, recommendation, or next step. Sound conversational and natural, not like a document summary.",
+    "Do not include code, bullet lists, markdown formatting, or long technical detail in the spoken block.",
+    "Do not add generic phrases about whether the user should read more.",
+    "Only mention additional written detail when there is something specific that may matter for the user's next response, such as a tradeoff, caveat, risk, open question, exact diff, or test result.",
+    "When referring to that written detail, say `below` or `in the message` rather than `detailed section`.",
+    "After the `spoken` block, continue with your normal detailed response.",
+    "Example:",
+    "```spoken\nI implemented the relay-based voice-mode flow and it works with the current plugin bridge. The reconnect caveat is explained below.\n```",
+  ].join("\n\n")
+}
diff --git a/packages/server/src/api-types.ts b/packages/server/src/api-types.ts
index 7bc54a13..e901e335 100644
--- a/packages/server/src/api-types.ts
+++ b/packages/server/src/api-types.ts
@@ -240,6 +240,11 @@ export interface SpeechSynthesisResponse {
   mimeType: string
 }
 
+/** Voice-mode flag sent to, and echoed back by, POST /workspaces/:id/plugin/voice-mode. */
+export interface VoiceModeStateResponse {
+  enabled: boolean
+}
+
 export type WorkspaceEventType =
   | "workspace.created"
   | "workspace.started"
diff --git a/packages/server/src/server/http-server.ts b/packages/server/src/server/http-server.ts
index 3f558cb8..13391946 100644
--- a/packages/server/src/server/http-server.ts
+++ b/packages/server/src/server/http-server.ts
@@ -29,6 +29,7 @@ import type { AuthManager } from "../auth/manager"
 import { registerAuthRoutes } from "./routes/auth"
 import { sendUnauthorized, wantsHtml } from "../auth/http-auth"
 import type { SpeechService } from "../speech/service"
+import { PluginChannelManager } from "../plugins/channel"
 
 interface HttpServerDeps {
   bindHost: string
@@ -173,6 +174,7 @@ export function createHttpServer(deps: HttpServerDeps) {
     eventBus: deps.eventBus,
     logger: deps.logger.child({ component: "background-processes" }),
   })
+  const pluginChannel = new PluginChannelManager(deps.logger.child({ component: "plugin-channel" }))
 
   registerAuthRoutes(app, { authManager: deps.authManager })
 
@@ -256,7 +258,12 @@ export function createHttpServer(deps: HttpServerDeps) {
     workspaceManager: deps.workspaceManager,
   })
   registerSpeechRoutes(app, { speechService: deps.speechService })
-  registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger })
+  registerPluginRoutes(app, {
+    workspaceManager: deps.workspaceManager,
+    eventBus: deps.eventBus,
+    logger: proxyLogger,
+    channel: pluginChannel,
+  })
   registerBackgroundProcessRoutes(app, { backgroundProcessManager })
   registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger })
 
diff --git a/packages/server/src/server/routes/plugin.ts b/packages/server/src/server/routes/plugin.ts
index 374ce545..5b37f3bb 100644
--- a/packages/server/src/server/routes/plugin.ts
+++ b/packages/server/src/server/routes/plugin.ts
@@ -1,5 +1,6 @@
 import { FastifyInstance } from "fastify"
 import { z } from "zod"
+import type { VoiceModeStateResponse } from "../../api-types"
 import type { WorkspaceManager } from "../../workspaces/manager"
 import type { EventBus } from "../../events/bus"
 import type { Logger } from "../../logger"
@@ -10,6 +11,7 @@ interface RouteDeps {
   workspaceManager: WorkspaceManager
   eventBus: EventBus
   logger: Logger
+  channel: PluginChannelManager
 }
 
 const PluginEventSchema = z.object({
@@ -17,9 +19,12 @@ const PluginEventSchema = z.object({
   properties: z.record(z.unknown()).optional(),
 })
 
-export function registerPluginRoutes(app: FastifyInstance, deps: RouteDeps) {
-  const channel = new PluginChannelManager(deps.logger.child({ component: "plugin-channel" }))
+// Body accepted by POST /workspaces/:id/plugin/voice-mode.
+const VoiceModeStateSchema = z.object({
+  enabled: z.boolean(),
+})
 
+export function registerPluginRoutes(app: FastifyInstance, deps: RouteDeps) {
   app.get<{ Params: { id: string } }>("/workspaces/:id/plugin/events", (request, reply) => {
     const workspace = deps.workspaceManager.get(request.params.id)
     if (!workspace) {
@@ -33,10 +38,10 @@ export function registerPluginRoutes(app: FastifyInstance, deps: RouteDeps) {
     reply.raw.flushHeaders?.()
     reply.hijack()
 
-    const registration = channel.register(request.params.id, reply)
+    const registration = deps.channel.register(request.params.id, reply)
 
     const heartbeat = setInterval(() => {
-      channel.send(request.params.id, buildPingEvent())
+      deps.channel.send(request.params.id, buildPingEvent())
     }, 15000)
 
     const close = () => {
@@ -49,6 +54,32 @@ export function registerPluginRoutes(app: FastifyInstance, deps: RouteDeps) {
     request.raw.on("error", close)
   })
 
+  // Relays the voice-mode flag from the request body to the workspace's plugin channel.
+  app.post<{ Params: { id: string }; Body: VoiceModeStateResponse }>("/workspaces/:id/plugin/voice-mode", (request, reply) => {
+    const workspace = deps.workspaceManager.get(request.params.id)
+    if (!workspace) {
+      reply.code(404).send({ error: "Workspace not found" })
+      return
+    }
+
+    // Answer 400 for a malformed body instead of throwing a ZodError out of the handler.
+    const parsed = VoiceModeStateSchema.safeParse(request.body)
+    if (!parsed.success) {
+      reply.code(400).send({ error: "Invalid voice mode payload" })
+      return
+    }
+
+    const { enabled } = parsed.data
+    deps.channel.send(request.params.id, {
+      type: "codenomad.voiceMode",
+      properties: {
+        enabled,
+        formatVersion: "v1",
+      },
+    })
+    return { enabled }
+  })
+
   const handleWildcard = async (request: any, reply: any) => {
     const workspaceId = request.params.id as string
     const workspace = deps.workspaceManager.get(workspaceId)
diff --git a/packages/ui/src/lib/api-client.ts b/packages/ui/src/lib/api-client.ts
index e733bf7d..4dfc426c 100644
--- a/packages/ui/src/lib/api-client.ts
+++ b/packages/ui/src/lib/api-client.ts
@@ -11,6 +11,7 @@ import type {
   SpeechSynthesisResponse,
   SpeechTranscriptionResponse,
   ServerMeta,
+  VoiceModeStateResponse,
   WorkspaceCreateRequest,
   WorkspaceDescriptor,
   WorkspaceFileResponse,
@@ -348,6 +349,13 @@ export const serverApi = {
       { method: "POST" },
     )
   },
+  /** Posts the voice-mode flag for a workspace to the server's plugin voice-mode route. */
+  updateVoiceMode(instanceId: string, enabled: boolean): Promise<VoiceModeStateResponse> {
+    return request(`/workspaces/${encodeURIComponent(instanceId)}/plugin/voice-mode`, {
+      method: "POST",
+      body: JSON.stringify({ enabled }),
+    })
+  },
   fetchBackgroundProcessOutput(
     instanceId: string,
     processId: string,
diff --git a/packages/ui/src/stores/conversation-speech.ts b/packages/ui/src/stores/conversation-speech.ts
index 802725ef..665b6bd6 100644
--- a/packages/ui/src/stores/conversation-speech.ts
+++ b/packages/ui/src/stores/conversation-speech.ts
@@ -30,6 +30,8 @@ interface PlaybackHandle {
 const log = getLogger("actions")
 
 const [conversationModeInstances, setConversationModeInstances] = createSignal<Map<string, boolean>>(new Map())
+// Matches a fenced `spoken` block at the start of the text (leading whitespace allowed); group 1 is the block body.
+const LEADING_SPOKEN_BLOCK_REGEX = /^\s*```spoken[ \t]*\r?\n([\s\S]*?)\r?\n```(?:\r?\n|$)/i
 const queuedKeys = new Set<string>()
 const spokenKeysBySession = new Map<string, Set<string>>()
 
@@ -107,6 +109,9 @@ export function canUseConversationMode(): boolean {
 }
 
 export function setConversationModeEnabled(instanceId: string, enabled: boolean): void {
+  const previous = isConversationModeEnabled(instanceId)
+  if (previous === enabled) return
+
   setConversationModeInstances((prev) => {
     const next = new Map(prev)
     if (enabled) {
@@ -120,6 +125,24 @@ export function setConversationModeEnabled(instanceId: string, enabled: boolean)
   if (!enabled) {
     clearConversationPlaybackForInstance(instanceId)
   }
+
+  // Optimistic update: the local flag is already switched; restore the previous value if the request fails.
+  void serverApi.updateVoiceMode(instanceId, enabled).catch((error) => {
+    log.error("Failed to update conversation mode", error)
+    setConversationModeInstances((prev) => {
+      const next = new Map(prev)
+      if (previous) {
+        next.set(instanceId, true)
+      } else {
+        next.delete(instanceId)
+      }
+      return next
+    })
+
+    if (!previous) {
+      clearConversationPlaybackForInstance(instanceId)
+    }
+  })
 }
 
 export function toggleConversationMode(instanceId: string): void {
@@ -188,7 +211,7 @@ export function handleConversationAssistantPartUpdated(instanceId: string, part:
   if (!isConversationModeEnabled(instanceId)) return
   if (!isSpeakableSession(instanceId, sessionId)) return
 
-  const text = resolveTextPartContent(part).trim()
+  const text = extractLeadingSpokenBlock(resolveTextPartContent(part))
   if (!text) return
 
   const key = getEntryKey(instanceId, sessionId, messageId, partId)
@@ -505,3 +528,13 @@ function createObjectUrlFromBase64(audioBase64: string, mimeType: string): strin
   }
   return URL.createObjectURL(new Blob([bytes], { type: mimeType || "audio/mpeg" }))
 }
+
+/**
+ * Returns the trimmed body of a leading fenced `spoken` block, or "" when the
+ * text does not start with a complete, non-empty one.
+ */
+function extractLeadingSpokenBlock(text: string): string {
+  const match = text.match(LEADING_SPOKEN_BLOCK_REGEX)
+  if (!match?.[1]) return ""
+  return match[1].trim()
+}