feat(voice): add spoken summary mode for conversation replies

This commit is contained in:
Shantur
2026-03-31 00:20:26 +01:00
parent d1a27ac31b
commit 045d8da8b2
6 changed files with 103 additions and 6 deletions

View File

@@ -2,6 +2,8 @@ import type { PluginInput } from "@opencode-ai/plugin"
import { createCodeNomadClient, getCodeNomadConfig } from "./lib/client"
import { createBackgroundProcessTools } from "./lib/background-process"
// Module-level voice-mode flag. It is overwritten whenever a "codenomad.voiceMode"
// event arrives and is read by the "chat.message" handler to decide whether the
// voice-mode prompt is appended to the system prompt. Starts disabled.
let voiceModeEnabled = false
export async function CodeNomadPlugin(input: PluginInput) {
const config = getCodeNomadConfig()
const client = createCodeNomadClient(config)
@@ -16,6 +18,11 @@ export async function CodeNomadPlugin(input: PluginInput) {
pingTs: (event.properties as any)?.ts,
},
}).catch(() => {})
return
}
if (event.type === "codenomad.voiceMode") {
voiceModeEnabled = Boolean((event.properties as { enabled?: unknown } | undefined)?.enabled)
}
})
@@ -23,6 +30,13 @@ export async function CodeNomadPlugin(input: PluginInput) {
tool: {
...backgroundProcessTools,
},
/**
 * Handler keyed "chat.message": while voice mode is enabled, appends the
 * voice-mode instructions to the outgoing message's system prompt. An existing
 * system prompt is kept and separated from the instructions by a blank line;
 * when voice mode is off the message is left untouched.
 */
async "chat.message"(_input: { sessionID: string }, output: { message: { system?: string } }) {
  if (voiceModeEnabled) {
    // filter(Boolean) drops a missing/empty existing system prompt so the
    // result never starts with a stray separator.
    const sections = [output.message.system, buildVoiceModePrompt()]
    output.message.system = sections.filter(Boolean).join("\n\n")
  }
},
async event(input: { event: any }) {
const opencodeEvent = input?.event
if (!opencodeEvent || typeof opencodeEvent !== "object") return
@@ -30,3 +44,19 @@ export async function CodeNomadPlugin(input: PluginInput) {
},
}
}
/**
 * Builds the voice-mode instruction text.
 *
 * The instructions ask the model to start its reply with a fenced code block
 * tagged `spoken` holding a short conversational gist, followed by the normal
 * detailed response. Individual instructions are separated by a blank line.
 *
 * @returns The full instruction text as a single string.
 */
function buildVoiceModePrompt(): string {
  const paragraphSeparator = "\n\n"

  const instructions: string[] = [
    "Voice conversation mode is enabled.",
    "Prepend your reply with a fenced code block using language `spoken`.",
    "The `spoken` block should be the natural conversational reply you would say out loud to the user. It should be a concise spoken gist of the full response in 2 to 4 natural sentences.",
    "In the spoken block, summarize the main outcome, recommendation, or next step. Sound conversational and natural, not like a document summary.",
    "Do not include code, bullet lists, markdown formatting, or long technical detail in the spoken block.",
    "Do not add generic phrases about whether the user should read more.",
    "Only mention additional written detail when there is something specific that may matter for the user's next response, such as a tradeoff, caveat, risk, open question, exact diff, or test result.",
    "When referring to that written detail, say `below` or `in the message` rather than `detailed section`.",
    "After the `spoken` block, continue with your normal detailed response.",
    // Worked example of the expected leading block.
    "Example:",
    "```spoken\nI implemented the relay-based voice-mode flow and it works with the current plugin bridge. The reconnect caveat is explained below.\n```",
  ]

  return instructions.join(paragraphSeparator)
}

View File

@@ -240,6 +240,10 @@ export interface SpeechSynthesisResponse {
mimeType: string
}
/** Voice mode state payload: a single flag saying whether voice mode is on. */
export interface VoiceModeStateResponse {
/** `true` when voice mode is enabled, `false` when it is disabled. */
enabled: boolean
}
export type WorkspaceEventType =
| "workspace.created"
| "workspace.started"

View File

@@ -29,6 +29,7 @@ import type { AuthManager } from "../auth/manager"
import { registerAuthRoutes } from "./routes/auth"
import { sendUnauthorized, wantsHtml } from "../auth/http-auth"
import type { SpeechService } from "../speech/service"
import { PluginChannelManager } from "../plugins/channel"
interface HttpServerDeps {
bindHost: string
@@ -173,6 +174,7 @@ export function createHttpServer(deps: HttpServerDeps) {
eventBus: deps.eventBus,
logger: deps.logger.child({ component: "background-processes" }),
})
const pluginChannel = new PluginChannelManager(deps.logger.child({ component: "plugin-channel" }))
registerAuthRoutes(app, { authManager: deps.authManager })
@@ -256,7 +258,12 @@ export function createHttpServer(deps: HttpServerDeps) {
workspaceManager: deps.workspaceManager,
})
registerSpeechRoutes(app, { speechService: deps.speechService })
registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger })
registerPluginRoutes(app, {
workspaceManager: deps.workspaceManager,
eventBus: deps.eventBus,
logger: proxyLogger,
channel: pluginChannel,
})
registerBackgroundProcessRoutes(app, { backgroundProcessManager })
registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger })

View File

@@ -1,5 +1,6 @@
import { FastifyInstance } from "fastify"
import { z } from "zod"
import type { VoiceModeStateResponse } from "../../api-types"
import type { WorkspaceManager } from "../../workspaces/manager"
import type { EventBus } from "../../events/bus"
import type { Logger } from "../../logger"
@@ -10,6 +11,7 @@ interface RouteDeps {
workspaceManager: WorkspaceManager
eventBus: EventBus
logger: Logger
channel: PluginChannelManager
}
const PluginEventSchema = z.object({
@@ -17,9 +19,11 @@ const PluginEventSchema = z.object({
properties: z.record(z.unknown()).optional(),
})
export function registerPluginRoutes(app: FastifyInstance, deps: RouteDeps) {
const channel = new PluginChannelManager(deps.logger.child({ component: "plugin-channel" }))
// Runtime schema for a voice-mode request body: an object whose `enabled`
// field must be a boolean.
const VoiceModeStateSchema = z.object({
enabled: z.boolean(),
})
export function registerPluginRoutes(app: FastifyInstance, deps: RouteDeps) {
app.get<{ Params: { id: string } }>("/workspaces/:id/plugin/events", (request, reply) => {
const workspace = deps.workspaceManager.get(request.params.id)
if (!workspace) {
@@ -33,10 +37,10 @@ export function registerPluginRoutes(app: FastifyInstance, deps: RouteDeps) {
reply.raw.flushHeaders?.()
reply.hijack()
const registration = channel.register(request.params.id, reply)
const registration = deps.channel.register(request.params.id, reply)
const heartbeat = setInterval(() => {
channel.send(request.params.id, buildPingEvent())
deps.channel.send(request.params.id, buildPingEvent())
}, 15000)
const close = () => {
@@ -49,6 +53,24 @@ export function registerPluginRoutes(app: FastifyInstance, deps: RouteDeps) {
request.raw.on("error", close)
})
// POST /workspaces/:id/plugin/voice-mode
// Validates the requested voice-mode state, forwards it over the plugin channel
// as a "codenomad.voiceMode" event, and echoes the accepted state back.
// Responds 404 when the workspace id is unknown and 400 when the body does not
// match VoiceModeStateSchema.
app.post<{ Params: { id: string }; Body: VoiceModeStateResponse }>("/workspaces/:id/plugin/voice-mode", (request, reply) => {
  const workspace = deps.workspaceManager.get(request.params.id)
  if (!workspace) {
    reply.code(404).send({ error: "Workspace not found" })
    return
  }

  // Validate without throwing: a malformed body is a client error, so answer
  // with 400 instead of letting a thrown ZodError surface as a server error.
  const parsed = VoiceModeStateSchema.safeParse(request.body ?? {})
  if (!parsed.success) {
    reply.code(400).send({ error: "Invalid voice mode payload" })
    return
  }

  const { enabled } = parsed.data
  deps.channel.send(request.params.id, {
    type: "codenomad.voiceMode",
    properties: {
      enabled,
      formatVersion: "v1",
    },
  })

  return { enabled }
})
const handleWildcard = async (request: any, reply: any) => {
const workspaceId = request.params.id as string
const workspace = deps.workspaceManager.get(workspaceId)

View File

@@ -11,6 +11,7 @@ import type {
SpeechSynthesisResponse,
SpeechTranscriptionResponse,
ServerMeta,
VoiceModeStateResponse,
WorkspaceCreateRequest,
WorkspaceDescriptor,
WorkspaceFileResponse,
@@ -348,6 +349,12 @@ export const serverApi = {
{ method: "POST" },
)
},
/**
 * Sends the desired voice-mode state for an instance to the server.
 *
 * @param instanceId Instance identifier; URL-encoded before being placed in the path.
 * @param enabled Whether voice mode should be turned on.
 * @returns The server's response, typed as the voice-mode state.
 */
updateVoiceMode(instanceId: string, enabled: boolean): Promise<VoiceModeStateResponse> {
  const path = `/workspaces/${encodeURIComponent(instanceId)}/plugin/voice-mode`
  const payload = JSON.stringify({ enabled })
  return request<VoiceModeStateResponse>(path, { method: "POST", body: payload })
},
fetchBackgroundProcessOutput(
instanceId: string,
processId: string,

View File

@@ -30,6 +30,7 @@ interface PlaybackHandle {
const log = getLogger("actions")
const [conversationModeInstances, setConversationModeInstances] = createSignal<Map<string, boolean>>(new Map())
// Matches a fenced ```spoken block at the very start of a text (optional leading
// whitespace allowed, fence tag matched case-insensitively). Capture group 1 is
// the block's content, taken lazily up to the first line that starts a closing
// ``` fence; the fence must be followed by a line break or the end of the text.
// Both LF and CRLF line endings are accepted.
const LEADING_SPOKEN_BLOCK_REGEX = /^\s*```spoken[ \t]*\r?\n([\s\S]*?)\r?\n```(?:\r?\n|$)/i
const queuedKeys = new Set<string>()
const spokenKeysBySession = new Map<string, Set<string>>()
@@ -107,6 +108,9 @@ export function canUseConversationMode(): boolean {
}
export function setConversationModeEnabled(instanceId: string, enabled: boolean): void {
const previous = isConversationModeEnabled(instanceId)
if (previous === enabled) return
setConversationModeInstances((prev) => {
const next = new Map(prev)
if (enabled) {
@@ -120,6 +124,23 @@ export function setConversationModeEnabled(instanceId: string, enabled: boolean)
if (!enabled) {
clearConversationPlaybackForInstance(instanceId)
}
void serverApi.updateVoiceMode(instanceId, enabled).catch((error) => {
log.error("Failed to update conversation mode", error)
setConversationModeInstances((prev) => {
const next = new Map(prev)
if (previous) {
next.set(instanceId, true)
} else {
next.delete(instanceId)
}
return next
})
if (!previous) {
clearConversationPlaybackForInstance(instanceId)
}
})
}
export function toggleConversationMode(instanceId: string): void {
@@ -188,7 +209,7 @@ export function handleConversationAssistantPartUpdated(instanceId: string, part:
if (!isConversationModeEnabled(instanceId)) return
if (!isSpeakableSession(instanceId, sessionId)) return
const text = resolveTextPartContent(part).trim()
const text = extractLeadingSpokenBlock(resolveTextPartContent(part))
if (!text) return
const key = getEntryKey(instanceId, sessionId, messageId, partId)
@@ -505,3 +526,9 @@ function createObjectUrlFromBase64(audioBase64: string, mimeType: string): strin
}
return URL.createObjectURL(new Blob([bytes], { type: mimeType || "audio/mpeg" }))
}
/**
 * Pulls the content of a leading `spoken` fenced block out of a text.
 *
 * @param text Text to inspect.
 * @returns The trimmed block content, or an empty string when the text does not
 *          start with a `spoken` block or the block's content is empty.
 */
function extractLeadingSpokenBlock(text: string): string {
  // Group 1 of LEADING_SPOKEN_BLOCK_REGEX holds the block body.
  const captured = LEADING_SPOKEN_BLOCK_REGEX.exec(text)?.[1]
  return captured ? captured.trim() : ""
}