feat(speech): add prompt voice input (#249)
## Summary - add server-backed speech capabilities and transcription endpoints plus UI settings for speech configuration - add push-to-talk prompt voice input with microphone controls, transcription insertion, and browser capability gating - keep prompt controls aligned by restoring right-side nav placement and moving the mic beside the expand control
This commit is contained in:
@@ -207,6 +207,36 @@ export interface BinaryValidationResult {
|
||||
error?: string
|
||||
}
|
||||
|
||||
export interface SpeechSegment {
|
||||
startMs: number
|
||||
endMs: number
|
||||
text: string
|
||||
}
|
||||
|
||||
export interface SpeechCapabilitiesResponse {
|
||||
available: boolean
|
||||
configured: boolean
|
||||
provider: string
|
||||
supportsStt: boolean
|
||||
supportsTts: boolean
|
||||
baseUrl?: string
|
||||
sttModel: string
|
||||
ttsModel: string
|
||||
ttsVoice: string
|
||||
}
|
||||
|
||||
export interface SpeechTranscriptionResponse {
|
||||
text: string
|
||||
language?: string
|
||||
durationMs?: number
|
||||
segments?: SpeechSegment[]
|
||||
}
|
||||
|
||||
export interface SpeechSynthesisResponse {
|
||||
audioBase64: string
|
||||
mimeType: string
|
||||
}
|
||||
|
||||
export type WorkspaceEventType =
|
||||
| "workspace.created"
|
||||
| "workspace.started"
|
||||
|
||||
@@ -23,6 +23,7 @@ import { AuthManager, BOOTSTRAP_TOKEN_STDOUT_PREFIX, DEFAULT_AUTH_USERNAME } fro
|
||||
import { resolveHttpsOptions } from "./server/tls"
|
||||
import { resolveNetworkAddresses } from "./server/network-addresses"
|
||||
import { startDevReleaseMonitor } from "./releases/dev-release-monitor"
|
||||
import { SpeechService } from "./speech/service"
|
||||
|
||||
const require = createRequire(import.meta.url)
|
||||
|
||||
@@ -304,6 +305,7 @@ async function main() {
|
||||
})
|
||||
const fileSystemBrowser = new FileSystemBrowser({ rootDir: options.rootDir, unrestricted: options.unrestrictedRoot })
|
||||
const instanceStore = new InstanceStore(configLocation.instancesDir)
|
||||
const speechService = new SpeechService(settings, logger.child({ component: "speech" }))
|
||||
const instanceEventBridge = new InstanceEventBridge({
|
||||
workspaceManager,
|
||||
eventBus,
|
||||
@@ -388,6 +390,7 @@ async function main() {
|
||||
eventBus,
|
||||
serverMeta,
|
||||
instanceStore,
|
||||
speechService,
|
||||
authManager,
|
||||
uiStaticDir: uiResolution.uiStaticDir ?? DEFAULT_UI_STATIC_DIR,
|
||||
uiDevServerUrl: uiResolution.uiDevServerUrl,
|
||||
@@ -408,6 +411,7 @@ async function main() {
|
||||
eventBus,
|
||||
serverMeta,
|
||||
instanceStore,
|
||||
speechService,
|
||||
authManager,
|
||||
uiStaticDir: uiResolution.uiStaticDir ?? DEFAULT_UI_STATIC_DIR,
|
||||
uiDevServerUrl: undefined,
|
||||
|
||||
@@ -21,12 +21,14 @@ import { registerStorageRoutes } from "./routes/storage"
|
||||
import { registerPluginRoutes } from "./routes/plugin"
|
||||
import { registerBackgroundProcessRoutes } from "./routes/background-processes"
|
||||
import { registerWorktreeRoutes } from "./routes/worktrees"
|
||||
import { registerSpeechRoutes } from "./routes/speech"
|
||||
import { ServerMeta } from "../api-types"
|
||||
import { InstanceStore } from "../storage/instance-store"
|
||||
import { BackgroundProcessManager } from "../background-processes/manager"
|
||||
import type { AuthManager } from "../auth/manager"
|
||||
import { registerAuthRoutes } from "./routes/auth"
|
||||
import { sendUnauthorized, wantsHtml } from "../auth/http-auth"
|
||||
import type { SpeechService } from "../speech/service"
|
||||
|
||||
interface HttpServerDeps {
|
||||
bindHost: string
|
||||
@@ -41,6 +43,7 @@ interface HttpServerDeps {
|
||||
eventBus: EventBus
|
||||
serverMeta: ServerMeta
|
||||
instanceStore: InstanceStore
|
||||
speechService: SpeechService
|
||||
authManager: AuthManager
|
||||
uiStaticDir: string
|
||||
uiDevServerUrl?: string
|
||||
@@ -252,6 +255,7 @@ export function createHttpServer(deps: HttpServerDeps) {
|
||||
eventBus: deps.eventBus,
|
||||
workspaceManager: deps.workspaceManager,
|
||||
})
|
||||
registerSpeechRoutes(app, { speechService: deps.speechService })
|
||||
registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger })
|
||||
registerBackgroundProcessRoutes(app, { backgroundProcessManager })
|
||||
registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger })
|
||||
|
||||
@@ -3,6 +3,7 @@ import { z } from "zod"
|
||||
import { probeBinaryVersion } from "../../workspaces/runtime"
|
||||
import type { SettingsService } from "../../settings/service"
|
||||
import type { Logger } from "../../logger"
|
||||
import { sanitizeConfigDoc, sanitizeConfigOwner } from "../../settings/public-config"
|
||||
|
||||
interface RouteDeps {
|
||||
settings: SettingsService
|
||||
@@ -20,10 +21,10 @@ function validateBinaryPath(binaryPath: string): { valid: boolean; version?: str
|
||||
|
||||
export function registerSettingsRoutes(app: FastifyInstance, deps: RouteDeps) {
|
||||
// Full-document access
|
||||
app.get("/api/storage/config", async () => deps.settings.getDoc("config"))
|
||||
app.get("/api/storage/config", async () => sanitizeConfigDoc(deps.settings.getDoc("config")))
|
||||
app.patch("/api/storage/config", async (request, reply) => {
|
||||
try {
|
||||
return deps.settings.mergePatchDoc("config", request.body ?? {})
|
||||
return sanitizeConfigDoc(deps.settings.mergePatchDoc("config", request.body ?? {}))
|
||||
} catch (error) {
|
||||
reply.code(400)
|
||||
return { error: error instanceof Error ? error.message : "Invalid patch" }
|
||||
@@ -31,12 +32,15 @@ export function registerSettingsRoutes(app: FastifyInstance, deps: RouteDeps) {
|
||||
})
|
||||
|
||||
app.get<{ Params: { owner: string } }>("/api/storage/config/:owner", async (request) => {
|
||||
return deps.settings.getOwner("config", request.params.owner)
|
||||
return sanitizeConfigOwner(request.params.owner, deps.settings.getOwner("config", request.params.owner))
|
||||
})
|
||||
|
||||
app.patch<{ Params: { owner: string } }>("/api/storage/config/:owner", async (request, reply) => {
|
||||
try {
|
||||
return deps.settings.mergePatchOwner("config", request.params.owner, request.body ?? {})
|
||||
return sanitizeConfigOwner(
|
||||
request.params.owner,
|
||||
deps.settings.mergePatchOwner("config", request.params.owner, request.body ?? {}),
|
||||
)
|
||||
} catch (error) {
|
||||
reply.code(400)
|
||||
return { error: error instanceof Error ? error.message : "Invalid patch" }
|
||||
|
||||
60
packages/server/src/server/routes/speech.ts
Normal file
60
packages/server/src/server/routes/speech.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
import type { FastifyInstance } from "fastify"
|
||||
import { z } from "zod"
|
||||
import type { SpeechService } from "../../speech/service"
|
||||
|
||||
interface RouteDeps {
|
||||
speechService: SpeechService
|
||||
}
|
||||
|
||||
const TranscribeBodySchema = z.object({
|
||||
audioBase64: z.string().min(1, "Audio payload is required"),
|
||||
mimeType: z.string().min(1, "Audio MIME type is required"),
|
||||
filename: z.string().optional(),
|
||||
language: z.string().optional(),
|
||||
prompt: z.string().optional(),
|
||||
})
|
||||
|
||||
const SynthesizeBodySchema = z.object({
|
||||
text: z.string().trim().min(1, "Text is required"),
|
||||
format: z.enum(["mp3", "wav", "opus"]).optional(),
|
||||
})
|
||||
|
||||
function getSpeechErrorStatus(error: unknown): number {
|
||||
if (error instanceof z.ZodError) {
|
||||
return 400
|
||||
}
|
||||
if (error instanceof Error && /not configured/i.test(error.message)) {
|
||||
return 503
|
||||
}
|
||||
return 502
|
||||
}
|
||||
|
||||
function getSpeechErrorMessage(error: unknown, fallback: string): string {
|
||||
return error instanceof Error ? error.message : fallback
|
||||
}
|
||||
|
||||
export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) {
|
||||
app.get("/api/speech/capabilities", async () => deps.speechService.getCapabilities())
|
||||
|
||||
app.post("/api/speech/transcribe", async (request, reply) => {
|
||||
try {
|
||||
const body = TranscribeBodySchema.parse(request.body ?? {})
|
||||
return await deps.speechService.transcribe(body)
|
||||
} catch (error) {
|
||||
request.log.error({ err: error }, "Failed to transcribe audio")
|
||||
reply.code(getSpeechErrorStatus(error))
|
||||
return { error: getSpeechErrorMessage(error, "Failed to transcribe audio") }
|
||||
}
|
||||
})
|
||||
|
||||
app.post("/api/speech/synthesize", async (request, reply) => {
|
||||
try {
|
||||
const body = SynthesizeBodySchema.parse(request.body ?? {})
|
||||
return await deps.speechService.synthesize(body)
|
||||
} catch (error) {
|
||||
request.log.error({ err: error }, "Failed to synthesize audio")
|
||||
reply.code(getSpeechErrorStatus(error))
|
||||
return { error: getSpeechErrorMessage(error, "Failed to synthesize audio") }
|
||||
}
|
||||
})
|
||||
}
|
||||
40
packages/server/src/settings/public-config.ts
Normal file
40
packages/server/src/settings/public-config.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
import type { SettingsDoc } from "./yaml-doc-store"
|
||||
|
||||
function isPlainObject(value: unknown): value is Record<string, unknown> {
|
||||
return typeof value === "object" && value !== null && !Array.isArray(value)
|
||||
}
|
||||
|
||||
function sanitizeServerOwner(value: SettingsDoc): SettingsDoc {
|
||||
const next: SettingsDoc = { ...value }
|
||||
const speech = isPlainObject(next.speech) ? { ...next.speech } : null
|
||||
|
||||
if (!speech) {
|
||||
return next
|
||||
}
|
||||
|
||||
const rawApiKey = typeof speech.apiKey === "string" ? speech.apiKey.trim() : ""
|
||||
if (rawApiKey) {
|
||||
delete speech.apiKey
|
||||
speech.hasApiKey = true
|
||||
} else if (!("hasApiKey" in speech)) {
|
||||
speech.hasApiKey = false
|
||||
}
|
||||
|
||||
next.speech = speech
|
||||
return next
|
||||
}
|
||||
|
||||
export function sanitizeConfigOwner(owner: string, value: SettingsDoc): SettingsDoc {
|
||||
if (owner !== "server") {
|
||||
return value
|
||||
}
|
||||
return sanitizeServerOwner(value)
|
||||
}
|
||||
|
||||
export function sanitizeConfigDoc(value: SettingsDoc): SettingsDoc {
|
||||
const next: SettingsDoc = { ...value }
|
||||
if (isPlainObject(next.server)) {
|
||||
next.server = sanitizeServerOwner(next.server)
|
||||
}
|
||||
return next
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import type { ConfigLocation } from "../config/location"
|
||||
import { YamlDocStore, type SettingsDoc } from "./yaml-doc-store"
|
||||
import { migrateSettingsLayout } from "./migrate"
|
||||
import type { WorkspaceEventPayload } from "../api-types"
|
||||
import { sanitizeConfigOwner } from "./public-config"
|
||||
|
||||
export type DocKind = "config" | "state"
|
||||
|
||||
@@ -45,10 +46,11 @@ export class SettingsService {
|
||||
private publish(kind: DocKind, owner: string, value?: SettingsDoc) {
|
||||
if (!this.eventBus) return
|
||||
const type = kind === "config" ? "storage.configChanged" : "storage.stateChanged"
|
||||
const nextValue = value ?? this.getOwner(kind, owner)
|
||||
const payload: WorkspaceEventPayload = {
|
||||
type,
|
||||
owner,
|
||||
value: value ?? this.getOwner(kind, owner),
|
||||
value: kind === "config" ? sanitizeConfigOwner(owner, nextValue) : nextValue,
|
||||
} as any
|
||||
this.eventBus.publish(payload)
|
||||
}
|
||||
|
||||
148
packages/server/src/speech/providers/openai-compatible.ts
Normal file
148
packages/server/src/speech/providers/openai-compatible.ts
Normal file
@@ -0,0 +1,148 @@
|
||||
import OpenAI from "openai"
|
||||
import { toFile } from "openai/uploads"
|
||||
import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types"
|
||||
import type { Logger } from "../../logger"
|
||||
import type { NormalizedSpeechSettings, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"
|
||||
|
||||
interface OpenAICompatibleSpeechProviderOptions {
|
||||
settings: NormalizedSpeechSettings
|
||||
logger: Logger
|
||||
}
|
||||
|
||||
export class OpenAICompatibleSpeechProvider {
|
||||
constructor(private readonly options: OpenAICompatibleSpeechProviderOptions) {}
|
||||
|
||||
getCapabilities() {
|
||||
const { settings } = this.options
|
||||
return {
|
||||
available: true,
|
||||
configured: Boolean(settings.apiKey),
|
||||
provider: settings.provider,
|
||||
supportsStt: true,
|
||||
supportsTts: true,
|
||||
baseUrl: settings.baseUrl,
|
||||
sttModel: settings.sttModel,
|
||||
ttsModel: settings.ttsModel,
|
||||
ttsVoice: settings.ttsVoice,
|
||||
}
|
||||
}
|
||||
|
||||
async transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse> {
|
||||
const client = this.createClient()
|
||||
const startedAt = Date.now()
|
||||
const extension = extensionForMime(input.mimeType)
|
||||
const buffer = Buffer.from(input.audioBase64, "base64")
|
||||
const filename = input.filename?.trim() || `prompt-input.${extension}`
|
||||
|
||||
this.options.logger.info(
|
||||
{
|
||||
mimeType: input.mimeType,
|
||||
bytes: buffer.byteLength,
|
||||
language: input.language,
|
||||
model: this.options.settings.sttModel,
|
||||
},
|
||||
"speech.transcribe",
|
||||
)
|
||||
|
||||
const response = await this.requestTranscription(client, buffer, filename, input)
|
||||
|
||||
return {
|
||||
text: typeof response?.text === "string" ? response.text : "",
|
||||
language: typeof response?.language === "string" ? response.language : input.language,
|
||||
durationMs: Number.isFinite(response?.duration) ? Math.round(Number(response.duration) * 1000) : Date.now() - startedAt,
|
||||
segments: Array.isArray(response?.segments)
|
||||
? response.segments
|
||||
.filter((segment: any) => typeof segment?.text === "string")
|
||||
.map((segment: any) => ({
|
||||
startMs: Math.max(0, Math.round(Number(segment.start ?? 0) * 1000)),
|
||||
endMs: Math.max(0, Math.round(Number(segment.end ?? 0) * 1000)),
|
||||
text: String(segment.text),
|
||||
}))
|
||||
: undefined,
|
||||
}
|
||||
}
|
||||
|
||||
private async requestTranscription(
|
||||
client: OpenAI,
|
||||
buffer: Buffer,
|
||||
filename: string,
|
||||
input: TranscribeAudioInput,
|
||||
): Promise<any> {
|
||||
const baseRequest = {
|
||||
model: this.options.settings.sttModel,
|
||||
...(input.language ? { language: input.language } : {}),
|
||||
...(input.prompt ? { prompt: input.prompt } : {}),
|
||||
}
|
||||
|
||||
try {
|
||||
const file = await toFile(buffer, filename, { type: input.mimeType })
|
||||
return (await client.audio.transcriptions.create({
|
||||
...baseRequest,
|
||||
file,
|
||||
response_format: "verbose_json" as any,
|
||||
} as any)) as any
|
||||
} catch (error) {
|
||||
this.options.logger.warn({ err: error }, "speech.transcribe verbose_json failed; retrying default format")
|
||||
const retryFile = await toFile(buffer, filename, { type: input.mimeType })
|
||||
return (await client.audio.transcriptions.create({
|
||||
...baseRequest,
|
||||
file: retryFile,
|
||||
} as any)) as any
|
||||
}
|
||||
}
|
||||
|
||||
async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
|
||||
const client = this.createClient()
|
||||
const format = input.format ?? "mp3"
|
||||
|
||||
this.options.logger.info(
|
||||
{
|
||||
model: this.options.settings.ttsModel,
|
||||
voice: this.options.settings.ttsVoice,
|
||||
format,
|
||||
},
|
||||
"speech.synthesize",
|
||||
)
|
||||
|
||||
const response = await client.audio.speech.create({
|
||||
model: this.options.settings.ttsModel,
|
||||
voice: this.options.settings.ttsVoice as any,
|
||||
input: input.text,
|
||||
response_format: format as any,
|
||||
})
|
||||
|
||||
const audioBuffer = Buffer.from(await response.arrayBuffer())
|
||||
return {
|
||||
audioBase64: audioBuffer.toString("base64"),
|
||||
mimeType: mimeTypeForFormat(format),
|
||||
}
|
||||
}
|
||||
|
||||
private createClient(): OpenAI {
|
||||
const { settings } = this.options
|
||||
if (!settings.apiKey) {
|
||||
throw new Error("Speech provider is not configured. Add an API key in Speech settings.")
|
||||
}
|
||||
|
||||
return new OpenAI({
|
||||
apiKey: settings.apiKey,
|
||||
baseURL: settings.baseUrl,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
function extensionForMime(mimeType: string): string {
|
||||
const normalized = mimeType.toLowerCase()
|
||||
if (normalized.includes("webm")) return "webm"
|
||||
if (normalized.includes("ogg")) return "ogg"
|
||||
if (normalized.includes("wav")) return "wav"
|
||||
if (normalized.includes("mpeg") || normalized.includes("mp3")) return "mp3"
|
||||
if (normalized.includes("mp4") || normalized.includes("aac")) return "m4a"
|
||||
return "webm"
|
||||
}
|
||||
|
||||
function mimeTypeForFormat(format: "mp3" | "wav" | "opus"): string {
|
||||
if (format === "wav") return "audio/wav"
|
||||
if (format === "opus") return "audio/opus"
|
||||
return "audio/mpeg"
|
||||
}
|
||||
91
packages/server/src/speech/service.ts
Normal file
91
packages/server/src/speech/service.ts
Normal file
@@ -0,0 +1,91 @@
|
||||
import { z } from "zod"
|
||||
import type { Logger } from "../logger"
|
||||
import type { SettingsService } from "../settings/service"
|
||||
import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
|
||||
import { OpenAICompatibleSpeechProvider } from "./providers/openai-compatible"
|
||||
|
||||
const ServerSpeechSettingsSchema = z.object({
|
||||
speech: z
|
||||
.object({
|
||||
provider: z.string().optional(),
|
||||
apiKey: z.string().optional(),
|
||||
baseUrl: z.string().optional(),
|
||||
sttModel: z.string().optional(),
|
||||
ttsModel: z.string().optional(),
|
||||
ttsVoice: z.string().optional(),
|
||||
})
|
||||
.optional(),
|
||||
})
|
||||
|
||||
export interface TranscribeAudioInput {
|
||||
audioBase64: string
|
||||
mimeType: string
|
||||
filename?: string
|
||||
language?: string
|
||||
prompt?: string
|
||||
}
|
||||
|
||||
export interface SynthesizeSpeechInput {
|
||||
text: string
|
||||
format?: "mp3" | "wav" | "opus"
|
||||
}
|
||||
|
||||
export interface SpeechProvider {
|
||||
getCapabilities(): SpeechCapabilitiesResponse
|
||||
transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse>
|
||||
synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse>
|
||||
}
|
||||
|
||||
export interface NormalizedSpeechSettings {
|
||||
provider: string
|
||||
apiKey?: string
|
||||
baseUrl?: string
|
||||
sttModel: string
|
||||
ttsModel: string
|
||||
ttsVoice: string
|
||||
}
|
||||
|
||||
const DEFAULT_PROVIDER = "openai-compatible"
|
||||
const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
|
||||
const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
|
||||
const DEFAULT_TTS_VOICE = "alloy"
|
||||
export class SpeechService {
|
||||
constructor(
|
||||
private readonly settings: SettingsService,
|
||||
private readonly logger: Logger,
|
||||
) {}
|
||||
|
||||
getCapabilities(): SpeechCapabilitiesResponse {
|
||||
return this.createProvider().getCapabilities()
|
||||
}
|
||||
|
||||
async transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse> {
|
||||
return this.createProvider().transcribe(input)
|
||||
}
|
||||
|
||||
async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
|
||||
return this.createProvider().synthesize(input)
|
||||
}
|
||||
|
||||
private createProvider(): SpeechProvider {
|
||||
const settings = this.resolveSettings()
|
||||
return new OpenAICompatibleSpeechProvider({
|
||||
settings,
|
||||
logger: this.logger.child({ provider: settings.provider }),
|
||||
})
|
||||
}
|
||||
|
||||
private resolveSettings(): NormalizedSpeechSettings {
|
||||
const parsed = ServerSpeechSettingsSchema.parse(this.settings.getOwner("config", "server") ?? {})
|
||||
const speech = parsed.speech ?? {}
|
||||
|
||||
return {
|
||||
provider: speech.provider?.trim() || DEFAULT_PROVIDER,
|
||||
apiKey: speech.apiKey?.trim() || process.env.OPENAI_API_KEY,
|
||||
baseUrl: speech.baseUrl?.trim() || process.env.OPENAI_BASE_URL || undefined,
|
||||
sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
|
||||
ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
|
||||
ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user