feat(speech): add prompt voice input (#249)

## Summary

- Add server-backed speech capabilities and transcription endpoints, plus UI settings for speech configuration.
- Add push-to-talk prompt voice input with microphone controls, transcription insertion, and browser capability gating.
- Keep prompt controls aligned by restoring right-side nav placement and moving the mic beside the expand control.
2026-03-25 14:08:11 +00:00
parent a950d47df0
commit 1233121a13
40 changed files with 1545 additions and 27 deletions
--- a/packages/server/src/speech/providers/openai-compatible.ts
+++ b/packages/server/src/speech/providers/openai-compatible.ts
@@ -0,0 +1,148 @@
+import OpenAI from "openai"
+import { toFile } from "openai/uploads"
+import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types"
+import type { Logger } from "../../logger"
+import type { NormalizedSpeechSettings, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"
+
+/** Constructor options for {@link OpenAICompatibleSpeechProvider}. */
+interface OpenAICompatibleSpeechProviderOptions {
+  /** Resolved speech settings (API key, base URL, model and voice names). */
+  settings: NormalizedSpeechSettings
+  /** Logger used for request-level info and warning entries. */
+  logger: Logger
+}
+
+/**
+ * Speech provider backed by an OpenAI-compatible HTTP API, accessed through
+ * the `openai` SDK.
+ *
+ * A new SDK client is created for every transcribe/synthesize call from the
+ * settings captured at construction time.
+ */
+export class OpenAICompatibleSpeechProvider {
+  /**
+   * STT model names for which `response_format: "verbose_json"` is skipped.
+   * The gpt-4o transcribe family only accepts json/text output, so asking for
+   * verbose_json would fail and force a second upload of the same audio.
+   */
+  private static readonly JSON_ONLY_STT_MODEL = /^gpt-4o(-mini)?-transcribe/i
+
+  constructor(private readonly options: OpenAICompatibleSpeechProviderOptions) {}
+
+  /**
+   * Describes what this provider offers together with the resolved settings.
+   * `configured` is true only when an API key is present.
+   */
+  getCapabilities() {
+    const { settings } = this.options
+    return {
+      available: true,
+      configured: Boolean(settings.apiKey),
+      provider: settings.provider,
+      supportsStt: true,
+      supportsTts: true,
+      baseUrl: settings.baseUrl,
+      sttModel: settings.sttModel,
+      ttsModel: settings.ttsModel,
+      ttsVoice: settings.ttsVoice,
+    }
+  }
+
+  /**
+   * Transcribes base64-encoded audio with the configured STT model.
+   *
+   * @param input - Audio payload plus optional filename, language and prompt.
+   * @returns The transcript text, language, duration in milliseconds and, when
+   *   the response carries them, per-segment timings.
+   * @throws Error when no API key is configured, or when the upstream request fails.
+   */
+  async transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse> {
+    const client = this.createClient()
+    const startedAt = Date.now()
+    const extension = extensionForMime(input.mimeType)
+    const buffer = Buffer.from(input.audioBase64, "base64")
+    // A blank or whitespace-only filename falls back to a generated one.
+    const filename = input.filename?.trim() || `prompt-input.${extension}`
+
+    this.options.logger.info(
+      {
+        mimeType: input.mimeType,
+        bytes: buffer.byteLength,
+        language: input.language,
+        model: this.options.settings.sttModel,
+      },
+      "speech.transcribe",
+    )
+
+    const response = await this.requestTranscription(client, buffer, filename, input)
+
+    return {
+      text: typeof response?.text === "string" ? response.text : "",
+      language: typeof response?.language === "string" ? response.language : input.language,
+      // Use the reported duration (seconds -> ms) when finite; otherwise fall
+      // back to the elapsed wall-clock time of this call.
+      durationMs: Number.isFinite(response?.duration) ? Math.round(Number(response.duration) * 1000) : Date.now() - startedAt,
+      segments: Array.isArray(response?.segments)
+        ? response.segments
+            .filter((segment: any) => typeof segment?.text === "string")
+            .map((segment: any) => ({
+              startMs: Math.max(0, Math.round(Number(segment.start ?? 0) * 1000)),
+              endMs: Math.max(0, Math.round(Number(segment.end ?? 0) * 1000)),
+              text: String(segment.text),
+            }))
+        : undefined,
+    }
+  }
+
+  /**
+   * Sends the transcription request.
+   *
+   * For models that may accept it, `verbose_json` is requested first (it is
+   * the format that carries duration and segments) and any failure triggers a
+   * retry with the default format. Models matched by
+   * `JSON_ONLY_STT_MODEL` go straight to the default format.
+   */
+  private async requestTranscription(
+    client: OpenAI,
+    buffer: Buffer,
+    filename: string,
+    input: TranscribeAudioInput,
+  ): Promise<any> {
+    const model = this.options.settings.sttModel
+    const baseRequest = {
+      model,
+      ...(input.language ? { language: input.language } : {}),
+      ...(input.prompt ? { prompt: input.prompt } : {}),
+    }
+
+    // Builds a fresh upload for every attempt so a retry never reuses a
+    // file object from a failed request.
+    const send = async (extra: Record<string, unknown> = {}): Promise<any> => {
+      const file = await toFile(buffer, filename, { type: input.mimeType })
+      return (await client.audio.transcriptions.create({
+        ...baseRequest,
+        ...extra,
+        file,
+      } as any)) as any
+    }
+
+    if (OpenAICompatibleSpeechProvider.JSON_ONLY_STT_MODEL.test(model)) {
+      return send()
+    }
+
+    try {
+      // `return await` keeps the rejection inside this try block.
+      return await send({ response_format: "verbose_json" })
+    } catch (error) {
+      this.options.logger.warn({ err: error }, "speech.transcribe verbose_json failed; retrying default format")
+      return send()
+    }
+  }
+
+  /**
+   * Synthesizes speech for `input.text` with the configured TTS model and voice.
+   *
+   * @param input - Text to speak and an optional output format (defaults to "mp3").
+   * @returns Base64-encoded audio and the MIME type matching the format.
+   * @throws Error when no API key is configured, or when the upstream request fails.
+   */
+  async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
+    const client = this.createClient()
+    const format = input.format ?? "mp3"
+
+    this.options.logger.info(
+      {
+        model: this.options.settings.ttsModel,
+        voice: this.options.settings.ttsVoice,
+        format,
+      },
+      "speech.synthesize",
+    )
+
+    const response = await client.audio.speech.create({
+      model: this.options.settings.ttsModel,
+      voice: this.options.settings.ttsVoice as any,
+      input: input.text,
+      response_format: format as any,
+    })
+
+    const audioBuffer = Buffer.from(await response.arrayBuffer())
+    return {
+      audioBase64: audioBuffer.toString("base64"),
+      mimeType: mimeTypeForFormat(format),
+    }
+  }
+
+  /**
+   * Builds an SDK client from the current settings.
+   *
+   * @throws Error when no API key is configured.
+   */
+  private createClient(): OpenAI {
+    const { settings } = this.options
+    if (!settings.apiKey) {
+      throw new Error("Speech provider is not configured. Add an API key in Speech settings.")
+    }
+
+    return new OpenAI({
+      apiKey: settings.apiKey,
+      baseURL: settings.baseUrl,
+    })
+  }
+}
+
+/**
+ * Maps an audio MIME type to the file extension used for the upload filename.
+ *
+ * Matching is case-insensitive and substring-based, so parameters such as
+ * `;codecs=opus` are tolerated. Container types (webm, ogg) are checked before
+ * codec names so `audio/ogg; codecs=flac` still maps to "ogg".
+ *
+ * @param mimeType - MIME type reported for the recorded audio.
+ * @returns A bare extension without the dot; "webm" when nothing matches.
+ */
+function extensionForMime(mimeType: string): string {
+  const normalized = mimeType.toLowerCase()
+  if (normalized.includes("webm")) return "webm"
+  if (normalized.includes("ogg")) return "ogg"
+  if (normalized.includes("wav")) return "wav"
+  if (normalized.includes("flac")) return "flac"
+  if (normalized.includes("mpeg") || normalized.includes("mp3")) return "mp3"
+  // "audio/x-m4a" and "audio/m4a" carry neither "mp4" nor "aac" in the name.
+  if (normalized.includes("mp4") || normalized.includes("m4a") || normalized.includes("aac")) return "m4a"
+  return "webm"
+}
+
+/**
+ * Returns the MIME type reported for a synthesized audio format.
+ * Anything other than "wav" or "opus" is reported as "audio/mpeg".
+ */
+function mimeTypeForFormat(format: "mp3" | "wav" | "opus"): string {
+  switch (format) {
+    case "wav":
+      return "audio/wav"
+    case "opus":
+      return "audio/opus"
+    default:
+      return "audio/mpeg"
+  }
+}
--- a/packages/server/src/speech/service.ts
+++ b/packages/server/src/speech/service.ts
@@ -0,0 +1,91 @@
+import { z } from "zod"
+import type { Logger } from "../logger"
+import type { SettingsService } from "../settings/service"
+import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
+import { OpenAICompatibleSpeechProvider } from "./providers/openai-compatible"
+
+/**
+ * Shape of the server config as far as speech is concerned.
+ * The `speech` section and every field inside it are optional strings.
+ */
+const ServerSpeechSettingsSchema = z.object({
+  speech: z
+    .object({
+      provider: z.string().optional(),
+      apiKey: z.string().optional(),
+      baseUrl: z.string().optional(),
+      sttModel: z.string().optional(),
+      ttsModel: z.string().optional(),
+      ttsVoice: z.string().optional(),
+    })
+    .optional(),
+})
+
+/** Input for a speech-to-text request. */
+export interface TranscribeAudioInput {
+  /** Audio payload encoded as base64. */
+  audioBase64: string
+  /** MIME type of the audio payload. */
+  mimeType: string
+  /** Optional filename to attach to the uploaded audio. */
+  filename?: string
+  /** Optional language hint for the transcription. */
+  language?: string
+  /** Optional prompt text passed along with the transcription request. */
+  prompt?: string
+}
+
+/** Input for a text-to-speech request. */
+export interface SynthesizeSpeechInput {
+  /** Text to convert to speech. */
+  text: string
+  /** Optional output audio format. */
+  format?: "mp3" | "wav" | "opus"
+}
+
+/** Contract a speech backend must fulfil to be used by {@link SpeechService}. */
+export interface SpeechProvider {
+  /** Reports what the provider supports and how it is configured. */
+  getCapabilities(): SpeechCapabilitiesResponse
+  /** Converts audio to text. */
+  transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse>
+  /** Converts text to audio. */
+  synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse>
+}
+
+/**
+ * Speech settings after defaults have been applied: provider, models and
+ * voice are always set, while the API key and base URL may be absent.
+ */
+export interface NormalizedSpeechSettings {
+  provider: string
+  apiKey?: string
+  baseUrl?: string
+  sttModel: string
+  ttsModel: string
+  ttsVoice: string
+}
+
+// Fallback values used when the corresponding speech setting is missing or blank.
+const DEFAULT_PROVIDER = "openai-compatible"
+const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
+const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
+const DEFAULT_TTS_VOICE = "alloy"
+/**
+ * Entry point for speech features. Every call re-reads the server settings
+ * and builds a fresh provider, so setting changes apply to the next request.
+ */
+export class SpeechService {
+  constructor(
+    private readonly settings: SettingsService,
+    private readonly logger: Logger,
+  ) {}
+
+  /** Returns the capabilities reported by the currently configured provider. */
+  getCapabilities(): SpeechCapabilitiesResponse {
+    const provider = this.createProvider()
+    return provider.getCapabilities()
+  }
+
+  /** Transcribes audio through the currently configured provider. */
+  async transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse> {
+    const provider = this.createProvider()
+    return provider.transcribe(input)
+  }
+
+  /** Synthesizes speech through the currently configured provider. */
+  async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
+    const provider = this.createProvider()
+    return provider.synthesize(input)
+  }
+
+  /** Builds a provider from freshly resolved settings, with a logger tagged by provider name. */
+  private createProvider(): SpeechProvider {
+    const resolved = this.resolveSettings()
+    const providerLogger = this.logger.child({ provider: resolved.provider })
+    return new OpenAICompatibleSpeechProvider({ settings: resolved, logger: providerLogger })
+  }
+
+  /**
+   * Reads the server config, validates its speech section and fills in
+   * defaults. Stored values are trimmed; a blank value counts as unset.
+   * The API key and base URL fall back to `OPENAI_API_KEY` and
+   * `OPENAI_BASE_URL` from the environment.
+   */
+  private resolveSettings(): NormalizedSpeechSettings {
+    const rawConfig = this.settings.getOwner("config", "server") ?? {}
+    const speech = ServerSpeechSettingsSchema.parse(rawConfig).speech ?? {}
+
+    // Trimmed value, or undefined when the setting is missing or blank.
+    const clean = (value: string | undefined): string | undefined => value?.trim() || undefined
+
+    const provider = clean(speech.provider) ?? DEFAULT_PROVIDER
+    const apiKey = clean(speech.apiKey) ?? process.env.OPENAI_API_KEY
+    // An empty OPENAI_BASE_URL is treated the same as an unset one.
+    const baseUrl = clean(speech.baseUrl) ?? (process.env.OPENAI_BASE_URL || undefined)
+    const sttModel = clean(speech.sttModel) ?? DEFAULT_STT_MODEL
+    const ttsModel = clean(speech.ttsModel) ?? DEFAULT_TTS_MODEL
+    const ttsVoice = clean(speech.ttsVoice) ?? DEFAULT_TTS_VOICE
+
+    return { provider, apiKey, baseUrl, sttModel, ttsModel, ttsVoice }
+  }
+}