feat(speech): add configurable TTS playback modes

2026-03-26 20:46:49 +00:00
parent 740f37db86
commit d13ecba322
15 changed files with 561 additions and 55 deletions
--- a/packages/server/src/speech/service.ts
+++ b/packages/server/src/speech/service.ts
@@ -1,4 +1,5 @@
 import { z } from "zod"
+import type { Readable } from "node:stream"
 import type { Logger } from "../logger"
 import type { SettingsService } from "../settings/service"
 import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
@@ -13,6 +14,7 @@ const ServerSpeechSettingsSchema = z.object({
      sttModel: z.string().optional(),
      ttsModel: z.string().optional(),
      ttsVoice: z.string().optional(),
+      ttsFormat: z.enum(["mp3", "wav", "opus", "aac"]).optional(),
    })
    .optional(),
 })
@@ -27,13 +29,19 @@ export interface TranscribeAudioInput {

 export interface SynthesizeSpeechInput {
  text: string
-  format?: "mp3" | "wav" | "opus"
+  format?: "mp3" | "wav" | "opus" | "aac" // optional; keep this literal set in sync with the ttsFormat enum in ServerSpeechSettingsSchema and NormalizedSpeechSettings.ttsFormat
+}
+
+export interface SpeechSynthesisStreamResponse { // result shape of SpeechProvider.synthesizeStream / SpeechService.synthesizeStream
+  stream: Readable // Readable from node:stream
+  mimeType: string // presumably the MIME type of the data carried by `stream` — TODO confirm against the provider implementation
 }

 export interface SpeechProvider {
  getCapabilities(): SpeechCapabilitiesResponse
  transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse>
  synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse>
+  synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> // same input as synthesize, but resolves to a Readable stream plus a mimeType string
 }

 export interface NormalizedSpeechSettings {
@@ -43,12 +51,14 @@ export interface NormalizedSpeechSettings {
  sttModel: string
  ttsModel: string
  ttsVoice: string
+  ttsFormat: "mp3" | "wav" | "opus" | "aac"
 }

 const DEFAULT_PROVIDER = "openai-compatible"
 const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
 const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
 const DEFAULT_TTS_VOICE = "alloy"
+const DEFAULT_TTS_FORMAT = "mp3" // fallback applied via `??` when the configured speech.ttsFormat is null or undefined
 export class SpeechService {
  constructor(
    private readonly settings: SettingsService,
@@ -67,6 +77,10 @@ export class SpeechService {
    return this.createProvider().synthesize(input)
  }

+  async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> { // stream-returning counterpart of synthesize(); adds no logic of its own
+    return this.createProvider().synthesizeStream(input) // createProvider() builds a new provider from the resolved settings on every call
+  }
+
  private createProvider(): SpeechProvider {
    const settings = this.resolveSettings()
    return new OpenAICompatibleSpeechProvider({
@@ -86,6 +100,7 @@ export class SpeechService {
      sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
      ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
      ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
+      ttsFormat: speech.ttsFormat ?? DEFAULT_TTS_FORMAT,
    }
  }
 }