feat(speech): add configurable TTS playback modes

2026-03-26 20:46:49 +00:00
parent 740f37db86
commit d13ecba322
15 changed files with 561 additions and 55 deletions
--- a/packages/server/src/api-types.ts
+++ b/packages/server/src/api-types.ts
@@ -219,10 +219,13 @@ export interface SpeechCapabilitiesResponse {
  provider: string
  supportsStt: boolean
  supportsTts: boolean
+  supportsStreamingTts: boolean
  baseUrl?: string
  sttModel: string
  ttsModel: string
  ttsVoice: string
+  ttsFormats: string[]
+  streamingTtsFormats: string[]
 }

 export interface SpeechTranscriptionResponse {
--- a/packages/server/src/server/routes/speech.ts
+++ b/packages/server/src/server/routes/speech.ts
@@ -16,7 +16,7 @@ const TranscribeBodySchema = z.object({

 const SynthesizeBodySchema = z.object({
  text: z.string().trim().min(1, "Text is required"),
-  format: z.enum(["mp3", "wav", "opus"]).optional(),
+  format: z.enum(["mp3", "wav", "opus", "aac"]).optional(), // Optional; when omitted the OpenAI-compatible provider falls back to settings.ttsFormat.
 })

 function getSpeechErrorStatus(error: unknown): number {
@@ -57,4 +57,18 @@ export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) {
      return { error: getSpeechErrorMessage(error, "Failed to synthesize audio") }
    }
  })
+
+  app.post("/api/speech/synthesize/stream", async (request, reply) => { // Streaming variant of synthesis: sends the provider's audio stream as the reply body.
+    try {
+      const body = SynthesizeBodySchema.parse(request.body ?? {}) // A null/undefined body is validated as {}, which the schema rejects because text is required.
+      const result = await deps.speechService.synthesizeStream(body)
+      reply.header("Content-Type", result.mimeType)
+      reply.header("Cache-Control", "no-store") // Ask clients and intermediaries not to store the generated audio.
+      return reply.send(result.stream) // Pass the Readable to the reply instead of buffering the audio here.
+    } catch (error) { // Covers validation and provider failures thrown above. NOTE(review): errors emitted by the stream after sending starts are presumably not caught here; confirm.
+      request.log.error({ err: error }, "Failed to stream synthesized audio")
+      reply.code(getSpeechErrorStatus(error))
+      return { error: getSpeechErrorMessage(error, "Failed to stream synthesized audio") }
+    }
+  })
 }
--- a/packages/server/src/speech/providers/openai-compatible.ts
+++ b/packages/server/src/speech/providers/openai-compatible.ts
@@ -1,8 +1,9 @@
+import { Readable } from "node:stream"
 import OpenAI from "openai"
 import { toFile } from "openai/uploads"
 import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types"
 import type { Logger } from "../../logger"
-import type { NormalizedSpeechSettings, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"
+import type { NormalizedSpeechSettings, SpeechSynthesisStreamResponse, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"

 interface OpenAICompatibleSpeechProviderOptions {
  settings: NormalizedSpeechSettings
@@ -20,10 +21,13 @@ export class OpenAICompatibleSpeechProvider {
      provider: settings.provider,
      supportsStt: true,
      supportsTts: true,
+      supportsStreamingTts: true,
      baseUrl: settings.baseUrl,
      sttModel: settings.sttModel,
      ttsModel: settings.ttsModel,
      ttsVoice: settings.ttsVoice,
+      ttsFormats: ["mp3", "wav", "opus", "aac"],
+      streamingTtsFormats: ["mp3", "wav", "opus", "aac"],
    }
  }

@@ -92,8 +96,7 @@ export class OpenAICompatibleSpeechProvider {
  }

  async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
-    const client = this.createClient()
-    const format = input.format ?? "mp3"
+    const format = input.format ?? this.options.settings.ttsFormat

    this.options.logger.info(
      {
@@ -104,12 +107,7 @@ export class OpenAICompatibleSpeechProvider {
      "speech.synthesize",
    )

-    const response = await client.audio.speech.create({
-      model: this.options.settings.ttsModel,
-      voice: this.options.settings.ttsVoice as any,
-      input: input.text,
-      response_format: format as any,
-    })
+    const response = await this.requestSpeechAudio(input.text, format)

    const audioBuffer = Buffer.from(await response.arrayBuffer())
    return {
@@ -118,6 +116,58 @@ export class OpenAICompatibleSpeechProvider {
    }
  }

+  async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> { // Requests speech audio and returns the response body as a Node.js stream plus its MIME type.
+    const format = input.format ?? this.options.settings.ttsFormat // A per-request format wins; otherwise use the configured ttsFormat.
+
+    this.options.logger.info(
+      { // Only model, voice and format are logged; the input text is not included.
+        model: this.options.settings.ttsModel,
+        voice: this.options.settings.ttsVoice,
+        format,
+      },
+      "speech.synthesize.stream",
+    )
+
+    const response = await this.requestSpeechAudio(input.text, format) // Throws when no API key is configured or the provider answers with a non-OK status.
+    if (!response.body) { // A Response body can be null, in which case there is nothing to stream.
+      throw new Error("Speech provider did not return a stream.")
+    }
+
+    return {
+      stream: Readable.fromWeb(response.body as any), // Convert the fetch (web) ReadableStream into a Node.js Readable.
+      mimeType: mimeTypeForFormat(format),
+    }
+  }
+
+  private async requestSpeechAudio(text: string, format: "mp3" | "wav" | "opus" | "aac"): Promise<Response> { // POSTs a synthesis request to the provider's audio/speech endpoint and returns the raw fetch Response.
+    const { settings } = this.options
+    if (!settings.apiKey) { // Fail before any network call when no API key is configured.
+      throw new Error("Speech provider is not configured. Add an API key in Speech settings.")
+    }
+
+    const endpoint = new URL("audio/speech", ensureTrailingSlash(settings.baseUrl ?? "https://api.openai.com/v1")) // The trailing slash keeps the base's last path segment (e.g. /v1) when the relative path is resolved.
+    const response = await fetch(endpoint, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${settings.apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({ // Model and voice always come from settings; only text and format vary per call.
+        model: settings.ttsModel,
+        voice: settings.ttsVoice,
+        input: text,
+        response_format: format,
+      }),
+    })
+
+    if (!response.ok) {
+      const detail = await response.text() // Use the provider's response body as the error message when it is non-empty.
+      throw new Error(detail || `Speech synthesis failed with ${response.status}`)
+    }
+
+    return response // The body is left unread so callers can either buffer it or stream it.
+  }
+
  private createClient(): OpenAI {
    const { settings } = this.options
    if (!settings.apiKey) {
@@ -141,8 +191,13 @@ function extensionForMime(mimeType: string): string {
  return "webm"
 }

-function mimeTypeForFormat(format: "mp3" | "wav" | "opus"): string {
+function mimeTypeForFormat(format: "mp3" | "wav" | "opus" | "aac"): string { // Maps a TTS output format to its MIME type; mp3 is the fall-through case (audio/mpeg).
  if (format === "wav") return "audio/wav"
  if (format === "opus") return "audio/opus"
+  if (format === "aac") return "audio/aac" // Added together with "aac" in the accepted format unions.
  return "audio/mpeg"
 }
+
+function ensureTrailingSlash(value: string): string { // Returns value unchanged if it already ends with "/", otherwise appends one.
+  return value.endsWith("/") ? value : `${value}/`
+}
--- a/packages/server/src/speech/service.ts
+++ b/packages/server/src/speech/service.ts
@@ -1,4 +1,5 @@
 import { z } from "zod"
+import type { Readable } from "node:stream"
 import type { Logger } from "../logger"
 import type { SettingsService } from "../settings/service"
 import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
@@ -13,6 +14,7 @@ const ServerSpeechSettingsSchema = z.object({
      sttModel: z.string().optional(),
      ttsModel: z.string().optional(),
      ttsVoice: z.string().optional(),
+      ttsFormat: z.enum(["mp3", "wav", "opus", "aac"]).optional(),
    })
    .optional(),
 })
@@ -27,13 +29,19 @@ export interface TranscribeAudioInput {

 export interface SynthesizeSpeechInput {
  text: string
-  format?: "mp3" | "wav" | "opus"
+  format?: "mp3" | "wav" | "opus" | "aac" // Optional; the OpenAI-compatible provider uses settings.ttsFormat when this is omitted.
+}
+
+export interface SpeechSynthesisStreamResponse { // Result of a streaming synthesis call.
+  stream: Readable // Node.js Readable (type imported from node:stream) carrying the audio bytes.
+  mimeType: string // MIME type of the audio, e.g. "audio/mpeg".
 }

 export interface SpeechProvider {
  getCapabilities(): SpeechCapabilitiesResponse
  transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse>
  synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse>
+  synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> // Same input as synthesize(), but resolves to a stream plus MIME type.
 }

 export interface NormalizedSpeechSettings {
@@ -43,12 +51,14 @@ export interface NormalizedSpeechSettings {
  sttModel: string
  ttsModel: string
  ttsVoice: string
+  ttsFormat: "mp3" | "wav" | "opus" | "aac"
 }

 const DEFAULT_PROVIDER = "openai-compatible"
 const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
 const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
 const DEFAULT_TTS_VOICE = "alloy"
+const DEFAULT_TTS_FORMAT = "mp3" // Used when the stored speech settings do not specify ttsFormat.
 export class SpeechService {
  constructor(
    private readonly settings: SettingsService,
@@ -67,6 +77,10 @@ export class SpeechService {
    return this.createProvider().synthesize(input)
  }

+  async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> { // Streaming counterpart of synthesize(); delegates to the provider.
+    return this.createProvider().synthesizeStream(input) // createProvider() resolves settings and builds a provider on each call.
+  }
+
  private createProvider(): SpeechProvider {
    const settings = this.resolveSettings()
    return new OpenAICompatibleSpeechProvider({
@@ -86,6 +100,7 @@ export class SpeechService {
      sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
      ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
      ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
+      ttsFormat: speech.ttsFormat ?? DEFAULT_TTS_FORMAT,
    }
  }
 }