feat(speech): add configurable TTS playback modes

2026-03-26 20:46:49 +00:00
parent 740f37db86
commit d13ecba322
15 changed files with 561 additions and 55 deletions
--- a/packages/server/src/speech/providers/openai-compatible.ts
+++ b/packages/server/src/speech/providers/openai-compatible.ts
@@ -1,8 +1,9 @@
+import { Readable } from "node:stream"
 import OpenAI from "openai"
 import { toFile } from "openai/uploads"
 import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types"
 import type { Logger } from "../../logger"
-import type { NormalizedSpeechSettings, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"
+import type { NormalizedSpeechSettings, SpeechSynthesisStreamResponse, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"

 interface OpenAICompatibleSpeechProviderOptions {
  settings: NormalizedSpeechSettings
@@ -20,10 +21,13 @@ export class OpenAICompatibleSpeechProvider {
      provider: settings.provider,
      supportsStt: true,
      supportsTts: true,
+      supportsStreamingTts: true,
      baseUrl: settings.baseUrl,
      sttModel: settings.sttModel,
      ttsModel: settings.ttsModel,
      ttsVoice: settings.ttsVoice,
+      ttsFormats: ["mp3", "wav", "opus", "aac"],
+      streamingTtsFormats: ["mp3", "wav", "opus", "aac"],
    }
  }

@@ -92,8 +96,7 @@ export class OpenAICompatibleSpeechProvider {
  }

  async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
-    const client = this.createClient()
-    const format = input.format ?? "mp3"
+    const format = input.format ?? this.options.settings.ttsFormat

    this.options.logger.info(
      {
@@ -104,12 +107,7 @@ export class OpenAICompatibleSpeechProvider {
      "speech.synthesize",
    )

-    const response = await client.audio.speech.create({
-      model: this.options.settings.ttsModel,
-      voice: this.options.settings.ttsVoice as any,
-      input: input.text,
-      response_format: format as any,
-    })
+    const response = await this.requestSpeechAudio(input.text, format)

    const audioBuffer = Buffer.from(await response.arrayBuffer())
    return {
@@ -118,6 +116,58 @@ export class OpenAICompatibleSpeechProvider {
    }
  }

+  async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> {
+    const format = input.format ?? this.options.settings.ttsFormat
+
+    this.options.logger.info(
+      {
+        model: this.options.settings.ttsModel,
+        voice: this.options.settings.ttsVoice,
+        format,
+      },
+      "speech.synthesize.stream",
+    )
+
+    const response = await this.requestSpeechAudio(input.text, format)
+    if (!response.body) {
+      throw new Error("Speech provider did not return a stream.")
+    }
+
+    return {
+      stream: Readable.fromWeb(response.body as any),
+      mimeType: mimeTypeForFormat(format),
+    }
+  }
+
+  private async requestSpeechAudio(text: string, format: "mp3" | "wav" | "opus" | "aac"): Promise<Response> {
+    const { settings } = this.options
+    if (!settings.apiKey) {
+      throw new Error("Speech provider is not configured. Add an API key in Speech settings.")
+    }
+
+    const endpoint = new URL("audio/speech", ensureTrailingSlash(settings.baseUrl ?? "https://api.openai.com/v1"))
+    const response = await fetch(endpoint, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${settings.apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({
+        model: settings.ttsModel,
+        voice: settings.ttsVoice,
+        input: text,
+        response_format: format,
+      }),
+    })
+
+    if (!response.ok) {
+      const detail = await response.text()
+      throw new Error(detail || `Speech synthesis failed with ${response.status}`)
+    }
+
+    return response
+  }
+
  private createClient(): OpenAI {
    const { settings } = this.options
    if (!settings.apiKey) {
@@ -141,8 +191,13 @@ function extensionForMime(mimeType: string): string {
  return "webm"
 }

-function mimeTypeForFormat(format: "mp3" | "wav" | "opus"): string {
+function mimeTypeForFormat(format: "mp3" | "wav" | "opus" | "aac"): string {
  if (format === "wav") return "audio/wav"
  if (format === "opus") return "audio/opus"
+  if (format === "aac") return "audio/aac"
  return "audio/mpeg"
 }
+
+function ensureTrailingSlash(value: string): string {
+  return value.endsWith("/") ? value : `${value}/`
+}