feat(speech): add configurable TTS playback modes

This commit is contained in:
Shantur Rathore
2026-03-26 20:46:49 +00:00
parent 740f37db86
commit d13ecba322
15 changed files with 561 additions and 55 deletions

View File

@@ -1,4 +1,5 @@
import { z } from "zod"
import type { Readable } from "node:stream"
import type { Logger } from "../logger"
import type { SettingsService } from "../settings/service"
import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
@@ -13,6 +14,7 @@ const ServerSpeechSettingsSchema = z.object({
sttModel: z.string().optional(),
ttsModel: z.string().optional(),
ttsVoice: z.string().optional(),
ttsFormat: z.enum(["mp3", "wav", "opus", "aac"]).optional(),
})
.optional(),
})
@@ -27,13 +29,19 @@ export interface TranscribeAudioInput {
export interface SynthesizeSpeechInput {
text: string
format?: "mp3" | "wav" | "opus"
format?: "mp3" | "wav" | "opus" | "aac"
}
export interface SpeechSynthesisStreamResponse {
stream: Readable
mimeType: string
}
export interface SpeechProvider {
getCapabilities(): SpeechCapabilitiesResponse
transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse>
synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse>
synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse>
}
export interface NormalizedSpeechSettings {
@@ -43,12 +51,14 @@ export interface NormalizedSpeechSettings {
sttModel: string
ttsModel: string
ttsVoice: string
ttsFormat: "mp3" | "wav" | "opus" | "aac"
}
const DEFAULT_PROVIDER = "openai-compatible"
const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
const DEFAULT_TTS_VOICE = "alloy"
const DEFAULT_TTS_FORMAT = "mp3"
export class SpeechService {
constructor(
private readonly settings: SettingsService,
@@ -67,6 +77,10 @@ export class SpeechService {
return this.createProvider().synthesize(input)
}
async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> {
return this.createProvider().synthesizeStream(input)
}
private createProvider(): SpeechProvider {
const settings = this.resolveSettings()
return new OpenAICompatibleSpeechProvider({
@@ -86,6 +100,7 @@ export class SpeechService {
sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
ttsFormat: speech.ttsFormat ?? DEFAULT_TTS_FORMAT,
}
}
}