feat(speech): add configurable TTS playback modes
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
import { Readable } from "node:stream"
|
||||
import OpenAI from "openai"
|
||||
import { toFile } from "openai/uploads"
|
||||
import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types"
|
||||
import type { Logger } from "../../logger"
|
||||
import type { NormalizedSpeechSettings, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"
|
||||
import type { NormalizedSpeechSettings, SpeechSynthesisStreamResponse, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"
|
||||
|
||||
interface OpenAICompatibleSpeechProviderOptions {
|
||||
settings: NormalizedSpeechSettings
|
||||
@@ -20,10 +21,13 @@ export class OpenAICompatibleSpeechProvider {
|
||||
provider: settings.provider,
|
||||
supportsStt: true,
|
||||
supportsTts: true,
|
||||
supportsStreamingTts: true,
|
||||
baseUrl: settings.baseUrl,
|
||||
sttModel: settings.sttModel,
|
||||
ttsModel: settings.ttsModel,
|
||||
ttsVoice: settings.ttsVoice,
|
||||
ttsFormats: ["mp3", "wav", "opus", "aac"],
|
||||
streamingTtsFormats: ["mp3", "wav", "opus", "aac"],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,8 +96,7 @@ export class OpenAICompatibleSpeechProvider {
|
||||
}
|
||||
|
||||
async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
|
||||
const client = this.createClient()
|
||||
const format = input.format ?? "mp3"
|
||||
const format = input.format ?? this.options.settings.ttsFormat
|
||||
|
||||
this.options.logger.info(
|
||||
{
|
||||
@@ -104,12 +107,7 @@ export class OpenAICompatibleSpeechProvider {
|
||||
"speech.synthesize",
|
||||
)
|
||||
|
||||
const response = await client.audio.speech.create({
|
||||
model: this.options.settings.ttsModel,
|
||||
voice: this.options.settings.ttsVoice as any,
|
||||
input: input.text,
|
||||
response_format: format as any,
|
||||
})
|
||||
const response = await this.requestSpeechAudio(input.text, format)
|
||||
|
||||
const audioBuffer = Buffer.from(await response.arrayBuffer())
|
||||
return {
|
||||
@@ -118,6 +116,58 @@ export class OpenAICompatibleSpeechProvider {
|
||||
}
|
||||
}
|
||||
|
||||
async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> {
|
||||
const format = input.format ?? this.options.settings.ttsFormat
|
||||
|
||||
this.options.logger.info(
|
||||
{
|
||||
model: this.options.settings.ttsModel,
|
||||
voice: this.options.settings.ttsVoice,
|
||||
format,
|
||||
},
|
||||
"speech.synthesize.stream",
|
||||
)
|
||||
|
||||
const response = await this.requestSpeechAudio(input.text, format)
|
||||
if (!response.body) {
|
||||
throw new Error("Speech provider did not return a stream.")
|
||||
}
|
||||
|
||||
return {
|
||||
stream: Readable.fromWeb(response.body as any),
|
||||
mimeType: mimeTypeForFormat(format),
|
||||
}
|
||||
}
|
||||
|
||||
private async requestSpeechAudio(text: string, format: "mp3" | "wav" | "opus" | "aac"): Promise<Response> {
|
||||
const { settings } = this.options
|
||||
if (!settings.apiKey) {
|
||||
throw new Error("Speech provider is not configured. Add an API key in Speech settings.")
|
||||
}
|
||||
|
||||
const endpoint = new URL("audio/speech", ensureTrailingSlash(settings.baseUrl ?? "https://api.openai.com/v1"))
|
||||
const response = await fetch(endpoint, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${settings.apiKey}`,
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({
|
||||
model: settings.ttsModel,
|
||||
voice: settings.ttsVoice,
|
||||
input: text,
|
||||
response_format: format,
|
||||
}),
|
||||
})
|
||||
|
||||
if (!response.ok) {
|
||||
const detail = await response.text()
|
||||
throw new Error(detail || `Speech synthesis failed with ${response.status}`)
|
||||
}
|
||||
|
||||
return response
|
||||
}
|
||||
|
||||
private createClient(): OpenAI {
|
||||
const { settings } = this.options
|
||||
if (!settings.apiKey) {
|
||||
@@ -141,8 +191,13 @@ function extensionForMime(mimeType: string): string {
|
||||
return "webm"
|
||||
}
|
||||
|
||||
function mimeTypeForFormat(format: "mp3" | "wav" | "opus"): string {
|
||||
function mimeTypeForFormat(format: "mp3" | "wav" | "opus" | "aac"): string {
|
||||
if (format === "wav") return "audio/wav"
|
||||
if (format === "opus") return "audio/opus"
|
||||
if (format === "aac") return "audio/aac"
|
||||
return "audio/mpeg"
|
||||
}
|
||||
|
||||
function ensureTrailingSlash(value: string): string {
|
||||
return value.endsWith("/") ? value : `${value}/`
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { z } from "zod"
|
||||
import type { Readable } from "node:stream"
|
||||
import type { Logger } from "../logger"
|
||||
import type { SettingsService } from "../settings/service"
|
||||
import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
|
||||
@@ -13,6 +14,7 @@ const ServerSpeechSettingsSchema = z.object({
|
||||
sttModel: z.string().optional(),
|
||||
ttsModel: z.string().optional(),
|
||||
ttsVoice: z.string().optional(),
|
||||
ttsFormat: z.enum(["mp3", "wav", "opus", "aac"]).optional(),
|
||||
})
|
||||
.optional(),
|
||||
})
|
||||
@@ -27,13 +29,19 @@ export interface TranscribeAudioInput {
|
||||
|
||||
export interface SynthesizeSpeechInput {
|
||||
text: string
|
||||
format?: "mp3" | "wav" | "opus"
|
||||
format?: "mp3" | "wav" | "opus" | "aac"
|
||||
}
|
||||
|
||||
export interface SpeechSynthesisStreamResponse {
|
||||
stream: Readable
|
||||
mimeType: string
|
||||
}
|
||||
|
||||
export interface SpeechProvider {
|
||||
getCapabilities(): SpeechCapabilitiesResponse
|
||||
transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse>
|
||||
synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse>
|
||||
synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse>
|
||||
}
|
||||
|
||||
export interface NormalizedSpeechSettings {
|
||||
@@ -43,12 +51,14 @@ export interface NormalizedSpeechSettings {
|
||||
sttModel: string
|
||||
ttsModel: string
|
||||
ttsVoice: string
|
||||
ttsFormat: "mp3" | "wav" | "opus" | "aac"
|
||||
}
|
||||
|
||||
const DEFAULT_PROVIDER = "openai-compatible"
|
||||
const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
|
||||
const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
|
||||
const DEFAULT_TTS_VOICE = "alloy"
|
||||
const DEFAULT_TTS_FORMAT = "mp3"
|
||||
export class SpeechService {
|
||||
constructor(
|
||||
private readonly settings: SettingsService,
|
||||
@@ -67,6 +77,10 @@ export class SpeechService {
|
||||
return this.createProvider().synthesize(input)
|
||||
}
|
||||
|
||||
async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> {
|
||||
return this.createProvider().synthesizeStream(input)
|
||||
}
|
||||
|
||||
private createProvider(): SpeechProvider {
|
||||
const settings = this.resolveSettings()
|
||||
return new OpenAICompatibleSpeechProvider({
|
||||
@@ -86,6 +100,7 @@ export class SpeechService {
|
||||
sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
|
||||
ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
|
||||
ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
|
||||
ttsFormat: speech.ttsFormat ?? DEFAULT_TTS_FORMAT,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user