diff --git a/packages/server/src/api-types.ts b/packages/server/src/api-types.ts index 8eb7c928..7bc54a13 100644 --- a/packages/server/src/api-types.ts +++ b/packages/server/src/api-types.ts @@ -219,10 +219,13 @@ export interface SpeechCapabilitiesResponse { provider: string supportsStt: boolean supportsTts: boolean + supportsStreamingTts: boolean baseUrl?: string sttModel: string ttsModel: string ttsVoice: string + ttsFormats: string[] + streamingTtsFormats: string[] } export interface SpeechTranscriptionResponse { diff --git a/packages/server/src/server/routes/speech.ts b/packages/server/src/server/routes/speech.ts index 3fdce9f9..c43d5d0a 100644 --- a/packages/server/src/server/routes/speech.ts +++ b/packages/server/src/server/routes/speech.ts @@ -16,7 +16,7 @@ const TranscribeBodySchema = z.object({ const SynthesizeBodySchema = z.object({ text: z.string().trim().min(1, "Text is required"), - format: z.enum(["mp3", "wav", "opus"]).optional(), + format: z.enum(["mp3", "wav", "opus", "aac"]).optional(), }) function getSpeechErrorStatus(error: unknown): number { @@ -57,4 +57,18 @@ export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) { return { error: getSpeechErrorMessage(error, "Failed to synthesize audio") } } }) + + app.post("/api/speech/synthesize/stream", async (request, reply) => { + try { + const body = SynthesizeBodySchema.parse(request.body ?? {}) + const result = await deps.speechService.synthesizeStream(body) + reply.header("Content-Type", result.mimeType) + reply.header("Cache-Control", "no-store") + return reply.send(result.stream) + } catch (error) { + request.log.error({ err: error }, "Failed to stream synthesized audio") + reply.code(getSpeechErrorStatus(error)) + return { error: getSpeechErrorMessage(error, "Failed to stream synthesized audio") } + } + }) } diff --git a/packages/server/src/speech/providers/openai-compatible.ts b/packages/server/src/speech/providers/openai-compatible.ts index 4c426d72..4ff8b411 100644 --- a/packages/server/src/speech/providers/openai-compatible.ts +++ b/packages/server/src/speech/providers/openai-compatible.ts @@ -1,8 +1,9 @@ +import { Readable } from "node:stream" import OpenAI from "openai" import { toFile } from "openai/uploads" import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types" import type { Logger } from "../../logger" -import type { NormalizedSpeechSettings, SynthesizeSpeechInput, TranscribeAudioInput } from "../service" +import type { NormalizedSpeechSettings, SpeechSynthesisStreamResponse, SynthesizeSpeechInput, TranscribeAudioInput } from "../service" interface OpenAICompatibleSpeechProviderOptions { settings: NormalizedSpeechSettings @@ -20,10 +21,13 @@ export class OpenAICompatibleSpeechProvider { provider: settings.provider, supportsStt: true, supportsTts: true, + supportsStreamingTts: true, baseUrl: settings.baseUrl, sttModel: settings.sttModel, ttsModel: settings.ttsModel, ttsVoice: settings.ttsVoice, + ttsFormats: ["mp3", "wav", "opus", "aac"], + streamingTtsFormats: ["mp3", "wav", "opus", "aac"], } } @@ -92,8 +96,7 @@ export class OpenAICompatibleSpeechProvider { } async synthesize(input: SynthesizeSpeechInput): Promise { - const client = this.createClient() - const format = input.format ?? "mp3" + const format = input.format ?? this.options.settings.ttsFormat this.options.logger.info( { @@ -104,12 +107,7 @@ export class OpenAICompatibleSpeechProvider { "speech.synthesize", ) - const response = await client.audio.speech.create({ - model: this.options.settings.ttsModel, - voice: this.options.settings.ttsVoice as any, - input: input.text, - response_format: format as any, - }) + const response = await this.requestSpeechAudio(input.text, format) const audioBuffer = Buffer.from(await response.arrayBuffer()) return { @@ -118,6 +116,58 @@ export class OpenAICompatibleSpeechProvider { } } + async synthesizeStream(input: SynthesizeSpeechInput): Promise { + const format = input.format ?? this.options.settings.ttsFormat + + this.options.logger.info( + { + model: this.options.settings.ttsModel, + voice: this.options.settings.ttsVoice, + format, + }, + "speech.synthesize.stream", + ) + + const response = await this.requestSpeechAudio(input.text, format) + if (!response.body) { + throw new Error("Speech provider did not return a stream.") + } + + return { + stream: Readable.fromWeb(response.body as any), + mimeType: mimeTypeForFormat(format), + } + } + + private async requestSpeechAudio(text: string, format: "mp3" | "wav" | "opus" | "aac"): Promise { + const { settings } = this.options + if (!settings.apiKey) { + throw new Error("Speech provider is not configured. Add an API key in Speech settings.") + } + + const endpoint = new URL("audio/speech", ensureTrailingSlash(settings.baseUrl ?? "https://api.openai.com/v1")) + const response = await fetch(endpoint, { + method: "POST", + headers: { + Authorization: `Bearer ${settings.apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model: settings.ttsModel, + voice: settings.ttsVoice, + input: text, + response_format: format, + }), + }) + + if (!response.ok) { + const detail = await response.text() + throw new Error(detail || `Speech synthesis failed with ${response.status}`) + } + + return response + } + private createClient(): OpenAI { const { settings } = this.options if (!settings.apiKey) { @@ -141,8 +191,13 @@ function extensionForMime(mimeType: string): string { return "webm" } -function mimeTypeForFormat(format: "mp3" | "wav" | "opus"): string { +function mimeTypeForFormat(format: "mp3" | "wav" | "opus" | "aac"): string { if (format === "wav") return "audio/wav" if (format === "opus") return "audio/opus" + if (format === "aac") return "audio/aac" return "audio/mpeg" } + +function ensureTrailingSlash(value: string): string { + return value.endsWith("/") ? value : `${value}/` +} diff --git a/packages/server/src/speech/service.ts b/packages/server/src/speech/service.ts index 14f37a15..d3c37646 100644 --- a/packages/server/src/speech/service.ts +++ b/packages/server/src/speech/service.ts @@ -1,4 +1,5 @@ import { z } from "zod" +import type { Readable } from "node:stream" import type { Logger } from "../logger" import type { SettingsService } from "../settings/service" import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types" @@ -13,6 +14,7 @@ const ServerSpeechSettingsSchema = z.object({ sttModel: z.string().optional(), ttsModel: z.string().optional(), ttsVoice: z.string().optional(), + ttsFormat: z.enum(["mp3", "wav", "opus", "aac"]).optional(), }) .optional(), }) @@ -27,13 +29,19 @@ export interface TranscribeAudioInput { export interface SynthesizeSpeechInput { text: string - format?: "mp3" | "wav" | "opus" + format?: "mp3" | "wav" | "opus" | "aac" +} + +export interface SpeechSynthesisStreamResponse { + stream: Readable + mimeType: string } export interface SpeechProvider { getCapabilities(): SpeechCapabilitiesResponse transcribe(input: TranscribeAudioInput): Promise synthesize(input: SynthesizeSpeechInput): Promise + synthesizeStream(input: SynthesizeSpeechInput): Promise } export interface NormalizedSpeechSettings { @@ -43,12 +51,14 @@ export interface NormalizedSpeechSettings { sttModel: string ttsModel: string ttsVoice: string + ttsFormat: "mp3" | "wav" | "opus" | "aac" } const DEFAULT_PROVIDER = "openai-compatible" const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe" const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts" const DEFAULT_TTS_VOICE = "alloy" +const DEFAULT_TTS_FORMAT = "mp3" export class SpeechService { constructor( private readonly settings: SettingsService, @@ -67,6 +77,10 @@ export class SpeechService { return this.createProvider().synthesize(input) } + async synthesizeStream(input: SynthesizeSpeechInput): Promise { + return this.createProvider().synthesizeStream(input) + } + private createProvider(): SpeechProvider { const settings = this.resolveSettings() return new OpenAICompatibleSpeechProvider({ @@ -86,6 +100,7 @@ export class SpeechService { sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL, ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL, ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE, + ttsFormat: speech.ttsFormat ?? DEFAULT_TTS_FORMAT, } } } diff --git a/packages/ui/src/components/settings/speech-settings-card.tsx b/packages/ui/src/components/settings/speech-settings-card.tsx index 09d83969..8cab35ce 100644 --- a/packages/ui/src/components/settings/speech-settings-card.tsx +++ b/packages/ui/src/components/settings/speech-settings-card.tsx @@ -1,9 +1,10 @@ -import { Show, createEffect, createMemo, createSignal, type Component } from "solid-js" -import { Mic, Volume2 } from "lucide-solid" +import { For, Show, createEffect, createMemo, createSignal, type Component } from "solid-js" +import { Loader2, Mic, Square, Volume2 } from "lucide-solid" import { useConfig, type SpeechSettings } from "../../stores/preferences" import { useI18n } from "../../lib/i18n" import { loadSpeechCapabilities, speechCapabilities, speechCapabilitiesError, speechCapabilitiesLoading } from "../../stores/speech" import { getLogger } from "../../lib/logger" +import { useSpeech } from "../../lib/hooks/use-speech" const log = getLogger("actions") @@ -13,6 +14,8 @@ type DraftFields = { sttModel: string ttsModel: string ttsVoice: string + playbackMode: SpeechSettings["playbackMode"] + ttsFormat: SpeechSettings["ttsFormat"] } function createDraftFields(speech: SpeechSettings): DraftFields { @@ -22,11 +25,21 @@ function createDraftFields(speech: SpeechSettings): DraftFields { sttModel: speech.sttModel, ttsModel: speech.ttsModel, ttsVoice: speech.ttsVoice, + playbackMode: speech.playbackMode, + ttsFormat: speech.ttsFormat, } } function isDraftEqual(a: DraftFields, b: DraftFields): boolean { - return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice + return ( + a.apiKey === b.apiKey && + a.baseUrl === b.baseUrl && + a.sttModel === b.sttModel && + a.ttsModel === b.ttsModel && + a.ttsVoice === b.ttsVoice && + a.playbackMode === b.playbackMode && + a.ttsFormat === b.ttsFormat + ) } export const SpeechSettingsCard: Component = () => { @@ -39,6 +52,15 @@ export const SpeechSettingsCard: Component = () => { const [apiKeyTouched, setApiKeyTouched] = createSignal(false) const [clearStoredApiKey, setClearStoredApiKey] = createSignal(false) + const testSpeech = useSpeech({ + id: () => "settings-speech-test", + text: () => t("settings.speech.testPlayback.sample"), + settingsOverride: () => ({ + playbackMode: drafts().playbackMode, + ttsFormat: drafts().ttsFormat, + }), + }) + createEffect(() => { const speech = serverSettings().speech const nextDrafts = createDraftFields(speech) @@ -84,7 +106,9 @@ export const SpeechSettingsCard: Component = () => { (current.baseUrl || "") !== (speech.baseUrl || "") || current.sttModel !== speech.sttModel || current.ttsModel !== speech.ttsModel || - current.ttsVoice !== speech.ttsVoice + current.ttsVoice !== speech.ttsVoice || + current.playbackMode !== speech.playbackMode || + current.ttsFormat !== speech.ttsFormat ) }) @@ -108,6 +132,8 @@ export const SpeechSettingsCard: Component = () => { sttModel: current.sttModel.trim() || undefined, ttsModel: current.ttsModel.trim() || undefined, ttsVoice: current.ttsVoice.trim() || undefined, + playbackMode: current.playbackMode, + ttsFormat: current.ttsFormat, }) await loadSpeechCapabilities(true) setDrafts({ @@ -116,6 +142,8 @@ export const SpeechSettingsCard: Component = () => { sttModel: current.sttModel.trim() || serverSettings().speech.sttModel, ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel, ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice, + playbackMode: current.playbackMode, + ttsFormat: current.ttsFormat, }) setApiKeyTouched(false) setClearStoredApiKey(false) @@ -151,6 +179,32 @@ export const SpeechSettingsCard: Component = () => { {t("settings.speech.provider.openaiCompatible")} {capabilityLabel()} {saveStatusLabel()} +