diff --git a/packages/server/src/speech/providers/openai-compatible.ts b/packages/server/src/speech/providers/openai-compatible.ts index 4ff8b411..0db8da7f 100644 --- a/packages/server/src/speech/providers/openai-compatible.ts +++ b/packages/server/src/speech/providers/openai-compatible.ts @@ -108,11 +108,12 @@ export class OpenAICompatibleSpeechProvider { ) const response = await this.requestSpeechAudio(input.text, format) + const mimeType = response.headers.get("content-type") || mimeTypeForFormat(format) const audioBuffer = Buffer.from(await response.arrayBuffer()) return { audioBase64: audioBuffer.toString("base64"), - mimeType: mimeTypeForFormat(format), + mimeType, } } @@ -135,7 +136,7 @@ export class OpenAICompatibleSpeechProvider { return { stream: Readable.fromWeb(response.body as any), - mimeType: mimeTypeForFormat(format), + mimeType: response.headers.get("content-type") || mimeTypeForFormat(format), } } @@ -193,7 +194,7 @@ function extensionForMime(mimeType: string): string { function mimeTypeForFormat(format: "mp3" | "wav" | "opus" | "aac"): string { if (format === "wav") return "audio/wav" - if (format === "opus") return "audio/opus" + if (format === "opus") return 'audio/ogg; codecs="opus"' if (format === "aac") return "audio/aac" return "audio/mpeg" } diff --git a/packages/ui/src/components/settings/speech-settings-card.tsx b/packages/ui/src/components/settings/speech-settings-card.tsx index 8cab35ce..e847113d 100644 --- a/packages/ui/src/components/settings/speech-settings-card.tsx +++ b/packages/ui/src/components/settings/speech-settings-card.tsx @@ -5,6 +5,7 @@ import { useI18n } from "../../lib/i18n" import { loadSpeechCapabilities, speechCapabilities, speechCapabilitiesError, speechCapabilitiesLoading } from "../../stores/speech" import { getLogger } from "../../lib/logger" import { useSpeech } from "../../lib/hooks/use-speech" +import { getSpeechPlaybackSupport } from "../../lib/speech-playback-support" const log = getLogger("actions") @@ -97,6 +98,26 @@ export const SpeechSettingsCard: Component = () => { } const apiKeyDirty = createMemo(() => clearStoredApiKey() || drafts().apiKey.trim().length > 0) + const playbackSupport = createMemo(() => + getSpeechPlaybackSupport({ + playbackMode: drafts().playbackMode, + ttsFormat: drafts().ttsFormat, + capabilities: speechCapabilities(), + }), + ) + const compatibilityMessage = createMemo(() => { + const capabilities = speechCapabilities() + if (!capabilities?.available || !capabilities?.configured || !capabilities?.supportsTts) { + return null + } + if (drafts().playbackMode === "streaming" && !capabilities.supportsStreamingTts) { + return t("settings.speech.compatibility.streamingUnavailable") + } + if (drafts().playbackMode === "streaming" && !playbackSupport().available) { + return t("settings.speech.compatibility.browserStreamingUnavailable") + } + return t("settings.speech.compatibility.runtimeNote") + }) const isDirty = createMemo(() => { const speech = serverSettings().speech @@ -291,6 +312,7 @@ export const SpeechSettingsCard: Component = () => { />
{t("settings.speech.help")}
+ {(message) =>
{message()}
}
{t("settings.speech.testPlayback.note")}
diff --git a/packages/ui/src/lib/hooks/use-speech.ts b/packages/ui/src/lib/hooks/use-speech.ts index 441ad7e5..41bf0cee 100644 --- a/packages/ui/src/lib/hooks/use-speech.ts +++ b/packages/ui/src/lib/hooks/use-speech.ts @@ -4,6 +4,7 @@ import { serverApi } from "../api-client" import { useI18n } from "../i18n" import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech" import { useConfig, type SpeechSettings } from "../../stores/preferences" +import { formatToMimeType, getSpeechPlaybackSupport } from "../speech-playback-support" type SpeechPlaybackState = "idle" | "loading" | "playing" @@ -107,10 +108,11 @@ export function useSpeech(options: UseSpeechOptions) { if (!isSupported() || !capabilities?.available || !capabilities?.configured || !capabilities?.supportsTts) { return false } - if (resolvedSettings().playbackMode === "streaming") { - return Boolean(capabilities.supportsStreamingTts) - } - return true + return getSpeechPlaybackSupport({ + playbackMode: resolvedSettings().playbackMode, + ttsFormat: resolvedSettings().ttsFormat, + capabilities, + }).available } const stop = () => { @@ -142,6 +144,27 @@ export function useSpeech(options: UseSpeechOptions) { return } + const support = getSpeechPlaybackSupport({ + playbackMode: resolvedSettings().playbackMode, + ttsFormat: resolvedSettings().ttsFormat, + capabilities, + }) + if (!support.available) { + const detailKey = + support.reason === "provider-streaming-unavailable" + ? "settings.speech.compatibility.streamingUnavailable" + : support.reason === "browser-streaming-unavailable" + ? "settings.speech.compatibility.browserStreamingUnavailable" + : "messageItem.actions.speak.error.unsupported" + + showAlertDialog(t("messageItem.actions.speak.error.unavailable"), { + title: t("messageItem.actions.speak.error.title"), + detail: t(detailKey), + variant: "error", + }) + return + } + requestVersion += 1 const currentRequest = requestVersion stopActivePlayback() @@ -391,10 +414,3 @@ function createObjectUrlFromBase64(audioBase64: string, mimeType: string): strin } return URL.createObjectURL(new Blob([bytes], { type: mimeType || "audio/mpeg" })) } - -function formatToMimeType(format: "mp3" | "wav" | "opus" | "aac"): string { - if (format === "wav") return "audio/wav" - if (format === "opus") return "audio/opus" - if (format === "aac") return "audio/aac" - return "audio/mpeg" -} diff --git a/packages/ui/src/lib/i18n/messages/en/settings.ts b/packages/ui/src/lib/i18n/messages/en/settings.ts index 49ae6be2..bdd4a710 100644 --- a/packages/ui/src/lib/i18n/messages/en/settings.ts +++ b/packages/ui/src/lib/i18n/messages/en/settings.ts @@ -173,6 +173,9 @@ export const settingsMessages = { "settings.speech.ttsFormat.title": "Output format", "settings.speech.ttsFormat.subtitle": "Choose the audio format for synthesized speech. Streaming support depends on your provider and browser.", "settings.speech.help": "Prompt voice input appears when speech transcription is configured and supported. Message playback uses the TTS mode and format selected here.", + "settings.speech.compatibility.streamingUnavailable": "Your current speech provider configuration does not advertise streaming TTS. Switch playback mode to buffered if you want playback to work now.", + "settings.speech.compatibility.browserStreamingUnavailable": "Your current browser cannot stream the selected TTS format. Choose buffered playback or switch to a different format.", + "settings.speech.compatibility.runtimeNote": "All formats stay selectable in streaming mode. Some browser and provider combinations may still fail at playback time.", "settings.speech.testPlayback.action": "Test playback", "settings.speech.testPlayback.generating": "Generating sample", "settings.speech.testPlayback.stop": "Stop sample", diff --git a/packages/ui/src/lib/i18n/messages/es/settings.ts b/packages/ui/src/lib/i18n/messages/es/settings.ts index 0fb4fc99..ea17bb48 100644 --- a/packages/ui/src/lib/i18n/messages/es/settings.ts +++ b/packages/ui/src/lib/i18n/messages/es/settings.ts @@ -173,10 +173,13 @@ export const settingsMessages = { "settings.speech.ttsFormat.title": "Formato de salida", "settings.speech.ttsFormat.subtitle": "Elige el formato de audio para la voz sintetizada. La compatibilidad de streaming depende de tu proveedor y navegador.", "settings.speech.help": "La entrada de voz del prompt aparece cuando la transcripcion de voz esta configurada y es compatible. La reproduccion de mensajes usa el modo y formato TTS seleccionados aqui.", + "settings.speech.compatibility.streamingUnavailable": "Tu configuracion actual del proveedor de voz no anuncia TTS por streaming. Cambia el modo de reproduccion a buffered si quieres que la reproduccion funcione ahora.", + "settings.speech.compatibility.browserStreamingUnavailable": "Tu navegador actual no puede reproducir por streaming el formato TTS seleccionado. Elige reproduccion buffered o cambia a otro formato.", + "settings.speech.compatibility.runtimeNote": "Todos los formatos siguen disponibles en modo streaming. Algunas combinaciones de navegador y proveedor aun pueden fallar al reproducir.", "settings.speech.testPlayback.action": "Probar reproduccion", "settings.speech.testPlayback.generating": "Generando muestra", "settings.speech.testPlayback.stop": "Detener muestra", - "settings.speech.testPlayback.sample": "Esta es una prueba de reproduccion de voz con el modo y formato seleccionados actualmente.", + "settings.speech.testPlayback.sample": "Gracias por usar CodeNomad, tu configuracion de voz funciona correctamente.", "settings.speech.testPlayback.note": "La prueba usa de inmediato el modo y formato actuales. Guarda primero los cambios de API key, base URL, modelo o voz si tambien quieres probarlos.", "settings.speech.save.action": "Guardar", "settings.speech.save.saving": "Guardando...", diff --git a/packages/ui/src/lib/i18n/messages/fr/settings.ts b/packages/ui/src/lib/i18n/messages/fr/settings.ts index 6ba5e738..5a543305 100644 --- a/packages/ui/src/lib/i18n/messages/fr/settings.ts +++ b/packages/ui/src/lib/i18n/messages/fr/settings.ts @@ -173,10 +173,13 @@ export const settingsMessages = { "settings.speech.ttsFormat.title": "Format de sortie", "settings.speech.ttsFormat.subtitle": "Choisissez le format audio pour la voix synthetisee. La prise en charge du streaming depend du fournisseur et du navigateur.", "settings.speech.help": "La saisie vocale du prompt apparait lorsque la transcription vocale est configuree et prise en charge. La lecture des messages utilise le mode et le format TTS selectionnes ici.", + "settings.speech.compatibility.streamingUnavailable": "Votre configuration actuelle du fournisseur vocal n'annonce pas le TTS en streaming. Passez le mode de lecture sur buffered si vous voulez que la lecture fonctionne maintenant.", + "settings.speech.compatibility.browserStreamingUnavailable": "Votre navigateur actuel ne peut pas lire en streaming le format TTS selectionne. Choisissez la lecture buffered ou passez a un autre format.", + "settings.speech.compatibility.runtimeNote": "Tous les formats restent selectionnables en mode streaming. Certaines combinaisons navigateur/fournisseur peuvent quand meme echouer au moment de la lecture.", "settings.speech.testPlayback.action": "Tester la lecture", "settings.speech.testPlayback.generating": "Generation de l'extrait", "settings.speech.testPlayback.stop": "Arreter l'extrait", - "settings.speech.testPlayback.sample": "Ceci est un test de lecture vocale utilisant le mode de lecture et le format actuellement selectionnes.", + "settings.speech.testPlayback.sample": "Merci d'utiliser CodeNomad, vos parametres vocaux fonctionnent correctement.", "settings.speech.testPlayback.note": "Le test utilise immediatement le mode et le format actuels. Enregistrez d'abord les changements d'API key, d'URL de base, de modele ou de voix si vous voulez aussi les tester.", "settings.speech.save.action": "Enregistrer", "settings.speech.save.saving": "Enregistrement...", diff --git a/packages/ui/src/lib/i18n/messages/he/settings.ts b/packages/ui/src/lib/i18n/messages/he/settings.ts index 711e6b4e..534a6d30 100644 --- a/packages/ui/src/lib/i18n/messages/he/settings.ts +++ b/packages/ui/src/lib/i18n/messages/he/settings.ts @@ -172,10 +172,13 @@ export const settingsMessages = { "settings.speech.ttsFormat.title": "פורמט פלט", "settings.speech.ttsFormat.subtitle": "בחר את פורמט האודיו לדיבור מסונתז. תמיכת סטרימינג תלויה בספק ובדפדפן.", "settings.speech.help": "קלט קולי לפרומפט מופיע כאשר תמלול קול מוגדר ונתמך. השמעת הודעות משתמשת במצב ובפורמט ה-TTS שנבחרו כאן.", + "settings.speech.compatibility.streamingUnavailable": "תצורת ספק הקול הנוכחית שלך לא מצהירה על TTS בסטרימינג. עבור למצב buffered אם אתה רוצה שהניגון יעבוד כבר עכשיו.", + "settings.speech.compatibility.browserStreamingUnavailable": "הדפדפן הנוכחי שלך לא יכול לנגן בסטרימינג את פורמט ה-TTS שנבחר. בחר בניגון buffered או עבור לפורמט אחר.", + "settings.speech.compatibility.runtimeNote": "כל הפורמטים נשארים זמינים במצב סטרימינג. חלק מהשילובים של דפדפן וספק עדיין עלולים להיכשל בזמן הניגון.", "settings.speech.testPlayback.action": "בדוק ניגון", "settings.speech.testPlayback.generating": "יוצר דוגמה", "settings.speech.testPlayback.stop": "עצור דוגמה", - "settings.speech.testPlayback.sample": "זהו מבחן ניגון קולי המשתמש במצב ובפורמט שנבחרו כרגע.", + "settings.speech.testPlayback.sample": "תודה שאתה משתמש ב-CodeNomad, הגדרות הקול שלך פועלות כראוי.", "settings.speech.testPlayback.note": "המבחן משתמש מיד במצב ובפורמט הנוכחיים. שמור תחילה שינויים ב-API key, ב-Base URL, במודל או בקול אם גם אותם תרצה לבדוק.", "settings.speech.save.action": "שמור", "settings.speech.save.saving": "שומר...", diff --git a/packages/ui/src/lib/i18n/messages/ja/settings.ts b/packages/ui/src/lib/i18n/messages/ja/settings.ts index aec92a00..9373a8ca 100644 --- a/packages/ui/src/lib/i18n/messages/ja/settings.ts +++ b/packages/ui/src/lib/i18n/messages/ja/settings.ts @@ -173,10 +173,13 @@ export const settingsMessages = { "settings.speech.ttsFormat.title": "出力形式", "settings.speech.ttsFormat.subtitle": "音声合成の出力形式を選択します。ストリーミング対応はプロバイダーとブラウザーに依存します。", "settings.speech.help": "プロンプト音声入力は音声文字起こしが設定され対応している場合に表示されます。メッセージ再生にはここで選んだTTSモードと形式が使われます。", + "settings.speech.compatibility.streamingUnavailable": "現在の音声プロバイダー設定ではストリーミングTTSが利用可能として公開されていません。今すぐ再生を使いたい場合は再生モードを buffered に切り替えてください。", + "settings.speech.compatibility.browserStreamingUnavailable": "現在のブラウザーでは、選択したTTS形式をストリーミング再生できません。buffered 再生に切り替えるか、別の形式を選んでください。", + "settings.speech.compatibility.runtimeNote": "ストリーミングモードでも全ての形式を選択できますが、ブラウザーとプロバイダーの組み合わせによっては再生時に失敗することがあります。", "settings.speech.testPlayback.action": "再生をテスト", "settings.speech.testPlayback.generating": "サンプルを生成中", "settings.speech.testPlayback.stop": "サンプルを停止", - "settings.speech.testPlayback.sample": "現在選択している再生モードと形式で音声再生をテストします。", + "settings.speech.testPlayback.sample": "CodeNomad をご利用いただきありがとうございます。音声設定は正常に動作しています。", "settings.speech.testPlayback.note": "このテストは現在の再生モードと形式をすぐに使います。APIキー、Base URL、モデル、音声の変更も試したい場合は先に保存してください。", "settings.speech.save.action": "保存", "settings.speech.save.saving": "保存中...", diff --git a/packages/ui/src/lib/i18n/messages/ru/settings.ts b/packages/ui/src/lib/i18n/messages/ru/settings.ts index d65284bf..2b3e0fa1 100644 --- a/packages/ui/src/lib/i18n/messages/ru/settings.ts +++ b/packages/ui/src/lib/i18n/messages/ru/settings.ts @@ -173,10 +173,13 @@ export const settingsMessages = { "settings.speech.ttsFormat.title": "Формат вывода", "settings.speech.ttsFormat.subtitle": "Выберите аудиоформат для синтезированной речи. Поддержка потокового режима зависит от провайдера и браузера.", "settings.speech.help": "Голосовой ввод появляется, когда распознавание речи настроено и поддерживается. Для воспроизведения сообщений используются выбранные здесь режим и формат TTS.", + "settings.speech.compatibility.streamingUnavailable": "Текущая конфигурация голосового провайдера не заявляет поддержку потокового TTS. Переключите режим воспроизведения на buffered, если хотите, чтобы воспроизведение работало уже сейчас.", + "settings.speech.compatibility.browserStreamingUnavailable": "Ваш текущий браузер не может воспроизводить потоково выбранный формат TTS. Выберите buffered-воспроизведение или переключитесь на другой формат.", + "settings.speech.compatibility.runtimeNote": "В режиме streaming по-прежнему доступны все форматы. Некоторые сочетания браузера и провайдера все равно могут завершаться ошибкой во время воспроизведения.", "settings.speech.testPlayback.action": "Проверить воспроизведение", "settings.speech.testPlayback.generating": "Генерация примера", "settings.speech.testPlayback.stop": "Остановить пример", - "settings.speech.testPlayback.sample": "Это тест голосового воспроизведения с текущим режимом и выбранным форматом.", + "settings.speech.testPlayback.sample": "Спасибо, что используете CodeNomad, ваши настройки речи работают нормально.", "settings.speech.testPlayback.note": "Тест сразу использует текущие режим и формат. Сначала сохраните изменения API key, Base URL, модели или голоса, если хотите проверить и их.", "settings.speech.save.action": "Сохранить", "settings.speech.save.saving": "Сохранение...", diff --git a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts index 99e50350..75303a7b 100644 --- a/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts +++ b/packages/ui/src/lib/i18n/messages/zh-Hans/settings.ts @@ -173,10 +173,13 @@ export const settingsMessages = { "settings.speech.ttsFormat.title": "输出格式", "settings.speech.ttsFormat.subtitle": "选择语音合成的音频格式。流式支持取决于你的提供商和浏览器。", "settings.speech.help": "当语音转写已配置且受支持时,提示框语音输入会显示。消息播放会使用这里选择的 TTS 模式和格式。", + "settings.speech.compatibility.streamingUnavailable": "你当前的语音提供商配置没有声明支持流式 TTS。如果你现在就想让播放可用,请把播放模式切换为 buffered。", + "settings.speech.compatibility.browserStreamingUnavailable": "你当前的浏览器无法流式播放所选的 TTS 格式。请选择 buffered 播放,或切换到其他格式。", + "settings.speech.compatibility.runtimeNote": "在流式模式下仍然可以选择所有格式,但某些浏览器与提供商的组合在播放时仍可能失败。", "settings.speech.testPlayback.action": "测试播放", "settings.speech.testPlayback.generating": "正在生成示例", "settings.speech.testPlayback.stop": "停止示例", - "settings.speech.testPlayback.sample": "这是一个使用当前播放模式和所选格式的语音播放测试。", + "settings.speech.testPlayback.sample": "感谢你使用 CodeNomad,你的语音设置工作正常。", "settings.speech.testPlayback.note": "测试会立即使用当前播放模式和格式。如果你也想测试 API key、Base URL、模型或音色的更改,请先保存。", "settings.speech.save.action": "保存", "settings.speech.save.saving": "保存中...", diff --git a/packages/ui/src/lib/speech-playback-support.ts b/packages/ui/src/lib/speech-playback-support.ts new file mode 100644 index 00000000..672c51f7 --- /dev/null +++ b/packages/ui/src/lib/speech-playback-support.ts @@ -0,0 +1,58 @@ +import type { SpeechCapabilitiesResponse } from "../../../server/src/api-types" +import type { SpeechPlaybackMode, SpeechTtsFormat } from "../stores/preferences" + +export interface SpeechPlaybackSupportResult { + available: boolean + reason?: "unsupported-environment" | "provider-streaming-unavailable" | "browser-streaming-unavailable" +} + +export function formatToMimeType(format: SpeechTtsFormat): string { + if (format === "wav") return "audio/wav" + if (format === "opus") return getSupportedMimeType(format) + if (format === "aac") return "audio/aac" + return "audio/mpeg" +} + +export function getCandidateMimeTypes(format: SpeechTtsFormat): string[] { + if (format === "wav") return ["audio/wav"] + if (format === "opus") { + return ['audio/ogg; codecs="opus"', 'audio/webm; codecs="opus"', "audio/opus"] + } + if (format === "aac") return ["audio/aac", "audio/mp4", 'audio/mp4; codecs="mp4a.40.2"'] + return ["audio/mpeg"] +} + +export function getSupportedMimeType(format: SpeechTtsFormat): string { + const candidates = getCandidateMimeTypes(format) + if (typeof MediaSource === "undefined") { + return candidates[0] + } + return candidates.find((candidate) => MediaSource.isTypeSupported(candidate)) ?? candidates[0] +} + +export function getSpeechPlaybackSupport(options: { + playbackMode: SpeechPlaybackMode + ttsFormat: SpeechTtsFormat + capabilities?: SpeechCapabilitiesResponse | null +}): SpeechPlaybackSupportResult { + if (typeof window === "undefined" || typeof window.Audio === "undefined") { + return { available: false, reason: "unsupported-environment" } + } + + if (options.playbackMode !== "streaming") { + return { available: true } + } + + if (!options.capabilities?.supportsStreamingTts) { + return { available: false, reason: "provider-streaming-unavailable" } + } + + if ( + typeof MediaSource === "undefined" || + !getCandidateMimeTypes(options.ttsFormat).some((candidate) => MediaSource.isTypeSupported(candidate)) + ) { + return { available: false, reason: "browser-streaming-unavailable" } + } + + return { available: true } +}