feat(speech): add configurable TTS playback modes
This commit is contained in:
@@ -219,10 +219,13 @@ export interface SpeechCapabilitiesResponse {
|
|||||||
provider: string
|
provider: string
|
||||||
supportsStt: boolean
|
supportsStt: boolean
|
||||||
supportsTts: boolean
|
supportsTts: boolean
|
||||||
|
supportsStreamingTts: boolean
|
||||||
baseUrl?: string
|
baseUrl?: string
|
||||||
sttModel: string
|
sttModel: string
|
||||||
ttsModel: string
|
ttsModel: string
|
||||||
ttsVoice: string
|
ttsVoice: string
|
||||||
|
ttsFormats: string[]
|
||||||
|
streamingTtsFormats: string[]
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface SpeechTranscriptionResponse {
|
export interface SpeechTranscriptionResponse {
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ const TranscribeBodySchema = z.object({
|
|||||||
|
|
||||||
// Body schema parsed by the streaming synthesis route: trimmed, non-empty text
// plus an optional output format.
const SynthesizeBodySchema = z.object({
  text: z.string().trim().min(1, "Text is required"),
  format: z.optional(z.enum(["mp3", "wav", "opus", "aac"])),
})
|
||||||
|
|
||||||
function getSpeechErrorStatus(error: unknown): number {
|
function getSpeechErrorStatus(error: unknown): number {
|
||||||
@@ -57,4 +57,18 @@ export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) {
|
|||||||
return { error: getSpeechErrorMessage(error, "Failed to synthesize audio") }
|
return { error: getSpeechErrorMessage(error, "Failed to synthesize audio") }
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// POST /api/speech/synthesize/stream: validates the body, asks the speech
// service for an audio stream, and sends that stream as the response.
// Failures are logged and answered with a JSON `{ error }` payload.
app.post("/api/speech/synthesize/stream", async (request, reply) => {
  try {
    const payload = SynthesizeBodySchema.parse(request.body ?? {})
    const synthesis = await deps.speechService.synthesizeStream(payload)
    return reply
      .header("Content-Type", synthesis.mimeType)
      .header("Cache-Control", "no-store")
      .send(synthesis.stream)
  } catch (error) {
    request.log.error({ err: error }, "Failed to stream synthesized audio")
    reply.code(getSpeechErrorStatus(error))
    return { error: getSpeechErrorMessage(error, "Failed to stream synthesized audio") }
  }
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
|
import { Readable } from "node:stream"
|
||||||
import OpenAI from "openai"
|
import OpenAI from "openai"
|
||||||
import { toFile } from "openai/uploads"
|
import { toFile } from "openai/uploads"
|
||||||
import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types"
|
import type { SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../../api-types"
|
||||||
import type { Logger } from "../../logger"
|
import type { Logger } from "../../logger"
|
||||||
import type { NormalizedSpeechSettings, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"
|
import type { NormalizedSpeechSettings, SpeechSynthesisStreamResponse, SynthesizeSpeechInput, TranscribeAudioInput } from "../service"
|
||||||
|
|
||||||
interface OpenAICompatibleSpeechProviderOptions {
|
interface OpenAICompatibleSpeechProviderOptions {
|
||||||
settings: NormalizedSpeechSettings
|
settings: NormalizedSpeechSettings
|
||||||
@@ -20,10 +21,13 @@ export class OpenAICompatibleSpeechProvider {
|
|||||||
provider: settings.provider,
|
provider: settings.provider,
|
||||||
supportsStt: true,
|
supportsStt: true,
|
||||||
supportsTts: true,
|
supportsTts: true,
|
||||||
|
supportsStreamingTts: true,
|
||||||
baseUrl: settings.baseUrl,
|
baseUrl: settings.baseUrl,
|
||||||
sttModel: settings.sttModel,
|
sttModel: settings.sttModel,
|
||||||
ttsModel: settings.ttsModel,
|
ttsModel: settings.ttsModel,
|
||||||
ttsVoice: settings.ttsVoice,
|
ttsVoice: settings.ttsVoice,
|
||||||
|
ttsFormats: ["mp3", "wav", "opus", "aac"],
|
||||||
|
streamingTtsFormats: ["mp3", "wav", "opus", "aac"],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -92,8 +96,7 @@ export class OpenAICompatibleSpeechProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
|
async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
|
||||||
const client = this.createClient()
|
const format = input.format ?? this.options.settings.ttsFormat
|
||||||
const format = input.format ?? "mp3"
|
|
||||||
|
|
||||||
this.options.logger.info(
|
this.options.logger.info(
|
||||||
{
|
{
|
||||||
@@ -104,12 +107,7 @@ export class OpenAICompatibleSpeechProvider {
|
|||||||
"speech.synthesize",
|
"speech.synthesize",
|
||||||
)
|
)
|
||||||
|
|
||||||
const response = await client.audio.speech.create({
|
const response = await this.requestSpeechAudio(input.text, format)
|
||||||
model: this.options.settings.ttsModel,
|
|
||||||
voice: this.options.settings.ttsVoice as any,
|
|
||||||
input: input.text,
|
|
||||||
response_format: format as any,
|
|
||||||
})
|
|
||||||
|
|
||||||
const audioBuffer = Buffer.from(await response.arrayBuffer())
|
const audioBuffer = Buffer.from(await response.arrayBuffer())
|
||||||
return {
|
return {
|
||||||
@@ -118,6 +116,58 @@ export class OpenAICompatibleSpeechProvider {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Requests synthesized audio and returns the response body as a Node stream.
 *
 * The output format is `input.format` when given, otherwise the configured
 * `ttsFormat`. The returned MIME type is derived from that format.
 *
 * @throws Error when the provider response has no body.
 */
async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> {
  const { settings, logger } = this.options
  const format = input.format ?? settings.ttsFormat

  logger.info(
    {
      model: settings.ttsModel,
      voice: settings.ttsVoice,
      format,
    },
    "speech.synthesize.stream",
  )

  const upstream = await this.requestSpeechAudio(input.text, format)
  const webStream = upstream.body
  if (!webStream) {
    throw new Error("Speech provider did not return a stream.")
  }

  // Bridge the fetch (web) stream into a Node Readable for the HTTP layer.
  const stream = Readable.fromWeb(webStream as any)
  return { stream, mimeType: mimeTypeForFormat(format) }
}
|
||||||
|
|
||||||
|
/**
 * Sends a POST to `<baseUrl>/audio/speech` with the configured model and voice
 * and returns the raw fetch response, so callers can either buffer it or
 * stream its body.
 *
 * @param text   Text to synthesize.
 * @param format Audio format requested from the provider.
 * @returns The successful (2xx) provider response.
 * @throws Error when no API key is configured, or when the provider answers
 *         with a non-2xx status. The message is the response body when it is
 *         non-empty, otherwise a message carrying the status code.
 */
private async requestSpeechAudio(text: string, format: "mp3" | "wav" | "opus" | "aac"): Promise<Response> {
  const { settings } = this.options
  if (!settings.apiKey) {
    throw new Error("Speech provider is not configured. Add an API key in Speech settings.")
  }

  // `??` alone lets an empty or whitespace-only base URL through, and
  // `new URL("audio/speech", "/")` then throws "Invalid URL". Treat a blank
  // value like a missing one.
  const baseUrl = settings.baseUrl?.trim() || "https://api.openai.com/v1"
  // The trailing slash keeps the last path segment (e.g. `/v1`) when resolving.
  const endpoint = new URL("audio/speech", ensureTrailingSlash(baseUrl))

  const response = await fetch(endpoint, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${settings.apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      model: settings.ttsModel,
      voice: settings.ttsVoice,
      input: text,
      response_format: format,
    }),
  })

  if (!response.ok) {
    // If the error body cannot be read, still report the HTTP status rather
    // than surfacing the secondary read failure.
    const detail = await response.text().catch(() => "")
    throw new Error(detail || `Speech synthesis failed with ${response.status}`)
  }

  return response
}
|
||||||
|
|
||||||
private createClient(): OpenAI {
|
private createClient(): OpenAI {
|
||||||
const { settings } = this.options
|
const { settings } = this.options
|
||||||
if (!settings.apiKey) {
|
if (!settings.apiKey) {
|
||||||
@@ -141,8 +191,13 @@ function extensionForMime(mimeType: string): string {
|
|||||||
return "webm"
|
return "webm"
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Maps a synthesis output format to the MIME type reported to clients.
 *
 * The switch is exhaustive: adding a format to the union without adding a
 * case here becomes a compile error instead of silently returning
 * "audio/mpeg".
 *
 * @param format Audio format requested from the provider.
 * @returns The MIME type string for that format.
 */
function mimeTypeForFormat(format: "mp3" | "wav" | "opus" | "aac"): string {
  switch (format) {
    case "wav":
      return "audio/wav"
    case "opus":
      return "audio/opus"
    case "aac":
      return "audio/aac"
    case "mp3":
      return "audio/mpeg"
    default: {
      // Unreachable for typed callers; keeps the previous runtime fallback
      // for untyped values.
      const unhandled: never = format
      void unhandled
      return "audio/mpeg"
    }
  }
}
|
||||||
|
|
||||||
|
/** Returns `value` unchanged when it already ends with "/", otherwise with "/" appended. */
function ensureTrailingSlash(value: string): string {
  if (value.endsWith("/")) {
    return value
  }
  return value + "/"
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import { z } from "zod"
|
import { z } from "zod"
|
||||||
|
import type { Readable } from "node:stream"
|
||||||
import type { Logger } from "../logger"
|
import type { Logger } from "../logger"
|
||||||
import type { SettingsService } from "../settings/service"
|
import type { SettingsService } from "../settings/service"
|
||||||
import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
|
import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
|
||||||
@@ -13,6 +14,7 @@ const ServerSpeechSettingsSchema = z.object({
|
|||||||
sttModel: z.string().optional(),
|
sttModel: z.string().optional(),
|
||||||
ttsModel: z.string().optional(),
|
ttsModel: z.string().optional(),
|
||||||
ttsVoice: z.string().optional(),
|
ttsVoice: z.string().optional(),
|
||||||
|
ttsFormat: z.enum(["mp3", "wav", "opus", "aac"]).optional(),
|
||||||
})
|
})
|
||||||
.optional(),
|
.optional(),
|
||||||
})
|
})
|
||||||
@@ -27,13 +29,19 @@ export interface TranscribeAudioInput {
|
|||||||
|
|
||||||
/** Input for a speech synthesis request. */
export interface SynthesizeSpeechInput {
  /** Text to synthesize. */
  text: string
  /** Requested audio format; the provider falls back to the configured `ttsFormat` when omitted. */
  format?: "mp3" | "wav" | "opus" | "aac"
}

/** Result of a streaming synthesis request. */
export interface SpeechSynthesisStreamResponse {
  /** Node stream carrying the audio bytes. */
  stream: Readable
  /** MIME type of the audio carried by `stream`. */
  mimeType: string
}
|
||||||
|
|
||||||
/** Operations that `SpeechService` calls on the provider it creates. */
export interface SpeechProvider {
  /** Describes what the provider supports and how it is configured. */
  getCapabilities(): SpeechCapabilitiesResponse
  /** Speech-to-text. */
  transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse>
  /** Text-to-speech returning a complete response. */
  synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse>
  /** Text-to-speech returning a readable stream plus its MIME type. */
  synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse>
}
|
||||||
|
|
||||||
export interface NormalizedSpeechSettings {
|
export interface NormalizedSpeechSettings {
|
||||||
@@ -43,12 +51,14 @@ export interface NormalizedSpeechSettings {
|
|||||||
sttModel: string
|
sttModel: string
|
||||||
ttsModel: string
|
ttsModel: string
|
||||||
ttsVoice: string
|
ttsVoice: string
|
||||||
|
ttsFormat: "mp3" | "wav" | "opus" | "aac"
|
||||||
}
|
}
|
||||||
|
|
||||||
const DEFAULT_PROVIDER = "openai-compatible"
|
const DEFAULT_PROVIDER = "openai-compatible"
|
||||||
const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
|
const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
|
||||||
const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
|
const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
|
||||||
const DEFAULT_TTS_VOICE = "alloy"
|
const DEFAULT_TTS_VOICE = "alloy"
|
||||||
|
const DEFAULT_TTS_FORMAT = "mp3"
|
||||||
export class SpeechService {
|
export class SpeechService {
|
||||||
constructor(
|
constructor(
|
||||||
private readonly settings: SettingsService,
|
private readonly settings: SettingsService,
|
||||||
@@ -67,6 +77,10 @@ export class SpeechService {
|
|||||||
return this.createProvider().synthesize(input)
|
return this.createProvider().synthesize(input)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Creates a provider and delegates the streaming synthesis request to it. */
async synthesizeStream(input: SynthesizeSpeechInput): Promise<SpeechSynthesisStreamResponse> {
  const provider = this.createProvider()
  return provider.synthesizeStream(input)
}
|
||||||
|
|
||||||
private createProvider(): SpeechProvider {
|
private createProvider(): SpeechProvider {
|
||||||
const settings = this.resolveSettings()
|
const settings = this.resolveSettings()
|
||||||
return new OpenAICompatibleSpeechProvider({
|
return new OpenAICompatibleSpeechProvider({
|
||||||
@@ -86,6 +100,7 @@ export class SpeechService {
|
|||||||
sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
|
sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
|
||||||
ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
|
ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
|
||||||
ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
|
ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
|
||||||
|
ttsFormat: speech.ttsFormat ?? DEFAULT_TTS_FORMAT,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
import { Show, createEffect, createMemo, createSignal, type Component } from "solid-js"
|
import { For, Show, createEffect, createMemo, createSignal, type Component } from "solid-js"
|
||||||
import { Mic, Volume2 } from "lucide-solid"
|
import { Loader2, Mic, Square, Volume2 } from "lucide-solid"
|
||||||
import { useConfig, type SpeechSettings } from "../../stores/preferences"
|
import { useConfig, type SpeechSettings } from "../../stores/preferences"
|
||||||
import { useI18n } from "../../lib/i18n"
|
import { useI18n } from "../../lib/i18n"
|
||||||
import { loadSpeechCapabilities, speechCapabilities, speechCapabilitiesError, speechCapabilitiesLoading } from "../../stores/speech"
|
import { loadSpeechCapabilities, speechCapabilities, speechCapabilitiesError, speechCapabilitiesLoading } from "../../stores/speech"
|
||||||
import { getLogger } from "../../lib/logger"
|
import { getLogger } from "../../lib/logger"
|
||||||
|
import { useSpeech } from "../../lib/hooks/use-speech"
|
||||||
|
|
||||||
const log = getLogger("actions")
|
const log = getLogger("actions")
|
||||||
|
|
||||||
@@ -13,6 +14,8 @@ type DraftFields = {
|
|||||||
sttModel: string
|
sttModel: string
|
||||||
ttsModel: string
|
ttsModel: string
|
||||||
ttsVoice: string
|
ttsVoice: string
|
||||||
|
playbackMode: SpeechSettings["playbackMode"]
|
||||||
|
ttsFormat: SpeechSettings["ttsFormat"]
|
||||||
}
|
}
|
||||||
|
|
||||||
function createDraftFields(speech: SpeechSettings): DraftFields {
|
function createDraftFields(speech: SpeechSettings): DraftFields {
|
||||||
@@ -22,11 +25,21 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
|
|||||||
sttModel: speech.sttModel,
|
sttModel: speech.sttModel,
|
||||||
ttsModel: speech.ttsModel,
|
ttsModel: speech.ttsModel,
|
||||||
ttsVoice: speech.ttsVoice,
|
ttsVoice: speech.ttsVoice,
|
||||||
|
playbackMode: speech.playbackMode,
|
||||||
|
ttsFormat: speech.ttsFormat,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns true when both drafts hold strictly equal values for every compared field. */
function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
  const comparedFields = [
    "apiKey",
    "baseUrl",
    "sttModel",
    "ttsModel",
    "ttsVoice",
    "playbackMode",
    "ttsFormat",
  ] as const

  // `every` stops at the first mismatch, like the chained `&&` comparison.
  return comparedFields.every((field) => a[field] === b[field])
}
|
||||||
|
|
||||||
export const SpeechSettingsCard: Component = () => {
|
export const SpeechSettingsCard: Component = () => {
|
||||||
@@ -39,6 +52,15 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
const [apiKeyTouched, setApiKeyTouched] = createSignal(false)
|
const [apiKeyTouched, setApiKeyTouched] = createSignal(false)
|
||||||
const [clearStoredApiKey, setClearStoredApiKey] = createSignal(false)
|
const [clearStoredApiKey, setClearStoredApiKey] = createSignal(false)
|
||||||
|
|
||||||
|
const testSpeech = useSpeech({
|
||||||
|
id: () => "settings-speech-test",
|
||||||
|
text: () => t("settings.speech.testPlayback.sample"),
|
||||||
|
settingsOverride: () => ({
|
||||||
|
playbackMode: drafts().playbackMode,
|
||||||
|
ttsFormat: drafts().ttsFormat,
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
|
||||||
createEffect(() => {
|
createEffect(() => {
|
||||||
const speech = serverSettings().speech
|
const speech = serverSettings().speech
|
||||||
const nextDrafts = createDraftFields(speech)
|
const nextDrafts = createDraftFields(speech)
|
||||||
@@ -84,7 +106,9 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
(current.baseUrl || "") !== (speech.baseUrl || "") ||
|
(current.baseUrl || "") !== (speech.baseUrl || "") ||
|
||||||
current.sttModel !== speech.sttModel ||
|
current.sttModel !== speech.sttModel ||
|
||||||
current.ttsModel !== speech.ttsModel ||
|
current.ttsModel !== speech.ttsModel ||
|
||||||
current.ttsVoice !== speech.ttsVoice
|
current.ttsVoice !== speech.ttsVoice ||
|
||||||
|
current.playbackMode !== speech.playbackMode ||
|
||||||
|
current.ttsFormat !== speech.ttsFormat
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -108,6 +132,8 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
sttModel: current.sttModel.trim() || undefined,
|
sttModel: current.sttModel.trim() || undefined,
|
||||||
ttsModel: current.ttsModel.trim() || undefined,
|
ttsModel: current.ttsModel.trim() || undefined,
|
||||||
ttsVoice: current.ttsVoice.trim() || undefined,
|
ttsVoice: current.ttsVoice.trim() || undefined,
|
||||||
|
playbackMode: current.playbackMode,
|
||||||
|
ttsFormat: current.ttsFormat,
|
||||||
})
|
})
|
||||||
await loadSpeechCapabilities(true)
|
await loadSpeechCapabilities(true)
|
||||||
setDrafts({
|
setDrafts({
|
||||||
@@ -116,6 +142,8 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
|
sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
|
||||||
ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
|
ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
|
||||||
ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
|
ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
|
||||||
|
playbackMode: current.playbackMode,
|
||||||
|
ttsFormat: current.ttsFormat,
|
||||||
})
|
})
|
||||||
setApiKeyTouched(false)
|
setApiKeyTouched(false)
|
||||||
setClearStoredApiKey(false)
|
setClearStoredApiKey(false)
|
||||||
@@ -151,6 +179,32 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
<span class="settings-inline-note">{t("settings.speech.provider.openaiCompatible")}</span>
|
<span class="settings-inline-note">{t("settings.speech.provider.openaiCompatible")}</span>
|
||||||
<span class="settings-inline-note">{capabilityLabel()}</span>
|
<span class="settings-inline-note">{capabilityLabel()}</span>
|
||||||
<span class="settings-inline-note">{saveStatusLabel()}</span>
|
<span class="settings-inline-note">{saveStatusLabel()}</span>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
class="selector-button selector-button-secondary w-auto whitespace-nowrap inline-flex items-center gap-2"
|
||||||
|
onClick={() => void testSpeech.toggle()}
|
||||||
|
disabled={isSaving()}
|
||||||
|
title={testSpeech.buttonTitle()}
|
||||||
|
aria-label={testSpeech.buttonTitle()}
|
||||||
|
>
|
||||||
|
<Show
|
||||||
|
when={testSpeech.isLoading()}
|
||||||
|
fallback={
|
||||||
|
<Show when={testSpeech.isPlaying()} fallback={<Volume2 class="w-3.5 h-3.5" aria-hidden="true" />}>
|
||||||
|
<Square class="w-3.5 h-3.5" aria-hidden="true" />
|
||||||
|
</Show>
|
||||||
|
}
|
||||||
|
>
|
||||||
|
<Loader2 class="w-3.5 h-3.5 animate-spin" aria-hidden="true" />
|
||||||
|
</Show>
|
||||||
|
<span>
|
||||||
|
{testSpeech.isPlaying()
|
||||||
|
? t("settings.speech.testPlayback.stop")
|
||||||
|
: testSpeech.isLoading()
|
||||||
|
? t("settings.speech.testPlayback.generating")
|
||||||
|
: t("settings.speech.testPlayback.action")}
|
||||||
|
</span>
|
||||||
|
</button>
|
||||||
<button
|
<button
|
||||||
type="button"
|
type="button"
|
||||||
class="selector-button selector-button-primary w-auto whitespace-nowrap"
|
class="selector-button selector-button-primary w-auto whitespace-nowrap"
|
||||||
@@ -213,8 +267,31 @@ export const SpeechSettingsCard: Component = () => {
|
|||||||
onInput={(value) => updateDraft("ttsVoice", value)}
|
onInput={(value) => updateDraft("ttsVoice", value)}
|
||||||
icon={<Mic class="w-3.5 h-3.5 icon-muted flex-shrink-0" />}
|
icon={<Mic class="w-3.5 h-3.5 icon-muted flex-shrink-0" />}
|
||||||
/>
|
/>
|
||||||
|
<SelectField
|
||||||
|
label={t("settings.speech.playbackMode.title")}
|
||||||
|
caption={t("settings.speech.playbackMode.subtitle")}
|
||||||
|
value={drafts().playbackMode}
|
||||||
|
onInput={(value) => updateDraft("playbackMode", value as DraftFields["playbackMode"])}
|
||||||
|
options={[
|
||||||
|
{ value: "streaming", label: t("settings.speech.playbackMode.streaming") },
|
||||||
|
{ value: "buffered", label: t("settings.speech.playbackMode.buffered") },
|
||||||
|
]}
|
||||||
|
/>
|
||||||
|
<SelectField
|
||||||
|
label={t("settings.speech.ttsFormat.title")}
|
||||||
|
caption={t("settings.speech.ttsFormat.subtitle")}
|
||||||
|
value={drafts().ttsFormat}
|
||||||
|
onInput={(value) => updateDraft("ttsFormat", value as DraftFields["ttsFormat"])}
|
||||||
|
options={[
|
||||||
|
{ value: "mp3", label: "MP3" },
|
||||||
|
{ value: "wav", label: "WAV" },
|
||||||
|
{ value: "opus", label: "Opus" },
|
||||||
|
{ value: "aac", label: "AAC" },
|
||||||
|
]}
|
||||||
|
/>
|
||||||
|
|
||||||
<div class="settings-inline-note">{t("settings.speech.help")}</div>
|
<div class="settings-inline-note">{t("settings.speech.help")}</div>
|
||||||
|
<div class="settings-inline-note">{t("settings.speech.testPlayback.note")}</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
@@ -249,4 +326,26 @@ const Field: Component<{
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Settings row with a title and caption on one side and a `<select>` on the
 * other. `onInput` receives the value of the chosen option.
 */
const SelectField: Component<{
  label: string
  caption: string
  value: string
  onInput: (value: string) => void
  options: Array<{ value: string; label: string }>
}> = (props) => {
  // `props` is read lazily (never destructured) so Solid keeps tracking it.
  const forwardSelection = (event: Event & { currentTarget: HTMLSelectElement }) => {
    props.onInput(event.currentTarget.value)
  }

  return (
    <div class="settings-toggle-row settings-toggle-row-compact">
      <div>
        <div class="settings-toggle-title">{props.label}</div>
        <div class="settings-toggle-caption">{props.caption}</div>
      </div>
      <div class="min-w-[18rem] max-w-[24rem] w-full">
        <select value={props.value} onInput={forwardSelection} class="selector-input w-full">
          <For each={props.options}>
            {(choice) => <option value={choice.value}>{choice.label}</option>}
          </For>
        </select>
      </div>
    </div>
  )
}
|
||||||
|
|
||||||
export default SpeechSettingsCard
|
export default SpeechSettingsCard
|
||||||
|
|||||||
@@ -123,6 +123,28 @@ async function request<T>(path: string, init?: RequestInit): Promise<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Performs a fetch against the API and returns the raw `Response`, so callers
 * can read the body themselves (e.g. as a stream).
 *
 * Defaults `Content-Type` to JSON when a body is present and no type was set,
 * sends credentials with "include" unless overridden, and logs the request
 * and its outcome.
 *
 * @throws Error on a non-OK status; the message is the response text, or a
 *         message carrying the status code when the text is empty.
 */
async function requestRaw(path: string, init?: RequestInit): Promise<Response> {
  const url = API_BASE ? new URL(path, API_BASE).toString() : path

  const headers = normalizeHeaders(init?.headers)
  const hasBody = init?.body !== undefined
  if (hasBody && !headers["Content-Type"]) {
    headers["Content-Type"] = "application/json"
  }

  const method = (init?.method ?? "GET").toUpperCase()
  const startedAt = Date.now()
  logHttp(`${method} ${path}`)

  const response = await fetch(url, { ...init, headers, credentials: init?.credentials ?? "include" })

  if (response.ok) {
    logHttp(`${method} ${path} -> ${response.status}`, { durationMs: Date.now() - startedAt })
    return response
  }

  const message = await response.text()
  logHttp(`${method} ${path} -> ${response.status}`, { durationMs: Date.now() - startedAt, error: message })
  throw new Error(message || `Request failed with ${response.status}`)
}
|
||||||
|
|
||||||
|
|
||||||
export const serverApi = {
|
export const serverApi = {
|
||||||
fetchWorkspaces(): Promise<WorkspaceDescriptor[]> {
|
fetchWorkspaces(): Promise<WorkspaceDescriptor[]> {
|
||||||
@@ -253,12 +275,22 @@ export const serverApi = {
|
|||||||
body: JSON.stringify(payload),
|
body: JSON.stringify(payload),
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
synthesizeSpeech(payload: { text: string; format?: "mp3" | "wav" | "opus" }): Promise<SpeechSynthesisResponse> {
|
synthesizeSpeech(payload: { text: string; format?: "mp3" | "wav" | "opus" | "aac" }): Promise<SpeechSynthesisResponse> {
|
||||||
return request<SpeechSynthesisResponse>("/api/speech/synthesize", {
|
return request<SpeechSynthesisResponse>("/api/speech/synthesize", {
|
||||||
method: "POST",
|
method: "POST",
|
||||||
body: JSON.stringify(payload),
|
body: JSON.stringify(payload),
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
|
/**
 * Starts a streaming synthesis request and resolves with the raw response.
 * `signal` is forwarded to the underlying fetch.
 */
synthesizeSpeechStream(
  payload: { text: string; format?: "mp3" | "wav" | "opus" | "aac" },
  signal?: AbortSignal,
): Promise<Response> {
  const init: RequestInit = {
    method: "POST",
    body: JSON.stringify(payload),
    signal,
  }
  return requestRaw("/api/speech/synthesize/stream", init)
},
|
||||||
listFileSystem(path?: string, options?: { includeFiles?: boolean }): Promise<FileSystemListResponse> {
|
listFileSystem(path?: string, options?: { includeFiles?: boolean }): Promise<FileSystemListResponse> {
|
||||||
const params = new URLSearchParams()
|
const params = new URLSearchParams()
|
||||||
if (path && path !== ".") {
|
if (path && path !== ".") {
|
||||||
|
|||||||
@@ -3,12 +3,14 @@ import { showAlertDialog } from "../../stores/alerts"
|
|||||||
import { serverApi } from "../api-client"
|
import { serverApi } from "../api-client"
|
||||||
import { useI18n } from "../i18n"
|
import { useI18n } from "../i18n"
|
||||||
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
||||||
|
import { useConfig, type SpeechSettings } from "../../stores/preferences"
|
||||||
|
|
||||||
type SpeechPlaybackState = "idle" | "loading" | "playing"
|
type SpeechPlaybackState = "idle" | "loading" | "playing"
|
||||||
|
|
||||||
/** Options accepted by `useSpeech`. */
interface UseSpeechOptions {
  /** Accessor for the identifier of this hook instance. */
  id: Accessor<string>
  /** Accessor for the text to speak. */
  text: Accessor<string>
  /** Optional playback-mode / format values layered over the stored speech settings. */
  settingsOverride?: Accessor<Partial<Pick<SpeechSettings, "playbackMode" | "ttsFormat">>>
}
|
||||||
|
|
||||||
interface ActivePlaybackEntry {
|
interface ActivePlaybackEntry {
|
||||||
@@ -44,17 +46,25 @@ function setActivePlayback(ownerId: string, stop: () => void) {
|
|||||||
|
|
||||||
export function useSpeech(options: UseSpeechOptions) {
|
export function useSpeech(options: UseSpeechOptions) {
|
||||||
const { t } = useI18n()
|
const { t } = useI18n()
|
||||||
|
const { serverSettings } = useConfig()
|
||||||
const [state, setState] = createSignal<SpeechPlaybackState>("idle")
|
const [state, setState] = createSignal<SpeechPlaybackState>("idle")
|
||||||
|
|
||||||
let requestVersion = 0
|
let requestVersion = 0
|
||||||
let audio: HTMLAudioElement | null = null
|
let audio: HTMLAudioElement | null = null
|
||||||
let objectUrl: string | null = null
|
let objectUrl: string | null = null
|
||||||
|
let mediaSource: MediaSource | null = null
|
||||||
|
let abortController: AbortController | null = null
|
||||||
|
|
||||||
createEffect(() => {
|
createEffect(() => {
|
||||||
void loadSpeechCapabilities()
|
void loadSpeechCapabilities()
|
||||||
})
|
})
|
||||||
|
|
||||||
const cleanupAudio = () => {
|
const cleanupAudio = () => {
|
||||||
|
if (abortController) {
|
||||||
|
abortController.abort()
|
||||||
|
abortController = null
|
||||||
|
}
|
||||||
|
|
||||||
if (audio) {
|
if (audio) {
|
||||||
audio.pause()
|
audio.pause()
|
||||||
audio.currentTime = 0
|
audio.currentTime = 0
|
||||||
@@ -63,6 +73,8 @@ export function useSpeech(options: UseSpeechOptions) {
|
|||||||
audio = null
|
audio = null
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mediaSource = null
|
||||||
|
|
||||||
if (objectUrl) {
|
if (objectUrl) {
|
||||||
URL.revokeObjectURL(objectUrl)
|
URL.revokeObjectURL(objectUrl)
|
||||||
objectUrl = null
|
objectUrl = null
|
||||||
@@ -85,9 +97,20 @@ export function useSpeech(options: UseSpeechOptions) {
|
|||||||
|
|
||||||
const isSupported = () => typeof window !== "undefined" && typeof window.Audio !== "undefined"
|
const isSupported = () => typeof window !== "undefined" && typeof window.Audio !== "undefined"
|
||||||
|
|
||||||
|
// Stored speech settings with any caller-supplied overrides applied on top.
const resolvedSettings = () => {
  const stored = serverSettings().speech
  const overrides = options.settingsOverride?.() ?? {}
  return { ...stored, ...overrides }
}
|
||||||
|
|
||||||
// True when audio playback is possible in this environment and the server
// reports TTS as available, configured and supported. In streaming mode the
// server must additionally report streaming TTS support.
const canUseSpeech = () => {
  const capabilities = speechCapabilities()
  const ttsReady =
    isSupported() && capabilities?.available && capabilities?.configured && capabilities?.supportsTts
  if (!ttsReady) {
    return false
  }

  const wantsStreaming = resolvedSettings().playbackMode === "streaming"
  return wantsStreaming ? Boolean(capabilities?.supportsStreamingTts) : true
}
|
||||||
|
|
||||||
const stop = () => {
|
const stop = () => {
|
||||||
@@ -125,38 +148,15 @@ export function useSpeech(options: UseSpeechOptions) {
|
|||||||
cleanupAudio()
|
cleanupAudio()
|
||||||
setState("loading")
|
setState("loading")
|
||||||
|
|
||||||
|
const settings = resolvedSettings()
|
||||||
|
const format = settings.ttsFormat
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await serverApi.synthesizeSpeech({
|
if (settings.playbackMode === "streaming") {
|
||||||
text,
|
await startStreamingPlayback(ownerId, currentRequest, text, format)
|
||||||
format: "mp3",
|
} else {
|
||||||
})
|
await startBufferedPlayback(ownerId, currentRequest, text, format)
|
||||||
|
|
||||||
if (currentRequest !== requestVersion) {
|
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const nextUrl = createObjectUrlFromBase64(response.audioBase64, response.mimeType)
|
|
||||||
const nextAudio = new Audio(nextUrl)
|
|
||||||
objectUrl = nextUrl
|
|
||||||
audio = nextAudio
|
|
||||||
|
|
||||||
const finish = () => {
|
|
||||||
if (activePlayback?.ownerId === ownerId) {
|
|
||||||
activePlayback = null
|
|
||||||
}
|
|
||||||
resetOwnerState(ownerId)
|
|
||||||
}
|
|
||||||
|
|
||||||
nextAudio.addEventListener("ended", finish, { once: true })
|
|
||||||
nextAudio.addEventListener("error", finish, { once: true })
|
|
||||||
|
|
||||||
setActivePlayback(ownerId, () => {
|
|
||||||
cleanupAudio()
|
|
||||||
setState("idle")
|
|
||||||
})
|
|
||||||
|
|
||||||
setState("playing")
|
|
||||||
await nextAudio.play()
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (currentRequest !== requestVersion) {
|
if (currentRequest !== requestVersion) {
|
||||||
return
|
return
|
||||||
@@ -170,6 +170,102 @@ export function useSpeech(options: UseSpeechOptions) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Buffered playback: fetches the complete synthesized audio, wraps it in an
 * object URL and plays it through an `Audio` element.
 *
 * Returns without playing when a newer request has superseded this one while
 * the synthesis call was in flight.
 */
async function startBufferedPlayback(
  ownerId: string,
  currentRequest: number,
  text: string,
  format: "mp3" | "wav" | "opus" | "aac",
) {
  const synthesized = await serverApi.synthesizeSpeech({ text, format })

  // A newer request bumped `requestVersion`; drop this stale result.
  if (currentRequest !== requestVersion) {
    return
  }

  const blobUrl = createObjectUrlFromBase64(synthesized.audioBase64, synthesized.mimeType)
  const player = new Audio(blobUrl)
  objectUrl = blobUrl
  audio = player

  attachPlaybackLifecycle(ownerId, player)

  const stopPlayback = () => {
    cleanupAudio()
    setState("idle")
  }
  setActivePlayback(ownerId, stopPlayback)

  setState("playing")
  await player.play()
}
|
||||||
|
|
||||||
|
/**
 * Streaming playback: fetches synthesized speech as a byte stream and feeds it
 * into a `MediaSource` so the audio element can start playing before the whole
 * clip has arrived.
 *
 * The returned promise settles when the stream has been fully appended
 * (`onComplete`) or when appending/playing fails; it does not wait for the
 * audio element to finish playing.
 *
 * @param ownerId - Owner registered as the active playback for this clip.
 * @param currentRequest - Request version captured by the caller; work is
 *   abandoned when it no longer equals `requestVersion`.
 * @param text - Text to synthesize.
 * @param format - Audio format requested from the server; also used to derive
 *   a MIME type when the response carries no usable `content-type` header.
 * @throws Error when `MediaSource` is unavailable, the MIME type cannot be
 *   streamed, the response has no body, or appending/playback fails.
 */
async function startStreamingPlayback(
  ownerId: string,
  currentRequest: number,
  text: string,
  format: "mp3" | "wav" | "opus" | "aac",
) {
  if (typeof MediaSource === "undefined") {
    throw new Error("MediaSource is not available in this browser.")
  }

  const controller = new AbortController()
  // NOTE(review): stored in the outer variable presumably so cleanup code can
  // cancel the request — confirm against cleanupAudio.
  abortController = controller
  const response = await serverApi.synthesizeSpeechStream({ text, format }, controller.signal)

  // Same staleness guard as startBufferedPlayback: if the request version moved
  // on while the fetch was in flight, do not touch the shared playback state,
  // and stop the download we no longer need.
  if (currentRequest !== requestVersion) {
    controller.abort()
    return
  }

  // `||` (not `??`) on purpose: an empty header value also falls back.
  const mimeType = response.headers.get("content-type") || formatToMimeType(format)

  if (!MediaSource.isTypeSupported(mimeType)) {
    // Do not leave the response streaming in the background.
    controller.abort()
    throw new Error(`Streaming playback is not supported for ${mimeType}.`)
  }

  const stream = response.body
  if (!stream) {
    controller.abort()
    throw new Error("Speech stream did not include a response body.")
  }

  const nextMediaSource = new MediaSource()
  const nextObjectUrl = URL.createObjectURL(nextMediaSource)
  const nextAudio = new Audio(nextObjectUrl)
  mediaSource = nextMediaSource
  objectUrl = nextObjectUrl
  audio = nextAudio

  attachPlaybackLifecycle(ownerId, nextAudio)
  setActivePlayback(ownerId, () => {
    cleanupAudio()
    setState("idle")
  })

  await new Promise<void>((resolve, reject) => {
    const handleSourceOpen = () => {
      // Fire-and-forget: the helper reports its outcome via the callbacks.
      void streamToMediaSource({
        mediaSource: nextMediaSource,
        stream,
        mimeType,
        audioElement: nextAudio,
        onPlayable: async () => {
          // Skip starting playback for a request that has been superseded.
          if (currentRequest !== requestVersion) return
          if (state() !== "playing") {
            setState("playing")
          }
          try {
            await nextAudio.play()
          } catch (error) {
            reject(error)
          }
        },
        onComplete: resolve,
        onError: reject,
      })
    }

    // `once: true` already detaches the listener after the first event.
    nextMediaSource.addEventListener("sourceopen", handleSourceOpen, { once: true })
    nextAudio.addEventListener(
      "error",
      () => reject(new Error("Unable to play streamed speech.")),
      { once: true },
    )
  })
}
|
||||||
|
|
||||||
const toggle = async () => {
|
const toggle = async () => {
|
||||||
if (state() === "idle") {
|
if (state() === "idle") {
|
||||||
await start()
|
await start()
|
||||||
@@ -193,6 +289,100 @@ export function useSpeech(options: UseSpeechOptions) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Wires an audio element so that reaching the end of playback, or hitting a
 * media error, releases the owner's playback state.
 *
 * @param ownerId - Owner whose state is reset when the element ends or errors.
 * @param audio - Element to observe; each listener fires at most once.
 */
function attachPlaybackLifecycle(ownerId: string, audio: HTMLAudioElement) {
  const releaseOwner = () => {
    // Only clear the active slot when it still belongs to this owner.
    const ownsActiveSlot = activePlayback?.ownerId === ownerId
    if (ownsActiveSlot) {
      activePlayback = null
    }
    resetOwnerState(ownerId)
  }

  for (const eventName of ["ended", "error"] as const) {
    audio.addEventListener(eventName, releaseOwner, { once: true })
  }
}
|
||||||
|
|
||||||
|
/**
 * Pumps a byte stream into a `MediaSource`, one chunk at a time.
 *
 * Chunks are appended strictly in order; `onPlayable` is awaited once, right
 * after the first chunk has been appended. When the stream is exhausted the
 * media source is ended (if still open) and `onComplete` is called. Any
 * failure is reported through `onError` instead of being thrown, and the
 * reader is cancelled so the underlying download does not keep running.
 *
 * @param options.mediaSource - Open media source that receives the audio.
 * @param options.stream - Encoded audio bytes to append.
 * @param options.mimeType - MIME type used to create the source buffer.
 * @param options.audioElement - Accepted for interface compatibility; not read here.
 * @param options.onPlayable - Awaited after the first successful append.
 * @param options.onComplete - Called after every chunk has been appended.
 * @param options.onError - Called with the error when anything fails.
 */
async function streamToMediaSource(options: {
  mediaSource: MediaSource
  stream: ReadableStream<Uint8Array>
  mimeType: string
  audioElement: HTMLAudioElement
  onPlayable: () => Promise<void>
  onComplete: () => void
  onError: (error: unknown) => void
}) {
  let reader: ReadableStreamDefaultReader<Uint8Array> | undefined
  try {
    const sourceBuffer = options.mediaSource.addSourceBuffer(options.mimeType)
    const activeReader = options.stream.getReader()
    reader = activeReader
    let startedPlayback = false

    while (true) {
      const { done, value } = await activeReader.read()
      if (done) break
      // Nothing to append for empty reads.
      if (!value || value.byteLength === 0) continue

      // appendBuffer is not allowed while an update is in progress, so wait
      // for the buffer to go idle instead of polling it.
      while (sourceBuffer.updating) {
        await waitForUpdateEnd(sourceBuffer)
      }
      await appendChunk(sourceBuffer, value)

      if (!startedPlayback) {
        startedPlayback = true
        await options.onPlayable()
      }
    }

    // endOfStream must not be called while the buffer is still updating.
    while (sourceBuffer.updating) {
      await waitForUpdateEnd(sourceBuffer)
    }
    if (options.mediaSource.readyState === "open") {
      options.mediaSource.endOfStream()
    }
    options.onComplete()
  } catch (error) {
    // Best effort: stop the download and release the stream lock. A failure to
    // cancel (e.g. the stream already errored) adds nothing to the report.
    void reader?.cancel().catch(() => undefined)
    options.onError(error)
  }
}
|
||||||
|
|
||||||
|
/**
 * Appends one chunk to a source buffer and waits for the append to finish.
 *
 * @param sourceBuffer - Buffer to append to.
 * @param chunk - Bytes to append; only the bytes covered by this view are sent.
 * @returns A promise that resolves on the buffer's `updateend` event.
 * @throws Rejects with "Failed to append audio stream chunk." on the buffer's
 *   `error` event, or with the original exception when `appendBuffer` throws
 *   synchronously.
 */
function appendChunk(sourceBuffer: SourceBuffer, chunk: Uint8Array): Promise<void> {
  return new Promise((resolve, reject) => {
    const cleanup = () => {
      sourceBuffer.removeEventListener("updateend", handleUpdateEnd)
      sourceBuffer.removeEventListener("error", handleError)
    }
    const handleUpdateEnd = () => {
      cleanup()
      resolve()
    }
    const handleError = () => {
      cleanup()
      reject(new Error("Failed to append audio stream chunk."))
    }

    sourceBuffer.addEventListener("updateend", handleUpdateEnd, { once: true })
    sourceBuffer.addEventListener("error", handleError, { once: true })

    try {
      // Copy the view into a fresh, exactly-sized ArrayBuffer so that a chunk
      // that is a window into a larger buffer contributes only its own bytes.
      sourceBuffer.appendBuffer(new Uint8Array(chunk).buffer)
    } catch (error) {
      // appendBuffer can throw synchronously; without this the listeners above
      // would stay attached and fire for a later, unrelated update.
      cleanup()
      reject(error)
    }
  })
}
|
||||||
|
|
||||||
|
/**
 * Waits until the source buffer is no longer updating.
 *
 * @param sourceBuffer - Buffer to observe.
 * @returns A promise that resolves immediately when the buffer is idle, or on
 *   its next `updateend` event otherwise.
 */
function waitForUpdateEnd(sourceBuffer: SourceBuffer): Promise<void> {
  // An idle buffer will not emit `updateend` on its own, so waiting for the
  // event here would never finish.
  if (!sourceBuffer.updating) {
    return Promise.resolve()
  }
  return new Promise((resolve) => {
    sourceBuffer.addEventListener("updateend", () => resolve(), { once: true })
  })
}
|
||||||
|
|
||||||
function createObjectUrlFromBase64(audioBase64: string, mimeType: string): string {
|
function createObjectUrlFromBase64(audioBase64: string, mimeType: string): string {
|
||||||
const binary = atob(audioBase64)
|
const binary = atob(audioBase64)
|
||||||
const bytes = new Uint8Array(binary.length)
|
const bytes = new Uint8Array(binary.length)
|
||||||
@@ -201,3 +391,10 @@ function createObjectUrlFromBase64(audioBase64: string, mimeType: string): strin
|
|||||||
}
|
}
|
||||||
return URL.createObjectURL(new Blob([bytes], { type: mimeType || "audio/mpeg" }))
|
return URL.createObjectURL(new Blob([bytes], { type: mimeType || "audio/mpeg" }))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Maps a TTS output format to the MIME type used for it.
 *
 * @param format - Requested audio format.
 * @returns "audio/wav", "audio/opus" or "audio/aac" for the matching format,
 *   and "audio/mpeg" for anything else (including "mp3").
 */
function formatToMimeType(format: "mp3" | "wav" | "opus" | "aac"): string {
  switch (format) {
    case "wav":
      return "audio/wav"
    case "opus":
      return "audio/opus"
    case "aac":
      return "audio/aac"
    default:
      return "audio/mpeg"
  }
}
|
||||||
|
|||||||
@@ -166,7 +166,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
"settings.speech.ttsModel.subtitle": "Default text-to-speech model reserved for future playback features.",
|
||||||
"settings.speech.ttsVoice.title": "Default voice",
|
"settings.speech.ttsVoice.title": "Default voice",
|
||||||
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
"settings.speech.ttsVoice.subtitle": "Default text-to-speech voice reserved for future playback features.",
|
||||||
"settings.speech.help": "Prompt voice input only appears when speech transcription is configured and supported by this browser.",
|
"settings.speech.playbackMode.title": "Playback mode",
|
||||||
|
"settings.speech.playbackMode.subtitle": "Choose whether TTS starts playing as audio streams in or after the full file is generated.",
|
||||||
|
"settings.speech.playbackMode.streaming": "Streaming",
|
||||||
|
"settings.speech.playbackMode.buffered": "Buffered",
|
||||||
|
"settings.speech.ttsFormat.title": "Output format",
|
||||||
|
"settings.speech.ttsFormat.subtitle": "Choose the audio format for synthesized speech. Streaming support depends on your provider and browser.",
|
||||||
|
"settings.speech.help": "Prompt voice input appears when speech transcription is configured and supported. Message playback uses the TTS mode and format selected here.",
|
||||||
|
"settings.speech.testPlayback.action": "Test playback",
|
||||||
|
"settings.speech.testPlayback.generating": "Generating sample",
|
||||||
|
"settings.speech.testPlayback.stop": "Stop sample",
|
||||||
|
"settings.speech.testPlayback.sample": "Thank you for using CodeNomad, your speech settings are working fine.",
|
||||||
|
"settings.speech.testPlayback.note": "The test uses your current playback mode and format immediately. Save API key, base URL, model, or voice changes first if you want those reflected too.",
|
||||||
"settings.speech.save.action": "Save",
|
"settings.speech.save.action": "Save",
|
||||||
"settings.speech.save.saving": "Saving...",
|
"settings.speech.save.saving": "Saving...",
|
||||||
"settings.speech.save.saved": "Saved",
|
"settings.speech.save.saved": "Saved",
|
||||||
|
|||||||
@@ -166,7 +166,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.ttsModel.subtitle": "Modelo predeterminado de texto a voz reservado para futuras funciones de reproducción.",
|
"settings.speech.ttsModel.subtitle": "Modelo predeterminado de texto a voz reservado para futuras funciones de reproducción.",
|
||||||
"settings.speech.ttsVoice.title": "Voz predeterminada",
|
"settings.speech.ttsVoice.title": "Voz predeterminada",
|
||||||
"settings.speech.ttsVoice.subtitle": "Voz predeterminada de texto a voz reservada para futuras funciones de reproducción.",
|
"settings.speech.ttsVoice.subtitle": "Voz predeterminada de texto a voz reservada para futuras funciones de reproducción.",
|
||||||
"settings.speech.help": "La entrada de voz del prompt solo aparece cuando la transcripción de voz está configurada y este navegador la admite.",
|
"settings.speech.playbackMode.title": "Modo de reproduccion",
|
||||||
|
"settings.speech.playbackMode.subtitle": "Elige si TTS empieza a reproducirse mientras llega el audio o despues de generar el archivo completo.",
|
||||||
|
"settings.speech.playbackMode.streaming": "Streaming",
|
||||||
|
"settings.speech.playbackMode.buffered": "Buffered",
|
||||||
|
"settings.speech.ttsFormat.title": "Formato de salida",
|
||||||
|
"settings.speech.ttsFormat.subtitle": "Elige el formato de audio para la voz sintetizada. La compatibilidad de streaming depende de tu proveedor y navegador.",
|
||||||
|
"settings.speech.help": "La entrada de voz del prompt aparece cuando la transcripcion de voz esta configurada y es compatible. La reproduccion de mensajes usa el modo y formato TTS seleccionados aqui.",
|
||||||
|
"settings.speech.testPlayback.action": "Probar reproduccion",
|
||||||
|
"settings.speech.testPlayback.generating": "Generando muestra",
|
||||||
|
"settings.speech.testPlayback.stop": "Detener muestra",
|
||||||
|
"settings.speech.testPlayback.sample": "Esta es una prueba de reproduccion de voz con el modo y formato seleccionados actualmente.",
|
||||||
|
"settings.speech.testPlayback.note": "La prueba usa de inmediato el modo y formato actuales. Guarda primero los cambios de API key, base URL, modelo o voz si tambien quieres probarlos.",
|
||||||
"settings.speech.save.action": "Guardar",
|
"settings.speech.save.action": "Guardar",
|
||||||
"settings.speech.save.saving": "Guardando...",
|
"settings.speech.save.saving": "Guardando...",
|
||||||
"settings.speech.save.saved": "Guardado",
|
"settings.speech.save.saved": "Guardado",
|
||||||
|
|||||||
@@ -166,7 +166,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.ttsModel.subtitle": "Modèle de synthèse vocale par défaut réservé aux futures fonctions de lecture.",
|
"settings.speech.ttsModel.subtitle": "Modèle de synthèse vocale par défaut réservé aux futures fonctions de lecture.",
|
||||||
"settings.speech.ttsVoice.title": "Voix par défaut",
|
"settings.speech.ttsVoice.title": "Voix par défaut",
|
||||||
"settings.speech.ttsVoice.subtitle": "Voix de synthèse vocale par défaut réservée aux futures fonctions de lecture.",
|
"settings.speech.ttsVoice.subtitle": "Voix de synthèse vocale par défaut réservée aux futures fonctions de lecture.",
|
||||||
"settings.speech.help": "La saisie vocale du prompt n'apparaît que lorsque la transcription vocale est configurée et prise en charge par ce navigateur.",
|
"settings.speech.playbackMode.title": "Mode de lecture",
|
||||||
|
"settings.speech.playbackMode.subtitle": "Choisissez si le TTS commence a jouer pendant le flux audio ou apres la generation complete du fichier.",
|
||||||
|
"settings.speech.playbackMode.streaming": "Streaming",
|
||||||
|
"settings.speech.playbackMode.buffered": "Buffered",
|
||||||
|
"settings.speech.ttsFormat.title": "Format de sortie",
|
||||||
|
"settings.speech.ttsFormat.subtitle": "Choisissez le format audio pour la voix synthetisee. La prise en charge du streaming depend du fournisseur et du navigateur.",
|
||||||
|
"settings.speech.help": "La saisie vocale du prompt apparait lorsque la transcription vocale est configuree et prise en charge. La lecture des messages utilise le mode et le format TTS selectionnes ici.",
|
||||||
|
"settings.speech.testPlayback.action": "Tester la lecture",
|
||||||
|
"settings.speech.testPlayback.generating": "Generation de l'extrait",
|
||||||
|
"settings.speech.testPlayback.stop": "Arreter l'extrait",
|
||||||
|
"settings.speech.testPlayback.sample": "Ceci est un test de lecture vocale utilisant le mode de lecture et le format actuellement selectionnes.",
|
||||||
|
"settings.speech.testPlayback.note": "Le test utilise immediatement le mode et le format actuels. Enregistrez d'abord les changements d'API key, d'URL de base, de modele ou de voix si vous voulez aussi les tester.",
|
||||||
"settings.speech.save.action": "Enregistrer",
|
"settings.speech.save.action": "Enregistrer",
|
||||||
"settings.speech.save.saving": "Enregistrement...",
|
"settings.speech.save.saving": "Enregistrement...",
|
||||||
"settings.speech.save.saved": "Enregistré",
|
"settings.speech.save.saved": "Enregistré",
|
||||||
|
|||||||
@@ -165,7 +165,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.ttsModel.subtitle": "מודל ברירת מחדל לטקסט-לדיבור השמור ליכולות ניגון עתידיות.",
|
"settings.speech.ttsModel.subtitle": "מודל ברירת מחדל לטקסט-לדיבור השמור ליכולות ניגון עתידיות.",
|
||||||
"settings.speech.ttsVoice.title": "קול ברירת מחדל",
|
"settings.speech.ttsVoice.title": "קול ברירת מחדל",
|
||||||
"settings.speech.ttsVoice.subtitle": "קול ברירת מחדל לטקסט-לדיבור השמור ליכולות ניגון עתידיות.",
|
"settings.speech.ttsVoice.subtitle": "קול ברירת מחדל לטקסט-לדיבור השמור ליכולות ניגון עתידיות.",
|
||||||
"settings.speech.help": "קלט קולי לפרומפט מופיע רק כאשר תמלול קול מוגדר ונתמך בדפדפן זה.",
|
"settings.speech.playbackMode.title": "מצב ניגון",
|
||||||
|
"settings.speech.playbackMode.subtitle": "בחר אם TTS יתחיל לנגן בזמן שהאודיו מוזרם או רק אחרי שהקובץ כולו נוצר.",
|
||||||
|
"settings.speech.playbackMode.streaming": "סטרימינג",
|
||||||
|
"settings.speech.playbackMode.buffered": "באפר מלא",
|
||||||
|
"settings.speech.ttsFormat.title": "פורמט פלט",
|
||||||
|
"settings.speech.ttsFormat.subtitle": "בחר את פורמט האודיו לדיבור מסונתז. תמיכת סטרימינג תלויה בספק ובדפדפן.",
|
||||||
|
"settings.speech.help": "קלט קולי לפרומפט מופיע כאשר תמלול קול מוגדר ונתמך. השמעת הודעות משתמשת במצב ובפורמט ה-TTS שנבחרו כאן.",
|
||||||
|
"settings.speech.testPlayback.action": "בדוק ניגון",
|
||||||
|
"settings.speech.testPlayback.generating": "יוצר דוגמה",
|
||||||
|
"settings.speech.testPlayback.stop": "עצור דוגמה",
|
||||||
|
"settings.speech.testPlayback.sample": "זהו מבחן ניגון קולי המשתמש במצב ובפורמט שנבחרו כרגע.",
|
||||||
|
"settings.speech.testPlayback.note": "המבחן משתמש מיד במצב ובפורמט הנוכחיים. שמור תחילה שינויים ב-API key, ב-Base URL, במודל או בקול אם גם אותם תרצה לבדוק.",
|
||||||
"settings.speech.save.action": "שמור",
|
"settings.speech.save.action": "שמור",
|
||||||
"settings.speech.save.saving": "שומר...",
|
"settings.speech.save.saving": "שומר...",
|
||||||
"settings.speech.save.saved": "נשמר",
|
"settings.speech.save.saved": "נשמר",
|
||||||
|
|||||||
@@ -166,7 +166,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.ttsModel.subtitle": "将来の再生機能のために予約されている既定の音声合成モデルです。",
|
"settings.speech.ttsModel.subtitle": "将来の再生機能のために予約されている既定の音声合成モデルです。",
|
||||||
"settings.speech.ttsVoice.title": "既定の音声",
|
"settings.speech.ttsVoice.title": "既定の音声",
|
||||||
"settings.speech.ttsVoice.subtitle": "将来の再生機能のために予約されている既定の音声合成ボイスです。",
|
"settings.speech.ttsVoice.subtitle": "将来の再生機能のために予約されている既定の音声合成ボイスです。",
|
||||||
"settings.speech.help": "プロンプト音声入力は、音声文字起こしが設定され、このブラウザーでサポートされている場合にのみ表示されます。",
|
"settings.speech.playbackMode.title": "再生モード",
|
||||||
|
"settings.speech.playbackMode.subtitle": "音声が届き次第再生を始めるか、ファイル全体の生成後に再生するかを選択します。",
|
||||||
|
"settings.speech.playbackMode.streaming": "Streaming",
|
||||||
|
"settings.speech.playbackMode.buffered": "Buffered",
|
||||||
|
"settings.speech.ttsFormat.title": "出力形式",
|
||||||
|
"settings.speech.ttsFormat.subtitle": "音声合成の出力形式を選択します。ストリーミング対応はプロバイダーとブラウザーに依存します。",
|
||||||
|
"settings.speech.help": "プロンプト音声入力は音声文字起こしが設定され対応している場合に表示されます。メッセージ再生にはここで選んだTTSモードと形式が使われます。",
|
||||||
|
"settings.speech.testPlayback.action": "再生をテスト",
|
||||||
|
"settings.speech.testPlayback.generating": "サンプルを生成中",
|
||||||
|
"settings.speech.testPlayback.stop": "サンプルを停止",
|
||||||
|
"settings.speech.testPlayback.sample": "現在選択している再生モードと形式で音声再生をテストします。",
|
||||||
|
"settings.speech.testPlayback.note": "このテストは現在の再生モードと形式をすぐに使います。APIキー、Base URL、モデル、音声の変更も試したい場合は先に保存してください。",
|
||||||
"settings.speech.save.action": "保存",
|
"settings.speech.save.action": "保存",
|
||||||
"settings.speech.save.saving": "保存中...",
|
"settings.speech.save.saving": "保存中...",
|
||||||
"settings.speech.save.saved": "保存済み",
|
"settings.speech.save.saved": "保存済み",
|
||||||
|
|||||||
@@ -166,7 +166,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.ttsModel.subtitle": "Модель синтеза речи по умолчанию, зарезервированная для будущих функций воспроизведения.",
|
"settings.speech.ttsModel.subtitle": "Модель синтеза речи по умолчанию, зарезервированная для будущих функций воспроизведения.",
|
||||||
"settings.speech.ttsVoice.title": "Голос по умолчанию",
|
"settings.speech.ttsVoice.title": "Голос по умолчанию",
|
||||||
"settings.speech.ttsVoice.subtitle": "Голос синтеза речи по умолчанию, зарезервированный для будущих функций воспроизведения.",
|
"settings.speech.ttsVoice.subtitle": "Голос синтеза речи по умолчанию, зарезервированный для будущих функций воспроизведения.",
|
||||||
"settings.speech.help": "Голосовой ввод в поле запроса появляется только если распознавание речи настроено и поддерживается этим браузером.",
|
"settings.speech.playbackMode.title": "Режим воспроизведения",
|
||||||
|
"settings.speech.playbackMode.subtitle": "Выберите, начинать ли воспроизведение TTS во время поступления аудио или только после полной генерации файла.",
|
||||||
|
"settings.speech.playbackMode.streaming": "Потоковый",
|
||||||
|
"settings.speech.playbackMode.buffered": "Буферизованный",
|
||||||
|
"settings.speech.ttsFormat.title": "Формат вывода",
|
||||||
|
"settings.speech.ttsFormat.subtitle": "Выберите аудиоформат для синтезированной речи. Поддержка потокового режима зависит от провайдера и браузера.",
|
||||||
|
"settings.speech.help": "Голосовой ввод появляется, когда распознавание речи настроено и поддерживается. Для воспроизведения сообщений используются выбранные здесь режим и формат TTS.",
|
||||||
|
"settings.speech.testPlayback.action": "Проверить воспроизведение",
|
||||||
|
"settings.speech.testPlayback.generating": "Генерация примера",
|
||||||
|
"settings.speech.testPlayback.stop": "Остановить пример",
|
||||||
|
"settings.speech.testPlayback.sample": "Это тест голосового воспроизведения с текущим режимом и выбранным форматом.",
|
||||||
|
"settings.speech.testPlayback.note": "Тест сразу использует текущие режим и формат. Сначала сохраните изменения API key, Base URL, модели или голоса, если хотите проверить и их.",
|
||||||
"settings.speech.save.action": "Сохранить",
|
"settings.speech.save.action": "Сохранить",
|
||||||
"settings.speech.save.saving": "Сохранение...",
|
"settings.speech.save.saving": "Сохранение...",
|
||||||
"settings.speech.save.saved": "Сохранено",
|
"settings.speech.save.saved": "Сохранено",
|
||||||
|
|||||||
@@ -166,7 +166,18 @@ export const settingsMessages = {
|
|||||||
"settings.speech.ttsModel.subtitle": "为未来播放功能预留的默认文字转语音模型。",
|
"settings.speech.ttsModel.subtitle": "为未来播放功能预留的默认文字转语音模型。",
|
||||||
"settings.speech.ttsVoice.title": "默认语音",
|
"settings.speech.ttsVoice.title": "默认语音",
|
||||||
"settings.speech.ttsVoice.subtitle": "为未来播放功能预留的默认文字转语音音色。",
|
"settings.speech.ttsVoice.subtitle": "为未来播放功能预留的默认文字转语音音色。",
|
||||||
"settings.speech.help": "只有在语音转写已配置且当前浏览器支持时,提示框语音输入才会显示。",
|
"settings.speech.playbackMode.title": "播放模式",
|
||||||
|
"settings.speech.playbackMode.subtitle": "选择在音频流入时开始播放,还是在整个文件生成完成后再播放。",
|
||||||
|
"settings.speech.playbackMode.streaming": "流式",
|
||||||
|
"settings.speech.playbackMode.buffered": "缓冲后播放",
|
||||||
|
"settings.speech.ttsFormat.title": "输出格式",
|
||||||
|
"settings.speech.ttsFormat.subtitle": "选择语音合成的音频格式。流式支持取决于你的提供商和浏览器。",
|
||||||
|
"settings.speech.help": "当语音转写已配置且受支持时,提示框语音输入会显示。消息播放会使用这里选择的 TTS 模式和格式。",
|
||||||
|
"settings.speech.testPlayback.action": "测试播放",
|
||||||
|
"settings.speech.testPlayback.generating": "正在生成示例",
|
||||||
|
"settings.speech.testPlayback.stop": "停止示例",
|
||||||
|
"settings.speech.testPlayback.sample": "这是一个使用当前播放模式和所选格式的语音播放测试。",
|
||||||
|
"settings.speech.testPlayback.note": "测试会立即使用当前播放模式和格式。如果你也想测试 API key、Base URL、模型或音色的更改,请先保存。",
|
||||||
"settings.speech.save.action": "保存",
|
"settings.speech.save.action": "保存",
|
||||||
"settings.speech.save.saving": "保存中...",
|
"settings.speech.save.saving": "保存中...",
|
||||||
"settings.speech.save.saved": "已保存",
|
"settings.speech.save.saved": "已保存",
|
||||||
|
|||||||
@@ -29,6 +29,8 @@ export type ExpansionPreference = "expanded" | "collapsed"
|
|||||||
export type ToolInputsVisibilityPreference = "hidden" | "collapsed" | "expanded"
|
export type ToolInputsVisibilityPreference = "hidden" | "collapsed" | "expanded"
|
||||||
export type ListeningMode = "local" | "all"
|
export type ListeningMode = "local" | "all"
|
||||||
export type SpeechProviderPreference = "openai-compatible"
|
export type SpeechProviderPreference = "openai-compatible"
|
||||||
|
/** TTS playback mode: "streaming" starts playing while audio is still arriving; "buffered" plays after the full file has been generated. */
export type SpeechPlaybackMode = "streaming" | "buffered"
|
||||||
|
/** Audio formats that can be selected as the output format for synthesized speech. */
export type SpeechTtsFormat = "mp3" | "wav" | "opus" | "aac"
|
||||||
|
|
||||||
export interface SpeechSettings {
|
export interface SpeechSettings {
|
||||||
provider: SpeechProviderPreference
|
provider: SpeechProviderPreference
|
||||||
@@ -38,6 +40,8 @@ export interface SpeechSettings {
|
|||||||
sttModel: string
|
sttModel: string
|
||||||
ttsModel: string
|
ttsModel: string
|
||||||
ttsVoice: string
|
ttsVoice: string
|
||||||
|
playbackMode: SpeechPlaybackMode
|
||||||
|
ttsFormat: SpeechTtsFormat
|
||||||
}
|
}
|
||||||
|
|
||||||
export type SpeechSettingsUpdate = Partial<Omit<SpeechSettings, "apiKey">> & {
|
export type SpeechSettingsUpdate = Partial<Omit<SpeechSettings, "apiKey">> & {
|
||||||
@@ -145,6 +149,8 @@ const defaultSpeechSettings: SpeechSettings = {
|
|||||||
sttModel: "gpt-4o-mini-transcribe",
|
sttModel: "gpt-4o-mini-transcribe",
|
||||||
ttsModel: "gpt-4o-mini-tts",
|
ttsModel: "gpt-4o-mini-tts",
|
||||||
ttsVoice: "alloy",
|
ttsVoice: "alloy",
|
||||||
|
playbackMode: "streaming",
|
||||||
|
ttsFormat: "mp3",
|
||||||
}
|
}
|
||||||
|
|
||||||
function normalizeUiSettings(input?: Partial<UiSettings> | null): UiSettings {
|
function normalizeUiSettings(input?: Partial<UiSettings> | null): UiSettings {
|
||||||
@@ -203,6 +209,14 @@ function normalizeSpeechSettings(input?: Partial<SpeechSettings> | null): Speech
|
|||||||
typeof sanitized.ttsVoice === "string" && sanitized.ttsVoice.trim()
|
typeof sanitized.ttsVoice === "string" && sanitized.ttsVoice.trim()
|
||||||
? sanitized.ttsVoice.trim()
|
? sanitized.ttsVoice.trim()
|
||||||
: defaultSpeechSettings.ttsVoice,
|
: defaultSpeechSettings.ttsVoice,
|
||||||
|
playbackMode:
|
||||||
|
sanitized.playbackMode === "buffered" || sanitized.playbackMode === "streaming"
|
||||||
|
? sanitized.playbackMode
|
||||||
|
: defaultSpeechSettings.playbackMode,
|
||||||
|
ttsFormat:
|
||||||
|
sanitized.ttsFormat === "wav" || sanitized.ttsFormat === "opus" || sanitized.ttsFormat === "aac" || sanitized.ttsFormat === "mp3"
|
||||||
|
? sanitized.ttsFormat
|
||||||
|
: defaultSpeechSettings.ttsFormat,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user