feat(speech): add prompt voice input (#249)

## Summary
- add server-backed speech capabilities and transcription endpoints plus
UI settings for speech configuration
- add push-to-talk prompt voice input with microphone controls,
transcription insertion, and browser capability gating
- keep prompt controls aligned by restoring right-side nav placement and
moving the mic beside the expand control
This commit is contained in:
Shantur Rathore
2026-03-25 14:08:11 +00:00
committed by GitHub
parent a950d47df0
commit 1233121a13
40 changed files with 1545 additions and 27 deletions

View File

@@ -0,0 +1,91 @@
import { z } from "zod"
import type { Logger } from "../logger"
import type { SettingsService } from "../settings/service"
import type { SpeechCapabilitiesResponse, SpeechSynthesisResponse, SpeechTranscriptionResponse } from "../api-types"
import { OpenAICompatibleSpeechProvider } from "./providers/openai-compatible"
const ServerSpeechSettingsSchema = z.object({
speech: z
.object({
provider: z.string().optional(),
apiKey: z.string().optional(),
baseUrl: z.string().optional(),
sttModel: z.string().optional(),
ttsModel: z.string().optional(),
ttsVoice: z.string().optional(),
})
.optional(),
})
export interface TranscribeAudioInput {
audioBase64: string
mimeType: string
filename?: string
language?: string
prompt?: string
}
export interface SynthesizeSpeechInput {
text: string
format?: "mp3" | "wav" | "opus"
}
export interface SpeechProvider {
getCapabilities(): SpeechCapabilitiesResponse
transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse>
synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse>
}
export interface NormalizedSpeechSettings {
provider: string
apiKey?: string
baseUrl?: string
sttModel: string
ttsModel: string
ttsVoice: string
}
const DEFAULT_PROVIDER = "openai-compatible"
const DEFAULT_STT_MODEL = "gpt-4o-mini-transcribe"
const DEFAULT_TTS_MODEL = "gpt-4o-mini-tts"
const DEFAULT_TTS_VOICE = "alloy"
export class SpeechService {
constructor(
private readonly settings: SettingsService,
private readonly logger: Logger,
) {}
getCapabilities(): SpeechCapabilitiesResponse {
return this.createProvider().getCapabilities()
}
async transcribe(input: TranscribeAudioInput): Promise<SpeechTranscriptionResponse> {
return this.createProvider().transcribe(input)
}
async synthesize(input: SynthesizeSpeechInput): Promise<SpeechSynthesisResponse> {
return this.createProvider().synthesize(input)
}
private createProvider(): SpeechProvider {
const settings = this.resolveSettings()
return new OpenAICompatibleSpeechProvider({
settings,
logger: this.logger.child({ provider: settings.provider }),
})
}
private resolveSettings(): NormalizedSpeechSettings {
const parsed = ServerSpeechSettingsSchema.parse(this.settings.getOwner("config", "server") ?? {})
const speech = parsed.speech ?? {}
return {
provider: speech.provider?.trim() || DEFAULT_PROVIDER,
apiKey: speech.apiKey?.trim() || process.env.OPENAI_API_KEY,
baseUrl: speech.baseUrl?.trim() || process.env.OPENAI_BASE_URL || undefined,
sttModel: speech.sttModel?.trim() || DEFAULT_STT_MODEL,
ttsModel: speech.ttsModel?.trim() || DEFAULT_TTS_MODEL,
ttsVoice: speech.ttsVoice?.trim() || DEFAULT_TTS_VOICE,
}
}
}