Revert "feat(speech): add realtime prompt dictation support"
This reverts commit f9b5e2b529.
This commit is contained in:
@@ -11,7 +11,7 @@ import { getCommands } from "../stores/commands"
|
||||
import { showAlertDialog } from "../stores/alerts"
|
||||
import { useI18n } from "../lib/i18n"
|
||||
import { getLogger } from "../lib/logger"
|
||||
import { preferences, useConfig } from "../stores/preferences"
|
||||
import { preferences } from "../stores/preferences"
|
||||
import type { ExpandState, PromptInputApi, PromptInputProps, PromptInsertMode, PromptMode } from "./prompt-input/types"
|
||||
import { usePromptState } from "./prompt-input/usePromptState"
|
||||
import { usePromptAttachments } from "./prompt-input/usePromptAttachments"
|
||||
@@ -22,7 +22,6 @@ const log = getLogger("actions")
|
||||
|
||||
export default function PromptInput(props: PromptInputProps) {
|
||||
const { t } = useI18n()
|
||||
const { serverSettings } = useConfig()
|
||||
const [, setIsFocused] = createSignal(false)
|
||||
const [mode, setMode] = createSignal<PromptMode>("normal")
|
||||
const [expandState, setExpandState] = createSignal<ExpandState>("normal")
|
||||
@@ -419,7 +418,6 @@ export default function PromptInput(props: PromptInputProps) {
|
||||
getTextarea: () => textareaRef ?? null,
|
||||
enabled: () => preferences().showPromptVoiceInput,
|
||||
disabled: () => Boolean(props.disabled),
|
||||
useRealtime: () => serverSettings().speech.useRealtime,
|
||||
})
|
||||
const showVoiceInput = () =>
|
||||
preferences().showPromptVoiceInput &&
|
||||
|
||||
@@ -1,110 +0,0 @@
|
||||
/** Handle returned by {@link createRealtimePcmStream}; used to end the capture. */
export interface RealtimePcmStreamHandle {
  /** Detaches the audio graph, stops the microphone tracks and closes the audio context. Idempotent. */
  stop(): Promise<void>
}

interface CreateRealtimePcmStreamOptions {
  /** Receives each captured audio buffer as base64-encoded 16-bit PCM. */
  onChunk: (audioBase64: string) => void | Promise<void>
}

/** Sample rate (Hz) every captured buffer is resampled to before encoding. */
const TARGET_SAMPLE_RATE = 24000
/** Frame count per `onaudioprocess` callback of the script processor node. */
const PROCESSOR_BUFFER_SIZE = 4096

/**
 * Opens the microphone and streams its audio to `options.onChunk` as base64 PCM16 chunks.
 *
 * Each processor buffer is resampled to {@link TARGET_SAMPLE_RATE}, converted to
 * 16-bit PCM and base64-encoded before being handed to `onChunk`.
 *
 * @param options - Callback receiving every encoded chunk.
 * @returns A handle whose `stop()` releases the microphone and the audio context.
 * @throws If microphone access is rejected, `AudioContext` is unavailable, or the
 *   audio graph cannot be built. In every failure case the microphone is released
 *   before the error propagates.
 */
export async function createRealtimePcmStream(
  options: CreateRealtimePcmStreamOptions,
): Promise<RealtimePcmStreamHandle> {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: {
      channelCount: 1,
      echoCancellation: true,
      noiseSuppression: true,
      autoGainControl: true,
    },
  })
  const releaseMicrophone = () => stream.getTracks().forEach((track) => track.stop())

  const AudioContextCtor =
    window.AudioContext || (window as Window & { webkitAudioContext?: typeof AudioContext }).webkitAudioContext
  if (!AudioContextCtor) {
    releaseMicrophone()
    throw new Error("AudioContext is not supported in this browser.")
  }

  let audioContext: AudioContext | null = null
  try {
    const context = new AudioContextCtor()
    audioContext = context
    await context.resume()

    const source = context.createMediaStreamSource(stream)
    const processor = context.createScriptProcessor(PROCESSOR_BUFFER_SIZE, 1, 1)
    // Muted sink between the processor and the destination.
    // NOTE(review): presumably keeps the processor running without audible playback; confirm.
    const sink = context.createGain()
    sink.gain.value = 0

    source.connect(processor)
    processor.connect(sink)
    sink.connect(context.destination)

    processor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0)
      const resampled = downsampleBuffer(input, context.sampleRate, TARGET_SAMPLE_RATE)
      if (resampled.length === 0) return
      const pcm16 = floatTo16BitPcm(resampled)
      void options.onChunk(base64EncodePcm16(pcm16))
    }

    let stopped = false
    return {
      async stop() {
        if (stopped) return
        stopped = true
        processor.onaudioprocess = null
        source.disconnect()
        processor.disconnect()
        sink.disconnect()
        releaseMicrophone()
        await context.close()
      },
    }
  } catch (error) {
    // Setup failed after the microphone was opened: do not leave it (or the context) running.
    releaseMicrophone()
    if (audioContext) {
      await audioContext.close().catch(() => undefined)
    }
    throw error
  }
}
|
||||
|
||||
/**
 * Resamples `buffer` from `inputSampleRate` to `outputSampleRate`.
 *
 * Each output sample is the mean of the input samples that fall into its window.
 * When a window contains no input sample (output rate higher than input rate),
 * the nearest available input sample is repeated instead.
 *
 * @param buffer - Source samples.
 * @param inputSampleRate - Sample rate of `buffer`.
 * @param outputSampleRate - Desired sample rate.
 * @returns A new buffer; an empty input always yields an empty output.
 */
function downsampleBuffer(buffer: Float32Array, inputSampleRate: number, outputSampleRate: number): Float32Array {
  // Without this guard an empty input produced a single NaN sample when the rates differ.
  if (buffer.length === 0) {
    return new Float32Array(0)
  }

  if (inputSampleRate === outputSampleRate) {
    return buffer.slice()
  }

  const sampleRateRatio = inputSampleRate / outputSampleRate
  const outputLength = Math.max(1, Math.round(buffer.length / sampleRateRatio))
  const output = new Float32Array(outputLength)
  let outputIndex = 0
  let inputIndex = 0

  while (outputIndex < outputLength) {
    const nextInputIndex = Math.min(buffer.length, Math.round((outputIndex + 1) * sampleRateRatio))
    let sum = 0
    let count = 0
    for (let i = inputIndex; i < nextInputIndex; i += 1) {
      sum += buffer[i]
      count += 1
    }
    // An empty window falls back to the closest existing sample.
    output[outputIndex] = count > 0 ? sum / count : buffer[Math.min(buffer.length - 1, inputIndex)]
    outputIndex += 1
    inputIndex = nextInputIndex
  }

  return output
}
|
||||
|
||||
/**
 * Converts floating-point samples to signed 16-bit PCM.
 *
 * Samples are clamped to [-1, 1]; negative values are scaled by 0x8000 and
 * non-negative values by 0x7fff, then rounded.
 */
function floatTo16BitPcm(buffer: Float32Array): Int16Array {
  return Int16Array.from(buffer, (value) => {
    const clamped = Math.max(-1, Math.min(1, value))
    const scale = clamped < 0 ? 0x8000 : 0x7fff
    return Math.round(clamped * scale)
  })
}
|
||||
|
||||
/**
 * Base64-encodes the raw bytes of a 16-bit PCM buffer.
 *
 * Only the bytes covered by `buffer` are encoded, so views and subarrays over a
 * larger `ArrayBuffer` are handled correctly. Bytes are taken in the platform's
 * native byte order.
 *
 * @param buffer - PCM samples to encode.
 * @returns Base64 text; the empty string for an empty buffer.
 */
function base64EncodePcm16(buffer: Int16Array): string {
  // Respect the view's window: `buffer.buffer` alone may span more than this array.
  const bytes = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength)
  let binary = ""
  // Convert in slices so the spread below never exceeds the engine's argument limit.
  const chunkSize = 0x8000
  for (let offset = 0; offset < bytes.length; offset += chunkSize) {
    const chunk = bytes.subarray(offset, offset + chunkSize)
    binary += String.fromCharCode(...chunk)
  }
  return btoa(binary)
}
|
||||
@@ -1,36 +0,0 @@
|
||||
/** Snapshot of the prompt text and the selection range a transcript will replace. */
export interface PromptVoiceAnchor {
  prompt: string
  start: number
  end: number
}

/** Bundles a prompt and a selection range into a {@link PromptVoiceAnchor}. */
export function createPromptVoiceAnchor(prompt: string, start: number, end: number): PromptVoiceAnchor {
  const anchor: PromptVoiceAnchor = { prompt, start, end }
  return anchor
}

/**
 * Replaces the anchored range of the prompt with `insertedText`.
 *
 * The transcript is trimmed, and a single space is added on either side when the
 * neighbouring text does not already provide whitespace. A blank transcript
 * simply removes the anchored range.
 *
 * @returns The new prompt value and the cursor offset right after the inserted transcript.
 */
export function buildPromptWithInsertedTranscript(anchor: PromptVoiceAnchor, insertedText: string): { value: string; cursor: number } {
  const head = anchor.prompt.slice(0, anchor.start)
  const tail = anchor.prompt.slice(anchor.end)
  const transcript = insertedText.trim()

  if (transcript.length === 0) {
    return { value: head + tail, cursor: head.length }
  }

  // Pad only where the surrounding text ends/starts with a non-whitespace character.
  const leadingGap = /\S$/.test(head) ? " " : ""
  const trailingGap = /^\S/.test(tail) ? " " : ""
  const insertedHead = head + leadingGap + transcript

  return {
    value: insertedHead + trailingGap + tail,
    cursor: insertedHead.length,
  }
}
|
||||
|
||||
/**
 * Appends a transcript fragment to the text accumulated so far.
 *
 * The fragment is trimmed; a blank fragment leaves `current` untouched, and a
 * blank `current` is replaced by the fragment. Otherwise the two are joined with
 * one space unless `current` already ends in whitespace.
 */
export function appendVoiceTranscript(current: string, next: string): string {
  const addition = next.trim()
  if (addition === "") {
    return current
  }
  if (current.trim() === "") {
    return addition
  }
  const separator = /\s$/.test(current) ? "" : " "
  return current + separator + addition
}
|
||||
@@ -1,241 +0,0 @@
|
||||
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
|
||||
import { showAlertDialog } from "../../stores/alerts"
|
||||
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
||||
import { serverApi } from "../../lib/api-client"
|
||||
import { useI18n } from "../../lib/i18n"
|
||||
import { createPromptVoiceAnchor, buildPromptWithInsertedTranscript } from "./promptVoiceInsertion"
|
||||
|
||||
/** Inputs required by {@link usePromptBufferedVoiceInput}. */
interface UsePromptBufferedVoiceInputOptions {
  /** Current prompt text. */
  prompt: Accessor<string>
  /** Replaces the prompt text. */
  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  /** Returns the prompt textarea, or `null` when none is available. */
  getTextarea: () => HTMLTextAreaElement | null
  /** Voice input is only offered while this returns true. */
  enabled: Accessor<boolean>
  /** A new recording cannot be started while this returns true. */
  disabled: Accessor<boolean>
}

type VoiceInputState = "idle" | "recording" | "transcribing"

/**
 * Record-then-transcribe voice input for the prompt.
 *
 * Audio is captured with `MediaRecorder`; when recording stops the whole clip is
 * sent to `serverApi.transcribeAudio` and the returned text is inserted at the
 * textarea selection.
 *
 * @param options - Prompt accessors and enable/disable flags.
 * @returns Reactive state plus the controls used by the voice-input button.
 */
export function usePromptBufferedVoiceInput(options: UsePromptBufferedVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<VoiceInputState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)

  let mediaRecorder: MediaRecorder | null = null
  let mediaStream: MediaStream | null = null
  let timerId: number | undefined
  // Read by the recorder's "stop" handler to decide between transcribing and discarding.
  let shouldTranscribe = true
  let recordedChunks: Blob[] = []
  let recordingStartedAt = 0

  createEffect(() => {
    void loadSpeechCapabilities()
  })

  onCleanup(() => {
    cleanupMedia(false)
  })

  const isSupported = () => {
    if (typeof window === "undefined") return false
    return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
  }

  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt,
    )
  }

  /** Stops an ongoing recording (and transcribes it) or starts a new one. */
  async function toggleRecording(): Promise<void> {
    if (state() === "recording") {
      stopRecording()
      return
    }

    if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return

    try {
      await startRecording()
    } catch (error) {
      // Reset the state too: a failure after the state switched to "recording"
      // (e.g. `start()` throwing) would otherwise leave the control stuck.
      cleanupMedia()
      showAlertDialog(t("promptInput.voiceInput.error.permission"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }

  /** Ends the recording and lets the "stop" handler transcribe it. */
  function stopRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = true
    mediaRecorder.stop()
    setState("transcribing")
    stopTimer()
  }

  /** Ends the recording and discards the captured audio. */
  function cancelRecording() {
    if (!mediaRecorder || state() !== "recording") return
    shouldTranscribe = false
    mediaRecorder.stop()
    cleanupMedia(false)
  }

  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }

    recordedChunks = []
    shouldTranscribe = true
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
    mediaRecorder = createRecorder(mediaStream)

    mediaRecorder.addEventListener("dataavailable", (event) => {
      if (event.data.size > 0) {
        recordedChunks.push(event.data)
      }
    })

    mediaRecorder.addEventListener("stop", () => {
      void finalizeRecording()
    })

    recordingStartedAt = Date.now()
    setElapsedMs(0)
    setState("recording")
    startTimer()
    mediaRecorder.start()
  }

  /** Runs when the recorder stops: transcribes the clip or discards it, then returns to idle. */
  async function finalizeRecording() {
    const recorder = mediaRecorder
    const stream = mediaStream
    mediaRecorder = null
    mediaStream = null

    if (!shouldTranscribe || recordedChunks.length === 0) {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
      return
    }

    const mimeType = recorder?.mimeType || recordedChunks[0]?.type || "audio/webm"

    try {
      const audioBlob = new Blob(recordedChunks, { type: mimeType })
      const transcription = await serverApi.transcribeAudio({
        audioBase64: await blobToBase64(audioBlob),
        mimeType,
      })
      if (transcription.text.trim()) {
        insertTranscript(transcription.text.trim())
      }
    } catch (error) {
      showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    } finally {
      recordedChunks = []
      stopTracks(stream)
      setState("idle")
      setElapsedMs(0)
    }
  }

  /** Inserts `text` at the textarea selection (or at the end of the prompt) and restores the cursor. */
  function insertTranscript(text: string) {
    const current = options.prompt()
    const textarea = options.getTextarea()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    const { value, cursor } = buildPromptWithInsertedTranscript(
      createPromptVoiceAnchor(current, start, end),
      text,
    )

    options.setPrompt(value)
    if (textarea) {
      // Deferred so the cursor is set after the new value has been applied.
      setTimeout(() => {
        textarea.focus()
        textarea.setSelectionRange(cursor, cursor)
      }, 0)
    }
  }

  /**
   * Releases the recorder, the microphone and the timer, discarding any audio.
   *
   * @param resetState - When true, also returns the reactive state to idle.
   */
  function cleanupMedia(resetState = true) {
    stopTimer()
    // Every caller is abandoning the recording. Without this, stopping the recorder
    // here (e.g. on unmount) would still transcribe and insert the discarded audio.
    shouldTranscribe = false
    if (mediaRecorder && mediaRecorder.state !== "inactive") {
      mediaRecorder.stop()
    }
    mediaRecorder = null
    stopTracks(mediaStream)
    mediaStream = null
    recordedChunks = []
    if (resetState) {
      setState("idle")
      setElapsedMs(0)
    }
  }

  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }

  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }

  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "recording",
    isTranscribing: () => state() === "transcribing",
    buttonTitle: () => {
      if (state() === "recording") return t("promptInput.voiceInput.stop.title")
      if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
}
|
||||
|
||||
/**
 * Creates a `MediaRecorder` for `stream`, preferring the first container/codec
 * from a fixed candidate list that the browser reports as supported.
 *
 * When no candidate is supported, or the browser offers no way to probe support,
 * the recorder is created with the browser's default format rather than forcing
 * an unverified MIME type, which the constructor may reject.
 */
function createRecorder(stream: MediaStream): MediaRecorder {
  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
  const canProbe = typeof MediaRecorder.isTypeSupported === "function"
  const supported = canProbe
    ? candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate))
    : undefined
  return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
}
|
||||
|
||||
/** Stops every track of `stream`; a `null` stream is ignored. */
function stopTracks(stream: MediaStream | null) {
  if (!stream) return
  for (const track of stream.getTracks()) {
    track.stop()
  }
}
|
||||
|
||||
/** Reads the blob's bytes and returns them base64-encoded. */
async function blobToBase64(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer())
  const pieces: string[] = []
  // Build the binary string slice by slice to keep each spread call small.
  const step = 0x2000
  for (let offset = 0; offset < bytes.length; offset += step) {
    pieces.push(String.fromCharCode(...bytes.subarray(offset, offset + step)))
  }
  return btoa(pieces.join(""))
}
|
||||
@@ -1,325 +0,0 @@
|
||||
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
|
||||
import { showAlertDialog } from "../../stores/alerts"
|
||||
import { serverApi, type SpeechRealtimeEvent } from "../../lib/api-client"
|
||||
import { useI18n } from "../../lib/i18n"
|
||||
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
||||
import { createRealtimePcmStream, type RealtimePcmStreamHandle } from "./createRealtimePcmStream"
|
||||
import { appendVoiceTranscript, buildPromptWithInsertedTranscript, createPromptVoiceAnchor } from "./promptVoiceInsertion"
|
||||
|
||||
/** Inputs required by {@link usePromptRealtimeVoiceInput}. */
interface UsePromptRealtimeVoiceInputOptions {
  /** Current prompt text. */
  prompt: Accessor<string>
  /** Replaces the prompt text; `persistDraft: false` is passed for partial transcripts. */
  setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
  /** Returns the prompt textarea, or `null` when none is available. */
  getTextarea: () => HTMLTextAreaElement | null
  /** Voice input is only offered while this returns true. */
  enabled: Accessor<boolean>
  /** A new session cannot be started while this returns true. */
  disabled: Accessor<boolean>
}

type RealtimeVoiceState = "idle" | "connecting" | "listening" | "finalizing"

/** How long to wait for remaining transcripts after finalizing before closing the session. */
const FINAL_TRANSCRIPT_TIMEOUT_MS = 10000

/**
 * Streaming voice input for the prompt.
 *
 * A realtime speech session is created on the server, microphone audio is pushed
 * to it chunk by chunk, and partial/final transcripts received over the event
 * stream are rendered into the prompt at the selection captured when dictation
 * started.
 *
 * @param options - Prompt accessors and enable/disable flags.
 * @returns Reactive state plus the controls used by the voice-input button.
 */
export function usePromptRealtimeVoiceInput(options: UsePromptRealtimeVoiceInputOptions) {
  const { t } = useI18n()
  const [state, setState] = createSignal<RealtimeVoiceState>("idle")
  const [elapsedMs, setElapsedMs] = createSignal(0)

  let activeSessionId: string | null = null
  let eventSource: EventSource | null = null
  let pcmStream: RealtimePcmStreamHandle | null = null
  // Serializes audio uploads so chunks reach the server in capture order.
  let audioQueue: Promise<void> = Promise.resolve()
  let timerId: number | undefined
  let recordingStartedAt = 0
  let finalizeTimerId: number | undefined
  let anchor = createPromptVoiceAnchor("", 0, 0)
  // True only between captureAnchor() and the end of the matching cleanup, so a
  // cleanup without a session never "reverts" the prompt to a stale or empty anchor.
  let anchorCaptured = false
  // Bumped by every start and every cleanup; an in-flight start whose number no
  // longer matches has been cancelled and must release whatever it created.
  let startGeneration = 0
  let finalTranscript = ""
  let liveTranscript = ""
  let activeLiveItemId: string | null = null
  let closing = false

  createEffect(() => {
    void loadSpeechCapabilities()
  })

  onCleanup(() => {
    cancelRecording()
  })

  const isSupported = () => {
    if (typeof window === "undefined") return false
    const hasAudioContext = Boolean(
      window.AudioContext || (window as Window & { webkitAudioContext?: typeof AudioContext }).webkitAudioContext,
    )
    return hasAudioContext && Boolean(navigator.mediaDevices?.getUserMedia) && typeof EventSource !== "undefined"
  }

  const canUseVoiceInput = () => {
    const capabilities = speechCapabilities()
    return Boolean(
      options.enabled() &&
        isSupported() &&
        capabilities?.available &&
        capabilities?.configured &&
        capabilities?.supportsStt &&
        capabilities?.supportsRealtimeTranscription,
    )
  }

  /** Stops the current session, or starts a new one when idle. */
  async function toggleRecording(): Promise<void> {
    if (state() === "listening" || state() === "connecting") {
      await stopRecording()
      return
    }

    if (!canUseVoiceInput() || options.disabled() || state() === "finalizing") return

    try {
      await startRecording()
    } catch (error) {
      await cleanupSession({ revertPrompt: true, closeRemote: true })
      showAlertDialog(t("promptInput.voiceInput.error.connection"), {
        title: t("promptInput.voiceInput.error.title"),
        detail: error instanceof Error ? error.message : String(error),
        variant: "error",
      })
    }
  }

  /**
   * Creates the remote session, subscribes to its events and opens the microphone.
   * If the start is cancelled while one of these steps is pending, the resources
   * created by that step are released and the function returns quietly.
   */
  async function startRecording() {
    if (!isSupported()) {
      showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
        title: t("promptInput.voiceInput.error.title"),
        variant: "error",
      })
      return
    }

    startGeneration += 1
    const generation = startGeneration
    const isCurrent = () => generation === startGeneration

    resetTranscriptState()
    captureAnchor()
    setState("connecting")
    setElapsedMs(0)

    try {
      const created = await serverApi.createRealtimeSpeechSession({
        language: detectLanguage(),
      })
      if (!isCurrent()) {
        // Cancelled before the session id was known: nobody else can close it.
        await serverApi.closeRealtimeSpeechSession(created.sessionId).catch(() => undefined)
        return
      }
      activeSessionId = created.sessionId
      connectEventStream(created.sessionId)

      const stream = await createRealtimePcmStream({
        onChunk: (audioBase64) => {
          const sessionId = activeSessionId
          if (!sessionId || closing || !isCurrent()) return
          audioQueue = audioQueue
            .then(() => serverApi.appendRealtimeSpeechAudio(sessionId, { audioBase64 }))
            .catch((error) => {
              handleRealtimeError(error)
            })
        },
      })
      if (!isCurrent()) {
        // Cancelled while the microphone was opening: release it again.
        await stream.stop().catch(() => undefined)
        return
      }
      pcmStream = stream

      recordingStartedAt = Date.now()
      startTimer()
      setState("listening")
    } catch (error) {
      // A cancelled start has already been cleaned up; do not report it as a failure.
      if (!isCurrent()) return
      throw error
    }
  }

  /** Stops capturing and asks the server to finalize; while still connecting this aborts instead. */
  async function stopRecording() {
    if (state() === "connecting") {
      // Nothing has been dictated yet, so there is nothing to finalize.
      await cleanupSession({ revertPrompt: true, closeRemote: true })
      return
    }

    const sessionId = activeSessionId
    if (!sessionId || state() !== "listening") return

    setState("finalizing")
    stopTimer()

    if (pcmStream) {
      const stream = pcmStream
      pcmStream = null
      await stream.stop()
    }

    try {
      await audioQueue.catch(() => undefined)
      await serverApi.finalizeRealtimeSpeechSession(sessionId)
      scheduleFinalizeClose(FINAL_TRANSCRIPT_TIMEOUT_MS)
    } catch (error) {
      handleRealtimeError(error)
    }
  }

  /** Aborts the session and restores the prompt captured when dictation started. */
  function cancelRecording() {
    void cleanupSession({ revertPrompt: true, closeRemote: true })
  }

  function connectEventStream(sessionId: string) {
    eventSource?.close()
    eventSource = serverApi.connectRealtimeSpeechEvents(
      sessionId,
      (event) => handleEvent(event),
      () => {
        if (closing) return
        handleRealtimeError(new Error(t("promptInput.voiceInput.error.connection")))
      },
    )
  }

  /** Applies one server event to the transcript state and the prompt. */
  function handleEvent(event: SpeechRealtimeEvent) {
    if (event.type === "session.ready") {
      return
    }

    if (event.type === "session.error") {
      handleRealtimeError(new Error(event.message))
      return
    }

    if (event.type === "transcript.partial") {
      activeLiveItemId = event.itemId
      liveTranscript = event.text
      renderPrompt(false)
      return
    }

    if (event.type === "transcript.final") {
      activeLiveItemId = activeLiveItemId === event.itemId ? null : activeLiveItemId
      liveTranscript = ""
      finalTranscript = appendVoiceTranscript(finalTranscript, event.text)
      renderPrompt(true)
      if (state() === "finalizing") {
        // A final transcript arrived after stop: close shortly instead of waiting the full timeout.
        scheduleFinalizeClose(250)
      }
      return
    }

    if (event.type === "session.closed") {
      void cleanupSession({ revertPrompt: false, closeRemote: false })
    }
  }

  /** Remembers the prompt and the selection that dictated text will replace. */
  function captureAnchor() {
    const textarea = options.getTextarea()
    const current = options.prompt()
    const start = textarea ? textarea.selectionStart : current.length
    const end = textarea ? textarea.selectionEnd : current.length
    anchor = createPromptVoiceAnchor(current, start, end)
    anchorCaptured = true
  }

  /** Rebuilds the prompt from the anchor plus the final and live transcripts. */
  function renderPrompt(persistDraft: boolean) {
    const inserted = [finalTranscript, liveTranscript.trim()].filter(Boolean).join(finalTranscript && liveTranscript.trim() ? " " : "")
    const { value, cursor } = buildPromptWithInsertedTranscript(anchor, inserted)
    options.setPrompt(value, persistDraft ? undefined : { persistDraft: false })
    syncTextareaCursor(cursor)
  }

  function syncTextareaCursor(cursor: number) {
    const textarea = options.getTextarea()
    if (!textarea) return
    queueMicrotask(() => {
      const next = options.getTextarea()
      if (!next) return
      next.focus()
      next.setSelectionRange(cursor, cursor)
    })
  }

  /** (Re)arms the timer that closes the session once finalization has had `delayMs` to finish. */
  function scheduleFinalizeClose(delayMs: number) {
    if (finalizeTimerId !== undefined) {
      window.clearTimeout(finalizeTimerId)
    }
    finalizeTimerId = window.setTimeout(() => {
      void cleanupSession({ revertPrompt: false, closeRemote: true })
    }, delayMs)
  }

  /**
   * Tears the session down and returns to idle.
   *
   * @param cleanupOptions.revertPrompt - Restore the prompt captured at start
   *   instead of keeping the dictated text. Ignored when no session captured one.
   * @param cleanupOptions.closeRemote - Also ask the server to close the session.
   */
  async function cleanupSession(cleanupOptions: { revertPrompt: boolean; closeRemote: boolean }) {
    if (closing) return
    closing = true
    // Invalidate any start that is still awaiting the server or the microphone.
    startGeneration += 1

    if (finalizeTimerId !== undefined) {
      window.clearTimeout(finalizeTimerId)
      finalizeTimerId = undefined
    }

    stopTimer()

    const sessionId = activeSessionId
    activeSessionId = null

    eventSource?.close()
    eventSource = null

    if (pcmStream) {
      const stream = pcmStream
      pcmStream = null
      await stream.stop().catch(() => undefined)
    }

    await audioQueue.catch(() => undefined)
    audioQueue = Promise.resolve()

    if (cleanupOptions.closeRemote && sessionId) {
      await serverApi.closeRealtimeSpeechSession(sessionId).catch(() => undefined)
    }

    // Keep a pending partial transcript when no final transcript ever arrived.
    if (!cleanupOptions.revertPrompt && !finalTranscript.trim() && liveTranscript.trim()) {
      finalTranscript = appendVoiceTranscript(finalTranscript, liveTranscript)
      liveTranscript = ""
    }

    if (cleanupOptions.revertPrompt) {
      // Only revert when this cleanup belongs to a session that captured the prompt;
      // otherwise (e.g. unmount while idle) the prompt would be overwritten.
      if (anchorCaptured) {
        options.setPrompt(anchor.prompt)
      }
    } else if (finalTranscript.trim()) {
      renderPrompt(true)
    }

    anchorCaptured = false
    resetTranscriptState()
    setState("idle")
    setElapsedMs(0)
    closing = false
  }

  function resetTranscriptState() {
    finalTranscript = ""
    liveTranscript = ""
    activeLiveItemId = null
  }

  /** Aborts the session, reverts the prompt and reports the error once. */
  function handleRealtimeError(error: unknown) {
    if (closing) return
    void cleanupSession({ revertPrompt: true, closeRemote: true })
    showAlertDialog(t("promptInput.voiceInput.error.connection"), {
      title: t("promptInput.voiceInput.error.title"),
      detail: error instanceof Error ? error.message : String(error),
      variant: "error",
    })
  }

  function startTimer() {
    stopTimer()
    timerId = window.setInterval(() => {
      setElapsedMs(Date.now() - recordingStartedAt)
    }, 250)
  }

  function stopTimer() {
    if (timerId !== undefined) {
      window.clearInterval(timerId)
      timerId = undefined
    }
  }

  return {
    state,
    elapsedMs,
    canUseVoiceInput,
    toggleRecording,
    cancelRecording,
    isRecording: () => state() === "connecting" || state() === "listening",
    isTranscribing: () => state() === "finalizing",
    buttonTitle: () => {
      if (state() === "connecting") return t("promptInput.voiceInput.connecting.title")
      if (state() === "listening") return t("promptInput.voiceInput.stop.title")
      if (state() === "finalizing") return t("promptInput.voiceInput.transcribing.title")
      return t("promptInput.voiceInput.start.title")
    },
  }
}
|
||||
|
||||
/**
 * Returns the primary subtag of `navigator.language` (the part before the first
 * hyphen), or `undefined` when there is no `navigator` or the subtag is blank.
 */
function detectLanguage(): string | undefined {
  if (typeof navigator === "undefined") {
    return undefined
  }
  const primarySubtag = navigator.language.split("-")[0]
  const cleaned = primarySubtag?.trim()
  return cleaned ? cleaned : undefined
}
|
||||
@@ -22,7 +22,7 @@ type HistorySelectOptions = {
|
||||
|
||||
type PromptState = {
|
||||
prompt: Accessor<string>
|
||||
setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
|
||||
setPrompt: (value: string) => void
|
||||
clearPrompt: () => void
|
||||
|
||||
draftLoadedNonce: Accessor<number>
|
||||
@@ -48,11 +48,11 @@ export function usePromptState(options: PromptStateOptions): PromptState {
|
||||
const [historyDraft, setHistoryDraft] = createSignal<string | null>(null)
|
||||
const [draftLoadedNonce, setDraftLoadedNonce] = createSignal(0)
|
||||
|
||||
const setPrompt = (value: string, setOptions?: { persistDraft?: boolean }) => {
|
||||
const setPrompt = (value: string) => {
|
||||
setPromptInternal(value)
|
||||
// Persist drafts only when the user is at the "fresh" position (not browsing history).
|
||||
// This keeps the bottom-of-history draft stable even if the user edits recalled history entries.
|
||||
if (setOptions?.persistDraft !== false && historyIndex() === -1) {
|
||||
if (historyIndex() === -1) {
|
||||
setSessionDraftPrompt(options.instanceId(), options.sessionId(), value)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,30 +1,242 @@
|
||||
import type { Accessor } from "solid-js"
|
||||
import { usePromptBufferedVoiceInput } from "./usePromptBufferedVoiceInput"
|
||||
import { usePromptRealtimeVoiceInput } from "./usePromptRealtimeVoiceInput"
|
||||
import { createEffect, createSignal, onCleanup, type Accessor } from "solid-js"
|
||||
import { showAlertDialog } from "../../stores/alerts"
|
||||
import { loadSpeechCapabilities, speechCapabilities } from "../../stores/speech"
|
||||
import { serverApi } from "../../lib/api-client"
|
||||
import { useI18n } from "../../lib/i18n"
|
||||
|
||||
interface UsePromptVoiceInputOptions {
|
||||
prompt: Accessor<string>
|
||||
setPrompt: (value: string, options?: { persistDraft?: boolean }) => void
|
||||
setPrompt: (value: string) => void
|
||||
getTextarea: () => HTMLTextAreaElement | null
|
||||
enabled: Accessor<boolean>
|
||||
disabled: Accessor<boolean>
|
||||
useRealtime: Accessor<boolean>
|
||||
}
|
||||
|
||||
type VoiceInputState = "idle" | "recording" | "transcribing"
|
||||
|
||||
export function usePromptVoiceInput(options: UsePromptVoiceInputOptions) {
|
||||
const buffered = usePromptBufferedVoiceInput(options)
|
||||
const realtime = usePromptRealtimeVoiceInput(options)
|
||||
const { t } = useI18n()
|
||||
const [state, setState] = createSignal<VoiceInputState>("idle")
|
||||
const [elapsedMs, setElapsedMs] = createSignal(0)
|
||||
|
||||
const active = () => (options.useRealtime() ? realtime : buffered)
|
||||
let mediaRecorder: MediaRecorder | null = null
|
||||
let mediaStream: MediaStream | null = null
|
||||
let timerId: number | undefined
|
||||
let shouldTranscribe = true
|
||||
let recordedChunks: Blob[] = []
|
||||
let recordingStartedAt = 0
|
||||
|
||||
createEffect(() => {
|
||||
void loadSpeechCapabilities()
|
||||
})
|
||||
|
||||
onCleanup(() => {
|
||||
cleanupMedia(false)
|
||||
})
|
||||
|
||||
const isSupported = () => {
|
||||
if (typeof window === "undefined") return false
|
||||
return typeof window.MediaRecorder !== "undefined" && Boolean(navigator.mediaDevices?.getUserMedia)
|
||||
}
|
||||
|
||||
const canUseVoiceInput = () => {
|
||||
const capabilities = speechCapabilities()
|
||||
return Boolean(
|
||||
options.enabled() &&
|
||||
isSupported() &&
|
||||
capabilities?.available &&
|
||||
capabilities?.configured &&
|
||||
capabilities?.supportsStt,
|
||||
)
|
||||
}
|
||||
|
||||
async function toggleRecording(): Promise<void> {
|
||||
if (state() === "recording") {
|
||||
stopRecording()
|
||||
return
|
||||
}
|
||||
|
||||
if (!canUseVoiceInput() || options.disabled() || state() === "transcribing") return
|
||||
|
||||
try {
|
||||
await startRecording()
|
||||
} catch (error) {
|
||||
cleanupMedia(false)
|
||||
showAlertDialog(t("promptInput.voiceInput.error.permission"), {
|
||||
title: t("promptInput.voiceInput.error.title"),
|
||||
detail: error instanceof Error ? error.message : String(error),
|
||||
variant: "error",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
function stopRecording() {
|
||||
if (!mediaRecorder || state() !== "recording") return
|
||||
shouldTranscribe = true
|
||||
mediaRecorder.stop()
|
||||
setState("transcribing")
|
||||
stopTimer()
|
||||
}
|
||||
|
||||
function cancelRecording() {
|
||||
if (!mediaRecorder || state() !== "recording") return
|
||||
shouldTranscribe = false
|
||||
mediaRecorder.stop()
|
||||
cleanupMedia(false)
|
||||
}
|
||||
|
||||
async function startRecording() {
|
||||
if (!isSupported()) {
|
||||
showAlertDialog(t("promptInput.voiceInput.error.unsupported"), {
|
||||
title: t("promptInput.voiceInput.error.title"),
|
||||
variant: "error",
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
recordedChunks = []
|
||||
shouldTranscribe = true
|
||||
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true })
|
||||
mediaRecorder = createRecorder(mediaStream)
|
||||
|
||||
mediaRecorder.addEventListener("dataavailable", (event) => {
|
||||
if (event.data.size > 0) {
|
||||
recordedChunks.push(event.data)
|
||||
}
|
||||
})
|
||||
|
||||
mediaRecorder.addEventListener("stop", () => {
|
||||
void finalizeRecording()
|
||||
})
|
||||
|
||||
recordingStartedAt = Date.now()
|
||||
setElapsedMs(0)
|
||||
setState("recording")
|
||||
startTimer()
|
||||
mediaRecorder.start()
|
||||
}
|
||||
|
||||
/**
 * Runs after the recorder has stopped: releases the microphone, then either
 * discards the audio or sends it for transcription and inserts the result.
 *
 * The audio is discarded when `shouldTranscribe` is false or no chunks were
 * captured. A transcription failure is reported through an alert dialog
 * rather than thrown. In every case the state ends at "idle" with the
 * elapsed time reset to 0.
 */
async function finalizeRecording() {
  const recorder = mediaRecorder
  const stream = mediaStream
  mediaRecorder = null
  mediaStream = null

  // Recording is over, so release the microphone right away instead of
  // keeping it open for the duration of the transcription request.
  stopTracks(stream)

  // Take ownership of the captured chunks and clear the shared buffer before
  // any await, so nothing is wiped later by this call.
  const chunks = recordedChunks
  recordedChunks = []

  if (!shouldTranscribe || chunks.length === 0) {
    setState("idle")
    setElapsedMs(0)
    return
  }

  const mimeType = recorder?.mimeType || chunks[0]?.type || "audio/webm"

  try {
    const audioBlob = new Blob(chunks, { type: mimeType })
    const transcription = await serverApi.transcribeAudio({
      audioBase64: await blobToBase64(audioBlob),
      mimeType,
    })
    const text = transcription.text.trim()
    if (text) {
      insertTranscript(text)
    }
  } catch (error) {
    showAlertDialog(t("promptInput.voiceInput.error.transcribe"), {
      title: t("promptInput.voiceInput.error.title"),
      detail: error instanceof Error ? error.message : String(error),
      variant: "error",
    })
  } finally {
    setState("idle")
    setElapsedMs(0)
  }
}
|
||||
|
||||
/**
 * Inserts `text` into the prompt, replacing the textarea's current selection.
 *
 * When no textarea is available the text is appended at the end of the
 * prompt. A single space is added before and/or after the inserted text when
 * the neighbouring character is not already whitespace.
 *
 * @param text Transcript to insert.
 */
function insertTranscript(text: string) {
  const current = options.prompt()
  const textarea = options.getTextarea()
  const start = textarea ? textarea.selectionStart : current.length
  const end = textarea ? textarea.selectionEnd : current.length
  const before = current.slice(0, start)
  const after = current.slice(end)

  // Pad only where the transcript would otherwise touch non-whitespace text.
  const prefix = before.length > 0 && !/\s$/.test(before) ? " " : ""
  const suffix = after.length > 0 && !/^\s/.test(after) ? " " : ""
  const nextValue = `${before}${prefix}${text}${suffix}${after}`

  // The caret lands right after the inserted text, before any trailing pad.
  const cursor = before.length + prefix.length + text.length

  options.setPrompt(nextValue)
  if (textarea) {
    // Deferred to the next task; presumably so the new prompt value has
    // reached the textarea before the caret is moved (confirm against caller).
    setTimeout(() => {
      textarea.focus()
      textarea.setSelectionRange(cursor, cursor)
    }, 0)
  }
}
|
||||
|
||||
/**
 * Releases every recording resource: timer, recorder, media tracks and the
 * buffered chunks.
 *
 * @param resetState When true (the default), also sets the state to "idle"
 *   and the elapsed time to 0. Pass false to leave both untouched.
 */
function cleanupMedia(resetState = true) {
  stopTimer()

  // Only stop a recorder that has not already been stopped.
  if (mediaRecorder && mediaRecorder.state !== "inactive") {
    mediaRecorder.stop()
  }
  mediaRecorder = null

  stopTracks(mediaStream)
  mediaStream = null
  recordedChunks = []

  if (resetState) {
    setState("idle")
    setElapsedMs(0)
  }
}
|
||||
|
||||
/**
 * Starts the interval that refreshes the elapsed recording time every 250 ms,
 * measured from `recordingStartedAt`. Any previously running timer is cleared
 * first.
 */
function startTimer() {
  stopTimer()
  timerId = window.setInterval(() => {
    setElapsedMs(Date.now() - recordingStartedAt)
  }, 250)
}
|
||||
|
||||
/**
 * Clears the elapsed-time interval if one is running. Safe to call when no
 * timer is active.
 */
function stopTimer() {
  if (timerId !== undefined) {
    window.clearInterval(timerId)
    timerId = undefined
  }
}
|
||||
|
||||
return {
|
||||
state: () => active().state(),
|
||||
elapsedMs: () => active().elapsedMs(),
|
||||
canUseVoiceInput: () => active().canUseVoiceInput(),
|
||||
toggleRecording: () => active().toggleRecording(),
|
||||
cancelRecording: () => active().cancelRecording(),
|
||||
isRecording: () => active().isRecording(),
|
||||
isTranscribing: () => active().isTranscribing(),
|
||||
buttonTitle: () => active().buttonTitle(),
|
||||
state,
|
||||
elapsedMs,
|
||||
canUseVoiceInput,
|
||||
toggleRecording,
|
||||
cancelRecording,
|
||||
isRecording: () => state() === "recording",
|
||||
isTranscribing: () => state() === "transcribing",
|
||||
buttonTitle: () => {
|
||||
if (state() === "recording") return t("promptInput.voiceInput.stop.title")
|
||||
if (state() === "transcribing") return t("promptInput.voiceInput.transcribing.title")
|
||||
return t("promptInput.voiceInput.start.title")
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Creates a `MediaRecorder` for `stream`, using the first container/codec in
 * the preference list that the browser reports as supported.
 *
 * Falls back to the browser's default format when none of the candidates is
 * supported, or when `MediaRecorder.isTypeSupported` is unavailable and
 * support therefore cannot be checked.
 *
 * @param stream Audio stream to record.
 * @returns A recorder bound to `stream`.
 */
function createRecorder(stream: MediaStream): MediaRecorder {
  // Without feature detection, forcing a specific MIME type could make the
  // constructor throw on browsers that do not support it; let the browser pick.
  if (typeof MediaRecorder.isTypeSupported !== "function") {
    return new MediaRecorder(stream)
  }

  const candidates = ["audio/webm;codecs=opus", "audio/webm", "audio/mp4", "audio/ogg;codecs=opus"]
  const supported = candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate))
  return supported ? new MediaRecorder(stream, { mimeType: supported }) : new MediaRecorder(stream)
}
|
||||
|
||||
/**
 * Stops every track of `stream`. Does nothing when `stream` is null.
 *
 * @param stream Stream whose tracks should be stopped, or null.
 */
function stopTracks(stream: MediaStream | null) {
  stream?.getTracks().forEach((track) => track.stop())
}
|
||||
|
||||
/**
 * Encodes the contents of `blob` as a base64 string.
 *
 * @param blob Binary data to encode.
 * @returns The base64 encoding of the blob's bytes ("" for an empty blob).
 */
async function blobToBase64(blob: Blob): Promise<string> {
  const buffer = await blob.arrayBuffer()
  const bytes = new Uint8Array(buffer)

  // btoa expects a "binary string" in which each character code is one byte.
  let binary = ""
  for (const byte of bytes) {
    binary += String.fromCharCode(byte)
  }
  return btoa(binary)
}
|
||||
|
||||
@@ -10,8 +10,6 @@ const log = getLogger("actions")
|
||||
type DraftFields = {
|
||||
apiKey: string
|
||||
baseUrl: string
|
||||
useRealtime: boolean
|
||||
realtimeModel: string
|
||||
sttModel: string
|
||||
ttsModel: string
|
||||
ttsVoice: string
|
||||
@@ -21,8 +19,6 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
|
||||
return {
|
||||
apiKey: speech.apiKey ?? "",
|
||||
baseUrl: speech.baseUrl ?? "",
|
||||
useRealtime: speech.useRealtime,
|
||||
realtimeModel: speech.realtimeModel,
|
||||
sttModel: speech.sttModel,
|
||||
ttsModel: speech.ttsModel,
|
||||
ttsVoice: speech.ttsVoice,
|
||||
@@ -30,7 +26,7 @@ function createDraftFields(speech: SpeechSettings): DraftFields {
|
||||
}
|
||||
|
||||
function isDraftEqual(a: DraftFields, b: DraftFields): boolean {
|
||||
return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.useRealtime === b.useRealtime && a.realtimeModel === b.realtimeModel && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
|
||||
return a.apiKey === b.apiKey && a.baseUrl === b.baseUrl && a.sttModel === b.sttModel && a.ttsModel === b.ttsModel && a.ttsVoice === b.ttsVoice
|
||||
}
|
||||
|
||||
export const SpeechSettingsCard: Component = () => {
|
||||
@@ -61,7 +57,7 @@ export const SpeechSettingsCard: Component = () => {
|
||||
return speechCapabilities()?.configured ? t("settings.speech.status.configured") : t("settings.speech.status.missing")
|
||||
}
|
||||
|
||||
const updateDraft = <K extends keyof DraftFields>(key: K, value: DraftFields[K]) => {
|
||||
const updateDraft = (key: keyof DraftFields, value: string) => {
|
||||
setSaveStatus("idle")
|
||||
setDrafts((current) => ({ ...current, [key]: value }))
|
||||
}
|
||||
@@ -69,14 +65,12 @@ export const SpeechSettingsCard: Component = () => {
|
||||
const isDirty = createMemo(() => {
|
||||
const speech = serverSettings().speech
|
||||
const current = drafts()
|
||||
return (
|
||||
(current.apiKey || "") !== (speech.apiKey || "") ||
|
||||
(current.baseUrl || "") !== (speech.baseUrl || "") ||
|
||||
current.useRealtime !== speech.useRealtime ||
|
||||
current.realtimeModel !== speech.realtimeModel ||
|
||||
current.sttModel !== speech.sttModel ||
|
||||
current.ttsModel !== speech.ttsModel ||
|
||||
current.ttsVoice !== speech.ttsVoice
|
||||
return (
|
||||
(current.apiKey || "") !== (speech.apiKey || "") ||
|
||||
(current.baseUrl || "") !== (speech.baseUrl || "") ||
|
||||
current.sttModel !== speech.sttModel ||
|
||||
current.ttsModel !== speech.ttsModel ||
|
||||
current.ttsVoice !== speech.ttsVoice
|
||||
)
|
||||
})
|
||||
|
||||
@@ -96,8 +90,6 @@ export const SpeechSettingsCard: Component = () => {
|
||||
await updateSpeechSettings({
|
||||
apiKey: current.apiKey.trim() || undefined,
|
||||
baseUrl: current.baseUrl.trim() || undefined,
|
||||
useRealtime: current.useRealtime,
|
||||
realtimeModel: current.realtimeModel.trim() || undefined,
|
||||
sttModel: current.sttModel.trim() || undefined,
|
||||
ttsModel: current.ttsModel.trim() || undefined,
|
||||
ttsVoice: current.ttsVoice.trim() || undefined,
|
||||
@@ -106,8 +98,6 @@ export const SpeechSettingsCard: Component = () => {
|
||||
setDrafts({
|
||||
apiKey: current.apiKey.trim(),
|
||||
baseUrl: current.baseUrl.trim(),
|
||||
useRealtime: current.useRealtime,
|
||||
realtimeModel: current.realtimeModel.trim() || serverSettings().speech.realtimeModel,
|
||||
sttModel: current.sttModel.trim() || serverSettings().speech.sttModel,
|
||||
ttsModel: current.ttsModel.trim() || serverSettings().speech.ttsModel,
|
||||
ttsVoice: current.ttsVoice.trim() || serverSettings().speech.ttsVoice,
|
||||
@@ -169,27 +159,6 @@ export const SpeechSettingsCard: Component = () => {
|
||||
onInput={(value) => updateDraft("baseUrl", value)}
|
||||
placeholder={t("settings.speech.baseUrl.placeholder")}
|
||||
/>
|
||||
<div class="settings-toggle-row">
|
||||
<div>
|
||||
<div class="settings-toggle-title">{t("settings.speech.realtime.title")}</div>
|
||||
<div class="settings-toggle-caption">{t("settings.speech.realtime.subtitle")}</div>
|
||||
</div>
|
||||
<label class="settings-checkbox-toggle">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={drafts().useRealtime}
|
||||
onChange={(event) => updateDraft("useRealtime", event.currentTarget.checked)}
|
||||
/>
|
||||
<span>{t("settings.common.enabled")}</span>
|
||||
</label>
|
||||
</div>
|
||||
<Field
|
||||
label={t("settings.speech.realtimeModel.title")}
|
||||
caption={t("settings.speech.realtimeModel.subtitle")}
|
||||
value={drafts().realtimeModel}
|
||||
onInput={(value) => updateDraft("realtimeModel", value)}
|
||||
placeholder={t("settings.speech.realtimeModel.placeholder")}
|
||||
/>
|
||||
<Field
|
||||
label={t("settings.speech.sttModel.title")}
|
||||
caption={t("settings.speech.sttModel.subtitle")}
|
||||
|
||||
Reference in New Issue
Block a user