// React hook for hold-to-talk voice input using Anthropic voice_stream STT. // // Hold the keybinding to record; release to stop and submit. Auto-repeat // key events reset an internal timer — when no keypress arrives within // RELEASE_TIMEOUT_MS the recording stops automatically. Uses the native // audio module (macOS) or SoX for recording, and Anthropic's voice_stream // endpoint (conversation_engine) for STT. import { useCallback, useEffect, useRef, useState } from 'react' import { useSetVoiceState } from '../context/voice.js' import { useTerminalFocus } from '../ink/hooks/use-terminal-focus.js' import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, logEvent, } from '../services/analytics/index.js' import { getVoiceKeyterms } from '../services/voiceKeyterms.js' import { connectVoiceStream, type FinalizeSource, isVoiceStreamAvailable, type VoiceStreamConnection, } from '../services/voiceStreamSTT.js' import { logForDebugging } from '../utils/debug.js' import { toError } from '../utils/errors.js' import { getSystemLocaleLanguage } from '../utils/intl.js' import { logError } from '../utils/log.js' import { getInitialSettings } from '../utils/settings/settings.js' import { sleep } from '../utils/sleep.js' // ─── Language normalization ───────────────────────────────────────────── const DEFAULT_STT_LANGUAGE = 'en' // Maps language names (English and native) to BCP-47 codes supported by // the voice_stream Deepgram backend. Keys must be lowercase. // // This list must be a SUBSET of the server-side supported_language_codes // allowlist (GrowthBook: speech_to_text_voice_stream_config). // If the CLI sends a code the server rejects, the WebSocket closes with // 1008 "Unsupported language" and voice breaks. Unsupported languages // fall back to DEFAULT_STT_LANGUAGE so recording still works. 
const LANGUAGE_NAME_TO_CODE: Record<string, string> = {
  english: 'en',
  spanish: 'es',
  español: 'es',
  espanol: 'es',
  french: 'fr',
  français: 'fr',
  francais: 'fr',
  japanese: 'ja',
  日本語: 'ja',
  german: 'de',
  deutsch: 'de',
  portuguese: 'pt',
  português: 'pt',
  portugues: 'pt',
  italian: 'it',
  italiano: 'it',
  korean: 'ko',
  한국어: 'ko',
  hindi: 'hi',
  हिन्दी: 'hi',
  हिंदी: 'hi',
  indonesian: 'id',
  'bahasa indonesia': 'id',
  bahasa: 'id',
  russian: 'ru',
  русский: 'ru',
  polish: 'pl',
  polski: 'pl',
  turkish: 'tr',
  türkçe: 'tr',
  turkce: 'tr',
  dutch: 'nl',
  nederlands: 'nl',
  ukrainian: 'uk',
  українська: 'uk',
  greek: 'el',
  ελληνικά: 'el',
  czech: 'cs',
  čeština: 'cs',
  cestina: 'cs',
  danish: 'da',
  dansk: 'da',
  swedish: 'sv',
  svenska: 'sv',
  norwegian: 'no',
  norsk: 'no',
}

// Subset of the GrowthBook speech_to_text_voice_stream_config allowlist.
// Sending a code not in the server allowlist closes the connection.
const SUPPORTED_LANGUAGE_CODES = new Set([
  'en',
  'es',
  'fr',
  'ja',
  'de',
  'pt',
  'it',
  'ko',
  'hi',
  'id',
  'ru',
  'pl',
  'tr',
  'nl',
  'uk',
  'el',
  'cs',
  'da',
  'sv',
  'no',
])

// Normalize a language preference string (from settings.language) to a
// BCP-47 code supported by the voice_stream endpoint.
//
// Resolution order (input is lowercased and trimmed first):
//   1. an exact supported code ('fr')
//   2. a known language name, English or native ('français', 'french')
//   3. the primary subtag of a region-qualified tag ('pt-BR' or 'pt_BR' → 'pt')
//
// Returns the default language if the input cannot be resolved. When the
// input is non-empty but unsupported, fellBackFrom is set to the original
// (un-normalized) input so callers can surface a warning. Missing, empty or
// whitespace-only input yields the default with no fellBackFrom.
export function normalizeLanguageForSTT(language: string | undefined): {
  code: string
  fellBackFrom?: string
} {
  if (!language) return { code: DEFAULT_STT_LANGUAGE }
  const lower = language.toLowerCase().trim()
  if (!lower) return { code: DEFAULT_STT_LANGUAGE }
  if (SUPPORTED_LANGUAGE_CODES.has(lower)) return { code: lower }
  // Own-property check: a bare index on a plain object also resolves
  // inherited members, so inputs like "constructor" or "__proto__" would
  // otherwise yield a truthy non-string and be sent to the server as the code.
  if (Object.prototype.hasOwnProperty.call(LANGUAGE_NAME_TO_CODE, lower)) {
    const fromName = LANGUAGE_NAME_TO_CODE[lower]
    if (fromName) return { code: fromName }
  }
  // Accept both BCP-47 ('pt-BR') and POSIX-locale ('pt_BR') separators.
  const base = lower.split(/[-_]/)[0]
  if (base && SUPPORTED_LANGUAGE_CODES.has(base)) return { code: base }
  return { code: DEFAULT_STT_LANGUAGE, fellBackFrom: language }
}

// Lazy-loaded voice module.
// We defer importing voice.ts (and its native
// audio-capture-napi dependency) until voice input is actually activated.
// On macOS, loading the native audio module can trigger a TCC microphone
// permission prompt — we must avoid that until voice input is actually enabled.
type VoiceModule = typeof import('../services/voice.js')
// Module-level cache for the dynamically imported voice module. Stays null
// until an import of '../services/voice.js' has resolved; shared by every
// instance of the hook in this process.
let voiceModule: VoiceModule | null = null

// Phases of a voice session as tracked by the hook:
//   idle       — no session in progress
//   recording  — a recording session has been started
//   processing — recording stopped; waiting for the transcript to be finalized
type VoiceState = 'idle' | 'recording' | 'processing'

// Options accepted by useVoice.
type UseVoiceOptions = {
  // Receives trimmed transcript text: each final transcript as it arrives in
  // focus mode, or the accumulated text once a hold-to-talk recording finishes.
  onTranscript: (text: string) => void
  // Optional. Receives a user-facing message when recording or the voice
  // stream fails, or when a recording produced no transcript.
  onError?: (message: string) => void
  // When false, key events are ignored and the voice module is not imported.
  enabled: boolean
  // When true (and enabled), recording starts when the terminal gains focus
  // and stops when it loses focus, instead of being driven by the key hold.
  focusMode: boolean
}

// Value returned by useVoice.
type UseVoiceReturn = {
  // Current phase of the voice session.
  state: VoiceState
  // Call on every press of the voice key, including terminal auto-repeats.
  // fallbackMs overrides how long to wait for auto-repeat before arming the
  // release timer; it defaults to REPEAT_FALLBACK_MS.
  handleKeyEvent: (fallbackMs?: number) => void
}

// Gap (ms) between auto-repeat key events that signals key release.
// Terminal auto-repeat typically fires every 30-80ms; 200ms comfortably
// covers jitter while still feeling responsive.
const RELEASE_TIMEOUT_MS = 200

// Fallback (ms) to arm the release timer if no auto-repeat is seen.
// macOS default key repeat delay is ~500ms; 600ms gives headroom.
// If the user tapped and released before auto-repeat started, this
// ensures the release timer gets armed and recording stops.
//
// For modifier-combo first-press activation (handleKeyEvent called at
// t=0, before any auto-repeat), callers should pass FIRST_PRESS_FALLBACK_MS
// instead — the gap to the next keypress is the OS initial repeat *delay*
// (up to ~2s on macOS with slider at "Long"), not the repeat *rate*.
const REPEAT_FALLBACK_MS = 600
export const FIRST_PRESS_FALLBACK_MS = 2000

// How long (ms) to keep a focus-mode session alive without any speech
// before tearing it down to free the WebSocket connection. Re-arms on
// the next focus cycle (blur → refocus).
const FOCUS_SILENCE_TIMEOUT_MS = 5_000

// Number of bars shown in the recording waveform visualizer.
const AUDIO_LEVEL_BARS = 16

// Compute RMS amplitude from a 16-bit signed PCM buffer and return a
// normalized 0-1 value.
A sqrt curve spreads quieter levels across more // of the visual range so the waveform uses the full set of block heights. export function computeLevel(chunk: Buffer): number { const samples = chunk.length >> 1 // 16-bit = 2 bytes per sample if (samples === 0) return 0 let sumSq = 0 for (let i = 0; i < chunk.length - 1; i += 2) { // Read 16-bit signed little-endian const sample = ((chunk[i]! | (chunk[i + 1]! << 8)) << 16) >> 16 sumSq += sample * sample } const rms = Math.sqrt(sumSq / samples) const normalized = Math.min(rms / 2000, 1) return Math.sqrt(normalized) } export function useVoice({ onTranscript, onError, enabled, focusMode, }: UseVoiceOptions): UseVoiceReturn { const [state, setState] = useState('idle') const stateRef = useRef('idle') const connectionRef = useRef(null) const accumulatedRef = useRef('') const onTranscriptRef = useRef(onTranscript) const onErrorRef = useRef(onError) const cleanupTimerRef = useRef | null>(null) const releaseTimerRef = useRef | null>(null) // True once we've seen a second keypress (auto-repeat) while recording. // The OS key repeat delay (~500ms on macOS) means the first keypress is // solo — arming the release timer before auto-repeat starts would cause // a false release. const seenRepeatRef = useRef(false) const repeatFallbackTimerRef = useRef | null>( null, ) // True when the current recording session was started by terminal focus // (not by a keypress). Focus-driven sessions end on blur, not key release. const focusTriggeredRef = useRef(false) // Timer that tears down the session after prolonged silence in focus mode. const focusSilenceTimerRef = useRef | null>( null, ) // Set when a focus-mode session is torn down due to silence. Prevents // the focus effect from immediately restarting. Cleared on blur so the // next focus cycle re-arms recording. const silenceTimedOutRef = useRef(false) const recordingStartRef = useRef(0) // Incremented on each startRecordingSession(). 
Callbacks capture their // generation and bail if a newer session has started — prevents a zombie // slow-connecting WS from an abandoned session from overwriting // connectionRef mid-way through the next session. const sessionGenRef = useRef(0) // True if the early-error retry fired during this session. // Tracked for the tengu_voice_recording_completed analytics event. const retryUsedRef = useRef(false) // Full audio captured this session, kept for silent-drop replay. ~1% of // sessions get a sticky-broken CE pod that accepts audio but returns zero // transcripts (anthropics/anthropic#287008 session-sticky variant); when // finalize() resolves via no_data_timeout with hadAudioSignal=true, we // replay the buffer on a fresh WS once. Bounded: 32KB/s × ~60s max ≈ 2MB. const fullAudioRef = useRef([]) const silentDropRetriedRef = useRef(false) // Bumped when the early-error retry is scheduled. Captured per // attemptConnect — onError swallows stale-gen events (conn 1's // trailing close-error) but surfaces current-gen ones (conn 2's // genuine failure). Same shape as sessionGenRef, one level down. const attemptGenRef = useRef(0) // Running total of chars flushed in focus mode (each final transcript is // injected immediately and accumulatedRef reset). Added to transcriptChars // in the completed event so focus-mode sessions don't false-positive as // silent-drops (transcriptChars=0 despite successful transcription). const focusFlushedCharsRef = useRef(0) // True if at least one audio chunk with non-trivial signal was received. // Used to distinguish "microphone is silent/inaccessible" from "speech not detected". const hasAudioSignalRef = useRef(false) // True once onReady fired for the current session. Unlike connectionRef // (which cleanup() nulls), this survives effect-order races where Effect 3 // cleanup runs before Effect 2's finishRecording() — e.g. /voice toggled // off mid-recording in focus mode. 
Used for the wsConnected analytics // dimension and error-message branching. Reset in startRecordingSession. const everConnectedRef = useRef(false) const audioLevelsRef = useRef([]) const isFocused = useTerminalFocus() const setVoiceState = useSetVoiceState() // Keep callback refs current without triggering re-renders onTranscriptRef.current = onTranscript onErrorRef.current = onError function updateState(newState: VoiceState): void { stateRef.current = newState setState(newState) setVoiceState(prev => { if (prev.voiceState === newState) return prev return { ...prev, voiceState: newState } }) } const cleanup = useCallback((): void => { // Stale any in-flight session (main connection isStale(), replay // isStale(), finishRecording continuation). Without this, disabling // voice during the replay window lets the stale replay open a WS, // accumulate transcript, and inject it after voice was torn down. sessionGenRef.current++ if (cleanupTimerRef.current) { clearTimeout(cleanupTimerRef.current) cleanupTimerRef.current = null } if (releaseTimerRef.current) { clearTimeout(releaseTimerRef.current) releaseTimerRef.current = null } if (repeatFallbackTimerRef.current) { clearTimeout(repeatFallbackTimerRef.current) repeatFallbackTimerRef.current = null } if (focusSilenceTimerRef.current) { clearTimeout(focusSilenceTimerRef.current) focusSilenceTimerRef.current = null } silenceTimedOutRef.current = false voiceModule?.stopRecording() if (connectionRef.current) { connectionRef.current.close() connectionRef.current = null } accumulatedRef.current = '' audioLevelsRef.current = [] fullAudioRef.current = [] setVoiceState(prev => { if (prev.voiceInterimTranscript === '' && !prev.voiceAudioLevels.length) return prev return { ...prev, voiceInterimTranscript: '', voiceAudioLevels: [] } }) }, [setVoiceState]) function finishRecording(): void { logForDebugging( '[voice] finishRecording: stopping recording, transitioning to processing', ) // Session ending — stale any in-flight attempt so 
its late onError // (conn 2 responding after user released key) doesn't double-fire on // top of the "check network" message below. attemptGenRef.current++ // Capture focusTriggered BEFORE clearing it — needed as an event dimension // so BigQuery can filter out passive focus-mode auto-recordings (user focused // terminal without speaking → ambient noise sets hadAudioSignal=true → false // silent-drop signature). focusFlushedCharsRef fixes transcriptChars accuracy // for sessions WITH speech; focusTriggered enables filtering sessions WITHOUT. const focusTriggered = focusTriggeredRef.current focusTriggeredRef.current = false updateState('processing') voiceModule?.stopRecording() // Capture duration BEFORE the finalize round-trip so that the WebSocket // wait time is not included (otherwise a quick tap looks like > 2s). // All ref-backed values are captured here, BEFORE the async boundary — // a keypress during the finalize wait can start a new session and reset // these refs (e.g. focusFlushedCharsRef = 0 in startRecordingSession), // reproducing the silent-drop false-positive this ref exists to prevent. const recordingDurationMs = Date.now() - recordingStartRef.current const hadAudioSignal = hasAudioSignalRef.current const retried = retryUsedRef.current const focusFlushedChars = focusFlushedCharsRef.current // wsConnected distinguishes "backend received audio but dropped it" (the // bug backend PR #287008 fixes) from "WS handshake never completed" — // in the latter case audio is still in audioBuffer, never reached the // server, but hasAudioSignalRef is already true from ambient noise. const wsConnected = everConnectedRef.current // Capture generation BEFORE the .then() — if a new session starts during // the finalize wait, sessionGenRef has already advanced by the time the // continuation runs, so capturing inside the .then() would yield the new // session's gen and every staleness check would be a no-op. 
const myGen = sessionGenRef.current const isStale = () => sessionGenRef.current !== myGen logForDebugging('[voice] Recording stopped') // Send finalize and wait for the WebSocket to close before reading the // accumulated transcript. The close handler promotes any unreported // interim text to final, so we must wait for it to fire. const finalizePromise: Promise = connectionRef.current ? connectionRef.current.finalize() : Promise.resolve(undefined) void finalizePromise .then(async finalizeSource => { if (isStale()) return // Silent-drop replay: when the server accepted audio (wsConnected), // the mic captured real signal (hadAudioSignal), but finalize timed // out with zero transcript — the ~1% session-sticky CE-pod bug. // Replay the buffered audio on a fresh connection once. A 250ms // backoff clears the same-pod rapid-reconnect race (same gap as the // early-error retry path below). if ( finalizeSource === 'no_data_timeout' && hadAudioSignal && wsConnected && !focusTriggered && focusFlushedChars === 0 && accumulatedRef.current.trim() === '' && !silentDropRetriedRef.current && fullAudioRef.current.length > 0 ) { silentDropRetriedRef.current = true logForDebugging( `[voice] Silent-drop detected (no_data_timeout, ${String(fullAudioRef.current.length)} chunks); replaying on fresh connection`, ) logEvent('tengu_voice_silent_drop_replay', { recordingDurationMs, chunkCount: fullAudioRef.current.length, }) if (connectionRef.current) { connectionRef.current.close() connectionRef.current = null } const replayBuffer = fullAudioRef.current await sleep(250) if (isStale()) return const stt = normalizeLanguageForSTT(getInitialSettings().language) const keyterms = await getVoiceKeyterms() if (isStale()) return await new Promise(resolve => { void connectVoiceStream( { onTranscript: (t, isFinal) => { if (isStale()) return if (isFinal && t.trim()) { if (accumulatedRef.current) accumulatedRef.current += ' ' accumulatedRef.current += t.trim() } }, onError: () => resolve(), onClose: 
() => {}, onReady: conn => { if (isStale()) { conn.close() resolve() return } connectionRef.current = conn const SLICE = 32_000 let slice: Buffer[] = [] let bytes = 0 for (const c of replayBuffer) { if (bytes > 0 && bytes + c.length > SLICE) { conn.send(Buffer.concat(slice)) slice = [] bytes = 0 } slice.push(c) bytes += c.length } if (slice.length) conn.send(Buffer.concat(slice)) void conn.finalize().then(() => { conn.close() resolve() }) }, }, { language: stt.code, keyterms }, ).then( c => { if (!c) resolve() }, () => resolve(), ) }) if (isStale()) return } fullAudioRef.current = [] const text = accumulatedRef.current.trim() logForDebugging( `[voice] Final transcript assembled (${String(text.length)} chars): "${text.slice(0, 200)}"`, ) // Tracks silent-drop rate: transcriptChars=0 + hadAudioSignal=true // + recordingDurationMs>2000 = the bug backend PR #287008 fixes. // focusFlushedCharsRef makes transcriptChars accurate for focus mode // (where each final is injected immediately and accumulatedRef reset). // // NOTE: this fires only on the finishRecording() path. The onError // fallthrough and !conn (no-OAuth) paths bypass this → don't compute // COUNT(completed)/COUNT(started) as a success rate; the silent-drop // denominator (completed events only) is internally consistent. logEvent('tengu_voice_recording_completed', { transcriptChars: text.length + focusFlushedChars, recordingDurationMs, hadAudioSignal, retried, silentDropRetried: silentDropRetriedRef.current, wsConnected, focusTriggered, }) if (connectionRef.current) { connectionRef.current.close() connectionRef.current = null } if (text) { logForDebugging( `[voice] Injecting transcript (${String(text.length)} chars)`, ) onTranscriptRef.current(text) } else if (focusFlushedChars === 0 && recordingDurationMs > 2000) { // Only warn about empty transcript if nothing was flushed in focus // mode either, and recording was > 2s (short recordings = accidental // taps → silently return to idle). 
if (!wsConnected) { // WS never connected → audio never reached backend. Not a silent // drop; a connection failure (slow OAuth refresh, network, etc). onErrorRef.current?.( 'Voice connection failed. Check your network and try again.', ) } else if (!hadAudioSignal) { // Distinguish silent mic (capture issue) from speech not recognized. onErrorRef.current?.( 'No audio detected from microphone. Check that the correct input device is selected and that Claude Code has microphone access.', ) } else { onErrorRef.current?.('No speech detected.') } } accumulatedRef.current = '' setVoiceState(prev => { if (prev.voiceInterimTranscript === '') return prev return { ...prev, voiceInterimTranscript: '' } }) updateState('idle') }) .catch(err => { logError(toError(err)) if (!isStale()) updateState('idle') }) } // When voice is enabled, lazy-import voice.ts so checkRecordingAvailability // et al. are ready when the user presses the voice key. Do NOT preload the // native module — require('audio-capture.node') is a synchronous dlopen of // CoreAudio/AudioUnit that blocks the event loop for ~1s (warm) to ~8s // (cold coreaudiod). setImmediate doesn't help: it yields one tick, then the // dlopen still blocks. The first voice keypress pays the dlopen cost instead. useEffect(() => { if (enabled && !voiceModule) { void import('../services/voice.js').then(mod => { voiceModule = mod }) } }, [enabled]) // ── Focus silence timer ──────────────────────────────────────────── // Arms (or resets) a timer that tears down the focus-mode session // after FOCUS_SILENCE_TIMEOUT_MS of no speech. Called when a session // starts and after each flushed transcript. 
function armFocusSilenceTimer(): void { if (focusSilenceTimerRef.current) { clearTimeout(focusSilenceTimerRef.current) } focusSilenceTimerRef.current = setTimeout( ( focusSilenceTimerRef, stateRef, focusTriggeredRef, silenceTimedOutRef, finishRecording, ) => { focusSilenceTimerRef.current = null if (stateRef.current === 'recording' && focusTriggeredRef.current) { logForDebugging( '[voice] Focus silence timeout — tearing down session', ) silenceTimedOutRef.current = true finishRecording() } }, FOCUS_SILENCE_TIMEOUT_MS, focusSilenceTimerRef, stateRef, focusTriggeredRef, silenceTimedOutRef, finishRecording, ) } // ── Focus-driven recording ────────────────────────────────────────── // In focus mode, start recording when the terminal gains focus and // stop when it loses focus. This enables a "multi-clauding army" // workflow where voice input follows window focus. useEffect(() => { if (!enabled || !focusMode) { // Focus mode was disabled while a focus-driven recording was active — // stop the recording so it doesn't linger until the silence timer fires. if (focusTriggeredRef.current && stateRef.current === 'recording') { logForDebugging( '[voice] Focus mode disabled during recording, finishing', ) finishRecording() } return } let cancelled = false if ( isFocused && stateRef.current === 'idle' && !silenceTimedOutRef.current ) { const beginFocusRecording = (): void => { // Re-check conditions — state or enabled/focusMode may have changed // during the await (effect cleanup sets cancelled). if ( cancelled || stateRef.current !== 'idle' || silenceTimedOutRef.current ) return logForDebugging('[voice] Focus gained, starting recording session') focusTriggeredRef.current = true void startRecordingSession() armFocusSilenceTimer() } if (voiceModule) { beginFocusRecording() } else { // Voice module is loading (async import resolves from cache as a // microtask). Wait for it before starting the recording session. 
void import('../services/voice.js').then(mod => { voiceModule = mod beginFocusRecording() }) } } else if (!isFocused) { // Clear the silence timeout flag on blur so the next focus // cycle re-arms recording. silenceTimedOutRef.current = false if (stateRef.current === 'recording') { logForDebugging('[voice] Focus lost, finishing recording') finishRecording() } } return () => { cancelled = true } }, [enabled, focusMode, isFocused]) // ── Start a new recording session (voice_stream connect + audio) ── async function startRecordingSession(): Promise { if (!voiceModule) { onErrorRef.current?.( 'Voice module not loaded yet. Try again in a moment.', ) return } // Transition to 'recording' synchronously, BEFORE any await. Callers // read state synchronously right after `void startRecordingSession()`: // - useVoiceIntegration.tsx space-hold guard reads voiceState from the // store immediately — if it sees 'idle' it clears isSpaceHoldActiveRef // and space auto-repeat leaks into the text input (100% repro) // - handleKeyEvent's `currentState === 'idle'` re-entry check below // If an await runs first, both see stale 'idle'. See PR #20873 review. updateState('recording') recordingStartRef.current = Date.now() accumulatedRef.current = '' seenRepeatRef.current = false hasAudioSignalRef.current = false retryUsedRef.current = false silentDropRetriedRef.current = false fullAudioRef.current = [] focusFlushedCharsRef.current = 0 everConnectedRef.current = false const myGen = ++sessionGenRef.current // ── Pre-check: can we actually record audio? ────────────── const availability = await voiceModule.checkRecordingAvailability() if (!availability.available) { logForDebugging( `[voice] Recording not available: ${availability.reason ?? 'unknown'}`, ) onErrorRef.current?.( availability.reason ?? 
'Audio recording is not available.', ) cleanup() updateState('idle') return } logForDebugging( '[voice] Starting recording session, connecting voice stream', ) // Clear any previous error setVoiceState(prev => { if (!prev.voiceError) return prev return { ...prev, voiceError: null } }) // Buffer audio chunks while the WebSocket connects. Once the connection // is ready (onReady fires), buffered chunks are flushed and subsequent // chunks are sent directly. const audioBuffer: Buffer[] = [] // Start recording IMMEDIATELY — audio is buffered until the WebSocket // opens, eliminating the 1-2s latency from waiting for OAuth + WS connect. logForDebugging( '[voice] startRecording: buffering audio while WebSocket connects', ) audioLevelsRef.current = [] const started = await voiceModule.startRecording( (chunk: Buffer) => { // Copy for fullAudioRef replay buffer. send() in voiceStreamSTT // copies again defensively — acceptable overhead at audio rates. // Skip buffering in focus mode — replay is gated on !focusTriggered // so the buffer is dead weight (up to ~20MB for a 10min session). const owned = Buffer.from(chunk) if (!focusTriggeredRef.current) { fullAudioRef.current.push(owned) } if (connectionRef.current) { connectionRef.current.send(owned) } else { audioBuffer.push(owned) } // Update audio level histogram for the recording visualizer const level = computeLevel(chunk) if (!hasAudioSignalRef.current && level > 0.01) { hasAudioSignalRef.current = true } const levels = audioLevelsRef.current if (levels.length >= AUDIO_LEVEL_BARS) { levels.shift() } levels.push(level) // Copy the array so React sees a new reference const snapshot = [...levels] audioLevelsRef.current = snapshot setVoiceState(prev => ({ ...prev, voiceAudioLevels: snapshot })) }, () => { // External end (e.g. 
device error) - treat as stop if (stateRef.current === 'recording') { finishRecording() } }, { silenceDetection: false }, ) if (!started) { logError(new Error('[voice] Recording failed — no audio tool found')) onErrorRef.current?.( 'Failed to start audio capture. Check that your microphone is accessible.', ) cleanup() updateState('idle') setVoiceState(prev => ({ ...prev, voiceError: 'Recording failed — no audio tool found', })) return } const rawLanguage = getInitialSettings().language const stt = normalizeLanguageForSTT(rawLanguage) logEvent('tengu_voice_recording_started', { focusTriggered: focusTriggeredRef.current, sttLanguage: stt.code as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, sttLanguageIsDefault: !rawLanguage?.trim(), sttLanguageFellBack: stt.fellBackFrom !== undefined, // ISO 639 subtag from Intl (bounded set, never user text). undefined if // Intl failed — omitted from the payload, no retry cost (cached). systemLocaleLanguage: getSystemLocaleLanguage() as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, }) // Retry once if the connection errors before delivering any transcript. // The conversation-engine proxy can reject rapid reconnects (~1/N_pods // same-pod collision) or CE's Deepgram upstream can fail during its own // teardown window (anthropics/anthropic#287008 surfaces this as // TranscriptError instead of silent-drop). A 250ms backoff clears both. // Audio captured during the retry window routes to audioBuffer (via the // connectionRef.current null check in the recording callback above) and // is flushed by the second onReady. let sawTranscript = false // Connect WebSocket in parallel with audio recording. // Gather keyterms first (async but fast — no model calls), then connect. // Bail from callbacks if a newer session has started. Prevents a // slow-connecting zombie WS (e.g. 
user released, pressed again, first // WS still handshaking) from firing onReady/onError into the new // session and corrupting its connectionRef / triggering a bogus retry. const isStale = () => sessionGenRef.current !== myGen const attemptConnect = (keyterms: string[]): void => { const myAttemptGen = attemptGenRef.current void connectVoiceStream( { onTranscript: (text: string, isFinal: boolean) => { if (isStale()) return sawTranscript = true logForDebugging( `[voice] onTranscript: isFinal=${String(isFinal)} text="${text}"`, ) if (isFinal && text.trim()) { if (focusTriggeredRef.current) { // Focus mode: flush each final transcript immediately and // keep recording. This gives continuous transcription while // the terminal is focused. logForDebugging( `[voice] Focus mode: flushing final transcript immediately: "${text.trim()}"`, ) onTranscriptRef.current(text.trim()) focusFlushedCharsRef.current += text.trim().length setVoiceState(prev => { if (prev.voiceInterimTranscript === '') return prev return { ...prev, voiceInterimTranscript: '' } }) accumulatedRef.current = '' // User is actively speaking — reset the silence timer. armFocusSilenceTimer() } else { // Hold-to-talk: accumulate final transcripts separated by spaces if (accumulatedRef.current) { accumulatedRef.current += ' ' } accumulatedRef.current += text.trim() logForDebugging( `[voice] Accumulated final transcript: "${accumulatedRef.current}"`, ) // Clear interim since final supersedes it setVoiceState(prev => { const preview = accumulatedRef.current if (prev.voiceInterimTranscript === preview) return prev return { ...prev, voiceInterimTranscript: preview } }) } } else if (!isFinal) { // Active interim speech resets the focus silence timer. // Nova 3 disables auto-finalize so isFinal is never true // mid-stream — without this, the 5s timer fires during // active speech and tears down the session. 
if (focusTriggeredRef.current) {
                armFocusSilenceTimer()
              }

              // Show accumulated finals + current interim as live preview
              const interim = text.trim()
              const preview = accumulatedRef.current
                ? accumulatedRef.current + (interim ? ' ' + interim : '')
                : interim
              // Hand back the identical `prev` object when the preview text
              // has not changed, instead of building a new state object.
              setVoiceState(prev => {
                if (prev.voiceInterimTranscript === preview) return prev
                return { ...prev, voiceInterimTranscript: preview }
              })
            }
          },
          // Error callback for this connection attempt. Receives the error
          // message and an optional `fatal` flag. In order, it:
          //   1. ignores errors from a stale session,
          //   2. ignores errors from a superseded attempt (generation mismatch),
          //   3. retries once on a non-fatal error that arrives before any
          //      transcript while still recording,
          //   4. otherwise surfaces the error, cleans up and returns to 'idle'.
          onError: (error: string, opts?: { fatal?: boolean }) => {
            if (isStale()) {
              logForDebugging(
                `[voice] ignoring onError from stale session: ${error}`,
              )
              return
            }
            // Swallow errors from superseded attempts. Covers conn 1's
            // trailing close after retry is scheduled, AND the current
            // conn's ws close event after its ws error already surfaced
            // below (gen bumped at surface).
            if (attemptGenRef.current !== myAttemptGen) {
              logForDebugging(
                `[voice] ignoring stale onError from superseded attempt: ${error}`,
              )
              return
            }
            // Early-failure retry: server error before any transcript =
            // likely a transient upstream race (CE rejection, Deepgram
            // not ready). Clear connectionRef so audio re-buffers, back
            // off, reconnect. Skip if the user has already released the
            // key (state left 'recording') — no point retrying a session
            // they've ended. Fatal errors (Cloudflare bot challenge, auth
            // rejection) are the same failure on every retry attempt, so
            // fall through to surface the message.
            if (
              !opts?.fatal &&
              !sawTranscript &&
              stateRef.current === 'recording'
            ) {
              // retryUsedRef is set before scheduling, so a second early
              // error skips this branch and falls through to surfacing.
              if (!retryUsedRef.current) {
                retryUsedRef.current = true
                logForDebugging(
                  `[voice] early voice_stream error (pre-transcript), retrying once: ${error}`,
                )
                logEvent('tengu_voice_stream_early_retry', {})
                connectionRef.current = null
                // Bump the generation so later callbacks from this attempt
                // hit the superseded-attempt guard above.
                attemptGenRef.current++
                // 250 ms backoff before reconnecting. The trailing arguments
                // are forwarded by setTimeout to the callback as parameters.
                // The reconnect is skipped if recording has ended meanwhile.
                setTimeout(
                  (stateRef, attemptConnect, keyterms) => {
                    if (stateRef.current === 'recording') {
                      attemptConnect(keyterms)
                    }
                  },
                  250,
                  stateRef,
                  attemptConnect,
                  keyterms,
                )
                return
              }
            }
            // Surfacing — bump gen so this conn's trailing close-error
            // (ws fires error then close 1006) is swallowed above.
            attemptGenRef.current++
            logError(new Error(`[voice] voice_stream error: ${error}`))
            onErrorRef.current?.(`Voice stream error: ${error}`)
            // Clear the audio buffer on error to avoid memory leaks
            audioBuffer.length = 0
            focusTriggeredRef.current = false
            cleanup()
            updateState('idle')
          },
          onClose: () => {
            // no-op; lifecycle handled by cleanup()
          },
          // Ready callback: receives the open connection, publishes it via
          // connectionRef, flushes audio buffered while connecting, and
          // re-arms the release timer if auto-repeat has been seen.
          onReady: conn => {
            // Only proceed if we're still in recording state AND this is
            // still the current session. A zombie late-connecting WS from
            // an abandoned session can pass the 'recording' check if the
            // user has since started a new session.
            if (isStale() || stateRef.current !== 'recording') {
              conn.close()
              return
            }

            // The WebSocket is now truly open — assign connectionRef so
            // subsequent audio callbacks send directly instead of buffering.
            connectionRef.current = conn
            everConnectedRef.current = true

            // Flush all audio chunks that were buffered while the WebSocket
            // was connecting. This is safe because onReady fires from the
            // WebSocket 'open' event, guaranteeing send() will not be dropped.
            //
            // Coalesce into ~1s slices rather than one ws.send per chunk
            // — fewer WS frames means less overhead on both ends.
            const SLICE_TARGET_BYTES = 32_000 // ~1s at 16kHz/16-bit/mono
            if (audioBuffer.length > 0) {
              // totalBytes is only used in the debug log below.
              let totalBytes = 0
              for (const c of audioBuffer) totalBytes += c.length
              // Greedy grouping: start with one empty slice and open a new
              // one only when the current slice is non-empty and adding the
              // next chunk would exceed the target. A single chunk larger
              // than the target therefore still gets a slice of its own.
              const slices: Buffer[][] = [[]]
              let sliceBytes = 0
              for (const chunk of audioBuffer) {
                if (
                  sliceBytes > 0 &&
                  sliceBytes + chunk.length > SLICE_TARGET_BYTES
                ) {
                  slices.push([])
                  sliceBytes = 0
                }
                slices[slices.length - 1]!.push(chunk)
                sliceBytes += chunk.length
              }
              logForDebugging(
                `[voice] onReady: flushing ${String(audioBuffer.length)} buffered chunks (${String(totalBytes)} bytes) as ${String(slices.length)} coalesced frame(s)`,
              )
              // One send per slice, chunks concatenated in original order.
              for (const slice of slices) {
                conn.send(Buffer.concat(slice))
              }
            }
            // Empty the buffer in place (the same array object is kept).
            audioBuffer.length = 0

            // Reset the release timer now that the WebSocket is ready.
            // Only arm it if auto-repeat has been seen — otherwise the OS
            // key repeat delay (~500ms) hasn't elapsed yet and the timer
            // would fire prematurely.
            if (releaseTimerRef.current) {
              clearTimeout(releaseTimerRef.current)
            }
            if (seenRepeatRef.current) {
              // When the timer fires it clears its own ref and finishes the
              // recording only if the state is still 'recording'.
              releaseTimerRef.current = setTimeout(
                (releaseTimerRef, stateRef, finishRecording) => {
                  releaseTimerRef.current = null
                  if (stateRef.current === 'recording') {
                    finishRecording()
                  }
                },
                RELEASE_TIMEOUT_MS,
                releaseTimerRef,
                stateRef,
                finishRecording,
              )
            }
          },
        },
        {
          language: stt.code,
          keyterms,
        },
      ).then(conn => {
        // The session was superseded while connecting: close the connection
        // if one was returned and do nothing else.
        if (isStale()) {
          conn?.close()
          return
        }
        // A falsy result is treated as a failed connection: tell the user to
        // log in, drop the buffered audio and return to 'idle'.
        if (!conn) {
          logForDebugging(
            '[voice] Failed to connect to voice_stream (no OAuth token?)',
          )
          onErrorRef.current?.(
            'Voice mode requires a Claude.ai account. Please run /login to sign in.',
          )
          // Clear the audio buffer on failure
          audioBuffer.length = 0
          cleanup()
          updateState('idle')
          return
        }

        // Safety check: if the user released the key before connectVoiceStream
        // resolved (but after onReady already ran), close the connection.
        if (stateRef.current !== 'recording') {
          audioBuffer.length = 0
          conn.close()
          return
        }
      })
    }

    // Fetch the keyterms first, then start the first connection attempt
    // with them.
    void getVoiceKeyterms().then(attemptConnect)
  }

  // ── Hold-to-talk handler ────────────────────────────────────────────
  // Called on every keypress (including terminal auto-repeats while
  // the key is held). A gap longer than RELEASE_TIMEOUT_MS between
  // events is interpreted as key release.
  //
  // Recording starts immediately on the first keypress to eliminate
  // startup delay. The release timer is only armed after auto-repeat
  // is detected (to avoid false releases during the OS key repeat
  // delay of ~500ms on macOS).
  //
  // Parameter `fallbackMs` (default REPEAT_FALLBACK_MS) is how long to wait
  // for a first auto-repeat before arming the release timer anyway.
  // Returns nothing; the effects are on refs, timers and the recording
  // session.
  const handleKeyEvent = useCallback(
    (fallbackMs = REPEAT_FALLBACK_MS): void => {
      // Do nothing when the hook is disabled or voice streaming is
      // unavailable.
      if (!enabled || !isVoiceStreamAvailable()) {
        return
      }

      // In focus mode, recording is driven by terminal focus, not keypresses.
      if (focusTriggeredRef.current) {
        // Active focus recording — ignore key events (session ends on blur).
        return
      }
      if (focusMode && silenceTimedOutRef.current) {
        // Focus session timed out due to silence — keypress re-arms it.
        logForDebugging(
          '[voice] Re-arming focus recording after silence timeout',
        )
        silenceTimedOutRef.current = false
        focusTriggeredRef.current = true
        void startRecordingSession()
        armFocusSilenceTimer()
        return
      }

      // Snapshot the state once: the branches below use this value, while
      // the final arming check re-reads stateRef.current.
      const currentState = stateRef.current

      // Ignore keypresses while processing
      if (currentState === 'processing') {
        return
      }

      if (currentState === 'idle') {
        logForDebugging(
          '[voice] handleKeyEvent: idle, starting recording session immediately',
        )
        void startRecordingSession()
        // Fallback: if no auto-repeat arrives within REPEAT_FALLBACK_MS,
        // arm the release timer anyway (the user likely tapped and released).
        // The refs and finishRecording are handed to the callback through
        // setTimeout's trailing arguments.
        repeatFallbackTimerRef.current = setTimeout(
          (
            repeatFallbackTimerRef,
            stateRef,
            seenRepeatRef,
            releaseTimerRef,
            finishRecording,
          ) => {
            repeatFallbackTimerRef.current = null
            if (stateRef.current === 'recording' && !seenRepeatRef.current) {
              logForDebugging(
                '[voice] No auto-repeat seen, arming release timer via fallback',
              )
              // Mark repeat as seen so later keypresses also arm the
              // release timer through the gate at the end of this handler.
              seenRepeatRef.current = true
              releaseTimerRef.current = setTimeout(
                (releaseTimerRef, stateRef, finishRecording) => {
                  releaseTimerRef.current = null
                  if (stateRef.current === 'recording') {
                    finishRecording()
                  }
                },
                RELEASE_TIMEOUT_MS,
                releaseTimerRef,
                stateRef,
                finishRecording,
              )
            }
          },
          fallbackMs,
          repeatFallbackTimerRef,
          stateRef,
          seenRepeatRef,
          releaseTimerRef,
          finishRecording,
        )
      } else if (currentState === 'recording') {
        // Second+ keypress while recording — auto-repeat has started.
        seenRepeatRef.current = true
        // The fallback is no longer needed once a repeat has been observed.
        if (repeatFallbackTimerRef.current) {
          clearTimeout(repeatFallbackTimerRef.current)
          repeatFallbackTimerRef.current = null
        }
      }

      // Reset the release timer on every keypress (including auto-repeats)
      if (releaseTimerRef.current) {
        clearTimeout(releaseTimerRef.current)
      }

      // Only arm the release timer once auto-repeat has been seen.
      // The OS key repeat delay is ~500ms on macOS; without this gate
      // the 200ms timer fires before repeat starts, causing a false release.
      if (stateRef.current === 'recording' && seenRepeatRef.current) {
        releaseTimerRef.current = setTimeout(
          (releaseTimerRef, stateRef, finishRecording) => {
            releaseTimerRef.current = null
            if (stateRef.current === 'recording') {
              finishRecording()
            }
          },
          RELEASE_TIMEOUT_MS,
          releaseTimerRef,
          stateRef,
          finishRecording,
        )
      }
    },
    // NOTE(review): startRecordingSession, finishRecording and
    // armFocusSilenceTimer are called above but are not listed here —
    // confirm they are stable across renders.
    [enabled, focusMode, cleanup],
  )

  // Cleanup only when disabled or unmounted - NOT on state changes
  useEffect(() => {
    // If the hook is disabled while a session is not idle, tear it down
    // and reset to 'idle' right away.
    if (!enabled && stateRef.current !== 'idle') {
      cleanup()
      updateState('idle')
    }
    // Effect teardown: React runs this on unmount and before the effect
    // re-runs because `enabled` or `cleanup` changed.
    return () => {
      cleanup()
    }
  }, [enabled, cleanup])

  // Public surface of the hook: the current state value and the key handler.
  return {
    state,
    handleKeyEvent,
  }
}