// source dump of claude code
// at main — 1144 lines, 46 kB (view raw)
1// React hook for hold-to-talk voice input using Anthropic voice_stream STT. 2// 3// Hold the keybinding to record; release to stop and submit. Auto-repeat 4// key events reset an internal timer — when no keypress arrives within 5// RELEASE_TIMEOUT_MS the recording stops automatically. Uses the native 6// audio module (macOS) or SoX for recording, and Anthropic's voice_stream 7// endpoint (conversation_engine) for STT. 8 9import { useCallback, useEffect, useRef, useState } from 'react' 10import { useSetVoiceState } from '../context/voice.js' 11import { useTerminalFocus } from '../ink/hooks/use-terminal-focus.js' 12import { 13 type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 14 logEvent, 15} from '../services/analytics/index.js' 16import { getVoiceKeyterms } from '../services/voiceKeyterms.js' 17import { 18 connectVoiceStream, 19 type FinalizeSource, 20 isVoiceStreamAvailable, 21 type VoiceStreamConnection, 22} from '../services/voiceStreamSTT.js' 23import { logForDebugging } from '../utils/debug.js' 24import { toError } from '../utils/errors.js' 25import { getSystemLocaleLanguage } from '../utils/intl.js' 26import { logError } from '../utils/log.js' 27import { getInitialSettings } from '../utils/settings/settings.js' 28import { sleep } from '../utils/sleep.js' 29 30// ─── Language normalization ───────────────────────────────────────────── 31 32const DEFAULT_STT_LANGUAGE = 'en' 33 34// Maps language names (English and native) to BCP-47 codes supported by 35// the voice_stream Deepgram backend. Keys must be lowercase. 36// 37// This list must be a SUBSET of the server-side supported_language_codes 38// allowlist (GrowthBook: speech_to_text_voice_stream_config). 39// If the CLI sends a code the server rejects, the WebSocket closes with 40// 1008 "Unsupported language" and voice breaks. Unsupported languages 41// fall back to DEFAULT_STT_LANGUAGE so recording still works. 
42const LANGUAGE_NAME_TO_CODE: Record<string, string> = { 43 english: 'en', 44 spanish: 'es', 45 español: 'es', 46 espanol: 'es', 47 french: 'fr', 48 français: 'fr', 49 francais: 'fr', 50 japanese: 'ja', 51 : 'ja', 52 german: 'de', 53 deutsch: 'de', 54 portuguese: 'pt', 55 português: 'pt', 56 portugues: 'pt', 57 italian: 'it', 58 italiano: 'it', 59 korean: 'ko', 60 : 'ko', 61 hindi: 'hi', 62 ि: 'hi', 63 ि: 'hi', 64 indonesian: 'id', 65 'bahasa indonesia': 'id', 66 bahasa: 'id', 67 russian: 'ru', 68 русский: 'ru', 69 polish: 'pl', 70 polski: 'pl', 71 turkish: 'tr', 72 türkçe: 'tr', 73 turkce: 'tr', 74 dutch: 'nl', 75 nederlands: 'nl', 76 ukrainian: 'uk', 77 українська: 'uk', 78 greek: 'el', 79 ελληνικά: 'el', 80 czech: 'cs', 81 čeština: 'cs', 82 cestina: 'cs', 83 danish: 'da', 84 dansk: 'da', 85 swedish: 'sv', 86 svenska: 'sv', 87 norwegian: 'no', 88 norsk: 'no', 89} 90 91// Subset of the GrowthBook speech_to_text_voice_stream_config allowlist. 92// Sending a code not in the server allowlist closes the connection. 93const SUPPORTED_LANGUAGE_CODES = new Set([ 94 'en', 95 'es', 96 'fr', 97 'ja', 98 'de', 99 'pt', 100 'it', 101 'ko', 102 'hi', 103 'id', 104 'ru', 105 'pl', 106 'tr', 107 'nl', 108 'uk', 109 'el', 110 'cs', 111 'da', 112 'sv', 113 'no', 114]) 115 116// Normalize a language preference string (from settings.language) to a 117// BCP-47 code supported by the voice_stream endpoint. Returns the 118// default language if the input cannot be resolved. When the input is 119// non-empty but unsupported, fellBackFrom is set to the original input so 120// callers can surface a warning. 
121export function normalizeLanguageForSTT(language: string | undefined): { 122 code: string 123 fellBackFrom?: string 124} { 125 if (!language) return { code: DEFAULT_STT_LANGUAGE } 126 const lower = language.toLowerCase().trim() 127 if (!lower) return { code: DEFAULT_STT_LANGUAGE } 128 if (SUPPORTED_LANGUAGE_CODES.has(lower)) return { code: lower } 129 const fromName = LANGUAGE_NAME_TO_CODE[lower] 130 if (fromName) return { code: fromName } 131 const base = lower.split('-')[0] 132 if (base && SUPPORTED_LANGUAGE_CODES.has(base)) return { code: base } 133 return { code: DEFAULT_STT_LANGUAGE, fellBackFrom: language } 134} 135 136// Lazy-loaded voice module. We defer importing voice.ts (and its native 137// audio-capture-napi dependency) until voice input is actually activated. 138// On macOS, loading the native audio module can trigger a TCC microphone 139// permission prompt — we must avoid that until voice input is actually enabled. 140type VoiceModule = typeof import('../services/voice.js') 141let voiceModule: VoiceModule | null = null 142 143type VoiceState = 'idle' | 'recording' | 'processing' 144 145type UseVoiceOptions = { 146 onTranscript: (text: string) => void 147 onError?: (message: string) => void 148 enabled: boolean 149 focusMode: boolean 150} 151 152type UseVoiceReturn = { 153 state: VoiceState 154 handleKeyEvent: (fallbackMs?: number) => void 155} 156 157// Gap (ms) between auto-repeat key events that signals key release. 158// Terminal auto-repeat typically fires every 30-80ms; 200ms comfortably 159// covers jitter while still feeling responsive. 160const RELEASE_TIMEOUT_MS = 200 161 162// Fallback (ms) to arm the release timer if no auto-repeat is seen. 163// macOS default key repeat delay is ~500ms; 600ms gives headroom. 164// If the user tapped and released before auto-repeat started, this 165// ensures the release timer gets armed and recording stops. 
//
// For modifier-combo first-press activation (handleKeyEvent called at
// t=0, before any auto-repeat), callers should pass FIRST_PRESS_FALLBACK_MS
// instead — the gap to the next keypress is the OS initial repeat *delay*
// (up to ~2s on macOS with slider at "Long"), not the repeat *rate*.
const REPEAT_FALLBACK_MS = 600
export const FIRST_PRESS_FALLBACK_MS = 2000

// How long (ms) to keep a focus-mode session alive without any speech
// before tearing it down to free the WebSocket connection. Re-arms on
// the next focus cycle (blur → refocus).
const FOCUS_SILENCE_TIMEOUT_MS = 5_000

// Number of bars shown in the recording waveform visualizer.
const AUDIO_LEVEL_BARS = 16

/**
 * Compute RMS amplitude from a 16-bit signed little-endian PCM buffer and
 * return a normalized 0-1 value. A sqrt curve spreads quieter levels
 * across more of the visual range so the waveform uses the full set of
 * block heights.
 *
 * @param chunk - Raw PCM bytes; a trailing odd byte is ignored.
 * @returns Level in [0, 1]; 0 for an empty (or single-byte) buffer.
 */
export function computeLevel(chunk: Buffer): number {
  const samples = chunk.length >> 1 // 16-bit = 2 bytes per sample
  if (samples === 0) return 0
  let sumSq = 0
  for (let i = 0; i < chunk.length - 1; i += 2) {
    // Assemble the 16-bit little-endian sample, then sign-extend via the
    // << 16 >> 16 trick.
    const sample = ((chunk[i]! | (chunk[i + 1]! << 8)) << 16) >> 16
    sumSq += sample * sample
  }
  const rms = Math.sqrt(sumSq / samples)
  // 2000 is an empirical full-scale reference for typical speech levels —
  // anything louder clamps to 1.
  const normalized = Math.min(rms / 2000, 1)
  return Math.sqrt(normalized)
}

export function useVoice({
  onTranscript,
  onError,
  enabled,
  focusMode,
}: UseVoiceOptions): UseVoiceReturn {
  const [state, setState] = useState<VoiceState>('idle')
  const stateRef = useRef<VoiceState>('idle')
  const connectionRef = useRef<VoiceStreamConnection | null>(null)
  const accumulatedRef = useRef('')
  const onTranscriptRef = useRef(onTranscript)
  const onErrorRef = useRef(onError)
  const cleanupTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
  const releaseTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
  // True once we've seen a second keypress (auto-repeat) while recording.
  // The OS key repeat delay (~500ms on macOS) means the first keypress is
  // solo — arming the release timer before auto-repeat starts would cause
  // a false release.
  const seenRepeatRef = useRef(false)
  const repeatFallbackTimerRef = useRef<ReturnType<typeof setTimeout> | null>(
    null,
  )
  // True when the current recording session was started by terminal focus
  // (not by a keypress). Focus-driven sessions end on blur, not key release.
  const focusTriggeredRef = useRef(false)
  // Timer that tears down the session after prolonged silence in focus mode.
  const focusSilenceTimerRef = useRef<ReturnType<typeof setTimeout> | null>(
    null,
  )
  // Set when a focus-mode session is torn down due to silence. Prevents
  // the focus effect from immediately restarting. Cleared on blur so the
  // next focus cycle re-arms recording.
  const silenceTimedOutRef = useRef(false)
  const recordingStartRef = useRef(0)
  // Incremented on each startRecordingSession().
Callbacks capture their 234 // generation and bail if a newer session has started — prevents a zombie 235 // slow-connecting WS from an abandoned session from overwriting 236 // connectionRef mid-way through the next session. 237 const sessionGenRef = useRef(0) 238 // True if the early-error retry fired during this session. 239 // Tracked for the tengu_voice_recording_completed analytics event. 240 const retryUsedRef = useRef(false) 241 // Full audio captured this session, kept for silent-drop replay. ~1% of 242 // sessions get a sticky-broken CE pod that accepts audio but returns zero 243 // transcripts (anthropics/anthropic#287008 session-sticky variant); when 244 // finalize() resolves via no_data_timeout with hadAudioSignal=true, we 245 // replay the buffer on a fresh WS once. Bounded: 32KB/s × ~60s max ≈ 2MB. 246 const fullAudioRef = useRef<Buffer[]>([]) 247 const silentDropRetriedRef = useRef(false) 248 // Bumped when the early-error retry is scheduled. Captured per 249 // attemptConnect — onError swallows stale-gen events (conn 1's 250 // trailing close-error) but surfaces current-gen ones (conn 2's 251 // genuine failure). Same shape as sessionGenRef, one level down. 252 const attemptGenRef = useRef(0) 253 // Running total of chars flushed in focus mode (each final transcript is 254 // injected immediately and accumulatedRef reset). Added to transcriptChars 255 // in the completed event so focus-mode sessions don't false-positive as 256 // silent-drops (transcriptChars=0 despite successful transcription). 257 const focusFlushedCharsRef = useRef(0) 258 // True if at least one audio chunk with non-trivial signal was received. 259 // Used to distinguish "microphone is silent/inaccessible" from "speech not detected". 260 const hasAudioSignalRef = useRef(false) 261 // True once onReady fired for the current session. 
Unlike connectionRef 262 // (which cleanup() nulls), this survives effect-order races where Effect 3 263 // cleanup runs before Effect 2's finishRecording() — e.g. /voice toggled 264 // off mid-recording in focus mode. Used for the wsConnected analytics 265 // dimension and error-message branching. Reset in startRecordingSession. 266 const everConnectedRef = useRef(false) 267 const audioLevelsRef = useRef<number[]>([]) 268 const isFocused = useTerminalFocus() 269 const setVoiceState = useSetVoiceState() 270 271 // Keep callback refs current without triggering re-renders 272 onTranscriptRef.current = onTranscript 273 onErrorRef.current = onError 274 275 function updateState(newState: VoiceState): void { 276 stateRef.current = newState 277 setState(newState) 278 setVoiceState(prev => { 279 if (prev.voiceState === newState) return prev 280 return { ...prev, voiceState: newState } 281 }) 282 } 283 284 const cleanup = useCallback((): void => { 285 // Stale any in-flight session (main connection isStale(), replay 286 // isStale(), finishRecording continuation). Without this, disabling 287 // voice during the replay window lets the stale replay open a WS, 288 // accumulate transcript, and inject it after voice was torn down. 
289 sessionGenRef.current++ 290 if (cleanupTimerRef.current) { 291 clearTimeout(cleanupTimerRef.current) 292 cleanupTimerRef.current = null 293 } 294 if (releaseTimerRef.current) { 295 clearTimeout(releaseTimerRef.current) 296 releaseTimerRef.current = null 297 } 298 if (repeatFallbackTimerRef.current) { 299 clearTimeout(repeatFallbackTimerRef.current) 300 repeatFallbackTimerRef.current = null 301 } 302 if (focusSilenceTimerRef.current) { 303 clearTimeout(focusSilenceTimerRef.current) 304 focusSilenceTimerRef.current = null 305 } 306 silenceTimedOutRef.current = false 307 voiceModule?.stopRecording() 308 if (connectionRef.current) { 309 connectionRef.current.close() 310 connectionRef.current = null 311 } 312 accumulatedRef.current = '' 313 audioLevelsRef.current = [] 314 fullAudioRef.current = [] 315 setVoiceState(prev => { 316 if (prev.voiceInterimTranscript === '' && !prev.voiceAudioLevels.length) 317 return prev 318 return { ...prev, voiceInterimTranscript: '', voiceAudioLevels: [] } 319 }) 320 }, [setVoiceState]) 321 322 function finishRecording(): void { 323 logForDebugging( 324 '[voice] finishRecording: stopping recording, transitioning to processing', 325 ) 326 // Session ending — stale any in-flight attempt so its late onError 327 // (conn 2 responding after user released key) doesn't double-fire on 328 // top of the "check network" message below. 329 attemptGenRef.current++ 330 // Capture focusTriggered BEFORE clearing it — needed as an event dimension 331 // so BigQuery can filter out passive focus-mode auto-recordings (user focused 332 // terminal without speaking → ambient noise sets hadAudioSignal=true → false 333 // silent-drop signature). focusFlushedCharsRef fixes transcriptChars accuracy 334 // for sessions WITH speech; focusTriggered enables filtering sessions WITHOUT. 
335 const focusTriggered = focusTriggeredRef.current 336 focusTriggeredRef.current = false 337 updateState('processing') 338 voiceModule?.stopRecording() 339 // Capture duration BEFORE the finalize round-trip so that the WebSocket 340 // wait time is not included (otherwise a quick tap looks like > 2s). 341 // All ref-backed values are captured here, BEFORE the async boundary — 342 // a keypress during the finalize wait can start a new session and reset 343 // these refs (e.g. focusFlushedCharsRef = 0 in startRecordingSession), 344 // reproducing the silent-drop false-positive this ref exists to prevent. 345 const recordingDurationMs = Date.now() - recordingStartRef.current 346 const hadAudioSignal = hasAudioSignalRef.current 347 const retried = retryUsedRef.current 348 const focusFlushedChars = focusFlushedCharsRef.current 349 // wsConnected distinguishes "backend received audio but dropped it" (the 350 // bug backend PR #287008 fixes) from "WS handshake never completed" — 351 // in the latter case audio is still in audioBuffer, never reached the 352 // server, but hasAudioSignalRef is already true from ambient noise. 353 const wsConnected = everConnectedRef.current 354 // Capture generation BEFORE the .then() — if a new session starts during 355 // the finalize wait, sessionGenRef has already advanced by the time the 356 // continuation runs, so capturing inside the .then() would yield the new 357 // session's gen and every staleness check would be a no-op. 358 const myGen = sessionGenRef.current 359 const isStale = () => sessionGenRef.current !== myGen 360 logForDebugging('[voice] Recording stopped') 361 362 // Send finalize and wait for the WebSocket to close before reading the 363 // accumulated transcript. The close handler promotes any unreported 364 // interim text to final, so we must wait for it to fire. 365 const finalizePromise: Promise<FinalizeSource | undefined> = 366 connectionRef.current 367 ? 
connectionRef.current.finalize() 368 : Promise.resolve(undefined) 369 370 void finalizePromise 371 .then(async finalizeSource => { 372 if (isStale()) return 373 // Silent-drop replay: when the server accepted audio (wsConnected), 374 // the mic captured real signal (hadAudioSignal), but finalize timed 375 // out with zero transcript — the ~1% session-sticky CE-pod bug. 376 // Replay the buffered audio on a fresh connection once. A 250ms 377 // backoff clears the same-pod rapid-reconnect race (same gap as the 378 // early-error retry path below). 379 if ( 380 finalizeSource === 'no_data_timeout' && 381 hadAudioSignal && 382 wsConnected && 383 !focusTriggered && 384 focusFlushedChars === 0 && 385 accumulatedRef.current.trim() === '' && 386 !silentDropRetriedRef.current && 387 fullAudioRef.current.length > 0 388 ) { 389 silentDropRetriedRef.current = true 390 logForDebugging( 391 `[voice] Silent-drop detected (no_data_timeout, ${String(fullAudioRef.current.length)} chunks); replaying on fresh connection`, 392 ) 393 logEvent('tengu_voice_silent_drop_replay', { 394 recordingDurationMs, 395 chunkCount: fullAudioRef.current.length, 396 }) 397 if (connectionRef.current) { 398 connectionRef.current.close() 399 connectionRef.current = null 400 } 401 const replayBuffer = fullAudioRef.current 402 await sleep(250) 403 if (isStale()) return 404 const stt = normalizeLanguageForSTT(getInitialSettings().language) 405 const keyterms = await getVoiceKeyterms() 406 if (isStale()) return 407 await new Promise<void>(resolve => { 408 void connectVoiceStream( 409 { 410 onTranscript: (t, isFinal) => { 411 if (isStale()) return 412 if (isFinal && t.trim()) { 413 if (accumulatedRef.current) accumulatedRef.current += ' ' 414 accumulatedRef.current += t.trim() 415 } 416 }, 417 onError: () => resolve(), 418 onClose: () => {}, 419 onReady: conn => { 420 if (isStale()) { 421 conn.close() 422 resolve() 423 return 424 } 425 connectionRef.current = conn 426 const SLICE = 32_000 427 let slice: 
Buffer[] = [] 428 let bytes = 0 429 for (const c of replayBuffer) { 430 if (bytes > 0 && bytes + c.length > SLICE) { 431 conn.send(Buffer.concat(slice)) 432 slice = [] 433 bytes = 0 434 } 435 slice.push(c) 436 bytes += c.length 437 } 438 if (slice.length) conn.send(Buffer.concat(slice)) 439 void conn.finalize().then(() => { 440 conn.close() 441 resolve() 442 }) 443 }, 444 }, 445 { language: stt.code, keyterms }, 446 ).then( 447 c => { 448 if (!c) resolve() 449 }, 450 () => resolve(), 451 ) 452 }) 453 if (isStale()) return 454 } 455 fullAudioRef.current = [] 456 457 const text = accumulatedRef.current.trim() 458 logForDebugging( 459 `[voice] Final transcript assembled (${String(text.length)} chars): "${text.slice(0, 200)}"`, 460 ) 461 462 // Tracks silent-drop rate: transcriptChars=0 + hadAudioSignal=true 463 // + recordingDurationMs>2000 = the bug backend PR #287008 fixes. 464 // focusFlushedCharsRef makes transcriptChars accurate for focus mode 465 // (where each final is injected immediately and accumulatedRef reset). 466 // 467 // NOTE: this fires only on the finishRecording() path. The onError 468 // fallthrough and !conn (no-OAuth) paths bypass this → don't compute 469 // COUNT(completed)/COUNT(started) as a success rate; the silent-drop 470 // denominator (completed events only) is internally consistent. 
471 logEvent('tengu_voice_recording_completed', { 472 transcriptChars: text.length + focusFlushedChars, 473 recordingDurationMs, 474 hadAudioSignal, 475 retried, 476 silentDropRetried: silentDropRetriedRef.current, 477 wsConnected, 478 focusTriggered, 479 }) 480 481 if (connectionRef.current) { 482 connectionRef.current.close() 483 connectionRef.current = null 484 } 485 486 if (text) { 487 logForDebugging( 488 `[voice] Injecting transcript (${String(text.length)} chars)`, 489 ) 490 onTranscriptRef.current(text) 491 } else if (focusFlushedChars === 0 && recordingDurationMs > 2000) { 492 // Only warn about empty transcript if nothing was flushed in focus 493 // mode either, and recording was > 2s (short recordings = accidental 494 // taps → silently return to idle). 495 if (!wsConnected) { 496 // WS never connected → audio never reached backend. Not a silent 497 // drop; a connection failure (slow OAuth refresh, network, etc). 498 onErrorRef.current?.( 499 'Voice connection failed. Check your network and try again.', 500 ) 501 } else if (!hadAudioSignal) { 502 // Distinguish silent mic (capture issue) from speech not recognized. 503 onErrorRef.current?.( 504 'No audio detected from microphone. Check that the correct input device is selected and that Claude Code has microphone access.', 505 ) 506 } else { 507 onErrorRef.current?.('No speech detected.') 508 } 509 } 510 511 accumulatedRef.current = '' 512 setVoiceState(prev => { 513 if (prev.voiceInterimTranscript === '') return prev 514 return { ...prev, voiceInterimTranscript: '' } 515 }) 516 updateState('idle') 517 }) 518 .catch(err => { 519 logError(toError(err)) 520 if (!isStale()) updateState('idle') 521 }) 522 } 523 524 // When voice is enabled, lazy-import voice.ts so checkRecordingAvailability 525 // et al. are ready when the user presses the voice key. 
Do NOT preload the 526 // native module — require('audio-capture.node') is a synchronous dlopen of 527 // CoreAudio/AudioUnit that blocks the event loop for ~1s (warm) to ~8s 528 // (cold coreaudiod). setImmediate doesn't help: it yields one tick, then the 529 // dlopen still blocks. The first voice keypress pays the dlopen cost instead. 530 useEffect(() => { 531 if (enabled && !voiceModule) { 532 void import('../services/voice.js').then(mod => { 533 voiceModule = mod 534 }) 535 } 536 }, [enabled]) 537 538 // ── Focus silence timer ──────────────────────────────────────────── 539 // Arms (or resets) a timer that tears down the focus-mode session 540 // after FOCUS_SILENCE_TIMEOUT_MS of no speech. Called when a session 541 // starts and after each flushed transcript. 542 function armFocusSilenceTimer(): void { 543 if (focusSilenceTimerRef.current) { 544 clearTimeout(focusSilenceTimerRef.current) 545 } 546 focusSilenceTimerRef.current = setTimeout( 547 ( 548 focusSilenceTimerRef, 549 stateRef, 550 focusTriggeredRef, 551 silenceTimedOutRef, 552 finishRecording, 553 ) => { 554 focusSilenceTimerRef.current = null 555 if (stateRef.current === 'recording' && focusTriggeredRef.current) { 556 logForDebugging( 557 '[voice] Focus silence timeout — tearing down session', 558 ) 559 silenceTimedOutRef.current = true 560 finishRecording() 561 } 562 }, 563 FOCUS_SILENCE_TIMEOUT_MS, 564 focusSilenceTimerRef, 565 stateRef, 566 focusTriggeredRef, 567 silenceTimedOutRef, 568 finishRecording, 569 ) 570 } 571 572 // ── Focus-driven recording ────────────────────────────────────────── 573 // In focus mode, start recording when the terminal gains focus and 574 // stop when it loses focus. This enables a "multi-clauding army" 575 // workflow where voice input follows window focus. 576 useEffect(() => { 577 if (!enabled || !focusMode) { 578 // Focus mode was disabled while a focus-driven recording was active — 579 // stop the recording so it doesn't linger until the silence timer fires. 
580 if (focusTriggeredRef.current && stateRef.current === 'recording') { 581 logForDebugging( 582 '[voice] Focus mode disabled during recording, finishing', 583 ) 584 finishRecording() 585 } 586 return 587 } 588 let cancelled = false 589 if ( 590 isFocused && 591 stateRef.current === 'idle' && 592 !silenceTimedOutRef.current 593 ) { 594 const beginFocusRecording = (): void => { 595 // Re-check conditions — state or enabled/focusMode may have changed 596 // during the await (effect cleanup sets cancelled). 597 if ( 598 cancelled || 599 stateRef.current !== 'idle' || 600 silenceTimedOutRef.current 601 ) 602 return 603 logForDebugging('[voice] Focus gained, starting recording session') 604 focusTriggeredRef.current = true 605 void startRecordingSession() 606 armFocusSilenceTimer() 607 } 608 if (voiceModule) { 609 beginFocusRecording() 610 } else { 611 // Voice module is loading (async import resolves from cache as a 612 // microtask). Wait for it before starting the recording session. 613 void import('../services/voice.js').then(mod => { 614 voiceModule = mod 615 beginFocusRecording() 616 }) 617 } 618 } else if (!isFocused) { 619 // Clear the silence timeout flag on blur so the next focus 620 // cycle re-arms recording. 621 silenceTimedOutRef.current = false 622 if (stateRef.current === 'recording') { 623 logForDebugging('[voice] Focus lost, finishing recording') 624 finishRecording() 625 } 626 } 627 return () => { 628 cancelled = true 629 } 630 }, [enabled, focusMode, isFocused]) 631 632 // ── Start a new recording session (voice_stream connect + audio) ── 633 async function startRecordingSession(): Promise<void> { 634 if (!voiceModule) { 635 onErrorRef.current?.( 636 'Voice module not loaded yet. Try again in a moment.', 637 ) 638 return 639 } 640 641 // Transition to 'recording' synchronously, BEFORE any await. 
Callers 642 // read state synchronously right after `void startRecordingSession()`: 643 // - useVoiceIntegration.tsx space-hold guard reads voiceState from the 644 // store immediately — if it sees 'idle' it clears isSpaceHoldActiveRef 645 // and space auto-repeat leaks into the text input (100% repro) 646 // - handleKeyEvent's `currentState === 'idle'` re-entry check below 647 // If an await runs first, both see stale 'idle'. See PR #20873 review. 648 updateState('recording') 649 recordingStartRef.current = Date.now() 650 accumulatedRef.current = '' 651 seenRepeatRef.current = false 652 hasAudioSignalRef.current = false 653 retryUsedRef.current = false 654 silentDropRetriedRef.current = false 655 fullAudioRef.current = [] 656 focusFlushedCharsRef.current = 0 657 everConnectedRef.current = false 658 const myGen = ++sessionGenRef.current 659 660 // ── Pre-check: can we actually record audio? ────────────── 661 const availability = await voiceModule.checkRecordingAvailability() 662 if (!availability.available) { 663 logForDebugging( 664 `[voice] Recording not available: ${availability.reason ?? 'unknown'}`, 665 ) 666 onErrorRef.current?.( 667 availability.reason ?? 'Audio recording is not available.', 668 ) 669 cleanup() 670 updateState('idle') 671 return 672 } 673 674 logForDebugging( 675 '[voice] Starting recording session, connecting voice stream', 676 ) 677 // Clear any previous error 678 setVoiceState(prev => { 679 if (!prev.voiceError) return prev 680 return { ...prev, voiceError: null } 681 }) 682 683 // Buffer audio chunks while the WebSocket connects. Once the connection 684 // is ready (onReady fires), buffered chunks are flushed and subsequent 685 // chunks are sent directly. 686 const audioBuffer: Buffer[] = [] 687 688 // Start recording IMMEDIATELY — audio is buffered until the WebSocket 689 // opens, eliminating the 1-2s latency from waiting for OAuth + WS connect. 
690 logForDebugging( 691 '[voice] startRecording: buffering audio while WebSocket connects', 692 ) 693 audioLevelsRef.current = [] 694 const started = await voiceModule.startRecording( 695 (chunk: Buffer) => { 696 // Copy for fullAudioRef replay buffer. send() in voiceStreamSTT 697 // copies again defensively — acceptable overhead at audio rates. 698 // Skip buffering in focus mode — replay is gated on !focusTriggered 699 // so the buffer is dead weight (up to ~20MB for a 10min session). 700 const owned = Buffer.from(chunk) 701 if (!focusTriggeredRef.current) { 702 fullAudioRef.current.push(owned) 703 } 704 if (connectionRef.current) { 705 connectionRef.current.send(owned) 706 } else { 707 audioBuffer.push(owned) 708 } 709 // Update audio level histogram for the recording visualizer 710 const level = computeLevel(chunk) 711 if (!hasAudioSignalRef.current && level > 0.01) { 712 hasAudioSignalRef.current = true 713 } 714 const levels = audioLevelsRef.current 715 if (levels.length >= AUDIO_LEVEL_BARS) { 716 levels.shift() 717 } 718 levels.push(level) 719 // Copy the array so React sees a new reference 720 const snapshot = [...levels] 721 audioLevelsRef.current = snapshot 722 setVoiceState(prev => ({ ...prev, voiceAudioLevels: snapshot })) 723 }, 724 () => { 725 // External end (e.g. device error) - treat as stop 726 if (stateRef.current === 'recording') { 727 finishRecording() 728 } 729 }, 730 { silenceDetection: false }, 731 ) 732 733 if (!started) { 734 logError(new Error('[voice] Recording failed — no audio tool found')) 735 onErrorRef.current?.( 736 'Failed to start audio capture. 
Check that your microphone is accessible.', 737 ) 738 cleanup() 739 updateState('idle') 740 setVoiceState(prev => ({ 741 ...prev, 742 voiceError: 'Recording failed — no audio tool found', 743 })) 744 return 745 } 746 747 const rawLanguage = getInitialSettings().language 748 const stt = normalizeLanguageForSTT(rawLanguage) 749 logEvent('tengu_voice_recording_started', { 750 focusTriggered: focusTriggeredRef.current, 751 sttLanguage: 752 stt.code as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 753 sttLanguageIsDefault: !rawLanguage?.trim(), 754 sttLanguageFellBack: stt.fellBackFrom !== undefined, 755 // ISO 639 subtag from Intl (bounded set, never user text). undefined if 756 // Intl failed — omitted from the payload, no retry cost (cached). 757 systemLocaleLanguage: 758 getSystemLocaleLanguage() as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 759 }) 760 761 // Retry once if the connection errors before delivering any transcript. 762 // The conversation-engine proxy can reject rapid reconnects (~1/N_pods 763 // same-pod collision) or CE's Deepgram upstream can fail during its own 764 // teardown window (anthropics/anthropic#287008 surfaces this as 765 // TranscriptError instead of silent-drop). A 250ms backoff clears both. 766 // Audio captured during the retry window routes to audioBuffer (via the 767 // connectionRef.current null check in the recording callback above) and 768 // is flushed by the second onReady. 769 let sawTranscript = false 770 771 // Connect WebSocket in parallel with audio recording. 772 // Gather keyterms first (async but fast — no model calls), then connect. 773 // Bail from callbacks if a newer session has started. Prevents a 774 // slow-connecting zombie WS (e.g. user released, pressed again, first 775 // WS still handshaking) from firing onReady/onError into the new 776 // session and corrupting its connectionRef / triggering a bogus retry. 
    // True once a newer recording session has been started; every async
    // callback below bails out when stale so a superseded session cannot
    // mutate current state.
    const isStale = () => sessionGenRef.current !== myGen

    // Open the voice_stream WebSocket and wire its lifecycle callbacks.
    // `keyterms` are vocabulary hints forwarded to the STT backend.
    // Retries at most once on an early (pre-transcript), non-fatal error.
    const attemptConnect = (keyterms: string[]): void => {
      // Snapshot the attempt generation so trailing events from a
      // superseded connection attempt can be recognized and ignored.
      const myAttemptGen = attemptGenRef.current
      void connectVoiceStream(
        {
          onTranscript: (text: string, isFinal: boolean) => {
            if (isStale()) return
            // NOTE(review): sawTranscript is declared in the enclosing
            // session scope (outside this excerpt); it gates the one-shot
            // early retry in onError below.
            sawTranscript = true
            logForDebugging(
              `[voice] onTranscript: isFinal=${String(isFinal)} text="${text}"`,
            )
            if (isFinal && text.trim()) {
              if (focusTriggeredRef.current) {
                // Focus mode: flush each final transcript immediately and
                // keep recording. This gives continuous transcription while
                // the terminal is focused.
                logForDebugging(
                  `[voice] Focus mode: flushing final transcript immediately: "${text.trim()}"`,
                )
                onTranscriptRef.current(text.trim())
                focusFlushedCharsRef.current += text.trim().length
                // Clear the interim preview; returning `prev` unchanged
                // avoids a redundant re-render when it is already empty.
                setVoiceState(prev => {
                  if (prev.voiceInterimTranscript === '') return prev
                  return { ...prev, voiceInterimTranscript: '' }
                })
                accumulatedRef.current = ''
                // User is actively speaking — reset the silence timer.
                armFocusSilenceTimer()
              } else {
                // Hold-to-talk: accumulate final transcripts separated by spaces
                if (accumulatedRef.current) {
                  accumulatedRef.current += ' '
                }
                accumulatedRef.current += text.trim()
                logForDebugging(
                  `[voice] Accumulated final transcript: "${accumulatedRef.current}"`,
                )
                // The final supersedes any interim text: show the full
                // accumulated transcript as the live preview.
                setVoiceState(prev => {
                  const preview = accumulatedRef.current
                  if (prev.voiceInterimTranscript === preview) return prev
                  return { ...prev, voiceInterimTranscript: preview }
                })
              }
            } else if (!isFinal) {
              // Active interim speech resets the focus silence timer.
              // Nova 3 disables auto-finalize so isFinal is never true
              // mid-stream — without this, the 5s timer fires during
              // active speech and tears down the session.
              if (focusTriggeredRef.current) {
                armFocusSilenceTimer()
              }
              // Show accumulated finals + current interim as live preview
              const interim = text.trim()
              const preview = accumulatedRef.current
                ? accumulatedRef.current + (interim ? ' ' + interim : '')
                : interim
              setVoiceState(prev => {
                if (prev.voiceInterimTranscript === preview) return prev
                return { ...prev, voiceInterimTranscript: preview }
              })
            }
          },
          onError: (error: string, opts?: { fatal?: boolean }) => {
            if (isStale()) {
              logForDebugging(
                `[voice] ignoring onError from stale session: ${error}`,
              )
              return
            }
            // Swallow errors from superseded attempts. Covers conn 1's
            // trailing close after retry is scheduled, AND the current
            // conn's ws close event after its ws error already surfaced
            // below (gen bumped at surface).
            if (attemptGenRef.current !== myAttemptGen) {
              logForDebugging(
                `[voice] ignoring stale onError from superseded attempt: ${error}`,
              )
              return
            }
            // Early-failure retry: server error before any transcript =
            // likely a transient upstream race (CE rejection, Deepgram
            // not ready). Clear connectionRef so audio re-buffers, back
            // off, reconnect. Skip if the user has already released the
            // key (state left 'recording') — no point retrying a session
            // they've ended. Fatal errors (Cloudflare bot challenge, auth
            // rejection) are the same failure on every retry attempt, so
            // fall through to surface the message.
            if (
              !opts?.fatal &&
              !sawTranscript &&
              stateRef.current === 'recording'
            ) {
              if (!retryUsedRef.current) {
                retryUsedRef.current = true
                logForDebugging(
                  `[voice] early voice_stream error (pre-transcript), retrying once: ${error}`,
                )
                logEvent('tengu_voice_stream_early_retry', {})
                connectionRef.current = null
                attemptGenRef.current++
                // Extra setTimeout arguments are forwarded to the callback;
                // the shadowing parameters rebind the same outer values.
                // NOTE(review): behaviorally identical to a plain closure
                // capture — presumably a lint/style convention; confirm.
                setTimeout(
                  (stateRef, attemptConnect, keyterms) => {
                    if (stateRef.current === 'recording') {
                      attemptConnect(keyterms)
                    }
                  },
                  250,
                  stateRef,
                  attemptConnect,
                  keyterms,
                )
                return
              }
            }
            // Surfacing — bump gen so this conn's trailing close-error
            // (ws fires error then close 1006) is swallowed above.
            attemptGenRef.current++
            logError(new Error(`[voice] voice_stream error: ${error}`))
            onErrorRef.current?.(`Voice stream error: ${error}`)
            // Clear the audio buffer on error to avoid memory leaks
            audioBuffer.length = 0
            focusTriggeredRef.current = false
            cleanup()
            updateState('idle')
          },
          onClose: () => {
            // no-op; lifecycle handled by cleanup()
          },
          onReady: conn => {
            // Only proceed if we're still in recording state AND this is
            // still the current session. A zombie late-connecting WS from
            // an abandoned session can pass the 'recording' check if the
            // user has since started a new session.
            if (isStale() || stateRef.current !== 'recording') {
              conn.close()
              return
            }

            // The WebSocket is now truly open — assign connectionRef so
            // subsequent audio callbacks send directly instead of buffering.
            connectionRef.current = conn
            everConnectedRef.current = true

            // Flush all audio chunks that were buffered while the WebSocket
            // was connecting. This is safe because onReady fires from the
            // WebSocket 'open' event, guaranteeing send() will not be dropped.
            //
            // Coalesce into ~1s slices rather than one ws.send per chunk
            // — fewer WS frames means less overhead on both ends.
            const SLICE_TARGET_BYTES = 32_000 // ~1s at 16kHz/16-bit/mono
            if (audioBuffer.length > 0) {
              let totalBytes = 0
              for (const c of audioBuffer) totalBytes += c.length
              // Greedy packing: start a new slice whenever adding the next
              // chunk would push a non-empty slice past the target size.
              // An oversized single chunk still goes into its own slice.
              const slices: Buffer[][] = [[]]
              let sliceBytes = 0
              for (const chunk of audioBuffer) {
                if (
                  sliceBytes > 0 &&
                  sliceBytes + chunk.length > SLICE_TARGET_BYTES
                ) {
                  slices.push([])
                  sliceBytes = 0
                }
                slices[slices.length - 1]!.push(chunk)
                sliceBytes += chunk.length
              }
              logForDebugging(
                `[voice] onReady: flushing ${String(audioBuffer.length)} buffered chunks (${String(totalBytes)} bytes) as ${String(slices.length)} coalesced frame(s)`,
              )
              for (const slice of slices) {
                conn.send(Buffer.concat(slice))
              }
            }
            audioBuffer.length = 0

            // Reset the release timer now that the WebSocket is ready.
            // Only arm it if auto-repeat has been seen — otherwise the OS
            // key repeat delay (~500ms) hasn't elapsed yet and the timer
            // would fire prematurely.
            if (releaseTimerRef.current) {
              clearTimeout(releaseTimerRef.current)
            }
            if (seenRepeatRef.current) {
              releaseTimerRef.current = setTimeout(
                (releaseTimerRef, stateRef, finishRecording) => {
                  releaseTimerRef.current = null
                  if (stateRef.current === 'recording') {
                    finishRecording()
                  }
                },
                RELEASE_TIMEOUT_MS,
                releaseTimerRef,
                stateRef,
                finishRecording,
              )
            }
          },
        },
        {
          language: stt.code,
          keyterms,
        },
      ).then(conn => {
        // connectVoiceStream resolves null/undefined when no connection
        // could be established (e.g. no OAuth token).
        if (isStale()) {
          conn?.close()
          return
        }
        if (!conn) {
          logForDebugging(
            '[voice] Failed to connect to voice_stream (no OAuth token?)',
          )
          onErrorRef.current?.(
            'Voice mode requires a Claude.ai account. Please run /login to sign in.',
          )
          // Clear the audio buffer on failure
          audioBuffer.length = 0
          cleanup()
          updateState('idle')
          return
        }

        // Safety check: if the user released the key before connectVoiceStream
        // resolved (but after onReady already ran), close the connection.
        if (stateRef.current !== 'recording') {
          audioBuffer.length = 0
          conn.close()
          return
        }
      })
    }

    void getVoiceKeyterms().then(attemptConnect)
  }

  // ── Hold-to-talk handler ──────────────────────────────────────────────
  // Called on every keypress (including terminal auto-repeats while
  // the key is held). A gap longer than RELEASE_TIMEOUT_MS between
  // events is interpreted as key release.
  //
  // Recording starts immediately on the first keypress to eliminate
  // startup delay. The release timer is only armed after auto-repeat
  // is detected (to avoid false releases during the OS key repeat
  // delay of ~500ms on macOS).
  //
  // NOTE(review): startRecordingSession, finishRecording and
  // armFocusSilenceTimer are referenced but absent from the dependency
  // list below — presumably stable (ref-based) identities; confirm.
  const handleKeyEvent = useCallback(
    (fallbackMs = REPEAT_FALLBACK_MS): void => {
      if (!enabled || !isVoiceStreamAvailable()) {
        return
      }

      // In focus mode, recording is driven by terminal focus, not keypresses.
      if (focusTriggeredRef.current) {
        // Active focus recording — ignore key events (session ends on blur).
        return
      }
      if (focusMode && silenceTimedOutRef.current) {
        // Focus session timed out due to silence — keypress re-arms it.
        logForDebugging(
          '[voice] Re-arming focus recording after silence timeout',
        )
        silenceTimedOutRef.current = false
        focusTriggeredRef.current = true
        void startRecordingSession()
        armFocusSilenceTimer()
        return
      }

      const currentState = stateRef.current

      // Ignore keypresses while processing
      if (currentState === 'processing') {
        return
      }

      if (currentState === 'idle') {
        logForDebugging(
          '[voice] handleKeyEvent: idle, starting recording session immediately',
        )
        void startRecordingSession()
        // Fallback: if no auto-repeat arrives within REPEAT_FALLBACK_MS,
        // arm the release timer anyway (the user likely tapped and released).
        repeatFallbackTimerRef.current = setTimeout(
          (
            repeatFallbackTimerRef,
            stateRef,
            seenRepeatRef,
            releaseTimerRef,
            finishRecording,
          ) => {
            repeatFallbackTimerRef.current = null
            if (stateRef.current === 'recording' && !seenRepeatRef.current) {
              logForDebugging(
                '[voice] No auto-repeat seen, arming release timer via fallback',
              )
              seenRepeatRef.current = true
              releaseTimerRef.current = setTimeout(
                (releaseTimerRef, stateRef, finishRecording) => {
                  releaseTimerRef.current = null
                  if (stateRef.current === 'recording') {
                    finishRecording()
                  }
                },
                RELEASE_TIMEOUT_MS,
                releaseTimerRef,
                stateRef,
                finishRecording,
              )
            }
          },
          fallbackMs,
          repeatFallbackTimerRef,
          stateRef,
          seenRepeatRef,
          releaseTimerRef,
          finishRecording,
        )
      } else if (currentState === 'recording') {
        // Second+ keypress while recording — auto-repeat has started.
        seenRepeatRef.current = true
        if (repeatFallbackTimerRef.current) {
          clearTimeout(repeatFallbackTimerRef.current)
          repeatFallbackTimerRef.current = null
        }
      }

      // Reset the release timer on every keypress (including auto-repeats)
      if (releaseTimerRef.current) {
        clearTimeout(releaseTimerRef.current)
      }

      // Only arm the release timer once auto-repeat has been seen.
      // The OS key repeat delay is ~500ms on macOS; without this gate
      // the 200ms timer fires before repeat starts, causing a false release.
      if (stateRef.current === 'recording' && seenRepeatRef.current) {
        releaseTimerRef.current = setTimeout(
          (releaseTimerRef, stateRef, finishRecording) => {
            releaseTimerRef.current = null
            if (stateRef.current === 'recording') {
              finishRecording()
            }
          },
          RELEASE_TIMEOUT_MS,
          releaseTimerRef,
          stateRef,
          finishRecording,
        )
      }
    },
    [enabled, focusMode, cleanup],
  )

  // Cleanup only when disabled or unmounted - NOT on state changes
  useEffect(() => {
    if (!enabled && stateRef.current !== 'idle') {
      cleanup()
      updateState('idle')
    }
    return () => {
      cleanup()
    }
  }, [enabled, cleanup])

  return {
    state,
    handleKeyEvent,
  }
}