// source dump of claude code
// at main — 1144 lines, 46 kB (view raw)
1// React hook for hold-to-talk voice input using Anthropic voice_stream STT. 2// 3// Hold the keybinding to record; release to stop and submit. Auto-repeat 4// key events reset an internal timer — when no keypress arrives within 5// RELEASE_TIMEOUT_MS the recording stops automatically. Uses the native 6// audio module (macOS) or SoX for recording, and Anthropic's voice_stream 7// endpoint (conversation_engine) for STT. 8 9import { useCallback, useEffect, useRef, useState } from 'react' 10import { useSetVoiceState } from '../context/voice.js' 11import { useTerminalFocus } from '../ink/hooks/use-terminal-focus.js' 12import { 13 type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 14 logEvent, 15} from '../services/analytics/index.js' 16import { getVoiceKeyterms } from '../services/voiceKeyterms.js' 17import { 18 connectVoiceStream, 19 type FinalizeSource, 20 isVoiceStreamAvailable, 21 type VoiceStreamConnection, 22} from '../services/voiceStreamSTT.js' 23import { logForDebugging } from '../utils/debug.js' 24import { toError } from '../utils/errors.js' 25import { getSystemLocaleLanguage } from '../utils/intl.js' 26import { logError } from '../utils/log.js' 27import { getInitialSettings } from '../utils/settings/settings.js' 28import { sleep } from '../utils/sleep.js' 29 30// ─── Language normalization ───────────────────────────────────────────── 31 32const DEFAULT_STT_LANGUAGE = 'en' 33 34// Maps language names (English and native) to BCP-47 codes supported by 35// the voice_stream Deepgram backend. Keys must be lowercase. 36// 37// This list must be a SUBSET of the server-side supported_language_codes 38// allowlist (GrowthBook: speech_to_text_voice_stream_config). 39// If the CLI sends a code the server rejects, the WebSocket closes with 40// 1008 "Unsupported language" and voice breaks. Unsupported languages 41// fall back to DEFAULT_STT_LANGUAGE so recording still works. 
42const LANGUAGE_NAME_TO_CODE: Record<string, string> = { 43 english: 'en', 44 spanish: 'es', 45 español: 'es', 46 espanol: 'es', 47 french: 'fr', 48 français: 'fr', 49 francais: 'fr', 50 japanese: 'ja', 51 : 'ja', 52 german: 'de', 53 deutsch: 'de', 54 portuguese: 'pt', 55 português: 'pt', 56 portugues: 'pt', 57 italian: 'it', 58 italiano: 'it', 59 korean: 'ko', 60 : 'ko', 61 hindi: 'hi', 62 ि: 'hi', 63 ि: 'hi', 64 indonesian: 'id', 65 'bahasa indonesia': 'id', 66 bahasa: 'id', 67 russian: 'ru', 68 русский: 'ru', 69 polish: 'pl', 70 polski: 'pl', 71 turkish: 'tr', 72 türkçe: 'tr', 73 turkce: 'tr', 74 dutch: 'nl', 75 nederlands: 'nl', 76 ukrainian: 'uk', 77 українська: 'uk', 78 greek: 'el', 79 ελληνικά: 'el', 80 czech: 'cs', 81 čeština: 'cs', 82 cestina: 'cs', 83 danish: 'da', 84 dansk: 'da', 85 swedish: 'sv', 86 svenska: 'sv', 87 norwegian: 'no', 88 norsk: 'no', 89} 90 91// Subset of the GrowthBook speech_to_text_voice_stream_config allowlist. 92// Sending a code not in the server allowlist closes the connection. 93const SUPPORTED_LANGUAGE_CODES = new Set([ 94 'en', 95 'es', 96 'fr', 97 'ja', 98 'de', 99 'pt', 100 'it', 101 'ko', 102 'hi', 103 'id', 104 'ru', 105 'pl', 106 'tr', 107 'nl', 108 'uk', 109 'el', 110 'cs', 111 'da', 112 'sv', 113 'no', 114]) 115 116// Normalize a language preference string (from settings.language) to a 117// BCP-47 code supported by the voice_stream endpoint. Returns the 118// default language if the input cannot be resolved. When the input is 119// non-empty but unsupported, fellBackFrom is set to the original input so 120// callers can surface a warning. 
121export function normalizeLanguageForSTT(language: string | undefined): { 122 code: string 123 fellBackFrom?: string 124} { 125 if (!language) return { code: DEFAULT_STT_LANGUAGE } 126 const lower = language.toLowerCase().trim() 127 if (!lower) return { code: DEFAULT_STT_LANGUAGE } 128 if (SUPPORTED_LANGUAGE_CODES.has(lower)) return { code: lower } 129 const fromName = LANGUAGE_NAME_TO_CODE[lower] 130 if (fromName) return { code: fromName } 131 const base = lower.split('-')[0] 132 if (base && SUPPORTED_LANGUAGE_CODES.has(base)) return { code: base } 133 return { code: DEFAULT_STT_LANGUAGE, fellBackFrom: language } 134} 135 136// Lazy-loaded voice module. We defer importing voice.ts (and its native 137// audio-capture-napi dependency) until voice input is actually activated. 138// On macOS, loading the native audio module can trigger a TCC microphone 139// permission prompt — we must avoid that until voice input is actually enabled. 140type VoiceModule = typeof import('../services/voice.js') 141let voiceModule: VoiceModule | null = null 142 143type VoiceState = 'idle' | 'recording' | 'processing' 144 145type UseVoiceOptions = { 146 onTranscript: (text: string) => void 147 onError?: (message: string) => void 148 enabled: boolean 149 focusMode: boolean 150} 151 152type UseVoiceReturn = { 153 state: VoiceState 154 handleKeyEvent: (fallbackMs?: number) => void 155} 156 157// Gap (ms) between auto-repeat key events that signals key release. 158// Terminal auto-repeat typically fires every 30-80ms; 200ms comfortably 159// covers jitter while still feeling responsive. 160const RELEASE_TIMEOUT_MS = 200 161 162// Fallback (ms) to arm the release timer if no auto-repeat is seen. 163// macOS default key repeat delay is ~500ms; 600ms gives headroom. 164// If the user tapped and released before auto-repeat started, this 165// ensures the release timer gets armed and recording stops. 
//
// For modifier-combo first-press activation (handleKeyEvent called at
// t=0, before any auto-repeat), callers should pass FIRST_PRESS_FALLBACK_MS
// instead — the gap to the next keypress is the OS initial repeat *delay*
// (up to ~2s on macOS with slider at "Long"), not the repeat *rate*.
const REPEAT_FALLBACK_MS = 600
export const FIRST_PRESS_FALLBACK_MS = 2000

// How long (ms) to keep a focus-mode session alive without any speech
// before tearing it down to free the WebSocket connection. Re-arms on
// the next focus cycle (blur → refocus).
const FOCUS_SILENCE_TIMEOUT_MS = 5_000

// Number of bars shown in the recording waveform visualizer.
const AUDIO_LEVEL_BARS = 16

/**
 * Compute RMS amplitude from a 16-bit signed little-endian PCM buffer and
 * return a normalized 0-1 value. A sqrt curve spreads quieter levels
 * across more of the visual range so the waveform uses the full set of
 * block heights.
 *
 * @param chunk - Raw PCM bytes; a trailing odd byte is ignored.
 * @returns Level in [0, 1]; 0 for an empty (or single-byte) buffer.
 */
export function computeLevel(chunk: Buffer): number {
  const samples = chunk.length >> 1 // 16-bit = 2 bytes per sample
  if (samples === 0) return 0
  let sumSq = 0
  for (let i = 0; i < chunk.length - 1; i += 2) {
    // Assemble the 16-bit little-endian sample, then sign-extend via the
    // << 16 >> 16 trick.
    const sample = ((chunk[i]! | (chunk[i + 1]! << 8)) << 16) >> 16
    sumSq += sample * sample
  }
  const rms = Math.sqrt(sumSq / samples)
  // 2000 is an empirical full-scale reference for typical speech levels —
  // anything louder clamps to 1.
  const normalized = Math.min(rms / 2000, 1)
  return Math.sqrt(normalized)
}

export function useVoice({
  onTranscript,
  onError,
  enabled,
  focusMode,
}: UseVoiceOptions): UseVoiceReturn {
  const [state, setState] = useState<VoiceState>('idle')
  const stateRef = useRef<VoiceState>('idle')
  const connectionRef = useRef<VoiceStreamConnection | null>(null)
  const accumulatedRef = useRef('')
  const onTranscriptRef = useRef(onTranscript)
  const onErrorRef = useRef(onError)
  const cleanupTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
  const releaseTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null)
  // True once we've seen a second keypress (auto-repeat) while recording.
  // The OS key repeat delay (~500ms on macOS) means the first keypress is
  // solo — arming the release timer before auto-repeat starts would cause
  // a false release.
  const seenRepeatRef = useRef(false)
  const repeatFallbackTimerRef = useRef<ReturnType<typeof setTimeout> | null>(
    null,
  )
  // True when the current recording session was started by terminal focus
  // (not by a keypress). Focus-driven sessions end on blur, not key release.
  const focusTriggeredRef = useRef(false)
  // Timer that tears down the session after prolonged silence in focus mode.
  const focusSilenceTimerRef = useRef<ReturnType<typeof setTimeout> | null>(
    null,
  )
  // Set when a focus-mode session is torn down due to silence. Prevents
  // the focus effect from immediately restarting. Cleared on blur so the
  // next focus cycle re-arms recording.
  const silenceTimedOutRef = useRef(false)
  const recordingStartRef = useRef(0)
  // Incremented on each startRecordingSession().
Callbacks capture their 234 // generation and bail if a newer session has started — prevents a zombie 235 // slow-connecting WS from an abandoned session from overwriting 236 // connectionRef mid-way through the next session. 237 const sessionGenRef = useRef(0) 238 // True if the early-error retry fired during this session. 239 // Tracked for the tengu_voice_recording_completed analytics event. 240 const retryUsedRef = useRef(false) 241 // Full audio captured this session, kept for silent-drop replay. ~1% of 242 // sessions get a sticky-broken CE pod that accepts audio but returns zero 243 // transcripts (anthropics/anthropic#287008 session-sticky variant); when 244 // finalize() resolves via no_data_timeout with hadAudioSignal=true, we 245 // replay the buffer on a fresh WS once. Bounded: 32KB/s × ~60s max ≈ 2MB. 246 const fullAudioRef = useRef<Buffer[]>([]) 247 const silentDropRetriedRef = useRef(false) 248 // Bumped when the early-error retry is scheduled. Captured per 249 // attemptConnect — onError swallows stale-gen events (conn 1's 250 // trailing close-error) but surfaces current-gen ones (conn 2's 251 // genuine failure). Same shape as sessionGenRef, one level down. 252 const attemptGenRef = useRef(0) 253 // Running total of chars flushed in focus mode (each final transcript is 254 // injected immediately and accumulatedRef reset). Added to transcriptChars 255 // in the completed event so focus-mode sessions don't false-positive as 256 // silent-drops (transcriptChars=0 despite successful transcription). 257 const focusFlushedCharsRef = useRef(0) 258 // True if at least one audio chunk with non-trivial signal was received. 259 // Used to distinguish "microphone is silent/inaccessible" from "speech not detected". 260 const hasAudioSignalRef = useRef(false) 261 // True once onReady fired for the current session. 
Unlike connectionRef 262 // (which cleanup() nulls), this survives effect-order races where Effect 3 263 // cleanup runs before Effect 2's finishRecording() — e.g. /voice toggled 264 // off mid-recording in focus mode. Used for the wsConnected analytics 265 // dimension and error-message branching. Reset in startRecordingSession. 266 const everConnectedRef = useRef(false) 267 const audioLevelsRef = useRef<number[]>([]) 268 const isFocused = useTerminalFocus() 269 const setVoiceState = useSetVoiceState() 270 271 // Keep callback refs current without triggering re-renders 272 onTranscriptRef.current = onTranscript 273 onErrorRef.current = onError 274 275 function updateState(newState: VoiceState): void { 276 stateRef.current = newState 277 setState(newState) 278 setVoiceState(prev => { 279 if (prev.voiceState === newState) return prev 280 return { ...prev, voiceState: newState } 281 }) 282 } 283 284 const cleanup = useCallback((): void => { 285 // Stale any in-flight session (main connection isStale(), replay 286 // isStale(), finishRecording continuation). Without this, disabling 287 // voice during the replay window lets the stale replay open a WS, 288 // accumulate transcript, and inject it after voice was torn down. 
289 sessionGenRef.current++ 290 if (cleanupTimerRef.current) { 291 clearTimeout(cleanupTimerRef.current) 292 cleanupTimerRef.current = null 293 } 294 if (releaseTimerRef.current) { 295 clearTimeout(releaseTimerRef.current) 296 releaseTimerRef.current = null 297 } 298 if (repeatFallbackTimerRef.current) { 299 clearTimeout(repeatFallbackTimerRef.current) 300 repeatFallbackTimerRef.current = null 301 } 302 if (focusSilenceTimerRef.current) { 303 clearTimeout(focusSilenceTimerRef.current) 304 focusSilenceTimerRef.current = null 305 } 306 silenceTimedOutRef.current = false 307 voiceModule?.stopRecording() 308 if (connectionRef.current) { 309 connectionRef.current.close() 310 connectionRef.current = null 311 } 312 accumulatedRef.current = '' 313 audioLevelsRef.current = [] 314 fullAudioRef.current = [] 315 setVoiceState(prev => { 316 if (prev.voiceInterimTranscript === '' && !prev.voiceAudioLevels.length) 317 return prev 318 return { ...prev, voiceInterimTranscript: '', voiceAudioLevels: [] } 319 }) 320 }, [setVoiceState]) 321 322 function finishRecording(): void { 323 logForDebugging( 324 '[voice] finishRecording: stopping recording, transitioning to processing', 325 ) 326 // Session ending — stale any in-flight attempt so its late onError 327 // (conn 2 responding after user released key) doesn't double-fire on 328 // top of the "check network" message below. 329 attemptGenRef.current++ 330 // Capture focusTriggered BEFORE clearing it — needed as an event dimension 331 // so BigQuery can filter out passive focus-mode auto-recordings (user focused 332 // terminal without speaking → ambient noise sets hadAudioSignal=true → false 333 // silent-drop signature). focusFlushedCharsRef fixes transcriptChars accuracy 334 // for sessions WITH speech; focusTriggered enables filtering sessions WITHOUT. 
335 const focusTriggered = focusTriggeredRef.current 336 focusTriggeredRef.current = false 337 updateState('processing') 338 voiceModule?.stopRecording() 339 // Capture duration BEFORE the finalize round-trip so that the WebSocket 340 // wait time is not included (otherwise a quick tap looks like > 2s). 341 // All ref-backed values are captured here, BEFORE the async boundary — 342 // a keypress during the finalize wait can start a new session and reset 343 // these refs (e.g. focusFlushedCharsRef = 0 in startRecordingSession), 344 // reproducing the silent-drop false-positive this ref exists to prevent. 345 const recordingDurationMs = Date.now() - recordingStartRef.current 346 const hadAudioSignal = hasAudioSignalRef.current 347 const retried = retryUsedRef.current 348 const focusFlushedChars = focusFlushedCharsRef.current 349 // wsConnected distinguishes "backend received audio but dropped it" (the 350 // bug backend PR #287008 fixes) from "WS handshake never completed" — 351 // in the latter case audio is still in audioBuffer, never reached the 352 // server, but hasAudioSignalRef is already true from ambient noise. 353 const wsConnected = everConnectedRef.current 354 // Capture generation BEFORE the .then() — if a new session starts during 355 // the finalize wait, sessionGenRef has already advanced by the time the 356 // continuation runs, so capturing inside the .then() would yield the new 357 // session's gen and every staleness check would be a no-op. 358 const myGen = sessionGenRef.current 359 const isStale = () => sessionGenRef.current !== myGen 360 logForDebugging('[voice] Recording stopped') 361 362 // Send finalize and wait for the WebSocket to close before reading the 363 // accumulated transcript. The close handler promotes any unreported 364 // interim text to final, so we must wait for it to fire. 365 const finalizePromise: Promise<FinalizeSource | undefined> = 366 connectionRef.current 367 ? 
connectionRef.current.finalize() 368 : Promise.resolve(undefined) 369 370 void finalizePromise 371 .then(async finalizeSource => { 372 if (isStale()) return 373 // Silent-drop replay: when the server accepted audio (wsConnected), 374 // the mic captured real signal (hadAudioSignal), but finalize timed 375 // out with zero transcript — the ~1% session-sticky CE-pod bug. 376 // Replay the buffered audio on a fresh connection once. A 250ms 377 // backoff clears the same-pod rapid-reconnect race (same gap as the 378 // early-error retry path below). 379 if ( 380 finalizeSource === 'no_data_timeout' && 381 hadAudioSignal && 382 wsConnected && 383 !focusTriggered && 384 focusFlushedChars === 0 && 385 accumulatedRef.current.trim() === '' && 386 !silentDropRetriedRef.current && 387 fullAudioRef.current.length > 0 388 ) { 389 silentDropRetriedRef.current = true 390 logForDebugging( 391 `[voice] Silent-drop detected (no_data_timeout, ${String(fullAudioRef.current.length)} chunks); replaying on fresh connection`, 392 ) 393 logEvent('tengu_voice_silent_drop_replay', { 394 recordingDurationMs, 395 chunkCount: fullAudioRef.current.length, 396 }) 397 if (connectionRef.current) { 398 connectionRef.current.close() 399 connectionRef.current = null 400 } 401 const replayBuffer = fullAudioRef.current 402 await sleep(250) 403 if (isStale()) return 404 const stt = normalizeLanguageForSTT(getInitialSettings().language) 405 const keyterms = await getVoiceKeyterms() 406 if (isStale()) return 407 await new Promise<void>(resolve => { 408 void connectVoiceStream( 409 { 410 onTranscript: (t, isFinal) => { 411 if (isStale()) return 412 if (isFinal && t.trim()) { 413 if (accumulatedRef.current) accumulatedRef.current += ' ' 414 accumulatedRef.current += t.trim() 415 } 416 }, 417 onError: () => resolve(), 418 onClose: () => {}, 419 onReady: conn => { 420 if (isStale()) { 421 conn.close() 422 resolve() 423 return 424 } 425 connectionRef.current = conn 426 const SLICE = 32_000 427 let slice: 
Buffer[] = [] 428 let bytes = 0 429 for (const c of replayBuffer) { 430 if (bytes > 0 && bytes + c.length > SLICE) { 431 conn.send(Buffer.concat(slice)) 432 slice = [] 433 bytes = 0 434 } 435 slice.push(c) 436 bytes += c.length 437 } 438 if (slice.length) conn.send(Buffer.concat(slice)) 439 void conn.finalize().then(() => { 440 conn.close() 441 resolve() 442 }) 443 }, 444 }, 445 { language: stt.code, keyterms }, 446 ).then( 447 c => { 448 if (!c) resolve() 449 }, 450 () => resolve(), 451 ) 452 }) 453 if (isStale()) return 454 } 455 fullAudioRef.current = [] 456 457 const text = accumulatedRef.current.trim() 458 logForDebugging( 459 `[voice] Final transcript assembled (${String(text.length)} chars): "${text.slice(0, 200)}"`, 460 ) 461 462 // Tracks silent-drop rate: transcriptChars=0 + hadAudioSignal=true 463 // + recordingDurationMs>2000 = the bug backend PR #287008 fixes. 464 // focusFlushedCharsRef makes transcriptChars accurate for focus mode 465 // (where each final is injected immediately and accumulatedRef reset). 466 // 467 // NOTE: this fires only on the finishRecording() path. The onError 468 // fallthrough and !conn (no-OAuth) paths bypass this → don't compute 469 // COUNT(completed)/COUNT(started) as a success rate; the silent-drop 470 // denominator (completed events only) is internally consistent. 
471 logEvent('tengu_voice_recording_completed', { 472 transcriptChars: text.length + focusFlushedChars, 473 recordingDurationMs, 474 hadAudioSignal, 475 retried, 476 silentDropRetried: silentDropRetriedRef.current, 477 wsConnected, 478 focusTriggered, 479 }) 480 481 if (connectionRef.current) { 482 connectionRef.current.close() 483 connectionRef.current = null 484 } 485 486 if (text) { 487 logForDebugging( 488 `[voice] Injecting transcript (${String(text.length)} chars)`, 489 ) 490 onTranscriptRef.current(text) 491 } else if (focusFlushedChars === 0 && recordingDurationMs > 2000) { 492 // Only warn about empty transcript if nothing was flushed in focus 493 // mode either, and recording was > 2s (short recordings = accidental 494 // taps → silently return to idle). 495 if (!wsConnected) { 496 // WS never connected → audio never reached backend. Not a silent 497 // drop; a connection failure (slow OAuth refresh, network, etc). 498 onErrorRef.current?.( 499 'Voice connection failed. Check your network and try again.', 500 ) 501 } else if (!hadAudioSignal) { 502 // Distinguish silent mic (capture issue) from speech not recognized. 503 onErrorRef.current?.( 504 'No audio detected from microphone. Check that the correct input device is selected and that Claude Code has microphone access.', 505 ) 506 } else { 507 onErrorRef.current?.('No speech detected.') 508 } 509 } 510 511 accumulatedRef.current = '' 512 setVoiceState(prev => { 513 if (prev.voiceInterimTranscript === '') return prev 514 return { ...prev, voiceInterimTranscript: '' } 515 }) 516 updateState('idle') 517 }) 518 .catch(err => { 519 logError(toError(err)) 520 if (!isStale()) updateState('idle') 521 }) 522 } 523 524 // When voice is enabled, lazy-import voice.ts so checkRecordingAvailability 525 // et al. are ready when the user presses the voice key. 
Do NOT preload the 526 // native module — require('audio-capture.node') is a synchronous dlopen of 527 // CoreAudio/AudioUnit that blocks the event loop for ~1s (warm) to ~8s 528 // (cold coreaudiod). setImmediate doesn't help: it yields one tick, then the 529 // dlopen still blocks. The first voice keypress pays the dlopen cost instead. 530 useEffect(() => { 531 if (enabled && !voiceModule) { 532 void import('../services/voice.js').then(mod => { 533 voiceModule = mod 534 }) 535 } 536 }, [enabled]) 537 538 // ── Focus silence timer ──────────────────────────────────────────── 539 // Arms (or resets) a timer that tears down the focus-mode session 540 // after FOCUS_SILENCE_TIMEOUT_MS of no speech. Called when a session 541 // starts and after each flushed transcript. 542 function armFocusSilenceTimer(): void { 543 if (focusSilenceTimerRef.current) { 544 clearTimeout(focusSilenceTimerRef.current) 545 } 546 focusSilenceTimerRef.current = setTimeout( 547 ( 548 focusSilenceTimerRef, 549 stateRef, 550 focusTriggeredRef, 551 silenceTimedOutRef, 552 finishRecording, 553 ) => { 554 focusSilenceTimerRef.current = null 555 if (stateRef.current === 'recording' && focusTriggeredRef.current) { 556 logForDebugging( 557 '[voice] Focus silence timeout — tearing down session', 558 ) 559 silenceTimedOutRef.current = true 560 finishRecording() 561 } 562 }, 563 FOCUS_SILENCE_TIMEOUT_MS, 564 focusSilenceTimerRef, 565 stateRef, 566 focusTriggeredRef, 567 silenceTimedOutRef, 568 finishRecording, 569 ) 570 } 571 572 // ── Focus-driven recording ────────────────────────────────────────── 573 // In focus mode, start recording when the terminal gains focus and 574 // stop when it loses focus. This enables a "multi-clauding army" 575 // workflow where voice input follows window focus. 576 useEffect(() => { 577 if (!enabled || !focusMode) { 578 // Focus mode was disabled while a focus-driven recording was active — 579 // stop the recording so it doesn't linger until the silence timer fires. 
580 if (focusTriggeredRef.current && stateRef.current === 'recording') { 581 logForDebugging( 582 '[voice] Focus mode disabled during recording, finishing', 583 ) 584 finishRecording() 585 } 586 return 587 } 588 let cancelled = false 589 if ( 590 isFocused && 591 stateRef.current === 'idle' && 592 !silenceTimedOutRef.current 593 ) { 594 const beginFocusRecording = (): void => { 595 // Re-check conditions — state or enabled/focusMode may have changed 596 // during the await (effect cleanup sets cancelled). 597 if ( 598 cancelled || 599 stateRef.current !== 'idle' || 600 silenceTimedOutRef.current 601 ) 602 return 603 logForDebugging('[voice] Focus gained, starting recording session') 604 focusTriggeredRef.current = true 605 void startRecordingSession() 606 armFocusSilenceTimer() 607 } 608 if (voiceModule) { 609 beginFocusRecording() 610 } else { 611 // Voice module is loading (async import resolves from cache as a 612 // microtask). Wait for it before starting the recording session. 613 void import('../services/voice.js').then(mod => { 614 voiceModule = mod 615 beginFocusRecording() 616 }) 617 } 618 } else if (!isFocused) { 619 // Clear the silence timeout flag on blur so the next focus 620 // cycle re-arms recording. 621 silenceTimedOutRef.current = false 622 if (stateRef.current === 'recording') { 623 logForDebugging('[voice] Focus lost, finishing recording') 624 finishRecording() 625 } 626 } 627 return () => { 628 cancelled = true 629 } 630 }, [enabled, focusMode, isFocused]) 631 632 // ── Start a new recording session (voice_stream connect + audio) ── 633 async function startRecordingSession(): Promise<void> { 634 if (!voiceModule) { 635 onErrorRef.current?.( 636 'Voice module not loaded yet. Try again in a moment.', 637 ) 638 return 639 } 640 641 // Transition to 'recording' synchronously, BEFORE any await. 
Callers 642 // read state synchronously right after `void startRecordingSession()`: 643 // - useVoiceIntegration.tsx space-hold guard reads voiceState from the 644 // store immediately — if it sees 'idle' it clears isSpaceHoldActiveRef 645 // and space auto-repeat leaks into the text input (100% repro) 646 // - handleKeyEvent's `currentState === 'idle'` re-entry check below 647 // If an await runs first, both see stale 'idle'. See PR #20873 review. 648 updateState('recording') 649 recordingStartRef.current = Date.now() 650 accumulatedRef.current = '' 651 seenRepeatRef.current = false 652 hasAudioSignalRef.current = false 653 retryUsedRef.current = false 654 silentDropRetriedRef.current = false 655 fullAudioRef.current = [] 656 focusFlushedCharsRef.current = 0 657 everConnectedRef.current = false 658 const myGen = ++sessionGenRef.current 659 660 // ── Pre-check: can we actually record audio? ────────────── 661 const availability = await voiceModule.checkRecordingAvailability() 662 if (!availability.available) { 663 logForDebugging( 664 `[voice] Recording not available: ${availability.reason ?? 'unknown'}`, 665 ) 666 onErrorRef.current?.( 667 availability.reason ?? 'Audio recording is not available.', 668 ) 669 cleanup() 670 updateState('idle') 671 return 672 } 673 674 logForDebugging( 675 '[voice] Starting recording session, connecting voice stream', 676 ) 677 // Clear any previous error 678 setVoiceState(prev => { 679 if (!prev.voiceError) return prev 680 return { ...prev, voiceError: null } 681 }) 682 683 // Buffer audio chunks while the WebSocket connects. Once the connection 684 // is ready (onReady fires), buffered chunks are flushed and subsequent 685 // chunks are sent directly. 686 const audioBuffer: Buffer[] = [] 687 688 // Start recording IMMEDIATELY — audio is buffered until the WebSocket 689 // opens, eliminating the 1-2s latency from waiting for OAuth + WS connect. 
690 logForDebugging( 691 '[voice] startRecording: buffering audio while WebSocket connects', 692 ) 693 audioLevelsRef.current = [] 694 const started = await voiceModule.startRecording( 695 (chunk: Buffer) => { 696 // Copy for fullAudioRef replay buffer. send() in voiceStreamSTT 697 // copies again defensively — acceptable overhead at audio rates. 698 // Skip buffering in focus mode — replay is gated on !focusTriggered 699 // so the buffer is dead weight (up to ~20MB for a 10min session). 700 const owned = Buffer.from(chunk) 701 if (!focusTriggeredRef.current) { 702 fullAudioRef.current.push(owned) 703 } 704 if (connectionRef.current) { 705 connectionRef.current.send(owned) 706 } else { 707 audioBuffer.push(owned) 708 } 709 // Update audio level histogram for the recording visualizer 710 const level = computeLevel(chunk) 711 if (!hasAudioSignalRef.current && level > 0.01) { 712 hasAudioSignalRef.current = true 713 } 714 const levels = audioLevelsRef.current 715 if (levels.length >= AUDIO_LEVEL_BARS) { 716 levels.shift() 717 } 718 levels.push(level) 719 // Copy the array so React sees a new reference 720 const snapshot = [...levels] 721 audioLevelsRef.current = snapshot 722 setVoiceState(prev => ({ ...prev, voiceAudioLevels: snapshot })) 723 }, 724 () => { 725 // External end (e.g. device error) - treat as stop 726 if (stateRef.current === 'recording') { 727 finishRecording() 728 } 729 }, 730 { silenceDetection: false }, 731 ) 732 733 if (!started) { 734 logError(new Error('[voice] Recording failed — no audio tool found')) 735 onErrorRef.current?.( 736 'Failed to start audio capture. 
Check that your microphone is accessible.', 737 ) 738 cleanup() 739 updateState('idle') 740 setVoiceState(prev => ({ 741 ...prev, 742 voiceError: 'Recording failed — no audio tool found', 743 })) 744 return 745 } 746 747 const rawLanguage = getInitialSettings().language 748 const stt = normalizeLanguageForSTT(rawLanguage) 749 logEvent('tengu_voice_recording_started', { 750 focusTriggered: focusTriggeredRef.current, 751 sttLanguage: 752 stt.code as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 753 sttLanguageIsDefault: !rawLanguage?.trim(), 754 sttLanguageFellBack: stt.fellBackFrom !== undefined, 755 // ISO 639 subtag from Intl (bounded set, never user text). undefined if 756 // Intl failed — omitted from the payload, no retry cost (cached). 757 systemLocaleLanguage: 758 getSystemLocaleLanguage() as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 759 }) 760 761 // Retry once if the connection errors before delivering any transcript. 762 // The conversation-engine proxy can reject rapid reconnects (~1/N_pods 763 // same-pod collision) or CE's Deepgram upstream can fail during its own 764 // teardown window (anthropics/anthropic#287008 surfaces this as 765 // TranscriptError instead of silent-drop). A 250ms backoff clears both. 766 // Audio captured during the retry window routes to audioBuffer (via the 767 // connectionRef.current null check in the recording callback above) and 768 // is flushed by the second onReady. 769 let sawTranscript = false 770 771 // Connect WebSocket in parallel with audio recording. 772 // Gather keyterms first (async but fast — no model calls), then connect. 773 // Bail from callbacks if a newer session has started. Prevents a 774 // slow-connecting zombie WS (e.g. user released, pressed again, first 775 // WS still handshaking) from firing onReady/onError into the new 776 // session and corrupting its connectionRef / triggering a bogus retry. 
    // True once a newer recording session has been started; every async
    // callback below bails out when stale so a superseded session cannot
    // mutate current state.
    const isStale = () => sessionGenRef.current !== myGen

    // Open the voice_stream WebSocket and wire its lifecycle callbacks.
    // `keyterms` are vocabulary hints forwarded to the STT backend.
    // Retries at most once on an early (pre-transcript), non-fatal error.
    const attemptConnect = (keyterms: string[]): void => {
      // Snapshot the attempt generation so trailing events from a
      // superseded connection attempt can be recognized and ignored.
      const myAttemptGen = attemptGenRef.current
      void connectVoiceStream(
        {
          onTranscript: (text: string, isFinal: boolean) => {
            if (isStale()) return
            // NOTE(review): sawTranscript is declared in the enclosing
            // session scope (outside this excerpt); it gates the one-shot
            // early retry in onError below.
            sawTranscript = true
            logForDebugging(
              `[voice] onTranscript: isFinal=${String(isFinal)} text="${text}"`,
            )
            if (isFinal && text.trim()) {
              if (focusTriggeredRef.current) {
                // Focus mode: flush each final transcript immediately and
                // keep recording. This gives continuous transcription while
                // the terminal is focused.
                logForDebugging(
                  `[voice] Focus mode: flushing final transcript immediately: "${text.trim()}"`,
                )
                onTranscriptRef.current(text.trim())
                focusFlushedCharsRef.current += text.trim().length
                // Clear the interim preview; returning `prev` unchanged
                // avoids a redundant re-render when it is already empty.
                setVoiceState(prev => {
                  if (prev.voiceInterimTranscript === '') return prev
                  return { ...prev, voiceInterimTranscript: '' }
                })
                accumulatedRef.current = ''
                // User is actively speaking — reset the silence timer.
                armFocusSilenceTimer()
              } else {
                // Hold-to-talk: accumulate final transcripts separated by spaces
                if (accumulatedRef.current) {
                  accumulatedRef.current += ' '
                }
                accumulatedRef.current += text.trim()
                logForDebugging(
                  `[voice] Accumulated final transcript: "${accumulatedRef.current}"`,
                )
                // The final supersedes any interim text: show the full
                // accumulated transcript as the live preview.
                setVoiceState(prev => {
                  const preview = accumulatedRef.current
                  if (prev.voiceInterimTranscript === preview) return prev
                  return { ...prev, voiceInterimTranscript: preview }
                })
              }
            } else if (!isFinal) {
              // Active interim speech resets the focus silence timer.
              // Nova 3 disables auto-finalize so isFinal is never true
              // mid-stream — without this, the 5s timer fires during
              // active speech and tears down the session.
              if (focusTriggeredRef.current) {
                armFocusSilenceTimer()
              }
              // Show accumulated finals + current interim as live preview
              const interim = text.trim()
              const preview = accumulatedRef.current
                ? accumulatedRef.current + (interim ? ' ' + interim : '')
                : interim
              setVoiceState(prev => {
                if (prev.voiceInterimTranscript === preview) return prev
                return { ...prev, voiceInterimTranscript: preview }
              })
            }
          },
          onError: (error: string, opts?: { fatal?: boolean }) => {
            if (isStale()) {
              logForDebugging(
                `[voice] ignoring onError from stale session: ${error}`,
              )
              return
            }
            // Swallow errors from superseded attempts. Covers conn 1's
            // trailing close after retry is scheduled, AND the current
            // conn's ws close event after its ws error already surfaced
            // below (gen bumped at surface).
            if (attemptGenRef.current !== myAttemptGen) {
              logForDebugging(
                `[voice] ignoring stale onError from superseded attempt: ${error}`,
              )
              return
            }
            // Early-failure retry: server error before any transcript =
            // likely a transient upstream race (CE rejection, Deepgram
            // not ready). Clear connectionRef so audio re-buffers, back
            // off, reconnect. Skip if the user has already released the
            // key (state left 'recording') — no point retrying a session
            // they've ended. Fatal errors (Cloudflare bot challenge, auth
            // rejection) are the same failure on every retry attempt, so
            // fall through to surface the message.
            if (
              !opts?.fatal &&
              !sawTranscript &&
              stateRef.current === 'recording'
            ) {
              if (!retryUsedRef.current) {
                retryUsedRef.current = true
                logForDebugging(
                  `[voice] early voice_stream error (pre-transcript), retrying once: ${error}`,
                )
                logEvent('tengu_voice_stream_early_retry', {})
                connectionRef.current = null
                attemptGenRef.current++
                // Extra setTimeout arguments are forwarded to the callback;
                // the shadowing parameters rebind the same outer values.
                // NOTE(review): behaviorally identical to a plain closure
                // capture — presumably a lint/style convention; confirm.
                setTimeout(
                  (stateRef, attemptConnect, keyterms) => {
                    if (stateRef.current === 'recording') {
                      attemptConnect(keyterms)
                    }
                  },
                  250,
                  stateRef,
                  attemptConnect,
                  keyterms,
                )
                return
              }
            }
            // Surfacing — bump gen so this conn's trailing close-error
            // (ws fires error then close 1006) is swallowed above.
            attemptGenRef.current++
            logError(new Error(`[voice] voice_stream error: ${error}`))
            onErrorRef.current?.(`Voice stream error: ${error}`)
            // Clear the audio buffer on error to avoid memory leaks
            audioBuffer.length = 0
            focusTriggeredRef.current = false
            cleanup()
            updateState('idle')
          },
          onClose: () => {
            // no-op; lifecycle handled by cleanup()
          },
          onReady: conn => {
            // Only proceed if we're still in recording state AND this is
            // still the current session. A zombie late-connecting WS from
            // an abandoned session can pass the 'recording' check if the
            // user has since started a new session.
            if (isStale() || stateRef.current !== 'recording') {
              conn.close()
              return
            }

            // The WebSocket is now truly open — assign connectionRef so
            // subsequent audio callbacks send directly instead of buffering.
            connectionRef.current = conn
            everConnectedRef.current = true

            // Flush all audio chunks that were buffered while the WebSocket
            // was connecting. This is safe because onReady fires from the
            // WebSocket 'open' event, guaranteeing send() will not be dropped.
            //
            // Coalesce into ~1s slices rather than one ws.send per chunk
            // — fewer WS frames means less overhead on both ends.
            const SLICE_TARGET_BYTES = 32_000 // ~1s at 16kHz/16-bit/mono
            if (audioBuffer.length > 0) {
              let totalBytes = 0
              for (const c of audioBuffer) totalBytes += c.length
              // Greedy packing: start a new slice whenever adding the next
              // chunk would push a non-empty slice past the target size.
              // An oversized single chunk still goes into its own slice.
              const slices: Buffer[][] = [[]]
              let sliceBytes = 0
              for (const chunk of audioBuffer) {
                if (
                  sliceBytes > 0 &&
                  sliceBytes + chunk.length > SLICE_TARGET_BYTES
                ) {
                  slices.push([])
                  sliceBytes = 0
                }
                slices[slices.length - 1]!.push(chunk)
                sliceBytes += chunk.length
              }
              logForDebugging(
                `[voice] onReady: flushing ${String(audioBuffer.length)} buffered chunks (${String(totalBytes)} bytes) as ${String(slices.length)} coalesced frame(s)`,
              )
              for (const slice of slices) {
                conn.send(Buffer.concat(slice))
              }
            }
            audioBuffer.length = 0

            // Reset the release timer now that the WebSocket is ready.
            // Only arm it if auto-repeat has been seen — otherwise the OS
            // key repeat delay (~500ms) hasn't elapsed yet and the timer
            // would fire prematurely.
            if (releaseTimerRef.current) {
              clearTimeout(releaseTimerRef.current)
            }
            if (seenRepeatRef.current) {
              releaseTimerRef.current = setTimeout(
                (releaseTimerRef, stateRef, finishRecording) => {
                  releaseTimerRef.current = null
                  if (stateRef.current === 'recording') {
                    finishRecording()
                  }
                },
                RELEASE_TIMEOUT_MS,
                releaseTimerRef,
                stateRef,
                finishRecording,
              )
            }
          },
        },
        {
          language: stt.code,
          keyterms,
        },
      ).then(conn => {
        // connectVoiceStream resolves null/undefined when no connection
        // could be established (e.g. no OAuth token).
        if (isStale()) {
          conn?.close()
          return
        }
        if (!conn) {
          logForDebugging(
            '[voice] Failed to connect to voice_stream (no OAuth token?)',
          )
          onErrorRef.current?.(
            'Voice mode requires a Claude.ai account. Please run /login to sign in.',
          )
          // Clear the audio buffer on failure
          audioBuffer.length = 0
          cleanup()
          updateState('idle')
          return
        }

        // Safety check: if the user released the key before connectVoiceStream
        // resolved (but after onReady already ran), close the connection.
        if (stateRef.current !== 'recording') {
          audioBuffer.length = 0
          conn.close()
          return
        }
      })
    }

    void getVoiceKeyterms().then(attemptConnect)
  }

  // ── Hold-to-talk handler ──────────────────────────────────────────────
  // Called on every keypress (including terminal auto-repeats while
  // the key is held). A gap longer than RELEASE_TIMEOUT_MS between
  // events is interpreted as key release.
  //
  // Recording starts immediately on the first keypress to eliminate
  // startup delay. The release timer is only armed after auto-repeat
  // is detected (to avoid false releases during the OS key repeat
  // delay of ~500ms on macOS).
  //
  // NOTE(review): startRecordingSession, finishRecording and
  // armFocusSilenceTimer are referenced but absent from the dependency
  // list below — presumably stable (ref-based) identities; confirm.
  const handleKeyEvent = useCallback(
    (fallbackMs = REPEAT_FALLBACK_MS): void => {
      if (!enabled || !isVoiceStreamAvailable()) {
        return
      }

      // In focus mode, recording is driven by terminal focus, not keypresses.
      if (focusTriggeredRef.current) {
        // Active focus recording — ignore key events (session ends on blur).
        return
      }
      if (focusMode && silenceTimedOutRef.current) {
        // Focus session timed out due to silence — keypress re-arms it.
        logForDebugging(
          '[voice] Re-arming focus recording after silence timeout',
        )
        silenceTimedOutRef.current = false
        focusTriggeredRef.current = true
        void startRecordingSession()
        armFocusSilenceTimer()
        return
      }

      const currentState = stateRef.current

      // Ignore keypresses while processing
      if (currentState === 'processing') {
        return
      }

      if (currentState === 'idle') {
        logForDebugging(
          '[voice] handleKeyEvent: idle, starting recording session immediately',
        )
        void startRecordingSession()
        // Fallback: if no auto-repeat arrives within REPEAT_FALLBACK_MS,
        // arm the release timer anyway (the user likely tapped and released).
        repeatFallbackTimerRef.current = setTimeout(
          (
            repeatFallbackTimerRef,
            stateRef,
            seenRepeatRef,
            releaseTimerRef,
            finishRecording,
          ) => {
            repeatFallbackTimerRef.current = null
            if (stateRef.current === 'recording' && !seenRepeatRef.current) {
              logForDebugging(
                '[voice] No auto-repeat seen, arming release timer via fallback',
              )
              seenRepeatRef.current = true
              releaseTimerRef.current = setTimeout(
                (releaseTimerRef, stateRef, finishRecording) => {
                  releaseTimerRef.current = null
                  if (stateRef.current === 'recording') {
                    finishRecording()
                  }
                },
                RELEASE_TIMEOUT_MS,
                releaseTimerRef,
                stateRef,
                finishRecording,
              )
            }
          },
          fallbackMs,
          repeatFallbackTimerRef,
          stateRef,
          seenRepeatRef,
          releaseTimerRef,
          finishRecording,
        )
      } else if (currentState === 'recording') {
        // Second+ keypress while recording — auto-repeat has started.
        seenRepeatRef.current = true
        if (repeatFallbackTimerRef.current) {
          clearTimeout(repeatFallbackTimerRef.current)
          repeatFallbackTimerRef.current = null
        }
      }

      // Reset the release timer on every keypress (including auto-repeats)
      if (releaseTimerRef.current) {
        clearTimeout(releaseTimerRef.current)
      }

      // Only arm the release timer once auto-repeat has been seen.
      // The OS key repeat delay is ~500ms on macOS; without this gate
      // the 200ms timer fires before repeat starts, causing a false release.
      if (stateRef.current === 'recording' && seenRepeatRef.current) {
        releaseTimerRef.current = setTimeout(
          (releaseTimerRef, stateRef, finishRecording) => {
            releaseTimerRef.current = null
            if (stateRef.current === 'recording') {
              finishRecording()
            }
          },
          RELEASE_TIMEOUT_MS,
          releaseTimerRef,
          stateRef,
          finishRecording,
        )
      }
    },
    [enabled, focusMode, cleanup],
  )

  // Cleanup only when disabled or unmounted - NOT on state changes
  useEffect(() => {
    if (!enabled && stateRef.current !== 'idle') {
      cleanup()
      updateState('idle')
    }
    return () => {
      cleanup()
    }
  }, [enabled, cleanup])

  return {
    state,
    handleKeyEvent,
  }
}