// Source dump of Claude Code — at main, 525 lines, 17 kB.
// Voice service: audio recording for push-to-talk voice input.
//
// Recording uses native audio capture (cpal) on macOS, Linux, and Windows
// for in-process mic access. Falls back to SoX `rec` or arecord (ALSA)
// on Linux if the native module is unavailable.

import { type ChildProcess, spawn, spawnSync } from 'child_process'
import { readFile } from 'fs/promises'
import { logForDebugging } from '../utils/debug.js'
import { isEnvTruthy, isRunningOnHomespace } from '../utils/envUtils.js'
import { logError } from '../utils/log.js'
import { getPlatform } from '../utils/platform.js'

// Lazy-loaded native audio module. audio-capture.node links against
// CoreAudio.framework + AudioUnit.framework; dlopen is synchronous and
// blocks the event loop for ~1s warm, up to ~8s on cold coreaudiod
// (post-wake, post-boot). Load happens on first voice keypress — no
// preload, because there's no way to make dlopen non-blocking and a
// startup freeze is worse than a first-press delay.
type AudioNapi = typeof import('audio-capture-napi')
let audioNapi: AudioNapi | null = null
let audioNapiPromise: Promise<AudioNapi> | null = null

/**
 * Load the native audio module once and share the result.
 *
 * The first call starts the dynamic import and memoizes the promise; later
 * calls return the same promise. On success the module is also stored in
 * `audioNapi` so synchronous code (e.g. stopRecording) can reach it.
 */
function loadAudioNapi(): Promise<AudioNapi> {
  audioNapiPromise ??= (async () => {
    const t0 = Date.now()
    const mod = await import('audio-capture-napi')
    // vendor/audio-capture-src/index.ts defers require(...node) until the
    // first function call — trigger it here so timing reflects real cost.
    mod.isNativeAudioAvailable()
    audioNapi = mod
    logForDebugging(`[voice] audio-capture-napi loaded in ${Date.now() - t0}ms`)
    return mod
  })()
  return audioNapiPromise
}

// ─── Constants ───────────────────────────────────────────────────────

const RECORDING_SAMPLE_RATE = 16000
const RECORDING_CHANNELS = 1

// SoX silence detection: stop after this duration of silence
const SILENCE_DURATION_SECS = '2.0'
const SILENCE_THRESHOLD = '3%'

// ─── Dependency check ────────────────────────────────────────────────

/**
 * Report whether `cmd` can be spawned from this process.
 *
 * @param cmd - Executable name (resolved via PATH) or path.
 * @returns true when the process could be started, regardless of how it
 *   exited; false when the spawn itself failed (e.g. ENOENT/EACCES).
 */
function hasCommand(cmd: string): boolean {
  // Spawn the target directly instead of `which cmd`. On Termux/Android
  // `which` is a shell builtin — the external binary is absent or
  // kernel-blocked (EPERM) when spawned from Node. Only reached on
  // non-Windows (win32 returns early from all callers), no PATHEXT issue.
  // The exit code is irrelevant — an unrecognized --version still means
  // cmd exists.
  const result = spawnSync(cmd, ['--version'], {
    stdio: 'ignore',
    timeout: 3000,
  })
  if (result.error === undefined) {
    return true
  }
  // spawnSync also populates `error` when the timeout elapses. A timeout can
  // only happen after the process was started, so the command exists — it is
  // just slow to answer `--version`. Every other error is a spawn failure.
  return (result.error as NodeJS.ErrnoException).code === 'ETIMEDOUT'
}

// Probe whether arecord can actually open a capture device. hasCommand()
// only checks PATH; on WSL1/Win10-WSL2/headless Linux the binary exists
// but fails at open() because there is no ALSA card and no PulseAudio
// server. On WSL2+WSLg (Win11), PulseAudio works via RDP pipes and arecord
// succeeds. We spawn with the same args as startArecordRecording() and race
// a short timer: if the process is still alive after 150ms it opened the
// device; if it exits early the stderr tells us why. Memoized — audio
// device availability does not change mid-session, and this is called on
// every voice keypress via checkRecordingAvailability().
type ArecordProbeResult = { ok: boolean; stderr: string }
let arecordProbe: Promise<ArecordProbeResult> | null = null

function probeArecord(): Promise<ArecordProbeResult> {
  if (arecordProbe === null) {
    arecordProbe = new Promise<ArecordProbeResult>(settle => {
      // Same capture format as the real recorder, but the audio is thrown
      // away: only whether the device opens matters here.
      const probeArgs = [
        '-f',
        'S16_LE',
        '-r',
        String(RECORDING_SAMPLE_RATE),
        '-c',
        String(RECORDING_CHANNELS),
        '-t',
        'raw',
        '/dev/null',
      ]
      const proc = spawn('arecord', probeArgs, {
        stdio: ['ignore', 'ignore', 'pipe'],
      })

      const stderrPieces: string[] = []
      proc.stderr?.on('data', (piece: Buffer) => {
        stderrPieces.push(piece.toString())
      })

      // Still running after 150ms: the device opened. Stop the probe and
      // report success.
      const aliveTimer = setTimeout(() => {
        proc.kill('SIGTERM')
        settle({ ok: true, stderr: '' })
      }, 150)

      proc.once('close', exitCode => {
        clearTimeout(aliveTimer)
        // A close caused by our own SIGTERM (exitCode=null) arrives after the
        // promise has settled, so this call is a no-op in that case. An early
        // exit with code 0 is unexpected for arecord but counts as ok.
        settle({ ok: exitCode === 0, stderr: stderrPieces.join('').trim() })
      })

      proc.once('error', () => {
        clearTimeout(aliveTimer)
        settle({ ok: false, stderr: 'arecord: command not found' })
      })
    })
  }
  return arecordProbe
}

/** Drop the memoized probe result so the next call probes again (tests only). */
export function _resetArecordProbeForTesting(): void {
  arecordProbe = null
}

// cpal's ALSA backend writes to our process stderr when it can't find any
// sound cards (it runs in-process — no subprocess pipe to capture it). The
// spawn fallbacks below pipe stderr correctly, so skip native when ALSA has
// nothing to open. Memoized: card presence doesn't change mid-session.
let linuxAlsaCardsMemo: Promise<boolean> | null = null

function linuxHasAlsaCards(): Promise<boolean> {
  if (linuxAlsaCardsMemo === null) {
    linuxAlsaCardsMemo = (async () => {
      let listing: string
      try {
        listing = (await readFile('/proc/asound/cards', 'utf8')).trim()
      } catch {
        // Unreadable or missing file: report no cards.
        return false
      }
      return listing !== '' && !listing.includes('no soundcards')
    })()
  }
  return linuxAlsaCardsMemo
}

/** Drop the memoized ALSA card check so the next call re-reads it (tests only). */
export function _resetAlsaCardsForTesting(): void {
  linuxAlsaCardsMemo = null
}

type PackageManagerInfo = {
  cmd: string
  args: string[]
  displayCommand: string
}

/**
 * Pick the package manager that could install SoX on this machine.
 *
 * @returns the install invocation for the first manager found (brew on
 *   macOS; apt-get, then dnf, then pacman on Linux), or null when none is
 *   found or the platform is neither of those.
 */
function detectPackageManager(): PackageManagerInfo | null {
  switch (process.platform) {
    case 'darwin':
      return hasCommand('brew')
        ? {
            cmd: 'brew',
            args: ['install', 'sox'],
            displayCommand: 'brew install sox',
          }
        : null

    case 'linux': {
      // Checked in order; the first manager present wins.
      const candidates: Array<{ probe: string; info: PackageManagerInfo }> = [
        {
          probe: 'apt-get',
          info: {
            cmd: 'sudo',
            args: ['apt-get', 'install', '-y', 'sox'],
            displayCommand: 'sudo apt-get install sox',
          },
        },
        {
          probe: 'dnf',
          info: {
            cmd: 'sudo',
            args: ['dnf', 'install', '-y', 'sox'],
            displayCommand: 'sudo dnf install sox',
          },
        },
        {
          probe: 'pacman',
          info: {
            cmd: 'sudo',
            args: ['pacman', '-S', '--noconfirm', 'sox'],
            displayCommand: 'sudo pacman -S sox',
          },
        },
      ]
      for (const { probe, info } of candidates) {
        if (hasCommand(probe)) {
          return info
        }
      }
      return null
    }

    default:
      return null
  }
}

/**
 * Report whether some recording backend is installed.
 *
 * @returns `available` plus, when nothing usable is found, the list of
 *   missing pieces and (if a package manager is detected) a command the user
 *   can run to install SoX.
 */
export async function checkVoiceDependencies(): Promise<{
  available: boolean
  missing: string[]
  installCommand: string | null
}> {
  // Native audio module (cpal) handles everything on macOS, Linux, and Windows
  const nativeModule = await loadAudioNapi()
  if (nativeModule.isNativeAudioAvailable()) {
    return { available: true, missing: [], installCommand: null }
  }

  const platform = process.platform

  // Windows has no supported fallback — native module is required
  if (platform === 'win32') {
    return {
      available: false,
      missing: ['Voice mode requires the native audio module (not loaded)'],
      installCommand: null,
    }
  }

  // On Linux, arecord (ALSA utils) is a valid fallback recording backend
  if (platform === 'linux' && hasCommand('arecord')) {
    return { available: true, missing: [], installCommand: null }
  }

  // Last resort: SoX's `rec`
  if (hasCommand('rec')) {
    return { available: true, missing: [], installCommand: null }
  }

  const manager = detectPackageManager()
  return {
    available: false,
    missing: ['sox (rec command)'],
    installCommand: manager === null ? null : manager.displayCommand,
  }
}

// ─── Recording availability ──────────────────────────────────────────

export type RecordingAvailability = {
  available: boolean
  reason: string | null
}

// Probe-record through the full fallback chain (native → arecord → SoX)
// to verify that at least one backend can record. On macOS this also
// triggers the TCC permission dialog on first use. We trust the probe
// result over the TCC status API, which can be unreliable for ad-hoc
// signed or cross-architecture binaries (e.g., x64-on-arm64).
export async function requestMicrophonePermission(): Promise<boolean> {
  const napi = await loadAudioNapi()
  if (!napi.isNativeAudioAvailable()) {
    return true // non-native platforms skip this check
  }

  const started = await startRecording(
    _chunk => {}, // discard audio data — this is a permission probe only
    () => {}, // ignore silence-detection end signal
    { silenceDetection: false },
  )
  if (started) {
    stopRecording()
    return true
  }
  return false
}

/**
 * Decide whether voice recording can work in this environment.
 *
 * @returns `available: true` when a backend is expected to work; otherwise
 *   `available: false` with a user-facing `reason`.
 */
export async function checkRecordingAvailability(): Promise<RecordingAvailability> {
  // Remote environments have no local microphone
  if (isRunningOnHomespace() || isEnvTruthy(process.env.CLAUDE_CODE_REMOTE)) {
    return {
      available: false,
      reason:
        'Voice mode requires microphone access, but no audio device is available in this environment.\n\nTo use voice mode, run Claude Code locally instead.',
    }
  }

  // Native audio module (cpal) handles everything on macOS, Linux, and Windows
  const napi = await loadAudioNapi()
  if (napi.isNativeAudioAvailable()) {
    return { available: true, reason: null }
  }

  // Windows has no supported fallback
  if (process.platform === 'win32') {
    return {
      available: false,
      reason:
        'Voice recording requires the native audio module, which could not be loaded.',
    }
  }

  const wslNoAudioReason =
    'Voice mode could not access an audio device in WSL.\n\nWSL2 with WSLg (Windows 11) provides audio via PulseAudio — if you are on Windows 10 or WSL1, run Claude Code in native Windows instead.'

  // On Linux (including WSL), probe arecord. hasCommand() is insufficient:
  // the binary can exist while the device open() fails (WSL1, Win10-WSL2,
  // headless Linux). WSL2+WSLg (Win11 default) works via PulseAudio RDP
  // pipes — cpal fails (no /proc/asound/cards) but arecord succeeds.
  if (process.platform === 'linux' && hasCommand('arecord')) {
    const probe = await probeArecord()
    if (probe.ok) {
      return { available: true, reason: null }
    }
    if (getPlatform() === 'wsl') {
      return { available: false, reason: wslNoAudioReason }
    }
    logForDebugging(`[voice] arecord probe failed: ${probe.stderr}`)
    // fall through to SoX
  }

  // Fallback: check for SoX
  if (!hasCommand('rec')) {
    // WSL without arecord AND without SoX: the generic "install SoX"
    // hint below is misleading on WSL1/Win10 (no audio devices at all),
    // but correct on WSL2+WSLg (SoX works via PulseAudio). Since we can't
    // distinguish WSLg-vs-not without a backend to probe, show the WSLg
    // guidance — it points WSL1 users at native Windows AND tells WSLg
    // users their setup should work (they can install sox or alsa-utils).
    // Known gap: WSL with SoX but NO arecord skips both this branch and
    // the probe above — hasCommand('rec') lies the same way. We optimistically
    // trust it (WSLg+SoX would work) rather than probeSox() for a near-zero
    // population (WSL1 × minimal distro × SoX-but-not-alsa-utils).
    if (getPlatform() === 'wsl') {
      return { available: false, reason: wslNoAudioReason }
    }
    const pm = detectPackageManager()
    return {
      available: false,
      reason: pm
        ? `Voice mode requires SoX for audio recording. Install it with: ${pm.displayCommand}`
        : 'Voice mode requires SoX for audio recording. Install SoX manually:\n macOS: brew install sox\n Ubuntu/Debian: sudo apt-get install sox\n Fedora: sudo dnf install sox',
    }
  }

  return { available: true, reason: null }
}

// ─── Recording (native audio on macOS/Linux/Windows, SoX/arecord fallback on Linux) ─────────────

// Subprocess backend (SoX or arecord) currently recording, if any.
let activeRecorder: ChildProcess | null = null
// True while a recording started through the native module is in progress.
let nativeRecordingActive = false

/**
 * Start capturing microphone audio.
 *
 * Tries the native module first, then arecord (Linux only, when the probe
 * succeeded), then SoX `rec`.
 *
 * @param onData - Receives each chunk of captured audio.
 * @param onEnd - Called when the recording ends. For the native backend it
 *   fires only when silence detection is enabled; for the subprocess
 *   backends it fires when the process closes or fails.
 * @param options - `silenceDetection: false` selects push-to-talk behaviour
 *   (no auto-stop on silence). Defaults to enabled.
 * @returns true when a backend was started, false when none is usable.
 */
export async function startRecording(
  onData: (chunk: Buffer) => void,
  onEnd: () => void,
  options?: { silenceDetection?: boolean },
): Promise<boolean> {
  logForDebugging(`[voice] startRecording called, platform=${process.platform}`)

  // Try native audio module first (macOS, Linux, Windows via cpal)
  const napi = await loadAudioNapi()
  const nativeAvailable =
    napi.isNativeAudioAvailable() &&
    (process.platform !== 'linux' || (await linuxHasAlsaCards()))
  const useSilenceDetection = options?.silenceDetection !== false
  if (nativeAvailable) {
    // Ensure any previous recording is fully stopped
    if (nativeRecordingActive || napi.isNativeRecordingActive()) {
      napi.stopNativeRecording()
      nativeRecordingActive = false
    }
    const started = napi.startNativeRecording(
      (data: Buffer) => {
        onData(data)
      },
      () => {
        if (useSilenceDetection) {
          nativeRecordingActive = false
          onEnd()
        }
        // In push-to-talk mode, ignore the native module's silence-triggered
        // onEnd. Recording continues until the caller explicitly calls
        // stopRecording() (e.g. when the user presses Ctrl+X).
      },
    )
    if (started) {
      nativeRecordingActive = true
      return true
    }
    // Native recording failed — fall through to platform fallbacks
  }

  // Windows has no supported fallback
  if (process.platform === 'win32') {
    logForDebugging('[voice] Windows native recording unavailable, no fallback')
    return false
  }

  // Mirror the native path: stop a fallback recorder left over from an
  // earlier call. Otherwise the reference below is overwritten and the old
  // process can never be stopped.
  if (activeRecorder) {
    activeRecorder.kill('SIGTERM')
    activeRecorder = null
  }

  // On Linux, try arecord (ALSA utils) before SoX. Consult the probe so
  // backend selection matches checkRecordingAvailability() — otherwise
  // on headless Linux with both alsa-utils and SoX, the availability
  // check falls through to SoX (probe.ok=false, not WSL) but this path
  // would still pick broken arecord. Probe is memoized; zero latency.
  if (
    process.platform === 'linux' &&
    hasCommand('arecord') &&
    (await probeArecord()).ok
  ) {
    return startArecordRecording(onData, onEnd)
  }

  // Fallback: SoX rec (Linux, or macOS if native module unavailable)
  return startSoxRecording(onData, onEnd, options)
}

// Register `child` as the active recorder and wire its output and lifecycle
// events to the caller's callbacks.
function attachRecorderProcess(
  child: ChildProcess,
  onData: (chunk: Buffer) => void,
  onEnd: () => void,
): void {
  activeRecorder = child

  child.stdout?.on('data', (chunk: Buffer) => {
    onData(chunk)
  })

  // Consume stderr to prevent backpressure
  child.stderr?.on('data', () => {})

  let ended = false
  const finish = (): void => {
    // Only clear the slot if it still points at this process. A late 'close'
    // from a recorder that was already stopped must not erase the reference
    // to a newer recorder, or stopRecording() could no longer reach it.
    if (activeRecorder === child) {
      activeRecorder = null
    }
    // 'error' and 'close' can both fire for one process (e.g. a failed
    // spawn); report the end of the recording exactly once.
    if (ended) {
      return
    }
    ended = true
    onEnd()
  }

  child.on('close', finish)

  child.on('error', err => {
    logError(err)
    finish()
  })
}

/**
 * Start recording with SoX `rec`, streaming raw PCM to `onData`.
 *
 * @returns always true; spawn failures are reported through `onEnd`.
 */
function startSoxRecording(
  onData: (chunk: Buffer) => void,
  onEnd: () => void,
  options?: { silenceDetection?: boolean },
): boolean {
  const useSilenceDetection = options?.silenceDetection !== false

  // Record raw PCM: 16 kHz, 16-bit signed, mono, to stdout.
  // --buffer 1024 forces SoX to flush audio in small chunks instead of
  // accumulating data in its internal buffer. Without this, SoX may buffer
  // several seconds of audio before writing anything to stdout when piped,
  // causing zero data flow until the process exits.
  const args = [
    '-q', // quiet
    '--buffer',
    '1024',
    '-t',
    'raw',
    '-r',
    String(RECORDING_SAMPLE_RATE),
    '-e',
    'signed',
    '-b',
    '16',
    '-c',
    String(RECORDING_CHANNELS),
    '-', // stdout
  ]

  // Add silence detection filter (auto-stop on silence).
  // Omit for push-to-talk where the user manually controls start/stop.
  if (useSilenceDetection) {
    args.push(
      'silence', // start/stop on silence
      '1',
      '0.1',
      SILENCE_THRESHOLD,
      '1',
      SILENCE_DURATION_SECS,
      SILENCE_THRESHOLD,
    )
  }

  const child = spawn('rec', args, {
    stdio: ['pipe', 'pipe', 'pipe'],
  })

  attachRecorderProcess(child, onData, onEnd)

  return true
}

/**
 * Start recording with arecord, streaming raw PCM to `onData`.
 *
 * @returns always true; spawn failures are reported through `onEnd`.
 */
function startArecordRecording(
  onData: (chunk: Buffer) => void,
  onEnd: () => void,
): boolean {
  // Record raw PCM: 16 kHz, 16-bit signed little-endian, mono, to stdout.
  // arecord does not support built-in silence detection, so this backend
  // is best suited for push-to-talk (silenceDetection: false).
  const args = [
    '-f',
    'S16_LE', // signed 16-bit little-endian
    '-r',
    String(RECORDING_SAMPLE_RATE),
    '-c',
    String(RECORDING_CHANNELS),
    '-t',
    'raw', // raw PCM, no WAV header
    '-q', // quiet — no progress output
    '-', // write to stdout
  ]

  const child = spawn('arecord', args, {
    stdio: ['pipe', 'pipe', 'pipe'],
  })

  attachRecorderProcess(child, onData, onEnd)

  return true
}

/**
 * Stop whichever recording is in progress, if any.
 *
 * A native recording takes precedence; otherwise the active subprocess
 * recorder is sent SIGTERM.
 */
export function stopRecording(): void {
  if (nativeRecordingActive && audioNapi) {
    audioNapi.stopNativeRecording()
    nativeRecordingActive = false
    return
  }
  if (activeRecorder) {
    activeRecorder.kill('SIGTERM')
    activeRecorder = null
  }
}