services/voice.ts at main · nonbinary.computer/claude-code

nonbinary.computer / claude-code
forked from oppi.li/claude-code
fork atom
source dump of claude code
fork atom
claude-code / services / voice.ts
at main 525 lines 17 kB view raw
wrap content
oppi.li dump from zip 11d ago
63aada3f
  1// Voice service: audio recording for push-to-talk voice input.
  2//
  3// Recording uses native audio capture (cpal) on macOS, Linux, and Windows
  4// for in-process mic access. If the native module is unavailable, falls
  5// back to arecord (ALSA) on Linux, then to SoX `rec` on Linux and macOS.
  6
  7import { type ChildProcess, spawn, spawnSync } from 'child_process'
  8import { readFile } from 'fs/promises'
  9import { logForDebugging } from '../utils/debug.js'
 10import { isEnvTruthy, isRunningOnHomespace } from '../utils/envUtils.js'
 11import { logError } from '../utils/log.js'
 12import { getPlatform } from '../utils/platform.js'
 13
 14// Lazy-loaded native audio module. audio-capture.node links against
 15// CoreAudio.framework + AudioUnit.framework; dlopen is synchronous and
 16// blocks the event loop for ~1s warm, up to ~8s on cold coreaudiod
 17// (post-wake, post-boot). Load happens on first voice keypress — no
 18// preload, because there's no way to make dlopen non-blocking and a
 19// startup freeze is worse than a first-press delay.
 20type AudioNapi = typeof import('audio-capture-napi')
 21let audioNapi: AudioNapi | null = null
 22let audioNapiPromise: Promise<AudioNapi> | null = null
 23
 24function loadAudioNapi(): Promise<AudioNapi> {
 25  audioNapiPromise ??= (async () => {
 26    const t0 = Date.now()
 27    const mod = await import('audio-capture-napi')
 28    // vendor/audio-capture-src/index.ts defers require(...node) until the
 29    // first function call — trigger it here so timing reflects real cost.
 30    mod.isNativeAudioAvailable()
 31    audioNapi = mod
 32    logForDebugging(`[voice] audio-capture-napi loaded in ${Date.now() - t0}ms`)
 33    return mod
 34  })()
 35  return audioNapiPromise
 36}
 37
 38// ─── Constants ───────────────────────────────────────────────────────
 39
 40const RECORDING_SAMPLE_RATE = 16000 // Hz (16 kHz); passed as `-r` to arecord and SoX rec
 41const RECORDING_CHANNELS = 1 // mono; passed as `-c` to arecord and SoX rec
 42
 43// SoX silence detection: stop after this duration of silence
 44const SILENCE_DURATION_SECS = '2.0' // string: pushed verbatim into the `rec` silence-effect args
 45const SILENCE_THRESHOLD = '3%' // string: appears twice in the silence-effect args built by startSoxRecording
 46
 47// ─── Dependency check ────────────────────────────────────────────────
 48
 49function hasCommand(cmd: string): boolean {
 50  // Spawn the target directly instead of `which cmd`. On Termux/Android
 51  // `which` is a shell builtin — the external binary is absent or
 52  // kernel-blocked (EPERM) when spawned from Node. Only reached on
 53  // non-Windows (win32 returns early from all callers), no PATHEXT issue.
 54  // result.error is set iff the spawn itself fails (ENOENT/EACCES); exit
 55  // code is irrelevant — an unrecognized --version still means cmd exists.
 56  const result = spawnSync(cmd, ['--version'], {
 57    stdio: 'ignore',
 58    timeout: 3000,
 59  })
 60  return result.error === undefined
 61}
 62
 63// Probe whether arecord can actually open a capture device. hasCommand()
 64// only checks PATH; on WSL1/Win10-WSL2/headless Linux the binary exists
 65// but fails at open() because there is no ALSA card and no PulseAudio
 66// server. On WSL2+WSLg (Win11), PulseAudio works via RDP pipes and arecord
 67// succeeds. We spawn with the same args as startArecordRecording() and race
 68// a short timer: if the process is still alive after 150ms it opened the
 69// device; if it exits early the stderr tells us why. Memoized — audio
 70// device availability does not change mid-session, and this is called on
 71// every voice keypress via checkRecordingAvailability().
 72type ArecordProbeResult = { ok: boolean; stderr: string }
 73let arecordProbe: Promise<ArecordProbeResult> | null = null
 74
 75function probeArecord(): Promise<ArecordProbeResult> {
 76  arecordProbe ??= new Promise(resolve => {
 77    const child = spawn(
 78      'arecord',
 79      [
 80        '-f',
 81        'S16_LE',
 82        '-r',
 83        String(RECORDING_SAMPLE_RATE),
 84        '-c',
 85        String(RECORDING_CHANNELS),
 86        '-t',
 87        'raw',
 88        '/dev/null',
 89      ],
 90      { stdio: ['ignore', 'ignore', 'pipe'] },
 91    )
 92    let stderr = ''
 93    child.stderr?.on('data', (chunk: Buffer) => {
 94      stderr += chunk.toString()
 95    })
 96    const timer = setTimeout(
 97      (c: ChildProcess, r: (v: ArecordProbeResult) => void) => {
 98        c.kill('SIGTERM')
 99        r({ ok: true, stderr: '' })
100      },
101      150,
102      child,
103      resolve,
104    )
105    child.once('close', code => {
106      clearTimeout(timer)
107      // SIGTERM close (code=null) after timer fired is already resolved.
108      // Early close with code=0 is unusual (arecord shouldn't exit on its
109      // own) but treat as ok.
110      void resolve({ ok: code === 0, stderr: stderr.trim() })
111    })
112    child.once('error', () => {
113      clearTimeout(timer)
114      void resolve({ ok: false, stderr: 'arecord: command not found' })
115    })
116  })
117  return arecordProbe
118}
119
// Test hook: drop the memoized arecord probe so the next probeArecord()
// call spawns arecord again instead of reusing the cached result.
export function _resetArecordProbeForTesting(): void {
  arecordProbe = null
}
123
124// cpal's ALSA backend writes to our process stderr when it can't find any
125// sound cards (it runs in-process — no subprocess pipe to capture it). The
126// spawn fallbacks below pipe stderr correctly, so skip native when ALSA has
127// nothing to open. Memoized: card presence doesn't change mid-session.
let linuxAlsaCardsMemo: Promise<boolean> | null = null

// Report whether /proc/asound/cards lists at least one sound card.
// Resolves to false when the file cannot be read, is empty, or contains
// the "no soundcards" marker. The promise is cached after the first call.
function linuxHasAlsaCards(): Promise<boolean> {
  if (linuxAlsaCardsMemo === null) {
    linuxAlsaCardsMemo = (async (): Promise<boolean> => {
      let listing: string
      try {
        listing = await readFile('/proc/asound/cards', 'utf8')
      } catch {
        // Unreadable or missing file: treat as "no cards".
        return false
      }
      const trimmed = listing.trim()
      return trimmed.length > 0 && !trimmed.includes('no soundcards')
    })()
  }
  return linuxAlsaCardsMemo
}
140
// Test hook: drop the memoized ALSA card check so the next
// linuxHasAlsaCards() call reads /proc/asound/cards again.
export function _resetAlsaCardsForTesting(): void {
  linuxAlsaCardsMemo = null
}
144
// How to install SoX on this machine: the executable and argv to run,
// plus the command line to show to the user.
type PackageManagerInfo = {
  cmd: string
  args: string[]
  displayCommand: string
}

// Pick the first package manager found on PATH that can install SoX.
// macOS only considers Homebrew. Linux tries apt-get, dnf, then pacman, in
// that order. Returns null when none is found or on any other platform.
function detectPackageManager(): PackageManagerInfo | null {
  switch (process.platform) {
    case 'darwin':
      return hasCommand('brew')
        ? {
            cmd: 'brew',
            args: ['install', 'sox'],
            displayCommand: 'brew install sox',
          }
        : null

    case 'linux': {
      // Ordered by preference; each entry is probed only if the previous
      // ones were not found.
      const candidates: Array<{ probe: string; info: PackageManagerInfo }> = [
        {
          probe: 'apt-get',
          info: {
            cmd: 'sudo',
            args: ['apt-get', 'install', '-y', 'sox'],
            displayCommand: 'sudo apt-get install sox',
          },
        },
        {
          probe: 'dnf',
          info: {
            cmd: 'sudo',
            args: ['dnf', 'install', '-y', 'sox'],
            displayCommand: 'sudo dnf install sox',
          },
        },
        {
          probe: 'pacman',
          info: {
            cmd: 'sudo',
            args: ['pacman', '-S', '--noconfirm', 'sox'],
            displayCommand: 'sudo pacman -S sox',
          },
        },
      ]
      for (const { probe, info } of candidates) {
        if (hasCommand(probe)) {
          return info
        }
      }
      return null
    }

    default:
      return null
  }
}
189
// Report whether a recording backend is installed. Returns the list of
// missing pieces and, when SoX is missing and a package manager is found,
// the install command to show to the user.
export async function checkVoiceDependencies(): Promise<{
  available: boolean
  missing: string[]
  installCommand: string | null
}> {
  const satisfied = () => ({
    available: true,
    missing: [] as string[],
    installCommand: null,
  })

  // The native module (cpal) covers macOS, Linux, and Windows by itself.
  const nativeModule = await loadAudioNapi()
  if (nativeModule.isNativeAudioAvailable()) {
    return satisfied()
  }

  const platform = process.platform

  // No fallback exists on Windows: without the native module, voice is out.
  if (platform === 'win32') {
    return {
      available: false,
      missing: ['Voice mode requires the native audio module (not loaded)'],
      installCommand: null,
    }
  }

  // arecord (ALSA utils) is an accepted fallback backend on Linux.
  if (platform === 'linux' && hasCommand('arecord')) {
    return satisfied()
  }

  // Last resort is SoX's `rec`.
  if (hasCommand('rec')) {
    return satisfied()
  }

  const installer = detectPackageManager()
  return {
    available: false,
    missing: ['sox (rec command)'],
    installCommand: installer?.displayCommand ?? null,
  }
}
228
229// ─── Recording availability ──────────────────────────────────────────
230
231export type RecordingAvailability = { // result shape of checkRecordingAvailability()
232  available: boolean // true when a recording backend is expected to work
233  reason: string | null // user-facing explanation when unavailable; null when available
234}
235
236// Probe-record through the full fallback chain (native → arecord → SoX)
237// to verify that at least one backend can record. On macOS this also
238// triggers the TCC permission dialog on first use. We trust the probe
239// result over the TCC status API, which can be unreliable for ad-hoc
240// signed or cross-architecture binaries (e.g., x64-on-arm64).
// Start and immediately stop a throwaway recording to find out whether
// recording can begin. Resolves to true when the probe recording started
// (or when the native module is not available and the check is skipped),
// false when it could not start.
export async function requestMicrophonePermission(): Promise<boolean> {
  const nativeModule = await loadAudioNapi()
  if (!nativeModule.isNativeAudioAvailable()) {
    // Non-native platforms skip this check.
    return true
  }

  // Permission probe only: the audio itself is thrown away and the
  // silence-detection end signal is ignored.
  const discardChunk = (_chunk: Buffer): void => {}
  const ignoreEnd = (): void => {}

  const probeStarted = await startRecording(discardChunk, ignoreEnd, {
    silenceDetection: false,
  })
  if (!probeStarted) {
    return false
  }

  stopRecording()
  return true
}
258
// Decide whether voice recording can work in this environment and, when
// it cannot, give the user-facing reason. Checks run in order: remote
// environment, native module, Windows (no fallback), arecord probe on
// Linux, then SoX.
export async function checkRecordingAvailability(): Promise<RecordingAvailability> {
  const usable = (): RecordingAvailability => ({
    available: true,
    reason: null,
  })
  const unusable = (reason: string): RecordingAvailability => ({
    available: false,
    reason,
  })

  // Remote environments have no local microphone to record from.
  if (isRunningOnHomespace() || isEnvTruthy(process.env.CLAUDE_CODE_REMOTE)) {
    return unusable(
      'Voice mode requires microphone access, but no audio device is available in this environment.\n\nTo use voice mode, run Claude Code locally instead.',
    )
  }

  // The native module (cpal) covers macOS, Linux, and Windows by itself.
  const nativeModule = await loadAudioNapi()
  if (nativeModule.isNativeAudioAvailable()) {
    return usable()
  }

  // Windows has no fallback backend.
  if (process.platform === 'win32') {
    return unusable(
      'Voice recording requires the native audio module, which could not be loaded.',
    )
  }

  const wslNoAudioReason =
    'Voice mode could not access an audio device in WSL.\n\nWSL2 with WSLg (Windows 11) provides audio via PulseAudio — if you are on Windows 10 or WSL1, run Claude Code in native Windows instead.'

  // Linux, WSL included: finding arecord on PATH is not enough, because
  // the binary can be present while opening the device fails (WSL1,
  // Win10-WSL2, headless Linux). On WSL2+WSLg (Win11 default) cpal fails
  // (no /proc/asound/cards) but arecord works through the PulseAudio RDP
  // pipes, so probe it for real.
  if (process.platform === 'linux' && hasCommand('arecord')) {
    const probe = await probeArecord()
    if (probe.ok) {
      return usable()
    }
    if (getPlatform() === 'wsl') {
      return unusable(wslNoAudioReason)
    }
    logForDebugging(`[voice] arecord probe failed: ${probe.stderr}`)
    // Not WSL: continue on to the SoX check.
  }

  // SoX fallback. Known gap: on WSL with SoX installed but no arecord,
  // nothing gets probed and the presence of `rec` is trusted as-is. That
  // is right for WSLg+SoX and only wrong for a near-zero population
  // (WSL1 × minimal distro × SoX-but-not-alsa-utils).
  if (hasCommand('rec')) {
    return usable()
  }

  // Neither arecord nor SoX on WSL: WSLg cannot be told apart from WSL1 or
  // Win10 without a backend to probe, and a plain "install SoX" hint would
  // mislead users who have no audio device at all. The WSL message sends
  // those users to native Windows and tells WSLg users that their setup
  // should work.
  if (getPlatform() === 'wsl') {
    return unusable(wslNoAudioReason)
  }

  const installer = detectPackageManager()
  if (installer) {
    return unusable(
      `Voice mode requires SoX for audio recording. Install it with: ${installer.displayCommand}`,
    )
  }
  return unusable(
    'Voice mode requires SoX for audio recording. Install SoX manually:\n  macOS: brew install sox\n  Ubuntu/Debian: sudo apt-get install sox\n  Fedora: sudo dnf install sox',
  )
}
329
330// ─── Recording (native audio on macOS/Linux/Windows; arecord fallback on Linux, SoX fallback on Linux/macOS) ───
331
332let activeRecorder: ChildProcess | null = null // spawned rec/arecord child that is currently recording, if any
333let nativeRecordingActive = false // true while a recording started through the native module is in progress
334
// Begin capturing microphone audio, trying backends in order: native
// module, then arecord (Linux only, if its probe succeeded), then SoX.
//
// onData receives each audio chunk. onEnd is called when the recording
// ends on its own; with the native backend that only happens when silence
// detection is enabled. options.silenceDetection defaults to enabled.
// Resolves to true when a backend was started, false on Windows when the
// native backend is unavailable or failed to start.
export async function startRecording(
  onData: (chunk: Buffer) => void,
  onEnd: () => void,
  options?: { silenceDetection?: boolean },
): Promise<boolean> {
  logForDebugging(`[voice] startRecording called, platform=${process.platform}`)

  // Native module first (macOS, Linux, Windows via cpal). On Linux it is
  // only used when ALSA reports at least one card.
  const napi = await loadAudioNapi()
  let canUseNative = napi.isNativeAudioAvailable()
  if (canUseNative && process.platform === 'linux') {
    canUseNative = await linuxHasAlsaCards()
  }
  const autoStopOnSilence = options?.silenceDetection !== false

  if (canUseNative) {
    // Make sure no earlier native recording is still running.
    if (nativeRecordingActive || napi.isNativeRecordingActive()) {
      napi.stopNativeRecording()
      nativeRecordingActive = false
    }

    const handleNativeEnd = (): void => {
      // Push-to-talk: the module's silence-triggered end is ignored and
      // recording goes on until the caller invokes stopRecording()
      // (e.g. when the user presses Ctrl+X).
      if (!autoStopOnSilence) {
        return
      }
      nativeRecordingActive = false
      onEnd()
    }

    const nativeStarted = napi.startNativeRecording((data: Buffer) => {
      onData(data)
    }, handleNativeEnd)
    if (nativeStarted) {
      nativeRecordingActive = true
      return true
    }
    // Native start failed: continue with the platform fallbacks.
  }

  // Windows has no fallback backend.
  if (process.platform === 'win32') {
    logForDebugging('[voice] Windows native recording unavailable, no fallback')
    return false
  }

  // Linux: prefer arecord (ALSA utils) over SoX, but only when the probe
  // says it works. That keeps backend choice in step with
  // checkRecordingAvailability(); otherwise a headless box with both
  // alsa-utils and SoX would pass the availability check via SoX and then
  // record with a broken arecord here. The probe is memoized.
  if (process.platform === 'linux' && hasCommand('arecord')) {
    const probe = await probeArecord()
    if (probe.ok) {
      return startArecordRecording(onData, onEnd)
    }
  }

  // SoX rec: Linux, or macOS when the native module is unavailable.
  return startSoxRecording(onData, onEnd, options)
}
397
// Start recording through SoX's `rec`, streaming raw PCM from its stdout.
//
// onData receives each stdout chunk. onEnd is called exactly once when the
// process closes or fails to spawn. options.silenceDetection (default
// enabled) adds SoX's `silence` effect so the process stops by itself
// after a quiet period. Always returns true: a spawn failure is reported
// asynchronously through the 'error' event, which triggers onEnd.
function startSoxRecording(
  onData: (chunk: Buffer) => void,
  onEnd: () => void,
  options?: { silenceDetection?: boolean },
): boolean {
  const useSilenceDetection = options?.silenceDetection !== false

  // Record raw PCM: 16 kHz, 16-bit signed, mono, to stdout.
  // --buffer 1024 forces SoX to flush audio in small chunks instead of
  // accumulating data in its internal buffer. Without this, SoX may buffer
  // several seconds of audio before writing anything to stdout when piped,
  // causing zero data flow until the process exits.
  const args = [
    '-q', // quiet
    '--buffer',
    '1024',
    '-t',
    'raw',
    '-r',
    String(RECORDING_SAMPLE_RATE),
    '-e',
    'signed',
    '-b',
    '16',
    '-c',
    String(RECORDING_CHANNELS),
    '-', // stdout
  ]

  // Add silence detection filter (auto-stop on silence).
  // Omit for push-to-talk where the user manually controls start/stop.
  if (useSilenceDetection) {
    args.push(
      'silence', // start/stop on silence
      '1',
      '0.1',
      SILENCE_THRESHOLD,
      '1',
      SILENCE_DURATION_SECS,
      SILENCE_THRESHOLD,
    )
  }

  const child = spawn('rec', args, {
    stdio: ['pipe', 'pipe', 'pipe'],
  })

  activeRecorder = child

  child.stdout?.on('data', (chunk: Buffer) => {
    onData(chunk)
  })

  // Consume stderr to prevent backpressure
  child.stderr?.on('data', () => {})

  // Node emits 'error' followed by 'close' when the spawn fails, so both
  // handlers funnel through one guarded finisher to keep onEnd single-shot.
  let finished = false
  const finish = (): void => {
    if (finished) {
      return
    }
    finished = true
    // Only clear the slot if it still points at this child. A late 'close'
    // from an already-stopped recorder must not wipe out a newer recorder,
    // or stopRecording() could never kill the new one.
    if (activeRecorder === child) {
      activeRecorder = null
    }
    onEnd()
  }

  child.on('close', finish)

  child.on('error', err => {
    logError(err)
    finish()
  })

  return true
}
467
// Start recording through ALSA's `arecord`, streaming raw PCM from its
// stdout.
//
// onData receives each stdout chunk. onEnd is called exactly once when the
// process closes or fails to spawn. Always returns true: a spawn failure
// is reported asynchronously through the 'error' event, which triggers
// onEnd.
function startArecordRecording(
  onData: (chunk: Buffer) => void,
  onEnd: () => void,
): boolean {
  // Record raw PCM: 16 kHz, 16-bit signed little-endian, mono, to stdout.
  // arecord does not support built-in silence detection, so this backend
  // is best suited for push-to-talk (silenceDetection: false).
  const args = [
    '-f',
    'S16_LE', // signed 16-bit little-endian
    '-r',
    String(RECORDING_SAMPLE_RATE),
    '-c',
    String(RECORDING_CHANNELS),
    '-t',
    'raw', // raw PCM, no WAV header
    '-q', // quiet — no progress output
    '-', // write to stdout
  ]

  const child = spawn('arecord', args, {
    stdio: ['pipe', 'pipe', 'pipe'],
  })

  activeRecorder = child

  child.stdout?.on('data', (chunk: Buffer) => {
    onData(chunk)
  })

  // Consume stderr to prevent backpressure
  child.stderr?.on('data', () => {})

  // Node emits 'error' followed by 'close' when the spawn fails, so both
  // handlers funnel through one guarded finisher to keep onEnd single-shot.
  let finished = false
  const finish = (): void => {
    if (finished) {
      return
    }
    finished = true
    // Only clear the slot if it still points at this child. A late 'close'
    // from an already-stopped recorder must not wipe out a newer recorder,
    // or stopRecording() could never kill the new one.
    if (activeRecorder === child) {
      activeRecorder = null
    }
    onEnd()
  }

  child.on('close', finish)

  child.on('error', err => {
    logError(err)
    finish()
  })

  return true
}
514
// Stop whichever recording is in progress. A native recording takes
// precedence; otherwise the spawned rec/arecord child, if there is one,
// is sent SIGTERM. Does nothing when no recording is active.
export function stopRecording(): void {
  const napi = audioNapi
  if (nativeRecordingActive && napi !== null) {
    napi.stopNativeRecording()
    nativeRecordingActive = false
    return
  }

  const recorder = activeRecorder
  if (recorder === null) {
    return
  }
  recorder.kill('SIGTERM')
  activeRecorder = null
}