source dump of claude code
at main 106 lines 3.5 kB view raw
// Voice keyterms for improving STT accuracy in the voice_stream endpoint.
//
// Provides domain-specific vocabulary hints (Deepgram "keywords") so the STT
// engine correctly recognises coding terminology, project names, and branch
// names that would otherwise be misheard.

import { basename, extname } from 'path'
import { getProjectRoot } from '../bootstrap/state.js'
import { getBranch } from '../utils/git.js'

// ─── Global keyterms ────────────────────────────────────────────────

const GLOBAL_KEYTERMS: readonly string[] = [
  // Terms Deepgram consistently mangles without keyword hints.
  // Note: "Claude" and "Anthropic" are already server-side base keyterms.
  // Avoid terms nobody speaks aloud as-spelled (stdout → "standard out").
  'MCP',
  'symlink',
  'grep',
  'regex',
  'localhost',
  'codebase',
  'TypeScript',
  'JSON',
  'OAuth',
  'webhook',
  'gRPC',
  'dotfiles',
  'subagent',
  'worktree',
]

// ─── Helpers ────────────────────────────────────────────────────────

/**
 * Split an identifier (camelCase, PascalCase, kebab-case, snake_case, or
 * path segments) into individual words.
 *
 * Word boundaries are lower→upper case transitions and runs of `-`, `_`,
 * `.`, `/` or whitespace. Runs of capitals are not split further, so
 * "HTTPServer" stays a single fragment.
 *
 * @param name - Identifier, branch name, or file stem to split.
 * @returns The fragments in source order. Fragments of 2 chars or fewer, or
 *   longer than 20 chars, are discarded to avoid noise.
 */
export function splitIdentifier(name: string): string[] {
  return name
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    .split(/[-_./\s]+/)
    .map(w => w.trim())
    .filter(w => w.length > 2 && w.length <= 20)
}

/**
 * Extract keyterm candidates from a file path's base name.
 *
 * Only the final extension is removed before splitting ("foo.test.ts" →
 * "foo", "test"). `extname()` reports no extension for names whose only dot
 * is the leading one, so dotfiles such as ".gitignore" keep their name and
 * yield "gitignore" instead of being stripped down to an empty stem.
 *
 * @param filePath - Absolute or relative path of a file.
 * @returns Words of the file stem, filtered by `splitIdentifier`.
 */
function fileNameWords(filePath: string): string[] {
  const stem = basename(filePath, extname(filePath))
  return splitIdentifier(stem)
}

// ─── Public API ─────────────────────────────────────────────────────

/** Upper bound on the number of keyterms returned by `getVoiceKeyterms`. */
const MAX_KEYTERMS = 50

/**
 * Build a list of keyterms for the voice_stream STT endpoint.
 *
 * Combines hardcoded global coding terms with session context (project name,
 * git branch, recent files) without any model calls.
 *
 * Failures while reading the project root or the git branch are swallowed on
 * purpose: keyterms are a best-effort hint and the remaining sources are
 * still used.
 *
 * @param recentFiles - Optional set of recently touched file paths; words
 *   from their base names are added until the cap is reached.
 * @returns Up to `MAX_KEYTERMS` unique terms in insertion order: global
 *   terms first, then project name, branch words, and file-name words.
 */
export async function getVoiceKeyterms(
  recentFiles?: ReadonlySet<string>,
): Promise<string[]> {
  const terms = new Set<string>(GLOBAL_KEYTERMS)

  // Project root basename as a single term — users say "claude CLI internal"
  // as a phrase, not isolated words. Keeping the whole basename lets the
  // STT's keyterm boosting match the phrase regardless of separator.
  try {
    const projectRoot = getProjectRoot()
    if (projectRoot) {
      const name = basename(projectRoot)
      if (name.length > 2 && name.length <= 50) {
        terms.add(name)
      }
    }
  } catch {
    // getProjectRoot() may throw if not initialised yet — ignore
  }

  // Git branch words (e.g. "feat/voice-keyterms" → "feat", "voice", "keyterms")
  try {
    const branch = await getBranch()
    if (branch) {
      for (const word of splitIdentifier(branch)) {
        terms.add(word)
      }
    }
  } catch {
    // getBranch() may fail if not in a git repo — ignore
  }

  // Recent file names — only scan enough to fill remaining slots.
  // The cap is checked once per file, so the last file scanned may push the
  // set past MAX_KEYTERMS; the final slice below enforces the hard limit.
  if (recentFiles) {
    for (const filePath of recentFiles) {
      if (terms.size >= MAX_KEYTERMS) break
      for (const word of fileNameWords(filePath)) {
        terms.add(word)
      }
    }
  }

  return [...terms].slice(0, MAX_KEYTERMS)
}