source dump of claude code
at main 135 lines 4.9 kB view raw
import { logForDebugging } from '../utils/debug.js'
import { BridgeFatalError } from './bridgeApi.js'
import type { BridgeApiClient } from './types.js'

/**
 * Ant-only fault injection for manually testing bridge recovery paths.
 *
 * Real failure modes this targets (BQ 2026-03-12, 7-day window):
 *   poll 404 not_found_error — 147K sessions/week, dead onEnvironmentLost gate
 *   ws_closed 1002/1006 — 22K sessions/week, zombie poll after close
 *   register transient failure — residual: network blips during doReconnect
 *
 * Usage: /bridge-kick <subcommand> from the REPL while Remote Control is
 * connected, then tail debug.log to watch the recovery machinery react.
 *
 * Module-level state is intentional here: one bridge per REPL process, the
 * /bridge-kick slash command has no other way to reach into initBridgeCore's
 * closures, and teardown clears the slot.
 */

/** One-shot fault to inject on the next matching api call. */
type BridgeFault = {
  method:
    | 'pollForWork'
    | 'registerBridgeEnvironment'
    | 'reconnectSession'
    | 'heartbeatWork'
  /** Fatal errors go through handleErrorStatus → BridgeFatalError. Transient
   *  errors surface as plain axios rejections (5xx / network). Recovery code
   *  distinguishes the two: fatal → teardown, transient → retry/backoff. */
  kind: 'fatal' | 'transient'
  status: number
  errorType?: string
  /** Remaining injections. Decremented on consume; removed at 0. Must be
   *  at least 1 when queued; anything else is rejected at queue time. */
  count: number
}

export type BridgeDebugHandle = {
  /** Invoke the transport's permanent-close handler directly. Tests the
   *  ws_closed → reconnectEnvironmentWithSession escalation (#22148). */
  fireClose: (code: number) => void
  /** Call reconnectEnvironmentWithSession() — same as SIGUSR2 but
   *  reachable from the slash command. */
  forceReconnect: () => void
  /** Queue a fault for the next N calls to the named api method. */
  injectFault: (fault: BridgeFault) => void
  /** Abort the at-capacity sleep so an injected poll fault lands
   *  immediately instead of up to 10min later. */
  wakePollLoop: () => void
  /** env/session IDs for the debug.log grep. */
  describe: () => string
}

/** Currently registered handle, or null when none is registered. */
let debugHandle: BridgeDebugHandle | null = null
/** Pending faults, scanned front to back by the wrapped client's methods. */
const faultQueue: BridgeFault[] = []

/** Store `h` as the module-level debug handle, replacing any previous one. */
export function registerBridgeDebugHandle(h: BridgeDebugHandle): void {
  debugHandle = h
}

/** Drop the registered handle and discard every fault still queued. */
export function clearBridgeDebugHandle(): void {
  debugHandle = null
  faultQueue.length = 0
}

/** Return the registered debug handle, or null when none is registered. */
export function getBridgeDebugHandle(): BridgeDebugHandle | null {
  return debugHandle
}

/**
 * Queue a fault for the next `fault.count` calls to `fault.method`.
 *
 * A private copy is queued, so the countdown performed while consuming the
 * fault never mutates the object the caller passed in.
 *
 * Faults whose count is not at least 1 (0, negative, NaN) are logged and
 * ignored. Previously a NaN count was never removed from the queue
 * (`NaN <= 0` is false) and so fired on every matching call until teardown,
 * and a count of 0 still fired once.
 */
export function injectBridgeFault(fault: BridgeFault): void {
  const label = `${fault.method} ${fault.kind}/${fault.status}${fault.errorType ? `/${fault.errorType}` : ''}`
  // Negated `>=` on purpose: every comparison with NaN is false, so this
  // form rejects NaN along with 0 and negative counts.
  if (!(fault.count >= 1)) {
    logForDebugging(
      `[bridge:debug] Ignored fault with invalid count: ${label} ×${fault.count}`,
    )
    return
  }
  faultQueue.push({ ...fault })
  logForDebugging(`[bridge:debug] Queued fault: ${label} ×${fault.count}`)
}

/**
 * Wrap a BridgeApiClient so each call first checks the fault queue. If a
 * matching fault is queued, throw the specified error instead of calling
 * through. Delegates everything else to the real client.
 *
 * Only called when USER_TYPE === 'ant' — zero overhead in external builds.
 *
 * @param api The real client to delegate to when no fault matches.
 * @returns A client whose pollForWork, registerBridgeEnvironment,
 *   reconnectSession and heartbeatWork consult the fault queue first.
 */
export function wrapApiForFaultInjection(
  api: BridgeApiClient,
): BridgeApiClient {
  /**
   * Take one injection from the first queued fault targeting `method`.
   * Decrements its remaining count and removes it from the queue once the
   * count is exhausted. Returns null when nothing is queued for `method`.
   */
  function consume(method: BridgeFault['method']): BridgeFault | null {
    const idx = faultQueue.findIndex(f => f.method === method)
    if (idx === -1) return null
    const fault = faultQueue[idx]!
    fault.count--
    if (fault.count <= 0) faultQueue.splice(idx, 1)
    return fault
  }

  /** Log the injection, then throw the error shape matching `fault.kind`. */
  function throwFault(fault: BridgeFault, context: string): never {
    logForDebugging(
      `[bridge:debug] Injecting ${fault.kind} fault into ${context}: status=${fault.status} errorType=${fault.errorType ?? 'none'}`,
    )
    if (fault.kind === 'fatal') {
      throw new BridgeFatalError(
        `[injected] ${context} ${fault.status}`,
        fault.status,
        fault.errorType,
      )
    }
    // Transient: mimic an axios rejection (5xx / network). No .status on
    // the error itself — that's how the catch blocks distinguish.
    throw new Error(`[injected transient] ${context} ${fault.status}`)
  }

  return {
    // NOTE(review): object spread copies only own enumerable properties;
    // this assumes `api` is a plain object rather than a class instance
    // with prototype methods — confirm against the client factory.
    ...api,
    async pollForWork(envId, secret, signal, reclaimMs) {
      const f = consume('pollForWork')
      if (f) throwFault(f, 'Poll')
      return api.pollForWork(envId, secret, signal, reclaimMs)
    },
    async registerBridgeEnvironment(config) {
      const f = consume('registerBridgeEnvironment')
      if (f) throwFault(f, 'Registration')
      return api.registerBridgeEnvironment(config)
    },
    async reconnectSession(envId, sessionId) {
      const f = consume('reconnectSession')
      if (f) throwFault(f, 'ReconnectSession')
      return api.reconnectSession(envId, sessionId)
    },
    async heartbeatWork(envId, workId, token) {
      const f = consume('heartbeatWork')
      if (f) throwFault(f, 'Heartbeat')
      return api.heartbeatWork(envId, workId, token)
    },
  }
}