source dump of claude code
at main 2999 lines 116 kB view raw
import { feature } from 'bun:bundle'
import { randomUUID } from 'crypto'
import { hostname, tmpdir } from 'os'
import { basename, join, resolve } from 'path'
import { getRemoteSessionUrl } from '../constants/product.js'
import { shutdownDatadog } from '../services/analytics/datadog.js'
import { shutdown1PEventLogging } from '../services/analytics/firstPartyEventLogger.js'
import { checkGate_CACHED_OR_BLOCKING } from '../services/analytics/growthbook.js'
import {
  type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  logEvent,
  logEventAsync,
} from '../services/analytics/index.js'
import { isInBundledMode } from '../utils/bundledMode.js'
import { logForDebugging } from '../utils/debug.js'
import { logForDiagnosticsNoPII } from '../utils/diagLogs.js'
import { isEnvTruthy, isInProtectedNamespace } from '../utils/envUtils.js'
import { errorMessage } from '../utils/errors.js'
import { truncateToWidth } from '../utils/format.js'
import { logError } from '../utils/log.js'
import { sleep } from '../utils/sleep.js'
import { createAgentWorktree, removeAgentWorktree } from '../utils/worktree.js'
import {
  BridgeFatalError,
  createBridgeApiClient,
  isExpiredErrorType,
  isSuppressible403,
  validateBridgeId,
} from './bridgeApi.js'
import { formatDuration } from './bridgeStatusUtil.js'
import { createBridgeLogger } from './bridgeUI.js'
import { createCapacityWake } from './capacityWake.js'
import { describeAxiosError } from './debugUtils.js'
import { createTokenRefreshScheduler } from './jwtUtils.js'
import { getPollIntervalConfig } from './pollConfig.js'
import { toCompatSessionId, toInfraSessionId } from './sessionIdCompat.js'
import { createSessionSpawner, safeFilenameId } from './sessionRunner.js'
import { getTrustedDeviceToken } from './trustedDevice.js'
import {
  BRIDGE_LOGIN_ERROR,
  type BridgeApiClient,
  type BridgeConfig,
  type BridgeLogger,
  DEFAULT_SESSION_TIMEOUT_MS,
  type SessionDoneStatus,
  type SessionHandle,
  type SessionSpawner,
  type SessionSpawnOpts,
  type SpawnMode,
} from './types.js'
import {
  buildCCRv2SdkUrl,
  buildSdkUrl,
  decodeWorkSecret,
  registerWorker,
  sameSessionId,
} from './workSecret.js'

/**
 * Timing knobs accepted by runBridgeLoop (all values in milliseconds).
 *
 * The `conn*` and `general*` triples are presumably the initial delay, delay
 * cap and give-up budget for connection errors vs. all other poll errors —
 * NOTE(review): the error path that consumes them is outside this excerpt.
 */
export type BackoffConfig = {
  connInitialMs: number
  connCapMs: number
  connGiveUpMs: number
  generalInitialMs: number
  generalCapMs: number
  generalGiveUpMs: number
  /** SIGTERM→SIGKILL grace period on shutdown. Default 30s. */
  shutdownGraceMs?: number
  /** stopWorkWithRetry base delay (1s/2s/4s backoff). Default 1000ms. */
  stopWorkBaseDelayMs?: number
}

/** Backoff settings used by runBridgeLoop when the caller passes none. */
const DEFAULT_BACKOFF: BackoffConfig = {
  connInitialMs: 2_000,
  connCapMs: 120_000, // 2 minutes
  connGiveUpMs: 600_000, // 10 minutes
  generalInitialMs: 500,
  generalCapMs: 30_000,
  generalGiveUpMs: 600_000, // 10 minutes
}

/** Status update interval for the live display (ms). */
const STATUS_UPDATE_INTERVAL_MS = 1_000
const SPAWN_SESSIONS_DEFAULT = 32

/**
 * GrowthBook gate for multi-session spawn modes (--spawn / --capacity / --create-session-in-dir).
 * Sibling of tengu_ccr_bridge_multi_environment (multiple envs per host:dir) —
 * this one enables multiple sessions per environment.
 * Rollout staged via targeting rules: ants first, then gradual external.
 *
 * Uses the blocking gate check so a stale disk-cache miss doesn't unfairly
 * deny access. The fast path (cache has true) is still instant; only the
 * cold-start path awaits the server fetch, and that fetch also seeds the
 * disk cache for next time.
 */
async function isMultiSessionSpawnEnabled(): Promise<boolean> {
  return checkGate_CACHED_OR_BLOCKING('tengu_ccr_bridge_multi_session')
}

/**
 * Returns the threshold for detecting system sleep/wake in the poll loop.
 * Must exceed the max backoff cap — otherwise normal backoff delays trigger
 * false sleep detection (resetting the error budget indefinitely). Using
 * 2× the connection backoff cap, matching the pattern in WebSocketTransport
 * and replBridge.
 *
 * @param backoff Backoff configuration; only `connCapMs` is read.
 * @returns Twice `backoff.connCapMs`, in milliseconds.
 */
function pollSleepDetectionThresholdMs(backoff: BackoffConfig): number {
  return backoff.connCapMs * 2
}

/**
 * Returns the args that must precede CLI flags when spawning a child claude
 * process. In compiled binaries, process.execPath is the claude binary itself
 * and args go directly to it. In npm installs (node running cli.js),
 * process.execPath is the node runtime — the child spawn must pass the script
 * path as the first arg, otherwise node interprets --sdk-url as a node option
 * and exits with "bad option: --sdk-url". See anthropics/claude-code#28334.
 *
 * @returns `[]` in bundled mode or when `process.argv[1]` is empty, otherwise
 *   a one-element array holding `process.argv[1]`.
 */
function spawnScriptArgs(): string[] {
  if (isInBundledMode() || !process.argv[1]) {
    return []
  }
  return [process.argv[1]]
}

/**
 * Attempt to spawn a session; returns error string if spawn throws.
 *
 * @param spawner Spawner whose `spawn` is invoked.
 * @param opts Spawn options forwarded unchanged.
 * @param dir Directory forwarded unchanged as the second `spawn` argument.
 * @returns The session handle on success, or the thrown error's message
 *   (also reported through logError) when `spawn` throws.
 */
function safeSpawn(
  spawner: SessionSpawner,
  opts: SessionSpawnOpts,
  dir: string,
): SessionHandle | string {
  try {
    return spawner.spawn(opts, dir)
  } catch (err) {
    const errMsg = errorMessage(err)
    logError(new Error(`Session spawn failed: ${errMsg}`))
    return errMsg
  }
}

export async function runBridgeLoop(
  config: BridgeConfig,
  environmentId: string,
  environmentSecret: string,
  api: BridgeApiClient,
  spawner: SessionSpawner,
  logger: BridgeLogger,
  signal: AbortSignal,
  backoffConfig: BackoffConfig = DEFAULT_BACKOFF,
  initialSessionId?: string,
  getAccessToken?: () => string | undefined | Promise<string | undefined>,
): Promise<void> {
  // Local abort controller so that onSessionDone can stop the poll loop.
  // Linked to the incoming signal so external aborts also work.
const controller = new AbortController()
if (signal.aborted) {
  controller.abort()
} else {
  signal.addEventListener('abort', () => controller.abort(), { once: true })
}
const loopSignal = controller.signal

const activeSessions = new Map<string, SessionHandle>()
const sessionStartTimes = new Map<string, number>()
const sessionWorkIds = new Map<string, string>()
// Compat-surface ID (session_*) computed once at spawn and cached so
// cleanup and status-update ticks use the same key regardless of whether
// the tengu_bridge_repl_v2_cse_shim_enabled gate flips mid-session.
const sessionCompatIds = new Map<string, string>()
// Session ingress JWTs for heartbeat auth, keyed by sessionId.
// Stored separately from handle.accessToken because the token refresh
// scheduler overwrites that field with the OAuth token (~3h55m in).
const sessionIngressTokens = new Map<string, string>()
const sessionTimers = new Map<string, ReturnType<typeof setTimeout>>()
const completedWorkIds = new Set<string>()
const sessionWorktrees = new Map<
  string,
  {
    worktreePath: string
    worktreeBranch?: string
    gitRoot?: string
    hookBased?: boolean
  }
>()
// Track sessions killed by the timeout watchdog so onSessionDone can
// distinguish them from server-initiated or shutdown interrupts.
const timedOutSessions = new Set<string>()
// Sessions that already have a title (server-set or bridge-derived) so
// onFirstUserMessage doesn't clobber a user-assigned --name / web rename.
// Keyed by compatSessionId to match logger.setSessionTitle's key.
const titledSessions = new Set<string>()
// Signal to wake the at-capacity sleep early when a session completes,
// so the bridge can immediately accept new work.
const capacityWake = createCapacityWake(loopSignal)

/**
 * Heartbeat all active work items.
 *
 * Sessions without a recorded work ID or ingress token are skipped.
 * Heartbeats are sent one at a time; after the pass, every session whose
 * heartbeat got a 401/403 is re-queued via reconnectSession (JWT expired —
 * the next poll then delivers fresh work).
 *
 * @returns In order of precedence:
 *   - 'fatal' if any heartbeat threw a BridgeFatalError with a status other
 *     than 401/403,
 *   - 'auth_failed' if any heartbeat got a 401/403,
 *   - 'ok' if at least one heartbeat succeeded,
 *   - 'failed' otherwise (every attempt failed for other reasons, or no
 *     session was eligible for a heartbeat).
 */
async function heartbeatActiveWorkItems(): Promise<
  'ok' | 'auth_failed' | 'fatal' | 'failed'
> {
  let anySuccess = false
  let anyFatal = false
  const authFailedSessions: string[] = []
  for (const [sessionId] of activeSessions) {
    const workId = sessionWorkIds.get(sessionId)
    const ingressToken = sessionIngressTokens.get(sessionId)
    if (!workId || !ingressToken) {
      continue
    }
    try {
      await api.heartbeatWork(environmentId, workId, ingressToken)
      anySuccess = true
    } catch (err) {
      logForDebugging(
        `[bridge:heartbeat] Failed for sessionId=${sessionId} workId=${workId}: ${errorMessage(err)}`,
      )
      // Errors that are not BridgeFatalError are only debug-logged; they
      // count toward neither 'fatal' nor 'auth_failed'.
      if (err instanceof BridgeFatalError) {
        logEvent('tengu_bridge_heartbeat_error', {
          status:
            err.status as unknown as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
          error_type: (err.status === 401 || err.status === 403
            ? 'auth_failed'
            : 'fatal') as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
        })
        if (err.status === 401 || err.status === 403) {
          authFailedSessions.push(sessionId)
        } else {
          // 404/410 = environment expired or deleted — no point retrying
          anyFatal = true
        }
      }
    }
  }
  // JWT expired → trigger server-side re-dispatch. Without this, work stays
  // ACK'd out of the Redis PEL and poll returns empty forever (CC-1263).
  // The existingHandle path below delivers the fresh token to the child.
  // sessionId is already in the format /bridge/reconnect expects: it comes
  // from work.data.id, which matches the server's EnvironmentInstance store
  // (cse_* under the compat gate, session_* otherwise).
  for (const sessionId of authFailedSessions) {
    logger.logVerbose(
      `Session ${sessionId} token expired — re-queuing via bridge/reconnect`,
    )
    try {
      await api.reconnectSession(environmentId, sessionId)
      logForDebugging(
        `[bridge:heartbeat] Re-queued sessionId=${sessionId} via bridge/reconnect`,
      )
    } catch (err) {
      logger.logError(
        `Failed to refresh session ${sessionId} token: ${errorMessage(err)}`,
      )
      logForDebugging(
        `[bridge:heartbeat] reconnectSession(${sessionId}) failed: ${errorMessage(err)}`,
        { level: 'error' },
      )
    }
  }
  if (anyFatal) {
    return 'fatal'
  }
  if (authFailedSessions.length > 0) {
    return 'auth_failed'
  }
  return anySuccess ? 'ok' : 'failed'
}

// Sessions spawned with CCR v2 env vars. v2 children cannot use OAuth
// tokens (CCR worker endpoints validate the JWT's session_id claim,
// register_worker.go:32), so onRefresh triggers server re-dispatch
// instead — the next poll delivers fresh work with a new JWT via the
// existingHandle path below.
const v2Sessions = new Set<string>()

// Proactive token refresh: schedules a timer 5min before the session
// ingress JWT expires. v1 delivers OAuth directly; v2 calls
// reconnectSession to trigger server re-dispatch (CC-1263: without
// this, v2 daemon sessions silently die at ~5h since the server does
// not auto-re-dispatch ACK'd work on lease expiry).
// No scheduler is created (null) when the caller supplied no getAccessToken.
const tokenRefresh = getAccessToken
  ? createTokenRefreshScheduler({
      getAccessToken,
      onRefresh: (sessionId, oauthToken) => {
        const handle = activeSessions.get(sessionId)
        if (!handle) {
          return
        }
        if (v2Sessions.has(sessionId)) {
          logger.logVerbose(
            `Refreshing session ${sessionId} token via bridge/reconnect`,
          )
          // Fire-and-forget: failures are logged, never thrown.
          void api
            .reconnectSession(environmentId, sessionId)
            .catch((err: unknown) => {
              logger.logError(
                `Failed to refresh session ${sessionId} token: ${errorMessage(err)}`,
              )
              logForDebugging(
                `[bridge:token] reconnectSession(${sessionId}) failed: ${errorMessage(err)}`,
                { level: 'error' },
              )
            })
        } else {
          handle.updateAccessToken(oauthToken)
        }
      },
      label: 'bridge',
    })
  : null
const loopStartTime = Date.now()
// Track all in-flight cleanup promises (stopWork, worktree removal) so
// the shutdown sequence can await them before process.exit().
const pendingCleanups = new Set<Promise<unknown>>()
/** Registers a cleanup promise and drops it from the set once it settles. */
function trackCleanup(p: Promise<unknown>): void {
  pendingCleanups.add(p)
  void p.finally(() => pendingCleanups.delete(p))
}
let connBackoff = 0
let generalBackoff = 0
let connErrorStart: number | null = null
let generalErrorStart: number | null = null
let lastPollErrorTime: number | null = null
let statusUpdateTimer: ReturnType<typeof setInterval> | null = null
// Set by BridgeFatalError and give-up paths so the shutdown block can
// skip the resume message (resume is impossible after env expiry/auth
// failure/sustained connection errors).
331 let fatalExit = false 332 333 logForDebugging( 334 `[bridge:work] Starting poll loop spawnMode=${config.spawnMode} maxSessions=${config.maxSessions} environmentId=${environmentId}`, 335 ) 336 logForDiagnosticsNoPII('info', 'bridge_loop_started', { 337 max_sessions: config.maxSessions, 338 spawn_mode: config.spawnMode, 339 }) 340 341 // For ant users, show where session debug logs will land so they can tail them. 342 // sessionRunner.ts uses the same base path. File appears once a session spawns. 343 if (process.env.USER_TYPE === 'ant') { 344 let debugGlob: string 345 if (config.debugFile) { 346 const ext = config.debugFile.lastIndexOf('.') 347 debugGlob = 348 ext > 0 349 ? `${config.debugFile.slice(0, ext)}-*${config.debugFile.slice(ext)}` 350 : `${config.debugFile}-*` 351 } else { 352 debugGlob = join(tmpdir(), 'claude', 'bridge-session-*.log') 353 } 354 logger.setDebugLogPath(debugGlob) 355 } 356 357 logger.printBanner(config, environmentId) 358 359 // Seed the logger's session count + spawn mode before any render. Without 360 // this, setAttached() below renders with the logger's default sessionMax=1, 361 // showing "Capacity: 0/1" until the status ticker kicks in (which is gated 362 // by !initialSessionId and only starts after the poll loop picks up work). 363 logger.updateSessionCount(0, config.maxSessions, config.spawnMode) 364 365 // If an initial session was pre-created, show its URL from the start so 366 // the user can click through immediately (matching /remote-control behavior). 367 if (initialSessionId) { 368 logger.setAttached(initialSessionId) 369 } 370 371 /** Refresh the inline status display. Shows idle or active depending on state. */ 372 function updateStatusDisplay(): void { 373 // Push the session count (no-op when maxSessions === 1) so the 374 // next renderStatusLine tick shows the current count. 
375 logger.updateSessionCount( 376 activeSessions.size, 377 config.maxSessions, 378 config.spawnMode, 379 ) 380 381 // Push per-session activity into the multi-session display. 382 for (const [sid, handle] of activeSessions) { 383 const act = handle.currentActivity 384 if (act) { 385 logger.updateSessionActivity(sessionCompatIds.get(sid) ?? sid, act) 386 } 387 } 388 389 if (activeSessions.size === 0) { 390 logger.updateIdleStatus() 391 return 392 } 393 394 // Show the most recently started session that is still actively working. 395 // Sessions whose current activity is 'result' or 'error' are between 396 // turns — the CLI emitted its result but the process stays alive waiting 397 // for the next user message. Skip updating so the status line keeps 398 // whatever state it had (Attached / session title). 399 const [sessionId, handle] = [...activeSessions.entries()].pop()! 400 const startTime = sessionStartTimes.get(sessionId) 401 if (!startTime) return 402 403 const activity = handle.currentActivity 404 if (!activity || activity.type === 'result' || activity.type === 'error') { 405 // Session is between turns — keep current status (Attached/titled). 406 // In multi-session mode, still refresh so bullet-list activities stay current. 407 if (config.maxSessions > 1) logger.refreshDisplay() 408 return 409 } 410 411 const elapsed = formatDuration(Date.now() - startTime) 412 413 // Build trail from recent tool activities (last 5) 414 const trail = handle.activities 415 .filter(a => a.type === 'tool_start') 416 .slice(-5) 417 .map(a => a.summary) 418 419 logger.updateSessionStatus(sessionId, elapsed, activity, trail) 420 } 421 422 /** Start the status display update ticker. */ 423 function startStatusUpdates(): void { 424 stopStatusUpdates() 425 // Call immediately so the first transition (e.g. Connecting → Ready) 426 // happens without delay, avoiding concurrent timer races. 
427 updateStatusDisplay() 428 statusUpdateTimer = setInterval( 429 updateStatusDisplay, 430 STATUS_UPDATE_INTERVAL_MS, 431 ) 432 } 433 434 /** Stop the status display update ticker. */ 435 function stopStatusUpdates(): void { 436 if (statusUpdateTimer) { 437 clearInterval(statusUpdateTimer) 438 statusUpdateTimer = null 439 } 440 } 441 442 function onSessionDone( 443 sessionId: string, 444 startTime: number, 445 handle: SessionHandle, 446 ): (status: SessionDoneStatus) => void { 447 return (rawStatus: SessionDoneStatus): void => { 448 const workId = sessionWorkIds.get(sessionId) 449 activeSessions.delete(sessionId) 450 sessionStartTimes.delete(sessionId) 451 sessionWorkIds.delete(sessionId) 452 sessionIngressTokens.delete(sessionId) 453 const compatId = sessionCompatIds.get(sessionId) ?? sessionId 454 sessionCompatIds.delete(sessionId) 455 logger.removeSession(compatId) 456 titledSessions.delete(compatId) 457 v2Sessions.delete(sessionId) 458 // Clear per-session timeout timer 459 const timer = sessionTimers.get(sessionId) 460 if (timer) { 461 clearTimeout(timer) 462 sessionTimers.delete(sessionId) 463 } 464 // Clear token refresh timer 465 tokenRefresh?.cancel(sessionId) 466 // Wake the at-capacity sleep so the bridge can accept new work immediately 467 capacityWake.wake() 468 469 // If the session was killed by the timeout watchdog, treat it as a 470 // failed session (not a server/shutdown interrupt) so we still call 471 // stopWork and archiveSession below. 472 const wasTimedOut = timedOutSessions.delete(sessionId) 473 const status: SessionDoneStatus = 474 wasTimedOut && rawStatus === 'interrupted' ? 'failed' : rawStatus 475 const durationMs = Date.now() - startTime 476 477 logForDebugging( 478 `[bridge:session] sessionId=${sessionId} workId=${workId ?? 
'unknown'} exited status=${status} duration=${formatDuration(durationMs)}`, 479 ) 480 logEvent('tengu_bridge_session_done', { 481 status: 482 status as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 483 duration_ms: durationMs, 484 }) 485 logForDiagnosticsNoPII('info', 'bridge_session_done', { 486 status, 487 duration_ms: durationMs, 488 }) 489 490 // Clear the status display before printing final log 491 logger.clearStatus() 492 stopStatusUpdates() 493 494 // Build error message from stderr if available 495 const stderrSummary = 496 handle.lastStderr.length > 0 ? handle.lastStderr.join('\n') : undefined 497 let failureMessage: string | undefined 498 499 switch (status) { 500 case 'completed': 501 logger.logSessionComplete(sessionId, durationMs) 502 break 503 case 'failed': 504 // Skip failure log during shutdown — the child exits non-zero when 505 // killed, which is expected and not a real failure. 506 // Also skip for timeout-killed sessions — the timeout watchdog 507 // already logged a clear timeout message. 508 if (!wasTimedOut && !loopSignal.aborted) { 509 failureMessage = stderrSummary ?? 'Process exited with error' 510 logger.logSessionFailed(sessionId, failureMessage) 511 logError(new Error(`Bridge session failed: ${failureMessage}`)) 512 } 513 break 514 case 'interrupted': 515 logger.logVerbose(`Session ${sessionId} interrupted`) 516 break 517 } 518 519 // Notify the server that this work item is done. Skip for interrupted 520 // sessions — interrupts are either server-initiated (the server already 521 // knows) or caused by bridge shutdown (which calls stopWork() separately). 
522 if (status !== 'interrupted' && workId) { 523 trackCleanup( 524 stopWorkWithRetry( 525 api, 526 environmentId, 527 workId, 528 logger, 529 backoffConfig.stopWorkBaseDelayMs, 530 ), 531 ) 532 completedWorkIds.add(workId) 533 } 534 535 // Clean up worktree if one was created for this session 536 const wt = sessionWorktrees.get(sessionId) 537 if (wt) { 538 sessionWorktrees.delete(sessionId) 539 trackCleanup( 540 removeAgentWorktree( 541 wt.worktreePath, 542 wt.worktreeBranch, 543 wt.gitRoot, 544 wt.hookBased, 545 ).catch((err: unknown) => 546 logger.logVerbose( 547 `Failed to remove worktree ${wt.worktreePath}: ${errorMessage(err)}`, 548 ), 549 ), 550 ) 551 } 552 553 // Lifecycle decision: in multi-session mode, keep the bridge running 554 // after a session completes. In single-session mode, abort the poll 555 // loop so the bridge exits cleanly. 556 if (status !== 'interrupted' && !loopSignal.aborted) { 557 if (config.spawnMode !== 'single-session') { 558 // Multi-session: archive the completed session so it doesn't linger 559 // as stale in the web UI. archiveSession is idempotent (409 if already 560 // archived), so double-archiving at shutdown is safe. 561 // sessionId arrived as cse_* from the work poll (infrastructure-layer 562 // tag). archiveSession hits /v1/sessions/{id}/archive which is the 563 // compat surface and validates TagSession (session_*). Re-tag — same 564 // UUID underneath. 
565 trackCleanup( 566 api 567 .archiveSession(compatId) 568 .catch((err: unknown) => 569 logger.logVerbose( 570 `Failed to archive session ${sessionId}: ${errorMessage(err)}`, 571 ), 572 ), 573 ) 574 logForDebugging( 575 `[bridge:session] Session ${status}, returning to idle (multi-session mode)`, 576 ) 577 } else { 578 // Single-session: coupled lifecycle — tear down environment 579 logForDebugging( 580 `[bridge:session] Session ${status}, aborting poll loop to tear down environment`, 581 ) 582 controller.abort() 583 return 584 } 585 } 586 587 if (!loopSignal.aborted) { 588 startStatusUpdates() 589 } 590 } 591 } 592 593 // Start the idle status display immediately — unless we have a pre-created 594 // session, in which case setAttached() already set up the display and the 595 // poll loop will start status updates when it picks up the session. 596 if (!initialSessionId) { 597 startStatusUpdates() 598 } 599 600 while (!loopSignal.aborted) { 601 // Fetched once per iteration — the GrowthBook cache refreshes every 602 // 5 min, so a loop running at the at-capacity rate picks up config 603 // changes within one sleep cycle. 604 const pollConfig = getPollIntervalConfig() 605 606 try { 607 const work = await api.pollForWork( 608 environmentId, 609 environmentSecret, 610 loopSignal, 611 pollConfig.reclaim_older_than_ms, 612 ) 613 614 // Log reconnection if we were previously disconnected 615 const wasDisconnected = 616 connErrorStart !== null || generalErrorStart !== null 617 if (wasDisconnected) { 618 const disconnectedMs = 619 Date.now() - (connErrorStart ?? generalErrorStart ?? 
Date.now()) 620 logger.logReconnected(disconnectedMs) 621 logForDebugging( 622 `[bridge:poll] Reconnected after ${formatDuration(disconnectedMs)}`, 623 ) 624 logEvent('tengu_bridge_reconnected', { 625 disconnected_ms: disconnectedMs, 626 }) 627 } 628 629 connBackoff = 0 630 generalBackoff = 0 631 connErrorStart = null 632 generalErrorStart = null 633 lastPollErrorTime = null 634 635 // Null response = no work available in the queue. 636 // Add a minimum delay to avoid hammering the server. 637 if (!work) { 638 // Use live check (not a snapshot) since sessions can end during poll. 639 const atCap = activeSessions.size >= config.maxSessions 640 if (atCap) { 641 const atCapMs = pollConfig.multisession_poll_interval_ms_at_capacity 642 // Heartbeat loops WITHOUT polling. When at-capacity polling is also 643 // enabled (atCapMs > 0), the loop tracks a deadline and breaks out 644 // to poll at that interval — heartbeat and poll compose instead of 645 // one suppressing the other. We break out to poll when: 646 // - Poll deadline reached (atCapMs > 0 only) 647 // - Auth fails (JWT expired → poll refreshes tokens) 648 // - Capacity wake fires (session ended → poll for new work) 649 // - Loop aborted (shutdown) 650 if (pollConfig.non_exclusive_heartbeat_interval_ms > 0) { 651 logEvent('tengu_bridge_heartbeat_mode_entered', { 652 active_sessions: activeSessions.size, 653 heartbeat_interval_ms: 654 pollConfig.non_exclusive_heartbeat_interval_ms, 655 }) 656 // Deadline computed once at entry — GB updates to atCapMs don't 657 // shift an in-flight deadline (next entry picks up the new value). 658 const pollDeadline = atCapMs > 0 ? 
Date.now() + atCapMs : null 659 let hbResult: 'ok' | 'auth_failed' | 'fatal' | 'failed' = 'ok' 660 let hbCycles = 0 661 while ( 662 !loopSignal.aborted && 663 activeSessions.size >= config.maxSessions && 664 (pollDeadline === null || Date.now() < pollDeadline) 665 ) { 666 // Re-read config each cycle so GrowthBook updates take effect 667 const hbConfig = getPollIntervalConfig() 668 if (hbConfig.non_exclusive_heartbeat_interval_ms <= 0) break 669 670 // Capture capacity signal BEFORE the async heartbeat call so 671 // a session ending during the HTTP request is caught by the 672 // subsequent sleep (instead of being lost to a replaced controller). 673 const cap = capacityWake.signal() 674 675 hbResult = await heartbeatActiveWorkItems() 676 if (hbResult === 'auth_failed' || hbResult === 'fatal') { 677 cap.cleanup() 678 break 679 } 680 681 hbCycles++ 682 await sleep( 683 hbConfig.non_exclusive_heartbeat_interval_ms, 684 cap.signal, 685 ) 686 cap.cleanup() 687 } 688 689 // Determine exit reason for telemetry 690 const exitReason = 691 hbResult === 'auth_failed' || hbResult === 'fatal' 692 ? hbResult 693 : loopSignal.aborted 694 ? 'shutdown' 695 : activeSessions.size < config.maxSessions 696 ? 'capacity_changed' 697 : pollDeadline !== null && Date.now() >= pollDeadline 698 ? 'poll_due' 699 : 'config_disabled' 700 logEvent('tengu_bridge_heartbeat_mode_exited', { 701 reason: 702 exitReason as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 703 heartbeat_cycles: hbCycles, 704 active_sessions: activeSessions.size, 705 }) 706 if (exitReason === 'poll_due') { 707 // bridgeApi throttles empty-poll logs (EMPTY_POLL_LOG_INTERVAL=100) 708 // so the once-per-10min poll_due poll is invisible at counter=2. 709 // Log it here so verification runs see both endpoints in the debug log. 
710 logForDebugging( 711 `[bridge:poll] Heartbeat poll_due after ${hbCycles} cycles — falling through to pollForWork`, 712 ) 713 } 714 715 // On auth_failed or fatal, sleep before polling to avoid a tight 716 // poll+heartbeat loop. Auth_failed: heartbeatActiveWorkItems 717 // already called reconnectSession — the sleep gives the server 718 // time to propagate the re-queue. Fatal (404/410): may be a 719 // single work item GCd while the environment is still valid. 720 // Use atCapMs if enabled, else the heartbeat interval as a floor 721 // (guaranteed > 0 here) so heartbeat-only configs don't tight-loop. 722 if (hbResult === 'auth_failed' || hbResult === 'fatal') { 723 const cap = capacityWake.signal() 724 await sleep( 725 atCapMs > 0 726 ? atCapMs 727 : pollConfig.non_exclusive_heartbeat_interval_ms, 728 cap.signal, 729 ) 730 cap.cleanup() 731 } 732 } else if (atCapMs > 0) { 733 // Heartbeat disabled: slow poll as liveness signal. 734 const cap = capacityWake.signal() 735 await sleep(atCapMs, cap.signal) 736 cap.cleanup() 737 } 738 } else { 739 const interval = 740 activeSessions.size > 0 741 ? pollConfig.multisession_poll_interval_ms_partial_capacity 742 : pollConfig.multisession_poll_interval_ms_not_at_capacity 743 await sleep(interval, loopSignal) 744 } 745 continue 746 } 747 748 // At capacity — we polled to keep the heartbeat alive, but cannot 749 // accept new work right now. We still enter the switch below so that 750 // token refreshes for existing sessions are processed (the case 751 // 'session' handler checks for existing sessions before the inner 752 // capacity guard). 753 const atCapacityBeforeSwitch = activeSessions.size >= config.maxSessions 754 755 // Skip work items that have already been completed and stopped. 756 // The server may re-deliver stale work before processing our stop 757 // request, which would otherwise cause a duplicate session spawn. 
758 if (completedWorkIds.has(work.id)) { 759 logForDebugging( 760 `[bridge:work] Skipping already-completed workId=${work.id}`, 761 ) 762 // Respect capacity throttle — without a sleep here, persistent stale 763 // redeliveries would tight-loop at poll-request speed (the !work 764 // branch above is the only sleep, and work != null skips it). 765 if (atCapacityBeforeSwitch) { 766 const cap = capacityWake.signal() 767 if (pollConfig.non_exclusive_heartbeat_interval_ms > 0) { 768 await heartbeatActiveWorkItems() 769 await sleep( 770 pollConfig.non_exclusive_heartbeat_interval_ms, 771 cap.signal, 772 ) 773 } else if (pollConfig.multisession_poll_interval_ms_at_capacity > 0) { 774 await sleep( 775 pollConfig.multisession_poll_interval_ms_at_capacity, 776 cap.signal, 777 ) 778 } 779 cap.cleanup() 780 } else { 781 await sleep(1000, loopSignal) 782 } 783 continue 784 } 785 786 // Decode the work secret for session spawning and to extract the JWT 787 // used for the ack call below. 788 let secret 789 try { 790 secret = decodeWorkSecret(work.secret) 791 } catch (err) { 792 const errMsg = errorMessage(err) 793 logger.logError( 794 `Failed to decode work secret for workId=${work.id}: ${errMsg}`, 795 ) 796 logEvent('tengu_bridge_work_secret_failed', {}) 797 // Can't ack (needs the JWT we failed to decode). stopWork uses OAuth, 798 // so it's callable here — prevents XAUTOCLAIM from re-delivering this 799 // poisoned item every reclaim_older_than_ms cycle. 800 completedWorkIds.add(work.id) 801 trackCleanup( 802 stopWorkWithRetry( 803 api, 804 environmentId, 805 work.id, 806 logger, 807 backoffConfig.stopWorkBaseDelayMs, 808 ), 809 ) 810 // Respect capacity throttle before retrying — without a sleep here, 811 // repeated decode failures at capacity would tight-loop at 812 // poll-request speed (work != null skips the !work sleep above). 
813 if (atCapacityBeforeSwitch) { 814 const cap = capacityWake.signal() 815 if (pollConfig.non_exclusive_heartbeat_interval_ms > 0) { 816 await heartbeatActiveWorkItems() 817 await sleep( 818 pollConfig.non_exclusive_heartbeat_interval_ms, 819 cap.signal, 820 ) 821 } else if (pollConfig.multisession_poll_interval_ms_at_capacity > 0) { 822 await sleep( 823 pollConfig.multisession_poll_interval_ms_at_capacity, 824 cap.signal, 825 ) 826 } 827 cap.cleanup() 828 } 829 continue 830 } 831 832 // Explicitly acknowledge after committing to handle the work — NOT 833 // before. The at-capacity guard inside case 'session' can break 834 // without spawning; acking there would permanently lose the work. 835 // Ack failures are non-fatal: server re-delivers, and existingHandle 836 // / completedWorkIds paths handle the dedup. 837 const ackWork = async (): Promise<void> => { 838 logForDebugging(`[bridge:work] Acknowledging workId=${work.id}`) 839 try { 840 await api.acknowledgeWork( 841 environmentId, 842 work.id, 843 secret.session_ingress_token, 844 ) 845 } catch (err) { 846 logForDebugging( 847 `[bridge:work] Acknowledge failed workId=${work.id}: ${errorMessage(err)}`, 848 ) 849 } 850 } 851 852 const workType: string = work.data.type 853 switch (work.data.type) { 854 case 'healthcheck': 855 await ackWork() 856 logForDebugging('[bridge:work] Healthcheck received') 857 logger.logVerbose('Healthcheck received') 858 break 859 case 'session': { 860 const sessionId = work.data.id 861 try { 862 validateBridgeId(sessionId, 'session_id') 863 } catch { 864 await ackWork() 865 logger.logError(`Invalid session_id received: ${sessionId}`) 866 break 867 } 868 869 // If the session is already running, deliver the fresh token so 870 // the child process can reconnect its WebSocket with the new 871 // session ingress token. This handles the case where the server 872 // re-dispatches work for an existing session after the WS drops. 
873 const existingHandle = activeSessions.get(sessionId) 874 if (existingHandle) { 875 existingHandle.updateAccessToken(secret.session_ingress_token) 876 sessionIngressTokens.set(sessionId, secret.session_ingress_token) 877 sessionWorkIds.set(sessionId, work.id) 878 // Re-schedule next refresh from the fresh JWT's expiry. onRefresh 879 // branches on v2Sessions so both v1 and v2 are safe here. 880 tokenRefresh?.schedule(sessionId, secret.session_ingress_token) 881 logForDebugging( 882 `[bridge:work] Updated access token for existing sessionId=${sessionId} workId=${work.id}`, 883 ) 884 await ackWork() 885 break 886 } 887 888 // At capacity — token refresh for existing sessions is handled 889 // above, but we cannot spawn new ones. The post-switch capacity 890 // sleep will throttle the loop; just break here. 891 if (activeSessions.size >= config.maxSessions) { 892 logForDebugging( 893 `[bridge:work] At capacity (${activeSessions.size}/${config.maxSessions}), cannot spawn new session for workId=${work.id}`, 894 ) 895 break 896 } 897 898 await ackWork() 899 const spawnStartTime = Date.now() 900 901 // CCR v2 path: register this bridge as the session worker, get the 902 // epoch, and point the child at /v1/code/sessions/{id}. The child 903 // already has the full v2 client (SSETransport + CCRClient) — same 904 // code path environment-manager launches in containers. 905 // 906 // v1 path: Session-Ingress WebSocket. Uses config.sessionIngressUrl 907 // (not secret.api_base_url, which may point to a remote proxy tunnel 908 // that doesn't know about locally-created sessions). 909 let sdkUrl: string 910 let useCcrV2 = false 911 let workerEpoch: number | undefined 912 // Server decides per-session via the work secret; env var is the 913 // ant-dev override (e.g. forcing v2 before the server flag is on). 
914 if ( 915 secret.use_code_sessions === true || 916 isEnvTruthy(process.env.CLAUDE_BRIDGE_USE_CCR_V2) 917 ) { 918 sdkUrl = buildCCRv2SdkUrl(config.apiBaseUrl, sessionId) 919 // Retry once on transient failure (network blip, 500) before 920 // permanently giving up and killing the session. 921 for (let attempt = 1; attempt <= 2; attempt++) { 922 try { 923 workerEpoch = await registerWorker( 924 sdkUrl, 925 secret.session_ingress_token, 926 ) 927 useCcrV2 = true 928 logForDebugging( 929 `[bridge:session] CCR v2: registered worker sessionId=${sessionId} epoch=${workerEpoch} attempt=${attempt}`, 930 ) 931 break 932 } catch (err) { 933 const errMsg = errorMessage(err) 934 if (attempt < 2) { 935 logForDebugging( 936 `[bridge:session] CCR v2: registerWorker attempt ${attempt} failed, retrying: ${errMsg}`, 937 ) 938 await sleep(2_000, loopSignal) 939 if (loopSignal.aborted) break 940 continue 941 } 942 logger.logError( 943 `CCR v2 worker registration failed for session ${sessionId}: ${errMsg}`, 944 ) 945 logError(new Error(`registerWorker failed: ${errMsg}`)) 946 completedWorkIds.add(work.id) 947 trackCleanup( 948 stopWorkWithRetry( 949 api, 950 environmentId, 951 work.id, 952 logger, 953 backoffConfig.stopWorkBaseDelayMs, 954 ), 955 ) 956 } 957 } 958 if (!useCcrV2) break 959 } else { 960 sdkUrl = buildSdkUrl(config.sessionIngressUrl, sessionId) 961 } 962 963 // In worktree mode, on-demand sessions get an isolated git worktree 964 // so concurrent sessions don't interfere with each other's file 965 // changes. The pre-created initial session (if any) runs in 966 // config.dir so the user's first session lands in the directory they 967 // invoked `rc` from — matching the old single-session UX. 968 // In same-dir and single-session modes, all sessions share config.dir. 
969 // Capture spawnMode before the await below — the `w` key handler 970 // mutates config.spawnMode directly, and createAgentWorktree can 971 // take 1-2s, so reading config.spawnMode after the await can 972 // produce contradictory analytics (spawn_mode:'same-dir', in_worktree:true). 973 const spawnModeAtDecision = config.spawnMode 974 let sessionDir = config.dir 975 let worktreeCreateMs = 0 976 if ( 977 spawnModeAtDecision === 'worktree' && 978 (initialSessionId === undefined || 979 !sameSessionId(sessionId, initialSessionId)) 980 ) { 981 const wtStart = Date.now() 982 try { 983 const wt = await createAgentWorktree( 984 `bridge-${safeFilenameId(sessionId)}`, 985 ) 986 worktreeCreateMs = Date.now() - wtStart 987 sessionWorktrees.set(sessionId, { 988 worktreePath: wt.worktreePath, 989 worktreeBranch: wt.worktreeBranch, 990 gitRoot: wt.gitRoot, 991 hookBased: wt.hookBased, 992 }) 993 sessionDir = wt.worktreePath 994 logForDebugging( 995 `[bridge:session] Created worktree for sessionId=${sessionId} at ${wt.worktreePath}`, 996 ) 997 } catch (err) { 998 const errMsg = errorMessage(err) 999 logger.logError( 1000 `Failed to create worktree for session ${sessionId}: ${errMsg}`, 1001 ) 1002 logError(new Error(`Worktree creation failed: ${errMsg}`)) 1003 completedWorkIds.add(work.id) 1004 trackCleanup( 1005 stopWorkWithRetry( 1006 api, 1007 environmentId, 1008 work.id, 1009 logger, 1010 backoffConfig.stopWorkBaseDelayMs, 1011 ), 1012 ) 1013 break 1014 } 1015 } 1016 1017 logForDebugging( 1018 `[bridge:session] Spawning sessionId=${sessionId} sdkUrl=${sdkUrl}`, 1019 ) 1020 1021 // compat-surface session_* form for logger/Sessions-API calls. 1022 // Work poll returns cse_* under v2 compat; convert before spawn so 1023 // the onFirstUserMessage callback can close over it. 
1024 const compatSessionId = toCompatSessionId(sessionId) 1025 1026 const spawnResult = safeSpawn( 1027 spawner, 1028 { 1029 sessionId, 1030 sdkUrl, 1031 accessToken: secret.session_ingress_token, 1032 useCcrV2, 1033 workerEpoch, 1034 onFirstUserMessage: text => { 1035 // Server-set titles (--name, web rename) win. fetchSessionTitle 1036 // runs concurrently; if it already populated titledSessions, 1037 // skip. If it hasn't resolved yet, the derived title sticks — 1038 // acceptable since the server had no title at spawn time. 1039 if (titledSessions.has(compatSessionId)) return 1040 titledSessions.add(compatSessionId) 1041 const title = deriveSessionTitle(text) 1042 logger.setSessionTitle(compatSessionId, title) 1043 logForDebugging( 1044 `[bridge:title] derived title for ${compatSessionId}: ${title}`, 1045 ) 1046 void import('./createSession.js') 1047 .then(({ updateBridgeSessionTitle }) => 1048 updateBridgeSessionTitle(compatSessionId, title, { 1049 baseUrl: config.apiBaseUrl, 1050 }), 1051 ) 1052 .catch(err => 1053 logForDebugging( 1054 `[bridge:title] failed to update title for ${compatSessionId}: ${err}`, 1055 { level: 'error' }, 1056 ), 1057 ) 1058 }, 1059 }, 1060 sessionDir, 1061 ) 1062 if (typeof spawnResult === 'string') { 1063 logger.logError( 1064 `Failed to spawn session ${sessionId}: ${spawnResult}`, 1065 ) 1066 // Clean up worktree if one was created for this session 1067 const wt = sessionWorktrees.get(sessionId) 1068 if (wt) { 1069 sessionWorktrees.delete(sessionId) 1070 trackCleanup( 1071 removeAgentWorktree( 1072 wt.worktreePath, 1073 wt.worktreeBranch, 1074 wt.gitRoot, 1075 wt.hookBased, 1076 ).catch((err: unknown) => 1077 logger.logVerbose( 1078 `Failed to remove worktree ${wt.worktreePath}: ${errorMessage(err)}`, 1079 ), 1080 ), 1081 ) 1082 } 1083 completedWorkIds.add(work.id) 1084 trackCleanup( 1085 stopWorkWithRetry( 1086 api, 1087 environmentId, 1088 work.id, 1089 logger, 1090 backoffConfig.stopWorkBaseDelayMs, 1091 ), 1092 ) 1093 break 
1094 } 1095 const handle = spawnResult 1096 1097 const spawnDurationMs = Date.now() - spawnStartTime 1098 logEvent('tengu_bridge_session_started', { 1099 active_sessions: activeSessions.size, 1100 spawn_mode: 1101 spawnModeAtDecision as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 1102 in_worktree: sessionWorktrees.has(sessionId), 1103 spawn_duration_ms: spawnDurationMs, 1104 worktree_create_ms: worktreeCreateMs, 1105 inProtectedNamespace: isInProtectedNamespace(), 1106 }) 1107 logForDiagnosticsNoPII('info', 'bridge_session_started', { 1108 spawn_mode: spawnModeAtDecision, 1109 in_worktree: sessionWorktrees.has(sessionId), 1110 spawn_duration_ms: spawnDurationMs, 1111 worktree_create_ms: worktreeCreateMs, 1112 }) 1113 1114 activeSessions.set(sessionId, handle) 1115 sessionWorkIds.set(sessionId, work.id) 1116 sessionIngressTokens.set(sessionId, secret.session_ingress_token) 1117 sessionCompatIds.set(sessionId, compatSessionId) 1118 1119 const startTime = Date.now() 1120 sessionStartTimes.set(sessionId, startTime) 1121 1122 // Use a generic prompt description since we no longer get startup_context 1123 logger.logSessionStart(sessionId, `Session ${sessionId}`) 1124 1125 // Compute the actual debug file path (mirrors sessionRunner.ts logic) 1126 const safeId = safeFilenameId(sessionId) 1127 let sessionDebugFile: string | undefined 1128 if (config.debugFile) { 1129 const ext = config.debugFile.lastIndexOf('.') 1130 if (ext > 0) { 1131 sessionDebugFile = `${config.debugFile.slice(0, ext)}-${safeId}${config.debugFile.slice(ext)}` 1132 } else { 1133 sessionDebugFile = `${config.debugFile}-${safeId}` 1134 } 1135 } else if (config.verbose || process.env.USER_TYPE === 'ant') { 1136 sessionDebugFile = join( 1137 tmpdir(), 1138 'claude', 1139 `bridge-session-${safeId}.log`, 1140 ) 1141 } 1142 1143 if (sessionDebugFile) { 1144 logger.logVerbose(`Debug log: ${sessionDebugFile}`) 1145 } 1146 1147 // Register in the sessions Map before starting status updates so the 
1148 // first render tick shows the correct count and bullet list in sync. 1149 logger.addSession( 1150 compatSessionId, 1151 getRemoteSessionUrl(compatSessionId, config.sessionIngressUrl), 1152 ) 1153 1154 // Start live status updates and transition to "Attached" state. 1155 startStatusUpdates() 1156 logger.setAttached(compatSessionId) 1157 1158 // One-shot title fetch. If the session already has a title (set via 1159 // --name, web rename, or /remote-control), display it and mark as 1160 // titled so the first-user-message fallback doesn't overwrite it. 1161 // Otherwise onFirstUserMessage derives one from the first prompt. 1162 void fetchSessionTitle(compatSessionId, config.apiBaseUrl) 1163 .then(title => { 1164 if (title && activeSessions.has(sessionId)) { 1165 titledSessions.add(compatSessionId) 1166 logger.setSessionTitle(compatSessionId, title) 1167 logForDebugging( 1168 `[bridge:title] server title for ${compatSessionId}: ${title}`, 1169 ) 1170 } 1171 }) 1172 .catch(err => 1173 logForDebugging( 1174 `[bridge:title] failed to fetch title for ${compatSessionId}: ${err}`, 1175 { level: 'error' }, 1176 ), 1177 ) 1178 1179 // Start per-session timeout watchdog 1180 const timeoutMs = 1181 config.sessionTimeoutMs ?? DEFAULT_SESSION_TIMEOUT_MS 1182 if (timeoutMs > 0) { 1183 const timer = setTimeout( 1184 onSessionTimeout, 1185 timeoutMs, 1186 sessionId, 1187 timeoutMs, 1188 logger, 1189 timedOutSessions, 1190 handle, 1191 ) 1192 sessionTimers.set(sessionId, timer) 1193 } 1194 1195 // Schedule proactive token refresh before the JWT expires. 1196 // onRefresh branches on v2Sessions: v1 delivers OAuth to the 1197 // child, v2 triggers server re-dispatch via reconnectSession. 
1198 if (useCcrV2) { 1199 v2Sessions.add(sessionId) 1200 } 1201 tokenRefresh?.schedule(sessionId, secret.session_ingress_token) 1202 1203 void handle.done.then(onSessionDone(sessionId, startTime, handle)) 1204 break 1205 } 1206 default: 1207 await ackWork() 1208 // Gracefully ignore unknown work types. The backend may send new 1209 // types before the bridge client is updated. 1210 logForDebugging( 1211 `[bridge:work] Unknown work type: ${workType}, skipping`, 1212 ) 1213 break 1214 } 1215 1216 // When at capacity, throttle the loop. The switch above still runs so 1217 // existing-session token refreshes are processed, but we sleep here 1218 // to avoid busy-looping. Include the capacity wake signal so the 1219 // sleep is interrupted immediately when a session completes. 1220 if (atCapacityBeforeSwitch) { 1221 const cap = capacityWake.signal() 1222 if (pollConfig.non_exclusive_heartbeat_interval_ms > 0) { 1223 await heartbeatActiveWorkItems() 1224 await sleep( 1225 pollConfig.non_exclusive_heartbeat_interval_ms, 1226 cap.signal, 1227 ) 1228 } else if (pollConfig.multisession_poll_interval_ms_at_capacity > 0) { 1229 await sleep( 1230 pollConfig.multisession_poll_interval_ms_at_capacity, 1231 cap.signal, 1232 ) 1233 } 1234 cap.cleanup() 1235 } 1236 } catch (err) { 1237 if (loopSignal.aborted) { 1238 break 1239 } 1240 1241 // Fatal errors (401/403) — no point retrying, auth won't fix itself 1242 if (err instanceof BridgeFatalError) { 1243 fatalExit = true 1244 // Server-enforced expiry gets a clean status message, not an error 1245 if (isExpiredErrorType(err.errorType)) { 1246 logger.logStatus(err.message) 1247 } else if (isSuppressible403(err)) { 1248 // Cosmetic 403 errors (e.g., external_poll_sessions scope, 1249 // environments:manage permission) — don't show to user 1250 logForDebugging(`[bridge:work] Suppressed 403 error: ${err.message}`) 1251 } else { 1252 logger.logError(err.message) 1253 logError(err) 1254 } 1255 logEvent('tengu_bridge_fatal_error', { 1256 
status: err.status, 1257 error_type: 1258 err.errorType as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 1259 }) 1260 logForDiagnosticsNoPII( 1261 isExpiredErrorType(err.errorType) ? 'info' : 'error', 1262 'bridge_fatal_error', 1263 { status: err.status, error_type: err.errorType }, 1264 ) 1265 break 1266 } 1267 1268 const errMsg = describeAxiosError(err) 1269 1270 if (isConnectionError(err) || isServerError(err)) { 1271 const now = Date.now() 1272 1273 // Detect system sleep/wake: if the gap since the last poll error 1274 // greatly exceeds the expected backoff, the machine likely slept. 1275 // Reset error tracking so the bridge retries with a fresh budget. 1276 if ( 1277 lastPollErrorTime !== null && 1278 now - lastPollErrorTime > pollSleepDetectionThresholdMs(backoffConfig) 1279 ) { 1280 logForDebugging( 1281 `[bridge:work] Detected system sleep (${Math.round((now - lastPollErrorTime) / 1000)}s gap), resetting error budget`, 1282 ) 1283 logForDiagnosticsNoPII('info', 'bridge_poll_sleep_detected', { 1284 gapMs: now - lastPollErrorTime, 1285 }) 1286 connErrorStart = null 1287 connBackoff = 0 1288 generalErrorStart = null 1289 generalBackoff = 0 1290 } 1291 lastPollErrorTime = now 1292 1293 if (!connErrorStart) { 1294 connErrorStart = now 1295 } 1296 const elapsed = now - connErrorStart 1297 if (elapsed >= backoffConfig.connGiveUpMs) { 1298 logger.logError( 1299 `Server unreachable for ${Math.round(elapsed / 60_000)} minutes, giving up.`, 1300 ) 1301 logEvent('tengu_bridge_poll_give_up', { 1302 error_type: 1303 'connection' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 1304 elapsed_ms: elapsed, 1305 }) 1306 logForDiagnosticsNoPII('error', 'bridge_poll_give_up', { 1307 error_type: 'connection', 1308 elapsed_ms: elapsed, 1309 }) 1310 fatalExit = true 1311 break 1312 } 1313 1314 // Reset the other track when switching error types 1315 generalErrorStart = null 1316 generalBackoff = 0 1317 1318 connBackoff = connBackoff 1319 ? 
Math.min(connBackoff * 2, backoffConfig.connCapMs) 1320 : backoffConfig.connInitialMs 1321 const delay = addJitter(connBackoff) 1322 logger.logVerbose( 1323 `Connection error, retrying in ${formatDelay(delay)} (${Math.round(elapsed / 1000)}s elapsed): ${errMsg}`, 1324 ) 1325 logger.updateReconnectingStatus( 1326 formatDelay(delay), 1327 formatDuration(elapsed), 1328 ) 1329 // The poll_due heartbeat-loop exit leaves a healthy lease exposed to 1330 // this backoff path. Heartbeat before each sleep so /poll outages 1331 // (the VerifyEnvironmentSecretAuth DB path heartbeat was introduced 1332 // to avoid) don't kill the 300s lease TTL. No-op when activeSessions 1333 // is empty or heartbeat is disabled. 1334 if (getPollIntervalConfig().non_exclusive_heartbeat_interval_ms > 0) { 1335 await heartbeatActiveWorkItems() 1336 } 1337 await sleep(delay, loopSignal) 1338 } else { 1339 const now = Date.now() 1340 1341 // Sleep detection for general errors (same logic as connection errors) 1342 if ( 1343 lastPollErrorTime !== null && 1344 now - lastPollErrorTime > pollSleepDetectionThresholdMs(backoffConfig) 1345 ) { 1346 logForDebugging( 1347 `[bridge:work] Detected system sleep (${Math.round((now - lastPollErrorTime) / 1000)}s gap), resetting error budget`, 1348 ) 1349 logForDiagnosticsNoPII('info', 'bridge_poll_sleep_detected', { 1350 gapMs: now - lastPollErrorTime, 1351 }) 1352 connErrorStart = null 1353 connBackoff = 0 1354 generalErrorStart = null 1355 generalBackoff = 0 1356 } 1357 lastPollErrorTime = now 1358 1359 if (!generalErrorStart) { 1360 generalErrorStart = now 1361 } 1362 const elapsed = now - generalErrorStart 1363 if (elapsed >= backoffConfig.generalGiveUpMs) { 1364 logger.logError( 1365 `Persistent errors for ${Math.round(elapsed / 60_000)} minutes, giving up.`, 1366 ) 1367 logEvent('tengu_bridge_poll_give_up', { 1368 error_type: 1369 'general' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 1370 elapsed_ms: elapsed, 1371 }) 1372 
logForDiagnosticsNoPII('error', 'bridge_poll_give_up', { 1373 error_type: 'general', 1374 elapsed_ms: elapsed, 1375 }) 1376 fatalExit = true 1377 break 1378 } 1379 1380 // Reset the other track when switching error types 1381 connErrorStart = null 1382 connBackoff = 0 1383 1384 generalBackoff = generalBackoff 1385 ? Math.min(generalBackoff * 2, backoffConfig.generalCapMs) 1386 : backoffConfig.generalInitialMs 1387 const delay = addJitter(generalBackoff) 1388 logger.logVerbose( 1389 `Poll failed, retrying in ${formatDelay(delay)} (${Math.round(elapsed / 1000)}s elapsed): ${errMsg}`, 1390 ) 1391 logger.updateReconnectingStatus( 1392 formatDelay(delay), 1393 formatDuration(elapsed), 1394 ) 1395 if (getPollIntervalConfig().non_exclusive_heartbeat_interval_ms > 0) { 1396 await heartbeatActiveWorkItems() 1397 } 1398 await sleep(delay, loopSignal) 1399 } 1400 } 1401 } 1402 1403 // Clean up 1404 stopStatusUpdates() 1405 logger.clearStatus() 1406 1407 const loopDurationMs = Date.now() - loopStartTime 1408 logEvent('tengu_bridge_shutdown', { 1409 active_sessions: activeSessions.size, 1410 loop_duration_ms: loopDurationMs, 1411 }) 1412 logForDiagnosticsNoPII('info', 'bridge_shutdown', { 1413 active_sessions: activeSessions.size, 1414 loop_duration_ms: loopDurationMs, 1415 }) 1416 1417 // Graceful shutdown: kill active sessions, report them as interrupted, 1418 // archive sessions, then deregister the environment so the web UI shows 1419 // the bridge as offline. 1420 1421 // Collect all session IDs to archive on exit. This includes: 1422 // 1. Active sessions (snapshot before killing — onSessionDone clears maps) 1423 // 2. The initial auto-created session (may never have had work dispatched) 1424 // api.archiveSession is idempotent (409 if already archived), so 1425 // double-archiving is safe. 
1426 const sessionsToArchive = new Set(activeSessions.keys()) 1427 if (initialSessionId) { 1428 sessionsToArchive.add(initialSessionId) 1429 } 1430 // Snapshot before killing — onSessionDone clears sessionCompatIds. 1431 const compatIdSnapshot = new Map(sessionCompatIds) 1432 1433 if (activeSessions.size > 0) { 1434 logForDebugging( 1435 `[bridge:shutdown] Shutting down ${activeSessions.size} active session(s)`, 1436 ) 1437 logger.logStatus( 1438 `Shutting down ${activeSessions.size} active session(s)\u2026`, 1439 ) 1440 1441 // Snapshot work IDs before killing — onSessionDone clears the maps when 1442 // each child exits, so we need a copy for the stopWork calls below. 1443 const shutdownWorkIds = new Map(sessionWorkIds) 1444 1445 for (const [sessionId, handle] of activeSessions.entries()) { 1446 logForDebugging( 1447 `[bridge:shutdown] Sending SIGTERM to sessionId=${sessionId}`, 1448 ) 1449 handle.kill() 1450 } 1451 1452 const timeout = new AbortController() 1453 await Promise.race([ 1454 Promise.allSettled([...activeSessions.values()].map(h => h.done)), 1455 sleep(backoffConfig.shutdownGraceMs ?? 30_000, timeout.signal), 1456 ]) 1457 timeout.abort() 1458 1459 // SIGKILL any processes that didn't respond to SIGTERM within the grace window 1460 for (const [sid, handle] of activeSessions.entries()) { 1461 logForDebugging(`[bridge:shutdown] Force-killing stuck sessionId=${sid}`) 1462 handle.forceKill() 1463 } 1464 1465 // Clear any remaining session timeout and refresh timers 1466 for (const timer of sessionTimers.values()) { 1467 clearTimeout(timer) 1468 } 1469 sessionTimers.clear() 1470 tokenRefresh?.cancelAll() 1471 1472 // Clean up any remaining worktrees from active sessions. 1473 // Snapshot and clear the map first so onSessionDone (which may fire 1474 // during the await below when handle.done resolves) won't try to 1475 // remove the same worktrees again. 
1476 if (sessionWorktrees.size > 0) { 1477 const remainingWorktrees = [...sessionWorktrees.values()] 1478 sessionWorktrees.clear() 1479 logForDebugging( 1480 `[bridge:shutdown] Cleaning up ${remainingWorktrees.length} worktree(s)`, 1481 ) 1482 await Promise.allSettled( 1483 remainingWorktrees.map(wt => 1484 removeAgentWorktree( 1485 wt.worktreePath, 1486 wt.worktreeBranch, 1487 wt.gitRoot, 1488 wt.hookBased, 1489 ), 1490 ), 1491 ) 1492 } 1493 1494 // Stop all active work items so the server knows they're done 1495 await Promise.allSettled( 1496 [...shutdownWorkIds.entries()].map(([sessionId, workId]) => { 1497 return api 1498 .stopWork(environmentId, workId, true) 1499 .catch(err => 1500 logger.logVerbose( 1501 `Failed to stop work ${workId} for session ${sessionId}: ${errorMessage(err)}`, 1502 ), 1503 ) 1504 }), 1505 ) 1506 } 1507 1508 // Ensure all in-flight cleanup (stopWork, worktree removal) from 1509 // onSessionDone completes before deregistering — otherwise 1510 // process.exit() can kill them mid-flight. 1511 if (pendingCleanups.size > 0) { 1512 await Promise.allSettled([...pendingCleanups]) 1513 } 1514 1515 // In single-session mode with a known session, leave the session and 1516 // environment alive so `claude remote-control --session-id=<id>` can resume. 1517 // The backend GCs stale environments via a 4h TTL (BRIDGE_LAST_POLL_TTL). 1518 // Archiving the session or deregistering the environment would make the 1519 // printed resume command a lie — deregister deletes Firestore + Redis stream. 1520 // Skip when the loop exited fatally (env expired, auth failed, give-up) — 1521 // resume is impossible in those cases and the message would contradict the 1522 // error already printed. 1523 // feature('KAIROS') gate: --session-id is ant-only; without the gate, 1524 // revert to the pre-PR behavior (archive + deregister on every shutdown). 
1525 if ( 1526 feature('KAIROS') && 1527 config.spawnMode === 'single-session' && 1528 initialSessionId && 1529 !fatalExit 1530 ) { 1531 logger.logStatus( 1532 `Resume this session by running \`claude remote-control --continue\``, 1533 ) 1534 logForDebugging( 1535 `[bridge:shutdown] Skipping archive+deregister to allow resume of session ${initialSessionId}`, 1536 ) 1537 return 1538 } 1539 1540 // Archive all known sessions so they don't linger as idle/running on the 1541 // server after the bridge goes offline. 1542 if (sessionsToArchive.size > 0) { 1543 logForDebugging( 1544 `[bridge:shutdown] Archiving ${sessionsToArchive.size} session(s)`, 1545 ) 1546 await Promise.allSettled( 1547 [...sessionsToArchive].map(sessionId => 1548 api 1549 .archiveSession( 1550 compatIdSnapshot.get(sessionId) ?? toCompatSessionId(sessionId), 1551 ) 1552 .catch(err => 1553 logger.logVerbose( 1554 `Failed to archive session ${sessionId}: ${errorMessage(err)}`, 1555 ), 1556 ), 1557 ), 1558 ) 1559 } 1560 1561 // Deregister the environment so the web UI shows the bridge as offline 1562 // and the Redis stream is cleaned up. 1563 try { 1564 await api.deregisterEnvironment(environmentId) 1565 logForDebugging( 1566 `[bridge:shutdown] Environment deregistered, bridge offline`, 1567 ) 1568 logger.logVerbose('Environment deregistered.') 1569 } catch (err) { 1570 logger.logVerbose(`Failed to deregister environment: ${errorMessage(err)}`) 1571 } 1572 1573 // Clear the crash-recovery pointer — the env is gone, pointer would be 1574 // stale. The early return above (resumable SIGINT shutdown) skips this, 1575 // leaving the pointer as a backup for the printed --session-id hint. 
1576 const { clearBridgePointer } = await import('./bridgePointer.js') 1577 await clearBridgePointer(config.dir) 1578 1579 logger.logVerbose('Environment offline.') 1580} 1581 1582const CONNECTION_ERROR_CODES = new Set([ 1583 'ECONNREFUSED', 1584 'ECONNRESET', 1585 'ETIMEDOUT', 1586 'ENETUNREACH', 1587 'EHOSTUNREACH', 1588]) 1589 1590export function isConnectionError(err: unknown): boolean { 1591 if ( 1592 err && 1593 typeof err === 'object' && 1594 'code' in err && 1595 typeof err.code === 'string' && 1596 CONNECTION_ERROR_CODES.has(err.code) 1597 ) { 1598 return true 1599 } 1600 return false 1601} 1602 1603/** Detect HTTP 5xx errors from axios (code: 'ERR_BAD_RESPONSE'). */ 1604export function isServerError(err: unknown): boolean { 1605 return ( 1606 !!err && 1607 typeof err === 'object' && 1608 'code' in err && 1609 typeof err.code === 'string' && 1610 err.code === 'ERR_BAD_RESPONSE' 1611 ) 1612} 1613 1614/** Add ±25% jitter to a delay value. */ 1615function addJitter(ms: number): number { 1616 return Math.max(0, ms + ms * 0.25 * (2 * Math.random() - 1)) 1617} 1618 1619function formatDelay(ms: number): string { 1620 return ms >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${Math.round(ms)}ms` 1621} 1622 1623/** 1624 * Retry stopWork with exponential backoff (3 attempts, 1s/2s/4s). 1625 * Ensures the server learns the work item ended, preventing server-side zombies. 
1626 */ 1627async function stopWorkWithRetry( 1628 api: BridgeApiClient, 1629 environmentId: string, 1630 workId: string, 1631 logger: BridgeLogger, 1632 baseDelayMs = 1000, 1633): Promise<void> { 1634 const MAX_ATTEMPTS = 3 1635 1636 for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) { 1637 try { 1638 await api.stopWork(environmentId, workId, false) 1639 logForDebugging( 1640 `[bridge:work] stopWork succeeded for workId=${workId} on attempt ${attempt}/${MAX_ATTEMPTS}`, 1641 ) 1642 return 1643 } catch (err) { 1644 // Auth/permission errors won't be fixed by retrying 1645 if (err instanceof BridgeFatalError) { 1646 if (isSuppressible403(err)) { 1647 logForDebugging( 1648 `[bridge:work] Suppressed stopWork 403 for ${workId}: ${err.message}`, 1649 ) 1650 } else { 1651 logger.logError(`Failed to stop work ${workId}: ${err.message}`) 1652 } 1653 logForDiagnosticsNoPII('error', 'bridge_stop_work_failed', { 1654 attempts: attempt, 1655 fatal: true, 1656 }) 1657 return 1658 } 1659 const errMsg = errorMessage(err) 1660 if (attempt < MAX_ATTEMPTS) { 1661 const delay = addJitter(baseDelayMs * Math.pow(2, attempt - 1)) 1662 logger.logVerbose( 1663 `Failed to stop work ${workId} (attempt ${attempt}/${MAX_ATTEMPTS}), retrying in ${formatDelay(delay)}: ${errMsg}`, 1664 ) 1665 await sleep(delay) 1666 } else { 1667 logger.logError( 1668 `Failed to stop work ${workId} after ${MAX_ATTEMPTS} attempts: ${errMsg}`, 1669 ) 1670 logForDiagnosticsNoPII('error', 'bridge_stop_work_failed', { 1671 attempts: MAX_ATTEMPTS, 1672 }) 1673 } 1674 } 1675 } 1676} 1677 1678function onSessionTimeout( 1679 sessionId: string, 1680 timeoutMs: number, 1681 logger: BridgeLogger, 1682 timedOutSessions: Set<string>, 1683 handle: SessionHandle, 1684): void { 1685 logForDebugging( 1686 `[bridge:session] sessionId=${sessionId} timed out after ${formatDuration(timeoutMs)}`, 1687 ) 1688 logEvent('tengu_bridge_session_timeout', { 1689 timeout_ms: timeoutMs, 1690 }) 1691 logger.logSessionFailed( 1692 
sessionId, 1693 `Session timed out after ${formatDuration(timeoutMs)}`, 1694 ) 1695 timedOutSessions.add(sessionId) 1696 handle.kill() 1697} 1698 1699export type ParsedArgs = { 1700 verbose: boolean 1701 sandbox: boolean 1702 debugFile?: string 1703 sessionTimeoutMs?: number 1704 permissionMode?: string 1705 name?: string 1706 /** Value passed to --spawn (if any); undefined if no --spawn flag was given. */ 1707 spawnMode: SpawnMode | undefined 1708 /** Value passed to --capacity (if any); undefined if no --capacity flag was given. */ 1709 capacity: number | undefined 1710 /** --[no-]create-session-in-dir override; undefined = use default (on). */ 1711 createSessionInDir: boolean | undefined 1712 /** Resume an existing session instead of creating a new one. */ 1713 sessionId?: string 1714 /** Resume the last session in this directory (reads bridge-pointer.json). */ 1715 continueSession: boolean 1716 help: boolean 1717 error?: string 1718} 1719 1720const SPAWN_FLAG_VALUES = ['session', 'same-dir', 'worktree'] as const 1721 1722function parseSpawnValue(raw: string | undefined): SpawnMode | string { 1723 if (raw === 'session') return 'single-session' 1724 if (raw === 'same-dir') return 'same-dir' 1725 if (raw === 'worktree') return 'worktree' 1726 return `--spawn requires one of: ${SPAWN_FLAG_VALUES.join(', ')} (got: ${raw ?? '<missing>'})` 1727} 1728 1729function parseCapacityValue(raw: string | undefined): number | string { 1730 const n = raw === undefined ? NaN : parseInt(raw, 10) 1731 if (isNaN(n) || n < 1) { 1732 return `--capacity requires a positive integer (got: ${raw ?? 
'<missing>'})` 1733 } 1734 return n 1735} 1736

/**
 * Parse the `claude remote-control` argument list into a ParsedArgs record.
 *
 * Value-taking flags accept both `--flag value` and `--flag=value`. Parsing
 * stops at the first invalid token and reports it through the `error` field,
 * together with whatever had been parsed up to that point. Only flag syntax
 * and cross-flag compatibility are checked here; the multi-session gate is
 * checked by bridgeMain.
 *
 * @param args - Raw argument tokens.
 * @returns The parsed flags; `error` is present only when parsing failed.
 */
export function parseArgs(args: string[]): ParsedArgs {
  let verbose = false
  let sandbox = false
  let debugFile: string | undefined
  let sessionTimeoutMs: number | undefined
  let permissionMode: string | undefined
  let name: string | undefined
  let help = false
  let spawnMode: SpawnMode | undefined
  let capacity: number | undefined
  let createSessionInDir: boolean | undefined
  let sessionId: string | undefined
  let continueSession = false

  for (let i = 0; i < args.length; i++) {
    const arg = args[i]!
    if (arg === '--help' || arg === '-h') {
      help = true
    } else if (arg === '--verbose' || arg === '-v') {
      verbose = true
    } else if (arg === '--sandbox') {
      sandbox = true
    } else if (arg === '--no-sandbox') {
      sandbox = false
    } else if (arg === '--debug-file') {
      const value = args[++i]
      if (value === undefined) return missingValue(arg)
      debugFile = resolve(value)
    } else if (arg.startsWith('--debug-file=')) {
      debugFile = resolve(arg.slice('--debug-file='.length))
    } else if (
      arg === '--session-timeout' ||
      arg.startsWith('--session-timeout=')
    ) {
      const raw = arg.startsWith('--session-timeout=')
        ? arg.slice('--session-timeout='.length)
        : args[++i]
      if (raw === undefined) return missingValue('--session-timeout')
      // parseInt yields NaN for non-numeric input; without this check
      // NaN * 1000 would be stored as the timeout and passed on silently.
      const seconds = parseInt(raw, 10)
      if (Number.isNaN(seconds) || seconds < 0) {
        return makeError(
          `--session-timeout requires a non-negative number of seconds (got: ${raw})`,
        )
      }
      sessionTimeoutMs = seconds * 1000
    } else if (arg === '--permission-mode') {
      const value = args[++i]
      if (value === undefined) return missingValue(arg)
      permissionMode = value
    } else if (arg.startsWith('--permission-mode=')) {
      permissionMode = arg.slice('--permission-mode='.length)
    } else if (arg === '--name') {
      const value = args[++i]
      if (value === undefined) return missingValue(arg)
      name = value
    } else if (arg.startsWith('--name=')) {
      name = arg.slice('--name='.length)
    } else if (feature('KAIROS') && arg === '--session-id') {
      // Covers both a missing token and an explicitly empty one.
      sessionId = args[++i]
      if (!sessionId) {
        return makeError('--session-id requires a value')
      }
    } else if (feature('KAIROS') && arg.startsWith('--session-id=')) {
      sessionId = arg.slice('--session-id='.length)
      if (!sessionId) {
        return makeError('--session-id requires a value')
      }
    } else if (feature('KAIROS') && (arg === '--continue' || arg === '-c')) {
      continueSession = true
    } else if (arg === '--spawn' || arg.startsWith('--spawn=')) {
      if (spawnMode !== undefined) {
        return makeError('--spawn may only be specified once')
      }
      const raw = arg.startsWith('--spawn=')
        ? arg.slice('--spawn='.length)
        : args[++i]
      // parseSpawnValue returns either a spawn mode or an error message.
      const v = parseSpawnValue(raw)
      if (v === 'single-session' || v === 'same-dir' || v === 'worktree') {
        spawnMode = v
      } else {
        return makeError(v)
      }
    } else if (arg === '--capacity' || arg.startsWith('--capacity=')) {
      if (capacity !== undefined) {
        return makeError('--capacity may only be specified once')
      }
      const raw = arg.startsWith('--capacity=')
        ? arg.slice('--capacity='.length)
        : args[++i]
      // parseCapacityValue returns either a number or an error message.
      const v = parseCapacityValue(raw)
      if (typeof v === 'number') capacity = v
      else return makeError(v)
    } else if (arg === '--create-session-in-dir') {
      createSessionInDir = true
    } else if (arg === '--no-create-session-in-dir') {
      createSessionInDir = false
    } else {
      return makeError(
        `Unknown argument: ${arg}\nRun 'claude remote-control --help' for usage.`,
      )
    }
  }

  // Note: gate check for --spawn/--capacity/--create-session-in-dir is in bridgeMain
  // (gate-aware error). Flag cross-validation happens here.

  // --capacity only makes sense for multi-session modes.
  if (spawnMode === 'single-session' && capacity !== undefined) {
    return makeError(
      `--capacity cannot be used with --spawn=session (single-session mode has fixed capacity 1).`,
    )
  }

  // --session-id / --continue resume a specific session on its original
  // environment; incompatible with spawn-related flags (which configure
  // fresh session creation), and mutually exclusive with each other.
  if (
    (sessionId || continueSession) &&
    (spawnMode !== undefined ||
      capacity !== undefined ||
      createSessionInDir !== undefined)
  ) {
    return makeError(
      `--session-id and --continue cannot be used with --spawn, --capacity, or --create-session-in-dir.`,
    )
  }
  if (sessionId && continueSession) {
    return makeError(`--session-id and --continue cannot be used together.`)
  }

  return snapshot()

  /** Current flag values, without an `error` field. */
  function snapshot(): ParsedArgs {
    return {
      verbose,
      sandbox,
      debugFile,
      sessionTimeoutMs,
      permissionMode,
      name,
      spawnMode,
      capacity,
      createSessionInDir,
      sessionId,
      continueSession,
      help,
    }
  }

  /** Current flag values plus the given error message. */
  function makeError(error: string): ParsedArgs {
    return { ...snapshot(), error }
  }

  /** Error result for a value-taking flag that was the last token. */
  function missingValue(flag: string): ParsedArgs {
    return makeError(`${flag} requires a value`)
  }
}

/**
 * Print the `claude remote-control` usage text via console.log.
 *
 * The multi-session options, description and note are included only when
 * isMultiSessionSpawnEnabled() resolves truthy; the resume flags are included
 * only when feature('KAIROS') is on.
 */
async function printHelp(): Promise<void> {
  // Use EXTERNAL_PERMISSION_MODES for help text — internal modes (bubble)
  // are ant-only and auto is feature-gated; they're still accepted by validation.
  const { EXTERNAL_PERMISSION_MODES } = await import('../types/permissions.js')
  const modes = EXTERNAL_PERMISSION_MODES.join(', ')
  const showServer = await isMultiSessionSpawnEnabled()
  const serverOptions = showServer
    ? `  --spawn <mode>                  Spawn mode: same-dir, worktree, session
                                  (default: same-dir)
  --capacity <N>                  Max concurrent sessions in worktree or
                                  same-dir mode (default: ${SPAWN_SESSIONS_DEFAULT})
  --[no-]create-session-in-dir    Pre-create a session in the current
                                  directory; in worktree mode this session
                                  stays in cwd while on-demand sessions get
                                  isolated worktrees (default: on)
`
    : ''
  const serverDescription = showServer
    ? `
  Remote Control runs as a persistent server that accepts multiple concurrent
  sessions in the current directory. One session is pre-created on start so
  you have somewhere to type immediately. Use --spawn=worktree to isolate
  each on-demand session in its own git worktree, or --spawn=session for
  the classic single-session mode (exits when that session ends). Press 'w'
  during runtime to toggle between same-dir and worktree.
`
    : ''
  const serverNote = showServer
    ? `  - Worktree mode requires a git repository or WorktreeCreate/WorktreeRemove hooks
`
    : ''
  const help = `
Remote Control - Connect your local environment to claude.ai/code

USAGE
  claude remote-control [options]
OPTIONS
  --name <name>                   Name for the session (shown in claude.ai/code)
${
  feature('KAIROS')
    ? `  -c, --continue                  Resume the last session in this directory
  --session-id <id>               Resume a specific session by ID (cannot be
                                  used with spawn flags or --continue)
`
    : ''
}  --permission-mode <mode>        Permission mode for spawned sessions
                                  (${modes})
  --debug-file <path>             Write debug logs to file
  -v, --verbose                   Enable verbose output
  -h, --help                      Show this help
${serverOptions}
DESCRIPTION
  Remote Control allows you to control sessions on your local device from
  claude.ai/code (https://claude.ai/code). Run this command in the
  directory you want to work in, then connect from the Claude app or web.
${serverDescription}
NOTES
  - You must be logged in with a Claude account that has a subscription
  - Run \`claude\` first in the directory to accept the workspace trust dialog
${serverNote}`
  // biome-ignore lint/suspicious/noConsole: intentional help output
  console.log(help)
}

/** Width limit passed to truncateToWidth when deriving a session title. */
const TITLE_MAX_LEN = 80

/**
 * Derive a session title from a user message.
 *
 * Every run of whitespace (including newlines and tabs) is collapsed to a
 * single space and the ends are trimmed, so a multi-line message becomes one
 * line rather than being cut at its first line. The result is then truncated
 * with truncateToWidth(…, TITLE_MAX_LEN).
 */
function deriveSessionTitle(text: string): string {
  // Collapse whitespace — newlines/tabs would break the single-line status display.
  const flat = text.replace(/\s+/g, ' ').trim()
  return truncateToWidth(flat, TITLE_MAX_LEN)
}

/**
 * One-shot fetch of a session's title via GET /v1/sessions/{id}.
 *
 * Uses `getBridgeSession` from createSession.ts (ccr-byoc headers + org UUID)
 * rather than the environments-level bridgeApi client, whose headers make the
 * Sessions API return 404. Returns undefined if the session has no title yet
 * (an empty title counts as none) or the fetch fails — the caller falls back
 * to deriving a title from the first user message.
 */
async function fetchSessionTitle(
  compatSessionId: string,
  baseUrl: string,
): Promise<string | undefined> {
  const { getBridgeSession } = await import('./createSession.js')
  const session = await getBridgeSession(compatSessionId, { baseUrl })
  return session?.title || undefined
}

1979 1980export async function bridgeMain(args: string[]): Promise<void> { 1981 const parsed = parseArgs(args) 1982 1983 if (parsed.help) { 1984 await printHelp() 1985 return 1986 } 1987 if (parsed.error) { 1988 // biome-ignore lint/suspicious/noConsole: intentional error output 1989 console.error(`Error: ${parsed.error}`) 1990 // eslint-disable-next-line custom-rules/no-process-exit 1991 process.exit(1) 1992 } 1993 1994 const { 1995 verbose, 1996 sandbox, 1997 debugFile, 1998 sessionTimeoutMs, 1999 permissionMode, 2000 name, 2001 spawnMode: parsedSpawnMode, 2002 capacity: parsedCapacity, 2003 createSessionInDir: parsedCreateSessionInDir, 2004 sessionId: parsedSessionId, 2005 continueSession, 2006 } = parsed 2007 // Mutable so --continue can set it from the pointer file. The #20460 2008 // resume flow below then treats it the same as an explicit --session-id. 2009 let resumeSessionId = parsedSessionId 2010 // When --continue found a pointer, this is the directory it came from 2011 // (may be a worktree sibling, not `dir`). On resume-flow deterministic 2012 // failure, clear THIS file so --continue doesn't keep hitting the same 2013 // dead session. Undefined for explicit --session-id (leaves pointer alone). 2014 let resumePointerDir: string | undefined 2015 2016 const usedMultiSessionFeature = 2017 parsedSpawnMode !== undefined || 2018 parsedCapacity !== undefined || 2019 parsedCreateSessionInDir !== undefined 2020 2021 // Validate permission mode early so the user gets an error before 2022 // the bridge starts polling for work.
2023 if (permissionMode !== undefined) { 2024 const { PERMISSION_MODES } = await import('../types/permissions.js') 2025 const valid: readonly string[] = PERMISSION_MODES 2026 if (!valid.includes(permissionMode)) { 2027 // biome-ignore lint/suspicious/noConsole: intentional error output 2028 console.error( 2029 `Error: Invalid permission mode '${permissionMode}'. Valid modes: ${valid.join(', ')}`, 2030 ) 2031 // eslint-disable-next-line custom-rules/no-process-exit 2032 process.exit(1) 2033 } 2034 } 2035 2036 const dir = resolve('.') 2037 2038 // The bridge fast-path bypasses init.ts, so we must enable config reading 2039 // before any code that transitively calls getGlobalConfig() 2040 const { enableConfigs, checkHasTrustDialogAccepted } = await import( 2041 '../utils/config.js' 2042 ) 2043 enableConfigs() 2044 2045 // Initialize analytics and error reporting sinks. The bridge bypasses the 2046 // setup() init flow, so we call initSinks() directly to attach sinks here. 2047 const { initSinks } = await import('../utils/sinks.js') 2048 initSinks() 2049 2050 // Gate-aware validation: --spawn / --capacity / --create-session-in-dir require 2051 // the multi-session gate. parseArgs has already validated flag combinations; 2052 // here we only check the gate since that requires an async GrowthBook call. 2053 // Runs after enableConfigs() (GrowthBook cache reads global config) and after 2054 // initSinks() so the denial event can be enqueued. 2055 const multiSessionEnabled = await isMultiSessionSpawnEnabled() 2056 if (usedMultiSessionFeature && !multiSessionEnabled) { 2057 await logEventAsync('tengu_bridge_multi_session_denied', { 2058 used_spawn: parsedSpawnMode !== undefined, 2059 used_capacity: parsedCapacity !== undefined, 2060 used_create_session_in_dir: parsedCreateSessionInDir !== undefined, 2061 }) 2062 // logEventAsync only enqueues — process.exit() discards buffered events. 2063 // Flush explicitly, capped at 500ms to match gracefulShutdown.ts. 
2064 // (sleep() doesn't unref its timer, but process.exit() follows immediately 2065 // so the ref'd timer can't delay shutdown.) 2066 await Promise.race([ 2067 Promise.all([shutdown1PEventLogging(), shutdownDatadog()]), 2068 sleep(500, undefined, { unref: true }), 2069 ]).catch(() => {}) 2070 // biome-ignore lint/suspicious/noConsole: intentional error output 2071 console.error( 2072 'Error: Multi-session Remote Control is not enabled for your account yet.', 2073 ) 2074 // eslint-disable-next-line custom-rules/no-process-exit 2075 process.exit(1) 2076 } 2077 2078 // Set the bootstrap CWD so that trust checks, project config lookups, and 2079 // git utilities (getBranch, getRemoteUrl) resolve against the correct path. 2080 const { setOriginalCwd, setCwdState } = await import('../bootstrap/state.js') 2081 setOriginalCwd(dir) 2082 setCwdState(dir) 2083 2084 // The bridge bypasses main.tsx (which renders the interactive TrustDialog via showSetupScreens), 2085 // so we must verify trust was previously established by a normal `claude` session. 2086 if (!checkHasTrustDialogAccepted()) { 2087 // biome-ignore lint/suspicious/noConsole:: intentional console output 2088 console.error( 2089 `Error: Workspace not trusted. 
Please run \`claude\` in ${dir} first to review and accept the workspace trust dialog.`, 2090 ) 2091 // eslint-disable-next-line custom-rules/no-process-exit 2092 process.exit(1) 2093 } 2094 2095 // Resolve auth 2096 const { clearOAuthTokenCache, checkAndRefreshOAuthTokenIfNeeded } = 2097 await import('../utils/auth.js') 2098 const { getBridgeAccessToken, getBridgeBaseUrl } = await import( 2099 './bridgeConfig.js' 2100 ) 2101 2102 const bridgeToken = getBridgeAccessToken() 2103 if (!bridgeToken) { 2104 // biome-ignore lint/suspicious/noConsole:: intentional console output 2105 console.error(BRIDGE_LOGIN_ERROR) 2106 // eslint-disable-next-line custom-rules/no-process-exit 2107 process.exit(1) 2108 } 2109 2110 // First-time remote dialog — explain what bridge does and get consent 2111 const { 2112 getGlobalConfig, 2113 saveGlobalConfig, 2114 getCurrentProjectConfig, 2115 saveCurrentProjectConfig, 2116 } = await import('../utils/config.js') 2117 if (!getGlobalConfig().remoteDialogSeen) { 2118 const readline = await import('readline') 2119 const rl = readline.createInterface({ 2120 input: process.stdin, 2121 output: process.stdout, 2122 }) 2123 // biome-ignore lint/suspicious/noConsole:: intentional console output 2124 console.log( 2125 '\nRemote Control lets you access this CLI session from the web (claude.ai/code)\nor the Claude app, so you can pick up where you left off on any device.\n\nYou can disconnect remote access anytime by running /remote-control again.\n', 2126 ) 2127 const answer = await new Promise<string>(resolve => { 2128 rl.question('Enable Remote Control? 
(y/n) ', resolve) 2129 }) 2130 rl.close() 2131 saveGlobalConfig(current => { 2132 if (current.remoteDialogSeen) return current 2133 return { ...current, remoteDialogSeen: true } 2134 }) 2135 if (answer.toLowerCase() !== 'y' && answer.toLowerCase() !== 'yes') { 2136 // eslint-disable-next-line custom-rules/no-process-exit 2137 process.exit(0) 2138 } 2139 } 2140 2141 // --continue: resolve the most recent session from the crash-recovery 2142 // pointer and chain into the #20460 --session-id flow. Worktree-aware: 2143 // checks current dir first (fast path, zero exec), then fans out to git 2144 // worktree siblings if that misses — the REPL bridge writes to 2145 // getOriginalCwd() which EnterWorktreeTool/activeWorktreeSession can 2146 // point at a worktree while the user's shell is at the repo root. 2147 // KAIROS-gated at parseArgs — continueSession is always false in external 2148 // builds, so this block tree-shakes. 2149 if (feature('KAIROS') && continueSession) { 2150 const { readBridgePointerAcrossWorktrees } = await import( 2151 './bridgePointer.js' 2152 ) 2153 const found = await readBridgePointerAcrossWorktrees(dir) 2154 if (!found) { 2155 // biome-ignore lint/suspicious/noConsole: intentional error output 2156 console.error( 2157 `Error: No recent session found in this directory or its worktrees. Run \`claude remote-control\` to start a new one.`, 2158 ) 2159 // eslint-disable-next-line custom-rules/no-process-exit 2160 process.exit(1) 2161 } 2162 const { pointer, dir: pointerDir } = found 2163 const ageMin = Math.round(pointer.ageMs / 60_000) 2164 const ageStr = ageMin < 60 ? `${ageMin}m` : `${Math.round(ageMin / 60)}h` 2165 const fromWt = pointerDir !== dir ? 
` from worktree ${pointerDir}` : '' 2166 // biome-ignore lint/suspicious/noConsole: intentional info output 2167 console.error( 2168 `Resuming session ${pointer.sessionId} (${ageStr} ago)${fromWt}\u2026`, 2169 ) 2170 resumeSessionId = pointer.sessionId 2171 // Track where the pointer came from so the #20460 exit(1) paths below 2172 // clear the RIGHT file on deterministic failure — otherwise --continue 2173 // would keep hitting the same dead session. May be a worktree sibling. 2174 resumePointerDir = pointerDir 2175 } 2176 2177 // In production, baseUrl is the Anthropic API (from OAuth config). 2178 // CLAUDE_BRIDGE_BASE_URL overrides this for ant local dev only. 2179 const baseUrl = getBridgeBaseUrl() 2180 2181 // For non-localhost targets, require HTTPS to protect credentials. 2182 if ( 2183 baseUrl.startsWith('http://') && 2184 !baseUrl.includes('localhost') && 2185 !baseUrl.includes('127.0.0.1') 2186 ) { 2187 // biome-ignore lint/suspicious/noConsole:: intentional console output 2188 console.error( 2189 'Error: Remote Control base URL uses HTTP. Only HTTPS or localhost HTTP is allowed.', 2190 ) 2191 // eslint-disable-next-line custom-rules/no-process-exit 2192 process.exit(1) 2193 } 2194 2195 // Session ingress URL for WebSocket connections. In production this is the 2196 // same as baseUrl (Envoy routes /v1/session_ingress/* to session-ingress). 2197 // Locally, session-ingress runs on a different port (9413) than the 2198 // contain-provide-api (8211), so CLAUDE_BRIDGE_SESSION_INGRESS_URL must be 2199 // set explicitly. Ant-only, matching CLAUDE_BRIDGE_BASE_URL. 2200 const sessionIngressUrl = 2201 process.env.USER_TYPE === 'ant' && 2202 process.env.CLAUDE_BRIDGE_SESSION_INGRESS_URL 2203 ? process.env.CLAUDE_BRIDGE_SESSION_INGRESS_URL 2204 : baseUrl 2205 2206 const { getBranch, getRemoteUrl, findGitRoot } = await import( 2207 '../utils/git.js' 2208 ) 2209 2210 // Precheck worktree availability for the first-run dialog and the `w` 2211 // toggle. 
Unconditional so we know upfront whether worktree is an option. 2212 const { hasWorktreeCreateHook } = await import('../utils/hooks.js') 2213 const worktreeAvailable = hasWorktreeCreateHook() || findGitRoot(dir) !== null 2214 2215 // Load saved per-project spawn-mode preference. Gated by multiSessionEnabled 2216 // so a GrowthBook rollback cleanly reverts users to single-session — 2217 // otherwise a saved pref would silently re-enable multi-session behavior 2218 // (worktree isolation, 32 max sessions, w toggle) despite the gate being off. 2219 // Also guard against a stale worktree pref left over from when this dir WAS 2220 // a git repo (or the user copied config) — clear it on disk so the warning 2221 // doesn't repeat on every launch. 2222 let savedSpawnMode = multiSessionEnabled 2223 ? getCurrentProjectConfig().remoteControlSpawnMode 2224 : undefined 2225 if (savedSpawnMode === 'worktree' && !worktreeAvailable) { 2226 // biome-ignore lint/suspicious/noConsole: intentional warning output 2227 console.error( 2228 'Warning: Saved spawn mode is worktree but this directory is not a git repository. Falling back to same-dir.', 2229 ) 2230 savedSpawnMode = undefined 2231 saveCurrentProjectConfig(current => { 2232 if (current.remoteControlSpawnMode === undefined) return current 2233 return { ...current, remoteControlSpawnMode: undefined } 2234 }) 2235 } 2236 2237 // First-run spawn-mode choice: ask once per project when the choice is 2238 // meaningful (gate on, both modes available, no explicit override, not 2239 // resuming). Saves to ProjectConfig so subsequent runs skip this. 
2240 if ( 2241 multiSessionEnabled && 2242 !savedSpawnMode && 2243 worktreeAvailable && 2244 parsedSpawnMode === undefined && 2245 !resumeSessionId && 2246 process.stdin.isTTY 2247 ) { 2248 const readline = await import('readline') 2249 const rl = readline.createInterface({ 2250 input: process.stdin, 2251 output: process.stdout, 2252 }) 2253 // biome-ignore lint/suspicious/noConsole: intentional dialog output 2254 console.log( 2255 `\nClaude Remote Control is launching in spawn mode which lets you create new sessions in this project from Claude Code on Web or your Mobile app. Learn more here: https://code.claude.com/docs/en/remote-control\n\n` + 2256 `Spawn mode for this project:\n` + 2257 ` [1] same-dir \u2014 sessions share the current directory (default)\n` + 2258 ` [2] worktree \u2014 each session gets an isolated git worktree\n\n` + 2259 `This can be changed later or explicitly set with --spawn=same-dir or --spawn=worktree.\n`, 2260 ) 2261 const answer = await new Promise<string>(resolve => { 2262 rl.question('Choose [1/2] (default: 1): ', resolve) 2263 }) 2264 rl.close() 2265 const chosen: 'same-dir' | 'worktree' = 2266 answer.trim() === '2' ? 'worktree' : 'same-dir' 2267 savedSpawnMode = chosen 2268 logEvent('tengu_bridge_spawn_mode_chosen', { 2269 spawn_mode: 2270 chosen as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 2271 }) 2272 saveCurrentProjectConfig(current => { 2273 if (current.remoteControlSpawnMode === chosen) return current 2274 return { ...current, remoteControlSpawnMode: chosen } 2275 }) 2276 } 2277 2278 // Determine effective spawn mode. 
2279 // Precedence: resume > explicit --spawn > saved project pref > gate default 2280 // - resuming via --continue / --session-id: always single-session (resume 2281 // targets one specific session in its original directory) 2282 // - explicit --spawn flag: use that value directly (does not persist) 2283 // - saved ProjectConfig.remoteControlSpawnMode: set by first-run dialog or `w` 2284 // - default with gate on: same-dir (persistent multi-session, shared cwd) 2285 // - default with gate off: single-session (unchanged legacy behavior) 2286 // Track how spawn mode was determined, for rollout analytics. 2287 type SpawnModeSource = 'resume' | 'flag' | 'saved' | 'gate_default' 2288 let spawnModeSource: SpawnModeSource 2289 let spawnMode: SpawnMode 2290 if (resumeSessionId) { 2291 spawnMode = 'single-session' 2292 spawnModeSource = 'resume' 2293 } else if (parsedSpawnMode !== undefined) { 2294 spawnMode = parsedSpawnMode 2295 spawnModeSource = 'flag' 2296 } else if (savedSpawnMode !== undefined) { 2297 spawnMode = savedSpawnMode 2298 spawnModeSource = 'saved' 2299 } else { 2300 spawnMode = multiSessionEnabled ? 'same-dir' : 'single-session' 2301 spawnModeSource = 'gate_default' 2302 } 2303 const maxSessions = 2304 spawnMode === 'single-session' 2305 ? 1 2306 : (parsedCapacity ?? SPAWN_SESSIONS_DEFAULT) 2307 // Pre-create an empty session on start so the user has somewhere to type 2308 // immediately, running in the current directory (exempted from worktree 2309 // creation in the spawn loop). On by default; --no-create-session-in-dir 2310 // opts out for a pure on-demand server where every session is isolated. 2311 // The effectiveResumeSessionId guard at the creation site handles the 2312 // resume case (skip creation when resume succeeded; fall through to 2313 // fresh creation on env-mismatch fallback). 2314 const preCreateSession = parsedCreateSessionInDir ?? 
true 2315 2316 // Without --continue: a leftover pointer means the previous run didn't 2317 // shut down cleanly (crash, kill -9, terminal closed). Clear it so the 2318 // stale env doesn't linger past its relevance. Runs in all modes 2319 // (clearBridgePointer is a no-op when no file exists) — covers the 2320 // gate-transition case where a user crashed in single-session mode then 2321 // starts fresh in worktree mode. Only single-session mode writes new 2322 // pointers. 2323 if (!resumeSessionId) { 2324 const { clearBridgePointer } = await import('./bridgePointer.js') 2325 await clearBridgePointer(dir) 2326 } 2327 2328 // Worktree mode requires either git or WorktreeCreate/WorktreeRemove hooks. 2329 // Only reachable via explicit --spawn=worktree (default is same-dir); 2330 // saved worktree pref was already guarded above. 2331 if (spawnMode === 'worktree' && !worktreeAvailable) { 2332 // biome-ignore lint/suspicious/noConsole: intentional error output 2333 console.error( 2334 `Error: Worktree mode requires a git repository or WorktreeCreate hooks configured. Use --spawn=session for single-session mode.`, 2335 ) 2336 // eslint-disable-next-line custom-rules/no-process-exit 2337 process.exit(1) 2338 } 2339 2340 const branch = await getBranch() 2341 const gitRepoUrl = await getRemoteUrl() 2342 const machineName = hostname() 2343 const bridgeId = randomUUID() 2344 2345 const { handleOAuth401Error } = await import('../utils/auth.js') 2346 const api = createBridgeApiClient({ 2347 baseUrl, 2348 getAccessToken: getBridgeAccessToken, 2349 runnerVersion: MACRO.VERSION, 2350 onDebug: logForDebugging, 2351 onAuth401: handleOAuth401Error, 2352 getTrustedDeviceToken, 2353 }) 2354 2355 // When resuming a session via --session-id, fetch it to learn its 2356 // environment_id and reuse that for registration (idempotent on the 2357 // backend). Left undefined otherwise — the backend rejects 2358 // client-generated UUIDs and will allocate a fresh environment. 
2359 // feature('KAIROS') gate: --session-id is ant-only; parseArgs already 2360 // rejects the flag when the gate is off, so resumeSessionId is always 2361 // undefined here in external builds — this guard is for tree-shaking. 2362 let reuseEnvironmentId: string | undefined 2363 if (feature('KAIROS') && resumeSessionId) { 2364 try { 2365 validateBridgeId(resumeSessionId, 'sessionId') 2366 } catch { 2367 // biome-ignore lint/suspicious/noConsole: intentional error output 2368 console.error( 2369 `Error: Invalid session ID "${resumeSessionId}". Session IDs must not contain unsafe characters.`, 2370 ) 2371 // eslint-disable-next-line custom-rules/no-process-exit 2372 process.exit(1) 2373 } 2374 // Proactively refresh the OAuth token — getBridgeSession uses raw axios 2375 // without the withOAuthRetry 401-refresh logic. An expired-but-present 2376 // token would otherwise produce a misleading "not found" error. 2377 await checkAndRefreshOAuthTokenIfNeeded() 2378 clearOAuthTokenCache() 2379 const { getBridgeSession } = await import('./createSession.js') 2380 const session = await getBridgeSession(resumeSessionId, { 2381 baseUrl, 2382 getAccessToken: getBridgeAccessToken, 2383 }) 2384 if (!session) { 2385 // Session gone on server → pointer is stale. Clear it so the user 2386 // isn't re-prompted next launch. (Explicit --session-id leaves the 2387 // pointer alone — it's an independent file they may not even have.) 2388 // resumePointerDir may be a worktree sibling — clear THAT file. 2389 if (resumePointerDir) { 2390 const { clearBridgePointer } = await import('./bridgePointer.js') 2391 await clearBridgePointer(resumePointerDir) 2392 } 2393 // biome-ignore lint/suspicious/noConsole: intentional error output 2394 console.error( 2395 `Error: Session ${resumeSessionId} not found. 
It may have been archived or expired, or your login may have lapsed (run \`claude /login\`).`, 2396 ) 2397 // eslint-disable-next-line custom-rules/no-process-exit 2398 process.exit(1) 2399 } 2400 if (!session.environment_id) { 2401 if (resumePointerDir) { 2402 const { clearBridgePointer } = await import('./bridgePointer.js') 2403 await clearBridgePointer(resumePointerDir) 2404 } 2405 // biome-ignore lint/suspicious/noConsole: intentional error output 2406 console.error( 2407 `Error: Session ${resumeSessionId} has no environment_id. It may never have been attached to a bridge.`, 2408 ) 2409 // eslint-disable-next-line custom-rules/no-process-exit 2410 process.exit(1) 2411 } 2412 reuseEnvironmentId = session.environment_id 2413 logForDebugging( 2414 `[bridge:init] Resuming session ${resumeSessionId} on environment ${reuseEnvironmentId}`, 2415 ) 2416 } 2417 2418 const config: BridgeConfig = { 2419 dir, 2420 machineName, 2421 branch, 2422 gitRepoUrl, 2423 maxSessions, 2424 spawnMode, 2425 verbose, 2426 sandbox, 2427 bridgeId, 2428 workerType: 'claude_code', 2429 environmentId: randomUUID(), 2430 reuseEnvironmentId, 2431 apiBaseUrl: baseUrl, 2432 sessionIngressUrl, 2433 debugFile, 2434 sessionTimeoutMs, 2435 } 2436 2437 logForDebugging( 2438 `[bridge:init] bridgeId=${bridgeId}${reuseEnvironmentId ? ` reuseEnvironmentId=${reuseEnvironmentId}` : ''} dir=${dir} branch=${branch} gitRepoUrl=${gitRepoUrl} machine=${machineName}`, 2439 ) 2440 logForDebugging( 2441 `[bridge:init] apiBaseUrl=${baseUrl} sessionIngressUrl=${sessionIngressUrl}`, 2442 ) 2443 logForDebugging( 2444 `[bridge:init] sandbox=${sandbox}${debugFile ? ` debugFile=${debugFile}` : ''}`, 2445 ) 2446 2447 // Register the bridge environment before entering the poll loop. 
2448 let environmentId: string 2449 let environmentSecret: string 2450 try { 2451 const reg = await api.registerBridgeEnvironment(config) 2452 environmentId = reg.environment_id 2453 environmentSecret = reg.environment_secret 2454 } catch (err) { 2455 logEvent('tengu_bridge_registration_failed', { 2456 status: err instanceof BridgeFatalError ? err.status : undefined, 2457 }) 2458 // Registration failures are fatal — print a clean message instead of a stack trace. 2459 // biome-ignore lint/suspicious/noConsole:: intentional console output 2460 console.error( 2461 err instanceof BridgeFatalError && err.status === 404 2462 ? 'Remote Control environments are not available for your account.' 2463 : `Error: ${errorMessage(err)}`, 2464 ) 2465 // eslint-disable-next-line custom-rules/no-process-exit 2466 process.exit(1) 2467 } 2468 2469 // Tracks whether the --session-id resume flow completed successfully. 2470 // Used below to skip fresh session creation and seed initialSessionId. 2471 // Cleared on env mismatch so we gracefully fall back to a new session. 2472 let effectiveResumeSessionId: string | undefined 2473 if (feature('KAIROS') && resumeSessionId) { 2474 if (reuseEnvironmentId && environmentId !== reuseEnvironmentId) { 2475 // Backend returned a different environment_id — the original env 2476 // expired or was reaped. Reconnect won't work against the new env 2477 // (session is bound to the old one). Log to sentry for visibility 2478 // and fall through to fresh session creation on the new env. 2479 logError( 2480 new Error( 2481 `Bridge resume env mismatch: requested ${reuseEnvironmentId}, backend returned ${environmentId}. Falling back to fresh session.`, 2482 ), 2483 ) 2484 // biome-ignore lint/suspicious/noConsole: intentional warning output 2485 console.warn( 2486 `Warning: Could not resume session ${resumeSessionId} — its environment has expired. Creating a fresh session instead.`, 2487 ) 2488 // Don't deregister — we're going to use this new environment. 
2489 // effectiveResumeSessionId stays undefined → fresh session path below. 2490 } else { 2491 // Force-stop any stale worker instances for this session and re-queue 2492 // it so our poll loop picks it up. Must happen after registration so 2493 // the backend knows a live worker exists for the environment. 2494 // 2495 // The pointer stores a session_* ID but /bridge/reconnect looks 2496 // sessions up by their infra tag (cse_*) when ccr_v2_compat_enabled 2497 // is on. Try both; the conversion is a no-op if already cse_*. 2498 const infraResumeId = toInfraSessionId(resumeSessionId) 2499 const reconnectCandidates = 2500 infraResumeId === resumeSessionId 2501 ? [resumeSessionId] 2502 : [resumeSessionId, infraResumeId] 2503 let reconnected = false 2504 let lastReconnectErr: unknown 2505 for (const candidateId of reconnectCandidates) { 2506 try { 2507 await api.reconnectSession(environmentId, candidateId) 2508 logForDebugging( 2509 `[bridge:init] Session ${candidateId} re-queued via bridge/reconnect`, 2510 ) 2511 effectiveResumeSessionId = resumeSessionId 2512 reconnected = true 2513 break 2514 } catch (err) { 2515 lastReconnectErr = err 2516 logForDebugging( 2517 `[bridge:init] reconnectSession(${candidateId}) failed: ${errorMessage(err)}`, 2518 ) 2519 } 2520 } 2521 if (!reconnected) { 2522 const err = lastReconnectErr 2523 2524 // Do NOT deregister on transient reconnect failure — at this point 2525 // environmentId IS the session's own environment. Deregistering 2526 // would make retry impossible. The backend's 4h TTL cleans up. 2527 const isFatal = err instanceof BridgeFatalError 2528 // Clear pointer only on fatal reconnect failure. Transient failures 2529 // ("try running the same command again") should keep the pointer so 2530 // next launch re-prompts — that IS the retry mechanism. 
2531 if (resumePointerDir && isFatal) { 2532 const { clearBridgePointer } = await import('./bridgePointer.js') 2533 await clearBridgePointer(resumePointerDir) 2534 } 2535 // biome-ignore lint/suspicious/noConsole: intentional error output 2536 console.error( 2537 isFatal 2538 ? `Error: ${errorMessage(err)}` 2539 : `Error: Failed to reconnect session ${resumeSessionId}: ${errorMessage(err)}\nThe session may still be resumable — try running the same command again.`, 2540 ) 2541 // eslint-disable-next-line custom-rules/no-process-exit 2542 process.exit(1) 2543 } 2544 } 2545 } 2546 2547 logForDebugging( 2548 `[bridge:init] Registered, server environmentId=${environmentId}`, 2549 ) 2550 const startupPollConfig = getPollIntervalConfig() 2551 logEvent('tengu_bridge_started', { 2552 max_sessions: config.maxSessions, 2553 has_debug_file: !!config.debugFile, 2554 sandbox: config.sandbox, 2555 verbose: config.verbose, 2556 heartbeat_interval_ms: 2557 startupPollConfig.non_exclusive_heartbeat_interval_ms, 2558 spawn_mode: 2559 config.spawnMode as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 2560 spawn_mode_source: 2561 spawnModeSource as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 2562 multi_session_gate: multiSessionEnabled, 2563 pre_create_session: preCreateSession, 2564 worktree_available: worktreeAvailable, 2565 }) 2566 logForDiagnosticsNoPII('info', 'bridge_started', { 2567 max_sessions: config.maxSessions, 2568 sandbox: config.sandbox, 2569 spawn_mode: config.spawnMode, 2570 }) 2571 2572 const spawner = createSessionSpawner({ 2573 execPath: process.execPath, 2574 scriptArgs: spawnScriptArgs(), 2575 env: process.env, 2576 verbose, 2577 sandbox, 2578 debugFile, 2579 permissionMode, 2580 onDebug: logForDebugging, 2581 onActivity: (sessionId, activity) => { 2582 logForDebugging( 2583 `[bridge:activity] sessionId=${sessionId} ${activity.type} ${activity.summary}`, 2584 ) 2585 }, 2586 onPermissionRequest: (sessionId, request, _accessToken) => { 
2587 logForDebugging( 2588 `[bridge:perm] sessionId=${sessionId} tool=${request.request.tool_name} request_id=${request.request_id} (not auto-approving)`, 2589 ) 2590 }, 2591 }) 2592 2593 const logger = createBridgeLogger({ verbose }) 2594 const { parseGitHubRepository } = await import('../utils/detectRepository.js') 2595 const ownerRepo = gitRepoUrl ? parseGitHubRepository(gitRepoUrl) : null 2596 // Use the repo name from the parsed owner/repo, or fall back to the dir basename 2597 const repoName = ownerRepo ? ownerRepo.split('/').pop()! : basename(dir) 2598 logger.setRepoInfo(repoName, branch) 2599 2600 // `w` toggle is available iff we're in a multi-session mode AND worktree 2601 // is a valid option. When unavailable, the mode suffix and hint are hidden. 2602 const toggleAvailable = spawnMode !== 'single-session' && worktreeAvailable 2603 if (toggleAvailable) { 2604 // Safe cast: spawnMode is not single-session (checked above), and the 2605 // saved-worktree-in-non-git guard + exit check above ensure worktree 2606 // is only reached when available. 2607 logger.setSpawnModeDisplay(spawnMode as 'same-dir' | 'worktree') 2608 } 2609 2610 // Listen for keys: space toggles QR code, w toggles spawn mode 2611 const onStdinData = (data: Buffer): void => { 2612 if (data[0] === 0x03 || data[0] === 0x04) { 2613 // Ctrl+C / Ctrl+D — trigger graceful shutdown 2614 process.emit('SIGINT') 2615 return 2616 } 2617 if (data[0] === 0x20 /* space */) { 2618 logger.toggleQr() 2619 return 2620 } 2621 if (data[0] === 0x77 /* 'w' */) { 2622 if (!toggleAvailable) return 2623 const newMode: 'same-dir' | 'worktree' = 2624 config.spawnMode === 'same-dir' ? 'worktree' : 'same-dir' 2625 config.spawnMode = newMode 2626 logEvent('tengu_bridge_spawn_mode_toggled', { 2627 spawn_mode: 2628 newMode as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, 2629 }) 2630 logger.logStatus( 2631 newMode === 'worktree' 2632 ? 
'Spawn mode: worktree (new sessions get isolated git worktrees)' 2633 : 'Spawn mode: same-dir (new sessions share the current directory)', 2634 ) 2635 logger.setSpawnModeDisplay(newMode) 2636 logger.refreshDisplay() 2637 saveCurrentProjectConfig(current => { 2638 if (current.remoteControlSpawnMode === newMode) return current 2639 return { ...current, remoteControlSpawnMode: newMode } 2640 }) 2641 return 2642 } 2643 } 2644 if (process.stdin.isTTY) { 2645 process.stdin.setRawMode(true) 2646 process.stdin.resume() 2647 process.stdin.on('data', onStdinData) 2648 } 2649 2650 const controller = new AbortController() 2651 const onSigint = (): void => { 2652 logForDebugging('[bridge:shutdown] SIGINT received, shutting down') 2653 controller.abort() 2654 } 2655 const onSigterm = (): void => { 2656 logForDebugging('[bridge:shutdown] SIGTERM received, shutting down') 2657 controller.abort() 2658 } 2659 process.on('SIGINT', onSigint) 2660 process.on('SIGTERM', onSigterm) 2661 2662 // Auto-create an empty session so the user has somewhere to type 2663 // immediately (matching /remote-control behavior). Controlled by 2664 // preCreateSession: on by default; --no-create-session-in-dir opts out. 2665 // When a --session-id resume succeeded, skip creation entirely — the 2666 // session already exists and bridge/reconnect has re-queued it. 2667 // When resume was requested but failed on env mismatch, effectiveResumeSessionId 2668 // is undefined, so we fall through to fresh session creation (honoring the 2669 // "Creating a fresh session instead" warning printed above). 2670 let initialSessionId: string | null = 2671 feature('KAIROS') && effectiveResumeSessionId 2672 ? 
effectiveResumeSessionId 2673 : null 2674 if (preCreateSession && !(feature('KAIROS') && effectiveResumeSessionId)) { 2675 const { createBridgeSession } = await import('./createSession.js') 2676 try { 2677 initialSessionId = await createBridgeSession({ 2678 environmentId, 2679 title: name, 2680 events: [], 2681 gitRepoUrl, 2682 branch, 2683 signal: controller.signal, 2684 baseUrl, 2685 getAccessToken: getBridgeAccessToken, 2686 permissionMode, 2687 }) 2688 if (initialSessionId) { 2689 logForDebugging( 2690 `[bridge:init] Created initial session ${initialSessionId}`, 2691 ) 2692 } 2693 } catch (err) { 2694 logForDebugging( 2695 `[bridge:init] Session creation failed (non-fatal): ${errorMessage(err)}`, 2696 ) 2697 } 2698 } 2699 2700 // Crash-recovery pointer: write immediately so kill -9 at any point 2701 // after this leaves a recoverable trail. Covers both fresh sessions and 2702 // resumed ones (so a second crash after resume is still recoverable). 2703 // Cleared when runBridgeLoop falls through to archive+deregister; left in 2704 // place on the SIGINT resumable-shutdown return (backup for when the user 2705 // closes the terminal before copying the printed --session-id hint). 2706 // Refreshed hourly so a 5h+ session that crashes still has a fresh 2707 // pointer (staleness checks file mtime, backend TTL is rolling-from-poll). 2708 let pointerRefreshTimer: ReturnType<typeof setInterval> | null = null 2709 // Single-session only: --continue forces single-session mode on resume, 2710 // so a pointer written in multi-session mode would contradict the user's 2711 // config when they try to resume. The resumable-shutdown path is also 2712 // gated to single-session (line ~1254) so the pointer would be orphaned. 
2713 if (initialSessionId && spawnMode === 'single-session') { 2714 const { writeBridgePointer } = await import('./bridgePointer.js') 2715 const pointerPayload = { 2716 sessionId: initialSessionId, 2717 environmentId, 2718 source: 'standalone' as const, 2719 } 2720 await writeBridgePointer(config.dir, pointerPayload) 2721 pointerRefreshTimer = setInterval( 2722 writeBridgePointer, 2723 60 * 60 * 1000, 2724 config.dir, 2725 pointerPayload, 2726 ) 2727 // Don't let the interval keep the process alive on its own. 2728 pointerRefreshTimer.unref?.() 2729 } 2730 2731 try { 2732 await runBridgeLoop( 2733 config, 2734 environmentId, 2735 environmentSecret, 2736 api, 2737 spawner, 2738 logger, 2739 controller.signal, 2740 undefined, 2741 initialSessionId ?? undefined, 2742 async () => { 2743 // Clear the memoized OAuth token cache so we re-read from secure 2744 // storage, picking up tokens refreshed by child processes. 2745 clearOAuthTokenCache() 2746 // Proactively refresh the token if it's expired on disk too. 2747 await checkAndRefreshOAuthTokenIfNeeded() 2748 return getBridgeAccessToken() 2749 }, 2750 ) 2751 } finally { 2752 if (pointerRefreshTimer !== null) { 2753 clearInterval(pointerRefreshTimer) 2754 } 2755 process.off('SIGINT', onSigint) 2756 process.off('SIGTERM', onSigterm) 2757 process.stdin.off('data', onStdinData) 2758 if (process.stdin.isTTY) { 2759 process.stdin.setRawMode(false) 2760 } 2761 process.stdin.pause() 2762 } 2763 2764 // The bridge bypasses init.ts (and its graceful shutdown handler), so we 2765 // must exit explicitly. 2766 // eslint-disable-next-line custom-rules/no-process-exit 2767 process.exit(0) 2768} 2769 2770// ─── Headless bridge (daemon worker) ──────────────────────────────────────── 2771 2772/** 2773 * Thrown by runBridgeHeadless for configuration issues the supervisor should 2774 * NOT retry (trust not accepted, worktree unavailable, http-not-https). 
/**
 * Thrown by runBridgeHeadless for configuration issues the supervisor should
 * NOT retry (trust not accepted, worktree unavailable, http-not-https). The
 * daemon worker catches this and exits with EXIT_CODE_PERMANENT so the
 * supervisor parks the worker instead of respawning it on backoff.
 */
export class BridgeHeadlessPermanentError extends Error {
  constructor(message: string) {
    super(message)
    // Set explicitly so logs show the subclass name rather than "Error".
    this.name = 'BridgeHeadlessPermanentError'
  }
}

/** Caller-supplied configuration for {@link runBridgeHeadless}. */
export type HeadlessBridgeOpts = {
  /** Working directory; the worker chdir()s here and registers it as the bridge dir. */
  dir: string
  /** Title given to the pre-created session (only used with createSessionOnStart). */
  name?: string
  /** How sessions are spawned; 'worktree' requires a git root or a WorktreeCreate hook. */
  spawnMode: 'same-dir' | 'worktree'
  /** Copied into the bridge config as `maxSessions`. */
  capacity: number
  /** Passed through to the session spawner and to session pre-creation. */
  permissionMode?: string
  /** Passed through to the bridge config and the session spawner. */
  sandbox: boolean
  /** Copied into the bridge config as `sessionTimeoutMs`. */
  sessionTimeoutMs?: number
  /** When true, try to create one empty session right after registration. */
  createSessionOnStart: boolean
  /** Returns the current access token, or undefined when none is available. */
  getAccessToken: () => string | undefined
  /** Forwarded to the bridge API client's `onAuth401` option. */
  onAuth401: (failedToken: string) => Promise<boolean>
  /** Line-oriented log sink; also receives API-client and spawner debug output. */
  log: (s: string) => void
}

/**
 * True when `url` would carry bridge traffic over plaintext HTTP to a host
 * other than `localhost` / `127.0.0.1`.
 *
 * The decision is made on the parsed hostname, not on a substring of the raw
 * URL: a substring test accepts remote URLs that merely mention "localhost"
 * somewhere (e.g. `http://evil.example/?x=localhost` or
 * `http://localhost.evil.example`). An `http://` URL that cannot be parsed is
 * treated as insecure (fail closed). Only the two loopback spellings named
 * above are accepted.
 */
function isInsecureRemoteHttpUrl(url: string): boolean {
  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    return /^http:\/\//i.test(url)
  }
  if (parsed.protocol !== 'http:') {
    return false
  }
  return parsed.hostname !== 'localhost' && parsed.hostname !== '127.0.0.1'
}

/**
 * Non-interactive bridge entrypoint for the `remoteControl` daemon worker.
 *
 * Linear subset of bridgeMain(): no readline dialogs, no stdin key handlers,
 * no TUI, no process.exit(). Config comes from the caller (daemon.json), auth
 * comes via IPC (supervisor's AuthManager), logs go to the worker's stdout
 * pipe. Throws on fatal errors — the worker catches and maps permanent vs
 * transient to the right exit code.
 *
 * Resolves cleanly when `signal` aborts and the poll loop tears down.
 *
 * @param opts - Directory, spawn settings, auth callbacks and log sink.
 * @param signal - Passed to session pre-creation and to the poll loop.
 * @throws BridgeHeadlessPermanentError when the workspace is not trusted, the
 *   base URL is plaintext HTTP to a non-local host, or worktree mode is
 *   requested without a git repository or WorktreeCreate hook.
 * @throws Error (transient) when no access token is available or environment
 *   registration fails.
 */
export async function runBridgeHeadless(
  opts: HeadlessBridgeOpts,
  signal: AbortSignal,
): Promise<void> {
  const { dir, log } = opts

  // Worker inherits the supervisor's CWD. chdir first so git utilities
  // (getBranch/getRemoteUrl) — which read from bootstrap CWD state set
  // below — resolve against the right repo.
  process.chdir(dir)
  const { setOriginalCwd, setCwdState } = await import('../bootstrap/state.js')
  setOriginalCwd(dir)
  setCwdState(dir)

  const { enableConfigs, checkHasTrustDialogAccepted } = await import(
    '../utils/config.js'
  )
  enableConfigs()
  const { initSinks } = await import('../utils/sinks.js')
  initSinks()

  if (!checkHasTrustDialogAccepted()) {
    throw new BridgeHeadlessPermanentError(
      `Workspace not trusted: ${dir}. Run \`claude\` in that directory first to accept the trust dialog.`,
    )
  }

  if (!opts.getAccessToken()) {
    // Transient — supervisor's AuthManager may pick up a token on next cycle.
    throw new Error(BRIDGE_LOGIN_ERROR)
  }

  const { getBridgeBaseUrl } = await import('./bridgeConfig.js')
  const baseUrl = getBridgeBaseUrl()
  // Permanent: retrying cannot turn a plaintext remote URL into a safe one.
  if (isInsecureRemoteHttpUrl(baseUrl)) {
    throw new BridgeHeadlessPermanentError(
      'Remote Control base URL uses HTTP. Only HTTPS or localhost HTTP is allowed.',
    )
  }
  // The ingress override is only honoured when USER_TYPE is 'ant'.
  const sessionIngressUrl =
    process.env.USER_TYPE === 'ant' &&
    process.env.CLAUDE_BRIDGE_SESSION_INGRESS_URL
      ? process.env.CLAUDE_BRIDGE_SESSION_INGRESS_URL
      : baseUrl

  const { getBranch, getRemoteUrl, findGitRoot } = await import(
    '../utils/git.js'
  )
  const { hasWorktreeCreateHook } = await import('../utils/hooks.js')

  if (opts.spawnMode === 'worktree') {
    const worktreeAvailable =
      hasWorktreeCreateHook() || findGitRoot(dir) !== null
    if (!worktreeAvailable) {
      throw new BridgeHeadlessPermanentError(
        `Worktree mode requires a git repository or WorktreeCreate hooks. Directory ${dir} has neither.`,
      )
    }
  }

  const branch = await getBranch()
  const gitRepoUrl = await getRemoteUrl()
  const machineName = hostname()
  const bridgeId = randomUUID()

  const config: BridgeConfig = {
    dir,
    machineName,
    branch,
    gitRepoUrl,
    maxSessions: opts.capacity,
    spawnMode: opts.spawnMode,
    verbose: false,
    sandbox: opts.sandbox,
    bridgeId,
    workerType: 'claude_code',
    // Client-generated value; the id returned by registration below is the
    // one used for the banner, session creation and the poll loop.
    environmentId: randomUUID(),
    apiBaseUrl: baseUrl,
    sessionIngressUrl,
    sessionTimeoutMs: opts.sessionTimeoutMs,
  }

  const api = createBridgeApiClient({
    baseUrl,
    getAccessToken: opts.getAccessToken,
    runnerVersion: MACRO.VERSION,
    onDebug: log,
    onAuth401: opts.onAuth401,
    getTrustedDeviceToken,
  })

  let environmentId: string
  let environmentSecret: string
  try {
    const reg = await api.registerBridgeEnvironment(config)
    environmentId = reg.environment_id
    environmentSecret = reg.environment_secret
  } catch (err) {
    // Transient — let supervisor backoff-retry.
    throw new Error(`Bridge registration failed: ${errorMessage(err)}`)
  }

  const spawner = createSessionSpawner({
    execPath: process.execPath,
    scriptArgs: spawnScriptArgs(),
    env: process.env,
    verbose: false,
    sandbox: opts.sandbox,
    permissionMode: opts.permissionMode,
    onDebug: log,
  })

  const logger = createHeadlessBridgeLogger(log)
  logger.printBanner(config, environmentId)

  // Best-effort: a failed pre-creation is logged and the loop still starts.
  let initialSessionId: string | undefined
  if (opts.createSessionOnStart) {
    const { createBridgeSession } = await import('./createSession.js')
    try {
      const sid = await createBridgeSession({
        environmentId,
        title: opts.name,
        events: [],
        gitRepoUrl,
        branch,
        signal,
        baseUrl,
        getAccessToken: opts.getAccessToken,
        permissionMode: opts.permissionMode,
      })
      if (sid) {
        initialSessionId = sid
        log(`created initial session ${sid}`)
      }
    } catch (err) {
      log(`session pre-creation failed (non-fatal): ${errorMessage(err)}`)
    }
  }

  await runBridgeLoop(
    config,
    environmentId,
    environmentSecret,
    api,
    spawner,
    logger,
    signal,
    undefined,
    initialSessionId,
    async () => opts.getAccessToken(),
  )
}

/**
 * BridgeLogger adapter that routes everything to a single line-log fn.
 *
 * Event-style methods (session start/complete/failed, status, errors,
 * attach/detach) become one log line each; display-only methods (status
 * updates, QR toggle, repo/spawn-mode display, refresh) are no-ops because
 * the headless worker has no TUI.
 */
function createHeadlessBridgeLogger(log: (s: string) => void): BridgeLogger {
  const noop = (): void => {}
  return {
    printBanner: (cfg, envId) =>
      log(
        `registered environmentId=${envId} dir=${cfg.dir} spawnMode=${cfg.spawnMode} capacity=${cfg.maxSessions}`,
      ),
    logSessionStart: (id, _prompt) => log(`session start ${id}`),
    logSessionComplete: (id, ms) => log(`session complete ${id} (${ms}ms)`),
    logSessionFailed: (id, err) => log(`session failed ${id}: ${err}`),
    logStatus: log,
    logVerbose: log,
    logError: s => log(`error: ${s}`),
    logReconnected: ms => log(`reconnected after ${ms}ms`),
    addSession: (id, _url) => log(`session attached ${id}`),
    removeSession: id => log(`session detached ${id}`),
    updateIdleStatus: noop,
    updateReconnectingStatus: noop,
    updateSessionStatus: noop,
    updateSessionActivity: noop,
    updateSessionCount: noop,
    updateFailedStatus: noop,
    setSpawnModeDisplay: noop,
    setRepoInfo: noop,
    setDebugLogPath: noop,
    setAttached: noop,
    setSessionTitle: noop,
    clearStatus: noop,
    toggleQr: noop,
    refreshDisplay: noop,
  }
}