// source dump of claude code
// at main · 658 lines · 24 kB · view raw
/**
 * CLI `ComputerExecutor` implementation. Wraps two native modules:
 *   - `@ant/computer-use-input` (Rust/enigo) — mouse, keyboard, frontmost app
 *   - `@ant/computer-use-swift` — SCContentFilter screenshots, NSWorkspace apps, TCC
 *
 * Contract: `packages/desktop/computer-use-mcp/src/executor.ts` in the apps
 * repo. The reference impl is Cowork's `apps/desktop/src/main/nest-only/
 * computer-use/executor.ts` — see notable deviations under "CLI deltas" below.
 *
 * ── CLI deltas from Cowork ─────────────────────────────────────────────────
 *
 * No `withClickThrough`. Cowork wraps every mouse op in
 *   `BrowserWindow.setIgnoreMouseEvents(true)` so clicks fall through the
 *   overlay. We're a terminal — no window — so the click-through bracket is
 *   a no-op. The sentinel `CLI_HOST_BUNDLE_ID` never matches frontmost.
 *
 * Terminal as surrogate host. `getTerminalBundleId()` detects the emulator
 *   we're running inside. It's passed as `hostBundleId` to `prepareDisplay`/
 *   `resolvePrepareCapture` so the Swift side exempts it from hide AND skips
 *   it in the activate z-order walk (so the terminal being frontmost doesn't
 *   eat clicks meant for the target app). Also stripped from `allowedBundleIds`
 *   via `withoutTerminal()` so screenshots don't capture it (Swift 0.2.1's
 *   captureExcluding takes an allow-list despite the name — apps#30355).
 *   `capabilities.hostBundleId` stays as the sentinel — the package's
 *   frontmost gate uses that, and the terminal being frontmost is fine.
 *
 * Clipboard via `pbcopy`/`pbpaste`. No Electron `clipboard` module.
 */

import type {
  ComputerExecutor,
  DisplayGeometry,
  FrontmostApp,
  InstalledApp,
  ResolvePrepareCaptureResult,
  RunningApp,
  ScreenshotResult,
} from '@ant/computer-use-mcp'

import { API_RESIZE_PARAMS, targetImageSize } from '@ant/computer-use-mcp'
import { logForDebugging } from '../debug.js'
import { errorMessage } from '../errors.js'
import { execFileNoThrow } from '../execFileNoThrow.js'
import { sleep } from '../sleep.js'
import {
  CLI_CU_CAPABILITIES,
  CLI_HOST_BUNDLE_ID,
  getTerminalBundleId,
} from './common.js'
import { drainRunLoop } from './drainRunLoop.js'
import { notifyExpectedEscape } from './escHotkey.js'
import { requireComputerUseInput } from './inputLoader.js'
import { requireComputerUseSwift } from './swiftLoader.js'

// ── Helpers ───────────────────────────────────────────────────────────────────

const SCREENSHOT_JPEG_QUALITY = 0.75

/** Logical → physical → API target dims. See `targetImageSize` + COORDINATES.md. */
function computeTargetDims(
  logicalW: number,
  logicalH: number,
  scaleFactor: number,
): [number, number] {
  const physW = Math.round(logicalW * scaleFactor)
  const physH = Math.round(logicalH * scaleFactor)
  return targetImageSize(physW, physH, API_RESIZE_PARAMS)
}

/**
 * Read the clipboard by running `pbpaste` and returning its stdout.
 *
 * @throws Error when `pbpaste` exits with a non-zero code.
 */
async function readClipboardViaPbpaste(): Promise<string> {
  const { stdout, code } = await execFileNoThrow('pbpaste', [], {
    useCwd: false,
  })
  if (code !== 0) {
    throw new Error(`pbpaste exited with code ${code}`)
  }
  return stdout
}

/**
 * Write `text` to the clipboard by passing it as input to `pbcopy`.
 *
 * @throws Error when `pbcopy` exits with a non-zero code.
 */
async function writeClipboardViaPbcopy(text: string): Promise<void> {
  const { code } = await execFileNoThrow('pbcopy', [], {
    input: text,
    useCwd: false,
  })
  if (code !== 0) {
    throw new Error(`pbcopy exited with code ${code}`)
  }
}

type Input = ReturnType<typeof requireComputerUseInput>

/**
 * Single-element key sequence matching "escape" or "esc" (case-insensitive).
 * Used to hole-punch the CGEventTap abort for model-synthesized Escape — enigo
 * accepts both spellings, so the tap must too.
 */
function isBareEscape(parts: readonly string[]): boolean {
  if (parts.length !== 1) return false
  const lower = parts[0]!.toLowerCase()
  return lower === 'escape' || lower === 'esc'
}

/**
 * Instant move, then 50ms — an input→HID→AppKit→NSEvent round-trip before the
 * caller reads `NSEvent.mouseLocation` or dispatches a click. Used for click,
 * scroll, and drag-from; `animatedMove` is reserved for drag-to only. The
 * intermediate animation frames were triggering hover states and, on the
 * decomposed mouseDown/moveMouse path, emitting stray `.leftMouseDragged`
 * events (toolCalls.ts handleScroll's mouse_full workaround).
 */
const MOVE_SETTLE_MS = 50

async function moveAndSettle(
  input: Input,
  x: number,
  y: number,
): Promise<void> {
  await input.moveMouse(x, y, false)
  await sleep(MOVE_SETTLE_MS)
}

/**
 * Release `pressed` in reverse (last pressed = first released). Errors are
 * swallowed so a release failure never masks the real error.
 *
 * Drains via pop() rather than snapshotting length: if a drainRunLoop-
 * orphaned press lambda resolves an in-flight input.key() AFTER finally
 * calls us, that late push is still released on the next iteration. The
 * orphaned flag stops the lambda at its NEXT check, not the current await.
 */
async function releasePressed(input: Input, pressed: string[]): Promise<void> {
  let k: string | undefined
  while ((k = pressed.pop()) !== undefined) {
    try {
      await input.key(k, 'release')
    } catch {
      // Swallow — best-effort release.
    }
  }
}

/**
 * Bracket `fn()` with modifier press/release. `pressed` tracks which presses
 * actually landed, so a mid-press throw only releases what was pressed — no
 * stuck modifiers. The finally covers both press-phase and fn() throws.
 *
 * Caller must already be inside drainRunLoop() — key() dispatches to the
 * main queue and needs the pump to resolve.
 */
async function withModifiers<T>(
  input: Input,
  mods: string[],
  fn: () => Promise<T>,
): Promise<T> {
  const pressed: string[] = []
  try {
    for (const m of mods) {
      await input.key(m, 'press')
      pressed.push(m)
    }
    return await fn()
  } finally {
    await releasePressed(input, pressed)
  }
}

/**
 * Port of Cowork's `typeViaClipboard`. Sequence:
 *   1. Save the user's clipboard.
 *   2. Write our text.
 *   3. READ-BACK VERIFY — clipboard writes can silently fail. If the
 *      read-back doesn't match, never press Cmd+V (would paste junk).
 *   4. Cmd+V via keys().
 *   5. Sleep 100ms — battle-tested threshold for the paste-effect vs
 *      clipboard-restore race. Restoring too soon means the target app
 *      pastes the RESTORED content.
 *   6. Restore — in a `finally`, so a throw between 2-5 never leaves the
 *      user's clipboard clobbered. Restore failures are swallowed.
 */
async function typeViaClipboard(input: Input, text: string): Promise<void> {
  let saved: string | undefined
  try {
    saved = await readClipboardViaPbpaste()
  } catch {
    logForDebugging(
      '[computer-use] pbpaste before paste failed; proceeding without restore',
    )
  }

  try {
    await writeClipboardViaPbcopy(text)
    if ((await readClipboardViaPbpaste()) !== text) {
      throw new Error('Clipboard write did not round-trip.')
    }
    await input.keys(['command', 'v'])
    await sleep(100)
  } finally {
    // `saved` stays undefined when the initial read failed — nothing to restore.
    if (typeof saved === 'string') {
      try {
        await writeClipboardViaPbcopy(saved)
      } catch {
        logForDebugging('[computer-use] clipboard restore after paste failed')
      }
    }
  }
}

/**
 * Port of Cowork's `animateMouseMovement` + `animatedMove`. Ease-out-cubic at
 * 60fps; distance-proportional duration at 2000 px/sec, capped at 0.5s. When
 * the sub-gate is off (or distance < ~2 frames), falls through to
 * `moveAndSettle`. Called only from `drag` for the press→to motion — target
 * apps may watch for `.leftMouseDragged` specifically (not just "button down +
 * position changed") and the slow motion gives them time to process
 * intermediate positions (scrollbars, window resizes).
 */
async function animatedMove(
  input: Input,
  targetX: number,
  targetY: number,
  mouseAnimationEnabled: boolean,
): Promise<void> {
  if (!mouseAnimationEnabled) {
    await moveAndSettle(input, targetX, targetY)
    return
  }
  const start = await input.mouseLocation()
  const deltaX = targetX - start.x
  const deltaY = targetY - start.y
  const distance = Math.hypot(deltaX, deltaY)
  // Already (sub-pixel) at the target — nothing to move.
  if (distance < 1) return
  const durationSec = Math.min(distance / 2000, 0.5)
  if (durationSec < 0.03) {
    await moveAndSettle(input, targetX, targetY)
    return
  }
  const frameRate = 60
  const frameIntervalMs = 1000 / frameRate
  const totalFrames = Math.floor(durationSec * frameRate)
  for (let frame = 1; frame <= totalFrames; frame++) {
    const t = frame / totalFrames
    const eased = 1 - Math.pow(1 - t, 3)
    await input.moveMouse(
      Math.round(start.x + deltaX * eased),
      Math.round(start.y + deltaY * eased),
      false,
    )
    if (frame < totalFrames) {
      await sleep(frameIntervalMs)
    }
  }
  // Last frame has no trailing sleep — same HID round-trip before the
  // caller's mouseButton reads NSEvent.mouseLocation.
  await sleep(MOVE_SETTLE_MS)
}

// ── Factory ───────────────────────────────────────────────────────────────

/**
 * Build the CLI `ComputerExecutor`.
 *
 * @param opts.getMouseAnimationEnabled Read on each `drag` to decide whether
 *   the press→to motion is animated.
 * @param opts.getHideBeforeActionEnabled Read on each `prepareForAction`; when
 *   it returns false the pre-action hide step is skipped.
 * @throws Error when `process.platform` is not `'darwin'`.
 */
export function createCliExecutor(opts: {
  getMouseAnimationEnabled: () => boolean
  getHideBeforeActionEnabled: () => boolean
}): ComputerExecutor {
  if (process.platform !== 'darwin') {
    throw new Error(
      `createCliExecutor called on ${process.platform}. Computer control is macOS-only.`,
    )
  }

  // Swift loaded once at factory time — every executor method needs it.
  // Input loaded lazily via requireComputerUseInput() on first mouse/keyboard
  // call — it caches internally, so screenshot-only flows never pull the
  // enigo .node.
  const cu = requireComputerUseSwift()

  const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts
  const terminalBundleId = getTerminalBundleId()
  const surrogateHost = terminalBundleId ?? CLI_HOST_BUNDLE_ID
  // Swift 0.2.1's captureExcluding/captureRegion take an ALLOW list despite the
  // name (apps#30355 — complement computed Swift-side against running apps).
  // The terminal isn't in the user's grants so it's naturally excluded, but if
  // the package ever passes it through we strip it here so the terminal never
  // photobombs a screenshot.
  const withoutTerminal = (allowed: readonly string[]): string[] =>
    terminalBundleId === null
      ? [...allowed]
      : allowed.filter(id => id !== terminalBundleId)

  logForDebugging(
    terminalBundleId
      ? `[computer-use] terminal ${terminalBundleId} → surrogate host (hide-exempt, activate-skip, screenshot-excluded)`
      : '[computer-use] terminal not detected; falling back to sentinel host',
  )

  return {
    capabilities: {
      ...CLI_CU_CAPABILITIES,
      hostBundleId: CLI_HOST_BUNDLE_ID,
    },

    // ── Pre-action sequence (hide + defocus) ────────────────────────────

    async prepareForAction(
      allowlistBundleIds: string[],
      displayId?: number,
    ): Promise<string[]> {
      if (!getHideBeforeActionEnabled()) {
        return []
      }
      // prepareDisplay isn't @MainActor (plain Task{}), but its .hide() calls
      // trigger window-manager events that queue on CFRunLoop. Without the
      // pump, those pile up during Swift's ~1s of usleeps and flush all at
      // once when the next pumped call runs — visible window flashing.
      // Electron drains CFRunLoop continuously so Cowork doesn't see this.
      // Worst-case 100ms + 5×200ms safety-net ≈ 1.1s, well under the 30s
      // drainRunLoop ceiling.
      //
      // "Continue with action execution even if switching fails" — the
      // frontmost gate in toolCalls.ts catches any actual unsafe state.
      return drainRunLoop(async () => {
        try {
          const result = await cu.apps.prepareDisplay(
            allowlistBundleIds,
            surrogateHost,
            displayId,
          )
          if (result.activated) {
            logForDebugging(
              `[computer-use] prepareForAction: activated ${result.activated}`,
            )
          }
          return result.hidden
        } catch (err) {
          logForDebugging(
            `[computer-use] prepareForAction failed; continuing to action: ${errorMessage(err)}`,
            { level: 'warn' },
          )
          return []
        }
      })
    },

    /** Forwards to Swift's `previewHideSet` with the surrogate host appended to the allowlist. */
    async previewHideSet(
      allowlistBundleIds: string[],
      displayId?: number,
    ): Promise<Array<{ bundleId: string; displayName: string }>> {
      return cu.apps.previewHideSet(
        [...allowlistBundleIds, surrogateHost],
        displayId,
      )
    },

    // ── Display ──────────────────────────────────────────────────────────

    async getDisplaySize(displayId?: number): Promise<DisplayGeometry> {
      return cu.display.getSize(displayId)
    },

    async listDisplays(): Promise<DisplayGeometry[]> {
      return cu.display.listAll()
    },

    async findWindowDisplays(
      bundleIds: string[],
    ): Promise<Array<{ bundleId: string; displayIds: number[] }>> {
      return cu.apps.findWindowDisplays(bundleIds)
    },

    /**
     * Sizes the capture from the preferred display's geometry, then calls
     * Swift's `resolvePrepareCapture` under the run-loop pump with the
     * terminal stripped from the allow-list and passed as the host instead.
     */
    async resolvePrepareCapture(opts: {
      allowedBundleIds: string[]
      preferredDisplayId?: number
      autoResolve: boolean
      doHide?: boolean
    }): Promise<ResolvePrepareCaptureResult> {
      const d = cu.display.getSize(opts.preferredDisplayId)
      const [targetW, targetH] = computeTargetDims(
        d.width,
        d.height,
        d.scaleFactor,
      )
      return drainRunLoop(() =>
        cu.resolvePrepareCapture(
          withoutTerminal(opts.allowedBundleIds),
          surrogateHost,
          SCREENSHOT_JPEG_QUALITY,
          targetW,
          targetH,
          opts.preferredDisplayId,
          opts.autoResolve,
          opts.doHide,
        ),
      )
    },

    /**
     * Pre-size to `targetImageSize` output so the API transcoder's early-return
     * fires — no server-side resize, `scaleCoord` stays coherent. See
     * packages/desktop/computer-use-mcp/COORDINATES.md.
     */
    async screenshot(opts: {
      allowedBundleIds: string[]
      displayId?: number
    }): Promise<ScreenshotResult> {
      const d = cu.display.getSize(opts.displayId)
      const [targetW, targetH] = computeTargetDims(
        d.width,
        d.height,
        d.scaleFactor,
      )
      return drainRunLoop(() =>
        cu.screenshot.captureExcluding(
          withoutTerminal(opts.allowedBundleIds),
          SCREENSHOT_JPEG_QUALITY,
          targetW,
          targetH,
          opts.displayId,
        ),
      )
    },

    /**
     * Capture a logical-coordinate region. Output dims are derived from the
     * region's own width/height and the display's scale factor.
     */
    async zoom(
      regionLogical: { x: number; y: number; w: number; h: number },
      allowedBundleIds: string[],
      displayId?: number,
    ): Promise<{ base64: string; width: number; height: number }> {
      const d = cu.display.getSize(displayId)
      const [outW, outH] = computeTargetDims(
        regionLogical.w,
        regionLogical.h,
        d.scaleFactor,
      )
      return drainRunLoop(() =>
        cu.screenshot.captureRegion(
          withoutTerminal(allowedBundleIds),
          regionLogical.x,
          regionLogical.y,
          regionLogical.w,
          regionLogical.h,
          outW,
          outH,
          SCREENSHOT_JPEG_QUALITY,
          displayId,
        ),
      )
    },

    // ── Keyboard ─────────────────────────────────────────────────────────

    /**
     * xdotool-style sequence e.g. "ctrl+shift+a" → split on '+' and pass to
     * keys(). keys() dispatches to DispatchQueue.main — drainRunLoop pumps
     * CFRunLoop so it resolves. Rust's error-path cleanup (enigo_wrap.rs)
     * releases modifiers on each invocation, so a mid-loop throw leaves
     * nothing stuck. 8ms between iterations — 125Hz USB polling cadence.
     */
    async key(keySequence: string, repeat?: number): Promise<void> {
      const input = requireComputerUseInput()
      const parts = keySequence.split('+').filter(p => p.length > 0)
      // Bare-only: the CGEventTap checks event.flags.isEmpty so ctrl+escape
      // etc. pass through without aborting.
      const isEsc = isBareEscape(parts)
      const n = repeat ?? 1
      await drainRunLoop(async () => {
        for (let i = 0; i < n; i++) {
          if (i > 0) {
            await sleep(8)
          }
          if (isEsc) {
            notifyExpectedEscape()
          }
          await input.keys(parts)
        }
      })
    },

    async holdKey(keyNames: string[], durationMs: number): Promise<void> {
      const input = requireComputerUseInput()
      // Press/release each wrapped in drainRunLoop; the sleep sits outside so
      // durationMs isn't bounded by drainRunLoop's 30s timeout. `pressed`
      // tracks which presses landed so a mid-press throw still releases
      // everything that was actually pressed.
      //
      // `orphaned` guards against a timeout-orphan race: if the press-phase
      // drainRunLoop times out while the esc-hotkey pump-retain keeps the
      // pump running, the orphaned lambda would otherwise keep pressing keys
      // and pushing to `pressed` after finally's releasePressed has already
      // drained it — leaving keys stuck. The flag stops the lambda at the
      // next iteration.
      const pressed: string[] = []
      let orphaned = false
      try {
        await drainRunLoop(async () => {
          for (const k of keyNames) {
            if (orphaned) return
            // Bare Escape: notify the CGEventTap so it doesn't fire the
            // abort callback for a model-synthesized press. Same as key().
            if (isBareEscape([k])) {
              notifyExpectedEscape()
            }
            await input.key(k, 'press')
            pressed.push(k)
          }
        })
        await sleep(durationMs)
      } finally {
        orphaned = true
        await drainRunLoop(() => releasePressed(input, pressed))
      }
    },

    async type(text: string, opts: { viaClipboard: boolean }): Promise<void> {
      const input = requireComputerUseInput()
      if (opts.viaClipboard) {
        // keys(['command','v']) inside needs the pump.
        await drainRunLoop(() => typeViaClipboard(input, text))
        return
      }
      // `toolCalls.ts` handles the grapheme loop + 8ms sleeps and calls this
      // once per grapheme. typeText doesn't dispatch to the main queue.
      await input.typeText(text)
    },

    readClipboard: readClipboardViaPbpaste,

    writeClipboard: writeClipboardViaPbcopy,

    // ── Mouse ────────────────────────────────────────────────────────────

    async moveMouse(x: number, y: number): Promise<void> {
      await moveAndSettle(requireComputerUseInput(), x, y)
    },

    /**
     * Move, then click. Modifiers are press/release bracketed via withModifiers
     * — same pattern as Cowork. AppKit computes NSEvent.clickCount from timing
     * + position proximity, so double/triple click work without setting the
     * CGEvent clickState field. key() inside withModifiers needs the pump;
     * the modifier-less path doesn't.
     */
    async click(
      x: number,
      y: number,
      button: 'left' | 'right' | 'middle',
      count: 1 | 2 | 3,
      modifiers?: string[],
    ): Promise<void> {
      const input = requireComputerUseInput()
      await moveAndSettle(input, x, y)
      if (modifiers && modifiers.length > 0) {
        await drainRunLoop(() =>
          withModifiers(input, modifiers, () =>
            input.mouseButton(button, 'click', count),
          ),
        )
      } else {
        await input.mouseButton(button, 'click', count)
      }
    },

    async mouseDown(): Promise<void> {
      await requireComputerUseInput().mouseButton('left', 'press')
    },

    async mouseUp(): Promise<void> {
      await requireComputerUseInput().mouseButton('left', 'release')
    },

    async getCursorPosition(): Promise<{ x: number; y: number }> {
      return requireComputerUseInput().mouseLocation()
    },

    /**
     * `from === undefined` → drag from current cursor (training's
     * left_click_drag with start_coordinate omitted). Inner `finally`: the
     * button is ALWAYS released once the press has landed, even if the settle
     * or the move throws — otherwise the user's left button is stuck-pressed
     * until they physically click.
     * 50ms sleep after press: enigo's move_mouse reads NSEvent.pressedMouseButtons
     * to decide .leftMouseDragged vs .mouseMoved; the synthetic leftMouseDown
     * needs a HID-tap round-trip to show up there.
     */
    async drag(
      from: { x: number; y: number } | undefined,
      to: { x: number; y: number },
    ): Promise<void> {
      const input = requireComputerUseInput()
      if (from !== undefined) {
        await moveAndSettle(input, from.x, from.y)
      }
      // Press stays outside the try: if the press itself throws there is
      // nothing to release.
      await input.mouseButton('left', 'press')
      try {
        // Settle sits inside the try so every await after a successful press
        // is covered by the release below.
        await sleep(MOVE_SETTLE_MS)
        await animatedMove(input, to.x, to.y, getMouseAnimationEnabled())
      } finally {
        await input.mouseButton('left', 'release')
      }
    },

    /**
     * Move first, then scroll each axis. Vertical-first — it's the common
     * axis; a horizontal failure shouldn't lose the vertical.
     */
    async scroll(x: number, y: number, dx: number, dy: number): Promise<void> {
      const input = requireComputerUseInput()
      await moveAndSettle(input, x, y)
      if (dy !== 0) {
        await input.mouseScroll(dy, 'vertical')
      }
      if (dx !== 0) {
        await input.mouseScroll(dx, 'horizontal')
      }
    },

    // ── App management ───────────────────────────────────────────────────

    /** Returns null when the input module reports no frontmost app or an empty bundle id. */
    async getFrontmostApp(): Promise<FrontmostApp | null> {
      const info = requireComputerUseInput().getFrontmostAppInfo()
      if (!info || !info.bundleId) return null
      return { bundleId: info.bundleId, displayName: info.appName }
    },

    async appUnderPoint(
      x: number,
      y: number,
    ): Promise<{ bundleId: string; displayName: string } | null> {
      return cu.apps.appUnderPoint(x, y)
    },

    async listInstalledApps(): Promise<InstalledApp[]> {
      // `ComputerUseInstalledApp` is `{bundleId, displayName, path}`.
      // `InstalledApp` adds optional `iconDataUrl` — left unpopulated;
      // the approval dialog fetches lazily via getAppIcon() below.
      return drainRunLoop(() => cu.apps.listInstalled())
    },

    async getAppIcon(path: string): Promise<string | undefined> {
      // Normalize a nullish native result to `undefined`.
      return cu.apps.iconDataUrl(path) ?? undefined
    },

    async listRunningApps(): Promise<RunningApp[]> {
      return cu.apps.listRunning()
    },

    async openApp(bundleId: string): Promise<void> {
      await cu.apps.open(bundleId)
    },
  }
}

/**
 * Module-level export (not on the executor object) — called at turn-end from
 * `stopHooks.ts` / `query.ts`, outside the executor lifecycle. Fire-and-forget
 * at the call site; the caller `.catch()`es.
 */
export async function unhideComputerUseApps(
  bundleIds: readonly string[],
): Promise<void> {
  if (bundleIds.length === 0) return
  const cu = requireComputerUseSwift()
  await cu.apps.unhide([...bundleIds])
}