utils/nativeInstaller/pidLock.ts at main

oppi.li / claude-code
fork atom
source dump of claude code
fork atom
claude-code / utils / nativeInstaller / pidLock.ts
at main 433 lines 12 kB view raw
wrap content
oppi.li dump from zip 5d ago
63aada3f
/**
 * PID-Based Version Locking
 *
 * This module provides PID-based locking for running Claude Code versions.
 * Unlike mtime-based locking (which can hold locks for 30 days after a crash),
 * PID-based locking can immediately detect when a process is no longer running.
 *
 * Lock files contain JSON with the PID and metadata, and staleness is determined
 * by checking if the process is still alive.
 */
 11
 12import { basename, join } from 'path'
 13import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
 14import { logForDebugging } from '../debug.js'
 15import { isEnvDefinedFalsy, isEnvTruthy } from '../envUtils.js'
 16import { isENOENT, toError } from '../errors.js'
 17import { getFsImplementation } from '../fsOperations.js'
 18import { getProcessCommand } from '../genericProcessUtils.js'
 19import { logError } from '../log.js'
 20import {
 21  jsonParse,
 22  jsonStringify,
 23  writeFileSync_DEPRECATED,
 24} from '../slowOperations.js'
 25
 26/**
 27 * Check if PID-based version locking is enabled.
 28 * When disabled, falls back to mtime-based locking (30-day timeout).
 29 *
 30 * Controlled by GrowthBook gate with local override:
 31 * - Set ENABLE_PID_BASED_VERSION_LOCKING=true to force-enable
 32 * - Set ENABLE_PID_BASED_VERSION_LOCKING=false to force-disable
 33 * - If unset, GrowthBook gate (tengu_pid_based_version_locking) controls rollout
 34 */
 35export function isPidBasedLockingEnabled(): boolean {
 36  const envVar = process.env.ENABLE_PID_BASED_VERSION_LOCKING
 37  // If env var is explicitly set, respect it
 38  if (isEnvTruthy(envVar)) {
 39    return true
 40  }
 41  if (isEnvDefinedFalsy(envVar)) {
 42    return false
 43  }
 44  // GrowthBook controls gradual rollout (returns false for external users)
 45  return getFeatureValue_CACHED_MAY_BE_STALE(
 46    'tengu_pid_based_version_locking',
 47    false,
 48  )
 49}
 50
 51/**
 52 * Content stored in a version lock file
 53 */
 54export type VersionLockContent = {
 55  pid: number
 56  version: string
 57  execPath: string
 58  acquiredAt: number // timestamp when lock was acquired
 59}
 60
 61/**
 62 * Information about a lock for diagnostic purposes
 63 */
 64export type LockInfo = {
 65  version: string
 66  pid: number
 67  isProcessRunning: boolean
 68  execPath: string
 69  acquiredAt: Date
 70  lockFilePath: string
 71}
 72
 73// Fallback stale timeout (2 hours) - used when PID check is inconclusive
 74// This is much shorter than the previous 30-day timeout but still allows
 75// for edge cases like network filesystems where PID check might fail
 76const FALLBACK_STALE_MS = 2 * 60 * 60 * 1000
 77
 78/**
 79 * Check if a process with the given PID is currently running
 80 * Uses signal 0 which doesn't actually send a signal but checks if we can
 81 */
 82export function isProcessRunning(pid: number): boolean {
 83  // PID 0 is special - it refers to the current process group, not a real process
 84  // PID 1 is init/systemd and is always running but shouldn't be considered for locks
 85  if (pid <= 1) {
 86    return false
 87  }
 88
 89  try {
 90    process.kill(pid, 0)
 91    return true
 92  } catch {
 93    return false
 94  }
 95}
 96
 97/**
 98 * Validate that a running process is actually a Claude process
 99 * This helps mitigate PID reuse issues
100 */
101function isClaudeProcess(pid: number, expectedExecPath: string): boolean {
102  if (!isProcessRunning(pid)) {
103    return false
104  }
105
106  // If the PID matches our current process, we know it's valid
107  // This handles test environments where the command might not contain 'claude'
108  if (pid === process.pid) {
109    return true
110  }
111
112  try {
113    const command = getProcessCommand(pid)
114    if (!command) {
115      // If we can't get the command, trust the PID check
116      // This is conservative - we'd rather not delete a running version
117      return true
118    }
119
120    // Check if the command contains 'claude' or the expected exec path
121    const normalizedCommand = command.toLowerCase()
122    const normalizedExecPath = expectedExecPath.toLowerCase()
123
124    return (
125      normalizedCommand.includes('claude') ||
126      normalizedCommand.includes(normalizedExecPath)
127    )
128  } catch {
129    // If command check fails, trust the PID check
130    return true
131  }
132}
133
/**
 * Read and parse a lock file's content.
 *
 * The parsed JSON is treated as untrusted input and narrowed explicitly
 * instead of being cast: `pid` must be an integer and `version` / `execPath`
 * must be non-empty strings. `acquiredAt` is deliberately not validated, so
 * lock files lacking it remain readable.
 *
 * @param lockFilePath - path of the lock file to read
 * @returns the lock content, or null when the file is missing, unreadable,
 *   empty, not valid JSON, or fails validation
 */
export function readLockContent(
  lockFilePath: string,
): VersionLockContent | null {
  const fs = getFsImplementation()

  try {
    const raw = fs.readFileSync(lockFilePath, { encoding: 'utf8' })
    if (!raw || raw.trim() === '') {
      return null
    }

    const parsed: unknown = jsonParse(raw)
    if (typeof parsed !== 'object' || parsed === null) {
      return null
    }

    const { pid, version, execPath } = parsed as Partial<
      Record<keyof VersionLockContent, unknown>
    >

    // Validate required fields, including their types: a non-string execPath
    // or version would otherwise leak past the declared return type.
    const hasValidPid = typeof pid === 'number' && Number.isInteger(pid)
    const hasValidVersion = typeof version === 'string' && version !== ''
    const hasValidExecPath = typeof execPath === 'string' && execPath !== ''
    if (!hasValidPid || !hasValidVersion || !hasValidExecPath) {
      return null
    }

    return parsed as VersionLockContent
  } catch {
    // Missing file, read failure, or malformed JSON: no usable lock content.
    return null
  }
}
160
/**
 * Determine whether a lock file is held by a process that is still alive.
 *
 * @param lockFilePath - path of the lock file to inspect
 * @returns false when the lock cannot be read, its PID is not running, or the
 *   PID does not look like a Claude process; true otherwise
 */
export function isLockActive(lockFilePath: string): boolean {
  const lock = readLockContent(lockFilePath)
  if (!lock) return false

  const ownerPid = lock.pid

  // First gate: the recorded owner must still exist.
  if (!isProcessRunning(ownerPid)) return false

  // Second gate: guard against PID reuse by an unrelated program.
  const looksLikeClaude = isClaudeProcess(ownerPid, lock.execPath)
  if (!looksLikeClaude) {
    logForDebugging(
      `Lock PID ${ownerPid} is running but does not appear to be Claude - treating as stale`,
    )
    return false
  }

  // Fallback for old locks (> 2 hours by mtime): probe the owner once more
  // before reporting the lock as active.
  // NOTE(review): this is only a second PID probe; a lock whose owner is
  // still visible is never expired on age alone.
  const fs = getFsImplementation()
  try {
    const { mtimeMs } = fs.statSync(lockFilePath)
    const isOld = Date.now() - mtimeMs > FALLBACK_STALE_MS
    if (isOld && !isProcessRunning(ownerPid)) {
      return false
    }
  } catch {
    // stat failed - fall back to the PID checks that already passed
  }

  return true
}
206
/**
 * Persist lock content by writing a temporary sibling file and renaming it
 * over the lock path, so readers never observe a partially written lock.
 *
 * @param lockFilePath - final path of the lock file
 * @param content - lock payload, serialized as pretty-printed JSON
 * @throws rethrows any write or rename error after attempting to remove the
 *   temporary file
 */
function writeLockFile(
  lockFilePath: string,
  content: VersionLockContent,
): void {
  const fs = getFsImplementation()
  // PID + timestamp keep the staging name distinct per writer.
  const stagingPath = [lockFilePath, 'tmp', process.pid, Date.now()].join('.')

  try {
    const serialized = jsonStringify(content, null, 2)
    writeFileSync_DEPRECATED(stagingPath, serialized, {
      encoding: 'utf8',
      flush: true,
    })
    fs.renameSync(stagingPath, lockFilePath)
  } catch (error) {
    // Best-effort removal of the staging file before surfacing the failure.
    try {
      fs.unlinkSync(stagingPath)
    } catch {
      // ENOENT is expected if the write failed before the file was created
    }
    throw error
  }
}
233
/**
 * Attempt to take the lock for a version.
 *
 * NOTE(review): acquisition is check -> write (rename) -> read-back. This is
 * not an exclusive create: two processes that both pass the initial check can
 * each overwrite the file and each pass the read-back at different moments.
 *
 * @param versionPath - path of the version being locked; its basename is
 *   stored in the lock and used in debug messages
 * @param lockFilePath - path of the lock file
 * @returns a release function on success; null when the lock is already
 *   actively held (including by this process), another writer won, or
 *   writing the lock failed
 */
export async function tryAcquireLock(
  versionPath: string,
  lockFilePath: string,
): Promise<(() => void) | null> {
  const fs = getFsImplementation()
  const versionName = basename(versionPath)

  // Same activity test as cleanup uses: PID alive AND recognisably Claude.
  if (isLockActive(lockFilePath)) {
    const holder = readLockContent(lockFilePath)
    logForDebugging(
      `Cannot acquire lock for ${versionName} - held by PID ${holder?.pid}`,
    )
    return null
  }

  const ownership: VersionLockContent = {
    pid: process.pid,
    version: versionName,
    execPath: process.execPath,
    acquiredAt: Date.now(),
  }

  // Removes the lock file, but only while it still names this process.
  const releaseLock = (): void => {
    try {
      const current = readLockContent(lockFilePath)
      if (current?.pid !== process.pid) return

      fs.unlinkSync(lockFilePath)
      logForDebugging(`Released PID lock for ${versionName}`)
    } catch (error) {
      logForDebugging(`Failed to release lock for ${versionName}: ${error}`)
    }
  }

  try {
    writeLockFile(lockFilePath, ownership)

    // Read back to see whether a concurrent writer replaced our file.
    const written = readLockContent(lockFilePath)
    if (written?.pid !== process.pid) {
      return null
    }

    logForDebugging(`Acquired PID lock for ${versionName} (PID ${process.pid})`)
    return releaseLock
  } catch (error) {
    logForDebugging(`Failed to acquire lock for ${versionName}: ${error}`)
    return null
  }
}
294
/**
 * Acquire a lock and hold it for the lifetime of the process.
 * This is used for locking the currently running version.
 *
 * The lock is released from the process 'exit' event only. No SIGINT/SIGTERM
 * listeners are installed here, for two reasons:
 * - Adding a listener for those signals removes Node's default
 *   terminate-on-signal behaviour, so this module could keep the process
 *   alive after Ctrl+C / kill when no other handler exits explicitly.
 * - Releasing on signal receipt drops the lock even if the process keeps
 *   running, which contradicts "held for the lifetime of the process".
 *
 * If the process dies without running 'exit' handlers, the lock file is left
 * behind, but its PID no longer passes isLockActive and it is treated as
 * stale.
 *
 * @param versionPath - path of the version being locked
 * @param lockFilePath - path of the lock file
 * @returns true if the lock was acquired, false if it could not be
 */
export async function acquireProcessLifetimeLock(
  versionPath: string,
  lockFilePath: string,
): Promise<boolean> {
  const release = await tryAcquireLock(versionPath, lockFilePath)

  if (!release) {
    return false
  }

  // 'exit' fires on process.exit() and when the event loop drains, which
  // covers every orderly shutdown path, including signal handlers registered
  // elsewhere that end by exiting.
  process.on('exit', () => {
    try {
      release()
    } catch {
      // Ignore errors during process exit
    }
  })

  // Don't call release() here - the lock must outlive this function.
  return true
}
325
/**
 * Run a callback while holding the lock, releasing it afterwards whether the
 * callback succeeds or throws.
 *
 * @param versionPath - path of the version being locked
 * @param lockFilePath - path of the lock file
 * @param callback - work to perform while the lock is held
 * @returns true once the callback has completed; false if the lock could not
 *   be acquired (the callback is not invoked in that case)
 * @throws whatever the callback throws, after the lock has been released
 */
export async function withLock(
  versionPath: string,
  lockFilePath: string,
  callback: () => void | Promise<void>,
): Promise<boolean> {
  const release = await tryAcquireLock(versionPath, lockFilePath)

  if (release) {
    try {
      await callback()
    } finally {
      release()
    }
    return true
  }

  return false
}
348
/**
 * Collect diagnostic information for every readable `.lock` file in a
 * directory.
 *
 * @param locksDir - directory containing lock files
 * @returns one entry per lock file whose content could be read and validated;
 *   an empty list when the directory does not exist. Other errors are logged
 *   and whatever was collected so far is returned.
 */
export function getAllLockInfo(locksDir: string): LockInfo[] {
  const fs = getFsImplementation()
  const collected: LockInfo[] = []

  try {
    const names = fs.readdirStringSync(locksDir)

    for (const name of names) {
      if (!name.endsWith('.lock')) continue

      const lockFilePath = join(locksDir, name)
      const lock = readLockContent(lockFilePath)
      // Unreadable or invalid lock files are left out of the report.
      if (!lock) continue

      const { version, pid, execPath, acquiredAt } = lock
      collected.push({
        version,
        pid,
        isProcessRunning: isProcessRunning(pid),
        execPath,
        acquiredAt: new Date(acquiredAt),
        lockFilePath,
      })
    }
  } catch (error) {
    // A missing locks directory simply means there is nothing to report.
    if (!isENOENT(error)) {
      logError(toError(error))
    }
  }

  return collected
}
385
/**
 * Remove lock entries that no longer protect a running process.
 *
 * Two kinds of `.lock` entries are handled:
 * - directories (legacy proper-lockfile locks from mtime-based locking) are
 *   removed unconditionally
 * - files (PID-based locks) are removed when isLockActive reports them
 *   inactive
 *
 * @param locksDir - directory containing lock entries
 * @returns the number of entries removed; 0 when the directory does not exist
 */
export function cleanupStaleLocks(locksDir: string): number {
  const fs = getFsImplementation()
  let removed = 0

  try {
    const candidates = fs
      .readdirStringSync(locksDir)
      .filter((name: string) => name.endsWith('.lock'))

    for (const name of candidates) {
      const entryPath = join(locksDir, name)

      try {
        const isLegacyDirectory = fs.lstatSync(entryPath).isDirectory()

        if (isLegacyDirectory) {
          // Left over from a different locking mechanism - always discard.
          fs.rmSync(entryPath, { recursive: true, force: true })
          removed++
          logForDebugging(`Cleaned up legacy directory lock: ${name}`)
          continue
        }

        if (isLockActive(entryPath)) continue

        // PID-based file lock whose owner is gone.
        fs.unlinkSync(entryPath)
        removed++
        logForDebugging(`Cleaned up stale lock: ${name}`)
      } catch {
        // A failure on one entry must not stop the sweep.
      }
    }
  } catch (error) {
    if (isENOENT(error)) {
      return 0
    }
    logError(toError(error))
  }

  return removed
}