monitor-emacs.sh at main · aesthetic.computer/core

aesthetic.computer / core
fork atom
Monorepo for Aesthetic.Computer aesthetic.computer
fork atom
core / monitor-emacs.sh
at main 264 lines 8.2 kB view raw
wrap content
Jeffrey Alan Scudder restore 4270 files deleted by papers oven auto-build (aaa62f4a1) 19d ago
ea1fd8af
  1#!/usr/bin/env bash
  2# DEPRECATED: Use ac-emacs-crash-monitor (fish) instead.
  3# The fish monitor has startup-lock awareness, correct config on restart,
  4# load-aware timeouts, and CPU monitoring. Run: ac-emacs-start-monitor
  5#
  6# Emacs Watchdog - monitors emacs daemon health and auto-recovers from hangs
  7# Run: ./monitor-emacs.sh (foreground) or ./monitor-emacs.sh & (background)
  8# Stop: kill $(cat /tmp/emacs-watchdog.pid) or ac-watchdog-stop
  9
 10CHECK_INTERVAL="${WATCHDOG_INTERVAL:-10}"  # Check every N seconds
 11CPU_THRESHOLD="${WATCHDOG_CPU:-85}"        # CPU % threshold to consider "stuck"
 12CPU_SAMPLES="${WATCHDOG_SAMPLES:-3}"       # Consecutive high-CPU samples before action
 13TIMEOUT_THRESHOLD=5                         # Seconds to wait for emacsclient response
 14
 15LOG_DIR="/workspaces/aesthetic-computer/.emacs-logs"
 16LOG_FILE="$LOG_DIR/watchdog.log"
 17PID_FILE="/tmp/emacs-watchdog.pid"
 18WARNING_FILE="/tmp/emacs-watchdog-warning"
 19
 20mkdir -p "$LOG_DIR"
 21
 22log() {
 23    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
 24}
 25
 26# Track high CPU samples
 27high_cpu_count=0
 28
 29check_emacs_health() {
 30    local daemon_pid
 31    daemon_pid=$(pgrep -f "emacs.*daemon" 2>/dev/null | head -1)
 32    
 33    if [ -z "$daemon_pid" ]; then
 34        echo "not_running"
 35        return
 36    fi
 37    
 38    # Check if responsive (quick eval)
 39    if ! timeout "$TIMEOUT_THRESHOLD" emacsclient -e "t" >/dev/null 2>&1; then
 40        echo "unresponsive:$daemon_pid"
 41        return
 42    fi
 43    
 44    # Check CPU usage
 45    local cpu
 46    cpu=$(ps -p "$daemon_pid" -o %cpu= 2>/dev/null | tr -d ' ' | cut -d. -f1)
 47    
 48    if [ -n "$cpu" ] && [ "$cpu" -gt "$CPU_THRESHOLD" ]; then
 49        echo "high_cpu:$cpu:$daemon_pid"
 50        return
 51    fi
 52    
 53    echo "healthy:$daemon_pid"
 54}
 55
 56restart_emacs() {
 57    local reason="$1"
 58    log "🔄 WATCHDOG: Restarting emacs daemon (reason: $reason)"
 59    
 60    # Kill all emacs processes
 61    pkill -9 -f "emacs.*daemon" 2>/dev/null
 62    pkill -9 emacs 2>/dev/null
 63    pkill -9 emacsclient 2>/dev/null
 64    sleep 2
 65    
 66    # Start fresh daemon
 67    log "🚀 WATCHDOG: Starting fresh emacs daemon..."
 68    emacs --daemon 2>&1 | head -10 >> "$LOG_FILE"
 69    
 70    sleep 2
 71    
 72    # Verify
 73    if timeout 5 emacsclient -e "t" >/dev/null 2>&1; then
 74        log "✅ WATCHDOG: Emacs daemon restarted successfully"
 75        # Trigger aesthetic-backend so tabs/terminals come back after crash
 76        if timeout 15 emacsclient -e "(aesthetic-backend \"artery\")" >/dev/null 2>&1; then
 77            log "🧭 WATCHDOG: aesthetic-backend triggered after restart"
 78        else
 79            log "⚠️  WATCHDOG: Failed to trigger aesthetic-backend after restart"
 80        fi
 81        show_warning "$reason"
 82        return 0
 83    else
 84        log "❌ WATCHDOG: Emacs daemon failed to restart"
 85        return 1
 86    fi
 87}
 88
 89show_warning() {
 90    local reason="$1"
 91    local timestamp
 92    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
 93    
 94    # Write warning file for artery TUI to detect
 95    cat > "$WARNING_FILE" << EOF
 96{
 97  "timestamp": "$timestamp",
 98  "reason": "$reason",
 99  "message": "Emacs daemon was auto-restarted. Restart the '💻 Aesthetic' task to reconnect.",
100  "acknowledged": false
101}
102EOF
103    
104    # Try ac-notify if available
105    if [ -x "/workspaces/aesthetic-computer/ac-notify" ]; then
106        /workspaces/aesthetic-computer/ac-notify "⚠️ Emacs Recovered" "Watchdog: $reason. Restart the Aesthetic task." 2>/dev/null &
107    fi
108    
109    # Ring terminal bell
110    printf '\a' 2>/dev/null
111    
112    log "⚠️  WARNING: $reason - user notification sent"
113}
114
# Print a human-readable report: the watchdog settings, the current emacs
# health (one probe via check_emacs_health) and whether the PID recorded in
# $PID_FILE belongs to a live process.
show_status() {
    local status cpu pid wpid

    status=$(check_emacs_health)

    echo "=== Emacs Watchdog Status ==="
    echo "PID File: $PID_FILE"
    echo "Log File: $LOG_FILE"
    echo "Check Interval: ${CHECK_INTERVAL}s"
    echo "CPU Threshold: ${CPU_THRESHOLD}%"
    echo "CPU Samples: ${CPU_SAMPLES}"
    echo ""

    case "$status" in
        not_running)
            echo "Emacs: 🔴 Not running"
            ;;
        unresponsive:*)
            echo "Emacs: ⚠️  Unresponsive (PID: ${status#unresponsive:})"
            ;;
        high_cpu:*:*)
            # Token layout is high_cpu:<pct>:<pid>.
            cpu="${status#high_cpu:}"
            cpu="${cpu%%:*}"
            pid="${status##*:}"
            echo "Emacs: ⚠️  High CPU ${cpu}% (PID: $pid)"
            ;;
        healthy:*)
            echo "Emacs: ✅ Healthy (PID: ${status#healthy:})"
            ;;
        *)
            # Never leave the emacs line out when the probe prints
            # something unexpected.
            echo "Emacs: ❓ Unknown status ($status)"
            ;;
    esac

    if [ -f "$PID_FILE" ]; then
        wpid=$(cat "$PID_FILE" 2>/dev/null)
        # Only hand ps a plain positive integer; anything else is stale.
        if [[ "$wpid" =~ ^[1-9][0-9]*$ ]] && ps -p "$wpid" >/dev/null 2>&1; then
            echo "Watchdog: 🟢 Running (PID: $wpid)"
        else
            echo "Watchdog: 🔴 Dead (stale PID file)"
        fi
    else
        echo "Watchdog: 🔴 Not running"
    fi
}
157
# Watchdog main loop: record our PID, then probe the daemon every
# $CHECK_INTERVAL seconds forever. An unresponsive daemon is restarted at
# once; high CPU is restarted only after $CPU_SAMPLES consecutive high
# readings. A daemon that is not running is left alone.
# Globals: PID_FILE, CHECK_INTERVAL, CPU_THRESHOLD, CPU_SAMPLES,
#          TIMEOUT_THRESHOLD (read); high_cpu_count (read and written).
main_loop() {
    local status pid cpu

    # Write PID file
    echo $$ > "$PID_FILE"

    log "🐕 WATCHDOG: Starting emacs health monitor (PID: $$)"
    log "   Check interval: ${CHECK_INTERVAL}s, CPU threshold: ${CPU_THRESHOLD}%, Samples needed: ${CPU_SAMPLES}"

    while true; do
        status=$(check_emacs_health)

        case "$status" in
            "not_running")
                # Daemon not running - probably intentional, don't log spam
                high_cpu_count=0
                ;;
            unresponsive:*)
                pid="${status#unresponsive:}"
                log "⚠️  WATCHDOG: Emacs daemon (PID: $pid) UNRESPONSIVE - restarting"
                high_cpu_count=0
                restart_emacs "unresponsive (timeout after ${TIMEOUT_THRESHOLD}s)"
                ;;
            high_cpu:*:*)
                # Token layout is high_cpu:<pct>:<pid>.
                cpu="${status#high_cpu:}"
                cpu="${cpu%%:*}"
                pid="${status##*:}"
                high_cpu_count=$((high_cpu_count + 1))
                log "⚠️  WATCHDOG: High CPU detected (${cpu}%) on PID $pid - sample $high_cpu_count/$CPU_SAMPLES"

                if [ "$high_cpu_count" -ge "$CPU_SAMPLES" ]; then
                    log "🔥 WATCHDOG: Sustained high CPU for ${CPU_SAMPLES} checks - restarting"
                    restart_emacs "sustained high CPU (${cpu}% for ${CPU_SAMPLES} samples)"
                    high_cpu_count=0
                fi
                ;;
            healthy:*)
                if [ "$high_cpu_count" -gt 0 ]; then
                    log "✅ WATCHDOG: CPU normalized after $high_cpu_count samples"
                fi
                high_cpu_count=0
                ;;
        esac

        # Sleep in the background and wait on it: bash defers a trap until
        # the foreground command finishes, so a plain `sleep` would delay
        # the shutdown handler by up to a full interval. `wait` returns as
        # soon as a trapped signal arrives.
        sleep "$CHECK_INTERVAL" &
        wait "$!"
    done
}
203
# Signal handler: log the shutdown, drop our PID file and exit 0.
# The PID file is removed only while it still records this process ($$).
# If it already names another process (for example a newer watchdog),
# deleting it would make that process invisible to `status`, `stop` and
# the already-running check.
cleanup() {
    local recorded_pid=""

    log "🛑 WATCHDOG: Shutting down (signal received)"

    if [ -f "$PID_FILE" ]; then
        read -r recorded_pid < "$PID_FILE" 2>/dev/null
    fi
    if [ "$recorded_pid" = "$$" ]; then
        rm -f "$PID_FILE"
    fi
    exit 0
}
209
# Run a one-shot sub-command and exit, or return so the watchdog can start.
# Arguments: $1 - command word (status | stop | help, plus the flag
#                 spellings below); anything else returns without action.
dispatch_command() {
    local wpid

    case "${1:-}" in
        status|--status|-s)
            show_status
            exit 0
            ;;
        stop|--stop)
            if [ -f "$PID_FILE" ]; then
                wpid=$(cat "$PID_FILE")
                # Signal only a plain positive PID: the PID file is at a
                # fixed path under /tmp, and values such as "0" or "-1"
                # would address a whole process group or every process.
                if [[ "$wpid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$wpid" 2>/dev/null; then
                    kill "$wpid"
                    echo "Watchdog (PID: $wpid) stopped"
                    rm -f "$PID_FILE"
                else
                    echo "Watchdog not running (stale PID file)"
                    rm -f "$PID_FILE"
                fi
            else
                echo "Watchdog not running (no PID file)"
            fi
            exit 0
            ;;
        help|--help|-h)
            printf '%s\n' \
                "Usage: $0 [command]" \
                "" \
                "Commands:" \
                "  (none)    Start watchdog in foreground" \
                "  status    Show emacs and watchdog status" \
                "  stop      Stop running watchdog" \
                "  help      Show this help" \
                "" \
                "Environment variables:" \
                "  WATCHDOG_INTERVAL  Check interval in seconds (default: 10)" \
                "  WATCHDOG_CPU       CPU threshold percentage (default: 85)" \
                "  WATCHDOG_SAMPLES   High-CPU samples before restart (default: 3)"
            exit 0
            ;;
    esac
}

# Parse arguments
dispatch_command "${1:-}"

# Handle signals gracefully. Installed only after the one-shot commands
# have exited, so interrupting `status`, `stop` or `help` never runs the
# watchdog's shutdown handler.
trap cleanup SIGTERM SIGINT SIGHUP
251
# Check if already running: refuse to start a second watchdog while the PID
# recorded in $PID_FILE belongs to a live process; otherwise discard the
# stale file and carry on.
if [ -f "$PID_FILE" ]; then
    existing_pid=$(cat "$PID_FILE")
    # Only a plain positive integer is handed to ps; empty or malformed
    # content is treated as a stale file.
    if [[ "$existing_pid" =~ ^[1-9][0-9]*$ ]] && ps -p "$existing_pid" >/dev/null 2>&1; then
        echo "Watchdog already running (PID: $existing_pid)"
        echo "Use '$0 stop' to stop it first, or '$0 status' to check status"
        exit 1
    else
        rm -f "$PID_FILE"
    fi
fi

main_loop