// my own status page
// (viewed at branch `main`; original file: 201 lines, 7.8 kB)
import type { Env } from "./types";
import { getManifest } from "./manifest";
import { checkHealth } from "./health";
import {
  insertPing,
  getLatestPing,
  pruneOldPings,
  createIncident,
  updateIncident,
  addIncidentUpdate,
  getActiveIncidentForService,
  getActiveIncidents,
  getRecentlyResolvedIncident,
  getRecentlyResolvedIncidents,
  setIncidentGitHub,
} from "./db";
import { refreshDevices } from "./tailscale";
import { handleStatusRoute } from "./routes/status";
import { handleFavicon } from "./routes/favicon";
import { handleUptime } from "./routes/uptime";
import { handleBadgeRoute } from "./routes/badge";
import { handleIndex } from "./routes/index";
import { handleIncidentRoute } from "./routes/incidents";
import {
  createIssue,
  assignIssue,
  commentOnIssue,
  closeIssue,
  parseRepo,
  syncGitHubIncidents,
} from "./github";
import { schemas } from "./schemas";

/** Number of consecutive failed checks required before an incident is opened. */
const FAILURE_THRESHOLD = 2;

/** `expirationTtl` (seconds) applied to the per-service failure counter in KV. */
const FAILURE_COUNTER_TTL_SECONDS = 1800;

/** No new incident is opened if one was resolved for the same service within this window (15 min). */
const INCIDENT_COOLDOWN_SECONDS = 900;

/**
 * Routes an incoming HTTP request to the matching handler.
 *
 * Prefix-matched routes (`/api/status`, `/badge`, `/api/incidents`) delegate to
 * a sub-router; when that sub-router returns a falsy value the request falls
 * through to the remaining routes and finally to a plain 404.
 *
 * @param request - The incoming request.
 * @param env - Worker bindings.
 * @returns The response produced by the matching route, or a 404 response.
 */
async function handleRequest(request: Request, env: Env): Promise<Response> {
  const url = new URL(request.url);
  const path = url.pathname;

  if (path === "/" || path === "") {
    return handleIndex(env);
  }

  if (path === "/favicon.svg") {
    return handleFavicon(env);
  }

  if (path === "/health") {
    return Response.json({ ok: true, timestamp: new Date().toISOString() });
  }

  if (path === "/api/schemas") {
    return Response.json(schemas);
  }

  const schemaMatch = path.match(/^\/api\/schemas\/(.+)$/);
  if (schemaMatch) {
    const schemaName = schemaMatch[1];
    // Own-property check: a bare `schemas[name]` lookup would also resolve
    // inherited keys such as "__proto__" or "constructor".
    if (schemaName !== undefined && Object.prototype.hasOwnProperty.call(schemas, schemaName)) {
      const schema = schemas[schemaName];
      if (schema) {
        return Response.json(schema);
      }
    }
    return Response.json({ error: "schema not found" }, { status: 404 });
  }

  if (path.startsWith("/api/status")) {
    const res = await handleStatusRoute(env, path);
    if (res) return res;
  }

  const uptimeMatch = path.match(/^\/api\/uptime\/(.+)$/);
  if (uptimeMatch) {
    return handleUptime(env, uptimeMatch[1], url);
  }

  if (path.startsWith("/badge")) {
    const badge = await handleBadgeRoute(env, path, url);
    if (badge) return badge;
  }

  if (path.startsWith("/api/incidents")) {
    const res = await handleIncidentRoute(request, env, path);
    if (res) return res;
  }

  return new Response("Not Found", { status: 404 });
}

export default {
  /**
   * HTTP entry point. Answers `OPTIONS` requests directly with CORS headers
   * and adds `Access-Control-Allow-Origin: *` to every other response.
   */
  async fetch(request: Request, env: Env): Promise<Response> {
    if (request.method === "OPTIONS") {
      return new Response(null, {
        headers: {
          "Access-Control-Allow-Origin": "*",
          "Access-Control-Allow-Methods": "GET, POST, PATCH, OPTIONS",
          "Access-Control-Allow-Headers": "Content-Type, Authorization",
        },
      });
    }

    const response = await handleRequest(request, env);
    // Re-wrap the response so the header can be set on a fresh Headers object.
    const corsResponse = new Response(response.body, response);
    corsResponse.headers.set("Access-Control-Allow-Origin", "*");
    return corsResponse;
  },

  /**
   * Cron entry point. For every service in the manifest that has a
   * `health_url` it records a ping, then opens or resolves incidents:
   *
   * - down/timeout: bump a failure counter in KV; once it reaches
   *   {@link FAILURE_THRESHOLD}, and there is neither an active incident nor
   *   one resolved within {@link INCIDENT_COOLDOWN_SECONDS}, create an
   *   incident, optionally a GitHub issue, and optionally notify the
   *   machine's triage webhook.
   * - otherwise: clear the failure counter and resolve any active incident
   *   (commenting on and closing its GitHub issue, if linked).
   *
   * Afterwards old pings are pruned and GitHub issue state is synced back.
   *
   * @throws Error after all work is done if processing of any service failed,
   *   so the run is still reported as failed.
   */
  async scheduled(_controller: ScheduledController, env: Env): Promise<void> {
    const [manifest] = await Promise.all([getManifest(env), refreshDevices(env)]);

    // Best-effort side work (webhook, GitHub comment/close). It must not block
    // the checks, but it is awaited before the handler returns so it is not
    // dropped when the scheduled invocation ends.
    const background: Promise<unknown>[] = [];

    const checks = Object.values(manifest).flatMap((machine) => {
      const triageUrl = machine.triage_url;
      return machine.services
        .filter((svc) => svc.health_url)
        .map(async (svc) => {
          const result = await checkHealth(svc);
          await insertPing(env.DB, svc.name, result.status, result.latency_ms);

          const failKey = `triage:${svc.name}:failures`;
          const isDown = result.status === "down" || result.status === "timeout";

          if (!isDown) {
            // Clear the failure counter (only if one exists, to avoid unnecessary KV delete ops).
            if (await env.KV.get(failKey)) {
              await env.KV.delete(failKey);
            }

            // Auto-resolve an active incident, if any.
            const active = await getActiveIncidentForService(env.DB, svc.name);
            if (!active) return;

            await updateIncident(env.DB, active.id, {
              status: "resolved",
              resolved_at: Math.floor(Date.now() / 1000),
            });
            await addIncidentUpdate(env.DB, active.id, "resolved", "Service recovered automatically");

            // Comment on, then close, the linked GitHub issue.
            const token = env.GITHUB_TOKEN;
            const repoSlug = active.github_repo;
            const issueNumber = active.github_issue_number;
            if (token && repoSlug && issueNumber) {
              const parsed = parseRepo(`https://github.com/${repoSlug}`);
              if (parsed) {
                const { owner, repo } = parsed;
                background.push(
                  (async () => {
                    // Sequential so the comment lands before the issue is closed.
                    try {
                      await commentOnIssue(token, owner, repo, issueNumber, "Service recovered automatically. Closing issue.");
                    } catch (err) {
                      console.error(`failed to comment on ${owner}/${repo}#${issueNumber}:`, err);
                    }
                    try {
                      await closeIssue(token, owner, repo, issueNumber);
                    } catch (err) {
                      console.error(`failed to close ${owner}/${repo}#${issueNumber}:`, err);
                    }
                  })(),
                );
              }
            }
            return;
          }

          // Track consecutive failures in KV for flap prevention.
          const stored = Number.parseInt((await env.KV.get(failKey)) ?? "0", 10);
          // A non-numeric stored value restarts the count instead of writing "NaN".
          const failures = (Number.isNaN(stored) ? 0 : stored) + 1;
          await env.KV.put(failKey, String(failures), { expirationTtl: FAILURE_COUNTER_TTL_SECONDS });

          // Only trigger after enough consecutive failures. The original note
          // says this is "10 min of downtime", which implies a 5-minute cron
          // interval — confirm against the worker's cron configuration.
          if (failures < FAILURE_THRESHOLD) return;

          // One incident per service at a time.
          if (await getActiveIncidentForService(env.DB, svc.name)) return;

          // Cooldown: skip if an incident for this service was resolved recently.
          if (await getRecentlyResolvedIncident(env.DB, svc.name, INCIDENT_COOLDOWN_SECONDS)) return;

          const title = `${svc.name} is ${result.status}`;
          const id = await createIncident(env.DB, {
            service_id: svc.name,
            title,
            severity: "major",
          });

          // Create a GitHub issue on the service's repo (best effort).
          if (env.GITHUB_TOKEN && svc.repository) {
            const parsed = parseRepo(svc.repository);
            if (parsed) {
              const httpPart = result.status_code ? ` (HTTP ${result.status_code})` : "";
              const errorPart = result.error ? ` — ${result.error}` : "";
              const body = [
                "Automated incident detected by [infra.dunkirk.sh](https://infra.dunkirk.sh)",
                "",
                `**Service:** ${svc.name}`,
                `**Health URL:** ${svc.health_url}`,
                `**Status:** ${result.status}${httpPart}${errorPart}`,
                `**Latency:** ${result.latency_ms}ms`,
                `**Detected at:** ${new Date().toISOString()}`,
                "",
                "---",
                "*Comments on this issue will appear on the status page. Close the issue to resolve the incident.*",
              ].join("\n");

              try {
                const issueNumber = await createIssue(env.GITHUB_TOKEN, parsed.owner, parsed.repo, {
                  title,
                  body,
                  labels: ["incident"],
                });
                // Link first: a failed assignment below must not leave the
                // created issue disconnected from the incident.
                await setIncidentGitHub(env.DB, id, `${parsed.owner}/${parsed.repo}`, issueNumber);
                if (env.GITHUB_ASSIGN_TOKEN && env.GITHUB_ASSIGNEE) {
                  await assignIssue(env.GITHUB_ASSIGN_TOKEN, parsed.owner, parsed.repo, issueNumber, [env.GITHUB_ASSIGNEE]);
                }
              } catch (err) {
                // Best effort: the incident exists regardless of GitHub.
                console.error(`GitHub issue setup failed for ${svc.name}:`, err);
              }
            }
          }

          // Notify the triage agent without blocking this check.
          if (triageUrl && env.TRIAGE_AUTH_TOKEN) {
            background.push(
              fetch(triageUrl, {
                method: "POST",
                headers: {
                  "Content-Type": "application/json",
                  Authorization: `Bearer ${env.TRIAGE_AUTH_TOKEN}`,
                },
                body: JSON.stringify({
                  incident_id: id,
                  service_id: svc.name,
                  service_name: svc.name,
                  health_url: svc.health_url,
                  callback_url: `https://infra.dunkirk.sh/api/incidents/${id}`,
                }),
              }).catch((err: unknown) => {
                console.error(`triage webhook failed for ${svc.name}:`, err);
              }),
            );
          }
        });
    });

    // allSettled: one service failing to process must not prevent the
    // remaining checks, pruning, or the GitHub sync from running.
    const outcomes = await Promise.allSettled(checks);
    const failed = outcomes.filter(
      (outcome): outcome is PromiseRejectedResult => outcome.status === "rejected",
    );
    for (const failure of failed) {
      console.error("health check processing failed:", failure.reason);
    }

    try {
      // Presumably a retention period of 365 days — confirm against pruneOldPings.
      await pruneOldPings(env.DB, 365);

      // Sync GitHub issue comments/state back to incidents.
      if (env.GITHUB_TOKEN) {
        const active = await getActiveIncidents(env.DB);
        // Presumably a 7-day window expressed in seconds — confirm against getRecentlyResolvedIncidents.
        const recentlyResolved = await getRecentlyResolvedIncidents(env.DB, 86400 * 7);
        const toSync = [...active, ...recentlyResolved];
        await syncGitHubIncidents(env.DB, env.KV, env.GITHUB_TOKEN, toSync);
      }
    } finally {
      // Let the best-effort webhook/GitHub calls finish before the handler returns.
      await Promise.allSettled(background);
    }

    if (failed.length > 0) {
      throw new Error(`${failed.length} health check(s) failed to process`);
    }
  },
} satisfies ExportedHandler<Env>;