Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
at main 322 lines 13 kB view raw
#!/usr/bin/env npx tsx
/**
 * Architecture Evaluation Runner — tests whether generated apps actually work.
 *
 * Workflow:
 *   1. Clean and re-bootstrap the todo-app example
 *   2. Start the server
 *   3. Run CRUD tests via HTTP
 *   4. Score: what percentage of operations work correctly
 *   5. Log results
 *
 * Usage: npx tsx experiments/eval-runner-arch.ts [--no-log] [--skip-bootstrap]
 *
 * Flags:
 *   --no-log          do not append a row to experiments/results-arch.tsv
 *   --skip-bootstrap  skip step 1 and test whatever is already in examples/todo-app
 *
 * The process exits with 0 only when every test passes, 1 otherwise.
 */

import { execFileSync, execSync, spawn } from 'node:child_process';
import { resolve } from 'node:path';
import { appendFileSync, existsSync, rmSync } from 'node:fs';

const ROOT = resolve(import.meta.dirname, '..');
const TODO_APP = resolve(ROOT, 'examples/todo-app');
const RESULTS_FILE = resolve(ROOT, 'experiments/results-arch.tsv');
const CLI = resolve(ROOT, 'dist/cli.js');

const PORT = '4567';
const BASE = `http://localhost:${PORT}`;
const SERVER_START_TIMEOUT_MS = 10000;
const SERVER_POLL_INTERVAL_MS = 500;

const noLog = process.argv.includes('--no-log');
const skipBootstrap = process.argv.includes('--skip-bootstrap');

// ─── Step 1: Rebuild Phoenix and re-bootstrap todo-app ──────────────────────

if (!skipBootstrap) {
  console.log('Building Phoenix...');
  execSync('npm run build', { cwd: ROOT, stdio: 'pipe' });

  console.log('Cleaning todo-app...');
  // `force: true` makes rmSync a no-op for paths that do not exist.
  for (const d of ['src', '.phoenix', 'data', 'dist']) {
    rmSync(resolve(TODO_APP, d), { recursive: true, force: true });
  }
  // Remove db files
  for (const f of ['app.db', 'todos.db', 'data.db']) {
    rmSync(resolve(TODO_APP, f), { force: true });
  }

  // execFileSync passes CLI as a single argv entry, so a checkout path that
  // contains spaces or shell metacharacters cannot break the command.
  console.log('Initializing with sqlite-web-api...');
  execFileSync('node', [CLI, 'init', '--arch=sqlite-web-api'], { cwd: TODO_APP, stdio: 'pipe' });

  console.log('Bootstrapping (LLM generation)...');
  execFileSync('node', [CLI, 'bootstrap'], { cwd: TODO_APP, stdio: 'pipe', timeout: 900000 });

  console.log('Installing dependencies...');
  execSync('npm install', { cwd: TODO_APP, stdio: 'pipe', timeout: 60000 });
}

// ─── Step 2: Start the server ───────────────────────────────────────────────

// Clean any leftover DB (plus its -shm / -wal sidecar files) so every run
// starts from an empty database.
const dbPath = resolve(TODO_APP, 'data/app.db');
for (const suffix of ['', '-shm', '-wal']) {
  rmSync(dbPath + suffix, { force: true });
}

console.log('Starting server...');
const server = spawn('npx', ['tsx', 'src/server.ts'], {
  cwd: TODO_APP,
  stdio: 'pipe',
  env: { ...process.env, PORT },
});

// Combined stdout + stderr of the server, printed when the run aborts.
let serverOutput = '';
server.stdout.on('data', (d: Buffer) => { serverOutput += d.toString(); });
server.stderr.on('data', (d: Buffer) => { serverOutput += d.toString(); });

// Without a listener, a spawn failure (e.g. `npx` not on PATH) would surface
// as an unhandled 'error' event instead of a readable startup error.
let spawnError: Error | null = null;
server.on('error', (e) => { spawnError = e; });

/**
 * Polls GET /health until it answers with a 2xx status.
 *
 * @throws Error if the server process fails to spawn, exits before becoming
 *   healthy, or does not become healthy within SERVER_START_TIMEOUT_MS.
 */
async function waitForServer(): Promise<void> {
  const deadline = Date.now() + SERVER_START_TIMEOUT_MS;
  while (Date.now() < deadline) {
    if (spawnError) throw spawnError;
    if (server.exitCode !== null || server.signalCode !== null) {
      throw new Error(`Server exited before becoming ready (code ${server.exitCode}, signal ${server.signalCode})`);
    }
    try {
      // Bound each probe so a connection that never answers cannot outlive the deadline.
      const res = await fetch(`${BASE}/health`, { signal: AbortSignal.timeout(2000) });
      if (res.ok) return;
    } catch { /* not ready yet */ }
    await new Promise<void>((done) => setTimeout(done, SERVER_POLL_INTERVAL_MS));
  }
  throw new Error('Server start timeout');
}

// ─── Step 3: Run CRUD tests ────────────────────────────────────────────────

interface TestResult {
  name: string;
  pass: boolean;
  detail: string;
}

const results: TestResult[] = [];

/**
 * Runs one check and records its outcome in `results`.
 *
 * @param name label printed to the console and written to the results TSV
 * @param fn   resolves to true when the check passes; a thrown error or a
 *             rejected promise is recorded as a failure with the error text
 */
async function test(name: string, fn: () => Promise<boolean>): Promise<void> {
  try {
    const pass = await fn();
    results.push({ name, pass, detail: pass ? 'ok' : 'assertion failed' });
    console.log(` ${pass ? '✓' : '✗'} ${name}`);
  } catch (e) {
    results.push({ name, pass: false, detail: String(e) });
    console.log(` ✗ ${name} — ${e}`);
  }
}

/** Builds fetch options for a request with a JSON body. */
function jsonRequest(method: string, payload: unknown): RequestInit {
  return {
    method,
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  };
}

/**
 * Runs the full HTTP test sequence against the running server.
 *
 * The tests are order-dependent: later tests reuse the project and task
 * created by earlier ones.
 *
 * NOTE: several test names say "/todos" and "category" while the requests go
 * to /tasks and /projects. The names are kept unchanged because they are
 * written to the results TSV as failure labels.
 */
async function runTests(): Promise<void> {
  console.log('\nRunning tests:');

  // ─── Categories ───────────────────────────────────────────────────────────

  let projId: number | null = null;

  await test('POST /projects creates project', async () => {
    const res = await fetch(`${BASE}/projects`, jsonRequest('POST', { name: 'Work', color: '#ff0000' }));
    if (res.status !== 201) return false;
    const body = await res.json() as Record<string, unknown>;
    // Stored before the type check so later tests can still use whatever id came back.
    projId = body.id as number;
    return body.name === 'Work' && typeof body.id === 'number';
  });

  await test('POST /projects rejects empty name', async () => {
    const res = await fetch(`${BASE}/projects`, jsonRequest('POST', { name: '' }));
    return res.status === 400;
  });

  await test('GET /projects returns array', async () => {
    const res = await fetch(`${BASE}/projects`);
    if (res.status !== 200) return false;
    const body = await res.json() as unknown[];
    return Array.isArray(body) && body.length >= 1;
  });

  // ─── Todos with categories ────────────────────────────────────────────────

  let todoId: number | null = null;

  await test('POST /todos creates todo with category', async () => {
    const res = await fetch(`${BASE}/tasks`, jsonRequest('POST', { title: 'Finish report', project_id: projId }));
    if (res.status !== 201) return false;
    const body = await res.json() as Record<string, unknown>;
    todoId = body.id as number;
    return body.title === 'Finish report' && typeof body.id === 'number';
  });

  await test('POST /todos creates todo without category', async () => {
    const res = await fetch(`${BASE}/tasks`, jsonRequest('POST', { title: 'Buy milk' }));
    return res.status === 201;
  });

  await test('POST /todos rejects invalid project_id', async () => {
    const res = await fetch(`${BASE}/tasks`, jsonRequest('POST', { title: 'Bad category', project_id: 9999 }));
    return res.status === 400;
  });

  await test('POST /todos rejects empty title', async () => {
    const res = await fetch(`${BASE}/tasks`, jsonRequest('POST', { title: '' }));
    return res.status === 400;
  });

  await test('GET /todos returns todos with project_name', async () => {
    const res = await fetch(`${BASE}/tasks`);
    if (res.status !== 200) return false;
    const body = await res.json() as Array<Record<string, unknown>>;
    const withCat = body.find(t => t.title === 'Finish report');
    return withCat?.project_name === 'Work';
  });

  await test('GET /todos/:id returns todo with project_name', async () => {
    // `== null` (not `!todoId`) so that an id of 0 is not mistaken for "missing".
    if (todoId == null) return false;
    const res = await fetch(`${BASE}/tasks/${todoId}`);
    if (res.status !== 200) return false;
    const body = await res.json() as Record<string, unknown>;
    return body.project_name === 'Work';
  });

  await test('GET /todos/999 returns 404', async () => {
    return (await fetch(`${BASE}/tasks/999`)).status === 404;
  });

  // ─── Filtering ────────────────────────────────────────────────────────────

  await test('PATCH /todos/:id marks completed', async () => {
    if (todoId == null) return false;
    // Try integer 1, then boolean true — LLM might use either schema
    let res = await fetch(`${BASE}/tasks/${todoId}`, jsonRequest('PATCH', { completed: 1 }));
    if (res.status !== 200) {
      res = await fetch(`${BASE}/tasks/${todoId}`, jsonRequest('PATCH', { completed: true }));
    }
    if (res.status !== 200) return false;
    const body = await res.json() as Record<string, unknown>;
    return body.completed === 1 || body.completed === true;
  });

  await test('GET /todos?completed=1 filters completed', async () => {
    let res = await fetch(`${BASE}/tasks?completed=1`);
    if (res.status !== 200) res = await fetch(`${BASE}/tasks?completed=true`);
    if (res.status !== 200) return false;
    const body = await res.json() as Array<Record<string, unknown>>;
    return body.length >= 1 && body.every(t => t.completed === 1 || t.completed === true);
  });

  await test('GET /todos?completed=0 filters incomplete', async () => {
    // Try completed=0, then completed=false, since the LLM may generate either form
    let res = await fetch(`${BASE}/tasks?completed=0`);
    if (res.status !== 200) res = await fetch(`${BASE}/tasks?completed=false`);
    if (res.status !== 200) return false;
    const body = await res.json() as Array<Record<string, unknown>>;
    return body.length >= 1 && body.every(t => t.completed === 0 || t.completed === false);
  });

  await test('GET /todos?project_id=N filters by category', async () => {
    if (projId == null) return false;
    const res = await fetch(`${BASE}/tasks?project_id=${projId}`);
    if (res.status !== 200) return false;
    const body = await res.json() as Array<Record<string, unknown>>;
    return body.length >= 1;
  });

  // ─── Stats ────────────────────────────────────────────────────────────────

  await test('GET /stats returns counts', async () => {
    const res = await fetch(`${BASE}/tasks/stats`);
    if (res.status !== 200) return false;
    const body = await res.json() as Record<string, unknown>;
    // Accept various field naming conventions
    const hasTotal = typeof body.total === 'number' || typeof body.total_tasks === 'number';
    const hasCompleted = typeof body.completed === 'number' || typeof body.completed_tasks === 'number';
    return hasTotal && hasCompleted;
  });

  await test('GET /stats includes aggregates', async () => {
    const res = await fetch(`${BASE}/tasks/stats`);
    if (res.status !== 200) return false;
    const body = await res.json() as Record<string, unknown>;
    // Accept a truthy by_category / by_project, or the presence of
    // overdue_tasks / completion_percentage
    const hasAggregates = body.by_category || body.by_project || body.overdue_tasks !== undefined || body.completion_percentage !== undefined;
    return !!hasAggregates;
  });

  // ─── Delete ───────────────────────────────────────────────────────────────

  await test('DELETE /todos/:id returns 204', async () => {
    if (todoId == null) return false;
    return (await fetch(`${BASE}/tasks/${todoId}`, { method: 'DELETE' })).status === 204;
  });

  await test('DELETE /projects/:id with todos returns 400', async () => {
    // "Buy milk" has no category, so create a fresh project with one task in it
    const res = await fetch(`${BASE}/projects`, jsonRequest('POST', { name: 'Temp' }));
    if (!res.ok) return false;
    const cat = await res.json() as Record<string, unknown>;
    // Without a usable fixture the DELETE below would hit /projects/undefined,
    // and a 400 from that request would be a false pass.
    if (cat.id == null) return false;
    const taskRes = await fetch(`${BASE}/tasks`, jsonRequest('POST', { title: 'Temp todo', project_id: cat.id }));
    if (!taskRes.ok) return false;
    const delRes = await fetch(`${BASE}/projects/${cat.id}`, { method: 'DELETE' });
    return delRes.status === 400;
  });

  await test('DELETE /projects/:id without todos returns 204', async () => {
    if (projId == null) return false;
    // The only task created under projId was deleted by the DELETE /todos/:id test above
    return (await fetch(`${BASE}/projects/${projId}`, { method: 'DELETE' })).status === 204;
  });
}

try {
  await waitForServer();
  console.log(`Server ready on :${PORT}`);
  await runTests();
} catch (e) {
  // Surface what the server printed; otherwise a startup failure is opaque.
  if (serverOutput) console.error(`Server output:\n${serverOutput}`);
  throw e;
} finally {
  // Always stop the child, including when startup timed out.
  server.kill();
}

// ─── Step 4: Score ──────────────────────────────────────────────────────────

const passed = results.filter(r => r.pass).length;
const total = results.length;
const score = total > 0 ? passed / total : 0;

console.log(`\n Score: ${passed}/${total} (${(score * 100).toFixed(0)}%)`);
for (const r of results.filter(r => !r.pass)) {
  console.log(` FAIL: ${r.name} — ${r.detail}`);
}

// ─── Step 5: Log ────────────────────────────────────────────────────────────

if (!noLog) {
  const header = 'timestamp\tscore\tpassed\ttotal\tfailures';
  // Write the header only when the results file is being created.
  if (!existsSync(RESULTS_FILE)) {
    appendFileSync(RESULTS_FILE, header + '\n');
  }
  const failures = results.filter(r => !r.pass).map(r => r.name).join('; ') || 'none';
  const row = [new Date().toISOString(), score.toFixed(2), passed, total, failures].join('\t');
  appendFileSync(RESULTS_FILE, row + '\n');
  console.log(` Results appended to experiments/results-arch.tsv`);
}

console.log(`\nval_score=${score.toFixed(4)}`);
process.exit(score === 1 ? 0 : 1);