Reference implementation for the Phoenix Architecture. Work in progress.
aicoding.leaflet.pub/
ai
coding
crazy
1#!/usr/bin/env npx tsx
/**
 * Architecture Evaluation Runner — tests whether generated apps actually work.
 *
 * Workflow:
 *   1. Clean and re-bootstrap the todo-app example (skipped with --skip-bootstrap)
 *   2. Start the server
 *   3. Run CRUD tests via HTTP
 *   4. Score: what percentage of operations work correctly
 *   5. Log results (skipped with --no-log)
 *
 * Usage: npx tsx experiments/eval-runner-arch.ts [--no-log] [--skip-bootstrap]
 */
14
15import { execSync, spawn } from 'node:child_process';
16import { resolve } from 'node:path';
17import { appendFileSync, existsSync, rmSync } from 'node:fs';
18
// Repository root: the parent of the directory that contains this script.
const ROOT = resolve(import.meta.dirname, '..');
// Example application that is regenerated and then exercised over HTTP.
const TODO_APP = resolve(ROOT, 'examples', 'todo-app');
// Tab-separated log that receives one row per evaluation run.
const RESULTS_FILE = resolve(ROOT, 'experiments', 'results-arch.tsv');
// Built Phoenix CLI entry point, invoked with `node`.
const CLI = resolve(ROOT, 'dist', 'cli.js');

/** Reports whether the given flag appears anywhere on the command line. */
const hasFlag = (flag: string): boolean => process.argv.includes(flag);

// --no-log: do not append a row to RESULTS_FILE.
const noLog = hasFlag('--no-log');
// --skip-bootstrap: reuse the existing todo-app instead of regenerating it.
const skipBootstrap = hasFlag('--skip-bootstrap');
26
// ─── Step 1: Rebuild Phoenix and re-bootstrap todo-app ──────────────────────

if (!skipBootstrap) {
  console.log('Building Phoenix...');
  execSync('npm run build', { cwd: ROOT, stdio: 'pipe' });

  console.log('Cleaning todo-app...');
  // `force: true` makes removal a no-op for paths that do not exist, so no
  // separate existence check (and no check-then-delete race) is needed.
  for (const d of ['src', '.phoenix', 'data', 'dist']) {
    rmSync(resolve(TODO_APP, d), { recursive: true, force: true });
  }
  // Remove db files
  for (const f of ['app.db', 'todos.db', 'data.db']) {
    rmSync(resolve(TODO_APP, f), { force: true });
  }

  // execSync runs the command through a shell, so the CLI path is quoted to
  // keep working when the repository lives under a path containing spaces.
  const cli = `node "${CLI}"`;

  console.log('Initializing with sqlite-web-api...');
  execSync(`${cli} init --arch=sqlite-web-api`, { cwd: TODO_APP, stdio: 'pipe' });

  console.log('Bootstrapping (LLM generation)...');
  // Generation is slow: allow up to 15 minutes before giving up.
  execSync(`${cli} bootstrap`, { cwd: TODO_APP, stdio: 'pipe', timeout: 900000 });

  console.log('Installing dependencies...');
  execSync('npm install', { cwd: TODO_APP, stdio: 'pipe', timeout: 60000 });
}
53
// ─── Step 2: Start the server ───────────────────────────────────────────────

// Clean any leftover DB together with its `-shm` / `-wal` companion files.
// `force: true` makes removal a no-op when a file is already absent.
const dbPath = resolve(TODO_APP, 'data/app.db');
const dbShm = dbPath + '-shm';
const dbWal = dbPath + '-wal';
for (const stale of [dbPath, dbShm, dbWal]) {
  rmSync(stale, { force: true });
}

console.log('Starting server...');
const server = spawn('npx', ['tsx', 'src/server.ts'], {
  cwd: TODO_APP,
  stdio: 'pipe',
  env: { ...process.env, PORT: '4567' },
});

// Capture everything the server prints so a failed start can be diagnosed.
let serverOutput = '';
server.stdout.on('data', (d) => { serverOutput += d.toString(); });
server.stderr.on('data', (d) => { serverOutput += d.toString(); });

// Wait for server to start: poll /health every 500 ms and give up after 10 s.
// On any failure (timeout, spawn error, early exit) the timers are cleared,
// the child process is killed, and the captured output is attached to the
// error so the run neither hangs nor leaves a stray server behind.
await new Promise<void>((ready, fail) => {
  let settled = false;

  const poll = setInterval(async () => {
    try {
      const res = await fetch('http://localhost:4567/health');
      if (res.ok) settle();
    } catch { /* not ready yet */ }
  }, 500);
  const deadline = setTimeout(() => settle(new Error('Server start timeout')), 10000);

  const onExit = (code: number | null): void =>
    settle(new Error(`Server exited before becoming healthy (exit code ${code})`));
  const onError = (err: Error): void => settle(err);
  server.once('exit', onExit);
  server.once('error', onError);

  /** Resolves or rejects exactly once and releases timers and listeners. */
  function settle(failure?: Error): void {
    if (settled) return;
    settled = true;
    clearInterval(poll);
    clearTimeout(deadline);
    server.off('exit', onExit);
    server.off('error', onError);
    if (failure === undefined) {
      ready();
      return;
    }
    server.kill();
    fail(new Error(`${failure.message}\n--- server output ---\n${serverOutput}`));
  }
});

console.log('Server ready on :4567');
87
// ─── Step 3: Run CRUD tests ────────────────────────────────────────────────

/** Outcome of one named check. */
interface TestResult {
  /** Human-readable check name; also used in the failure list of the log. */
  name: string;
  /** True when the check's callback resolved to true. */
  pass: boolean;
  /** 'ok', 'assertion failed', or a description of the thrown error. */
  detail: string;
}

// Accumulates one entry per executed check, in execution order.
const results: TestResult[] = [];
const BASE = 'http://localhost:4567';

/**
 * Renders a thrown value as text, following its `cause` chain.
 *
 * A failed `fetch` throws a generic error whose actual reason (for example a
 * refused connection) is only available on `cause`, so the chain is appended
 * to keep failures diagnosable. The walk is bounded to guard against cycles.
 */
function describeError(e: unknown): string {
  const parts = [String(e)];
  let current: unknown = e;
  for (let depth = 0; depth < 5; depth++) {
    if (typeof current !== 'object' || current === null) break;
    const cause: unknown = Reflect.get(current, 'cause');
    if (cause === undefined || cause === null) break;
    parts.push(String(cause));
    current = cause;
  }
  return parts.join(' — caused by: ');
}

/**
 * Runs one check, records its outcome in `results`, and prints a status line.
 *
 * @param name - Label for the check.
 * @param fn - Check body; resolves to true on success, false on a failed
 *   assertion. A thrown error or rejection is recorded as a failure.
 * @returns Resolves once the outcome is recorded; never rejects.
 */
async function test(name: string, fn: () => Promise<boolean>): Promise<void> {
  try {
    const pass = await fn();
    results.push({ name, pass, detail: pass ? 'ok' : 'assertion failed' });
    console.log(` ${pass ? '✓' : '✗'} ${name}`);
  } catch (e: unknown) {
    const detail = describeError(e);
    results.push({ name, pass: false, detail });
    console.log(` ✗ ${name} — ${detail}`);
  }
}
109
console.log('\nRunning tests:');

// ─── Projects (the categories tasks can belong to) ──────────────────────────

// Id of the project created by the first check; later checks reuse it.
let projId: number | null = null;

await test('POST /projects creates project', async () => {
  const response = await fetch(`${BASE}/projects`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ name: 'Work', color: '#ff0000' }),
  });
  if (response.status === 201) {
    const created = await response.json() as Record<string, unknown>;
    // Remember the id even before validating it, so dependent checks can run.
    projId = created.id as number;
    return typeof created.id === 'number' && created.name === 'Work';
  }
  return false;
});

await test('POST /projects rejects empty name', async () => {
  const response = await fetch(`${BASE}/projects`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ name: '' }),
  });
  return response.status === 400;
});

await test('GET /projects returns array', async () => {
  const response = await fetch(`${BASE}/projects`);
  if (response.status === 200) {
    const projects = await response.json() as unknown[];
    // At least the project created above must be listed.
    return Array.isArray(projects) && projects.length > 0;
  }
  return false;
});
141
// ─── Tasks with projects ────────────────────────────────────────────────────
// NOTE: the check names below say "/todos" and "category", but the requests
// go to the /tasks endpoints and use project_id. The names are kept as-is
// because they are what gets written to the results log.

// Id of the task created by the first check; later checks reuse it.
let todoId: number | null = null;

await test('POST /todos creates todo with category', async () => {
  const response = await fetch(`${BASE}/tasks`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ title: 'Finish report', project_id: projId }),
  });
  if (response.status === 201) {
    const created = await response.json() as Record<string, unknown>;
    todoId = created.id as number;
    return typeof created.id === 'number' && created.title === 'Finish report';
  }
  return false;
});

await test('POST /todos creates todo without category', async () => {
  const response = await fetch(`${BASE}/tasks`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ title: 'Buy milk' }),
  });
  return response.status === 201;
});

await test('POST /todos rejects invalid project_id', async () => {
  const response = await fetch(`${BASE}/tasks`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ title: 'Bad category', project_id: 9999 }),
  });
  return response.status === 400;
});

await test('POST /todos rejects empty title', async () => {
  const response = await fetch(`${BASE}/tasks`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ title: '' }),
  });
  return response.status === 400;
});

await test('GET /todos returns todos with project_name', async () => {
  const response = await fetch(`${BASE}/tasks`);
  if (response.status !== 200) {
    return false;
  }
  const tasks = await response.json() as Array<Record<string, unknown>>;
  // The task created inside the "Work" project must carry that project's name.
  const reportTask = tasks.find((task) => task.title === 'Finish report');
  return reportTask?.project_name === 'Work';
});

await test('GET /todos/:id returns todo with project_name', async () => {
  if (!todoId) {
    return false;
  }
  const response = await fetch(`${BASE}/tasks/${todoId}`);
  if (response.status !== 200) {
    return false;
  }
  const task = await response.json() as Record<string, unknown>;
  return task.project_name === 'Work';
});

await test('GET /todos/999 returns 404', async () => {
  const response = await fetch(`${BASE}/tasks/999`);
  return response.status === 404;
});
200
// ─── Completion and filtering ───────────────────────────────────────────────

await test('PATCH /todos/:id marks completed', async () => {
  if (!todoId) return false;
  const patchCompleted = (completed: number | boolean) =>
    fetch(`${BASE}/tasks/${todoId}`, {
      method: 'PATCH',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ completed }),
    });
  // Try integer 1, then boolean true — LLM might use either schema
  let res = await patchCompleted(1);
  if (res.status !== 200) res = await patchCompleted(true);
  if (res.status !== 200) return false;
  const body = await res.json() as Record<string, unknown>;
  return body.completed === 1 || body.completed === true;
});

await test('GET /todos?completed=1 filters completed', async () => {
  // Try completed=1 first, then completed=true, for the same reason as above.
  let res = await fetch(`${BASE}/tasks?completed=1`);
  if (res.status !== 200) res = await fetch(`${BASE}/tasks?completed=true`);
  if (res.status !== 200) return false;
  const body = await res.json() as Array<Record<string, unknown>>;
  return body.length >= 1 && body.every(t => t.completed === 1 || t.completed === true);
});

await test('GET /todos?completed=0 filters incomplete', async () => {
  // Try completed=0 first, then completed=false — LLM may accept either form
  let res = await fetch(`${BASE}/tasks?completed=0`);
  if (res.status !== 200) res = await fetch(`${BASE}/tasks?completed=false`);
  if (res.status !== 200) return false;
  const body = await res.json() as Array<Record<string, unknown>>;
  return body.length >= 1 && body.every(t => t.completed === 0 || t.completed === false);
});

await test('GET /todos?project_id=N filters by category', async () => {
  if (!projId) return false;
  const expectedProject = projId;
  const res = await fetch(`${BASE}/tasks?project_id=${expectedProject}`);
  if (res.status !== 200) return false;
  const body = await res.json() as Array<Record<string, unknown>>;
  // A non-empty response alone would also pass for a server that ignores the
  // filter (the project-less "Buy milk" task would be included), so every
  // returned task must belong to the requested project. Either the id or the
  // joined project name is accepted, since response shapes vary.
  return Array.isArray(body)
    && body.length >= 1
    && body.every(t => Number(t.project_id) === expectedProject || t.project_name === 'Work');
});
245
// ─── Stats ──────────────────────────────────────────────────────────────────

await test('GET /stats returns counts', async () => {
  const response = await fetch(`${BASE}/tasks/stats`);
  if (response.status !== 200) {
    return false;
  }
  const stats = await response.json() as Record<string, unknown>;
  // Accept either naming convention for each count: total / total_tasks and
  // completed / completed_tasks.
  const isCount = (value: unknown): boolean => typeof value === 'number';
  const hasTotal = isCount(stats.total) || isCount(stats.total_tasks);
  const hasCompleted = isCount(stats.completed) || isCount(stats.completed_tasks);
  return hasTotal && hasCompleted;
});

await test('GET /stats includes aggregates', async () => {
  const response = await fetch(`${BASE}/tasks/stats`);
  if (response.status !== 200) {
    return false;
  }
  const stats = await response.json() as Record<string, unknown>;
  // Any one of these counts as an aggregate: a truthy by_category or
  // by_project, or a defined overdue_tasks or completion_percentage.
  return Boolean(stats.by_category)
    || Boolean(stats.by_project)
    || stats.overdue_tasks !== undefined
    || stats.completion_percentage !== undefined;
});
266
// ─── Delete ─────────────────────────────────────────────────────────────────

await test('DELETE /todos/:id returns 204', async () => {
  if (!todoId) return false;
  return (await fetch(`${BASE}/tasks/${todoId}`, { method: 'DELETE' })).status === 204;
});

await test('DELETE /projects/:id with todos returns 400', async () => {
  // "Buy milk" has no project, so create a fresh project with one task in it.
  const projectRes = await fetch(`${BASE}/projects`, {
    method: 'POST', headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ name: 'Temp' }),
  });
  // Verify the setup before asserting on the delete: if project creation
  // failed, `cat.id` would be undefined and DELETE /projects/undefined could
  // itself answer 400, producing a false pass.
  if (projectRes.status !== 201) return false;
  const cat = await projectRes.json() as Record<string, unknown>;
  if (typeof cat.id !== 'number') return false;

  const taskRes = await fetch(`${BASE}/tasks`, {
    method: 'POST', headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ title: 'Temp todo', project_id: cat.id }),
  });
  // Without the task the project would be legitimately deletable.
  if (taskRes.status !== 201) return false;

  const delRes = await fetch(`${BASE}/projects/${cat.id}`, { method: 'DELETE' });
  return delRes.status === 400;
});

await test('DELETE /projects/:id without todos returns 204', async () => {
  if (!projId) return false;
  // Relies on the task delete check above having removed projId's only task.
  return (await fetch(`${BASE}/projects/${projId}`, { method: 'DELETE' })).status === 204;
});
294
// ─── Step 4: Score ──────────────────────────────────────────────────────────

// All checks have run; the server is no longer needed.
server.kill();

const failedResults = results.filter((r) => !r.pass);
const total = results.length;
const passed = total - failedResults.length;
// Fraction of checks that passed; 0 when nothing ran.
const score = total > 0 ? passed / total : 0;

console.log(`\n Score: ${passed}/${total} (${(score * 100).toFixed(0)}%)`);
failedResults.forEach((r) => {
  console.log(` FAIL: ${r.name} — ${r.detail}`);
});

// ─── Step 5: Log ────────────────────────────────────────────────────────────

if (!noLog) {
  // Write the column header only when the results file is created.
  if (!existsSync(RESULTS_FILE)) {
    appendFileSync(RESULTS_FILE, 'timestamp\tscore\tpassed\ttotal\tfailures' + '\n');
  }
  const failures = failedResults.map((r) => r.name).join('; ') || 'none';
  const row = `${new Date().toISOString()}\t${score.toFixed(2)}\t${passed}\t${total}\t${failures}`;
  appendFileSync(RESULTS_FILE, row + '\n');
  console.log(` Results appended to experiments/results-arch.tsv`);
}

// Final score line, followed by an exit status that is 0 only for a perfect run.
console.log(`\nval_score=${score.toFixed(4)}`);
process.exit(score === 1 ? 0 : 1);