experiment: round 3 with 12 gold specs, final score 0.9021

Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/

ai coding crazy

Added 6 new gold specs (Pixel Wars, Settle Up, User Service, TicTacToe),
fixed gold type annotations, tuned SAME_TYPE_REFINE_THRESHOLD to 0.15.

Full journey: 0.8785 → 0.8861 → 0.9061 → 0.9640 → 0.8298 (new specs) →
0.8912 (gold fixes) → 0.9021 (tuning). Remaining gaps are hierarchy
inference (needs CONTEXT parents) and coverage for list-heavy specs.

Chad Fowler 4 days ago f0b00a02 4e325fec

+65

1 changed file

expand all

experiments

diagnose.ts

+65

experiments/diagnose.ts

/**
 * Diagnostic script for the canonicalizer experiments.
 *
 * For each example spec listed in `specs`, the script reads the markdown file,
 * runs it through `parseSpec` and `extractCanonicalNodes`, and then checks a
 * hand-written list of "gold" expectations against the resulting nodes.
 * One line is printed per expectation:
 *
 *   OK    a node containing the phrase exists and has the expected type
 *   MISS  a node containing the phrase exists but has a different type
 *   GONE  no node contains the phrase at all
 *
 * NOTE(review): the exact shape of the values returned by `parseSpec` and
 * `extractCanonicalNodes` is not visible here; this script only relies on
 * nodes exposing `statement`, `type`, and (optionally) `confidence`.
 */
import { readFileSync } from 'node:fs';
import { resolve } from 'node:path';
import { parseSpec } from '../src/spec-parser.js';
import { extractCanonicalNodes } from '../src/canonicalizer.js';

// Repository root: the parent of the directory that contains this script.
const ROOT = resolve(import.meta.dirname, '..');

// Gold expectations. Each `statement` is a phrase that is matched
// case-insensitively as a substring of a node's statement; `type` is the node
// type that the matching node is expected to carry.
const specs = [
  {
    name: 'Settlements',
    path: 'examples/settle-up/spec/settlements.md',
    docId: 'spec/settlements.md',
    gold: [
      { statement: 'minimum number of payments', type: 'REQUIREMENT' },
      { statement: 'same net effect', type: 'INVARIANT' },
      { statement: 'cycles', type: 'INVARIANT' },
      { statement: 'zero balances', type: 'INVARIANT' },
      { statement: 'exceeds', type: 'CONSTRAINT' },
      { statement: 'settled up', type: 'REQUIREMENT' },
    ],
  },
  {
    name: 'TicTacToe',
    path: 'examples/tictactoe/spec/game-engine.md',
    docId: 'spec/game-engine.md',
    gold: [
      { statement: '3', type: 'CONSTRAINT' },
      { statement: 'cell already occupied', type: 'REQUIREMENT' },
      { statement: 'x always moves first', type: 'CONSTRAINT' },
      { statement: 'win', type: 'REQUIREMENT' },
      { statement: 'draw', type: 'REQUIREMENT' },
      { statement: 'game status', type: 'DEFINITION' },
    ],
  },
  {
    name: 'Pixel Wars',
    path: 'examples/pixel-wars/spec/game.md',
    docId: 'spec/game.md',
    gold: [
      { statement: '20', type: 'CONSTRAINT' },
      { statement: 'cooldown', type: 'CONSTRAINT' },
      { statement: 'rejected', type: 'REQUIREMENT' },
      { statement: 'broadcast', type: 'REQUIREMENT' },
      { statement: '120 seconds', type: 'CONSTRAINT' },
      { statement: 'round-robin', type: 'CONSTRAINT' },
    ],
  },
  {
    name: 'User Service',
    path: 'examples/microservices/spec/user-service.md',
    docId: 'spec/user-service.md',
    gold: [
      { statement: 'system of record', type: 'DEFINITION' },
      { statement: 'email addresses must be unique', type: 'CONSTRAINT' },
      { statement: 'never store or return plaintext passwords', type: 'INVARIANT' },
      { statement: 'soft delete', type: 'REQUIREMENT' },
      { statement: '100 characters', type: 'CONSTRAINT' },
      { statement: 'locked for 1 hour', type: 'CONSTRAINT' },
      { statement: 'parameterized statements', type: 'CONSTRAINT' },
      { statement: 'event payloads must never contain passwords', type: 'INVARIANT' },
      { statement: '50 results per page', type: 'CONSTRAINT' },
      { statement: 'usercreated', type: 'REQUIREMENT' },
    ],
  },
];

for (const spec of specs) {
  const text = readFileSync(resolve(ROOT, spec.path), 'utf8');
  const clauses = parseSpec(text, spec.docId);
  const nodes = extractCanonicalNodes(clauses);
  console.log(`\n=== ${spec.name} (${nodes.length} nodes) ===`);

  for (const g of spec.gold) {
    // Lower-case the gold phrase once per expectation instead of once per node.
    const needle = g.statement.toLowerCase();
    // Only the FIRST node whose statement contains the phrase is inspected.
    const match = nodes.find((n) => n.statement.toLowerCase().includes(needle));

    if (!match) {
      console.log(`GONE "${g.statement}"`);
      continue;
    }

    const ok = match.type === g.type ? 'OK ' : 'MISS';
    // A node without a confidence value is reported as "n/a" rather than "undefined".
    const conf = match.confidence?.toFixed(2) ?? 'n/a';
    // The statement is truncated to 80 characters to keep each report line short.
    console.log(`${ok} "${g.statement}" expected=${g.type} got=${match.type} conf=${conf} stmt="${match.statement.substring(0, 80)}"`);
  }
}