Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy

eval: add 6 new gold-standard specs for broader evaluation

Added specs for Pixel Wars (game, server), Settle Up (expenses, settlements),
User Service, and the TicTacToe game engine. The score dropped 0.9640→0.8298,
exposing type-accuracy and hierarchy weaknesses on the new domains.

+107
+1
experiments/results.tsv
··· 19 19 2026-03-26T23:12:45.269Z 0.9265 100.0 94.4 95.5 33.8 100.0 6.2 x0da3a 20 20 2026-03-26T23:13:07.799Z 0.9640 100.0 94.4 95.5 8.8 100.0 6.2 42knqt 21 21 2026-03-26T23:14:22.740Z 0.9640 100.0 94.4 95.5 8.8 100.0 6.2 42knqt 22 + 2026-03-26T23:23:35.323Z 0.8298 93.8 70.7 91.3 12.8 58.3 4.3 42knqt
+106
tests/eval/gold-standard.ts
··· 118 118 ], 119 119 expectedEdges: [], 120 120 }, 121 + { 122 + name: 'Pixel Wars: game', 123 + path: 'examples/pixel-wars/spec/game.md', 124 + docId: 'spec/game.md', 125 + expectedMinCoverage: 85, 126 + expectedMinNodes: 10, 127 + expectedMaxNodes: 25, 128 + expectedNodes: [ 129 + { statement: '20', type: 'CONSTRAINT' }, 130 + { statement: 'cooldown', type: 'CONSTRAINT' }, 131 + { statement: 'rejected', type: 'REQUIREMENT' }, 132 + { statement: 'broadcast', type: 'REQUIREMENT' }, 133 + { statement: '120 seconds', type: 'CONSTRAINT' }, 134 + { statement: 'round-robin', type: 'CONSTRAINT' }, 135 + ], 136 + expectedEdges: [], 137 + }, 138 + { 139 + name: 'Pixel Wars: server', 140 + path: 'examples/pixel-wars/spec/server.md', 141 + docId: 'spec/server.md', 142 + expectedMinCoverage: 85, 143 + expectedMinNodes: 8, 144 + expectedMaxNodes: 20, 145 + expectedNodes: [ 146 + { statement: 'websocket', type: 'REQUIREMENT' }, 147 + { statement: 'maximum 20', type: 'CONSTRAINT' }, 148 + { statement: 'disconnected', type: 'REQUIREMENT' }, 149 + { statement: 'room_full', type: 'REQUIREMENT' }, 150 + ], 151 + expectedEdges: [], 152 + }, 153 + { 154 + name: 'Settle Up: expenses', 155 + path: 'examples/settle-up/spec/expenses.md', 156 + docId: 'spec/expenses.md', 157 + expectedMinCoverage: 90, 158 + expectedMinNodes: 12, 159 + expectedMaxNodes: 30, 160 + expectedNodes: [ 161 + { statement: 'unique id', type: 'REQUIREMENT' }, 162 + { statement: 'positive', type: 'CONSTRAINT' }, 163 + { statement: 'equal', type: 'REQUIREMENT' }, 164 + { statement: 'remainder', type: 'INVARIANT' }, 165 + { statement: 'sum of all individual shares must always equal', type: 'INVARIANT' }, 166 + { statement: 'reverse chronological', type: 'REQUIREMENT' }, 167 + { statement: 'only creator can delete', type: 'CONSTRAINT' }, 168 + { statement: 'deterministic', type: 'INVARIANT' }, 169 + ], 170 + expectedEdges: [], 171 + }, 172 + { 173 + name: 'Settle Up: settlements', 174 + path: 
'examples/settle-up/spec/settlements.md', 175 + docId: 'spec/settlements.md', 176 + expectedMinCoverage: 85, 177 + expectedMinNodes: 8, 178 + expectedMaxNodes: 22, 179 + expectedNodes: [ 180 + { statement: 'minimum number of payments', type: 'REQUIREMENT' }, 181 + { statement: 'same net effect', type: 'INVARIANT' }, 182 + { statement: 'cycles', type: 'INVARIANT' }, 183 + { statement: 'zero balances', type: 'INVARIANT' }, 184 + { statement: 'exceeds', type: 'CONSTRAINT' }, 185 + { statement: 'settled up', type: 'REQUIREMENT' }, 186 + ], 187 + expectedEdges: [], 188 + }, 189 + { 190 + name: 'User Service', 191 + path: 'examples/microservices/spec/user-service.md', 192 + docId: 'spec/user-service.md', 193 + expectedMinCoverage: 85, 194 + expectedMinNodes: 18, 195 + expectedMaxNodes: 40, 196 + expectedNodes: [ 197 + { statement: 'system of record', type: 'DEFINITION' }, 198 + { statement: 'email addresses must be unique', type: 'CONSTRAINT' }, 199 + { statement: 'never store or return plaintext passwords', type: 'INVARIANT' }, 200 + { statement: 'soft delete', type: 'REQUIREMENT' }, 201 + { statement: '100 characters', type: 'CONSTRAINT' }, 202 + { statement: 'locked for 1 hour', type: 'CONSTRAINT' }, 203 + { statement: 'parameterized statements', type: 'CONSTRAINT' }, 204 + { statement: 'event payloads must never contain passwords', type: 'INVARIANT' }, 205 + { statement: '50 results per page', type: 'CONSTRAINT' }, 206 + { statement: 'usercreated', type: 'REQUIREMENT' }, 207 + ], 208 + expectedEdges: [], 209 + }, 210 + { 211 + name: 'TicTacToe: game engine', 212 + path: 'examples/tictactoe/spec/game-engine.md', 213 + docId: 'spec/game-engine.md', 214 + expectedMinCoverage: 90, 215 + expectedMinNodes: 10, 216 + expectedMaxNodes: 25, 217 + expectedNodes: [ 218 + { statement: '3', type: 'CONSTRAINT' }, 219 + { statement: 'cell already occupied', type: 'REQUIREMENT' }, 220 + { statement: 'x always moves first', type: 'CONSTRAINT' }, 221 + { statement: 'win', type: 
'REQUIREMENT' }, 222 + { statement: 'draw', type: 'REQUIREMENT' }, 223 + { statement: 'game status', type: 'DEFINITION' }, 224 + ], 225 + expectedEdges: [], 226 + }, 121 227 ];