Reference implementation for the Phoenix Architecture evaluation runner. Work in progress.
See: aicoding.leaflet.pub/
1#!/usr/bin/env npx tsx
2/**
3 * Evaluation Runner — Fixed harness for the autoresearch experiment loop.
4 *
5 * DO NOT MODIFY THIS FILE during experiments.
6 * The agent modifies only experiments/config.ts.
7 *
8 * Usage: npx tsx experiments/eval-runner.ts [--json] [--no-log]
9 */
10
11import { readFileSync, appendFileSync, existsSync } from 'node:fs';
12import { resolve } from 'node:path';
13import { parseSpec } from '../src/spec-parser.js';
14import { extractCanonicalNodes, extractCandidates } from '../src/canonicalizer.js';
15import { GOLD_SPECS, type GoldSpec } from '../tests/eval/gold-standard.js';
16import type { CanonicalNode } from '../src/models/canonical.js';
17import { CONFIG } from '../src/experiment-config.js';
18
19const ROOT = resolve(import.meta.dirname, '..');
20const RESULTS_FILE = resolve(ROOT, 'experiments/results.tsv');
21
22// ─── Metrics computation (same as eval test, but standalone) ────────────────
23
24function loadAndExtract(spec: GoldSpec) {
25 const text = readFileSync(resolve(ROOT, spec.path), 'utf8');
26 const clauses = parseSpec(text, spec.docId);
27 const { candidates, coverage } = extractCandidates(clauses);
28 const nodes = extractCanonicalNodes(clauses);
29 const avgCoverage = coverage.length > 0
30 ? coverage.reduce((s, c) => s + c.coverage_pct, 0) / coverage.length
31 : 0;
32 return { clauses, candidates, coverage, nodes, avgCoverage };
33}
34
35function findNode(nodes: CanonicalNode[], substringMatch: string): CanonicalNode | undefined {
36 const lower = substringMatch.toLowerCase();
37 return nodes.find(n => n.statement.toLowerCase().includes(lower));
38}
39
40interface SpecMetrics {
41 recall: number;
42 typeAccuracy: number;
43 coverage: number;
44 linkPrecision: number;
45 resDRate: number;
46 orphanRate: number;
47 hierCoverage: number;
48 maxDegree: number;
49 nodeCount: number;
50}
51
52function computeMetrics(spec: GoldSpec, nodes: CanonicalNode[], avgCoverage: number): SpecMetrics {
53 let found = 0;
54 let typeCorrect = 0;
55 for (const expected of spec.expectedNodes) {
56 const node = findNode(nodes, expected.statement);
57 if (node) {
58 found++;
59 if (node.type === expected.type) typeCorrect++;
60 }
61 }
62 const recall = spec.expectedNodes.length > 0 ? found / spec.expectedNodes.length : 1;
63 const typeAccuracy = found > 0 ? typeCorrect / found : 0;
64
65 let edgesFound = 0;
66 for (const expected of spec.expectedEdges) {
67 const from = findNode(nodes, expected.from);
68 const to = findNode(nodes, expected.to);
69 if (from && to) {
70 const isLinked = from.linked_canon_ids.includes(to.canon_id) || to.linked_canon_ids.includes(from.canon_id);
71 if (isLinked) {
72 const edgeType = from.link_types?.[to.canon_id] || to.link_types?.[from.canon_id];
73 if (edgeType === expected.type) edgesFound++;
74 }
75 }
76 }
77 const linkPrecision = spec.expectedEdges.length > 0 ? edgesFound / spec.expectedEdges.length : 1;
78
79 let totalEdges = 0;
80 let relatesToEdges = 0;
81 for (const n of nodes) {
82 for (const [, et] of Object.entries(n.link_types ?? {})) {
83 totalEdges++;
84 if (et === 'relates_to') relatesToEdges++;
85 }
86 }
87 const resDRate = totalEdges > 0 ? relatesToEdges / totalEdges : 0;
88
89 const orphanCount = nodes.filter(n => n.linked_canon_ids.length === 0).length;
90 const orphanRate = nodes.length > 0 ? orphanCount / nodes.length : 0;
91
92 const nonContext = nodes.filter(n => n.type !== 'CONTEXT');
93 const withParent = nonContext.filter(n => n.parent_canon_id).length;
94 const hierCoverage = nonContext.length > 0 ? withParent / nonContext.length : 0;
95
96 const maxDegree = Math.max(0, ...nodes.map(n => n.linked_canon_ids.length));
97
98 return { recall, typeAccuracy, coverage: avgCoverage, linkPrecision, resDRate, orphanRate, hierCoverage, maxDegree, nodeCount: nodes.length };
99}
100
101// ─── Composite score ────────────────────────────────────────────────────────
102
103function compositeScore(avgRecall: number, avgTypeAcc: number, avgCoverage: number, avgDRate: number, avgHier: number): number {
104 return (
105 0.30 * avgRecall +
106 0.25 * avgTypeAcc +
107 0.20 * (avgCoverage / 100) +
108 0.15 * (1 - avgDRate) +
109 0.10 * avgHier
110 );
111}
112
113// ─── Main ───────────────────────────────────────────────────────────────────
114
115const args = process.argv.slice(2);
116const jsonMode = args.includes('--json');
117const noLog = args.includes('--no-log');
118
119const allMetrics: { name: string; metrics: SpecMetrics }[] = [];
120
121for (const spec of GOLD_SPECS) {
122 try {
123 const { nodes, avgCoverage } = loadAndExtract(spec);
124 const metrics = computeMetrics(spec, nodes, avgCoverage);
125 allMetrics.push({ name: spec.name, metrics });
126 } catch (e) {
127 console.error(`FAILED: ${spec.name} — ${e}`);
128 allMetrics.push({
129 name: spec.name,
130 metrics: { recall: 0, typeAccuracy: 0, coverage: 0, linkPrecision: 0, resDRate: 1, orphanRate: 1, hierCoverage: 0, maxDegree: 0, nodeCount: 0 },
131 });
132 }
133}
134
135// Aggregates
136const count = allMetrics.length;
137const avgRecall = allMetrics.reduce((s, m) => s + m.metrics.recall, 0) / count;
138const avgTypeAcc = allMetrics.reduce((s, m) => s + m.metrics.typeAccuracy, 0) / count;
139const avgCoverage = allMetrics.reduce((s, m) => s + m.metrics.coverage, 0) / count;
140const avgDRate = allMetrics.reduce((s, m) => s + m.metrics.resDRate, 0) / count;
141const avgHier = allMetrics.reduce((s, m) => s + m.metrics.hierCoverage, 0) / count;
142const avgOrphan = allMetrics.reduce((s, m) => s + m.metrics.orphanRate, 0) / count;
143const score = compositeScore(avgRecall, avgTypeAcc, avgCoverage, avgDRate, avgHier);
144
145if (jsonMode) {
146 console.log(JSON.stringify({ score, avgRecall, avgTypeAcc, avgCoverage, avgDRate, avgHier, avgOrphan, perSpec: allMetrics }, null, 2));
147} else {
148 // ASCII table
149 console.log('\n╔═══════════════════════════════════════════════════════════════════════╗');
150 console.log('║ PHOENIX CANONICALIZATION — EXPERIMENT EVAL ║');
151 console.log('╠═══════════════════════════════════════════════════════════════════════╣');
152 console.log('║ Spec │ Recall │ TypeAcc │ Cover │ ResD% │ Hier% │ Nodes ║');
153 console.log('╠═══════════════════╪════════╪═════════╪═══════╪═══════╪═══════╪═══════╣');
154
155 for (const { name, metrics: m } of allMetrics) {
156 const n = name.padEnd(18);
157 const recall = (m.recall * 100).toFixed(0).padStart(5) + '%';
158 const type = (m.typeAccuracy * 100).toFixed(0).padStart(6) + '%';
159 const cov = m.coverage.toFixed(0).padStart(4) + '%';
160 const resD = (m.resDRate * 100).toFixed(0).padStart(4) + '%';
161 const hier = (m.hierCoverage * 100).toFixed(0).padStart(4) + '%';
162 const nodeCount = String(m.nodeCount).padStart(5);
163 console.log(`║ ${n} │ ${recall} │ ${type} │ ${cov} │ ${resD} │ ${hier} │ ${nodeCount} ║`);
164 }
165
166 console.log('╠═══════════════════╪════════╪═════════╪═══════╪═══════╪═══════╪═══════╣');
167 const avgR = (avgRecall * 100).toFixed(0).padStart(5) + '%';
168 const avgT = (avgTypeAcc * 100).toFixed(0).padStart(6) + '%';
169 const avgC = avgCoverage.toFixed(0).padStart(4) + '%';
170 const avgD = (avgDRate * 100).toFixed(0).padStart(4) + '%';
171 const avgH = (avgHier * 100).toFixed(0).padStart(4) + '%';
172 console.log(`║ ${'AVERAGE'.padEnd(18)} │ ${avgR} │ ${avgT} │ ${avgC} │ ${avgD} │ ${avgH} │ ║`);
173 console.log('╚═══════════════════════════════════════════════════════════════════════╝');
174 console.log(`\n COMPOSITE SCORE: ${score.toFixed(4)}`);
175 console.log(` Formula: 0.30·recall + 0.25·typeAcc + 0.20·coverage + 0.15·(1-dRate) + 0.10·hier`);
176 console.log(`\n Targets: Recall ≥95%, TypeAcc ≥90%, Coverage ≥95%, ResD ≤20%, Hier ≥50%`);
177}
178
179// ─── Append to results.tsv ──────────────────────────────────────────────────
180
181if (!noLog) {
182 const timestamp = new Date().toISOString();
183 const header = 'timestamp\tscore\trecall\ttype_acc\tcoverage\td_rate\thier\torphan\tconfig_hash';
184
185 if (!existsSync(RESULTS_FILE)) {
186 appendFileSync(RESULTS_FILE, header + '\n');
187 }
188
189 // Simple config hash for dedup detection
190 const configStr = JSON.stringify(CONFIG);
191 let hash = 0;
192 for (let i = 0; i < configStr.length; i++) {
193 hash = ((hash << 5) - hash + configStr.charCodeAt(i)) | 0;
194 }
195 const configHash = Math.abs(hash).toString(36);
196
197 const row = [
198 timestamp,
199 score.toFixed(4),
200 (avgRecall * 100).toFixed(1),
201 (avgTypeAcc * 100).toFixed(1),
202 avgCoverage.toFixed(1),
203 (avgDRate * 100).toFixed(1),
204 (avgHier * 100).toFixed(1),
205 (avgOrphan * 100).toFixed(1),
206 configHash,
207 ].join('\t');
208
209 appendFileSync(RESULTS_FILE, row + '\n');
210 if (!jsonMode) {
211 console.log(`\n Results appended to experiments/results.tsv`);
212 }
213}
214
215// Exit with score as a parseable last line
216if (!jsonMode) {
217 console.log(`\nval_score=${score.toFixed(4)}`);
218}