Reference implementation for the Phoenix Architecture (work in progress). See aicoding.leaflet.pub for background.
Source file: experiments/eval-runner.ts (218 lines, 9.9 kB).
#!/usr/bin/env npx tsx
/**
 * Evaluation Runner — Fixed harness for the autoresearch experiment loop.
 *
 * DO NOT MODIFY THIS FILE during experiments.
 * The agent modifies only experiments/config.ts.
 *
 * Usage: npx tsx experiments/eval-runner.ts [--json] [--no-log]
 */

import { readFileSync, appendFileSync, existsSync } from 'node:fs';
import { resolve } from 'node:path';
import { parseSpec } from '../src/spec-parser.js';
import { extractCanonicalNodes, extractCandidates } from '../src/canonicalizer.js';
import { GOLD_SPECS, type GoldSpec } from '../tests/eval/gold-standard.js';
import type { CanonicalNode } from '../src/models/canonical.js';
import { CONFIG } from '../src/experiment-config.js';

const ROOT = resolve(import.meta.dirname, '..');
const RESULTS_FILE = resolve(ROOT, 'experiments/results.tsv');

// ─── Metrics computation (same as eval test, but standalone) ────────────────

/**
 * Read the spec document referenced by a gold spec, parse it, and run the
 * candidate/node extraction pipeline.
 *
 * @returns clauses, extraction candidates, per-clause coverage records, the
 *   canonical nodes, and the mean coverage percentage (0 when no coverage
 *   records exist, avoiding a 0/0 NaN).
 */
function loadAndExtract(spec: GoldSpec) {
  const text = readFileSync(resolve(ROOT, spec.path), 'utf8');
  const clauses = parseSpec(text, spec.docId);
  const { candidates, coverage } = extractCandidates(clauses);
  const nodes = extractCanonicalNodes(clauses);
  const avgCoverage = coverage.length > 0
    ? coverage.reduce((s, c) => s + c.coverage_pct, 0) / coverage.length
    : 0;
  return { clauses, candidates, coverage, nodes, avgCoverage };
}

/** Case-insensitive substring lookup of a canonical node by statement text. */
function findNode(nodes: CanonicalNode[], substringMatch: string): CanonicalNode | undefined {
  const lower = substringMatch.toLowerCase();
  return nodes.find(n => n.statement.toLowerCase().includes(lower));
}

interface SpecMetrics {
  recall: number;        // fraction of expected nodes found
  typeAccuracy: number;  // fraction of found nodes whose type matches
  coverage: number;      // mean clause coverage percentage (0–100 scale)
  linkPrecision: number; // fraction of expected edges present with correct type
  resDRate: number;      // fraction of edges typed 'relates_to' (low-information links)
  orphanRate: number;    // fraction of nodes with no links at all
  hierCoverage: number;  // fraction of non-CONTEXT nodes that have a parent
  maxDegree: number;     // highest link count across all nodes
  nodeCount: number;     // total canonical nodes extracted
}

/**
 * Score one spec's extracted node graph against its gold standard.
 *
 * Expected nodes/edges are matched by case-insensitive substring, so the gold
 * statements only need to be distinctive fragments, not exact text.
 */
function computeMetrics(spec: GoldSpec, nodes: CanonicalNode[], avgCoverage: number): SpecMetrics {
  // Node recall + type accuracy over the gold node list.
  let found = 0;
  let typeCorrect = 0;
  for (const expected of spec.expectedNodes) {
    const node = findNode(nodes, expected.statement);
    if (node) {
      found++;
      if (node.type === expected.type) typeCorrect++;
    }
  }
  const recall = spec.expectedNodes.length > 0 ? found / spec.expectedNodes.length : 1;
  // Type accuracy is conditioned on found nodes; 0 when nothing was found.
  const typeAccuracy = found > 0 ? typeCorrect / found : 0;

  // Edge precision: an expected edge counts only if both endpoints exist,
  // they are linked in either direction, AND the link type matches.
  let edgesFound = 0;
  for (const expected of spec.expectedEdges) {
    const from = findNode(nodes, expected.from);
    const to = findNode(nodes, expected.to);
    if (from && to) {
      const isLinked = from.linked_canon_ids.includes(to.canon_id) || to.linked_canon_ids.includes(from.canon_id);
      if (isLinked) {
        // ?? (not ||): only fall through when the forward lookup is absent,
        // not merely falsy.
        const edgeType = from.link_types?.[to.canon_id] ?? to.link_types?.[from.canon_id];
        if (edgeType === expected.type) edgesFound++;
      }
    }
  }
  const linkPrecision = spec.expectedEdges.length > 0 ? edgesFound / spec.expectedEdges.length : 1;

  // 'relates_to' is the catch-all edge type; a high rate means the linker is
  // producing degenerate links instead of specific ones.
  let totalEdges = 0;
  let relatesToEdges = 0;
  for (const n of nodes) {
    for (const [, et] of Object.entries(n.link_types ?? {})) {
      totalEdges++;
      if (et === 'relates_to') relatesToEdges++;
    }
  }
  const resDRate = totalEdges > 0 ? relatesToEdges / totalEdges : 0;

  const orphanCount = nodes.filter(n => n.linked_canon_ids.length === 0).length;
  const orphanRate = nodes.length > 0 ? orphanCount / nodes.length : 0;

  // Hierarchy coverage ignores CONTEXT nodes, which are not expected to
  // have parents.
  const nonContext = nodes.filter(n => n.type !== 'CONTEXT');
  const withParent = nonContext.filter(n => n.parent_canon_id).length;
  const hierCoverage = nonContext.length > 0 ? withParent / nonContext.length : 0;

  // Math.max(0, ...) keeps this 0 (not -Infinity) for an empty node list.
  const maxDegree = Math.max(0, ...nodes.map(n => n.linked_canon_ids.length));

  return { recall, typeAccuracy, coverage: avgCoverage, linkPrecision, resDRate, orphanRate, hierCoverage, maxDegree, nodeCount: nodes.length };
}

// ─── Composite score ────────────────────────────────────────────────────────

/**
 * Weighted composite in [0, 1]. Coverage arrives on a 0–100 scale and is
 * normalized here; resDRate is inverted because lower is better.
 */
function compositeScore(avgRecall: number, avgTypeAcc: number, avgCoverage: number, avgDRate: number, avgHier: number): number {
  return (
    0.30 * avgRecall +
    0.25 * avgTypeAcc +
    0.20 * (avgCoverage / 100) +
    0.15 * (1 - avgDRate) +
    0.10 * avgHier
  );
}

/** Arithmetic mean, defined as 0 for an empty list (avoids NaN aggregates). */
function mean(xs: number[]): number {
  return xs.length > 0 ? xs.reduce((s, x) => s + x, 0) / xs.length : 0;
}

// ─── Main ───────────────────────────────────────────────────────────────────

const args = process.argv.slice(2);
const jsonMode = args.includes('--json');
const noLog = args.includes('--no-log');

const allMetrics: { name: string; metrics: SpecMetrics }[] = [];

for (const spec of GOLD_SPECS) {
  try {
    const { nodes, avgCoverage } = loadAndExtract(spec);
    const metrics = computeMetrics(spec, nodes, avgCoverage);
    allMetrics.push({ name: spec.name, metrics });
  } catch (e) {
    // Fixed: separator between spec name and error was missing.
    console.error(`FAILED: ${spec.name}: ${e instanceof Error ? e.message : String(e)}`);
    // A failed spec contributes worst-case metrics so it drags the score down
    // instead of being silently dropped from the averages.
    allMetrics.push({
      name: spec.name,
      metrics: { recall: 0, typeAccuracy: 0, coverage: 0, linkPrecision: 0, resDRate: 1, orphanRate: 1, hierCoverage: 0, maxDegree: 0, nodeCount: 0 },
    });
  }
}

// Aggregates (mean() is 0-safe if GOLD_SPECS is ever empty).
const avgRecall = mean(allMetrics.map(m => m.metrics.recall));
const avgTypeAcc = mean(allMetrics.map(m => m.metrics.typeAccuracy));
const avgCoverage = mean(allMetrics.map(m => m.metrics.coverage));
const avgDRate = mean(allMetrics.map(m => m.metrics.resDRate));
const avgHier = mean(allMetrics.map(m => m.metrics.hierCoverage));
const avgOrphan = mean(allMetrics.map(m => m.metrics.orphanRate));
const score = compositeScore(avgRecall, avgTypeAcc, avgCoverage, avgDRate, avgHier);

if (jsonMode) {
  console.log(JSON.stringify({ score, avgRecall, avgTypeAcc, avgCoverage, avgDRate, avgHier, avgOrphan, perSpec: allMetrics }, null, 2));
} else {
  // ASCII table. Column widths: 19 | 8 | 9 | 7 | 7 | 7 | 7 (inside the ║ rails).
  console.log('\n╔═══════════════════════════════════════════════════════════════════════╗');
  console.log('║ PHOENIX CANONICALIZATION — EXPERIMENT EVAL                            ║');
  console.log('╠═══════════════════════════════════════════════════════════════════════╣');
  console.log('║ Spec              │ Recall │ TypeAcc │ Cover │ ResD% │ Hier% │ Nodes ║');
  console.log('╠═══════════════════╪════════╪═════════╪═══════╪═══════╪═══════╪═══════╣');

  for (const { name, metrics: m } of allMetrics) {
    const n = name.padEnd(18);
    const recall = (m.recall * 100).toFixed(0).padStart(5) + '%';
    const type = (m.typeAccuracy * 100).toFixed(0).padStart(6) + '%';
    const cov = m.coverage.toFixed(0).padStart(4) + '%';
    const resD = (m.resDRate * 100).toFixed(0).padStart(4) + '%';
    const hier = (m.hierCoverage * 100).toFixed(0).padStart(4) + '%';
    const nodeCount = String(m.nodeCount).padStart(5);
    // Fixed: data rows now carry the same ║/│ separators as the header rows,
    // so columns actually line up.
    console.log(`║ ${n}│ ${recall} │ ${type} │ ${cov} │ ${resD} │ ${hier} │ ${nodeCount} ║`);
  }

  console.log('╠═══════════════════╪════════╪═════════╪═══════╪═══════╪═══════╪═══════╣');
  const avgR = (avgRecall * 100).toFixed(0).padStart(5) + '%';
  const avgT = (avgTypeAcc * 100).toFixed(0).padStart(6) + '%';
  const avgC = avgCoverage.toFixed(0).padStart(4) + '%';
  const avgD = (avgDRate * 100).toFixed(0).padStart(4) + '%';
  const avgH = (avgHier * 100).toFixed(0).padStart(4) + '%';
  // Fixed: AVERAGE row was printed without rails/separators; Nodes column is
  // intentionally blank (a node-count average is not meaningful here).
  console.log(`║ ${'AVERAGE'.padEnd(18)}│ ${avgR} │ ${avgT} │ ${avgC} │ ${avgD} │ ${avgH} │       ║`);
  console.log('╚═══════════════════════════════════════════════════════════════════════╝');
  console.log(`\n  COMPOSITE SCORE: ${score.toFixed(4)}`);
  console.log(`  Formula: 0.30·recall + 0.25·typeAcc + 0.20·coverage + 0.15·(1-dRate) + 0.10·hier`);
  console.log(`\n  Targets: Recall ≥95%, TypeAcc ≥90%, Coverage ≥95%, ResD ≤20%, Hier ≥50%`);
}

// ─── Append to results.tsv ──────────────────────────────────────────────────

if (!noLog) {
  const timestamp = new Date().toISOString();
  const header = 'timestamp\tscore\trecall\ttype_acc\tcoverage\td_rate\thier\torphan\tconfig_hash';

  if (!existsSync(RESULTS_FILE)) {
    appendFileSync(RESULTS_FILE, header + '\n');
  }

  // Simple 32-bit string hash (djb2-style) of the serialized config, for
  // dedup detection — not cryptographic.
  const configStr = JSON.stringify(CONFIG);
  let hash = 0;
  for (let i = 0; i < configStr.length; i++) {
    hash = ((hash << 5) - hash + configStr.charCodeAt(i)) | 0;
  }
  const configHash = Math.abs(hash).toString(36);

  const row = [
    timestamp,
    score.toFixed(4),
    (avgRecall * 100).toFixed(1),
    (avgTypeAcc * 100).toFixed(1),
    avgCoverage.toFixed(1),
    (avgDRate * 100).toFixed(1),
    (avgHier * 100).toFixed(1),
    (avgOrphan * 100).toFixed(1),
    configHash,
  ].join('\t');

  appendFileSync(RESULTS_FILE, row + '\n');
  if (!jsonMode) {
    console.log(`\n  Results appended to experiments/results.tsv`);
  }
}

// Exit with score as a parseable last line (consumed by the experiment loop).
if (!jsonMode) {
  console.log(`\nval_score=${score.toFixed(4)}`);
}