Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
at main 391 lines 14 kB view raw
1/** 2 * Resolution Engine — Phase 2 of canonicalization. 3 * 4 * Takes flat CandidateNode[] from extraction and produces 5 * a structured CanonicalGraph with: 6 * - Deduplication / merge of equivalent candidates 7 * - Typed edge inference (constrains, defines, refines, etc.) 8 * - Hierarchy from heading structure 9 * - canon_anchor for stable soft identity 10 * - IDF-weighted linking via inverted index (replaces O(n²)) 11 */ 12 13import type { CanonicalNode, CandidateNode } from './models/canonical.js'; 14import type { Clause } from './models/clause.js'; 15import type { EdgeType } from './models/canonical.js'; 16import { CanonicalType } from './models/canonical.js'; 17import { sha256 } from './semhash.js'; 18import { CONFIG } from './experiment-config.js'; 19 20// ─── Main entry point ──────────────────────────────────────────────────────── 21 22/** 23 * Resolve candidate nodes into a canonical graph. 24 */ 25export function resolveGraph(candidates: CandidateNode[], clauses: Clause[]): CanonicalNode[] { 26 if (candidates.length === 0) return []; 27 28 // Build clause index for hierarchy inference 29 const clauseMap = new Map(clauses.map(c => [c.clause_id, c])); 30 31 // Step 1: Convert candidates to draft nodes 32 let nodes = candidates.map(c => candidateToNode(c)); 33 34 // Step 2: Deduplicate 35 nodes = deduplicateNodes(nodes); 36 37 // Step 3: Compute IDF over all tags 38 const idf = computeIDF(nodes); 39 40 // Step 4: Build inverted index and infer typed edges 41 inferTypedEdges(nodes, idf); 42 43 // Step 5: Infer hierarchy from heading structure 44 inferHierarchy(nodes, clauseMap); 45 46 // Step 6: Compute anchors 47 computeAnchors(nodes); 48 49 // Step 7: Enforce max degree 50 enforceMaxDegree(nodes, idf); 51 52 return nodes; 53} 54 55// ─── Step 1: Convert ───────────────────────────────────────────────────────── 56 57function candidateToNode(c: CandidateNode): CanonicalNode { 58 return { 59 canon_id: c.candidate_id, 60 type: c.type, 61 statement: c.statement, 62 confidence: c.confidence, 63 source_clause_ids: [...c.source_clause_ids], 64 linked_canon_ids: [], 65 link_types: {}, 66 tags: [...c.tags], 67 extraction_method: c.extraction_method, 68 }; 69} 70 71// ─── Step 2: Deduplication ─────────────────────────────────────────────────── 72 73function deduplicateNodes(nodes: CanonicalNode[]): CanonicalNode[] { 74 // Group by normalized statement fingerprint (token trigrams) 75 const groups = new Map<string, CanonicalNode[]>(); 76 77 for (const node of nodes) { 78 const fp = statementFingerprint(node.statement); 79 const group = groups.get(fp) ?? []; 80 group.push(node); 81 groups.set(fp, group); 82 } 83 84 const merged: CanonicalNode[] = []; 85 86 for (const group of groups.values()) { 87 if (group.length === 1) { 88 merged.push(group[0]); 89 continue; 90 } 91 92 // Check pairwise similarity within fingerprint group 93 const used = new Set<number>(); 94 for (let i = 0; i < group.length; i++) { 95 if (used.has(i)) continue; 96 97 let primary = group[i]; 98 const mergedSources = new Set(primary.source_clause_ids); 99 const mergedTags = new Set(primary.tags); 100 101 for (let j = i + 1; j < group.length; j++) { 102 if (used.has(j)) continue; 103 const sim = tokenJaccard(primary.statement, group[j].statement); 104 if (sim > CONFIG.JACCARD_DEDUP_THRESHOLD && areTypesCompatible(primary.type, group[j].type)) { 105 // Merge: keep higher confidence node as primary 106 used.add(j); 107 for (const s of group[j].source_clause_ids) mergedSources.add(s); 108 for (const t of group[j].tags) mergedTags.add(t); 109 110 if ((group[j].confidence ?? 0) > (primary.confidence ?? 0)) { 111 const oldSources = mergedSources; 112 primary = { ...group[j], source_clause_ids: [...oldSources], tags: [...mergedTags] }; 113 } 114 } 115 } 116 117 primary.source_clause_ids = [...mergedSources]; 118 primary.tags = [...mergedTags]; 119 merged.push(primary); 120 } 121 } 122 123 return merged; 124} 125 126function statementFingerprint(statement: string): string { 127 // Coarse fingerprint: sorted 3-char token prefixes 128 const tokens = statement.toLowerCase().split(/\s+/).filter(t => t.length > 2); 129 const prefixes = tokens.map(t => t.slice(0, 3)).sort(); 130 return prefixes.slice(0, CONFIG.FINGERPRINT_PREFIX_COUNT).join('|'); 131} 132 133function tokenJaccard(a: string, b: string): number { 134 const ta = new Set(a.toLowerCase().split(/\s+/)); 135 const tb = new Set(b.toLowerCase().split(/\s+/)); 136 let intersection = 0; 137 for (const t of ta) if (tb.has(t)) intersection++; 138 const union = ta.size + tb.size - intersection; 139 return union > 0 ? intersection / union : 0; 140} 141 142function areTypesCompatible(a: CanonicalType, b: CanonicalType): boolean { 143 if (a === b) return true; 144 if (a === CanonicalType.CONTEXT || b === CanonicalType.CONTEXT) return true; 145 return false; 146} 147 148// ─── Step 3: IDF computation ───────────────────────────────────────────────── 149 150function computeIDF(nodes: CanonicalNode[]): Map<string, number> { 151 const docFreq = new Map<string, number>(); 152 const N = nodes.length; 153 154 for (const node of nodes) { 155 const uniqueTags = new Set(node.tags); 156 for (const tag of uniqueTags) { 157 docFreq.set(tag, (docFreq.get(tag) ?? 0) + 1); 158 } 159 } 160 161 const idf = new Map<string, number>(); 162 for (const [tag, df] of docFreq) { 163 idf.set(tag, Math.log((N + 1) / (df + 1)) + 1); 164 } 165 166 return idf; 167} 168 169// ─── Step 4: Typed edge inference ──────────────────────────────────────────── 170 171function inferTypedEdges(nodes: CanonicalNode[], idf: Map<string, number>): void { 172 // Build inverted index: tag → node indices 173 const tagIndex = new Map<string, number[]>(); 174 for (let i = 0; i < nodes.length; i++) { 175 for (const tag of nodes[i].tags) { 176 const list = tagIndex.get(tag) ?? []; 177 list.push(i); 178 tagIndex.set(tag, list); 179 } 180 } 181 182 // Compute IDF threshold: only skip tags appearing in >40% of nodes. 183 // IDF for a tag in 40% of N nodes ≈ log(N / (0.4*N)) + 1 ≈ log(2.5) + 1 ≈ 1.92 184 // We use a hard threshold based on document frequency, not percentile. 185 const N = nodes.length; 186 const maxDF = Math.max(2, Math.floor(N * CONFIG.DOC_FREQ_CUTOFF)); 187 const idfThreshold = Math.log((N + 1) / (maxDF + 1)) + 1; 188 189 // Generate candidate pairs from inverted index 190 const pairScores = new Map<string, { i: number; j: number; sharedNonTrivial: number; sharedTags: string[] }>(); 191 192 for (const [tag, indices] of tagIndex) { 193 for (let a = 0; a < indices.length; a++) { 194 for (let b = a + 1; b < indices.length; b++) { 195 const i = indices[a]; 196 const j = indices[b]; 197 const key = i < j ? `${i}:${j}` : `${j}:${i}`; 198 const entry = pairScores.get(key) ?? { i: Math.min(i, j), j: Math.max(i, j), sharedNonTrivial: 0, sharedTags: [] }; 199 const tagIdf = idf.get(tag) ?? 0; 200 if (tagIdf > idfThreshold) { 201 entry.sharedNonTrivial++; 202 } 203 entry.sharedTags.push(tag); 204 pairScores.set(key, entry); 205 } 206 } 207 } 208 209 // Create edges for pairs with enough shared tags (at least MIN_SHARED_TAGS total, 210 // and at least 1 non-trivial tag) 211 for (const { i, j, sharedNonTrivial, sharedTags } of pairScores.values()) { 212 if (sharedTags.length < CONFIG.MIN_SHARED_TAGS || sharedNonTrivial < 1) continue; 213 214 const nodeA = nodes[i]; 215 const nodeB = nodes[j]; 216 217 // Skip linking canon→canon within same canon_id 218 if (nodeA.canon_id === nodeB.canon_id) continue; 219 220 // Infer edge type 221 const edgeType = inferEdgeType(nodeA, nodeB); 222 223 // Add bidirectional link 224 addEdge(nodeA, nodeB, edgeType); 225 addEdge(nodeB, nodeA, reverseEdgeType(edgeType)); 226 } 227} 228 229function inferEdgeType(from: CanonicalNode, to: CanonicalNode): EdgeType { 230 const types = new Set([from.type, to.type]); 231 232 // Definition ↔ anything = defines 233 if (types.has(CanonicalType.DEFINITION)) return 'defines'; 234 235 // Constraint ↔ Requirement = constrains 236 if (types.has(CanonicalType.CONSTRAINT) && types.has(CanonicalType.REQUIREMENT)) return 'constrains'; 237 238 // Invariant ↔ Requirement = invariant_of 239 if (types.has(CanonicalType.INVARIANT) && types.has(CanonicalType.REQUIREMENT)) return 'invariant_of'; 240 241 // Invariant ↔ Constraint = constrains (invariants constrain constraints) 242 if (types.has(CanonicalType.INVARIANT) && types.has(CanonicalType.CONSTRAINT)) return 'constrains'; 243 244 // Context ↔ any non-context = refines (context provides framing) 245 if (types.has(CanonicalType.CONTEXT) && types.size > 1) return 'refines'; 246 247 // Same-type pairs: use tag overlap to distinguish refines vs relates_to 248 if (from.type === to.type) { 249 const fromTags = new Set(from.tags); 250 const toTags = new Set(to.tags); 251 let shared = 0; 252 for (const t of fromTags) if (toTags.has(t)) shared++; 253 const smaller = Math.min(fromTags.size, toTags.size); 254 255 // If one node's tags are largely a subset of the other's, the smaller 256 // one refines a broader concept — that's a refinement relationship 257 if (smaller > 0 && shared / smaller >= CONFIG.SAME_TYPE_REFINE_THRESHOLD) return 'refines'; 258 } 259 260 return 'relates_to'; 261} 262 263function reverseEdgeType(type: EdgeType): EdgeType { 264 // Most edge types are symmetric in our model 265 return type; 266} 267 268function addEdge(from: CanonicalNode, to: CanonicalNode, type: EdgeType): void { 269 if (!from.linked_canon_ids.includes(to.canon_id)) { 270 from.linked_canon_ids.push(to.canon_id); 271 } 272 if (!from.link_types) from.link_types = {}; 273 from.link_types[to.canon_id] = type; 274} 275 276// ─── Step 5: Hierarchy inference ───────────────────────────────────────────── 277 278function inferHierarchy(nodes: CanonicalNode[], clauseMap: Map<string, Clause>): void { 279 // Group nodes by source document 280 const byDoc = new Map<string, CanonicalNode[]>(); 281 for (const node of nodes) { 282 for (const clauseId of node.source_clause_ids) { 283 const clause = clauseMap.get(clauseId); 284 if (!clause) continue; 285 const docId = clause.source_doc_id; 286 const list = byDoc.get(docId) ?? []; 287 list.push(node); 288 byDoc.set(docId, list); 289 } 290 } 291 292 for (const docNodes of byDoc.values()) { 293 // Build entries with section depth for all nodes 294 const entries: { node: CanonicalNode; depth: number; sectionPath: string[] }[] = []; 295 296 for (const node of docNodes) { 297 const clause = clauseMap.get(node.source_clause_ids[0]); 298 if (!clause) continue; 299 entries.push({ node, depth: clause.section_path.length, sectionPath: clause.section_path }); 300 } 301 302 // For each node, find the nearest parent at a shallower section depth. 303 // Prefer CONTEXT parents, but fall back to any type if no CONTEXT exists. 304 for (const child of entries) { 305 if (child.depth === 0) continue; // top-level nodes have no parent 306 307 let bestParent: CanonicalNode | null = null; 308 let bestDepth = -1; 309 let bestIsContext = false; 310 311 for (const candidate of entries) { 312 if (candidate.node.canon_id === child.node.canon_id) continue; 313 if (candidate.depth >= child.depth) continue; 314 if (candidate.depth <= bestDepth) continue; 315 316 const prefixMatch = candidate.sectionPath.every((seg, i) => child.sectionPath[i] === seg); 317 if (!prefixMatch) continue; 318 319 const isContext = candidate.node.type === CanonicalType.CONTEXT; 320 // Prefer CONTEXT parents at the same depth, but accept any type 321 if (candidate.depth > bestDepth || (isContext && !bestIsContext)) { 322 bestParent = candidate.node; 323 bestDepth = candidate.depth; 324 bestIsContext = isContext; 325 } 326 } 327 328 if (bestParent) { 329 child.node.parent_canon_id = bestParent.canon_id; 330 } 331 } 332 } 333} 334 335// ─── Step 6: Anchor computation ────────────────────────────────────────────── 336 337function computeAnchors(nodes: CanonicalNode[]): void { 338 for (const node of nodes) { 339 const sortedTags = [...node.tags].sort().join(','); 340 const sortedSources = [...node.source_clause_ids].sort().join(','); 341 node.canon_anchor = sha256([node.type, sortedTags, sortedSources].join('\x00')); 342 } 343} 344 345// ─── Step 7: Enforce max degree ────────────────────────────────────────────── 346 347function enforceMaxDegree(nodes: CanonicalNode[], idf: Map<string, number>): void { 348 for (const node of nodes) { 349 // Count non-duplicate edges 350 const edges = node.linked_canon_ids.filter( 351 id => node.link_types?.[id] !== 'duplicates' 352 ); 353 354 if (edges.length <= CONFIG.MAX_DEGREE) continue; 355 356 // Score each edge by shared tag IDF 357 const nodeTagSet = new Set(node.tags); 358 const edgeScores: { id: string; score: number }[] = []; 359 360 const nodeIndex = new Map(nodes.map(n => [n.canon_id, n])); 361 362 for (const id of edges) { 363 const target = nodeIndex.get(id); 364 if (!target) { edgeScores.push({ id, score: 0 }); continue; } 365 366 let score = 0; 367 for (const tag of target.tags) { 368 if (nodeTagSet.has(tag)) { 369 score += idf.get(tag) ?? 0; 370 } 371 } 372 edgeScores.push({ id, score }); 373 } 374 375 edgeScores.sort((a, b) => b.score - a.score); 376 const keep = new Set(edgeScores.slice(0, CONFIG.MAX_DEGREE).map(e => e.id)); 377 378 // Also keep all 'duplicates' edges 379 for (const id of node.linked_canon_ids) { 380 if (node.link_types?.[id] === 'duplicates') keep.add(id); 381 } 382 383 // Filter 384 node.linked_canon_ids = node.linked_canon_ids.filter(id => keep.has(id)); 385 if (node.link_types) { 386 for (const id of Object.keys(node.link_types)) { 387 if (!keep.has(id)) delete node.link_types[id]; 388 } 389 } 390 } 391}