Reference implementation for the Phoenix Architecture. Work in progress.
Source: aicoding.leaflet.pub/
/**
 * Resolution Engine — Phase 2 of canonicalization.
 *
 * Takes flat CandidateNode[] from extraction and produces
 * a structured CanonicalGraph with:
 *  - Deduplication / merge of equivalent candidates
 *  - Typed edge inference (constrains, defines, refines, etc.)
 *  - Hierarchy from heading structure
 *  - canon_anchor for stable soft identity
 *  - IDF-weighted linking via inverted index (replaces O(n²))
 */
12
13import type { CanonicalNode, CandidateNode } from './models/canonical.js';
14import type { Clause } from './models/clause.js';
15import type { EdgeType } from './models/canonical.js';
16import { CanonicalType } from './models/canonical.js';
17import { sha256 } from './semhash.js';
18import { CONFIG } from './experiment-config.js';
19
20// ─── Main entry point ────────────────────────────────────────────────────────
21
22/**
23 * Resolve candidate nodes into a canonical graph.
24 */
25export function resolveGraph(candidates: CandidateNode[], clauses: Clause[]): CanonicalNode[] {
26 if (candidates.length === 0) return [];
27
28 // Build clause index for hierarchy inference
29 const clauseMap = new Map(clauses.map(c => [c.clause_id, c]));
30
31 // Step 1: Convert candidates to draft nodes
32 let nodes = candidates.map(c => candidateToNode(c));
33
34 // Step 2: Deduplicate
35 nodes = deduplicateNodes(nodes);
36
37 // Step 3: Compute IDF over all tags
38 const idf = computeIDF(nodes);
39
40 // Step 4: Build inverted index and infer typed edges
41 inferTypedEdges(nodes, idf);
42
43 // Step 5: Infer hierarchy from heading structure
44 inferHierarchy(nodes, clauseMap);
45
46 // Step 6: Compute anchors
47 computeAnchors(nodes);
48
49 // Step 7: Enforce max degree
50 enforceMaxDegree(nodes, idf);
51
52 return nodes;
53}
54
55// ─── Step 1: Convert ─────────────────────────────────────────────────────────
56
57function candidateToNode(c: CandidateNode): CanonicalNode {
58 return {
59 canon_id: c.candidate_id,
60 type: c.type,
61 statement: c.statement,
62 confidence: c.confidence,
63 source_clause_ids: [...c.source_clause_ids],
64 linked_canon_ids: [],
65 link_types: {},
66 tags: [...c.tags],
67 extraction_method: c.extraction_method,
68 };
69}
70
71// ─── Step 2: Deduplication ───────────────────────────────────────────────────
72
function deduplicateNodes(nodes: CanonicalNode[]): CanonicalNode[] {
  // Group by normalized statement fingerprint (token trigrams).
  // Bucketing confines the pairwise Jaccard comparison below to small
  // fingerprint groups instead of the full O(n²) node set.
  const groups = new Map<string, CanonicalNode[]>();

  for (const node of nodes) {
    const fp = statementFingerprint(node.statement);
    const group = groups.get(fp) ?? [];
    group.push(node);
    groups.set(fp, group);
  }

  const merged: CanonicalNode[] = [];

  for (const group of groups.values()) {
    if (group.length === 1) {
      merged.push(group[0]);
      continue;
    }

    // Check pairwise similarity within fingerprint group.
    // Greedy scheme: each not-yet-absorbed node i absorbs every later node j
    // that is similar enough and type-compatible; absorbed indices are
    // recorded in `used` and never revisited.
    const used = new Set<number>();
    for (let i = 0; i < group.length; i++) {
      if (used.has(i)) continue;

      let primary = group[i];
      // Running union of clause ids / tags across everything merged here.
      const mergedSources = new Set(primary.source_clause_ids);
      const mergedTags = new Set(primary.tags);

      for (let j = i + 1; j < group.length; j++) {
        if (used.has(j)) continue;
        // NOTE: similarity is measured against the *current* primary, which
        // may have been swapped below — later j's compare to the new one.
        const sim = tokenJaccard(primary.statement, group[j].statement);
        if (sim > CONFIG.JACCARD_DEDUP_THRESHOLD && areTypesCompatible(primary.type, group[j].type)) {
          // Merge: keep higher confidence node as primary
          used.add(j);
          for (const s of group[j].source_clause_ids) mergedSources.add(s);
          for (const t of group[j].tags) mergedTags.add(t);

          if ((group[j].confidence ?? 0) > (primary.confidence ?? 0)) {
            // Promote j to primary, carrying the accumulated unions along.
            const oldSources = mergedSources;
            primary = { ...group[j], source_clause_ids: [...oldSources], tags: [...mergedTags] };
          }
        }
      }

      // Final write-back: sources/tags reflect all merges, including any
      // that happened after a primary swap.
      primary.source_clause_ids = [...mergedSources];
      primary.tags = [...mergedTags];
      merged.push(primary);
    }
  }

  return merged;
}
125
126function statementFingerprint(statement: string): string {
127 // Coarse fingerprint: sorted 3-char token prefixes
128 const tokens = statement.toLowerCase().split(/\s+/).filter(t => t.length > 2);
129 const prefixes = tokens.map(t => t.slice(0, 3)).sort();
130 return prefixes.slice(0, CONFIG.FINGERPRINT_PREFIX_COUNT).join('|');
131}
132
133function tokenJaccard(a: string, b: string): number {
134 const ta = new Set(a.toLowerCase().split(/\s+/));
135 const tb = new Set(b.toLowerCase().split(/\s+/));
136 let intersection = 0;
137 for (const t of ta) if (tb.has(t)) intersection++;
138 const union = ta.size + tb.size - intersection;
139 return union > 0 ? intersection / union : 0;
140}
141
142function areTypesCompatible(a: CanonicalType, b: CanonicalType): boolean {
143 if (a === b) return true;
144 if (a === CanonicalType.CONTEXT || b === CanonicalType.CONTEXT) return true;
145 return false;
146}
147
148// ─── Step 3: IDF computation ─────────────────────────────────────────────────
149
150function computeIDF(nodes: CanonicalNode[]): Map<string, number> {
151 const docFreq = new Map<string, number>();
152 const N = nodes.length;
153
154 for (const node of nodes) {
155 const uniqueTags = new Set(node.tags);
156 for (const tag of uniqueTags) {
157 docFreq.set(tag, (docFreq.get(tag) ?? 0) + 1);
158 }
159 }
160
161 const idf = new Map<string, number>();
162 for (const [tag, df] of docFreq) {
163 idf.set(tag, Math.log((N + 1) / (df + 1)) + 1);
164 }
165
166 return idf;
167}
168
169// ─── Step 4: Typed edge inference ────────────────────────────────────────────
170
function inferTypedEdges(nodes: CanonicalNode[], idf: Map<string, number>): void {
  // Build inverted index: tag → node indices.
  // Candidate pairs are then generated per shared tag, so the work is
  // bounded by the sum of squared tag document frequencies rather than
  // n² over all node pairs.
  const tagIndex = new Map<string, number[]>();
  for (let i = 0; i < nodes.length; i++) {
    for (const tag of nodes[i].tags) {
      const list = tagIndex.get(tag) ?? [];
      list.push(i);
      tagIndex.set(tag, list);
    }
  }

  // Compute IDF threshold: only skip tags appearing in >40% of nodes.
  // IDF for a tag in 40% of N nodes ≈ log(N / (0.4*N)) + 1 ≈ log(2.5) + 1 ≈ 1.92
  // We use a hard threshold based on document frequency, not percentile.
  // (The 40% figure assumes CONFIG.DOC_FREQ_CUTOFF ≈ 0.4 — confirm in config.)
  const N = nodes.length;
  const maxDF = Math.max(2, Math.floor(N * CONFIG.DOC_FREQ_CUTOFF));
  const idfThreshold = Math.log((N + 1) / (maxDF + 1)) + 1;

  // Generate candidate pairs from inverted index.
  // Keyed by "min:max" node index so the same pair accumulates across tags.
  const pairScores = new Map<string, { i: number; j: number; sharedNonTrivial: number; sharedTags: string[] }>();

  for (const [tag, indices] of tagIndex) {
    for (let a = 0; a < indices.length; a++) {
      for (let b = a + 1; b < indices.length; b++) {
        const i = indices[a];
        const j = indices[b];
        const key = i < j ? `${i}:${j}` : `${j}:${i}`;
        const entry = pairScores.get(key) ?? { i: Math.min(i, j), j: Math.max(i, j), sharedNonTrivial: 0, sharedTags: [] };
        const tagIdf = idf.get(tag) ?? 0;
        // Only sufficiently rare (high-IDF) tags count toward the
        // non-trivial quota; common tags still count toward the raw total.
        if (tagIdf > idfThreshold) {
          entry.sharedNonTrivial++;
        }
        entry.sharedTags.push(tag);
        pairScores.set(key, entry);
      }
    }
  }

  // Create edges for pairs with enough shared tags (at least MIN_SHARED_TAGS total,
  // and at least 1 non-trivial tag)
  for (const { i, j, sharedNonTrivial, sharedTags } of pairScores.values()) {
    if (sharedTags.length < CONFIG.MIN_SHARED_TAGS || sharedNonTrivial < 1) continue;

    const nodeA = nodes[i];
    const nodeB = nodes[j];

    // Skip linking canon→canon within same canon_id
    if (nodeA.canon_id === nodeB.canon_id) continue;

    // Infer edge type from the pair's node types and tag overlap.
    const edgeType = inferEdgeType(nodeA, nodeB);

    // Add bidirectional link (reverse type is symmetric in this model).
    addEdge(nodeA, nodeB, edgeType);
    addEdge(nodeB, nodeA, reverseEdgeType(edgeType));
  }
}
228
229function inferEdgeType(from: CanonicalNode, to: CanonicalNode): EdgeType {
230 const types = new Set([from.type, to.type]);
231
232 // Definition ↔ anything = defines
233 if (types.has(CanonicalType.DEFINITION)) return 'defines';
234
235 // Constraint ↔ Requirement = constrains
236 if (types.has(CanonicalType.CONSTRAINT) && types.has(CanonicalType.REQUIREMENT)) return 'constrains';
237
238 // Invariant ↔ Requirement = invariant_of
239 if (types.has(CanonicalType.INVARIANT) && types.has(CanonicalType.REQUIREMENT)) return 'invariant_of';
240
241 // Invariant ↔ Constraint = constrains (invariants constrain constraints)
242 if (types.has(CanonicalType.INVARIANT) && types.has(CanonicalType.CONSTRAINT)) return 'constrains';
243
244 // Context ↔ any non-context = refines (context provides framing)
245 if (types.has(CanonicalType.CONTEXT) && types.size > 1) return 'refines';
246
247 // Same-type pairs: use tag overlap to distinguish refines vs relates_to
248 if (from.type === to.type) {
249 const fromTags = new Set(from.tags);
250 const toTags = new Set(to.tags);
251 let shared = 0;
252 for (const t of fromTags) if (toTags.has(t)) shared++;
253 const smaller = Math.min(fromTags.size, toTags.size);
254
255 // If one node's tags are largely a subset of the other's, the smaller
256 // one refines a broader concept — that's a refinement relationship
257 if (smaller > 0 && shared / smaller >= CONFIG.SAME_TYPE_REFINE_THRESHOLD) return 'refines';
258 }
259
260 return 'relates_to';
261}
262
263function reverseEdgeType(type: EdgeType): EdgeType {
264 // Most edge types are symmetric in our model
265 return type;
266}
267
268function addEdge(from: CanonicalNode, to: CanonicalNode, type: EdgeType): void {
269 if (!from.linked_canon_ids.includes(to.canon_id)) {
270 from.linked_canon_ids.push(to.canon_id);
271 }
272 if (!from.link_types) from.link_types = {};
273 from.link_types[to.canon_id] = type;
274}
275
276// ─── Step 5: Hierarchy inference ─────────────────────────────────────────────
277
278function inferHierarchy(nodes: CanonicalNode[], clauseMap: Map<string, Clause>): void {
279 // Group nodes by source document
280 const byDoc = new Map<string, CanonicalNode[]>();
281 for (const node of nodes) {
282 for (const clauseId of node.source_clause_ids) {
283 const clause = clauseMap.get(clauseId);
284 if (!clause) continue;
285 const docId = clause.source_doc_id;
286 const list = byDoc.get(docId) ?? [];
287 list.push(node);
288 byDoc.set(docId, list);
289 }
290 }
291
292 for (const docNodes of byDoc.values()) {
293 // Build entries with section depth for all nodes
294 const entries: { node: CanonicalNode; depth: number; sectionPath: string[] }[] = [];
295
296 for (const node of docNodes) {
297 const clause = clauseMap.get(node.source_clause_ids[0]);
298 if (!clause) continue;
299 entries.push({ node, depth: clause.section_path.length, sectionPath: clause.section_path });
300 }
301
302 // For each node, find the nearest parent at a shallower section depth.
303 // Prefer CONTEXT parents, but fall back to any type if no CONTEXT exists.
304 for (const child of entries) {
305 if (child.depth === 0) continue; // top-level nodes have no parent
306
307 let bestParent: CanonicalNode | null = null;
308 let bestDepth = -1;
309 let bestIsContext = false;
310
311 for (const candidate of entries) {
312 if (candidate.node.canon_id === child.node.canon_id) continue;
313 if (candidate.depth >= child.depth) continue;
314 if (candidate.depth <= bestDepth) continue;
315
316 const prefixMatch = candidate.sectionPath.every((seg, i) => child.sectionPath[i] === seg);
317 if (!prefixMatch) continue;
318
319 const isContext = candidate.node.type === CanonicalType.CONTEXT;
320 // Prefer CONTEXT parents at the same depth, but accept any type
321 if (candidate.depth > bestDepth || (isContext && !bestIsContext)) {
322 bestParent = candidate.node;
323 bestDepth = candidate.depth;
324 bestIsContext = isContext;
325 }
326 }
327
328 if (bestParent) {
329 child.node.parent_canon_id = bestParent.canon_id;
330 }
331 }
332 }
333}
334
335// ─── Step 6: Anchor computation ──────────────────────────────────────────────
336
337function computeAnchors(nodes: CanonicalNode[]): void {
338 for (const node of nodes) {
339 const sortedTags = [...node.tags].sort().join(',');
340 const sortedSources = [...node.source_clause_ids].sort().join(',');
341 node.canon_anchor = sha256([node.type, sortedTags, sortedSources].join('\x00'));
342 }
343}
344
345// ─── Step 7: Enforce max degree ──────────────────────────────────────────────
346
347function enforceMaxDegree(nodes: CanonicalNode[], idf: Map<string, number>): void {
348 for (const node of nodes) {
349 // Count non-duplicate edges
350 const edges = node.linked_canon_ids.filter(
351 id => node.link_types?.[id] !== 'duplicates'
352 );
353
354 if (edges.length <= CONFIG.MAX_DEGREE) continue;
355
356 // Score each edge by shared tag IDF
357 const nodeTagSet = new Set(node.tags);
358 const edgeScores: { id: string; score: number }[] = [];
359
360 const nodeIndex = new Map(nodes.map(n => [n.canon_id, n]));
361
362 for (const id of edges) {
363 const target = nodeIndex.get(id);
364 if (!target) { edgeScores.push({ id, score: 0 }); continue; }
365
366 let score = 0;
367 for (const tag of target.tags) {
368 if (nodeTagSet.has(tag)) {
369 score += idf.get(tag) ?? 0;
370 }
371 }
372 edgeScores.push({ id, score });
373 }
374
375 edgeScores.sort((a, b) => b.score - a.score);
376 const keep = new Set(edgeScores.slice(0, CONFIG.MAX_DEGREE).map(e => e.id));
377
378 // Also keep all 'duplicates' edges
379 for (const id of node.linked_canon_ids) {
380 if (node.link_types?.[id] === 'duplicates') keep.add(id);
381 }
382
383 // Filter
384 node.linked_canon_ids = node.linked_canon_ids.filter(id => keep.has(id));
385 if (node.link_types) {
386 for (const id of Object.keys(node.link_types)) {
387 if (!keep.has(id)) delete node.link_types[id];
388 }
389 }
390 }
391}