src/resolution.ts at main · chadfowler.com/phoenix

chadfowler.com / phoenix
fork atom
Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
fork atom
phoenix / src / resolution.ts
at main 391 lines 14 kB view raw
wrap content
Chad Fowler fix: hierarchy inference, heading extraction, and gold standard accuracy 3d ago
e5a28949
  1/**
  2 * Resolution Engine — Phase 2 of canonicalization.
  3 *
  4 * Takes flat CandidateNode[] from extraction and produces
  5 * a structured CanonicalGraph with:
  6 * - Deduplication / merge of equivalent candidates
  7 * - Typed edge inference (constrains, defines, refines, etc.)
  8 * - Hierarchy from heading structure
  9 * - canon_anchor for stable soft identity
 10 * - IDF-weighted linking via inverted index (replaces O(n²))
 11 */
 12
 13import type { CanonicalNode, CandidateNode } from './models/canonical.js';
 14import type { Clause } from './models/clause.js';
 15import type { EdgeType } from './models/canonical.js';
 16import { CanonicalType } from './models/canonical.js';
 17import { sha256 } from './semhash.js';
 18import { CONFIG } from './experiment-config.js';
 19
 20// ─── Main entry point ────────────────────────────────────────────────────────
 21
 22/**
 23 * Resolve candidate nodes into a canonical graph.
 24 */
 25export function resolveGraph(candidates: CandidateNode[], clauses: Clause[]): CanonicalNode[] {
 26  if (candidates.length === 0) return [];
 27
 28  // Build clause index for hierarchy inference
 29  const clauseMap = new Map(clauses.map(c => [c.clause_id, c]));
 30
 31  // Step 1: Convert candidates to draft nodes
 32  let nodes = candidates.map(c => candidateToNode(c));
 33
 34  // Step 2: Deduplicate
 35  nodes = deduplicateNodes(nodes);
 36
 37  // Step 3: Compute IDF over all tags
 38  const idf = computeIDF(nodes);
 39
 40  // Step 4: Build inverted index and infer typed edges
 41  inferTypedEdges(nodes, idf);
 42
 43  // Step 5: Infer hierarchy from heading structure
 44  inferHierarchy(nodes, clauseMap);
 45
 46  // Step 6: Compute anchors
 47  computeAnchors(nodes);
 48
 49  // Step 7: Enforce max degree
 50  enforceMaxDegree(nodes, idf);
 51
 52  return nodes;
 53}
 54
 55// ─── Step 1: Convert ─────────────────────────────────────────────────────────
 56
 57function candidateToNode(c: CandidateNode): CanonicalNode {
 58  return {
 59    canon_id: c.candidate_id,
 60    type: c.type,
 61    statement: c.statement,
 62    confidence: c.confidence,
 63    source_clause_ids: [...c.source_clause_ids],
 64    linked_canon_ids: [],
 65    link_types: {},
 66    tags: [...c.tags],
 67    extraction_method: c.extraction_method,
 68  };
 69}
 70
 71// ─── Step 2: Deduplication ───────────────────────────────────────────────────
 72
 73function deduplicateNodes(nodes: CanonicalNode[]): CanonicalNode[] {
 74  // Group by normalized statement fingerprint (token trigrams)
 75  const groups = new Map<string, CanonicalNode[]>();
 76
 77  for (const node of nodes) {
 78    const fp = statementFingerprint(node.statement);
 79    const group = groups.get(fp) ?? [];
 80    group.push(node);
 81    groups.set(fp, group);
 82  }
 83
 84  const merged: CanonicalNode[] = [];
 85
 86  for (const group of groups.values()) {
 87    if (group.length === 1) {
 88      merged.push(group[0]);
 89      continue;
 90    }
 91
 92    // Check pairwise similarity within fingerprint group
 93    const used = new Set<number>();
 94    for (let i = 0; i < group.length; i++) {
 95      if (used.has(i)) continue;
 96
 97      let primary = group[i];
 98      const mergedSources = new Set(primary.source_clause_ids);
 99      const mergedTags = new Set(primary.tags);
100
101      for (let j = i + 1; j < group.length; j++) {
102        if (used.has(j)) continue;
103        const sim = tokenJaccard(primary.statement, group[j].statement);
104        if (sim > CONFIG.JACCARD_DEDUP_THRESHOLD && areTypesCompatible(primary.type, group[j].type)) {
105          // Merge: keep higher confidence node as primary
106          used.add(j);
107          for (const s of group[j].source_clause_ids) mergedSources.add(s);
108          for (const t of group[j].tags) mergedTags.add(t);
109
110          if ((group[j].confidence ?? 0) > (primary.confidence ?? 0)) {
111            const oldSources = mergedSources;
112            primary = { ...group[j], source_clause_ids: [...oldSources], tags: [...mergedTags] };
113          }
114        }
115      }
116
117      primary.source_clause_ids = [...mergedSources];
118      primary.tags = [...mergedTags];
119      merged.push(primary);
120    }
121  }
122
123  return merged;
124}
125
126function statementFingerprint(statement: string): string {
127  // Coarse fingerprint: sorted 3-char token prefixes
128  const tokens = statement.toLowerCase().split(/\s+/).filter(t => t.length > 2);
129  const prefixes = tokens.map(t => t.slice(0, 3)).sort();
130  return prefixes.slice(0, CONFIG.FINGERPRINT_PREFIX_COUNT).join('|');
131}
132
133function tokenJaccard(a: string, b: string): number {
134  const ta = new Set(a.toLowerCase().split(/\s+/));
135  const tb = new Set(b.toLowerCase().split(/\s+/));
136  let intersection = 0;
137  for (const t of ta) if (tb.has(t)) intersection++;
138  const union = ta.size + tb.size - intersection;
139  return union > 0 ? intersection / union : 0;
140}
141
142function areTypesCompatible(a: CanonicalType, b: CanonicalType): boolean {
143  if (a === b) return true;
144  if (a === CanonicalType.CONTEXT || b === CanonicalType.CONTEXT) return true;
145  return false;
146}
147
148// ─── Step 3: IDF computation ─────────────────────────────────────────────────
149
150function computeIDF(nodes: CanonicalNode[]): Map<string, number> {
151  const docFreq = new Map<string, number>();
152  const N = nodes.length;
153
154  for (const node of nodes) {
155    const uniqueTags = new Set(node.tags);
156    for (const tag of uniqueTags) {
157      docFreq.set(tag, (docFreq.get(tag) ?? 0) + 1);
158    }
159  }
160
161  const idf = new Map<string, number>();
162  for (const [tag, df] of docFreq) {
163    idf.set(tag, Math.log((N + 1) / (df + 1)) + 1);
164  }
165
166  return idf;
167}
168
169// ─── Step 4: Typed edge inference ────────────────────────────────────────────
170
171function inferTypedEdges(nodes: CanonicalNode[], idf: Map<string, number>): void {
172  // Build inverted index: tag → node indices
173  const tagIndex = new Map<string, number[]>();
174  for (let i = 0; i < nodes.length; i++) {
175    for (const tag of nodes[i].tags) {
176      const list = tagIndex.get(tag) ?? [];
177      list.push(i);
178      tagIndex.set(tag, list);
179    }
180  }
181
182  // Compute IDF threshold: only skip tags appearing in >40% of nodes.
183  // IDF for a tag in 40% of N nodes ≈ log(N / (0.4*N)) + 1 ≈ log(2.5) + 1 ≈ 1.92
184  // We use a hard threshold based on document frequency, not percentile.
185  const N = nodes.length;
186  const maxDF = Math.max(2, Math.floor(N * CONFIG.DOC_FREQ_CUTOFF));
187  const idfThreshold = Math.log((N + 1) / (maxDF + 1)) + 1;
188
189  // Generate candidate pairs from inverted index
190  const pairScores = new Map<string, { i: number; j: number; sharedNonTrivial: number; sharedTags: string[] }>();
191
192  for (const [tag, indices] of tagIndex) {
193    for (let a = 0; a < indices.length; a++) {
194      for (let b = a + 1; b < indices.length; b++) {
195        const i = indices[a];
196        const j = indices[b];
197        const key = i < j ? `${i}:${j}` : `${j}:${i}`;
198        const entry = pairScores.get(key) ?? { i: Math.min(i, j), j: Math.max(i, j), sharedNonTrivial: 0, sharedTags: [] };
199        const tagIdf = idf.get(tag) ?? 0;
200        if (tagIdf > idfThreshold) {
201          entry.sharedNonTrivial++;
202        }
203        entry.sharedTags.push(tag);
204        pairScores.set(key, entry);
205      }
206    }
207  }
208
209  // Create edges for pairs with enough shared tags (at least MIN_SHARED_TAGS total,
210  // and at least 1 non-trivial tag)
211  for (const { i, j, sharedNonTrivial, sharedTags } of pairScores.values()) {
212    if (sharedTags.length < CONFIG.MIN_SHARED_TAGS || sharedNonTrivial < 1) continue;
213
214    const nodeA = nodes[i];
215    const nodeB = nodes[j];
216
217    // Skip linking canon→canon within same canon_id
218    if (nodeA.canon_id === nodeB.canon_id) continue;
219
220    // Infer edge type
221    const edgeType = inferEdgeType(nodeA, nodeB);
222
223    // Add bidirectional link
224    addEdge(nodeA, nodeB, edgeType);
225    addEdge(nodeB, nodeA, reverseEdgeType(edgeType));
226  }
227}
228
229function inferEdgeType(from: CanonicalNode, to: CanonicalNode): EdgeType {
230  const types = new Set([from.type, to.type]);
231
232  // Definition ↔ anything = defines
233  if (types.has(CanonicalType.DEFINITION)) return 'defines';
234
235  // Constraint ↔ Requirement = constrains
236  if (types.has(CanonicalType.CONSTRAINT) && types.has(CanonicalType.REQUIREMENT)) return 'constrains';
237
238  // Invariant ↔ Requirement = invariant_of
239  if (types.has(CanonicalType.INVARIANT) && types.has(CanonicalType.REQUIREMENT)) return 'invariant_of';
240
241  // Invariant ↔ Constraint = constrains (invariants constrain constraints)
242  if (types.has(CanonicalType.INVARIANT) && types.has(CanonicalType.CONSTRAINT)) return 'constrains';
243
244  // Context ↔ any non-context = refines (context provides framing)
245  if (types.has(CanonicalType.CONTEXT) && types.size > 1) return 'refines';
246
247  // Same-type pairs: use tag overlap to distinguish refines vs relates_to
248  if (from.type === to.type) {
249    const fromTags = new Set(from.tags);
250    const toTags = new Set(to.tags);
251    let shared = 0;
252    for (const t of fromTags) if (toTags.has(t)) shared++;
253    const smaller = Math.min(fromTags.size, toTags.size);
254
255    // If one node's tags are largely a subset of the other's, the smaller
256    // one refines a broader concept — that's a refinement relationship
257    if (smaller > 0 && shared / smaller >= CONFIG.SAME_TYPE_REFINE_THRESHOLD) return 'refines';
258  }
259
260  return 'relates_to';
261}
262
263function reverseEdgeType(type: EdgeType): EdgeType {
264  // Most edge types are symmetric in our model
265  return type;
266}
267
268function addEdge(from: CanonicalNode, to: CanonicalNode, type: EdgeType): void {
269  if (!from.linked_canon_ids.includes(to.canon_id)) {
270    from.linked_canon_ids.push(to.canon_id);
271  }
272  if (!from.link_types) from.link_types = {};
273  from.link_types[to.canon_id] = type;
274}
275
276// ─── Step 5: Hierarchy inference ─────────────────────────────────────────────
277
278function inferHierarchy(nodes: CanonicalNode[], clauseMap: Map<string, Clause>): void {
279  // Group nodes by source document
280  const byDoc = new Map<string, CanonicalNode[]>();
281  for (const node of nodes) {
282    for (const clauseId of node.source_clause_ids) {
283      const clause = clauseMap.get(clauseId);
284      if (!clause) continue;
285      const docId = clause.source_doc_id;
286      const list = byDoc.get(docId) ?? [];
287      list.push(node);
288      byDoc.set(docId, list);
289    }
290  }
291
292  for (const docNodes of byDoc.values()) {
293    // Build entries with section depth for all nodes
294    const entries: { node: CanonicalNode; depth: number; sectionPath: string[] }[] = [];
295
296    for (const node of docNodes) {
297      const clause = clauseMap.get(node.source_clause_ids[0]);
298      if (!clause) continue;
299      entries.push({ node, depth: clause.section_path.length, sectionPath: clause.section_path });
300    }
301
302    // For each node, find the nearest parent at a shallower section depth.
303    // Prefer CONTEXT parents, but fall back to any type if no CONTEXT exists.
304    for (const child of entries) {
305      if (child.depth === 0) continue; // top-level nodes have no parent
306
307      let bestParent: CanonicalNode | null = null;
308      let bestDepth = -1;
309      let bestIsContext = false;
310
311      for (const candidate of entries) {
312        if (candidate.node.canon_id === child.node.canon_id) continue;
313        if (candidate.depth >= child.depth) continue;
314        if (candidate.depth <= bestDepth) continue;
315
316        const prefixMatch = candidate.sectionPath.every((seg, i) => child.sectionPath[i] === seg);
317        if (!prefixMatch) continue;
318
319        const isContext = candidate.node.type === CanonicalType.CONTEXT;
320        // Prefer CONTEXT parents at the same depth, but accept any type
321        if (candidate.depth > bestDepth || (isContext && !bestIsContext)) {
322          bestParent = candidate.node;
323          bestDepth = candidate.depth;
324          bestIsContext = isContext;
325        }
326      }
327
328      if (bestParent) {
329        child.node.parent_canon_id = bestParent.canon_id;
330      }
331    }
332  }
333}
334
335// ─── Step 6: Anchor computation ──────────────────────────────────────────────
336
337function computeAnchors(nodes: CanonicalNode[]): void {
338  for (const node of nodes) {
339    const sortedTags = [...node.tags].sort().join(',');
340    const sortedSources = [...node.source_clause_ids].sort().join(',');
341    node.canon_anchor = sha256([node.type, sortedTags, sortedSources].join('\x00'));
342  }
343}
344
345// ─── Step 7: Enforce max degree ──────────────────────────────────────────────
346
347function enforceMaxDegree(nodes: CanonicalNode[], idf: Map<string, number>): void {
348  for (const node of nodes) {
349    // Count non-duplicate edges
350    const edges = node.linked_canon_ids.filter(
351      id => node.link_types?.[id] !== 'duplicates'
352    );
353
354    if (edges.length <= CONFIG.MAX_DEGREE) continue;
355
356    // Score each edge by shared tag IDF
357    const nodeTagSet = new Set(node.tags);
358    const edgeScores: { id: string; score: number }[] = [];
359
360    const nodeIndex = new Map(nodes.map(n => [n.canon_id, n]));
361
362    for (const id of edges) {
363      const target = nodeIndex.get(id);
364      if (!target) { edgeScores.push({ id, score: 0 }); continue; }
365
366      let score = 0;
367      for (const tag of target.tags) {
368        if (nodeTagSet.has(tag)) {
369          score += idf.get(tag) ?? 0;
370        }
371      }
372      edgeScores.push({ id, score });
373    }
374
375    edgeScores.sort((a, b) => b.score - a.score);
376    const keep = new Set(edgeScores.slice(0, CONFIG.MAX_DEGREE).map(e => e.id));
377
378    // Also keep all 'duplicates' edges
379    for (const id of node.linked_canon_ids) {
380      if (node.link_types?.[id] === 'duplicates') keep.add(id);
381    }
382
383    // Filter
384    node.linked_canon_ids = node.linked_canon_ids.filter(id => keep.has(id));
385    if (node.link_types) {
386      for (const id of Object.keys(node.link_types)) {
387        if (!keep.has(id)) delete node.link_types[id];
388      }
389    }
390  }
391}