Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
at main 327 lines 10 kB view raw
/**
 * A/B/C/D Change Classifier
 *
 * Classifies each clause change using multiple signals.
 *
 * A = Trivial (formatting only)
 * B = Local semantic change
 * C = Contextual semantic shift
 * D = Uncertain
 */

import type { Clause, ClauseDiff } from './models/clause.js';
import { DiffType } from './models/clause.js';
import type { CanonicalNode } from './models/canonical.js';
import type { ClassificationSignals, ChangeClassification } from './models/classification.js';
import { ChangeClass } from './models/classification.js';
import { extractTerms } from './canonicalizer.js';
import { CONFIG } from './experiment-config.js';

/**
 * Classify a single clause diff.
 *
 * Decision order:
 *  1. ADDED / REMOVED   -> B (0.9) with every signal saturated.
 *  2. UNCHANGED         -> C (0.8) when both warm hashes are supplied and differ,
 *                          otherwise A (1.0).
 *  3. MODIFIED / MOVED  -> signals are computed from the two clauses and the
 *                          first matching rule below wins:
 *       - clause semhash identical                      -> A (0.95)
 *       - below both CLASS_A thresholds                 -> A (0.85), or B (0.75)
 *                                                          if the digit runs differ
 *       - below both CLASS_B thresholds                 -> B (0.8)
 *       - section path changed or canon impact > 2      -> C (0.7 / 0.9)
 *       - above CLASS_D_HIGH_CHANGE on either measure   -> B (0.65) if anchor
 *                                                          overlap exceeds
 *                                                          ANCHOR_MATCH_THRESHOLD,
 *                                                          else D (0.4)
 *       - anything else                                 -> B (0.6)
 *
 * @param diff - The clause diff to classify.
 * @param canonicalNodesBefore - Canonical nodes of the "before" graph.
 * @param canonicalNodesAfter - Canonical nodes of the "after" graph.
 * @param warmHashBefore - Optional warm hash for the clause before the change.
 * @param warmHashAfter - Optional warm hash for the clause after the change.
 * @returns The classification, its confidence and the signals it was based on.
 * @throws Error if a diff that is not ADDED/REMOVED/UNCHANGED lacks
 *         `clause_before` or `clause_after`.
 */
export function classifyChange(
  diff: ClauseDiff,
  canonicalNodesBefore: CanonicalNode[],
  canonicalNodesAfter: CanonicalNode[],
  warmHashBefore?: string,
  warmHashAfter?: string,
): ChangeClassification {
  // Pure additions and removals. These results intentionally carry only the
  // id of the side that exists.
  if (diff.diff_type === DiffType.ADDED) {
    const impact = countCanonImpact(undefined, diff.clause_after, canonicalNodesBefore, canonicalNodesAfter);
    return {
      change_class: ChangeClass.B,
      confidence: 0.9,
      signals: saturatedSignals(impact),
      clause_id_after: diff.clause_id_after,
    };
  }

  if (diff.diff_type === DiffType.REMOVED) {
    const impact = countCanonImpact(diff.clause_before, undefined, canonicalNodesBefore, canonicalNodesAfter);
    return {
      change_class: ChangeClass.B,
      confidence: 0.9,
      signals: saturatedSignals(impact),
      clause_id_before: diff.clause_id_before,
    };
  }

  if (diff.diff_type === DiffType.UNCHANGED) {
    // Check if warm hash changed (contextual shift even without content change).
    // Both hashes must be present; a missing hash is never treated as a shift.
    const warmShift = Boolean(warmHashBefore && warmHashAfter && warmHashBefore !== warmHashAfter);
    return warmShift
      ? pairedResult(diff, ChangeClass.C, 0.8, zeroSignals())
      : pairedResult(diff, ChangeClass.A, 1.0, zeroSignals());
  }

  // MODIFIED or MOVED — both sides are required to compute signals.
  const before = diff.clause_before;
  const after = diff.clause_after;
  if (!before || !after) {
    throw new Error(
      `classifyChange: diff of type ${String(diff.diff_type)} is missing clause_before or clause_after`,
    );
  }

  const normDiff = normalizedEditDistance(before.normalized_text, after.normalized_text);
  const semhashDelta = before.clause_semhash !== after.clause_semhash;
  const termDelta = termJaccardDistance(before.normalized_text, after.normalized_text);
  const sectionDelta = before.section_path.join('/') !== after.section_path.join('/');
  const canonImpact = countCanonImpact(before, after, canonicalNodesBefore, canonicalNodesAfter);

  const signals: ClassificationSignals = {
    norm_diff: normDiff,
    semhash_delta: semhashDelta,
    context_cold_delta: before.context_semhash_cold !== after.context_semhash_cold,
    term_ref_delta: termDelta,
    section_structure_delta: sectionDelta,
    canon_impact: canonImpact,
  };

  // Content identical, only moved.
  if (!semhashDelta) {
    return pairedResult(diff, ChangeClass.A, 0.95, signals);
  }

  // Near-trivial edit. A change in the numeric values is semantically
  // significant even when the edit distance is small, so it is promoted to B.
  if (normDiff < CONFIG.CLASS_A_NORM_DIFF && termDelta < CONFIG.CLASS_A_TERM_DELTA) {
    const numbersChanged = digitRuns(before.normalized_text) !== digitRuns(after.normalized_text);
    return numbersChanged
      ? pairedResult(diff, ChangeClass.B, 0.75, signals)
      : pairedResult(diff, ChangeClass.A, 0.85, signals);
  }

  // Local semantic change (small edit distance, moderate term change).
  if (normDiff < CONFIG.CLASS_B_NORM_DIFF && termDelta < CONFIG.CLASS_B_TERM_DELTA) {
    return pairedResult(diff, ChangeClass.B, 0.8, signals);
  }

  // Contextual shift: section structure changed OR high canonical impact.
  if (sectionDelta || canonImpact > 2) {
    return pairedResult(diff, ChangeClass.C, canonImpact > 2 ? 0.9 : 0.7, signals);
  }

  // High uncertainty — but check anchor overlap first.
  if (normDiff > CONFIG.CLASS_D_HIGH_CHANGE || termDelta > CONFIG.CLASS_D_HIGH_CHANGE) {
    // Anchor-based matching: detect "same concept, different wording".
    // Only this branch consults the overlap, so it is computed here rather
    // than up front for every modified clause.
    const anchorMatch = computeAnchorOverlap(before, after, canonicalNodesBefore, canonicalNodesAfter);
    // If anchors match, the concepts are the same despite heavy rewording → B not D.
    return anchorMatch > CONFIG.ANCHOR_MATCH_THRESHOLD
      ? pairedResult(diff, ChangeClass.B, 0.65, signals)
      : pairedResult(diff, ChangeClass.D, 0.4, signals);
  }

  return pairedResult(diff, ChangeClass.B, 0.6, signals);
}

/**
 * Classify all diffs in a change set.
 *
 * Warm hashes are looked up per diff by `clause_id_before` / `clause_id_after`;
 * a diff without the corresponding id, or a missing map, yields `undefined`
 * for that side.
 *
 * @returns One classification per input diff, in input order.
 */
export function classifyChanges(
  diffs: ClauseDiff[],
  canonicalNodesBefore: CanonicalNode[],
  canonicalNodesAfter: CanonicalNode[],
  warmHashesBefore?: Map<string, string>,
  warmHashesAfter?: Map<string, string>,
): ChangeClassification[] {
  return diffs.map(diff => {
    const warmBefore = diff.clause_id_before ? warmHashesBefore?.get(diff.clause_id_before) : undefined;
    const warmAfter = diff.clause_id_after ? warmHashesAfter?.get(diff.clause_id_after) : undefined;
    return classifyChange(diff, canonicalNodesBefore, canonicalNodesAfter, warmBefore, warmAfter);
  });
}

/** Build a result that carries both the before and the after clause id. */
function pairedResult(
  diff: ClauseDiff,
  changeClass: ChangeClassification['change_class'],
  confidence: number,
  signals: ClassificationSignals,
): ChangeClassification {
  return {
    change_class: changeClass,
    confidence,
    signals,
    clause_id_before: diff.clause_id_before,
    clause_id_after: diff.clause_id_after,
  };
}

/** Signals for a clause that was added or removed outright: everything changed. */
function saturatedSignals(canonImpact: number): ClassificationSignals {
  return {
    norm_diff: 1,
    semhash_delta: true,
    context_cold_delta: true,
    term_ref_delta: 1,
    section_structure_delta: true,
    canon_impact: canonImpact,
  };
}

/** Signals for a clause whose content did not change. A fresh object per call. */
function zeroSignals(): ClassificationSignals {
  return {
    norm_diff: 0,
    semhash_delta: false,
    context_cold_delta: false,
    term_ref_delta: 0,
    section_structure_delta: false,
    canon_impact: 0,
  };
}

/** All runs of decimal digits in `text`, in order, joined with commas. */
function digitRuns(text: string): string {
  return (text.match(/\d+/g) ?? []).join(',');
}

/** Canonical nodes whose `source_clause_ids` include `clauseId`. */
function nodesForClause(nodes: CanonicalNode[], clauseId: string): CanonicalNode[] {
  return nodes.filter(n => n.source_clause_ids.includes(clauseId));
}

/**
 * Normalized edit distance (Levenshtein / max length).
 * Returns 0 for identical, 1 for completely different.
 */
function normalizedEditDistance(a: string, b: string): number {
  if (a === b) return 0;
  if (a.length === 0 || b.length === 0) return 1;

  return levenshtein(a, b) / Math.max(a.length, b.length);
}

/**
 * Levenshtein distance using a single rolling row.
 *
 * The distance is symmetric, so the shorter string is used for the row to
 * keep the working array as small as possible. Comparison is per UTF-16
 * code unit.
 */
function levenshtein(a: string, b: string): number {
  const [outer, inner] = a.length >= b.length ? [a, b] : [b, a];
  const width = inner.length;
  const row: number[] = Array.from({ length: width + 1 }, (_, j) => j);

  for (let i = 1; i <= outer.length; i++) {
    // `diagonal` holds the previous row's value at column j - 1.
    let diagonal = row[0];
    row[0] = i;
    for (let j = 1; j <= width; j++) {
      const above = row[j];
      row[j] = outer[i - 1] === inner[j - 1]
        ? diagonal
        : 1 + Math.min(diagonal, above, row[j - 1]);
      diagonal = above;
    }
  }

  return row[width];
}

/**
 * Jaccard distance of extracted terms between two texts.
 * Returns 0 when neither text yields any term.
 */
function termJaccardDistance(textA: string, textB: string): number {
  const termsA = new Set(extractTerms(textA));
  const termsB = new Set(extractTerms(textB));

  if (termsA.size === 0 && termsB.size === 0) return 0;

  let shared = 0;
  for (const term of termsA) {
    if (termsB.has(term)) shared++;
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = termsA.size + termsB.size - shared;

  return 1 - shared / unionSize;
}

/**
 * Compute anchor overlap: what fraction of canon nodes from the 'before' clause
 * have matching anchors in the 'after' graph. High overlap → same concepts, just reworded.
 *
 * Returns 0 when the 'before' clause has no canonical nodes or the 'after'
 * clause contributes no (truthy) anchors.
 */
function computeAnchorOverlap(
  before: Clause,
  after: Clause,
  canonBefore: CanonicalNode[],
  canonAfter: CanonicalNode[],
): number {
  const beforeNodes = nodesForClause(canonBefore, before.clause_id);
  if (beforeNodes.length === 0) return 0;

  // Collect all anchors from after nodes linked to the after clause.
  const afterAnchors = new Set(
    nodesForClause(canonAfter, after.clause_id)
      .map(n => n.canon_anchor)
      .filter(Boolean),
  );
  if (afterAnchors.size === 0) return 0;

  const matched = beforeNodes.filter(n => n.canon_anchor && afterAnchors.has(n.canon_anchor)).length;
  return matched / beforeNodes.length;
}

/**
 * Count canonical nodes affected by a change: nodes in the 'before' graph
 * sourced from the 'before' clause plus nodes in the 'after' graph sourced
 * from the 'after' clause. A missing clause contributes 0.
 */
function countCanonImpact(
  before: Clause | undefined,
  after: Clause | undefined,
  canonBefore: CanonicalNode[],
  canonAfter: CanonicalNode[],
): number {
  const beforeCount = before ? nodesForClause(canonBefore, before.clause_id).length : 0;
  const afterCount = after ? nodesForClause(canonAfter, after.clause_id).length : 0;
  return beforeCount + afterCount;
}