Reference implementation for the Phoenix Architecture. Work in progress.
aicoding.leaflet.pub/
ai
coding
crazy
1/**
2 * A/B/C/D Change Classifier
3 *
4 * Classifies each clause change using multiple signals.
5 *
6 * A = Trivial (formatting only)
7 * B = Local semantic change
8 * C = Contextual semantic shift
9 * D = Uncertain
10 */
11
12import type { Clause, ClauseDiff } from './models/clause.js';
13import { DiffType } from './models/clause.js';
14import type { CanonicalNode } from './models/canonical.js';
15import type { ClassificationSignals, ChangeClassification } from './models/classification.js';
16import { ChangeClass } from './models/classification.js';
17import { extractTerms } from './canonicalizer.js';
18import { CONFIG } from './experiment-config.js';
19
/**
 * Classify a single clause diff into change class A, B, C or D.
 *
 * Decision order (first match wins):
 *  1. ADDED / REMOVED: class B, confidence 0.9. Every signal is saturated
 *     except `canon_impact`, which is counted from the canonical graphs.
 *  2. UNCHANGED: class C (0.8) when both warm hashes are present and differ,
 *     otherwise class A (1.0). All signals are zero/false in both cases.
 *  3. Any other diff type (MODIFIED or MOVED): signals are computed from the
 *     two clauses and compared against the CONFIG thresholds:
 *       - identical `clause_semhash`                 -> A (0.95)
 *       - below both class-A thresholds              -> A (0.85), or B (0.75)
 *                                                       when the digit runs differ
 *       - below both class-B thresholds              -> B (0.8)
 *       - section path changed or canon_impact > 2   -> C (0.7, or 0.9 when canon_impact > 2)
 *       - above CLASS_D_HIGH_CHANGE                  -> D (0.4), or B (0.65) when the
 *                                                       anchor overlap exceeds
 *                                                       ANCHOR_MATCH_THRESHOLD
 *       - otherwise                                  -> B (0.6)
 *
 * @param diff - The clause diff to classify.
 * @param canonicalNodesBefore - Canonical nodes of the "before" graph.
 * @param canonicalNodesAfter - Canonical nodes of the "after" graph.
 * @param warmHashBefore - Optional warm hash of the clause before the change.
 * @param warmHashAfter - Optional warm hash of the clause after the change.
 * @returns The classification with its confidence and the signals used.
 * @throws TypeError when a diff that is not ADDED, REMOVED or UNCHANGED lacks
 *   `clause_before` or `clause_after`.
 */
export function classifyChange(
  diff: ClauseDiff,
  canonicalNodesBefore: CanonicalNode[],
  canonicalNodesAfter: CanonicalNode[],
  warmHashBefore?: string,
  warmHashAfter?: string,
): ChangeClassification {
  // Signals reported for a clause that exists on one side only.
  const oneSidedSignals = (canonImpact: number): ClassificationSignals => ({
    norm_diff: 1,
    semhash_delta: true,
    context_cold_delta: true,
    term_ref_delta: 1,
    section_structure_delta: true,
    canon_impact: canonImpact,
  });

  // Result for a diff that has both a "before" and an "after" side.
  const twoSided = (
    changeClass: ChangeClassification['change_class'],
    confidence: number,
    signals: ClassificationSignals,
  ): ChangeClassification => ({
    change_class: changeClass,
    confidence,
    signals,
    clause_id_before: diff.clause_id_before,
    clause_id_after: diff.clause_id_after,
  });

  // Pure additions and removals. The result deliberately carries only the id
  // of the side that exists.
  if (diff.diff_type === DiffType.ADDED) {
    const canonImpact = countCanonImpact(undefined, diff.clause_after, canonicalNodesBefore, canonicalNodesAfter);
    return {
      change_class: ChangeClass.B,
      confidence: 0.9,
      signals: oneSidedSignals(canonImpact),
      clause_id_after: diff.clause_id_after,
    };
  }

  if (diff.diff_type === DiffType.REMOVED) {
    const canonImpact = countCanonImpact(diff.clause_before, undefined, canonicalNodesBefore, canonicalNodesAfter);
    return {
      change_class: ChangeClass.B,
      confidence: 0.9,
      signals: oneSidedSignals(canonImpact),
      clause_id_before: diff.clause_id_before,
    };
  }

  if (diff.diff_type === DiffType.UNCHANGED) {
    const noSignals: ClassificationSignals = {
      norm_diff: 0,
      semhash_delta: false,
      context_cold_delta: false,
      term_ref_delta: 0,
      section_structure_delta: false,
      canon_impact: 0,
    };
    // A differing warm hash means the context shifted even though the clause
    // content itself did not change.
    const contextShifted = Boolean(warmHashBefore && warmHashAfter && warmHashBefore !== warmHashAfter);
    return contextShifted
      ? twoSided(ChangeClass.C, 0.8, noSignals)
      : twoSided(ChangeClass.A, 1.0, noSignals);
  }

  // MODIFIED or MOVED — both sides are required to compute signals.
  const before = diff.clause_before;
  const after = diff.clause_after;
  if (before == null || after == null) {
    throw new TypeError(
      `classifyChange: diff of type ${String(diff.diff_type)} requires both clause_before and clause_after`,
    );
  }

  const normDiff = normalizedEditDistance(before.normalized_text, after.normalized_text);
  const semhashDelta = before.clause_semhash !== after.clause_semhash;
  const contextColdDelta = before.context_semhash_cold !== after.context_semhash_cold;
  const termDelta = termJaccardDistance(before.normalized_text, after.normalized_text);
  const sectionDelta = before.section_path.join('/') !== after.section_path.join('/');
  const canonImpact = countCanonImpact(before, after, canonicalNodesBefore, canonicalNodesAfter);

  const signals: ClassificationSignals = {
    norm_diff: normDiff,
    semhash_delta: semhashDelta,
    context_cold_delta: contextColdDelta,
    term_ref_delta: termDelta,
    section_structure_delta: sectionDelta,
    canon_impact: canonImpact,
  };

  // Content identical, only moved.
  if (!semhashDelta) {
    return twoSided(ChangeClass.A, 0.95, signals);
  }

  // Near-identical wording.
  if (normDiff < CONFIG.CLASS_A_NORM_DIFF && termDelta < CONFIG.CLASS_A_TERM_DELTA) {
    // A changed numeric value is semantically significant even when the edit
    // distance is tiny, so compare the sequences of digit runs.
    const digitRuns = (text: string): string => (text.match(/\d+/g) ?? []).join(',');
    const numbersChanged = digitRuns(before.normalized_text) !== digitRuns(after.normalized_text);
    return numbersChanged
      ? twoSided(ChangeClass.B, 0.75, signals)
      : twoSided(ChangeClass.A, 0.85, signals);
  }

  // Local semantic change (small edit distance, moderate term change).
  if (normDiff < CONFIG.CLASS_B_NORM_DIFF && termDelta < CONFIG.CLASS_B_TERM_DELTA) {
    return twoSided(ChangeClass.B, 0.8, signals);
  }

  // Contextual shift: section structure changed OR high canonical impact.
  if (sectionDelta || canonImpact > 2) {
    return twoSided(ChangeClass.C, canonImpact > 2 ? 0.9 : 0.7, signals);
  }

  // High uncertainty — but check anchor overlap first.
  if (normDiff > CONFIG.CLASS_D_HIGH_CHANGE || termDelta > CONFIG.CLASS_D_HIGH_CHANGE) {
    // Anchor overlap scans both canonical graphs, so it is computed only on
    // the one path that consumes it. Matching anchors mean the concepts are
    // the same despite heavy rewording → B, not D.
    const anchorMatch = computeAnchorOverlap(before, after, canonicalNodesBefore, canonicalNodesAfter);
    return anchorMatch > CONFIG.ANCHOR_MATCH_THRESHOLD
      ? twoSided(ChangeClass.B, 0.65, signals)
      : twoSided(ChangeClass.D, 0.4, signals);
  }

  return twoSided(ChangeClass.B, 0.6, signals);
}
210
/**
 * Classify every diff of a change set, preserving input order.
 *
 * For each diff, the warm hash of each side is looked up by clause id in the
 * corresponding map and forwarded to classifyChange together with the two
 * canonical graphs. A missing id or a missing map yields `undefined`.
 *
 * @param diffs - Diffs to classify.
 * @param canonicalNodesBefore - Canonical nodes of the "before" graph.
 * @param canonicalNodesAfter - Canonical nodes of the "after" graph.
 * @param warmHashesBefore - Optional warm hashes keyed by "before" clause id.
 * @param warmHashesAfter - Optional warm hashes keyed by "after" clause id.
 * @returns One classification per input diff.
 */
export function classifyChanges(
  diffs: ClauseDiff[],
  canonicalNodesBefore: CanonicalNode[],
  canonicalNodesAfter: CanonicalNode[],
  warmHashesBefore?: Map<string, string>,
  warmHashesAfter?: Map<string, string>,
): ChangeClassification[] {
  // Resolve a warm hash; a falsy clause id short-circuits to "no hash".
  const warmHashFor = (
    hashes: Map<string, string> | undefined,
    clauseId: string | null | undefined,
  ): string | undefined => {
    if (!clauseId) {
      return undefined;
    }
    return hashes?.get(clauseId);
  };

  return diffs.map((diff) => {
    const hashBefore = warmHashFor(warmHashesBefore, diff.clause_id_before);
    const hashAfter = warmHashFor(warmHashesAfter, diff.clause_id_after);
    return classifyChange(diff, canonicalNodesBefore, canonicalNodesAfter, hashBefore, hashAfter);
  });
}
227
/**
 * Edit distance scaled into [0, 1]: the Levenshtein distance divided by the
 * length of the longer input.
 *
 * @returns 0 when the strings are equal, 1 when exactly one of them is empty,
 *   otherwise `levenshtein(a, b) / max(a.length, b.length)`.
 */
function normalizedEditDistance(a: string, b: string): number {
  if (a === b) {
    return 0;
  }

  const lenA = a.length;
  const lenB = b.length;
  // The strings differ, so an empty side means nothing is shared.
  if (lenA === 0 || lenB === 0) {
    return 1;
  }

  const longest = lenA >= lenB ? lenA : lenB;
  return levenshtein(a, b) / longest;
}
240
/**
 * Levenshtein distance between two strings, using a single rolling row so
 * memory is proportional to `b.length`.
 *
 * Strings are compared one UTF-16 code unit at a time.
 *
 * @returns The minimum number of single-unit insertions, deletions and
 *   substitutions that turn `a` into `b`.
 */
function levenshtein(a: string, b: string): number {
  const rows = a.length;
  const cols = b.length;

  // row[j] holds the distance between the consumed prefix of `a` and the
  // first j units of `b`; it starts as the distances from the empty prefix.
  const row: number[] = [];
  for (let j = 0; j <= cols; j++) {
    row.push(j);
  }

  for (let i = 0; i < rows; i++) {
    const unitA = a.charCodeAt(i);
    // Value of the cell up-and-left of the one being filled.
    let diagonal = row[0];
    row[0] = i + 1;

    for (let j = 0; j < cols; j++) {
      const above = row[j + 1];
      row[j + 1] =
        unitA === b.charCodeAt(j)
          ? diagonal
          : 1 + Math.min(diagonal, above, row[j]);
      diagonal = above;
    }
  }

  return row[cols];
}
265
/**
 * Jaccard distance between the term sets extracted from two texts:
 * `1 - |A ∩ B| / |A ∪ B|`, where A and B are the distinct terms returned by
 * extractTerms for each text.
 *
 * @returns 0 when both term sets are empty or equal, 1 when they share no term.
 */
function termJaccardDistance(textA: string, textB: string): number {
  const left = new Set(extractTerms(textA));
  const right = new Set(extractTerms(textB));

  if (left.size === 0 && right.size === 0) {
    return 0;
  }

  let shared = 0;
  for (const term of left) {
    if (right.has(term)) {
      shared += 1;
    }
  }

  // |A ∪ B| by inclusion–exclusion; avoids materialising the union set.
  const unionSize = left.size + right.size - shared;
  return 1 - shared / unionSize;
}
280
/**
 * Anchor overlap between the two sides of a change.
 *
 * Takes the canonical nodes of the 'before' graph that list the 'before'
 * clause as a source, and returns the fraction of them whose `canon_anchor`
 * also appears on a node of the 'after' graph sourced from the 'after'
 * clause. High overlap → same concepts, just reworded.
 *
 * @returns A value in [0, 1]; 0 when the 'before' clause has no linked nodes
 *   or the 'after' clause contributes no anchors.
 */
function computeAnchorOverlap(
  before: Clause,
  after: Clause,
  canonBefore: CanonicalNode[],
  canonAfter: CanonicalNode[],
): number {
  const linkedBefore = canonBefore.filter((node) => node.source_clause_ids.includes(before.clause_id));
  if (linkedBefore.length === 0) {
    return 0;
  }

  // Anchors carried by 'after' nodes linked to the 'after' clause; nodes
  // without an anchor contribute nothing.
  const anchorsAfter = new Set<CanonicalNode['canon_anchor']>();
  for (const node of canonAfter) {
    if (node.source_clause_ids.includes(after.clause_id) && node.canon_anchor) {
      anchorsAfter.add(node.canon_anchor);
    }
  }
  if (anchorsAfter.size === 0) {
    return 0;
  }

  const hits = linkedBefore.reduce(
    (count, node) => (node.canon_anchor && anchorsAfter.has(node.canon_anchor) ? count + 1 : count),
    0,
  );

  // Nodes without an anchor stay in the denominator.
  return hits / linkedBefore.length;
}
307
/**
 * Count the canonical nodes touched by a change.
 *
 * The result is the number of 'before' nodes that list the 'before' clause as
 * a source plus the number of 'after' nodes that list the 'after' clause as a
 * source. A side whose clause is absent contributes 0.
 */
function countCanonImpact(
  before: Clause | undefined,
  after: Clause | undefined,
  canonBefore: CanonicalNode[],
  canonAfter: CanonicalNode[],
): number {
  // Number of nodes in `nodes` sourced from `clause`.
  const linkedCount = (nodes: CanonicalNode[], clause: Clause | undefined): number => {
    if (!clause) {
      return 0;
    }
    let total = 0;
    for (const node of nodes) {
      if (node.source_clause_ids.includes(clause.clause_id)) {
        total += 1;
      }
    }
    return total;
  };

  return linkedCount(canonBefore, before) + linkedCount(canonAfter, after);
}