// Reference implementation for the Phoenix Architecture. Work in progress.
// See: https://aicoding.leaflet.pub/
1/**
2 * Canonicalization Engine v2
3 *
4 * Phase 1: Extraction — sentence-level, scoring rubric, CONTEXT type.
5 * Produces CandidateNode[] with confidence scores and coverage.
6 *
7 * Also exports the legacy extractCanonicalNodes() for backward compat,
8 * which runs extraction + resolution in one call.
9 */
10
11import type { Clause } from './models/clause.js';
12import type { CanonicalNode, CandidateNode, ExtractionCoverage } from './models/canonical.js';
13import { CanonicalType } from './models/canonical.js';
14import { sha256 } from './semhash.js';
15import { normalizeText } from './normalizer.js';
16import { segmentSentences } from './sentence-segmenter.js';
17import { resolveGraph } from './resolution.js';
18import { CONFIG } from './experiment-config.js';
19
20// ─── Domain term whitelist (short tokens to keep) ────────────────────────────
21
22const DOMAIN_TERMS = new Set([
23 'id', 'ui', 'ux', 'api', 'jwt', 'sso', 'otp', 'ip', 'db', 'tls', 'ssl',
24 'rsa', 'aes', 'rs256', 'hs256', 'oidc', 'oauth', '2fa', 'mfa', 'url',
25 'uri', 'http', 'https', 'sql', 'css', 'html', 'xml', 'json', 'yaml',
26 'csv', 'tcp', 'udp', 'dns', 'cdn', 'ci', 'cd', 'io', 'os', 'vm',
27]);
28
29// ─── Scoring rubric for type classification ──────────────────────────────────
30
/**
 * Per-type score accumulator used by scoreSentence.
 * One numeric slot per CanonicalType; keys are computed from the enum so the
 * shape stays in sync with it.
 */
interface TypeScores {
  [CanonicalType.REQUIREMENT]: number;
  [CanonicalType.CONSTRAINT]: number;
  [CanonicalType.INVARIANT]: number;
  [CanonicalType.DEFINITION]: number;
  [CanonicalType.CONTEXT]: number;
}
38
/**
 * Returns a fresh score sheet with every canonical type zeroed.
 *
 * NOTE(review): scoreSentence tie-breaks the winner via Object.entries order
 * (stable sort) — for string enum keys that follows the insertion order of
 * this literal, so keep the key order if you edit it.
 */
function emptyScores(): TypeScores {
  return {
    [CanonicalType.REQUIREMENT]: 0,
    [CanonicalType.CONSTRAINT]: 0,
    [CanonicalType.INVARIANT]: 0,
    [CanonicalType.DEFINITION]: 0,
    [CanonicalType.CONTEXT]: 0,
  };
}
48
/**
 * Score a sentence across all canonical types; the highest score wins.
 *
 * Each regex rule adds a CONFIG-defined weight to one type's score. Rules are
 * additive, so a sentence can accumulate credit for several types before the
 * winner is picked.
 *
 * @param text            Raw sentence to classify.
 * @param headingContext  Type hinted by the enclosing section heading, or
 *                        null when no heading matched (see getHeadingContext).
 * @returns Winning type plus a confidence clamped to
 *          [CONFIG.MIN_CONFIDENCE, CONFIG.MAX_CONFIDENCE], derived from the
 *          margin between the top two scores.
 */
function scoreSentence(text: string, headingContext: CanonicalType | null): { type: CanonicalType; confidence: number } {
  const scores = emptyScores();
  // Lowercased copy for the helper predicates; the rules below instead run
  // case-insensitive (/i) regexes against the original text.
  const lower = text.toLowerCase();

  // ── Constraint signals ──
  // Negated modals ("must not", "forbidden", …) are strong constraint cues.
  if (/\b(?:must not|shall not|may not|cannot|can't|disallowed|forbidden|prohibited)\b/i.test(text)) {
    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_NEGATION_WEIGHT;
  }
  // Bounding language ("maximum", "at most", …).
  if (/\b(?:limited to|maximum|minimum|at most|at least|no more than|no fewer than|up to|ceiling|floor)\b/i.test(text)) {
    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_LIMIT_WEIGHT;
  }
  // Numeric bounds: "5 per minute", "≤ 100", "between 1 and 10"
  if (/\b\d+\s*(?:per|\/)\s*\w+\b/i.test(text) || /[≤≥<>]\s*\d+/.test(text)) {
    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_NUMERIC_WEIGHT;
  }

  // ── Invariant signals ──
  if (/\b(?:always|never|at all times|regardless|invariant|guaranteed|must remain|must always|must never)\b/i.test(text)) {
    scores[CanonicalType.INVARIANT] += CONFIG.INVARIANT_SIGNAL_WEIGHT;
  }

  // ── Requirement signals ──
  // Plain "must"/"shall" counts as a requirement only when it is not part of
  // a negation/invariant phrase already credited above.
  if (/\b(?:must|shall)\b/i.test(text) && !/\b(?:must not|shall not|must always|must never|must remain)\b/i.test(text)) {
    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_MODAL_WEIGHT;
  }
  if (/\b(?:required|requires?|needs? to|has to|will)\b/i.test(text)) {
    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_KEYWORD_WEIGHT;
  }
  if (/\b(?:support|provide|implement|enable|allow|accept|return|create|delete|update|send|receive|handle|manage|track|store|validate|generate)\b/i.test(text)) {
    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_VERB_WEIGHT;
  }

  // ── Definition signals ──
  // Length cap keeps long requirement-like sentences from scoring as
  // definitions just because they contain "is a".
  if (/\b(?:is defined as|means|refers to|is a|is an)\b/i.test(text) && text.length < CONFIG.DEFINITION_MAX_LENGTH) {
    scores[CanonicalType.DEFINITION] += CONFIG.DEFINITION_EXPLICIT_WEIGHT;
  }
  // Colon pattern "Term: definition text" but not enumerations
  if (/^[A-Z][a-zA-Z\s]{2,30}:\s+[A-Z]/.test(text) && !/[:,]\s*$/.test(text)) {
    scores[CanonicalType.DEFINITION] += CONFIG.DEFINITION_COLON_WEIGHT;
  }

  // ── Context signals (no actionable keywords) ──
  if (!hasAnyModal(lower) && !hasAnyKeyword(lower)) {
    scores[CanonicalType.CONTEXT] += CONFIG.CONTEXT_NO_MODAL_WEIGHT;
  }
  // Short sentence without modals. NOTE(review): the original comment said
  // "without verb-like keywords" but only hasAnyModal is checked here —
  // confirm whether hasAnyKeyword was meant to be included as well.
  if (text.split(/\s+/).length < 8 && !hasAnyModal(lower)) {
    scores[CanonicalType.CONTEXT] += CONFIG.CONTEXT_SHORT_WEIGHT;
  }

  // ── Heading context bonus ──
  if (headingContext) {
    scores[headingContext] += CONFIG.HEADING_CONTEXT_BONUS;
  }

  // ── Also give constraint "must" credit since "must" appears in constraints too ──
  if (/\b(?:must|shall)\b/i.test(text)) {
    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_MUST_BONUS;
  }

  // Pick winner. sort() is stable (ES2019+), so ties resolve by the
  // Object.entries order of the score object (see emptyScores).
  const entries = Object.entries(scores) as [CanonicalType, number][];
  entries.sort((a, b) => b[1] - a[1]);
  const [winType, winScore] = entries[0];
  const runnerUp = entries[1][1];

  // If nothing scored above 0, it's CONTEXT
  if (winScore === 0) {
    return { type: CanonicalType.CONTEXT, confidence: CONFIG.MIN_CONFIDENCE };
  }

  // Confidence = margin over the runner-up normalized by the winning score,
  // clamped to the configured [MIN_CONFIDENCE, MAX_CONFIDENCE] band.
  const confidence = Math.max(CONFIG.MIN_CONFIDENCE, Math.min(CONFIG.MAX_CONFIDENCE, (winScore - runnerUp) / Math.max(winScore, 1)));
  return { type: winType, confidence };
}
124
125function hasAnyModal(lower: string): boolean {
126 return /\b(?:must|shall|should|will|required|requires?|needs? to|has to|cannot|forbidden|prohibited)\b/.test(lower);
127}
128
129function hasAnyKeyword(lower: string): boolean {
130 return /\b(?:support|provide|implement|enable|allow|accept|return|create|delete|update|send|receive|handle|manage|track|store|validate|generate|defined|means|refers)\b/.test(lower);
131}
132
133// ─── Heading context (same as v1) ────────────────────────────────────────────
134
135const HEADING_CONTEXT: [RegExp, CanonicalType][] = [
136 [/\b(?:constraint|security|limit|restrict)/i, CanonicalType.CONSTRAINT],
137 [/\b(?:requirement|feature|capability)/i, CanonicalType.REQUIREMENT],
138 [/\b(?:definition|glossary|term)/i, CanonicalType.DEFINITION],
139 [/\b(?:invariant|guarantee)/i, CanonicalType.INVARIANT],
140];
141
142function getHeadingContext(sectionPath: string[]): CanonicalType | null {
143 for (let i = sectionPath.length - 1; i >= 0; i--) {
144 for (const [pattern, type] of HEADING_CONTEXT) {
145 if (pattern.test(sectionPath[i])) return type;
146 }
147 }
148 return null;
149}
150
151// ─── Phase 1: Extract candidates ─────────────────────────────────────────────
152
/** Output of Phase 1 extraction: candidates plus per-clause coverage stats. */
export interface ExtractionResult {
  // All candidate nodes extracted across the input clauses.
  candidates: CandidateNode[];
  // One coverage record per input clause, in input order.
  coverage: ExtractionCoverage[];
}
157
158/**
159 * Phase 1: Extract candidate nodes from clauses using sentence segmentation
160 * and scoring rubric.
161 */
162export function extractCandidates(clauses: Clause[]): ExtractionResult {
163 const allCandidates: CandidateNode[] = [];
164 const allCoverage: ExtractionCoverage[] = [];
165
166 for (const clause of clauses) {
167 const { candidates, coverage } = extractFromClause(clause);
168 allCandidates.push(...candidates);
169 allCoverage.push(coverage);
170 }
171
172 return { candidates: allCandidates, coverage: allCoverage };
173}
174
/**
 * Extract candidates from a single clause.
 *
 * Segments the clause text into sentences, classifies each usable sentence
 * with the scoring rubric, and records skipped sentences in the coverage
 * summary.
 *
 * @param clause  Source clause; its section_path supplies the heading hint.
 * @returns One candidate per usable sentence plus a coverage summary.
 */
function extractFromClause(clause: Clause): { candidates: CandidateNode[]; coverage: ExtractionCoverage } {
  const sentences = segmentSentences(clause.raw_text);
  const headingContext = getHeadingContext(clause.section_path);
  const candidates: CandidateNode[] = [];
  let extractedCount = 0; // sentences classified as a non-CONTEXT type
  let contextCount = 0;   // sentences classified as CONTEXT
  const uncovered: ExtractionCoverage['uncovered'] = [];

  for (const sentence of sentences) {
    const content = sentence.text.trim();
    // Skip empty or too-short sentences; they still count toward coverage.
    if (!content || content.length < CONFIG.MIN_EXTRACTION_LENGTH) {
      uncovered.push({ text: content, reason: 'too_short' });
      continue;
    }

    const normalizedStatement = normalizeText(content);
    // NOTE(review): a sentence that normalizes to empty is reported with the
    // same 'too_short' reason as the length check above — confirm whether a
    // distinct reason was intended.
    if (!normalizedStatement) {
      uncovered.push({ text: content, reason: 'too_short' });
      continue;
    }

    const { type, confidence } = scoreSentence(content, headingContext);
    const tags = extractTerms(normalizedStatement);

    // Deterministic id over (type, statement, source clause); '\x00' serves
    // as an unambiguous field separator.
    const candidateId = sha256([type, normalizedStatement, clause.clause_id].join('\x00'));

    candidates.push({
      candidate_id: candidateId,
      type,
      statement: normalizedStatement,
      confidence,
      source_clause_ids: [clause.clause_id],
      tags,
      sentence_index: sentence.index,
      extraction_method: 'rule',
    });

    if (type === CanonicalType.CONTEXT) {
      contextCount++;
    } else {
      extractedCount++;
    }
  }

  const total = sentences.length;
  const coverage: ExtractionCoverage = {
    clause_id: clause.clause_id,
    total_sentences: total,
    extracted_sentences: extractedCount,
    context_sentences: contextCount,
    // Share of sentences that produced a candidate (CONTEXT included).
    coverage_pct: total > 0 ? ((extractedCount + contextCount) / total) * 100 : 0,
    uncovered,
  };

  return { candidates, coverage };
}
231
232// ─── Term extraction (v2: acronym whitelist + hyphenated compounds) ───────────
233
234const STOP_WORDS = new Set([
235 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
236 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
237 'should', 'may', 'might', 'shall', 'can', 'must', 'need', 'to', 'of',
238 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through',
239 'and', 'or', 'but', 'not', 'no', 'if', 'then', 'else', 'when', 'where',
240 'that', 'this', 'these', 'those', 'it', 'its', 'all', 'each', 'every',
241 'any', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
242]);
243
244/**
245 * Extract key terms from normalized text.
246 * Preserves domain acronyms and hyphenated compounds.
247 */
248export function extractTerms(text: string): string[] {
249 const lower = text.toLowerCase();
250
251 // Extract hyphenated compounds first (e.g., rate-limit, in-progress)
252 const hyphenated = lower.match(/\b[a-z0-9]+-[a-z0-9]+(?:-[a-z0-9]+)*/g) || [];
253
254 // Split remaining into words
255 const words = lower
256 .split(/\s+/)
257 .map(w => w.replace(/[^a-z0-9-]/g, ''))
258 .filter(Boolean);
259
260 const terms = new Set<string>();
261
262 // Add hyphenated compounds
263 for (const h of hyphenated) {
264 if (h.length >= CONFIG.MIN_TERM_LENGTH) terms.add(h);
265 }
266
267 // Add individual words
268 for (const w of words) {
269 // Skip stop words
270 if (STOP_WORDS.has(w)) continue;
271 // Keep domain terms regardless of length
272 if (DOMAIN_TERMS.has(w)) {
273 terms.add(w);
274 continue;
275 }
276 // Keep words > 2 chars
277 if (w.length > CONFIG.MIN_WORD_LENGTH && !w.includes('-')) {
278 terms.add(w);
279 }
280 }
281
282 return [...terms];
283}
284
285// ─── Legacy API: extract + resolve in one call ───────────────────────────────
286
287/**
288 * Extract canonical nodes from clauses (v2: sentence-level + resolution).
289 * Backward-compatible API — returns CanonicalNode[].
290 */
291export function extractCanonicalNodes(clauses: Clause[]): CanonicalNode[] {
292 const { candidates } = extractCandidates(clauses);
293 return resolveGraph(candidates, clauses);
294}