Reference implementation for the Phoenix Architecture (work in progress). See aicoding.leaflet.pub/
ai coding crazy
at main 294 lines 11 kB view raw
1/** 2 * Canonicalization Engine v2 3 * 4 * Phase 1: Extraction — sentence-level, scoring rubric, CONTEXT type. 5 * Produces CandidateNode[] with confidence scores and coverage. 6 * 7 * Also exports the legacy extractCanonicalNodes() for backward compat, 8 * which runs extraction + resolution in one call. 9 */ 10 11import type { Clause } from './models/clause.js'; 12import type { CanonicalNode, CandidateNode, ExtractionCoverage } from './models/canonical.js'; 13import { CanonicalType } from './models/canonical.js'; 14import { sha256 } from './semhash.js'; 15import { normalizeText } from './normalizer.js'; 16import { segmentSentences } from './sentence-segmenter.js'; 17import { resolveGraph } from './resolution.js'; 18import { CONFIG } from './experiment-config.js'; 19 20// ─── Domain term whitelist (short tokens to keep) ──────────────────────────── 21 22const DOMAIN_TERMS = new Set([ 23 'id', 'ui', 'ux', 'api', 'jwt', 'sso', 'otp', 'ip', 'db', 'tls', 'ssl', 24 'rsa', 'aes', 'rs256', 'hs256', 'oidc', 'oauth', '2fa', 'mfa', 'url', 25 'uri', 'http', 'https', 'sql', 'css', 'html', 'xml', 'json', 'yaml', 26 'csv', 'tcp', 'udp', 'dns', 'cdn', 'ci', 'cd', 'io', 'os', 'vm', 27]); 28 29// ─── Scoring rubric for type classification ────────────────────────────────── 30 31interface TypeScores { 32 [CanonicalType.REQUIREMENT]: number; 33 [CanonicalType.CONSTRAINT]: number; 34 [CanonicalType.INVARIANT]: number; 35 [CanonicalType.DEFINITION]: number; 36 [CanonicalType.CONTEXT]: number; 37} 38 39function emptyScores(): TypeScores { 40 return { 41 [CanonicalType.REQUIREMENT]: 0, 42 [CanonicalType.CONSTRAINT]: 0, 43 [CanonicalType.INVARIANT]: 0, 44 [CanonicalType.DEFINITION]: 0, 45 [CanonicalType.CONTEXT]: 0, 46 }; 47} 48 49/** Score a sentence across all types; highest score wins */ 50function scoreSentence(text: string, headingContext: CanonicalType | null): { type: CanonicalType; confidence: number } { 51 const scores = emptyScores(); 52 const lower = text.toLowerCase(); 53 54 
// ── Constraint signals ── 55 if (/\b(?:must not|shall not|may not|cannot|can't|disallowed|forbidden|prohibited)\b/i.test(text)) { 56 scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_NEGATION_WEIGHT; 57 } 58 if (/\b(?:limited to|maximum|minimum|at most|at least|no more than|no fewer than|up to|ceiling|floor)\b/i.test(text)) { 59 scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_LIMIT_WEIGHT; 60 } 61 // Numeric bounds: "5 per minute", "≤ 100", "between 1 and 10" 62 if (/\b\d+\s*(?:per|\/)\s*\w+\b/i.test(text) || /[≤≥<>]\s*\d+/.test(text)) { 63 scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_NUMERIC_WEIGHT; 64 } 65 66 // ── Invariant signals ── 67 if (/\b(?:always|never|at all times|regardless|invariant|guaranteed|must remain|must always|must never)\b/i.test(text)) { 68 scores[CanonicalType.INVARIANT] += CONFIG.INVARIANT_SIGNAL_WEIGHT; 69 } 70 71 // ── Requirement signals ── 72 if (/\b(?:must|shall)\b/i.test(text) && !/\b(?:must not|shall not|must always|must never|must remain)\b/i.test(text)) { 73 scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_MODAL_WEIGHT; 74 } 75 if (/\b(?:required|requires?|needs? 
to|has to|will)\b/i.test(text)) { 76 scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_KEYWORD_WEIGHT; 77 } 78 if (/\b(?:support|provide|implement|enable|allow|accept|return|create|delete|update|send|receive|handle|manage|track|store|validate|generate)\b/i.test(text)) { 79 scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_VERB_WEIGHT; 80 } 81 82 // ── Definition signals ── 83 if (/\b(?:is defined as|means|refers to|is a|is an)\b/i.test(text) && text.length < CONFIG.DEFINITION_MAX_LENGTH) { 84 scores[CanonicalType.DEFINITION] += CONFIG.DEFINITION_EXPLICIT_WEIGHT; 85 } 86 // Colon pattern "Term: definition text" but not enumerations 87 if (/^[A-Z][a-zA-Z\s]{2,30}:\s+[A-Z]/.test(text) && !/[:,]\s*$/.test(text)) { 88 scores[CanonicalType.DEFINITION] += CONFIG.DEFINITION_COLON_WEIGHT; 89 } 90 91 // ── Context signals (no actionable keywords) ── 92 if (!hasAnyModal(lower) && !hasAnyKeyword(lower)) { 93 scores[CanonicalType.CONTEXT] += CONFIG.CONTEXT_NO_MODAL_WEIGHT; 94 } 95 // Short sentence without verb-like keywords 96 if (text.split(/\s+/).length < 8 && !hasAnyModal(lower)) { 97 scores[CanonicalType.CONTEXT] += CONFIG.CONTEXT_SHORT_WEIGHT; 98 } 99 100 // ── Heading context bonus ── 101 if (headingContext) { 102 scores[headingContext] += CONFIG.HEADING_CONTEXT_BONUS; 103 } 104 105 // ── Also give constraint "must" credit since "must" appears in constraints too ── 106 if (/\b(?:must|shall)\b/i.test(text)) { 107 scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_MUST_BONUS; 108 } 109 110 // Pick winner 111 const entries = Object.entries(scores) as [CanonicalType, number][]; 112 entries.sort((a, b) => b[1] - a[1]); 113 const [winType, winScore] = entries[0]; 114 const runnerUp = entries[1][1]; 115 116 // If nothing scored above 0, it's CONTEXT 117 if (winScore === 0) { 118 return { type: CanonicalType.CONTEXT, confidence: CONFIG.MIN_CONFIDENCE }; 119 } 120 121 const confidence = Math.max(CONFIG.MIN_CONFIDENCE, Math.min(CONFIG.MAX_CONFIDENCE, (winScore - 
runnerUp) / Math.max(winScore, 1))); 122 return { type: winType, confidence }; 123} 124 125function hasAnyModal(lower: string): boolean { 126 return /\b(?:must|shall|should|will|required|requires?|needs? to|has to|cannot|forbidden|prohibited)\b/.test(lower); 127} 128 129function hasAnyKeyword(lower: string): boolean { 130 return /\b(?:support|provide|implement|enable|allow|accept|return|create|delete|update|send|receive|handle|manage|track|store|validate|generate|defined|means|refers)\b/.test(lower); 131} 132 133// ─── Heading context (same as v1) ──────────────────────────────────────────── 134 135const HEADING_CONTEXT: [RegExp, CanonicalType][] = [ 136 [/\b(?:constraint|security|limit|restrict)/i, CanonicalType.CONSTRAINT], 137 [/\b(?:requirement|feature|capability)/i, CanonicalType.REQUIREMENT], 138 [/\b(?:definition|glossary|term)/i, CanonicalType.DEFINITION], 139 [/\b(?:invariant|guarantee)/i, CanonicalType.INVARIANT], 140]; 141 142function getHeadingContext(sectionPath: string[]): CanonicalType | null { 143 for (let i = sectionPath.length - 1; i >= 0; i--) { 144 for (const [pattern, type] of HEADING_CONTEXT) { 145 if (pattern.test(sectionPath[i])) return type; 146 } 147 } 148 return null; 149} 150 151// ─── Phase 1: Extract candidates ───────────────────────────────────────────── 152 153export interface ExtractionResult { 154 candidates: CandidateNode[]; 155 coverage: ExtractionCoverage[]; 156} 157 158/** 159 * Phase 1: Extract candidate nodes from clauses using sentence segmentation 160 * and scoring rubric. 
161 */ 162export function extractCandidates(clauses: Clause[]): ExtractionResult { 163 const allCandidates: CandidateNode[] = []; 164 const allCoverage: ExtractionCoverage[] = []; 165 166 for (const clause of clauses) { 167 const { candidates, coverage } = extractFromClause(clause); 168 allCandidates.push(...candidates); 169 allCoverage.push(coverage); 170 } 171 172 return { candidates: allCandidates, coverage: allCoverage }; 173} 174 175function extractFromClause(clause: Clause): { candidates: CandidateNode[]; coverage: ExtractionCoverage } { 176 const sentences = segmentSentences(clause.raw_text); 177 const headingContext = getHeadingContext(clause.section_path); 178 const candidates: CandidateNode[] = []; 179 let extractedCount = 0; 180 let contextCount = 0; 181 const uncovered: ExtractionCoverage['uncovered'] = []; 182 183 for (const sentence of sentences) { 184 const content = sentence.text.trim(); 185 if (!content || content.length < CONFIG.MIN_EXTRACTION_LENGTH) { 186 uncovered.push({ text: content, reason: 'too_short' }); 187 continue; 188 } 189 190 const normalizedStatement = normalizeText(content); 191 if (!normalizedStatement) { 192 uncovered.push({ text: content, reason: 'too_short' }); 193 continue; 194 } 195 196 const { type, confidence } = scoreSentence(content, headingContext); 197 const tags = extractTerms(normalizedStatement); 198 199 const candidateId = sha256([type, normalizedStatement, clause.clause_id].join('\x00')); 200 201 candidates.push({ 202 candidate_id: candidateId, 203 type, 204 statement: normalizedStatement, 205 confidence, 206 source_clause_ids: [clause.clause_id], 207 tags, 208 sentence_index: sentence.index, 209 extraction_method: 'rule', 210 }); 211 212 if (type === CanonicalType.CONTEXT) { 213 contextCount++; 214 } else { 215 extractedCount++; 216 } 217 } 218 219 const total = sentences.length; 220 const coverage: ExtractionCoverage = { 221 clause_id: clause.clause_id, 222 total_sentences: total, 223 extracted_sentences: 
extractedCount, 224 context_sentences: contextCount, 225 coverage_pct: total > 0 ? ((extractedCount + contextCount) / total) * 100 : 0, 226 uncovered, 227 }; 228 229 return { candidates, coverage }; 230} 231 232// ─── Term extraction (v2: acronym whitelist + hyphenated compounds) ─────────── 233 234const STOP_WORDS = new Set([ 235 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 236 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 237 'should', 'may', 'might', 'shall', 'can', 'must', 'need', 'to', 'of', 238 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through', 239 'and', 'or', 'but', 'not', 'no', 'if', 'then', 'else', 'when', 'where', 240 'that', 'this', 'these', 'those', 'it', 'its', 'all', 'each', 'every', 241 'any', 'both', 'few', 'more', 'most', 'other', 'some', 'such', 242]); 243 244/** 245 * Extract key terms from normalized text. 246 * Preserves domain acronyms and hyphenated compounds. 247 */ 248export function extractTerms(text: string): string[] { 249 const lower = text.toLowerCase(); 250 251 // Extract hyphenated compounds first (e.g., rate-limit, in-progress) 252 const hyphenated = lower.match(/\b[a-z0-9]+-[a-z0-9]+(?:-[a-z0-9]+)*/g) || []; 253 254 // Split remaining into words 255 const words = lower 256 .split(/\s+/) 257 .map(w => w.replace(/[^a-z0-9-]/g, '')) 258 .filter(Boolean); 259 260 const terms = new Set<string>(); 261 262 // Add hyphenated compounds 263 for (const h of hyphenated) { 264 if (h.length >= CONFIG.MIN_TERM_LENGTH) terms.add(h); 265 } 266 267 // Add individual words 268 for (const w of words) { 269 // Skip stop words 270 if (STOP_WORDS.has(w)) continue; 271 // Keep domain terms regardless of length 272 if (DOMAIN_TERMS.has(w)) { 273 terms.add(w); 274 continue; 275 } 276 // Keep words > 2 chars 277 if (w.length > CONFIG.MIN_WORD_LENGTH && !w.includes('-')) { 278 terms.add(w); 279 } 280 } 281 282 return [...terms]; 283} 284 285// ─── Legacy API: extract + resolve in one call 
─────────────────────────────── 286 287/** 288 * Extract canonical nodes from clauses (v2: sentence-level + resolution). 289 * Backward-compatible API — returns CanonicalNode[]. 290 */ 291export function extractCanonicalNodes(clauses: Clause[]): CanonicalNode[] { 292 const { candidates } = extractCandidates(clauses); 293 return resolveGraph(candidates, clauses); 294}