// Reference implementation for the Phoenix Architecture. Work in progress.
// See: https://aicoding.leaflet.pub/
1/**
2 * Canonicalization Engine v2
3 *
4 * Phase 1: Extraction — sentence-level, scoring rubric, CONTEXT type.
5 * Produces CandidateNode[] with confidence scores and coverage.
6 *
7 * Also exports the legacy extractCanonicalNodes() for backward compat,
8 * which runs extraction + resolution in one call.
9 */
10
11import type { Clause } from './models/clause.js';
12import type { CanonicalNode, CandidateNode, ExtractionCoverage } from './models/canonical.js';
13import { CanonicalType } from './models/canonical.js';
14import { sha256 } from './semhash.js';
15import { normalizeText } from './normalizer.js';
16import { segmentSentences } from './sentence-segmenter.js';
17import { resolveGraph } from './resolution.js';
18import { CONFIG } from './experiment-config.js';
19
20// ─── Domain term whitelist (short tokens to keep) ────────────────────────────
21
22const DOMAIN_TERMS = new Set([
23 'id', 'ui', 'ux', 'api', 'jwt', 'sso', 'otp', 'ip', 'db', 'tls', 'ssl',
24 'rsa', 'aes', 'rs256', 'hs256', 'oidc', 'oauth', '2fa', 'mfa', 'url',
25 'uri', 'http', 'https', 'sql', 'css', 'html', 'xml', 'json', 'yaml',
26 'csv', 'tcp', 'udp', 'dns', 'cdn', 'ci', 'cd', 'io', 'os', 'vm',
27]);
28
29// ─── Scoring rubric for type classification ──────────────────────────────────
30
/**
 * Per-type score accumulator used by scoreSentence.
 * One numeric slot per CanonicalType; keys are computed from the enum so the
 * shape stays in sync with it.
 */
interface TypeScores {
  [CanonicalType.REQUIREMENT]: number;
  [CanonicalType.CONSTRAINT]: number;
  [CanonicalType.INVARIANT]: number;
  [CanonicalType.DEFINITION]: number;
  [CanonicalType.CONTEXT]: number;
}
38
/**
 * Returns a fresh score sheet with every canonical type zeroed.
 *
 * NOTE(review): scoreSentence tie-breaks the winner via Object.entries order
 * (stable sort) — for string enum keys that follows the insertion order of
 * this literal, so keep the key order if you edit it.
 */
function emptyScores(): TypeScores {
  return {
    [CanonicalType.REQUIREMENT]: 0,
    [CanonicalType.CONSTRAINT]: 0,
    [CanonicalType.INVARIANT]: 0,
    [CanonicalType.DEFINITION]: 0,
    [CanonicalType.CONTEXT]: 0,
  };
}
48
/**
 * Score a sentence across all canonical types; the highest score wins.
 *
 * Each regex rule adds a CONFIG-defined weight to one type's score. Rules are
 * additive, so a sentence can accumulate credit for several types before the
 * winner is picked.
 *
 * @param text            Raw sentence to classify.
 * @param headingContext  Type hinted by the enclosing section heading, or
 *                        null when no heading matched (see getHeadingContext).
 * @returns Winning type plus a confidence clamped to
 *          [CONFIG.MIN_CONFIDENCE, CONFIG.MAX_CONFIDENCE], derived from the
 *          margin between the top two scores.
 */
function scoreSentence(text: string, headingContext: CanonicalType | null): { type: CanonicalType; confidence: number } {
  const scores = emptyScores();
  // Lowercased copy for the helper predicates; the rules below instead run
  // case-insensitive (/i) regexes against the original text.
  const lower = text.toLowerCase();

  // ── Constraint signals ──
  // Negated modals ("must not", "forbidden", …) are strong constraint cues.
  if (/\b(?:must not|shall not|may not|cannot|can't|disallowed|forbidden|prohibited)\b/i.test(text)) {
    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_NEGATION_WEIGHT;
  }
  // Bounding language ("maximum", "at most", …).
  if (/\b(?:limited to|maximum|minimum|at most|at least|no more than|no fewer than|up to|ceiling|floor)\b/i.test(text)) {
    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_LIMIT_WEIGHT;
  }
  // Numeric bounds: "5 per minute", "≤ 100", "between 1 and 10"
  if (/\b\d+\s*(?:per|\/)\s*\w+\b/i.test(text) || /[≤≥<>]\s*\d+/.test(text)) {
    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_NUMERIC_WEIGHT;
  }

  // ── Invariant signals ──
  if (/\b(?:always|never|at all times|regardless|invariant|guaranteed|must remain|must always|must never)\b/i.test(text)) {
    scores[CanonicalType.INVARIANT] += CONFIG.INVARIANT_SIGNAL_WEIGHT;
  }

  // ── Requirement signals ──
  // Plain "must"/"shall" counts as a requirement only when it is not part of
  // a negation/invariant phrase already credited above.
  if (/\b(?:must|shall)\b/i.test(text) && !/\b(?:must not|shall not|must always|must never|must remain)\b/i.test(text)) {
    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_MODAL_WEIGHT;
  }
  if (/\b(?:required|requires?|needs? to|has to|will)\b/i.test(text)) {
    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_KEYWORD_WEIGHT;
  }
  if (/\b(?:support|provide|implement|enable|allow|accept|return|create|delete|update|send|receive|handle|manage|track|store|validate|generate)\b/i.test(text)) {
    scores[CanonicalType.REQUIREMENT] += CONFIG.REQUIREMENT_VERB_WEIGHT;
  }

  // ── Definition signals ──
  // Length cap keeps long requirement-like sentences from scoring as
  // definitions just because they contain "is a".
  if (/\b(?:is defined as|means|refers to|is a|is an)\b/i.test(text) && text.length < CONFIG.DEFINITION_MAX_LENGTH) {
    scores[CanonicalType.DEFINITION] += CONFIG.DEFINITION_EXPLICIT_WEIGHT;
  }
  // Colon pattern "Term: definition text" but not enumerations
  if (/^[A-Z][a-zA-Z\s]{2,30}:\s+[A-Z]/.test(text) && !/[:,]\s*$/.test(text)) {
    scores[CanonicalType.DEFINITION] += CONFIG.DEFINITION_COLON_WEIGHT;
  }

  // ── Context signals (no actionable keywords) ──
  if (!hasAnyModal(lower) && !hasAnyKeyword(lower)) {
    scores[CanonicalType.CONTEXT] += CONFIG.CONTEXT_NO_MODAL_WEIGHT;
  }
  // Short sentence without modals. NOTE(review): the original comment said
  // "without verb-like keywords" but only hasAnyModal is checked here —
  // confirm whether hasAnyKeyword was meant to be included as well.
  if (text.split(/\s+/).length < 8 && !hasAnyModal(lower)) {
    scores[CanonicalType.CONTEXT] += CONFIG.CONTEXT_SHORT_WEIGHT;
  }

  // ── Heading context bonus ──
  if (headingContext) {
    scores[headingContext] += CONFIG.HEADING_CONTEXT_BONUS;
  }

  // ── Also give constraint "must" credit since "must" appears in constraints too ──
  if (/\b(?:must|shall)\b/i.test(text)) {
    scores[CanonicalType.CONSTRAINT] += CONFIG.CONSTRAINT_MUST_BONUS;
  }

  // Pick winner. sort() is stable (ES2019+), so ties resolve by the
  // Object.entries order of the score object (see emptyScores).
  const entries = Object.entries(scores) as [CanonicalType, number][];
  entries.sort((a, b) => b[1] - a[1]);
  const [winType, winScore] = entries[0];
  const runnerUp = entries[1][1];

  // If nothing scored above 0, it's CONTEXT
  if (winScore === 0) {
    return { type: CanonicalType.CONTEXT, confidence: CONFIG.MIN_CONFIDENCE };
  }

  // Confidence = margin over the runner-up normalized by the winning score,
  // clamped to the configured [MIN_CONFIDENCE, MAX_CONFIDENCE] band.
  const confidence = Math.max(CONFIG.MIN_CONFIDENCE, Math.min(CONFIG.MAX_CONFIDENCE, (winScore - runnerUp) / Math.max(winScore, 1)));
  return { type: winType, confidence };
}
124
125function hasAnyModal(lower: string): boolean {
126 return /\b(?:must|shall|should|will|required|requires?|needs? to|has to|cannot|forbidden|prohibited)\b/.test(lower);
127}
128
129function hasAnyKeyword(lower: string): boolean {
130 return /\b(?:support|provide|implement|enable|allow|accept|return|create|delete|update|send|receive|handle|manage|track|store|validate|generate|defined|means|refers)\b/.test(lower);
131}
132
133// ─── Heading context (same as v1) ────────────────────────────────────────────
134
135const HEADING_CONTEXT: [RegExp, CanonicalType][] = [
136 [/\b(?:constraint|security|limit|restrict)/i, CanonicalType.CONSTRAINT],
137 [/\b(?:requirement|feature|capability)/i, CanonicalType.REQUIREMENT],
138 [/\b(?:definition|glossary|term)/i, CanonicalType.DEFINITION],
139 [/\b(?:invariant|guarantee)/i, CanonicalType.INVARIANT],
140];
141
142function getHeadingContext(sectionPath: string[]): CanonicalType | null {
143 for (let i = sectionPath.length - 1; i >= 0; i--) {
144 for (const [pattern, type] of HEADING_CONTEXT) {
145 if (pattern.test(sectionPath[i])) return type;
146 }
147 }
148 return null;
149}
150
151// ─── Phase 1: Extract candidates ─────────────────────────────────────────────
152
/** Output of Phase 1 extraction: candidates plus per-clause coverage stats. */
export interface ExtractionResult {
  // All candidate nodes extracted across the input clauses.
  candidates: CandidateNode[];
  // One coverage record per input clause, in input order.
  coverage: ExtractionCoverage[];
}
157
158/**
159 * Phase 1: Extract candidate nodes from clauses using sentence segmentation
160 * and scoring rubric.
161 */
162export function extractCandidates(clauses: Clause[]): ExtractionResult {
163 const allCandidates: CandidateNode[] = [];
164 const allCoverage: ExtractionCoverage[] = [];
165
166 for (const clause of clauses) {
167 const { candidates, coverage } = extractFromClause(clause);
168 allCandidates.push(...candidates);
169 allCoverage.push(coverage);
170 }
171
172 return { candidates: allCandidates, coverage: allCoverage };
173}
174
/**
 * Extract candidates from a single clause.
 *
 * Segments the clause text into sentences, classifies each usable sentence
 * with the scoring rubric, and records skipped sentences in the coverage
 * summary.
 *
 * @param clause  Source clause; its section_path supplies the heading hint.
 * @returns One candidate per usable sentence plus a coverage summary.
 */
function extractFromClause(clause: Clause): { candidates: CandidateNode[]; coverage: ExtractionCoverage } {
  const sentences = segmentSentences(clause.raw_text);
  const headingContext = getHeadingContext(clause.section_path);
  const candidates: CandidateNode[] = [];
  let extractedCount = 0; // sentences classified as a non-CONTEXT type
  let contextCount = 0;   // sentences classified as CONTEXT
  const uncovered: ExtractionCoverage['uncovered'] = [];

  for (const sentence of sentences) {
    const content = sentence.text.trim();
    // Skip empty or too-short sentences; they still count toward coverage.
    if (!content || content.length < CONFIG.MIN_EXTRACTION_LENGTH) {
      uncovered.push({ text: content, reason: 'too_short' });
      continue;
    }

    const normalizedStatement = normalizeText(content);
    // NOTE(review): a sentence that normalizes to empty is reported with the
    // same 'too_short' reason as the length check above — confirm whether a
    // distinct reason was intended.
    if (!normalizedStatement) {
      uncovered.push({ text: content, reason: 'too_short' });
      continue;
    }

    const { type, confidence } = scoreSentence(content, headingContext);
    const tags = extractTerms(normalizedStatement);

    // Deterministic id over (type, statement, source clause); '\x00' serves
    // as an unambiguous field separator.
    const candidateId = sha256([type, normalizedStatement, clause.clause_id].join('\x00'));

    candidates.push({
      candidate_id: candidateId,
      type,
      statement: normalizedStatement,
      confidence,
      source_clause_ids: [clause.clause_id],
      tags,
      sentence_index: sentence.index,
      extraction_method: 'rule',
    });

    if (type === CanonicalType.CONTEXT) {
      contextCount++;
    } else {
      extractedCount++;
    }
  }

  const total = sentences.length;
  const coverage: ExtractionCoverage = {
    clause_id: clause.clause_id,
    total_sentences: total,
    extracted_sentences: extractedCount,
    context_sentences: contextCount,
    // Share of sentences that produced a candidate (CONTEXT included).
    coverage_pct: total > 0 ? ((extractedCount + contextCount) / total) * 100 : 0,
    uncovered,
  };

  return { candidates, coverage };
}
231
232// ─── Term extraction (v2: acronym whitelist + hyphenated compounds) ───────────
233
234const STOP_WORDS = new Set([
235 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
236 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
237 'should', 'may', 'might', 'shall', 'can', 'must', 'need', 'to', 'of',
238 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'as', 'into', 'through',
239 'and', 'or', 'but', 'not', 'no', 'if', 'then', 'else', 'when', 'where',
240 'that', 'this', 'these', 'those', 'it', 'its', 'all', 'each', 'every',
241 'any', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
242]);
243
244/**
245 * Extract key terms from normalized text.
246 * Preserves domain acronyms and hyphenated compounds.
247 */
248export function extractTerms(text: string): string[] {
249 const lower = text.toLowerCase();
250
251 // Extract hyphenated compounds first (e.g., rate-limit, in-progress)
252 const hyphenated = lower.match(/\b[a-z0-9]+-[a-z0-9]+(?:-[a-z0-9]+)*/g) || [];
253
254 // Split remaining into words
255 const words = lower
256 .split(/\s+/)
257 .map(w => w.replace(/[^a-z0-9-]/g, ''))
258 .filter(Boolean);
259
260 const terms = new Set<string>();
261
262 // Add hyphenated compounds
263 for (const h of hyphenated) {
264 if (h.length >= CONFIG.MIN_TERM_LENGTH) terms.add(h);
265 }
266
267 // Add individual words
268 for (const w of words) {
269 // Skip stop words
270 if (STOP_WORDS.has(w)) continue;
271 // Keep domain terms regardless of length
272 if (DOMAIN_TERMS.has(w)) {
273 terms.add(w);
274 continue;
275 }
276 // Keep words > 2 chars
277 if (w.length > CONFIG.MIN_WORD_LENGTH && !w.includes('-')) {
278 terms.add(w);
279 }
280 }
281
282 return [...terms];
283}
284
285// ─── Legacy API: extract + resolve in one call ───────────────────────────────
286
287/**
288 * Extract canonical nodes from clauses (v2: sentence-level + resolution).
289 * Backward-compatible API — returns CanonicalNode[].
290 */
291export function extractCanonicalNodes(clauses: Clause[]): CanonicalNode[] {
292 const { candidates } = extractCandidates(clauses);
293 return resolveGraph(candidates, clauses);
294}