Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
at main 145 lines 4.6 kB view raw
/**
 * Spec Parser — Markdown → Clause[]
 *
 * Splits a Markdown document on heading boundaries.
 * Each heading + its body = one Clause.
 * Tracks section hierarchy for nested headings.
 */

import type { Clause } from './models/clause.js';
import { normalizeText } from './normalizer.js';
import { clauseSemhash, contextSemhashCold, clauseId } from './semhash.js';

/** One heading-delimited slice of the document, before hashing. */
interface RawSection {
  heading: string;
  level: number;
  startLine: number; // 1-indexed
  endLine: number; // 1-indexed, inclusive
  rawText: string;
  sectionPath: string[];
}

/**
 * Parse a Markdown document into an array of Clauses.
 *
 * The document is split into sections by `extractSections`; each section
 * becomes one Clause. Context hashes are computed in a second pass because
 * each one takes the semhash of the previous and next clause as input.
 *
 * @param content Full Markdown text of the document.
 * @param docId Identifier stored on every clause as `source_doc_id` and fed
 *   into `clauseId`.
 * @returns One Clause per section, in document order. A document without any
 *   heading yields a single clause covering the whole text, or an empty array
 *   when the text is blank.
 */
export function parseSpec(content: string, docId: string): Clause[] {
  const lines = content.split('\n');
  const sections = extractSections(lines);

  if (sections.length === 0) {
    // No headings found — treat entire document as one clause
    if (content.trim().length === 0) return [];
    const normalizedText = normalizeText(content);
    const semhash = clauseSemhash(normalizedText);
    const sectionPath: string[] = [];
    const id = clauseId(docId, sectionPath, normalizedText);
    // A lone clause has no neighbors, hence the empty prev/next hashes.
    const ctxHash = contextSemhashCold(normalizedText, sectionPath, '', '');
    return [
      {
        clause_id: id,
        source_doc_id: docId,
        source_line_range: [1, lines.length],
        raw_text: content,
        normalized_text: normalizedText,
        section_path: sectionPath,
        clause_semhash: semhash,
        context_semhash_cold: ctxHash,
      },
    ];
  }

  // Build clauses without context hashes first
  const preClauses: Omit<Clause, 'context_semhash_cold'>[] = sections.map(sec => {
    const normalized = normalizeText(sec.rawText);
    const semhash = clauseSemhash(normalized);
    const id = clauseId(docId, sec.sectionPath, normalized);
    return {
      clause_id: id,
      source_doc_id: docId,
      source_line_range: [sec.startLine, sec.endLine] as [number, number],
      raw_text: sec.rawText,
      normalized_text: normalized,
      section_path: sec.sectionPath,
      clause_semhash: semhash,
    };
  });

  // Now compute context hashes with neighbor awareness.
  // Out-of-range neighbors (first/last clause) contribute an empty hash.
  return preClauses.map((pc, i) => {
    const prev = preClauses[i - 1]?.clause_semhash ?? '';
    const next = preClauses[i + 1]?.clause_semhash ?? '';
    const ctxHash = contextSemhashCold(pc.normalized_text, pc.section_path, prev, next);
    return { ...pc, context_semhash_cold: ctxHash };
  });
}

/**
 * Extract sections from Markdown lines.
 *
 * A section = heading line through (but not including) the next heading of
 * ANY level, or end of file. Nested headings therefore start their own
 * section; the hierarchy is recorded in `sectionPath` (ancestor heading texts
 * followed by the section's own heading text).
 *
 * Lines inside fenced code blocks (``` or ~~~) are never treated as headings,
 * so a `# comment` inside a code sample does not split its clause. An
 * unclosed fence extends to the end of the document.
 *
 * Non-blank content before the first heading is returned as a level-0
 * section named `(preamble)`.
 *
 * @param lines The document split on `\n`.
 * @returns Sections in document order; empty when no heading is found.
 */
function extractSections(lines: string[]): RawSection[] {
  const headingPattern = /^(#{1,6})\s+(.+)/;
  // A fence marker: up to three spaces of indent, then 3+ backticks or tildes.
  const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
  const headingIndices: { index: number; level: number; text: string }[] = [];

  // Marker run that opened the code fence we are currently inside, if any.
  let openFence: string | null = null;

  for (const [index, line] of lines.entries()) {
    const fence = fencePattern.exec(line)?.[1];

    if (openFence !== null) {
      // Only a bare marker of the same character, at least as long as the
      // opener, closes the fence; everything else is code content.
      const closes =
        fence !== undefined &&
        fence[0] === openFence[0] &&
        fence.length >= openFence.length &&
        line.trim() === fence;
      if (closes) openFence = null;
      continue;
    }

    if (fence !== undefined) {
      openFence = fence;
      continue;
    }

    const match = headingPattern.exec(line);
    if (match) {
      const [, hashes = '', text = ''] = match;
      headingIndices.push({ index, level: hashes.length, text: text.trim() });
    }
  }

  const firstHeading = headingIndices[0];
  if (firstHeading === undefined) return [];

  // Build sections with proper section_path tracking
  const sections: RawSection[] = [];
  const pathStack: { level: number; text: string }[] = [];

  // Capture pre-heading content as a preamble section
  if (firstHeading.index > 0) {
    const preambleText = lines.slice(0, firstHeading.index).join('\n').trim();
    if (preambleText.length > 0) {
      sections.push({
        heading: '(preamble)',
        level: 0,
        startLine: 1,
        // 0-indexed position of the first heading == 1-indexed last preamble line
        endLine: firstHeading.index,
        rawText: preambleText,
        sectionPath: ['(preamble)'],
      });
    }
  }

  headingIndices.forEach(({ index, level, text }, h) => {
    const nextHeading = headingIndices[h + 1];
    const startLine = index + 1; // 1-indexed
    // The next heading's 0-indexed position equals the 1-indexed number of the
    // line just before it, so it serves both as the inclusive 1-indexed end
    // and as the exclusive 0-indexed slice bound below.
    const endLine = nextHeading !== undefined ? nextHeading.index : lines.length;

    // Update section path stack: drop every entry that is not a strict
    // ancestor (same or deeper level) before pushing this heading.
    let top = pathStack[pathStack.length - 1];
    while (top !== undefined && top.level >= level) {
      pathStack.pop();
      top = pathStack[pathStack.length - 1];
    }
    pathStack.push({ level, text });

    sections.push({
      heading: text,
      level,
      startLine,
      endLine,
      // Extract raw text for this section
      rawText: lines.slice(index, endLine).join('\n'),
      // `map` already returns a fresh array, so later stack mutation is safe.
      sectionPath: pathStack.map(p => p.text),
    });
  });

  return sections;
}