Reference implementation for the Phoenix Architecture. Work in progress.
aicoding.leaflet.pub/
ai
coding
crazy
1/**
2 * Spec Parser — Markdown → Clause[]
3 *
4 * Splits a Markdown document on heading boundaries.
5 * Each heading + its body = one Clause.
6 * Tracks section hierarchy for nested headings.
7 */
8
9import type { Clause } from './models/clause.js';
10import { normalizeText } from './normalizer.js';
11import { clauseSemhash, contextSemhashCold, clauseId } from './semhash.js';
12
/**
 * One heading-delimited slice of a Markdown document, as produced by
 * `extractSections` before it is hashed into a Clause.
 */
interface RawSection {
  /** Heading text with the leading `#` markers removed and trimmed; `(preamble)` for text before the first heading. */
  heading: string;
  /** Number of `#` characters in the heading (1-6); 0 for the preamble. */
  level: number;
  /** First line of the section, 1-indexed. */
  startLine: number;
  /** Last line of the section, 1-indexed and inclusive. */
  endLine: number;
  /** Section text: the heading line and its body joined with `\n` (the preamble is stored trimmed). */
  rawText: string;
  /** Heading texts from the outermost enclosing heading down to this section's own heading. */
  sectionPath: string[];
}
21
/**
 * Turn a Markdown spec document into its ordered list of Clauses.
 *
 * The text is split on `\n` and handed to `extractSections`; every section
 * that comes back becomes one Clause. When no section is found, the whole
 * document becomes a single Clause with an empty section path spanning all
 * lines, unless the document is only whitespace, in which case the result
 * is an empty array.
 *
 * @param content - Raw Markdown text of the document.
 * @param docId - Identifier stored as `source_doc_id` and passed to `clauseId`.
 * @returns The Clauses in document order.
 */
export function parseSpec(content: string, docId: string): Clause[] {
  const lines = content.split('\n');
  const sections = extractSections(lines);

  if (sections.length > 0) {
    // Pass 1: everything except the context hash, which needs the
    // semhashes of the neighbouring clauses to exist first.
    const drafts: Omit<Clause, 'context_semhash_cold'>[] = [];
    for (const section of sections) {
      const normalizedText = normalizeText(section.rawText);
      const semhash = clauseSemhash(normalizedText);
      const id = clauseId(docId, section.sectionPath, normalizedText);
      drafts.push({
        clause_id: id,
        source_doc_id: docId,
        source_line_range: [section.startLine, section.endLine],
        raw_text: section.rawText,
        normalized_text: normalizedText,
        section_path: section.sectionPath,
        clause_semhash: semhash,
      });
    }

    // Pass 2: fold in the previous/next clause hashes; the first and last
    // clauses use the empty string for the missing neighbour.
    const lastIdx = drafts.length - 1;
    return drafts.map((draft, idx) => {
      const prevHash = idx === 0 ? '' : drafts[idx - 1].clause_semhash;
      const nextHash = idx === lastIdx ? '' : drafts[idx + 1].clause_semhash;
      const contextHash = contextSemhashCold(
        draft.normalized_text,
        draft.section_path,
        prevHash,
        nextHash,
      );
      return { ...draft, context_semhash_cold: contextHash };
    });
  }

  // No sections: a blank document yields nothing ...
  if (content.trim() === '') return [];

  // ... otherwise the entire document is one clause with no section path
  // and no neighbours.
  const normalizedText = normalizeText(content);
  const semhash = clauseSemhash(normalizedText);
  const rootPath: string[] = [];
  const id = clauseId(docId, rootPath, normalizedText);
  const contextHash = contextSemhashCold(normalizedText, rootPath, '', '');
  const wholeDocument: Clause = {
    clause_id: id,
    source_doc_id: docId,
    source_line_range: [1, lines.length],
    raw_text: content,
    normalized_text: normalizedText,
    section_path: rootPath,
    clause_semhash: semhash,
    context_semhash_cold: contextHash,
  };
  return [wholeDocument];
}
75
/**
 * Split Markdown lines into heading-delimited sections.
 *
 * Every ATX heading (`#` through `######` followed by whitespace and text)
 * opens a section that runs up to, but not including, the next heading of
 * ANY level, or to the end of the input. Nesting is not expressed by
 * containment; it is recorded in `sectionPath`, which lists the texts of the
 * enclosing headings followed by the section's own heading.
 *
 * Non-blank text before the first heading is returned as a synthetic
 * `(preamble)` section with level 0 and trimmed text.
 *
 * Lines inside fenced code blocks (opened by three or more backticks or
 * tildes) are never treated as headings, so a `# comment` in a code sample
 * stays inside the section that contains it.
 *
 * @param lines - The document split on `\n`.
 * @returns Sections in document order; an empty array when no heading exists.
 */
function extractSections(lines: string[]): RawSection[] {
  const headingPattern = /^(#{1,6})\s+(.+)/;
  // Opening fence: up to 3 spaces of indent, then a run of >= 3 backticks or tildes.
  const fenceOpenPattern = /^ {0,3}(`{3,}|~{3,})(.*)/;
  // Closing fence: same shape, but nothing except whitespace may follow the run.
  const fenceClosePattern = /^ {0,3}(`{3,}|~{3,})\s*$/;

  const headingIndices: { index: number; level: number; text: string }[] = [];
  let openFence: { char: string; length: number } | null = null;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];

    if (openFence !== null) {
      // Inside a code block only a matching closing fence matters: same
      // character, at least as long as the opening run.
      const closing = line.match(fenceClosePattern);
      if (
        closing &&
        closing[1][0] === openFence.char &&
        closing[1].length >= openFence.length
      ) {
        openFence = null;
      }
      continue;
    }

    const opening = line.match(fenceOpenPattern);
    if (opening) {
      const run = opening[1];
      // A backtick fence whose info string contains a backtick is inline
      // code (e.g. ```x```), not a fence.
      const isFence = run[0] !== '`' || !opening[2].includes('`');
      if (isFence) {
        openFence = { char: run[0], length: run.length };
        continue;
      }
    }

    const match = line.match(headingPattern);
    if (match) {
      headingIndices.push({
        index: i,
        level: match[1].length,
        text: match[2].trim(),
      });
    }
  }

  if (headingIndices.length === 0) return [];

  const sections: RawSection[] = [];
  // Stack of the headings currently enclosing the cursor, outermost first.
  const pathStack: { level: number; text: string }[] = [];

  // Capture pre-heading content as a preamble section.
  const firstHeadingIndex = headingIndices[0].index;
  if (firstHeadingIndex > 0) {
    const preambleText = lines.slice(0, firstHeadingIndex).join('\n').trim();
    if (preambleText.length > 0) {
      sections.push({
        heading: '(preamble)',
        level: 0,
        startLine: 1,
        // 0-indexed position of the first heading == 1-indexed line before it.
        endLine: firstHeadingIndex,
        rawText: preambleText,
        sectionPath: ['(preamble)'],
      });
    }
  }

  for (let h = 0; h < headingIndices.length; h++) {
    const { index, level, text } = headingIndices[h];
    const startLine = index + 1; // 1-indexed
    // The next heading's 0-indexed position equals the 1-indexed number of
    // the line just before it; the last section runs to the end of input.
    const endLine = h < headingIndices.length - 1
      ? headingIndices[h + 1].index
      : lines.length;

    // Drop headings at the same or a deeper level: they cannot enclose this one.
    while (pathStack.length > 0 && pathStack[pathStack.length - 1].level >= level) {
      pathStack.pop();
    }
    pathStack.push({ level, text });

    sections.push({
      heading: text,
      level,
      startLine,
      endLine,
      rawText: lines.slice(index, endLine).join('\n'),
      sectionPath: pathStack.map(p => p.text),
    });
  }

  return sections;
}