src/spec-parser.ts at main · chadfowler.com/phoenix

chadfowler.com / phoenix
fork atom
Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
fork atom
phoenix / src / spec-parser.ts
at main 145 lines 4.6 kB view raw
wrap content
Chad Fowler Phoenix VCS v0.1.0 — initial commit 5w ago
030f51e3
  1/**
  2 * Spec Parser — Markdown → Clause[]
  3 *
  4 * Splits a Markdown document on heading boundaries.
  5 * Each heading + its body = one Clause.
  6 * Tracks section hierarchy for nested headings.
  7 */
  8
  9import type { Clause } from './models/clause.js';
 10import { normalizeText } from './normalizer.js';
 11import { clauseSemhash, contextSemhashCold, clauseId } from './semhash.js';
 12
 13interface RawSection {
 14  heading: string;
 15  level: number;
 16  startLine: number; // 1-indexed
 17  endLine: number;   // 1-indexed, inclusive
 18  rawText: string;
 19  sectionPath: string[];
 20}
 21
 22/**
 23 * Parse a Markdown document into an array of Clauses.
 24 */
 25export function parseSpec(content: string, docId: string): Clause[] {
 26  const lines = content.split('\n');
 27  const sections = extractSections(lines);
 28
 29  if (sections.length === 0) {
 30    // No headings found — treat entire document as one clause
 31    if (content.trim().length === 0) return [];
 32    const normalizedText = normalizeText(content);
 33    const semhash = clauseSemhash(normalizedText);
 34    const sectionPath: string[] = [];
 35    const id = clauseId(docId, sectionPath, normalizedText);
 36    const ctxHash = contextSemhashCold(normalizedText, sectionPath, '', '');
 37    return [{
 38      clause_id: id,
 39      source_doc_id: docId,
 40      source_line_range: [1, lines.length],
 41      raw_text: content,
 42      normalized_text: normalizedText,
 43      section_path: sectionPath,
 44      clause_semhash: semhash,
 45      context_semhash_cold: ctxHash,
 46    }];
 47  }
 48
 49  // Build clauses without context hashes first
 50  const preClauses: Omit<Clause, 'context_semhash_cold'>[] = sections.map(sec => {
 51    const normalized = normalizeText(sec.rawText);
 52    const semhash = clauseSemhash(normalized);
 53    const id = clauseId(docId, sec.sectionPath, normalized);
 54    return {
 55      clause_id: id,
 56      source_doc_id: docId,
 57      source_line_range: [sec.startLine, sec.endLine] as [number, number],
 58      raw_text: sec.rawText,
 59      normalized_text: normalized,
 60      section_path: sec.sectionPath,
 61      clause_semhash: semhash,
 62    };
 63  });
 64
 65  // Now compute context hashes with neighbor awareness
 66  const clauses: Clause[] = preClauses.map((pc, i) => {
 67    const prev = i > 0 ? preClauses[i - 1].clause_semhash : '';
 68    const next = i < preClauses.length - 1 ? preClauses[i + 1].clause_semhash : '';
 69    const ctxHash = contextSemhashCold(pc.normalized_text, pc.section_path, prev, next);
 70    return { ...pc, context_semhash_cold: ctxHash };
 71  });
 72
 73  return clauses;
 74}
 75
 76/**
 77 * Extract sections from Markdown lines.
 78 * A section = heading line through (but not including) the next heading of same or higher level,
 79 * or end of file.
 80 */
 81function extractSections(lines: string[]): RawSection[] {
 82  const headingPattern = /^(#{1,6})\s+(.+)/;
 83  const headingIndices: { index: number; level: number; text: string }[] = [];
 84
 85  for (let i = 0; i < lines.length; i++) {
 86    const match = lines[i].match(headingPattern);
 87    if (match) {
 88      headingIndices.push({
 89        index: i,
 90        level: match[1].length,
 91        text: match[2].trim(),
 92      });
 93    }
 94  }
 95
 96  if (headingIndices.length === 0) return [];
 97
 98  // Build sections with proper section_path tracking
 99  const sections: RawSection[] = [];
100  const pathStack: { level: number; text: string }[] = [];
101
102  // Capture pre-heading content as a preamble section
103  if (headingIndices.length > 0 && headingIndices[0].index > 0) {
104    const preambleText = lines.slice(0, headingIndices[0].index).join('\n').trim();
105    if (preambleText.length > 0) {
106      sections.push({
107        heading: '(preamble)',
108        level: 0,
109        startLine: 1,
110        endLine: headingIndices[0].index,
111        rawText: preambleText,
112        sectionPath: ['(preamble)'],
113      });
114    }
115  }
116
117  for (let h = 0; h < headingIndices.length; h++) {
118    const { index, level, text } = headingIndices[h];
119    const startLine = index + 1; // 1-indexed
120    const endLine = h < headingIndices.length - 1
121      ? headingIndices[h + 1].index // line before next heading (0-indexed), = next heading 1-indexed - 1
122      : lines.length;
123
124    // Update section path stack
125    while (pathStack.length > 0 && pathStack[pathStack.length - 1].level >= level) {
126      pathStack.pop();
127    }
128    pathStack.push({ level, text });
129    const sectionPath = pathStack.map(p => p.text);
130
131    // Extract raw text for this section
132    const rawText = lines.slice(index, endLine).join('\n');
133
134    sections.push({
135      heading: text,
136      level,
137      startLine,
138      endLine,
139      rawText,
140      sectionPath: [...sectionPath],
141    });
142  }
143
144  return sections;
145}