Reference implementation for the Phoenix Architecture. Work in progress.
aicoding.leaflet.pub/
ai
coding
crazy
/**
 * Sentence Segmenter — splits clause text into semantic units.
 *
 * Rules:
 * - Markdown headings (# … ######) are each emitted as one sentence
 * - List items (-, *, •, numbered) are each one sentence
 * - Prose is split on sentence-ending punctuation
 * - Compound modals ("must A and must B") are split into two
 * - Lines with sequence indicators (→, ->) are kept atomic
 */
10
11import { CONFIG } from './experiment-config.js';
12
/**
 * One semantic unit produced by the segmenter.
 */
export interface Sentence {
  /** The text of the unit. */
  text: string;
  /** Position index assigned to the unit by the segmenter. */
  index: number;
  /** True when the unit came from a list item; false when it came from prose splitting. */
  fromList: boolean;
}
20
/**
 * Segment clause raw text into individual sentences.
 *
 * The text is processed line by line:
 * - Markdown headings (`#` to `######`) become one sentence each.
 * - List items (`-`, `*`, `•`, `1.`, `1)`) become one sentence each and are
 *   further split on compound modals.
 * - Thematic breaks (`---`, `***`) only terminate the current paragraph.
 * - Every other non-empty line is accumulated into a prose paragraph, which
 *   is split into sentences when a blank line, heading, list item, thematic
 *   break, or the end of input is reached.
 *
 * @param rawText - Raw clause text, possibly containing Markdown structure.
 * @returns Sentences in document order; each `index` equals its array position.
 */
export function segmentSentences(rawText: string): Sentence[] {
  const sentences: Sentence[] = [];
  let proseBuffer = '';

  // Emit whatever prose has accumulated so far. Every pushed sentence gets
  // the next consecutive index, so the next index is always the array length.
  const flushPending = (): void => {
    if (proseBuffer) {
      flushProse(proseBuffer, sentences, sentences.length);
      proseBuffer = '';
    }
  };

  for (const line of rawText.split('\n')) {
    const trimmed = line.trim();

    // Blank line: paragraph boundary.
    if (!trimmed) {
      flushPending();
      continue;
    }

    // Heading text is kept as a sentence (provides section context).
    const headingMatch = /^#{1,6}\s+(.*)/.exec(trimmed);
    if (headingMatch) {
      flushPending();
      const headingText = headingMatch[1].trim();
      if (headingText.length >= CONFIG.MIN_LIST_ITEM_LENGTH) {
        sentences.push({ text: headingText, index: sentences.length, fromList: false });
      }
      continue;
    }

    // Thematic break (`---`, `***`, `- - -`): a separator, never content.
    // Checked before list detection so a long rule is not read as a bullet.
    if (/^([-*])(?:\s*\1){2,}$/.test(trimmed)) {
      flushPending();
      continue;
    }

    // List item. ASCII markers (`-`, `*`, `1.`, `1)`) must be followed by
    // whitespace or end of line; otherwise prose such as `**Note:** ...`,
    // `-5 degrees` or `1.5 seconds` would lose its leading characters.
    // `•` is unambiguous, so it may be followed directly by the content.
    const listMatch = /^(?:•|(?:[-*]|\d+[.)])(?=\s|$))\s*(.*)/.exec(trimmed);
    if (listMatch) {
      flushPending();
      const content = listMatch[1].trim();
      if (content.length >= CONFIG.MIN_LIST_ITEM_LENGTH) {
        for (const sub of splitCompoundModals(content)) {
          sentences.push({ text: sub, index: sentences.length, fromList: true });
        }
      }
      continue;
    }

    // Prose line: accumulate, joining wrapped lines with a single space.
    proseBuffer += (proseBuffer ? ' ' : '') + trimmed;
  }

  // Flush prose left over at end of input.
  flushPending();

  return sentences;
}
89
/**
 * Break a buffered prose paragraph into sentences and append them to `sentences`.
 *
 * Candidates shorter than `CONFIG.MIN_PROSE_SENTENCE_LENGTH` (after trimming)
 * are discarded; the remaining ones go through compound-modal splitting and
 * every resulting part is appended with `fromList: false`.
 *
 * @param text - The accumulated prose paragraph.
 * @param sentences - Output array; mutated in place.
 * @param startIdx - Index given to the first appended sentence; later ones count up from it.
 */
function flushProse(text: string, sentences: Sentence[], startIdx: number): void {
  let nextIndex = startIdx;

  const candidates = splitProseIntoSentences(text)
    .map(piece => piece.trim())
    .filter(piece => piece.length >= CONFIG.MIN_PROSE_SENTENCE_LENGTH);

  for (const candidate of candidates) {
    for (const part of splitCompoundModals(candidate)) {
      sentences.push({ text: part, index: nextIndex, fromList: false });
      nextIndex += 1;
    }
  }
}
107
/**
 * Cut a prose paragraph at sentence boundaries.
 *
 * A boundary is `.`, `!` or `?` followed by whitespace and then an ASCII
 * uppercase letter. Text shorter than `CONFIG.PROSE_SPLIT_THRESHOLD` is
 * returned unsplit as a single element.
 *
 * @param text - The prose paragraph.
 * @returns The trimmed, non-empty sentence pieces in order.
 */
function splitProseIntoSentences(text: string): string[] {
  // Short text is treated as a single sentence.
  if (text.length < CONFIG.PROSE_SPLIT_THRESHOLD) {
    return [text];
  }

  const boundary = /([.!?])\s+(?=[A-Z])/g;
  const pieces: string[] = [];
  let cursor = 0;

  for (let hit = boundary.exec(text); hit !== null; hit = boundary.exec(text)) {
    // The punctuation mark stays with the sentence it terminates.
    const cut = hit.index + hit[1].length;
    const piece = text.slice(cursor, cut).trim();
    if (piece.length > 0) {
      pieces.push(piece);
    }
    // Resume after the whitespace that followed the punctuation.
    cursor = boundary.lastIndex;
  }

  if (cursor < text.length) {
    const tail = text.slice(cursor).trim();
    if (tail.length > 0) {
      pieces.push(tail);
    }
  }

  return pieces;
}
133
/**
 * Split compound modal sentences:
 *   "X must do A and must do B" → ["X must do A", "must do B"]
 *   "X must do A; Y must do B"  → ["X must do A", "Y must do B"]
 *
 * A split only happens when every resulting part contains a modal verb and
 * is at least `CONFIG.MIN_SPLIT_PART_LENGTH` characters long; otherwise the
 * text is returned unchanged, so no content is ever dropped.
 * Text containing a sequence indicator (`→` or `->`) is never split.
 *
 * @param text - A single sentence or list-item body.
 * @returns The split parts, or `[text]` when no split applies.
 */
function splitCompoundModals(text: string): string[] {
  // Sequences ("A → B → C") describe one ordered flow; keep them atomic.
  if (/→|->/.test(text)) {
    return [text];
  }

  const minLength = CONFIG.MIN_SPLIT_PART_LENGTH;

  // Semicolon-separated clauses. Empty parts (e.g. from a trailing ';') are
  // ignored. The split is all-or-nothing: if any clause is too short or has
  // no modal, the text is left whole instead of discarding that clause.
  const semiParts = text
    .split(/\s*;\s*/)
    .map(part => part.trim())
    .filter(part => part.length > 0);
  if (
    semiParts.length > 1 &&
    semiParts.every(part => part.length >= minLength && hasModal(part))
  ) {
    return semiParts;
  }

  // " and " immediately followed by a modal; splits at the first occurrence.
  // The alternatives match the modals hasModal recognises ("must" followed by
  // whitespace already covers "must not").
  const andMatch = /\s+and\s+(?=(?:must|shall|should|will|cannot|may not)\s)/i.exec(text);
  if (andMatch) {
    const left = text.slice(0, andMatch.index).trim();
    const right = text.slice(andMatch.index + andMatch[0].length).trim();
    if (left.length >= minLength && right.length >= minLength && hasModal(left)) {
      return [left, right];
    }
  }

  return [text];
}
161
/**
 * Report whether `text` contains a modal keyword as a whole word,
 * case-insensitively: must, shall, should, will, cannot, "must not", "may not".
 * A bare "may" does not count.
 */
function hasModal(text: string): boolean {
  const modalPattern = /\b(?:must|shall|should|will|cannot|must not|may not)\b/i;
  return text.search(modalPattern) !== -1;
}
164}