Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
at main 164 lines 5.1 kB view raw
/**
 * Sentence Segmenter — splits clause text into semantic units.
 *
 * Rules:
 * - Markdown headings (`#` … `######`) contribute their text as one sentence
 * - List items (-, *, •, numbered) are each one sentence
 * - Prose is split on sentence-ending punctuation
 * - Compound modals ("must A and must B") are split into two
 * - Sequence indicators (→, ->) are never used as split points
 */

import { CONFIG } from './experiment-config.js';

/** A segmented sentence with its position index */
export interface Sentence {
  text: string;
  index: number;
  /** Whether this came from a list item (vs prose splitting) */
  fromList: boolean;
}

/** Markdown ATX heading: one to six `#`, whitespace, then the heading text (group 1). */
const HEADING_PATTERN = /^#{1,6}\s+(.*)/;

/**
 * Markdown thematic break / setext underline: three or more of the same
 * `-`, `*` or `_`, optionally separated by whitespace (`---`, `* * *`).
 */
const THEMATIC_BREAK_PATTERN = /^([-*_])(?:\s*\1){2,}$/;

/**
 * List item marker followed by the item content (group 1).
 *
 * `-`, `*` and `N.` / `N)` must be followed by whitespace (or end of line) so
 * that prose starting with emphasis (`**Note** …`), a negative number or flag
 * (`-5`, `--verbose`), an arrow (`-> next`) or a decimal (`3.5 seconds`) is not
 * mistaken for a list item and stripped of its leading characters.
 * `•` is unambiguous, so it is accepted with or without a following space.
 */
const LIST_ITEM_PATTERN = /^(?:•\s*|(?:[-*]|\d+[.)])(?:\s+|$))(.*)/;

/**
 * Segment clause raw text into individual sentences.
 *
 * The text is processed line by line:
 * - heading lines emit their text as a non-list sentence;
 * - blank lines and thematic breaks end the current prose paragraph;
 * - list items emit their content (after compound-modal splitting) with
 *   `fromList: true`;
 * - every other line is joined with a single space onto the current prose
 *   paragraph, which is sentence-split when the paragraph ends.
 *
 * Headings and list items shorter than `CONFIG.MIN_LIST_ITEM_LENGTH` are dropped.
 *
 * @param rawText Clause text, lines separated by `\n`.
 * @returns Sentences in document order; `index` equals the array position.
 */
export function segmentSentences(rawText: string): Sentence[] {
  const sentences: Sentence[] = [];
  let proseBuffer = '';

  // Emit whatever prose has accumulated so far and start a new paragraph.
  const flushPendingProse = (): void => {
    if (proseBuffer) {
      flushProse(proseBuffer, sentences, sentences.length);
      proseBuffer = '';
    }
  };

  for (const line of rawText.split('\n')) {
    const trimmed = line.trim();

    // Extract heading text as a sentence (provides section context)
    const headingMatch = HEADING_PATTERN.exec(trimmed);
    if (headingMatch) {
      flushPendingProse();
      const headingText = (headingMatch[1] ?? '').trim();
      if (headingText.length >= CONFIG.MIN_LIST_ITEM_LENGTH) {
        sentences.push({ text: headingText, index: sentences.length, fromList: false });
      }
      continue;
    }

    // Blank lines and thematic breaks carry no content; they only end a paragraph.
    if (!trimmed || THEMATIC_BREAK_PATTERN.test(trimmed)) {
      flushPendingProse();
      continue;
    }

    const listMatch = LIST_ITEM_PATTERN.exec(trimmed);
    if (listMatch) {
      flushPendingProse();
      const content = (listMatch[1] ?? '').trim();
      if (content.length >= CONFIG.MIN_LIST_ITEM_LENGTH) {
        // Split compound modals within list items
        for (const sub of splitCompoundModals(content)) {
          sentences.push({ text: sub, index: sentences.length, fromList: true });
        }
      }
      continue;
    }

    // Prose line — accumulate
    proseBuffer += (proseBuffer ? ' ' : '') + trimmed;
  }

  // Flush remaining prose
  flushPendingProse();

  return sentences;
}

/**
 * Split prose text into sentences and append them to `sentences`.
 *
 * Candidates shorter than `CONFIG.MIN_PROSE_SENTENCE_LENGTH` are skipped; the
 * rest go through compound-modal splitting and are pushed with
 * `fromList: false`.
 *
 * @param text Accumulated prose paragraph.
 * @param sentences Output array, mutated in place.
 * @param startIdx Index assigned to the first sentence pushed.
 */
function flushProse(text: string, sentences: Sentence[], startIdx: number): void {
  let idx = startIdx;
  for (const candidate of splitProseIntoSentences(text)) {
    const trimmed = candidate.trim();
    if (trimmed.length < CONFIG.MIN_PROSE_SENTENCE_LENGTH) continue;
    for (const sub of splitCompoundModals(trimmed)) {
      sentences.push({ text: sub, index: idx++, fromList: false });
    }
  }
}

/**
 * Split prose text on sentence boundaries.
 *
 * A boundary is `.`, `!` or `?` followed by whitespace and then an ASCII
 * uppercase letter. Text shorter than `CONFIG.PROSE_SPLIT_THRESHOLD` is
 * returned unsplit.
 *
 * @returns Trimmed, non-empty sentence candidates (or `[text]` when unsplit).
 */
function splitProseIntoSentences(text: string): string[] {
  // Don't split if it's short enough to be one sentence
  if (text.length < CONFIG.PROSE_SPLIT_THRESHOLD) return [text];

  const results: string[] = [];
  // Created per call so the global regex's lastIndex state is never shared.
  const boundary = /[.!?]\s+(?=[A-Z])/g;
  let start = 0;
  let match: RegExpExecArray | null;

  while ((match = boundary.exec(text)) !== null) {
    // Keep the punctuation mark (one character) with the sentence it ends.
    results.push(text.slice(start, match.index + 1).trim());
    // Resume after the whitespace run that followed the punctuation.
    start = match.index + match[0].length;
  }

  if (start < text.length) {
    results.push(text.slice(start).trim());
  }

  return results.filter(s => s.length > 0);
}

/**
 * Split compound modal sentences:
 *   "X must do A and must do B" → ["X must do A", "must do B"]
 *   "X must do A; Y must do B" → ["X must do A", "Y must do B"]
 *
 * Semicolon split: applied when every semicolon-separated part contains a
 * modal; parts shorter than `CONFIG.MIN_SPLIT_PART_LENGTH` are dropped.
 * " and " split: applied at the first " and " directly followed by a modal,
 * when the left part contains a modal and both parts meet the minimum length.
 *
 * @returns The split parts, or `[text]` when no split applies. Never empty.
 */
function splitCompoundModals(text: string): string[] {
  // Check for semicolons with modals on both sides
  const semiParts = text.split(/\s*;\s*/);
  if (semiParts.length > 1 && semiParts.every(p => hasModal(p))) {
    const kept = semiParts.filter(p => p.length >= CONFIG.MIN_SPLIT_PART_LENGTH);
    // If every part is too short, keep the text whole rather than losing it.
    if (kept.length > 0) return kept;
  }

  // Check for " and " + modal or " and " separating complete modal clauses
  const andPattern = /\s+and\s+(?=(?:must|shall|should|will|cannot|must not)\s)/i;
  const andMatch = andPattern.exec(text);
  if (andMatch) {
    const left = text.slice(0, andMatch.index).trim();
    const right = text.slice(andMatch.index + andMatch[0].length).trim();
    const longEnough =
      left.length >= CONFIG.MIN_SPLIT_PART_LENGTH &&
      right.length >= CONFIG.MIN_SPLIT_PART_LENGTH;
    // `right` starts with a modal by construction (regex lookahead), so only
    // `left` needs an explicit check.
    if (longEnough && hasModal(left)) {
      return [left, right];
    }
  }

  return [text];
}

/** Whether `text` contains a modal verb (must, shall, should, will, cannot, must not, may not). */
function hasModal(text: string): boolean {
  return /\b(?:must|shall|should|will|cannot|must not|may not)\b/i.test(text);
}