Reference implementation for the Phoenix Architecture. Work in progress. aicoding.leaflet.pub/
ai coding crazy
at main 109 lines 3.2 kB view raw
/**
 * Text normalization for stable semantic hashing.
 *
 * Goals:
 * - Formatting-only changes produce identical normalized output
 * - Unordered list items are sorted for hash stability
 * - Ordered/sequence lists are preserved (arrows, ordinals, numbered)
 * - Deterministic and idempotent
 */

/** Fenced code blocks, matched non-greedily across lines. */
const FENCED_CODE_BLOCK = /```[\s\S]*?```/g;

/** Leading markdown heading markers (`#` through `######`). */
const HEADING_MARKER = /^#{1,6}\s+/gm;

/**
 * Asterisk emphasis (`*x*`, `**x**`, `***x***`).
 * The content must stay on one line and must start and end with a
 * non-space character, so a `* ` list bullet is never mistaken for an
 * opening emphasis marker.
 */
const ASTERISK_EMPHASIS = /\*{1,3}([^\s*](?:[^*\n]*[^\s*])?)\*{1,3}/g;

/**
 * Underscore emphasis (`_x_`, `__x__`, `___x___`).
 * The markers must not touch an ASCII word character on the outside, so
 * identifiers such as `foo_bar_baz` are left intact. Group 1 is the
 * preceding boundary character (re-emitted), group 2 the emphasized text.
 */
const UNDERSCORE_EMPHASIS =
  /(^|[^A-Za-z0-9_])_{1,3}([^\s_](?:[^_\n]*[^\s_])?)_{1,3}(?![A-Za-z0-9_])/gm;

/** Inline code spans; the backticks are dropped, the content is kept. */
const INLINE_CODE = /`([^`]+)`/g;

/** Markdown links `[text](url)`; only the text is kept. */
const LINK = /\[([^\]]+)\]\([^)]+\)/g;

/**
 * A list item on an already trimmed, whitespace-collapsed line.
 * Group 1 is the marker, group 2 the item content.
 * `-`, `*` and numbered markers (`1.` / `1)`) must be followed by
 * whitespace or end of line, so `---`, `-5` and `1.5x` are not list items.
 * The `•` bullet is unambiguous and may be followed directly by text.
 */
const LIST_ITEM = /^(•|[-*](?=\s|$)|\d+[.)](?=\s|$))\s*(.*)$/;

/**
 * Strip markdown formatting that carries no semantic content.
 * Order matters: code fences are replaced first so their content is not
 * touched by the later, finer-grained rules.
 */
function stripInlineMarkup(raw: string): string {
  return raw
    // Keep the fact that a code block existed, but not its content.
    .replace(FENCED_CODE_BLOCK, '(code block)')
    .replace(HEADING_MARKER, '')
    .replace(ASTERISK_EMPHASIS, '$1')
    .replace(UNDERSCORE_EMPHASIS, '$1$2')
    .replace(INLINE_CODE, '$1')
    .replace(LINK, '$1');
}

/**
 * Normalize a block of text for semantic hashing.
 *
 * Markdown formatting is stripped, text is lowercased, whitespace inside
 * each line is collapsed, and blank lines are dropped. Consecutive list
 * items are gathered into a list; a blank line or a non-list line ends it.
 * Unordered lists are emitted sorted, numbered lists and lists containing
 * sequence indicators keep their original order.
 *
 * @param raw - The source text (typically markdown).
 * @returns The normalized lines joined with `\n`.
 */
export function normalizeText(raw: string): string {
  const text = stripInlineMarkup(raw).toLowerCase();

  const processed: string[] = [];
  let listBuffer: string[] = [];
  let listIsOrdered = false;

  // Emit the list collected so far (if any) and reset the list state.
  const flushPendingList = (): void => {
    if (listBuffer.length === 0) return;
    flushList(listBuffer, listIsOrdered, processed);
    listBuffer = [];
    listIsOrdered = false;
  };

  for (const line of text.split('\n')) {
    const trimmed = line.replace(/\s+/g, ' ').trim();

    // A blank line terminates the current list and is itself dropped.
    if (trimmed === '') {
      flushPendingList();
      continue;
    }

    const listMatch = LIST_ITEM.exec(trimmed);
    if (listMatch === null) {
      // Ordinary text line: terminates the current list.
      flushPendingList();
      processed.push(trimmed);
      continue;
    }

    const marker = listMatch[1] ?? '';
    const content = (listMatch[2] ?? '').trim();

    // The first item decides whether this is a numbered (ordered) list.
    if (listBuffer.length === 0) {
      listIsOrdered = /^\d/.test(marker);
    }
    // A sequence indicator in any item pins the order of the whole list.
    if (isSequenceContent(content)) {
      listIsOrdered = true;
    }
    listBuffer.push(content);
  }

  flushPendingList();
  return processed.join('\n');
}

/**
 * Check if list item content contains sequence/order indicators
 * that should prevent sorting.
 *
 * @param text - Content of a single list item, without its marker.
 * @returns `true` when the item contains an arrow, an ordinal/sequencing
 *   word, or a comma-delimited run of three or more words.
 */
function isSequenceContent(text: string): boolean {
  // Arrows: →, ←, ⇒, ⇐, ->, <-, =>
  if (/[→←⇒⇐]|->|<-|=>/.test(text)) return true;
  // Ordinals and sequencing words: 1st, 2nd, first, second, then, finally, ...
  if (/\b(?:1st|2nd|3rd|\d+th|first|second|third|then|finally|next|after)\b/i.test(text)) return true;
  // Comma-delimited sequence with 3+ items that look like states/steps
  if (/\w+\s*,\s*\w+\s*,\s*\w+/.test(text)) return true;
  return false;
}

/**
 * Flush a list buffer to processed lines.
 * Unordered lists are sorted; ordered/sequence lists preserve order.
 *
 * @param items - Collected list item contents; not modified.
 * @param isOrdered - When `true`, the items are appended in their given order.
 * @param out - Destination array the items are appended to.
 */
function flushList(items: string[], isOrdered: boolean, out: string[]): void {
  // Default sort compares UTF-16 code units, which is locale-independent
  // and therefore deterministic. Sort a copy so the caller's array is untouched.
  const ordered = isOrdered ? items : [...items].sort();
  out.push(...ordered);
}