Reference implementation for the Phoenix Architecture. Work in progress.
aicoding.leaflet.pub/
ai
coding
crazy
1/**
2 * Text normalization for stable semantic hashing.
3 *
4 * Goals:
5 * - Formatting-only changes produce identical normalized output
6 * - Unordered list items are sorted for hash stability
7 * - Ordered/sequence lists are preserved (arrows, ordinals, numbered)
8 * - Deterministic and idempotent
9 */
10
/**
 * Normalize a block of markdown-like text for semantic hashing.
 *
 * Steps, in order:
 *  1. Fenced code blocks are replaced by the literal placeholder `(code block)`.
 *  2. Heading markers, bold/italic markers, inline-code backticks and link
 *     syntax are stripped while the visible text is kept.
 *  3. The text is lowercased.
 *  4. Each line has whitespace runs collapsed to a single space and is
 *     trimmed; blank lines are dropped and terminate any pending list.
 *  5. Runs of consecutive list items are buffered and emitted through
 *     `flushList`: order is preserved when any item is numbered or its
 *     content looks like a sequence (per `isSequenceContent`); otherwise the
 *     run is treated as unordered.
 *
 * @param raw - Source text, typically markdown.
 * @returns The normalized lines joined with `\n`.
 */
export function normalizeText(raw: string): string {
  let text = raw;

  // Replace fenced code blocks: record that code existed, but not its content.
  text = text.replace(/```[\s\S]*?```/g, '(code block)');

  // Remove markdown heading markers.
  text = text.replace(/^#{1,6}\s+/gm, '');

  // Remove bold/italic asterisks. The emphasized text must start and end with
  // a non-whitespace character, so a `* ` bullet marker can never be taken as
  // an emphasis delimiter. Without that restriction two consecutive `* `
  // bullets are consumed as one "emphasis" pair and the list is lost.
  text = text.replace(/\*{1,3}([^\s*](?:[^*]*[^\s*])?)\*{1,3}/g, '$1');
  // Remove bold/italic underscores.
  // NOTE: this also strips intraword underscores (e.g. in snake_case names).
  text = text.replace(/_{1,3}([^_]+)_{1,3}/g, '$1');

  // Remove inline code backticks (but keep content).
  text = text.replace(/`([^`]+)`/g, '$1');

  // Remove link syntax, keep text: [text](url) → text
  text = text.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');

  // Lowercase.
  text = text.toLowerCase();

  // A list item is a marker followed by whitespace (or by nothing at all, for
  // an empty item). Requiring the separator keeps lines such as "3.14 is pi",
  // "-5 degrees", "-> next" or "---" from being mistaken for list items.
  // The "•" glyph is unambiguous, so it is accepted without a separator.
  const listItemPattern = /^(?:•\s*|(?:[-*]|\d+[.)])(?:\s+|$))(.*)/;
  const numberedMarkerPattern = /^\d+[.)]/;

  const processed: string[] = [];
  let listBuffer: string[] = [];
  let listIsOrdered = false;

  // Emit the buffered list (if any) and reset the list state.
  const flushPending = (): void => {
    if (listBuffer.length > 0) {
      flushList(listBuffer, listIsOrdered, processed);
      listBuffer = [];
      listIsOrdered = false;
    }
  };

  for (const line of text.split('\n')) {
    const trimmed = line.replace(/\s+/g, ' ').trim();

    // A blank line ends the current list and is otherwise dropped.
    if (trimmed === '') {
      flushPending();
      continue;
    }

    const listMatch = listItemPattern.exec(trimmed);
    if (listMatch === null) {
      // Ordinary text line: end any pending list, then keep the line.
      flushPending();
      processed.push(trimmed);
      continue;
    }

    const content = (listMatch[1] ?? '').trim();
    // The run is ordered as soon as ANY item is numbered or carries a
    // sequence indicator; checking only the first item would let a numbered
    // run that directly follows bullets be sorted.
    if (numberedMarkerPattern.test(trimmed) || isSequenceContent(content)) {
      listIsOrdered = true;
    }
    listBuffer.push(content);
  }

  // Flush a list that runs to the end of the input.
  flushPending();

  return processed.join('\n');
}
85
/**
 * Decide whether a list item's text carries an ordering cue, in which case
 * the list it belongs to must not be sorted.
 *
 * @param text - Content of a single list item, without its marker.
 * @returns `true` if any of the sequence patterns matches, else `false`.
 */
function isSequenceContent(text: string): boolean {
  const sequencePatterns: readonly RegExp[] = [
    // Arrow glyphs and ASCII arrows: →, ←, ⇒, ⇐, ->, <-, =>
    /[→←⇒⇐]|->|<-|=>/,
    // Ordinals and ordering words: 1st, 2nd, 3rd, Nth, first, then, finally, ...
    /\b(?:1st|2nd|3rd|\d+th|first|second|third|then|finally|next|after)\b/i,
    // Three or more comma-separated words, which look like a run of states/steps
    /\w+\s*,\s*\w+\s*,\s*\w+/,
  ];
  return sequencePatterns.some((pattern) => pattern.test(text));
}
99
/**
 * Append a buffered list to the processed output lines.
 *
 * Ordered/sequence lists are appended in their given order. Unordered lists
 * are appended in sorted order, using the default `Array.prototype.sort`
 * comparison (UTF-16 code unit order), which does not depend on locale.
 *
 * The input array is never modified: sorting is done on a copy.
 *
 * @param items - List item contents, in source order.
 * @param isOrdered - When `true`, keep `items` in source order.
 * @param out - Output line buffer; the items are appended to it in place.
 */
function flushList(items: readonly string[], isOrdered: boolean, out: string[]): void {
  const emitted = isOrdered ? items : [...items].sort();
  // Append one by one: `out.push(...emitted)` passes every item as a call
  // argument and can exceed the engine's argument limit on very large lists.
  for (const item of emitted) {
    out.push(item);
  }
}