// Forked from jollywhoppers.com/witchsky.app
// Bluesky app fork with some witchin' additions 💫
1import {type AppBskyRichtextFacet} from '@atproto/api'
2import {parse, postprocess, preprocess} from 'micromark'
3
/** Inline formatting kinds that can be attached to a span of parsed text. */
type FormatKind =
  | 'bold'
  | 'italic'
  | 'underline'
  | 'link'
5
/**
 * A half-open `[start, end)` span of text carrying one kind of formatting.
 * Offsets are string indices (UTF-16 code units), not bytes.
 */
interface FormatRange {
  /** Index of the first formatted character. */
  start: number
  /** Index just past the last formatted character. */
  end: number
  /** Which formatting applies to the span. */
  kind: FormatKind
  /** Link target; only set on ranges whose kind is `'link'`. */
  uri?: string
}
12
/**
 * Collapses all ranges of a single formatting kind into an ordered list of
 * disjoint ranges.
 *
 * Ranges of other kinds are ignored. The selected ranges are ordered by start
 * offset, and a range that begins at or before the end of the previously kept
 * range is folded into it, so touching ranges are joined as well as
 * overlapping ones.
 *
 * @param ranges - Format ranges of any kind. Neither the array nor its
 *   elements are modified.
 * @param kind - The formatting kind to select and merge.
 * @returns Newly created range objects sorted by `start`; a merged range keeps
 *   the remaining properties of the earliest range in its group.
 */
function mergeOverlapping(
  ranges: FormatRange[],
  kind: FormatKind,
): FormatRange[] {
  const sameKind: FormatRange[] = []
  for (const candidate of ranges) {
    if (candidate.kind === kind) sameKind.push(candidate)
  }
  sameKind.sort((left, right) => left.start - right.start)

  return sameKind.reduce<FormatRange[]>((merged, current) => {
    const last = merged.length > 0 ? merged[merged.length - 1] : undefined
    if (last && current.start <= last.end) {
      // Extend the kept range; it is already a private copy, so the caller's
      // objects stay untouched.
      last.end = Math.max(last.end, current.end)
    } else {
      merged.push({...current})
    }
    return merged
  }, [])
}
31
/**
 * Tokenizes `text` with micromark and returns the post-processed event list.
 *
 * The input is run through micromark's three stages in order: `preprocess`
 * turns the string into chunks, the document tokenizer from `parse()` turns
 * the chunks into raw events, and `postprocess` finalizes those events.
 *
 * @param text - Markdown source to tokenize.
 * @returns The event list, or `null` if any stage throws. The error itself is
 *   discarded so callers can fall back to treating the input as plain text.
 */
export function parseMarkdownEvents(text: string) {
  try {
    const toChunks = preprocess()
    // `true` as the last argument tells the preprocessor this is the whole
    // input, so it flushes instead of waiting for more data.
    const chunks = toChunks(text, undefined, true)
    const tokenizer = parse().document()
    const rawEvents = tokenizer.write(chunks)
    return postprocess(rawEvents)
  } catch {
    return null
  }
}
40
/**
 * Converts inline Markdown into plain text plus rich-text facets.
 *
 * Recognized constructs are `*emphasis*` (italic), `**strong**` (bold),
 * `__strong__` (underline), `[label](url)` links and backslash escapes. Their
 * delimiters are removed from the text and the formatting is reported as
 * non-overlapping facets whose indices are UTF-8 byte offsets into the
 * returned text. Where several kinds of formatting cover the same span they
 * are combined into one facet with multiple features.
 *
 * Link targets that do not start with `http://`, `https://` or `mailto:` are
 * prefixed with `https://`.
 *
 * @param text - Markdown source.
 * @returns
 *   - `text`: the input with Markdown delimiters removed (the input itself if
 *     tokenizing failed or nothing needed stripping);
 *   - `facets`: formatting facets ordered by position;
 *   - `escapedByteStarts`: UTF-8 byte offsets, in the returned text, of the
 *     characters that were backslash-escaped in the source. Escapes located
 *     inside a removed region (such as a link destination or title) are not
 *     reported because their character is not part of the output.
 */
export function parseMarkdownRichText(text: string): {
  text: string
  facets: AppBskyRichtextFacet.Main[]
  escapedByteStarts: Set<number>
} {
  const events = parseMarkdownEvents(text)
  if (!events) return {text, facets: [], escapedByteStarts: new Set()}

  // Spans of the source to delete. They may nest: a character escape inside a
  // link's `(…)` resource sits within the resource's own strip range.
  const stripRanges: {start: number; end: number}[] = []
  const formatRanges: FormatRange[] = []
  const escapedCharOrigPositions: number[] = []

  // Links: labelText comes before resource in the event stream, so we
  // buffer link state per nesting level until we can emit the full range.
  const linkStack: {
    labelTextStart: number
    labelTextEnd: number
    uri: string
  }[] = []
  // Track whether each strong span uses _ (underline) or * (bold).
  const strongKindStack: ('bold' | 'underline')[] = []
  let awaitingStrongKind = false

  for (const [eventType, token] of events) {
    const {type} = token
    const start = token.start.offset
    const end = token.end.offset

    if (eventType === 'enter') {
      switch (type) {
        case 'emphasisSequence':
        case 'labelMarker': // [ and ]
        case 'resource': // (url)
          stripRanges.push({start, end})
          break
        case 'strongSequence':
          // Only the opening sequence decides the kind; the closing sequence
          // arrives after `awaitingStrongKind` has been cleared.
          if (awaitingStrongKind) {
            strongKindStack.push(text[start] === '_' ? 'underline' : 'bold')
            awaitingStrongKind = false
          }
          stripRanges.push({start, end})
          break
        case 'strong':
          awaitingStrongKind = true
          break
        case 'characterEscape':
          stripRanges.push({start, end: start + 1}) // strip only the backslash
          escapedCharOrigPositions.push(start + 1) // position of the kept char
          break
        case 'link':
          linkStack.push({labelTextStart: -1, labelTextEnd: -1, uri: ''})
          break
        case 'labelText':
          if (linkStack.length > 0) {
            const top = linkStack[linkStack.length - 1]
            top.labelTextStart = start
            top.labelTextEnd = end
          }
          break
        case 'resourceDestinationString':
          if (linkStack.length > 0) {
            let uri = text.slice(start, end)
            if (
              !uri.startsWith('http://') &&
              !uri.startsWith('https://') &&
              !uri.startsWith('mailto:')
            ) {
              uri = `https://${uri}`
            }
            linkStack[linkStack.length - 1].uri = uri
          }
          break
        case 'emphasisText':
          formatRanges.push({kind: 'italic', start, end})
          break
        case 'strongText':
          formatRanges.push({
            kind: strongKindStack[strongKindStack.length - 1] ?? 'bold',
            start,
            end,
          })
          break
      }
    } else {
      if (type === 'strong') {
        strongKindStack.pop()
      } else if (type === 'link') {
        const info = linkStack.pop()
        // A link without label text or without a destination yields no facet;
        // its delimiters are still stripped.
        if (info && info.labelTextStart >= 0 && info.uri) {
          formatRanges.push({
            kind: 'link',
            start: info.labelTextStart,
            end: info.labelTextEnd,
            uri: info.uri,
          })
        }
      }
    }
  }

  if (stripRanges.length === 0) {
    return {text, facets: [], escapedByteStarts: new Set()}
  }

  // Mark every source character that must be removed. Using a per-character
  // mask (rather than walking the range list) keeps nested or overlapping
  // strip ranges from being counted twice or from hiding later ranges.
  const isStripped = new Uint8Array(text.length)
  for (const {start, end} of stripRanges) {
    isStripped.fill(1, Math.max(0, start), Math.min(end, text.length))
  }

  // removedBefore[i] = number of stripped chars at source positions [0, i).
  // The stripped text is assembled in the same pass from the kept runs.
  const removedBefore = new Int32Array(text.length + 1)
  const keptPieces: string[] = []
  let keepFrom = 0
  for (let i = 0; i < text.length; i++) {
    removedBefore[i + 1] = removedBefore[i] + isStripped[i]
    if (isStripped[i]) {
      if (i > keepFrom) keptPieces.push(text.slice(keepFrom, i))
      keepFrom = i + 1
    }
  }
  if (keepFrom < text.length) keptPieces.push(text.slice(keepFrom))
  const stripped = keptPieces.join('')

  // Maps a source index to the matching index in `stripped`.
  const toStripped = (orig: number): number =>
    orig - removedBefore[Math.min(orig, text.length)]

  // Build char-index → UTF-8-byte-offset map once so facet byte positions
  // don't require O(n) UnicodeString construction per boundary.
  const charToUtf8Byte = new Int32Array(stripped.length + 1)
  {
    let b = 0
    for (let c = 0; c < stripped.length; ) {
      charToUtf8Byte[c] = b
      const cp = stripped.codePointAt(c)!
      b += cp <= 0x7f ? 1 : cp <= 0x7ff ? 2 : cp <= 0xffff ? 3 : 4
      // Astral code points occupy two UTF-16 units.
      c += cp > 0xffff ? 2 : 1
    }
    charToUtf8Byte[stripped.length] = b
  }

  const escapedByteStarts = new Set<number>()
  for (const orig of escapedCharOrigPositions) {
    // Skip escapes whose character was itself removed (e.g. inside a link's
    // destination or title); they have no position in the output.
    if (orig < text.length && !isStripped[orig]) {
      escapedByteStarts.add(charToUtf8Byte[toStripped(orig)])
    }
  }

  if (formatRanges.length === 0) {
    return {text: stripped, facets: [], escapedByteStarts}
  }

  // Map format ranges to stripped char positions, discard empty ranges
  const mapped = formatRanges
    .map(f => ({
      kind: f.kind,
      start: toStripped(f.start),
      end: toStripped(f.end),
      uri: f.uri,
    }))
    .filter(f => f.start < f.end)

  if (mapped.length === 0) {
    return {text: stripped, facets: [], escapedByteStarts}
  }

  // Merge overlapping ranges of the same kind (bold or italic) so that nested
  // same-kind emphasis (e.g. *emph *with emph* in it*) doesn't lose the outer
  // range when the inner one closes first in the sweep below.
  const mergedMapped = [
    ...(['bold', 'italic', 'underline'] as const).flatMap(k =>
      mergeOverlapping(mapped, k),
    ),
    ...mapped.filter(f => f.kind === 'link'),
  ]

  // Interval sweep to produce non-overlapping facets with merged features.
  // This correctly handles bold+italic on the same range (e.g. ***text***).
  type SweepEvent = {
    pos: number
    isOpen: boolean
    kind: FormatKind
    uri?: string
  }
  const sweep: SweepEvent[] = []
  for (const f of mergedMapped) {
    sweep.push({pos: f.start, isOpen: true, kind: f.kind, uri: f.uri})
    sweep.push({pos: f.end, isOpen: false, kind: f.kind, uri: f.uri})
  }
  // At the same position, process closes before opens. Ranges are never empty
  // here, so a close at `pos` always belongs to an earlier range; handling it
  // first keeps it from deleting a same-kind range that opens at `pos`
  // (e.g. two adjacent links `[a](x)[b](y)`).
  sweep.sort(
    (a, b) => a.pos - b.pos || Number(a.isOpen) - Number(b.isOpen),
  )

  const active = new Map<FormatKind, string | undefined>()
  let curPos = 0
  const facets: AppBskyRichtextFacet.Main[] = []

  for (const ev of sweep) {
    // Emit the segment that ends at this event, if any formatting covers it.
    if (ev.pos > curPos && active.size > 0) {
      const features: AppBskyRichtextFacet.Main['features'] = []
      for (const kind of ['bold', 'italic', 'underline'] as const) {
        // NOTE(review): these feature types appear to be fork-specific and
        // absent from the imported facet typings, hence the cast — confirm.
        if (active.has(kind))
          features.push({$type: `app.bsky.richtext.facet#${kind}`} as any)
      }
      if (active.has('link'))
        features.push({
          $type: 'app.bsky.richtext.facet#link',
          uri: active.get('link')!,
        })
      facets.push({
        index: {
          byteStart: charToUtf8Byte[curPos],
          byteEnd: charToUtf8Byte[ev.pos],
        },
        features,
      })
    }
    curPos = ev.pos
    if (ev.isOpen) {
      active.set(ev.kind, ev.uri)
    } else {
      active.delete(ev.kind)
    }
  }

  return {text: stripped, facets, escapedByteStarts}
}