import {type AppBskyRichtextFacet} from '@atproto/api'
import {parse, postprocess, preprocess} from 'micromark'

/** Inline formatting kinds that are turned into facet features. */
type FormatKind = 'bold' | 'italic' | 'underline' | 'link'

/**
 * A half-open `[start, end)` range of string (UTF-16 code unit) offsets that
 * carries one kind of formatting.
 */
interface FormatRange {
  kind: FormatKind
  start: number
  end: number
  /** Link target; only populated for `kind === 'link'`. */
  uri?: string
}

/**
 * Returns the ranges of the given `kind`, sorted by start, with overlapping
 * or touching ranges coalesced into one. The input array and its elements are
 * not mutated (merged ranges are copies).
 */
function mergeOverlapping(
  ranges: FormatRange[],
  kind: FormatKind,
): FormatRange[] {
  const filtered = ranges
    .filter(f => f.kind === kind)
    .sort((a, b) => a.start - b.start)
  const out: FormatRange[] = []
  for (const r of filtered) {
    const prev = out[out.length - 1]
    if (prev && r.start <= prev.end) {
      prev.end = Math.max(prev.end, r.end)
    } else {
      out.push({...r})
    }
  }
  return out
}

/**
 * Runs `text` through micromark's preprocess → parse → postprocess pipeline
 * and returns the resulting event list, or `null` if any step throws.
 */
export function parseMarkdownEvents(text: string) {
  try {
    const chunks = preprocess()(text, undefined, true)
    return postprocess(parse().document().write(chunks))
  } catch {
    // Best effort: callers treat `null` as "leave the text untouched".
    return null
  }
}

/**
 * Strips inline Markdown syntax (emphasis/strong delimiters, link brackets and
 * `(url)` resources, escape backslashes) from `text` and describes the
 * formatting that was removed as rich-text facets over the stripped text.
 *
 * @param text Raw text that may contain inline Markdown.
 * @returns
 *  - `text`: the input with the Markdown delimiters removed (the input itself
 *    when parsing fails or nothing needs stripping),
 *  - `facets`: non-overlapping facets whose `byteStart`/`byteEnd` are UTF-8
 *    byte offsets into the returned `text`,
 *  - `escapedByteStarts`: UTF-8 byte offsets (into the returned `text`) of
 *    characters that were backslash-escaped in the input.
 */
export function parseMarkdownRichText(text: string): {
  text: string
  facets: AppBskyRichtextFacet.Main[]
  escapedByteStarts: Set<number>
} {
  const events = parseMarkdownEvents(text)
  if (!events) return {text, facets: [], escapedByteStarts: new Set<number>()}

  const stripRanges: {start: number; end: number}[] = []
  const formatRanges: FormatRange[] = []
  const escapedCharOrigPositions: number[] = []

  // Links: labelText comes before resource in the event stream, so we
  // buffer link state per nesting level until we can emit the full range.
  const linkStack: {
    labelTextStart: number
    labelTextEnd: number
    uri: string
  }[] = []

  // Track whether each strong span uses _ (underline) or * (bold).
  const strongKindStack: ('bold' | 'underline')[] = []
  let awaitingStrongKind = false

  for (const [eventType, token] of events) {
    const {type} = token
    const start = token.start.offset
    const end = token.end.offset

    if (eventType === 'enter') {
      switch (type) {
        case 'emphasisSequence':
        case 'labelMarker': // [ and ]
        case 'resource': // (url)
          stripRanges.push({start, end})
          break
        case 'strongSequence':
          // The first sequence after entering `strong` decides the kind.
          if (awaitingStrongKind) {
            strongKindStack.push(text[start] === '_' ? 'underline' : 'bold')
            awaitingStrongKind = false
          }
          stripRanges.push({start, end})
          break
        case 'strong':
          awaitingStrongKind = true
          break
        case 'characterEscape':
          stripRanges.push({start, end: start + 1}) // strip only the backslash
          escapedCharOrigPositions.push(start + 1) // position of the kept char
          break
        case 'link':
          linkStack.push({labelTextStart: -1, labelTextEnd: -1, uri: ''})
          break
        case 'labelText':
          if (linkStack.length > 0) {
            const top = linkStack[linkStack.length - 1]
            // Only the link's own (first) label counts; a label nested inside
            // it (e.g. an image in the link text) must not overwrite it.
            if (top.labelTextStart < 0) {
              top.labelTextStart = start
              top.labelTextEnd = end
            }
          }
          break
        case 'resourceDestinationString':
          if (linkStack.length > 0) {
            let uri = text.slice(start, end)
            // Scheme-less destinations are treated as https links.
            if (
              !uri.startsWith('http://') &&
              !uri.startsWith('https://') &&
              !uri.startsWith('mailto:')
            ) {
              uri = `https://${uri}`
            }
            linkStack[linkStack.length - 1].uri = uri
          }
          break
        case 'emphasisText':
          formatRanges.push({kind: 'italic', start, end})
          break
        case 'strongText':
          formatRanges.push({
            kind: strongKindStack[strongKindStack.length - 1] ?? 'bold',
            start,
            end,
          })
          break
      }
    } else {
      if (type === 'strong') {
        strongKindStack.pop()
      } else if (type === 'link') {
        const info = linkStack.pop()
        // Links without a label or without a destination produce no facet.
        if (info && info.labelTextStart >= 0 && info.uri) {
          formatRanges.push({
            kind: 'link',
            start: info.labelTextStart,
            end: info.labelTextEnd,
            uri: info.uri,
          })
        }
      }
    }
  }

  if (stripRanges.length === 0) {
    return {text, facets: [], escapedByteStarts: new Set<number>()}
  }

  // Mark every stripped character. Using a mask (instead of walking the
  // sorted ranges with a cursor) keeps stripping correct when ranges nest or
  // overlap, e.g. an escape backslash inside an already-stripped `(url)`.
  const isStripped = new Uint8Array(text.length)
  for (const {start, end} of stripRanges) {
    isStripped.fill(1, Math.max(0, start), Math.min(end, text.length))
  }

  // removedBefore[i] = total chars stripped from positions [0, i).
  // The stripped text is built in the same pass.
  const removedBefore = new Int32Array(text.length + 1)
  let stripped = ''
  let removedSoFar = 0
  for (let i = 0; i < text.length; i++) {
    if (isStripped[i] === 1) {
      removedSoFar++
    } else {
      stripped += text.charAt(i)
    }
    removedBefore[i + 1] = removedSoFar
  }

  /** Maps an offset in `text` to the corresponding offset in `stripped`. */
  const toStripped = (orig: number): number => {
    const clamped = Math.min(Math.max(orig, 0), text.length)
    return clamped - removedBefore[clamped]
  }

  // Build char-index → UTF-8-byte-offset map once so facet byte positions
  // don't require O(n) UnicodeString construction per boundary.
  const charToUtf8Byte = new Int32Array(stripped.length + 1)
  {
    let b = 0
    for (let c = 0; c < stripped.length; ) {
      charToUtf8Byte[c] = b
      const cp = stripped.codePointAt(c)!
      b += cp <= 0x7f ? 1 : cp <= 0x7ff ? 2 : cp <= 0xffff ? 3 : 4
      // Astral code points occupy two UTF-16 code units.
      c += cp > 0xffff ? 2 : 1
    }
    charToUtf8Byte[stripped.length] = b
  }

  const escapedByteStarts = new Set<number>()
  for (const orig of escapedCharOrigPositions) {
    // An escaped char that was itself stripped (e.g. inside a link
    // destination) does not exist in the output, so it has no byte offset.
    if (orig >= text.length || isStripped[orig] === 1) continue
    escapedByteStarts.add(charToUtf8Byte[toStripped(orig)])
  }

  if (formatRanges.length === 0) {
    return {text: stripped, facets: [], escapedByteStarts}
  }

  // Map format ranges to stripped char positions, discard empty ranges
  const mapped = formatRanges
    .map(f => ({
      kind: f.kind,
      start: toStripped(f.start),
      end: toStripped(f.end),
      uri: f.uri,
    }))
    .filter(f => f.start < f.end)

  if (mapped.length === 0) {
    return {text: stripped, facets: [], escapedByteStarts}
  }

  // Merge overlapping ranges of the same kind (bold, italic or underline) so
  // that nested same-kind emphasis (e.g. *emph *with emph* in it*) doesn't
  // lose the outer range when the inner one closes first in the sweep below.
  // Links are left as-is: each link keeps its own uri.
  const mergedMapped = [
    ...(['bold', 'italic', 'underline'] as const).flatMap(k =>
      mergeOverlapping(mapped, k),
    ),
    ...mapped.filter(f => f.kind === 'link'),
  ]

  // Interval sweep to produce non-overlapping facets with merged features.
  // This correctly handles bold+italic on the same range (e.g. ***text***).
  type SweepEvent = {
    pos: number
    isOpen: boolean
    kind: FormatKind
    uri?: string
  }
  const sweep: SweepEvent[] = []
  for (const f of mergedMapped) {
    sweep.push({pos: f.start, isOpen: true, kind: f.kind, uri: f.uri})
    sweep.push({pos: f.end, isOpen: false, kind: f.kind, uri: f.uri})
  }
  // At equal positions, closes are processed before opens. `active` is keyed
  // by kind, so for back-to-back links ([a](x)[b](y)) the first link's close
  // must run before the second link's open, otherwise the close would remove
  // the link that has just been opened.
  sweep.sort(
    (a, b) => a.pos - b.pos || Number(a.isOpen) - Number(b.isOpen),
  )

  // Currently open formatting kinds; the value is the uri for links.
  const active = new Map<FormatKind, string | undefined>()
  let curPos = 0
  const facets: AppBskyRichtextFacet.Main[] = []

  for (const ev of sweep) {
    // Emit one facet for the segment [curPos, ev.pos) if anything is active.
    if (ev.pos > curPos && active.size > 0) {
      const features: AppBskyRichtextFacet.Main['features'] = []
      for (const kind of ['bold', 'italic', 'underline'] as const) {
        if (active.has(kind))
          features.push({$type: `app.bsky.richtext.facet#${kind}`} as any)
      }
      const linkUri = active.get('link')
      if (linkUri !== undefined) {
        features.push({
          $type: 'app.bsky.richtext.facet#link',
          uri: linkUri,
        })
      }
      facets.push({
        index: {
          byteStart: charToUtf8Byte[curPos],
          byteEnd: charToUtf8Byte[ev.pos],
        },
        features,
      })
    }
    curPos = ev.pos
    if (ev.isOpen) {
      active.set(ev.kind, ev.uri)
    } else {
      active.delete(ev.kind)
    }
  }

  return {text: stripped, facets, escapedByteStarts}
}