src/lib/strings/richtext-markdown.ts at feat/markdown-basic

madoka.systems / witchsky.app
forked from jollywhoppers.com/witchsky.app
fork atom
Bluesky app fork with some witchin' additions 💫
fork atom
witchsky.app / src / lib / strings / richtext-markdown.ts
at feat/markdown-basic 270 lines 7.9 kB view raw
wrap content
madoka.systems feat: compose bold/italic/underline markdown in post editor 3w ago
229ed977
  1import {type AppBskyRichtextFacet} from '@atproto/api'
  2import {parse, postprocess, preprocess} from 'micromark'
  3
  4type FormatKind = 'bold' | 'italic' | 'underline' | 'link'
  5
  6interface FormatRange {
  7  kind: FormatKind
  8  start: number
  9  end: number
 10  uri?: string
 11}
 12
 13function mergeOverlapping(
 14  ranges: FormatRange[],
 15  kind: FormatKind,
 16): FormatRange[] {
 17  const filtered = ranges
 18    .filter(f => f.kind === kind)
 19    .sort((a, b) => a.start - b.start)
 20  const out: FormatRange[] = []
 21  for (const r of filtered) {
 22    const prev = out[out.length - 1]
 23    if (prev && r.start <= prev.end) {
 24      prev.end = Math.max(prev.end, r.end)
 25    } else {
 26      out.push({...r})
 27    }
 28  }
 29  return out
 30}
 31
 32export function parseMarkdownEvents(text: string) {
 33  try {
 34    const chunks = preprocess()(text, undefined, true)
 35    return postprocess(parse().document().write(chunks))
 36  } catch {
 37    return null
 38  }
 39}
 40
 41export function parseMarkdownRichText(text: string): {
 42  text: string
 43  facets: AppBskyRichtextFacet.Main[]
 44  escapedByteStarts: Set<number>
 45} {
 46  const events = parseMarkdownEvents(text)
 47  if (!events) return {text, facets: [], escapedByteStarts: new Set()}
 48
 49  const stripRanges: {start: number; end: number}[] = []
 50  const formatRanges: FormatRange[] = []
 51  const escapedCharOrigPositions: number[] = []
 52
 53  // Links: labelText comes before resource in the event stream, so we
 54  // buffer link state per nesting level until we can emit the full range.
 55  const linkStack: {
 56    labelTextStart: number
 57    labelTextEnd: number
 58    uri: string
 59  }[] = []
 60  // Track whether each strong span uses _ (underline) or * (bold).
 61  const strongKindStack: ('bold' | 'underline')[] = []
 62  let awaitingStrongKind = false
 63
 64  for (const [eventType, token] of events) {
 65    const {type} = token
 66    const start = token.start.offset
 67    const end = token.end.offset
 68
 69    if (eventType === 'enter') {
 70      switch (type) {
 71        case 'emphasisSequence':
 72        case 'labelMarker': // [ and ]
 73        case 'resource': // (url)
 74          stripRanges.push({start, end})
 75          break
 76        case 'strongSequence':
 77          if (awaitingStrongKind) {
 78            strongKindStack.push(text[start] === '_' ? 'underline' : 'bold')
 79            awaitingStrongKind = false
 80          }
 81          stripRanges.push({start, end})
 82          break
 83        case 'strong':
 84          awaitingStrongKind = true
 85          break
 86        case 'characterEscape':
 87          stripRanges.push({start, end: start + 1}) // strip only the backslash
 88          escapedCharOrigPositions.push(start + 1) // position of the kept char
 89          break
 90        case 'link':
 91          linkStack.push({labelTextStart: -1, labelTextEnd: -1, uri: ''})
 92          break
 93        case 'labelText':
 94          if (linkStack.length > 0) {
 95            const top = linkStack[linkStack.length - 1]
 96            top.labelTextStart = start
 97            top.labelTextEnd = end
 98          }
 99          break
100        case 'resourceDestinationString':
101          if (linkStack.length > 0) {
102            let uri = text.slice(start, end)
103            if (
104              !uri.startsWith('http://') &&
105              !uri.startsWith('https://') &&
106              !uri.startsWith('mailto:')
107            ) {
108              uri = `https://${uri}`
109            }
110            linkStack[linkStack.length - 1].uri = uri
111          }
112          break
113        case 'emphasisText':
114          formatRanges.push({kind: 'italic', start, end})
115          break
116        case 'strongText':
117          formatRanges.push({
118            kind: strongKindStack[strongKindStack.length - 1] ?? 'bold',
119            start,
120            end,
121          })
122          break
123      }
124    } else {
125      if (type === 'strong') {
126        strongKindStack.pop()
127      } else if (type === 'link') {
128        const info = linkStack.pop()
129        if (info && info.labelTextStart >= 0 && info.uri) {
130          formatRanges.push({
131            kind: 'link',
132            start: info.labelTextStart,
133            end: info.labelTextEnd,
134            uri: info.uri,
135          })
136        }
137      }
138    }
139  }
140
141  if (stripRanges.length === 0) {
142    return {text, facets: [], escapedByteStarts: new Set()}
143  }
144
145  stripRanges.sort((a, b) => a.start - b.start)
146
147  // removedBefore[i] = total chars stripped from positions [0, i)
148  // Computed by marking each delimiter's removal at its end position,
149  // then prefix-summing.
150  const removedBefore = new Int32Array(text.length + 2)
151  for (const {start, end} of stripRanges) {
152    if (end <= text.length) {
153      removedBefore[end] += end - start
154    }
155  }
156  for (let i = 1; i <= text.length; i++) {
157    removedBefore[i] += removedBefore[i - 1]
158  }
159
160  const toStripped = (orig: number): number => orig - removedBefore[orig]
161
162  // Build stripped text
163  let stripped = ''
164  let si = 0
165  let i = 0
166  while (i < text.length) {
167    if (si < stripRanges.length && i === stripRanges[si].start) {
168      i = stripRanges[si].end
169      si++
170    } else {
171      stripped += text[i]
172      i++
173    }
174  }
175
176  // Build char-index → UTF-8-byte-offset map once so facet byte positions
177  // don't require O(n) UnicodeString construction per boundary.
178  const charToUtf8Byte = new Int32Array(stripped.length + 1)
179  {
180    let b = 0
181    for (let c = 0; c < stripped.length; ) {
182      charToUtf8Byte[c] = b
183      const cp = stripped.codePointAt(c)!
184      b += cp <= 0x7f ? 1 : cp <= 0x7ff ? 2 : cp <= 0xffff ? 3 : 4
185      c += cp > 0xffff ? 2 : 1
186    }
187    charToUtf8Byte[stripped.length] = b
188  }
189
190  const escapedByteStarts = new Set(
191    escapedCharOrigPositions.map(orig => charToUtf8Byte[toStripped(orig)]),
192  )
193
194  if (formatRanges.length === 0) {
195    return {text: stripped, facets: [], escapedByteStarts}
196  }
197
198  // Map format ranges to stripped char positions, discard empty ranges
199  const mapped = formatRanges
200    .map(f => ({
201      kind: f.kind,
202      start: toStripped(f.start),
203      end: toStripped(f.end),
204      uri: f.uri,
205    }))
206    .filter(f => f.start < f.end)
207
208  if (mapped.length === 0) {
209    return {text: stripped, facets: [], escapedByteStarts}
210  }
211
212  // Merge overlapping ranges of the same kind (bold or italic) so that nested
213  // same-kind emphasis (e.g. *emph *with emph* in it*) doesn't lose the outer
214  // range when the inner one closes first in the sweep below.
215  const mergedMapped = [
216    ...(['bold', 'italic', 'underline'] as const).flatMap(k =>
217      mergeOverlapping(mapped, k),
218    ),
219    ...mapped.filter(f => f.kind === 'link'),
220  ]
221
222  // Interval sweep to produce non-overlapping facets with merged features.
223  // This correctly handles bold+italic on the same range (e.g. ***text***).
224  type SweepEvent = {
225    pos: number
226    isOpen: boolean
227    kind: FormatKind
228    uri?: string
229  }
230  const sweep: SweepEvent[] = []
231  for (const f of mergedMapped) {
232    sweep.push({pos: f.start, isOpen: true, kind: f.kind, uri: f.uri})
233    sweep.push({pos: f.end, isOpen: false, kind: f.kind, uri: f.uri})
234  }
235  sweep.sort((a, b) => a.pos - b.pos || (a.isOpen ? -1 : 1))
236
237  const active = new Map<FormatKind, string | undefined>()
238  let curPos = 0
239  const facets: AppBskyRichtextFacet.Main[] = []
240
241  for (const ev of sweep) {
242    if (ev.pos > curPos && active.size > 0) {
243      const features: AppBskyRichtextFacet.Main['features'] = []
244      for (const kind of ['bold', 'italic', 'underline'] as const) {
245        if (active.has(kind))
246          features.push({$type: `app.bsky.richtext.facet#${kind}`} as any)
247      }
248      if (active.has('link'))
249        features.push({
250          $type: 'app.bsky.richtext.facet#link',
251          uri: active.get('link')!,
252        })
253      facets.push({
254        index: {
255          byteStart: charToUtf8Byte[curPos],
256          byteEnd: charToUtf8Byte[ev.pos],
257        },
258        features,
259      })
260    }
261    curPos = ev.pos
262    if (ev.isOpen) {
263      active.set(ev.kind, ev.uri)
264    } else {
265      active.delete(ev.kind)
266    }
267  }
268
269  return {text: stripped, facets, escapedByteStarts}
270}