// taken and modified from: https://github.com/mary-ext/atcute/blob/trunk/packages/bluesky/richtext-parser/lib/index.ts

// backslash escape of a single non-alphanumeric, non-whitespace character
const ESCAPE_RE = /^\\([^0-9A-Za-z\s])/;

// @handle mention (ASCII or fullwidth @); the trailing group captures the delimiter
const MENTION_RE = /^[@＠]([a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*(?:\.[a-zA-Z]{2,}))($|\s|\p{P})/u;

// did:method:identifier mention
const DID_RE = /^(did:([a-z0-9]+):([A-Za-z0-9.\-_%:]+))($|\s|\p{P})/u;

// #hashtag (ASCII or fullwidth #), skipping keycap emoji sequences such as #️⃣
const TOPIC_RE =
  /^(?:#(?!\ufe0f|\u20e3)|＃)([\p{N}]*[\p{L}\p{M}\p{Pc}][\p{L}\p{M}\p{Pc}\p{N}]*)($|\s|\p{P})/u;

// :emote_name:
const EMOTE_RE = /^:([\w-]+):/;

// bare http(s) URL; the backpedal trims trailing punctuation and an unbalanced closing paren
const AUTOLINK_RE = /^https?:\/\/[\S]+/;
const AUTOLINK_BACKPEDAL_RE = /(?:(?<!\(.*)\))?[.,;]*$/;

// [label](url "title") markdown-style link
const LINK_RE =
  /^\[((?:\[[^\]]*\]|[^[\]]|\](?=[^[]*\]))*)\]\(\s*<?((?:\([^)]*\)|[^\s\\]|\\.)*?)>?(?:\s+['"]([^]*?)['"])?\s*\)/;
const UNESCAPE_URL_RE = /\\([^0-9A-Za-z\s])/g;

// _emphasis_ and *emphasis*
const EMPHASIS_RE =
  /^\b_((?:__|\\[^]|[^\\_])+?)_\b|^\*(?=\S)((?:\*\*|\\[^]|\s+(?:\\[^]|[^\s*\\]|\*\*)|[^\s*\\])+?)\*(?!\*)/;

// **strong**
const STRONG_RE = /^\*\*((?:\\[^]|[^\\])+?)\*\*(?!\*)/;

// __underline__
const UNDERLINE_RE = /^__((?:\\[^]|~(?!~)|[^~\\]|\s(?!~~))+?)__(?!_)/;

// ~~strikethrough~~
const DELETE_RE = /^~~((?:\\[^]|~(?!~)|[^~\\]|\s(?!~~))+?)~~/;

// `inline code`; the escape regex trims a single framing space next to embedded backticks
const CODE_RE = /^(`+)([^]*?[^`])\1(?!`)/;
const CODE_ESCAPE_BACKTICKS_RE = /^ (?= *`)|(` *) $/g;

// plain text, consumed lazily up to the next character that could start another token
const TEXT_RE =
  /^[^]+?(?:(?=$|[~*_`:\\[]|https?:\/\/)|(?<=\s|[(){}/\\[\]\-|:;'".,=+])(?=[@＠#＃]|did:[a-z0-9]+:))/;

export interface EscapeToken {
  type: 'escape';
  raw: string;
  escaped: string;
}

export interface MentionToken {
  type: 'mention';
  raw: string;
  handle?: string;
  did?: string;
}

export interface TopicToken {
  type: 'topic';
  raw: string;
  name: string;
}

export interface EmoteToken {
  type: 'emote';
  raw: string;
  name: string;
}

export interface AutolinkToken {
  type: 'autolink';
  raw: string;
  url: string;
}

export interface LinkToken {
  type: 'link';
  raw: string;
  url: string;
  children: Token[];
}

export interface UnderlineToken {
  type: 'underline';
  raw: string;
  children: Token[];
}

export interface StrongToken {
  type: 'strong';
  raw: string;
  children: Token[];
}

export interface EmphasisToken {
  type: 'emphasis';
  raw: string;
  children: Token[];
}

export interface DeleteToken {
  type: 'delete';
  raw: string;
  children: Token[];
}

export interface CodeToken {
  type: 'code';
  raw: string;
  content: string;
}

export interface TextToken {
  type: 'text';
  raw: string;
  content: string;
}

export type Token =
  | EscapeToken
  | MentionToken
  | TopicToken
  | EmoteToken
  | AutolinkToken
  | LinkToken
  | StrongToken
  | EmphasisToken
  | UnderlineToken
  | DeleteToken
  | CodeToken
  | TextToken;

const tokenizeEscape = (src: string): EscapeToken | undefined => {
  const match = ESCAPE_RE.exec(src);
  if (match) {
    return {
      type: 'escape',
      raw: match[0],
      escaped: match[1]
    };
  }
};

const tokenizeMention = (src: string): MentionToken | undefined => {
  const match = MENTION_RE.exec(src);
  if (match && match[2] !== '@') {
    // the delimiter is only captured as a boundary check; trim it off the raw match
    const suffix = match[2].length;

    return {
      type: 'mention',
      raw: suffix > 0 ? match[0].slice(0, -suffix) : match[0],
      handle: match[1]
    };
  }

  const didMatch = DID_RE.exec(src);
  if (didMatch) {
    const suffix = didMatch[4].length;

    return {
      type: 'mention',
      raw: suffix > 0 ? didMatch[0].slice(0, -suffix) : didMatch[0],
      did: didMatch[1]
    };
  }
};

const tokenizeTopic = (src: string): TopicToken | undefined => {
  const match = TOPIC_RE.exec(src);
  if (match && match[2] !== '#') {
    const suffix = match[2].length;

    return {
      type: 'topic',
      raw: suffix > 0 ? match[0].slice(0, -suffix) : match[0],
      name: match[1]
    };
  }
};

const tokenizeEmote = (src: string): EmoteToken | undefined => {
  const match = EMOTE_RE.exec(src);
  if (match) {
    return {
      type: 'emote',
      raw: match[0],
      name: match[1]
    };
  }
};

const tokenizeAutolink = (src: string): AutolinkToken | undefined => {
  const match = AUTOLINK_RE.exec(src);
  if (match) {
    const url = match[0].replace(AUTOLINK_BACKPEDAL_RE, '');

    return {
      type: 'autolink',
      raw: url,
      url: url
    };
  }
};

const tokenizeLink = (src: string): LinkToken | undefined => {
  const match = LINK_RE.exec(src);
  if (match) {
    return {
      type: 'link',
      raw: match[0],
      url: match[2].replace(UNESCAPE_URL_RE, '$1'),
      children: tokenize(match[1])
    };
  }
};

const _tokenizeEmphasis = (src: string): EmphasisToken | undefined => {
  const match = EMPHASIS_RE.exec(src);
  if (match) {
    return {
      type: 'emphasis',
      raw: match[0],
      children: tokenize(match[2] || match[1])
    };
  }
};

const _tokenizeStrong = (src: string): StrongToken | undefined => {
  const match = STRONG_RE.exec(src);
  if (match) {
    return {
      type: 'strong',
      raw: match[0],
      children: tokenize(match[1])
    };
  }
};

const _tokenizeUnderline = (src: string): UnderlineToken | undefined => {
  const match = UNDERLINE_RE.exec(src);
  if (match) {
    return {
      type: 'underline',
      raw: match[0],
      children: tokenize(match[1])
    };
  }
};

const tokenizeEmStrongU = (
  src: string
): EmphasisToken | StrongToken | UnderlineToken | undefined => {
  // try all three and keep whichever match consumes the most input
  let token: EmphasisToken | StrongToken | UnderlineToken | undefined;

  {
    const match = _tokenizeEmphasis(src);
    if (match && (!token || match.raw.length > token.raw.length)) {
      token = match;
    }
  }

  {
    const match = _tokenizeStrong(src);
    if (match && (!token || match.raw.length > token.raw.length)) {
      token = match;
    }
  }

  {
    const match = _tokenizeUnderline(src);
    if (match && (!token || match.raw.length > token.raw.length)) {
      token = match;
    }
  }

  return token;
};

const tokenizeDelete = (src: string): DeleteToken | undefined => {
  const match = DELETE_RE.exec(src);
  if (match) {
    return {
      type: 'delete',
      raw: match[0],
      children: tokenize(match[1])
    };
  }
};

const tokenizeCode = (src: string): CodeToken | undefined => {
  const match = CODE_RE.exec(src);
  if (match) {
    return {
      type: 'code',
      raw: match[0],
      content: match[2].replace(CODE_ESCAPE_BACKTICKS_RE, '$1')
    };
  }
};

const tokenizeText = (src: string): TextToken | undefined => {
  const match = TEXT_RE.exec(src);
  if (match) {
    return {
      type: 'text',
      raw: match[0],
      content: match[0]
    };
  }
};

export const tokenize = (src: string): Token[] => {
  const tokens: Token[] = [];

  let last: Token | undefined;
  let token: Token | undefined;

  while (src) {
    last = token;

    if (
      (token =
        tokenizeEscape(src) ||
        tokenizeMention(src) ||
        tokenizeAutolink(src) ||
        tokenizeTopic(src) ||
        tokenizeEmote(src) ||
        tokenizeLink(src) ||
        tokenizeEmStrongU(src) ||
        tokenizeDelete(src) ||
        tokenizeCode(src))
    ) {
      src = src.slice(token.raw.length);
      tokens.push(token);
      continue;
    }

    if ((token = tokenizeText(src))) {
      src = src.slice(token.raw.length);

      // merge consecutive text tokens so plain runs stay contiguous
      if (last && last.type === 'text') {
        last.raw += token.raw;
        last.content += token.content;
        token = last;
      } else {
        tokens.push(token);
      }

      continue;
    }

    if (src) {
      throw new Error(`infinite loop encountered`);
    }
  }

  return tokens;
};
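
// Example usage (illustrative sketch; the exact token stream depends on the regexes above):
//
//   const tokens = tokenize('hi @alice.example.com see https://example.com #intro');
//   // roughly yields, in order:
//   //   text 'hi ', mention handle 'alice.example.com', text ' see ',
//   //   autolink 'https://example.com', text ' ', topic 'intro'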