this repo has no description
1/*! 𝦗𖹭
2*/
3
412||+typeof await/2//2; export default
5/**
6 12y2 markup parser factory
7 @implements Parser_Collection
8**/
9class Markup_12y2_Ref { constructor() {
10
11 // TokenType 🏷 enum
12 // BlockType 🏷 enum
13 // Text 🏷 string 📝 from input text
14 // ArgPattern 🏷 RegExp
15 // GroupNum 🏷 number - regex capturing group num
16 // RawArgs 🏷 Array - array with .named field
17 // Block 🏷 Object - has .type .args .content
18 // CurrentBlock 🏷 Object - block + other fields
19
// Mutable parser state. Everything the parser remembers between tokens
// lives in these two bindings (plus REGEX.lastIndex):
// - current: the innermost block that is still open
// - brackets: how many open blocks were opened with a truthy `body`
let current
let brackets

// About __proto__ in object literals (it sets the prototype, so `in`
// lookups only ever see the keys written here):
// https://tc39.es/ecma262/multipage/ecmascript-language-expressions.html#sec-runtime-semantics-propertydefinitionevaluation

// elements which can survive an eol (without a body)
// (after one of these is added, `prev` becomes 'block')
const IS_BLOCK = {
	__proto__: null,
	code: 1,
	divider: 1,
	ROOT: 1,
	heading: 1,
	quote: 1,
	table: 1,
	table_cell: 1,
	image: 1,
	video: 1,
	audio: 1,
	spoiler: 1,
	align: 1,
	list: 1,
	list_item: 1,
	youtube: 1,
	anchor: 1,
}
28
// Building blocks for the main token RegExp.
// MACROS: {NAME} placeholders that PAT expands inside a pattern
// GROUPS: GroupNum -> TokenType
// ARGTYPES: GroupNum -> ArgPattern
const MACROS = {
	'{EOL}': "(?![^\\n])",
	'{BOL}': "^",
	'{ANY}': "[^]",
	'{URL_CHARS}': "[-\\w/%&=#+~@$*'!?,.;:]*",
	'{URL_FINAL}': "[-\\w/%&=#+~@$*']",
}
const GROUPS = []
const ARGTYPES = []
// pattern sources collected by PAT, joined into REGEX afterwards
let regi = []
41 function PAT({raw}, ...groups) {
42 regi.push(
43 raw.join("()")
44 .replace(/\\`/g, "`")
45 .replace(/[(](?![?)])/g, "(?:")
46 .replace(/[{][A-Z_]+[}]/g, match=>MACROS[match])
47 )
48 for (let g of groups) {
49 GROUPS.push(Object.keys(g)[0])
50 ARGTYPES.push(Object.values(g)[0])
51 }
52 }
53
// ArgPattern
// Sticky (/y) patterns that parse() runs at the position right after a token.
// Capture group 1 is the text between [ and ], capture group 2 is the
// body marker, and capture group 3 (ARGS_WORD only) is a bare word that
// becomes the tag's content.
// Patterns with the same source are still separate objects: parse() writes
// to their .lastIndex and compares against ARGS_CODE by identity.
const ARGS_NORMAL = // /[...]?{?/
/(?:\[([^\]\n]*)\])?({\n?)?/y

const ARGS_WORD = // /[...]?{/ or /[...] ?<word>/ or / <word>/
/(?:\[([^\]\n]*)\]|(?=[ {]))({\n?| ?([^\s`^()+=\[\]{}\\|"';:,.<>/?!*]*))/y // todo: more complex rule for word parsing //TODO: does this set the body flag right? //(what did i mean by this?)
const ARGS_LINE = // /[...]?{/ or /[...] ?/ or / /
/(?:\[([^\]\n]*)\]|(?=[ {]))(?:({\n?)| ?)/y // probably dont need this, we can strip space after { in all cases instead.
const ARGS_HEADING = // /[...]?{/ or /[...] ?/ or / /
/(?:\[([^\]\n]*)\]|(?=[ {]))(?:({\n?)| ?)/y

// this is like args_heading kinda, except always counts as a line start. maybe backport this to args heading etc.?
// (group 2 always participates here, possibly as "" or " ")
const ARGS_ANCHOR = // /[...]{?/
/\[([^\]\n]*)\]({\n?| ?|)/y

// no group 2: tokens using these never get a body
const ARGS_BODYLESS = // /[...]?/
/(?:\[([^\]\n]*)\])?/y
const ARGS_TABLE = // /[...]? */
/(?:\[([^\]\n]*)\])? */y

// Different layout from the others: group 1 is the language name on the
// opening line and group 2 is the code text, up to the closing ``` or the
// end of the input. parse() passes both through to PROCESS unparsed.
const ARGS_CODE = // ... ```
/(?: *([-\w.+#$ ]+?) *(?![^\n]))?\n?([^]*?)(?:\n?```|$)/y // @@@ backported \n```
76
// Token table. Each PAT call adds one alternative to the main RegExp, in
// this order, so earlier entries win when two could match at the same place.
// The value after each token name selects how arguments are read:
// - an ARGS_* pattern: run right after the token
// - 0: the token takes no arguments
// - true: STYLE and TAG, which parse() handles before looking at ARGTYPES
// Every ${} becomes an empty capturing group; parse() finds the matched
// token by looking for the first group that captured "".
PAT`[\n]?[}]${{ BLOCK_END: 0}}`
PAT`[\n]${{ NEWLINE: 0}}`
PAT`{BOL}[#]{1,4}${{ HEADING: ARGS_HEADING}}`
PAT`{BOL}[-]{3,}{EOL}${{ DIVIDER: 0}}`
PAT`([*][*]|[_][_]|[~][~]|[/])${{ STYLE: true}}`
PAT`[\\][a-z]+(?![a-zA-Z0-9])${{ TAG: true}}`
PAT`[\\][{][\n]?${{ NULL_ENV: 0}}`
PAT`[\\]{ANY}${{ ESCAPED: 0}}`
PAT`{BOL}[>]${{ QUOTE: ARGS_HEADING}}`
PAT`{BOL}[\`]{3}(?=[^\n\`]*?{EOL})${{ CODE_BLOCK: ARGS_CODE}}`
PAT`[\`][^\`\n]*([\`]{2}[^\`\n]*)*[\`]?${{ INLINE_CODE: 0}}`
// EMBED shares the url pattern with LINK: its group sits inside the optional "!" prefix
PAT`([!]${{ EMBED: ARGS_BODYLESS}})?\b(https?://|sbs:){URL_CHARS}({URL_FINAL}|[(]{URL_CHARS}[)]({URL_CHARS}{URL_FINAL})?)${{ LINK: ARGS_NORMAL}}`
PAT`{BOL} *[|]${{ TABLE_START: ARGS_TABLE}}`
PAT` *[|]${{ TABLE_CELL: ARGS_TABLE}}`
PAT`{BOL} *[-]${{ LIST_ITEM: ARGS_HEADING}}`

// Only the 'g' flag is set (no 'm'), so {BOL} ("^") matches at index 0 of
// the string being scanned and nowhere else.
const REGEX = new RegExp(regi.join("|"), 'g')
// the collected sources are not needed once REGEX exists
regi = null
95
96 //todo: org tables separators?
97
// Tag name (backslash included) -> ArgPattern used to read its arguments.
// parse() treats any \name that is not a key here as INVALID_TAG.
const TAGS = {
	__proto__: null,

	// inline wrappers
	'\\sub': ARGS_WORD,
	'\\sup': ARGS_WORD,
	'\\b': ARGS_WORD,
	'\\i': ARGS_WORD,
	'\\u': ARGS_WORD,
	'\\s': ARGS_WORD,

	// block wrappers
	'\\quote': ARGS_LINE,
	'\\align': ARGS_LINE,
	'\\spoiler': ARGS_LINE,
	'\\h': ARGS_LINE,

	'\\ruby': ARGS_WORD,
	'\\key': ARGS_WORD,
	'\\a': ARGS_ANCHOR,
	'\\link': ARGS_NORMAL, // should use arg parse mode, i think?
}
115
// Turn one matched token into operations on the tree being built.
// 📥 _token_type 🏷 TokenType 📝 a GROUPS name, a key of TAGS, or 'INVALID_TAG'
// 📥 token 🏷 Text 📝 text matched by the main regex (arguments not included)
// 📥 rargs 🏷 RawArgs 📝 parsed arguments (null for argument-less tokens; the raw language string for CODE_BLOCK)
// 📥 body 🏷 boolean 📝 whether a { body was opened (for CODE_BLOCK: the code text)
// 📥 args_token 🏷 Text 📝 text matched by the argument pattern
// 💥 TypeError when _token_type is not recognised
function PROCESS(_token_type, token, rargs, body, args_token) {
	// tags that only wrap their body in a node of a fixed type
	const WRAPPERS = {
		__proto__: null,
		'\\sub': 'subscript',
		'\\sup': 'superscript',
		'\\b': 'bold',
		'\\i': 'italic',
		'\\u': 'underline',
		'\\s': 'strikethrough',
		'\\key': 'key',
	}
	if (_token_type in WRAPPERS) {
		OPEN(WRAPPERS[_token_type], null, body)
		return
	}

	// squash every run of non-word characters into "-"; empty/missing -> null
	const to_id = raw => raw ? raw.replace(/\W+/g, "-") : null

	switch (_token_type) {
	case 'NEWLINE':
		NEWLINE(true)
		break

	case 'HEADING':
		// todo: anchor name (and, can this be chosen automatically based on contents?)
		OPEN('heading', {level: token.length, id: to_id(rargs[0])}, body)
		break

	case 'DIVIDER':
		BLOCK('divider')
		break

	case 'BLOCK_END': {
		const after_newline = "\n}"==token
		if (brackets>0) {
			// drop everything that was opened without a body
			while (!current.body)
				CANCEL()
			if ('invalid'===current.type) {
				if (after_newline)
					NEWLINE(false) // false since we already closed everything
				TEXT("}")
			}
			CLOSE()
		} else {
			// hack: nothing to close, so the brace is plain text
			if (after_newline)
				NEWLINE(true)
			TEXT("}")
		}
		break
	}

	case 'NULL_ENV':
		OPEN('null_env', null, true)
		current.prev = current.parent.prev
		break

	case 'ESCAPED':
		if ("\\\n"===token) {
			NEWLINE(false)
		} else if ("\\."===token) {
			// \. is a no-op
			// todo: close lists too
			current.prev = 'block'
		} else {
			TEXT(token.substring(1))
		}
		break

	case 'QUOTE':
	case '\\quote':
		OPEN('quote', {cite: rargs[0]}, body)
		break

	case 'CODE_BLOCK':
		BLOCK('code', {text: body, lang: rargs})
		break

	case 'INLINE_CODE':
		BLOCK('icode', {text: token.replace(/`(`)?/g, "$1")})
		break

	case 'EMBED': {
		// strip the leading "!"
		const [embed_type, embed_args] = process_embed(token.substring(1), rargs)
		BLOCK(embed_type, embed_args)
		break
	}

	case 'LINK':
		if (body)
			OPEN('link', {url: token}, body)
		else
			BLOCK('simple_link', {url: token, text: rargs[0]})
		break

	case 'TABLE_START':
		OPEN('table_row', token+args_token) // special OPEN call
		OPEN('table_cell', rargs, body)
		break

	case 'TABLE_CELL':
		while (current.type!=='table_cell')
			CANCEL()
		CLOSE() // cell
		// we don't know whether these are row args or cell args,
		// so just pass the raw args directly, and parse them later.
		OPEN('table_cell', rargs, body)
		break

	case 'INVALID_TAG': {
		const info = {text: token+args_token, reason: "invalid tag"}
		if (body)
			OPEN('invalid', info, body)
		else
			BLOCK('invalid', info)
		break
	}

	case 'LIST_ITEM':
		OPEN('list_item', {indent: token.indexOf("-")}, body)
		break

	case '\\align': {
		const wanted = rargs[0]
		const align = ['left', 'right', 'center'].includes(wanted) ? wanted : 'center'
		OPEN('align', {align}, body)
		break
	}

	case '\\spoiler':
	case '\\h':
		OPEN('spoiler', {label: arg0(rargs, "spoiler")}, body)
		break

	case '\\ruby':
		OPEN('ruby', {text: arg0(rargs, "true")}, body)
		break

	case '\\a':
		OPEN('anchor', {id: to_id(rargs[0])}, body)
		break

	case '\\link': {
		const url = rargs[0]
		if (body)
			OPEN('link', {url}, body)
		else
			BLOCK('simple_link', {url, text: url})
		break
	}

	default:
		throw new TypeError("unknown token type: "+_token_type)
	}
}
246
// First positional argument, or `def` when there are no positional arguments.
// (An empty first argument is returned as-is, not replaced by `def`.)
function arg0(rargs, def) {
	return rargs.length<1 ? def : rargs[0]
}
252
253
254
// Shared, frozen "no arguments" value, so that tags without arguments
// don't allocate anything.
const null_args = []
null_args.named = Object.freeze({})
Object.freeze(null_args)
// todo: do we even need named args?
// Parse the text between [ and ] into RawArgs.
// 📥 arglist 🏷 Text 📝 arguments separated by ";" (may be undefined or "")
// 📤 🏷 RawArgs 📝 array of positional values, with a .named object for name=value pairs
function parse_args(arglist) {
	// note: checks undefined AND "" (\tag AND \tag[])
	if (!arglist)
		return null_args
	let list = [], named = {}
	list.named = named
	for (let arg of arglist.split(";")) {
		// [^] instead of . : an argument may contain \r, U+2028 or U+2029
		// (only \n and ] are excluded by the arg patterns), and . stops at
		// those, which made exec() return null and the destructuring throw.
		let [, name, value] = /^(?:([^=]*)=)?([^]*)$/.exec(arg)
		// value OR =value
		// (this is to allow values to contain =. ex: [=1=2] is "1=2")
		if (!name)
			list.push(value)
		else // name=value
			named[name] = value
	}
	return list
}
// process an embed url: !https://example.com/image.png[alt=balls]
// 📥 url 🏷 Text 📝 the url, without the leading "!"
// 📥 rargs 🏷 RawArgs 📝 positional: "image"/"video"/"audio" force the type,
//    "<w>x<h>" sets the size, anything else is collected as alt text;
//    a named `alt` overrides the collected alt text
// 📤 [type: String, args: Object]
function process_embed(url, rargs) {
	let type
	let args = {url}
	for (let arg of rargs) {
		let m
		if ('video'===arg || 'audio'===arg || 'image'===arg) {
			type = arg
		} else if (m = /^(\d+)x(\d+)$/.exec(arg)) {
			args.width = +m[1]
			args.height = +m[2]
		} else {
			// several leftover arguments are glued back together with ";"
			if (args.alt==undefined)
				args.alt = arg
			else
				args.alt += ";"+arg
		}
	}
	if (rargs.named.alt!=undefined)
		args.alt = rargs.named.alt
	// todo: improve this
	// no explicit type: guess from the url
	if (!type) {
		if (/[.](mp3|ogg|wav|m4a)\b/i.test(url))
			type = 'audio'
		else if (/[.](mp4|mkv|mov)\b/i.test(url))
			type = 'video'
		// the dots in the host names are escaped, so that only the real
		// youtube hosts are recognised (not e.g. "youtubexcom")
		else if (/^https?:[/][/](?:www[.])?(?:youtube[.]com[/]watch[?]v=|youtu[.]be[/]|youtube[.]com[/]shorts[/])[\w-]{11}/.test(url)) {
			// todo: accept [start-end] args maybe?
			type = 'youtube'
		}
	}
	if (!type)
		type = 'image'
	return [type, args]
}
314
// Start a new block and make it the current one.
// 📥 type 📝 block type
// 📥 args 📝 stored on the block untouched
// 📥 body 📝 truthy when the block was opened with a { body; counted in `brackets`
function OPEN(type, args, body) {
	const parent = current
	const opened = {
		type, args, content: [],
		body, parent,
		prev: 'all_newline',
	}
	// sealed: no fields may be added to or removed from an open block
	current = Object.seal(opened)
	if (body)
		brackets++
}
// Move up one level: the parent becomes current again.
// 📤 the block that was current (it is not added to the tree here)
function pop() {
	const left = current
	if (left.body)
		brackets--
	current = left.parent
	return left
}
333
// Give up on the current block because it was never properly finished.
// - style: the marker and the collected content go back into the parent as plain content
// - table_cell: ends the row (see below)
// - anything else: simply closed
function CANCEL() {
	switch (current.type) {
	case 'style': {
		const style = pop()
		current.content.push(style.args, ...style.content)
		current.prev = style.prev
		return
	}
	case 'table_cell':
		if (current.content.length) {
			CLOSE() // table_cell
			current.args = {}
		} else {
			// cancelling an empty table cell means:
			// it's the end of the row, so discard the cell
			const cell = pop()
			// if the ROW is empty (i.e. we just have a single | )
			if (!current.content.length) {
				const row = pop() // discard the row
				TEXT(row.args) // for a row, args holds the original text
				return
				// todo: maybe also cancel rows with 1 unclosed cell?
				// like `| abc` -> text
			}
			// transfer args to the row, and parse as table row args:
			const row_args = {}
			current.args = row_args
			for (const arg of cell.args)
				if ("*"===arg || "#"===arg)
					row_args.header = true
		}
		// continue below to close the table_row
		break
	}
	CLOSE()
}
369
// Last item of a block's content (undefined when the content is empty).
function get_last(block) {
	const {content} = block
	return content[content.length-1]
}
373
// Finish the current block: take it off the stack, convert it into a plain
// {type, args, content} node and append that node to the right container.
function CLOSE() {
	const closed = pop()

	// a null environment leaves no node: its content is spliced into the parent
	if ('null_env'===closed.type) {
		current.content.push(...closed.content)
		current.prev = closed.prev
		return
	}

	// a line break that was still pending at the end of the block
	if ('newline'===closed.prev)
		closed.content.push("\n")

	const node = {type: closed.type, args: closed.args, content: closed.content}
	// where the node goes; the parent itself unless a list/table is involved
	let target = current

	switch (closed.type) {
	case 'list_item': {
		// merge list_item with preceding list
		node.args = null
		const {indent} = closed.args
		// walk down through the trailing lists until one with the same
		// indent is found, or a new one has to be created
		for (;;) {
			const holder = target
			target = get_last(holder)
			const reusable = target && 'list'===target.type && !(target.args.indent>indent)
			if (!reusable) {
				// create a new level in the list
				target = {type:'list', args:{indent}, content:[]}
				// safe because there's no newline
				holder.content.push(target)
				break
			}
			if (target.args.indent == indent)
				break
		}
		break
	}
	case 'table_row':
		// merge table_row with preceding table
		target = get_last(current)
		if (!target || 'table'!==target.type) {
			target = {type:'table', args:null, content:[]}
			current.content.push(target)
		}
		break
	case 'table_cell': {
		// the raw args are only interpreted now
		const cell_args = node.args = {}
		for (const arg of closed.args) {
			if ("*"===arg || "#"===arg) {
				cell_args.header = true
				continue
			}
			if (['red', 'orange', 'yellow', 'green', 'blue', 'purple', 'gray'].includes(arg)) {
				cell_args.color = arg
				continue
			}
			const size = /^(\d*)x(\d*)$/.exec(arg)
			if (size) {
				const [, w, h] = size
				if (+w > 1) cell_args.colspan = +w
				if (+h > 1) cell_args.rowspan = +h
			}
		}
		break
	}
	case 'style':
		// for a style block, args holds the marker text
		node.type = {
			__proto__:null,
			'**': 'bold', '__': 'underline',
			'~~': 'strikethrough', '/': 'italic',
		}[closed.args]
		node.args = null
		break
	}

	target.content.push(node)
	current.prev = closed.type in IS_BLOCK ? 'block' : closed.prev
}
// Append a piece of text to the current block. Empty strings are ignored
// (and leave `prev` untouched).
function TEXT(text) {
	if (""===text)
		return
	current.content.push(text) // todo: merge with surrounding textnodes?
	current.prev = 'text'
}
// Append a node without content ({type, args}) to the current block.
function BLOCK(type, args) {
	const node = {type, args}
	current.content.push(node)
	current.prev = type in IS_BLOCK ? 'block' : 'text'
}
453
// Handle a line break.
// 📥 real 📝 when truthy, every open block without a body is cancelled
//    first (up to, but not including, ROOT)
// A "\n" is added unless the previous item was a block element, and `prev`
// becomes 'newline' unless the block has seen nothing but newlines so far.
function NEWLINE(real) {
	while (real && !current.body && 'ROOT'!=current.type)
		CANCEL()
	const block = current
	const before = block.prev
	if ('block'!==before)
		block.content.push("\n")
	if ('all_newline'!==before)
		block.prev = 'newline'
}
463
// Whether the current position is directly inside a table cell,
// looking through any number of open style blocks.
function in_table() {
	let block = current
	while ('style'===block.type)
		block = block.parent
	return 'table_cell'===block.type
}
// Find the innermost open style block whose marker is `token`, searching
// only through the unbroken chain of style blocks around the current position.
// 📤 that block, or undefined
// todo: this should check for body
function find_style(token) {
	let block = current
	while ('style'===block.type) {
		if (block.args===token)
			return block
		block = block.parent
	}
}
// Decide what a style marker (** __ ~~ /) means at this position.
// 📥 token_text 📝 the marker
// 📥 before 📝 the character in front of the marker ("" at the very start)
// 📥 after 📝 the character behind the marker ("" at the very end)
// 📤 the open style block to close, or true to open a new one,
//    or undefined when the marker is just text
function do_style(token_text, before, after) {
	const around = before+after

	// look for an open style with the same marker
	let open = current
	while ('style'===open.type && open.args!==token_text)
		open = open.parent

	// the regex literals stay inside the function: they are sticky, and a
	// fresh object per call always starts matching at index 0
	if ('style'===open.type) {
		if (!after || /[^\s,'"][-\s.,:;!?'")}{]/y.test(around))
			return open
		// can't close here; it may still open a new one
	}

	if (!before || /[\s.({}'"][^\s,'"]/y.test(around))
		return true
}
490
// Parse a piece of 12y2 markup into a tree.
// 📥 text 🏷 Text 📝 the markup
// 📤 the ROOT node: {type: 'ROOT', content, prev}
// Resets `current`, `brackets` and REGEX.lastIndex at the start, so the
// same parser object can be used for several texts, one after the other.
// 💥 throws the array ["INFINITE LOOP", match] (not an Error object) if the
//    main regex matches at the same index twice in a row
function parse(text) {
let tree = {type: 'ROOT', content: [], prev: 'all_newline'}
current = tree
brackets = 0

// MAIN LOOP //
// prev: index of the previous match (only for the infinite loop check)
let prev = -1
// last: end of the last accepted token; text[last ... match.index] is plain text
let last = REGEX.lastIndex = 0
let match
// reject this match: search again, starting one character further
function nevermind() {
REGEX.lastIndex = match.index+1
}
// accept this match: flush the plain text in front of it, and
// continue after the token (REGEX.lastIndex may include its arguments)
function accept() {
TEXT(text.substring(last, match.index))
last = REGEX.lastIndex
}
// cut off everything that has been consumed, so that the next character
// is at index 0. REGEX has no 'm' flag, so this is what lets the {BOL}
// patterns match at this point.
function start_line() {
text = text.substring(last)
last = REGEX.lastIndex = 0
prev = -1
}
main: while (match = REGEX.exec(text)) {
// check for infinite loops
if (match.index===prev)
throw ["INFINITE LOOP", match]
prev = match.index
// 2: figure out which token type was matched
let token_text = match[0]
// every token pattern contains one empty capturing group; groups that
// didn't take part in the match are undefined, so the first "" marks it
let group_num = match.indexOf("", 1)-1

// 3: get type + argument pattern
let type = GROUPS[group_num]
let argregex
// 4: special cases:
if ('TAG'===type) {
// for known tags, the tag text itself is used as the token type
if (token_text in TAGS) {
type = token_text
argregex = TAGS[type]
} else {
type = 'INVALID_TAG'
argregex = ARGS_NORMAL
}
} else if ('STYLE'===type) {
// do_style decides based on the characters around the marker
let c = do_style(token_text, text.charAt(match.index-1), text.charAt(REGEX.lastIndex))
if (!c) { // no
nevermind()
} else if (true===c) { // open new
accept()
OPEN('style', token_text)
} else { // close
accept()
// anything opened inside the style is cancelled first
while (current != c)
CANCEL()
CLOSE()
}
continue main
} else if ('TABLE_CELL'===type && !in_table()) {
// a | outside of a table is just text
nevermind()
continue main
} else {
argregex = ARGTYPES[group_num]
}
// 5: parse args and {
if (!argregex) {
// token without arguments (ARGTYPES value 0)
accept()
let body = 'NULL_ENV'===type //h
PROCESS(type, token_text, null, body, token_text)
if (body || 'NEWLINE'===type)
start_line()
} else {
// try to match arguments
// (the arg patterns are sticky: they must match exactly at this position)
argregex.lastIndex = REGEX.lastIndex
let argmatch = argregex.exec(text)
if (null===argmatch) {
nevermind()
continue main
}
REGEX.lastIndex = argregex.lastIndex
accept()

let args = argmatch[1]
let body = argmatch[2] // flag: args with {, or word args
let word = argmatch[3] // contents: word args & code block
if (ARGS_CODE!==argregex) {
args = parse_args(args)
// string comparison: true for "{" and "{\n", false for undefined, "" and " "
// NOTE(review): a word captured by ARGS_WORD that starts with a character
// sorting at or after "{" also compares true here - confirm this is harmless
body = body>="{"
}

PROCESS(type, token_text, args, body, argmatch[0])
// word tags
if (undefined!==word) {
// escaping in word args? idk. todo
TEXT(word.replace(/\\([^])/g, "$1"))
CLOSE()
}
// tags with { body
// (also reached when group 2 matched without a "{", e.g. ARGS_ANCHOR)
else if (argmatch[2]!==undefined && ARGS_CODE!==argregex) {
start_line()
}
}
} // end of main loop

TEXT(text.substring(last)) // text after last token

// whatever is still open at the end of the input is cancelled
while ('ROOT'!==current.type)
CANCEL()
if ('newline'===current.prev) //todo: this is repeated
current.content.push("\n")

return tree // technically we could return `current` here and get rid of `tree` entirely
}
602
/**
Parser function
(closure method: it doesn't use `this`, so it can be passed around freely)
Takes the markup text and returns the ROOT node of the parsed tree.
@type {Parser}
@kind function
**/
this.parse = parse
/**
Parsers by markup language name. This class only provides one.
@type {Object<string,Parser>}
@property {Parser} 12y2 - same as .parse
**/
this.langs = {'12y2': parse}
615
616 // what if you want to write like, "{...}". well that's fine
617 // BUT if you are inside a tag, the } will close it.
618 // maybe closing tags should need some kind of special syntax?
619 // \tag{ ... \} >{...\} idk..
620 // or match paired {}s :
621 // \tag{ ... {heck} ... } <- closes here
622
623 // todo: after parsing a block element: eat the next newline directly
624} }
625
// CommonJS fallback. The ES-module default export is already declared in
// front of the class itself, so a second `export default` here is a
// duplicate export (and `Markup_12y2` is not a name declared in this file;
// the class is `Markup_12y2_Ref`). The guard is skipped wherever `module`
// doesn't exist (ES modules, plain browser scripts).
if ('object'===typeof module && module) module.exports = Markup_12y2_Ref