this repo has no description
at cactus 626 lines 17 kB view raw
/*! 𝦗𖹭
*/

// NOTE: the next line lets this file load both as a classic script and as an
// ES module. In a script, `await` is a plain identifier, so the line reads
// `12||+typeof await/2` followed by a `//` comment that hides `export default`.
// In a module, `await` is a keyword, `/2/` is a regex literal, and the trailing
// `export default` applies to the class declaration below.
// Keep it on ONE line, and do not add another `export default` to this file.
12||+typeof await/2//2; export default
/**
	12y2 markup parser factory
	@implements Parser_Collection
**/
class Markup_12y2_Ref { constructor() {
	
	// TokenType 🏷 enum
	// BlockType 🏷 enum
	// Text 🏷 string 📝 from input text
	// ArgPattern 🏷 RegExp
	// GroupNum 🏷 number - regex capturing group num
	// RawArgs 🏷 Array - array with .named field
	// Block 🏷 Object - has .type .args .content
	// CurrentBlock 🏷 Object - block + other fields
	
	// all state is stored in these vars (and REGEX.lastIndex)
	// current 🏷 CurrentBlock 📝 innermost block that is still open
	// brackets 🏷 number 📝 count of open blocks that were opened with a { body
	let current, brackets
	
	// About __proto__ in object literals:
	// https://tc39.es/ecma262/multipage/ecmascript-language-expressions.html#sec-runtime-semantics-propertydefinitionevaluation
	
	// elements which can survive an eol (without a body)
	const IS_BLOCK = {__proto__:null, code:1, divider:1, ROOT:1, heading:1, quote:1, table:1, table_cell:1, image:1, video:1, audio:1, spoiler:1, align:1, list:1, list_item:1, youtube:1, anchor:1}
	
	// RegExp
	// GroupNum -> TokenType
	// GroupNum -> ArgPattern
	// Placeholders expanded by PAT() into regex source text.
	const MACROS = {
		'{EOL}': "(?![^\\n])",
		'{BOL}': "^",
		'{ANY}': "[^]",
		'{URL_CHARS}': "[-\\w/%&=#+~@$*'!?,.;:]*",
		'{URL_FINAL}': "[-\\w/%&=#+~@$*']",
	}
	const GROUPS = [], ARGTYPES = []
	let regi = []
	// Template tag which registers one token pattern.
	// Each ${{ NAME: argpattern }} in the template becomes an empty capturing
	// group "()" in the regex source; NAME is appended to GROUPS and the
	// argpattern to ARGTYPES, so the index of the first capturing group that
	// matched "" identifies the token type (see `parse`).
	// Every other "(" not followed by "?" or ")" is rewritten to "(?:" so it
	// does not capture, then {MACRO} placeholders are expanded.
	function PAT({raw}, ...groups) {
		regi.push(
			raw.join("()")
				.replace(/\\`/g, "`")
				.replace(/[(](?![?)])/g, "(?:")
				.replace(/[{][A-Z_]+[}]/g, match=>MACROS[match])
		)
		for (let g of groups) {
			GROUPS.push(Object.keys(g)[0])
			ARGTYPES.push(Object.values(g)[0])
		}
	}
	
	// ArgPattern
	// These are sticky (/y): `parse` sets .lastIndex to the end of the token
	// before calling .exec(), so they only match directly after the token.
	// group 1 = text inside [...], group 2 = body flag, group 3 = word contents.
	const ARGS_NORMAL = // /[...]?{?/
		/(?:\[([^\]\n]*)\])?({\n?)?/y
	
	const ARGS_WORD = // /[...]?{/ or /[...] ?<word>/ or / <word>/
		/(?:\[([^\]\n]*)\]|(?=[ {]))({\n?| ?([^\s`^()+=\[\]{}\\|"';:,.<>/?!*]*))/y // todo: more complex rule for word parsing //TODO: does this set the body flag right? //(what did i mean by this?)
	const ARGS_LINE = // /[...]?{/ or /[...] ?/ or / /
		/(?:\[([^\]\n]*)\]|(?=[ {]))(?:({\n?)| ?)/y // probably dont need this, we can strip space after { in all cases instead.
	const ARGS_HEADING = // /[...]?{/ or /[...] ?/ or / /
		/(?:\[([^\]\n]*)\]|(?=[ {]))(?:({\n?)| ?)/y
	
	// this is like args_heading kinda, except always counts as a line start. maybe backport this to args heading etc.?
	const ARGS_ANCHOR = // /[...]{?/
		/\[([^\]\n]*)\]({\n?| ?|)/y
	
	const ARGS_BODYLESS = // /[...]?/
		/(?:\[([^\]\n]*)\])?/y
	const ARGS_TABLE = // /[...]? */
		/(?:\[([^\]\n]*)\])? */y
	
	const ARGS_CODE = // ... ```
		/(?: *([-\w.+#$ ]+?) *(?![^\n]))?\n?([^]*?)(?:\n?```|$)/y // @@@ backported \n```
	
	PAT`[\n]?[}]${{ BLOCK_END: 0}}`
	PAT`[\n]${{ NEWLINE: 0}}`
	PAT`{BOL}[#]{1,4}${{ HEADING: ARGS_HEADING}}`
	PAT`{BOL}[-]{3,}{EOL}${{ DIVIDER: 0}}`
	PAT`([*][*]|[_][_]|[~][~]|[/])${{ STYLE: true}}`
	PAT`[\\][a-z]+(?![a-zA-Z0-9])${{ TAG: true}}`
	PAT`[\\][{][\n]?${{ NULL_ENV: 0}}`
	PAT`[\\]{ANY}${{ ESCAPED: 0}}`
	PAT`{BOL}[>]${{ QUOTE: ARGS_HEADING}}`
	PAT`{BOL}[\`]{3}(?=[^\n\`]*?{EOL})${{ CODE_BLOCK: ARGS_CODE}}`
	PAT`[\`][^\`\n]*([\`]{2}[^\`\n]*)*[\`]?${{ INLINE_CODE: 0}}`
	PAT`([!]${{ EMBED: ARGS_BODYLESS}})?\b(https?://|sbs:){URL_CHARS}({URL_FINAL}|[(]{URL_CHARS}[)]({URL_CHARS}{URL_FINAL})?)${{ LINK: ARGS_NORMAL}}`
	PAT`{BOL} *[|]${{ TABLE_START: ARGS_TABLE}}`
	PAT` *[|]${{ TABLE_CELL: ARGS_TABLE}}`
	PAT`{BOL} *[-]${{ LIST_ITEM: ARGS_HEADING}}`
	
	// One alternation of every registered pattern. There is no `m` flag, so
	// {BOL} ("^") only matches at index 0; `parse` re-slices the text after
	// each line start so that "^" lines up with the beginning of a line.
	const REGEX = new RegExp(regi.join("|"), 'g')
	regi = null
	
	//todo: org tables separators?
	
	// TokenType -> ArgRegex
	const TAGS = {
		__proto__:null,
		'\\sub': ARGS_WORD,
		'\\sup': ARGS_WORD,
		'\\b': ARGS_WORD,
		'\\i': ARGS_WORD,
		'\\u': ARGS_WORD,
		'\\s': ARGS_WORD,
		'\\quote': ARGS_LINE,
		'\\align': ARGS_LINE,
		'\\spoiler': ARGS_LINE, '\\h': ARGS_LINE,
		'\\ruby': ARGS_WORD,
		'\\key': ARGS_WORD,
		'\\a': ARGS_ANCHOR,
		'\\link': ARGS_NORMAL, // should use arg parse mode, i think?
	}
	
	// process a token
	// 📥 _token_type 🏷 TokenType 📝
	// 📥 token 🏷 Text 📝 token text, without arguments
	// 📥 rargs 🏷 RawArgs 📝 raw arguments (null for tokens without an ArgPattern; the unparsed language string for CODE_BLOCK)
	// 📥 body 🏷 Text 📝 argmatch[2] (varies): boolean body flag, or the code text for CODE_BLOCK
	// 📥 args_token 🏷 Text 📝 text matched by the ArgPattern (the token text itself when there is no ArgPattern)
	// throws TypeError for an unknown token type
	function PROCESS(_token_type, token, rargs, body, args_token) {
		switch (_token_type) { default: {
			throw new TypeError("unknown token type: "+_token_type)
			// error
		} break; case 'NEWLINE': {
			NEWLINE(true)
		} break; case 'HEADING': {
			let level = token.length
			let args = {level}
			let id = rargs[0]
			args.id = id ? id.replace(/\W+/g, "-") : null
			// todo: anchor name (and, can this be chosen automatically based on contents?)
			OPEN('heading', args, body)
		} break; case 'DIVIDER': {
			BLOCK('divider')
		} break; case 'BLOCK_END': {
			if (brackets>0) {
				// brackets>0 means some open ancestor has a body, so this loop ends
				while (!current.body)
					CANCEL()
				if ('invalid'===current.type) {
					if ("\n}"===token)
						NEWLINE(false) // false since we already closed everything
					TEXT("}")
				}
				CLOSE()
			} else {
				// hack:
				if ("\n}"===token)
					NEWLINE(true)
				TEXT("}")
			}
		} break; case 'NULL_ENV': {
			OPEN('null_env', null, true)
			current.prev = current.parent.prev
		} break; case 'ESCAPED': {
			if ("\\\n"===token)
				NEWLINE(false)
			else if ("\\."===token) { // \. is a no-op
				// todo: close lists too
				//current.content.push("")
				current.prev = 'block'
			} else
				TEXT(token.substring(1))
		} break; case 'QUOTE': {
			OPEN('quote', {cite: rargs[0]}, body)
		} break; case 'CODE_BLOCK': {
			let lang = rargs
			BLOCK('code', {text: body, lang})
		} break; case 'INLINE_CODE': {
			// strip the delimiting backticks; a doubled `` becomes a single `
			BLOCK('icode', {text: token.replace(/`(`)?/g, "$1")})
		} break; case 'EMBED': {
			let url = token.substring(1) // ehh better
			let [type, args] = process_embed(url, rargs)
			BLOCK(type, args)
		} break; case 'LINK': {
			let url = token
			let args = {url}
			if (body) {
				OPEN('link', args, body)
			} else {
				args.text = rargs[0]
				BLOCK('simple_link', args)
			}
		} break; case 'TABLE_START': {
			OPEN('table_row', token+args_token) // special OPEN call
			OPEN('table_cell', rargs, body)
		} break; case 'TABLE_CELL': {
			while (current.type!=='table_cell')
				CANCEL()
			CLOSE() // cell
			// we don't know whether these are row args or cell args,
			// so just pass the raw args directly, and parse them later.
			OPEN('table_cell', rargs, body)
		} break; case 'INVALID_TAG': {
			if (body)
				OPEN('invalid', {text: token+args_token, reason: "invalid tag"}, body)
			else
				BLOCK('invalid', {text: token+args_token, reason: "invalid tag"})
		} break; case 'LIST_ITEM': {
			let indent = token.indexOf("-")
			OPEN('list_item', {indent}, body)
			
		} break; case '\\sub': {
			OPEN('subscript', null, body)
		} break; case '\\sup': {
			OPEN('superscript', null, body)
		} break; case '\\b': {
			OPEN('bold', null, body)
		} break; case '\\i': {
			OPEN('italic', null, body)
		} break; case '\\u': {
			OPEN('underline', null, body)
		} break; case '\\s': {
			OPEN('strikethrough', null, body)
		} break; case '\\quote': {
			OPEN('quote', {cite: rargs[0]}, body)
		} break; case '\\align': {
			let a = rargs[0]
			if (!['left', 'right', 'center'].includes(a))
				a = 'center'
			OPEN('align', {align: a}, body)
		} break; case '\\spoiler': case '\\h': {
			let label = arg0(rargs, "spoiler")
			OPEN('spoiler', {label}, body)
		} break; case '\\ruby': {
			let text = arg0(rargs, "true")
			OPEN('ruby', {text}, body)
		} break; case '\\key': {
			OPEN('key', null, body)
		} break; case '\\a': {
			let id = rargs[0]
			id = id ? id.replace(/\W+/g, "-") : null
			OPEN('anchor', {id}, body)
			//BLOCK('anchor', {id})
		} break; case '\\link': {
			let args = {url: rargs[0]}
			if (body) {
				OPEN('link', args, body)
			} else {
				args.text = args.url
				BLOCK('simple_link', args)
			}
		} }
	}
	
	// first positional argument, or `def` when there are no positional args
	function arg0(rargs, def) {
		if (rargs.length<1)
			return def
		return rargs[0]
	}
	
	
	
	// shared, frozen RawArgs returned when a token has no [...] arguments
	const null_args = []
	null_args.named = Object.freeze({})
	Object.freeze(null_args)
	// todo: do we even need named args?
	// Split the text inside [...] on ";" into RawArgs:
	// "name=value" entries go into .named, everything else is positional.
	function parse_args(arglist) {
		// note: checks undefined AND "" (\tag AND \tag[])
		if (!arglist)
			return null_args
		let list = [], named = {}
		list.named = named
		for (let arg of arglist.split(";")) {
			let [, name, value] = /^(?:([^=]*)=)?(.*)$/.exec(arg)
			// value OR =value
			// (this is to allow values to contain =. ex: [=1=2] is "1=2")
			if (!name)
				list.push(value)
			else // name=value
				named[name] = value
		}
		return list
	}
	// process an embed url: !https://example.com/image.png[alt=balls]
	// returns [type: String, args: Object]
	// An explicit video/audio/image argument picks the type; otherwise it is
	// guessed from the url, defaulting to 'image'.
	function process_embed(url, rargs) {
		let type
		let args = {url}
		for (let arg of rargs) {
			let m
			if ('video'===arg || 'audio'===arg || 'image'===arg) {
				type = arg
			} else if (m = /^(\d+)x(\d+)$/.exec(arg)) {
				args.width = +m[1]
				args.height = +m[2]
			} else {
				// any other positional args are joined back together as alt text
				if (args.alt==undefined)
					args.alt = arg
				else
					args.alt += ";"+arg
			}
		}
		if (rargs.named.alt!=undefined)
			args.alt = rargs.named.alt
		// todo: improve this
		if (!type) {
			//let u = new URL(url, "x-relative:/")
			//let ext = /[.]([a-z0-9A-Z]{3,4})(?!\w)[^.]*$/.exec(url)
			if (/[.](mp3|ogg|wav|m4a)\b/i.test(url))
				type = 'audio'
			else if (/[.](mp4|mkv|mov)\b/i.test(url))
				type = 'video'
			// dots are written as [.] so that only the literal hostnames match
			else if (/^https?:[/][/](?:www[.])?(?:youtube[.]com[/]watch[?]v=|youtu[.]be[/]|youtube[.]com[/]shorts[/])[\w-]{11}/.test(url)) {
				// todo: accept [start-end] args maybe?
				type = 'youtube'
			}
		}
		if (!type)
			type = 'image'
		return [type, args]
	}
	
	// start a new block
	// the new block becomes `current`; a truthy `body` counts as an open bracket
	function OPEN(type, args, body) {
		current = Object.seal({
			type, args, content: [],
			body, parent: current,
			prev: 'all_newline',
		})
		if (body)
			brackets++
	}
	// move up
	// returns the block that was `current`, without adding it to its parent
	function pop() {
		if (current.body)
			brackets--
		let o = current
		current = current.parent
		return o
	}
	
	// Abandon the current block because it was never explicitly closed.
	// - style: its marker text and contents are spliced back into the parent
	// - table_cell: ends the row (see comments below)
	// - anything else: closed normally
	function CANCEL() {
		if ('style'===current.type) {
			let o = pop()
			current.content.push(o.args, ...o.content)
			current.prev = o.prev
			return
		}
		if ('table_cell'===current.type) {
			if (current.content.length) {
				CLOSE() // table_cell
				current.args = {}
			} else {
				// cancelling an empty table cell means:
				// it's the end of the row, so discard the cell
				let o = pop()
				// if the ROW is empty (i.e. we just have a single | )
				if (!current.content.length) {
					let o = pop() // discard the row
					// the row's args hold the original token text (see TABLE_START)
					TEXT(o.args)
					return
					// todo: maybe also cancel rows with 1 unclosed cell?
					// like `| abc` -> text
				}
				// transfer args to the row, and parse as table row args:
				let ret = current.args = {}
				for (let arg of o.args) {
					if ("*"===arg || "#"===arg) {
						ret.header = true
					}
				}
			}
			// fallthrough to close the table_row
		}
		CLOSE()
	}
	
	// last child of a block (undefined when it has no content)
	function get_last(block) {
		return block.content[block.content.length-1]
	}
	
	// Finish the current block and append the resulting node to its parent
	// (or to a list / table node inside the parent, for list items and rows).
	function CLOSE() {
		let o = pop()
		
		// null_env is transparent: its contents go directly into the parent
		if ('null_env'===o.type) {
			current.content.push(...o.content)
			current.prev = o.prev
			return
		}
		
		if ('newline'===o.prev)
			o.content.push("\n")
		let node = {type: o.type, args: o.args, content: o.content}
		let dest = current
		
		// merge list_item with preceding list
		if ('list_item'===o.type) {
			node.args = null
			let indent = o.args.indent
			// descend through trailing lists until one with this indent is found
			while (1) {
				let curr = dest
				dest = get_last(curr)
				if (!dest || dest.type!=='list' || dest.args.indent>indent) {
					// create a new level in the list
					dest = {type:'list', args:{indent}, content:[]}
					// safe because there's no newline
					curr.content.push(dest)
					break
				}
				if (dest.args.indent === indent)
					break
			}
		}
		// merge table_row with preceding table
		else if ('table_row'===o.type) {
			dest = get_last(current)
			if (!dest || 'table'!==dest.type) {
				dest = {type:'table', args:null, content:[]}
				current.content.push(dest)
			}
		}
		// table cell
		else if ('table_cell'===o.type) {
			// the cell was opened with RawArgs; interpret them now
			let ret = node.args = {}
			for (let arg of o.args) {
				let m
				if ("*"===arg || "#"===arg)
					ret.header = true
				else if (['red', 'orange', 'yellow', 'green', 'blue', 'purple', 'gray'].includes(arg))
					ret.color = arg
				else if (m = /^(\d*)x(\d*)$/.exec(arg)) {
					let [, w, h] = m
					if (+w > 1) ret.colspan = +w
					if (+h > 1) ret.rowspan = +h
				}
			}
		} else if ('style'===o.type) {
			// a style block stores its marker text in .args
			node.type = {
				__proto__:null,
				'**': 'bold', '__': 'underline',
				'~~': 'strikethrough', '/': 'italic',
			}[o.args]
			node.args = null
		}
		
		dest.content.push(node)
		current.prev = o.type in IS_BLOCK ? 'block' : o.prev
	}
	// push text
	function TEXT(text) {
		if (text!=="") {
			current.content.push(text) // todo: merge with surrounding textnodes?
			current.prev = 'text'
		}
	}
	// push empty tag
	function BLOCK(type, args) {
		current.content.push({type, args})
		current.prev = type in IS_BLOCK ? 'block' : 'text'
	}
	
	// Handle a line break.
	// real: true for an actual newline in the input, which first cancels every
	// open block up to the nearest one with a body (or ROOT).
	// The "\n" itself is omitted directly after a block element.
	function NEWLINE(real) {
		if (real)
			while (!current.body && 'ROOT'!==current.type)
				CANCEL()
		if ('block'!==current.prev)
			current.content.push("\n")
		if ('all_newline'!==current.prev)
			current.prev = 'newline'
	}
	
	// whether `current` is a table cell, looking through open style blocks only
	function in_table() {
		for (let c=current; ; c=c.parent) {
			if ('table_cell'===c.type)
				return true
			if ('style'!==c.type)
				return false
		}
	}
	// Decide what a style marker (** __ ~~ /) means at this position.
	// before/after: the characters adjacent to the marker ("" at the edges)
	// returns: the open style block to close, `true` to open a new style,
	// or undefined to leave the marker as plain text
	// todo (carried over from the removed, unused find_style helper): this should check for body
	function do_style(token_text, before, after) {
		for (let c=current; 'style'===c.type; c=c.parent)
			if (c.args===token_text) {
				if (!after || /[^\s,'"][-\s.,:;!?'")}{]/y.test(before+after))
					return c
				else
					break
			}
		
		if (!before || /[\s.({}'"][^\s,'"]/y.test(before+after))
			return true
	}
	
	// Parse 12y2 markup.
	// 📥 text 🏷 Text 📝 markup source
	// 📤 🏷 Object 📝 root node: {type:'ROOT', content, prev}
	function parse(text) {
		let tree = {type: 'ROOT', content: [], prev: 'all_newline'}
		current = tree
		brackets = 0
		
		// MAIN LOOP //
		let prev = -1
		// `last` is the index just after the previously accepted token
		let last = REGEX.lastIndex = 0
		let match
		// reject this match: resume scanning one character after its start
		function nevermind() {
			REGEX.lastIndex = match.index+1
		}
		// accept this match: flush the plain text that precedes it
		function accept() {
			TEXT(text.substring(last, match.index))
			last = REGEX.lastIndex
		}
		// drop the consumed input so that "^" ({BOL}) matches at this position
		function start_line() {
			text = text.substring(last)
			last = REGEX.lastIndex = 0
			prev = -1
		}
		main: while (match = REGEX.exec(text)) {
			// check for infinite loops
			if (match.index===prev) {
				let err = new Error("INFINITE LOOP")
				err.match = match
				throw err
			}
			prev = match.index
			// 2: figure out which token type was matched
			let token_text = match[0]
			// marker groups are empty "()", so the first group equal to ""
			// is the one belonging to the pattern that matched
			let group_num = match.indexOf("", 1)-1
			
			// 3: get type + argument pattern
			let type = GROUPS[group_num]
			let argregex
			// 4: special cases:
			if ('TAG'===type) {
				if (token_text in TAGS) {
					type = token_text
					argregex = TAGS[type]
				} else {
					type = 'INVALID_TAG'
					argregex = ARGS_NORMAL
				}
			} else if ('STYLE'===type) {
				let c = do_style(token_text, text.charAt(match.index-1), text.charAt(REGEX.lastIndex))
				if (!c) { // no
					nevermind()
				} else if (true===c) { // open new
					accept()
					OPEN('style', token_text)
				} else { // close
					accept()
					while (current !== c)
						CANCEL()
					CLOSE()
				}
				continue main
			} else if ('TABLE_CELL'===type && !in_table()) {
				nevermind()
				continue main
			} else {
				argregex = ARGTYPES[group_num]
			}
			// 5: parse args and {
			if (!argregex) {
				accept()
				let body = 'NULL_ENV'===type //h
				PROCESS(type, token_text, null, body, token_text)
				if (body || 'NEWLINE'===type)
					start_line()
			} else {
				// try to match arguments
				argregex.lastIndex = REGEX.lastIndex
				let argmatch = argregex.exec(text)
				if (null===argmatch) {
					nevermind()
					continue main
				}
				REGEX.lastIndex = argregex.lastIndex
				accept()
				
				let args = argmatch[1]
				let body = argmatch[2] // flag: args with {, or word args
				let word = argmatch[3] // contents: word args & code block
				if (ARGS_CODE!==argregex) {
					args = parse_args(args)
					// true only when group 2 starts with "{": word arguments
					// and undefined both compare below "{"
					body = body>="{"
				}
				
				PROCESS(type, token_text, args, body, argmatch[0])
				// word tags
				if (undefined!==word) {
					// escaping in word args? idk. todo
					TEXT(word.replace(/\\([^])/g, "$1"))
					CLOSE()
				}
				// tags with { body
				else if (argmatch[2]!==undefined && ARGS_CODE!==argregex) {
					start_line()
				}
			}
		} // end of main loop
		
		TEXT(text.substring(last)) // text after last token
		
		while ('ROOT'!==current.type)
			CANCEL()
		if ('newline'===current.prev) //todo: this is repeated
			current.content.push("\n")
		
		return tree // technically we could return `current` here and get rid of `tree` entirely
	}
	
	/**
		Parser function
		(closure method)
		@type {Parser}
		@kind function
	**/
	this.parse = parse
	/**
		@type {Object<string,Parser>}
		@property {Parser} 12y2 - same as .parse
	**/
	this.langs = {'12y2': parse}
	
	// what if you want to write like, "{...}". well that's fine
	// BUT if you are inside a tag, the } will close it.
	// maybe closing tags should need some kind of special syntax?
	// \tag{ ... \}  >{...\}  idk..
	// or match paired {}s :
	// \tag{ ... {heck} ... } <- closes here
	
	// todo: after parsing a block element: eat the next newline directly
} }