Markdown parser fork with extended syntax for personal use.
at hack 659 lines 19 kB view raw
//! HTML (text) occurs in the [text][] content type.
//!
//! ## Grammar
//!
//! HTML (text) forms with the following BNF
//! (<small>see [construct][crate::construct] for character groups</small>):
//!
//! ```bnf
//! html_text ::= comment | instruction | declaration | cdata | tag_close | tag_open
//!
//! ; Note: a comment ends at the first `-->`; the short forms `<!-->` and
//! ; `<!--->` are comments too.
//! comment ::= '<!--' ('>' | '->' | *byte '-->')
//! instruction ::= '<?' *byte '?>'
//! declaration ::= '<!' ascii_alphabetic *byte '>'
//! ; Restriction: the text is not allowed to contain `]]>`.
//! cdata ::= '<![CDATA[' *byte ']]>'
//! tag_close ::= '</' tag_name [space_or_tab_eol] '>'
//! tag_open ::= '<' tag_name *(space_or_tab_eol attribute) [[space_or_tab_eol] '/'] [space_or_tab_eol] '>'
//!
//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric )
//! attribute ::= attribute_name [[space_or_tab_eol] '=' [space_or_tab_eol] attribute_value]
//! attribute_name ::= (':' | '_' | ascii_alphabetic) *('-' | '.' | ':' | '_' | ascii_alphanumeric)
//! attribute_value ::= '"' *(byte - '"') '"' | "'" *(byte - "'") "'" | 1*(text - '"' - "'" - '/' - '<' - '=' - '>' - '`')
//! ```
//!
//! The grammar for HTML in markdown does not follow the rules of parsing
//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML
//! spec][html_parsing].
//! See the related flow construct [HTML (flow)][html_flow] for more info.
//!
//! Because the **tag open** and **tag close** productions in the grammar form
//! with just tags instead of complete elements, it is possible to interleave
//! (a word for switching between languages) markdown and HTML together.
//! For example:
//!
//! ```markdown
//! This is equivalent to <code>*emphasised* code</code>.
//! ```
//!
//! ## Tokens
//!
//! * [`HtmlText`][Name::HtmlText]
//! * [`HtmlTextData`][Name::HtmlTextData]
//!
//! ## References
//!
* [`html-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-text.js) 48//! * [*§ 6.6 Raw HTML* in `CommonMark`](https://spec.commonmark.org/0.31/#raw-html) 49//! 50//! [text]: crate::construct::text 51//! [html_flow]: crate::construct::html_flow 52//! [html_parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing 53 54use crate::construct::partial_space_or_tab::space_or_tab; 55use crate::event::Name; 56use crate::state::{Name as StateName, State}; 57use crate::tokenizer::Tokenizer; 58use crate::util::constant::HTML_CDATA_PREFIX; 59 60/// Start of HTML (text). 61/// 62/// ```markdown 63/// > | a <b> c 64/// ^ 65/// ``` 66pub fn start(tokenizer: &mut Tokenizer) -> State { 67 if Some(b'<') == tokenizer.current && tokenizer.parse_state.options.constructs.html_text { 68 tokenizer.enter(Name::HtmlText); 69 tokenizer.enter(Name::HtmlTextData); 70 tokenizer.consume(); 71 State::Next(StateName::HtmlTextOpen) 72 } else { 73 State::Nok 74 } 75} 76 77/// After `<`, at tag name or other stuff. 78/// 79/// ```markdown 80/// > | a <b> c 81/// ^ 82/// > | a <!doctype> c 83/// ^ 84/// > | a <!--b--> c 85/// ^ 86/// ``` 87pub fn open(tokenizer: &mut Tokenizer) -> State { 88 match tokenizer.current { 89 Some(b'!') => { 90 tokenizer.consume(); 91 State::Next(StateName::HtmlTextDeclarationOpen) 92 } 93 Some(b'/') => { 94 tokenizer.consume(); 95 State::Next(StateName::HtmlTextTagCloseStart) 96 } 97 Some(b'?') => { 98 tokenizer.consume(); 99 State::Next(StateName::HtmlTextInstruction) 100 } 101 // ASCII alphabetical. 102 Some(b'A'..=b'Z' | b'a'..=b'z') => { 103 tokenizer.consume(); 104 State::Next(StateName::HtmlTextTagOpen) 105 } 106 _ => State::Nok, 107 } 108} 109 110/// After `<!`, at declaration, comment, or CDATA. 
111/// 112/// ```markdown 113/// > | a <!doctype> c 114/// ^ 115/// > | a <!--b--> c 116/// ^ 117/// > | a <![CDATA[>&<]]> c 118/// ^ 119/// ``` 120pub fn declaration_open(tokenizer: &mut Tokenizer) -> State { 121 match tokenizer.current { 122 Some(b'-') => { 123 tokenizer.consume(); 124 State::Next(StateName::HtmlTextCommentOpenInside) 125 } 126 // ASCII alphabetical. 127 Some(b'A'..=b'Z' | b'a'..=b'z') => { 128 tokenizer.consume(); 129 State::Next(StateName::HtmlTextDeclaration) 130 } 131 Some(b'[') => { 132 tokenizer.consume(); 133 State::Next(StateName::HtmlTextCdataOpenInside) 134 } 135 _ => State::Nok, 136 } 137} 138 139/// In a comment, after `<!-`, at another `-`. 140/// 141/// ```markdown 142/// > | a <!--b--> c 143/// ^ 144/// ``` 145pub fn comment_open_inside(tokenizer: &mut Tokenizer) -> State { 146 match tokenizer.current { 147 Some(b'-') => { 148 tokenizer.consume(); 149 State::Next(StateName::HtmlTextCommentEnd) 150 } 151 _ => State::Nok, 152 } 153} 154 155/// In comment. 156/// 157/// ```markdown 158/// > | a <!--b--> c 159/// ^ 160/// ``` 161pub fn comment(tokenizer: &mut Tokenizer) -> State { 162 match tokenizer.current { 163 None => State::Nok, 164 Some(b'\n') => { 165 tokenizer.attempt(State::Next(StateName::HtmlTextComment), State::Nok); 166 State::Retry(StateName::HtmlTextLineEndingBefore) 167 } 168 Some(b'-') => { 169 tokenizer.consume(); 170 State::Next(StateName::HtmlTextCommentClose) 171 } 172 _ => { 173 tokenizer.consume(); 174 State::Next(StateName::HtmlTextComment) 175 } 176 } 177} 178 179/// In comment, after `-`. 180/// 181/// ```markdown 182/// > | a <!--b--> c 183/// ^ 184/// ``` 185pub fn comment_close(tokenizer: &mut Tokenizer) -> State { 186 match tokenizer.current { 187 Some(b'-') => { 188 tokenizer.consume(); 189 State::Next(StateName::HtmlTextCommentEnd) 190 } 191 _ => State::Retry(StateName::HtmlTextComment), 192 } 193} 194/// In comment, after `-`. 
195/// 196/// ```markdown 197/// > | a <!--b--> c 198/// ^ 199/// ``` 200pub fn comment_end(tokenizer: &mut Tokenizer) -> State { 201 match tokenizer.current { 202 Some(b'>') => State::Retry(StateName::HtmlTextEnd), 203 Some(b'-') => State::Retry(StateName::HtmlTextCommentClose), 204 _ => State::Retry(StateName::HtmlTextComment), 205 } 206} 207 208/// After `<![`, in CDATA, expecting `CDATA[`. 209/// 210/// ```markdown 211/// > | a <![CDATA[>&<]]> b 212/// ^^^^^^ 213/// ``` 214pub fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State { 215 if tokenizer.current == Some(HTML_CDATA_PREFIX[tokenizer.tokenize_state.size]) { 216 tokenizer.tokenize_state.size += 1; 217 tokenizer.consume(); 218 219 if tokenizer.tokenize_state.size == HTML_CDATA_PREFIX.len() { 220 tokenizer.tokenize_state.size = 0; 221 State::Next(StateName::HtmlTextCdata) 222 } else { 223 State::Next(StateName::HtmlTextCdataOpenInside) 224 } 225 } else { 226 State::Nok 227 } 228} 229 230/// In CDATA. 231/// 232/// ```markdown 233/// > | a <![CDATA[>&<]]> b 234/// ^^^ 235/// ``` 236pub fn cdata(tokenizer: &mut Tokenizer) -> State { 237 match tokenizer.current { 238 None => State::Nok, 239 Some(b'\n') => { 240 tokenizer.attempt(State::Next(StateName::HtmlTextCdata), State::Nok); 241 State::Retry(StateName::HtmlTextLineEndingBefore) 242 } 243 Some(b']') => { 244 tokenizer.consume(); 245 State::Next(StateName::HtmlTextCdataClose) 246 } 247 _ => { 248 tokenizer.consume(); 249 State::Next(StateName::HtmlTextCdata) 250 } 251 } 252} 253 254/// In CDATA, after `]`, at another `]`. 255/// 256/// ```markdown 257/// > | a <![CDATA[>&<]]> b 258/// ^ 259/// ``` 260pub fn cdata_close(tokenizer: &mut Tokenizer) -> State { 261 match tokenizer.current { 262 Some(b']') => { 263 tokenizer.consume(); 264 State::Next(StateName::HtmlTextCdataEnd) 265 } 266 _ => State::Retry(StateName::HtmlTextCdata), 267 } 268} 269 270/// In CDATA, after `]]`, at `>`. 
271/// 272/// ```markdown 273/// > | a <![CDATA[>&<]]> b 274/// ^ 275/// ``` 276pub fn cdata_end(tokenizer: &mut Tokenizer) -> State { 277 match tokenizer.current { 278 Some(b'>') => State::Retry(StateName::HtmlTextEnd), 279 Some(b']') => State::Retry(StateName::HtmlTextCdataClose), 280 _ => State::Retry(StateName::HtmlTextCdata), 281 } 282} 283 284/// In declaration. 285/// 286/// ```markdown 287/// > | a <!b> c 288/// ^ 289/// ``` 290pub fn declaration(tokenizer: &mut Tokenizer) -> State { 291 match tokenizer.current { 292 None | Some(b'>') => State::Retry(StateName::HtmlTextEnd), 293 Some(b'\n') => { 294 tokenizer.attempt(State::Next(StateName::HtmlTextDeclaration), State::Nok); 295 State::Retry(StateName::HtmlTextLineEndingBefore) 296 } 297 _ => { 298 tokenizer.consume(); 299 State::Next(StateName::HtmlTextDeclaration) 300 } 301 } 302} 303 304/// In instruction. 305/// 306/// ```markdown 307/// > | a <?b?> c 308/// ^ 309/// ``` 310pub fn instruction(tokenizer: &mut Tokenizer) -> State { 311 match tokenizer.current { 312 None => State::Nok, 313 Some(b'\n') => { 314 tokenizer.attempt(State::Next(StateName::HtmlTextInstruction), State::Nok); 315 State::Retry(StateName::HtmlTextLineEndingBefore) 316 } 317 Some(b'?') => { 318 tokenizer.consume(); 319 State::Next(StateName::HtmlTextInstructionClose) 320 } 321 _ => { 322 tokenizer.consume(); 323 State::Next(StateName::HtmlTextInstruction) 324 } 325 } 326} 327 328/// In instruction, after `?`, at `>`. 329/// 330/// ```markdown 331/// > | a <?b?> c 332/// ^ 333/// ``` 334pub fn instruction_close(tokenizer: &mut Tokenizer) -> State { 335 match tokenizer.current { 336 Some(b'>') => State::Retry(StateName::HtmlTextEnd), 337 _ => State::Retry(StateName::HtmlTextInstruction), 338 } 339} 340 341/// After `</`, in closing tag, at tag name. 342/// 343/// ```markdown 344/// > | a </b> c 345/// ^ 346/// ``` 347pub fn tag_close_start(tokenizer: &mut Tokenizer) -> State { 348 match tokenizer.current { 349 // ASCII alphabetical. 
350 Some(b'A'..=b'Z' | b'a'..=b'z') => { 351 tokenizer.consume(); 352 State::Next(StateName::HtmlTextTagClose) 353 } 354 _ => State::Nok, 355 } 356} 357 358/// After `</x`, in a tag name. 359/// 360/// ```markdown 361/// > | a </b> c 362/// ^ 363/// ``` 364pub fn tag_close(tokenizer: &mut Tokenizer) -> State { 365 match tokenizer.current { 366 // ASCII alphanumerical and `-`. 367 Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { 368 tokenizer.consume(); 369 State::Next(StateName::HtmlTextTagClose) 370 } 371 _ => State::Retry(StateName::HtmlTextTagCloseBetween), 372 } 373} 374 375/// In closing tag, after tag name. 376/// 377/// ```markdown 378/// > | a </b> c 379/// ^ 380/// ``` 381pub fn tag_close_between(tokenizer: &mut Tokenizer) -> State { 382 match tokenizer.current { 383 Some(b'\n') => { 384 tokenizer.attempt(State::Next(StateName::HtmlTextTagCloseBetween), State::Nok); 385 State::Retry(StateName::HtmlTextLineEndingBefore) 386 } 387 Some(b'\t' | b' ') => { 388 tokenizer.consume(); 389 State::Next(StateName::HtmlTextTagCloseBetween) 390 } 391 _ => State::Retry(StateName::HtmlTextEnd), 392 } 393} 394 395/// After `<x`, in opening tag name. 396/// 397/// ```markdown 398/// > | a <b> c 399/// ^ 400/// ``` 401pub fn tag_open(tokenizer: &mut Tokenizer) -> State { 402 match tokenizer.current { 403 // ASCII alphanumerical and `-`. 404 Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => { 405 tokenizer.consume(); 406 State::Next(StateName::HtmlTextTagOpen) 407 } 408 Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween), 409 _ => State::Nok, 410 } 411} 412 413/// In opening tag, after tag name. 
414/// 415/// ```markdown 416/// > | a <b> c 417/// ^ 418/// ``` 419pub fn tag_open_between(tokenizer: &mut Tokenizer) -> State { 420 match tokenizer.current { 421 Some(b'\n') => { 422 tokenizer.attempt(State::Next(StateName::HtmlTextTagOpenBetween), State::Nok); 423 State::Retry(StateName::HtmlTextLineEndingBefore) 424 } 425 Some(b'\t' | b' ') => { 426 tokenizer.consume(); 427 State::Next(StateName::HtmlTextTagOpenBetween) 428 } 429 Some(b'/') => { 430 tokenizer.consume(); 431 State::Next(StateName::HtmlTextEnd) 432 } 433 // ASCII alphabetical and `:` and `_`. 434 Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { 435 tokenizer.consume(); 436 State::Next(StateName::HtmlTextTagOpenAttributeName) 437 } 438 _ => State::Retry(StateName::HtmlTextEnd), 439 } 440} 441 442/// In attribute name. 443/// 444/// ```markdown 445/// > | a <b c> d 446/// ^ 447/// ``` 448pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State { 449 match tokenizer.current { 450 // ASCII alphabetical and `-`, `.`, `:`, and `_`. 451 Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => { 452 tokenizer.consume(); 453 State::Next(StateName::HtmlTextTagOpenAttributeName) 454 } 455 _ => State::Retry(StateName::HtmlTextTagOpenAttributeNameAfter), 456 } 457} 458 459/// After attribute name, before initializer, the end of the tag, or 460/// whitespace. 
461/// 462/// ```markdown 463/// > | a <b c> d 464/// ^ 465/// ``` 466pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State { 467 match tokenizer.current { 468 Some(b'\n') => { 469 tokenizer.attempt( 470 State::Next(StateName::HtmlTextTagOpenAttributeNameAfter), 471 State::Nok, 472 ); 473 State::Retry(StateName::HtmlTextLineEndingBefore) 474 } 475 Some(b'\t' | b' ') => { 476 tokenizer.consume(); 477 State::Next(StateName::HtmlTextTagOpenAttributeNameAfter) 478 } 479 Some(b'=') => { 480 tokenizer.consume(); 481 State::Next(StateName::HtmlTextTagOpenAttributeValueBefore) 482 } 483 _ => State::Retry(StateName::HtmlTextTagOpenBetween), 484 } 485} 486 487/// Before unquoted, double quoted, or single quoted attribute value, allowing 488/// whitespace. 489/// 490/// ```markdown 491/// > | a <b c=d> e 492/// ^ 493/// ``` 494pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State { 495 match tokenizer.current { 496 None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok, 497 Some(b'\n') => { 498 tokenizer.attempt( 499 State::Next(StateName::HtmlTextTagOpenAttributeValueBefore), 500 State::Nok, 501 ); 502 State::Retry(StateName::HtmlTextLineEndingBefore) 503 } 504 Some(b'\t' | b' ') => { 505 tokenizer.consume(); 506 State::Next(StateName::HtmlTextTagOpenAttributeValueBefore) 507 } 508 Some(b'"' | b'\'') => { 509 tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); 510 tokenizer.consume(); 511 State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted) 512 } 513 Some(_) => { 514 tokenizer.consume(); 515 State::Next(StateName::HtmlTextTagOpenAttributeValueUnquoted) 516 } 517 } 518} 519 520/// In double or single quoted attribute value. 
521/// 522/// ```markdown 523/// > | a <b c="d"> e 524/// ^ 525/// ``` 526pub fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State { 527 if tokenizer.current == Some(tokenizer.tokenize_state.marker) { 528 tokenizer.tokenize_state.marker = 0; 529 tokenizer.consume(); 530 State::Next(StateName::HtmlTextTagOpenAttributeValueQuotedAfter) 531 } else { 532 match tokenizer.current { 533 None => { 534 tokenizer.tokenize_state.marker = 0; 535 State::Nok 536 } 537 Some(b'\n') => { 538 tokenizer.attempt( 539 State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted), 540 State::Nok, 541 ); 542 State::Retry(StateName::HtmlTextLineEndingBefore) 543 } 544 _ => { 545 tokenizer.consume(); 546 State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted) 547 } 548 } 549 } 550} 551 552/// In unquoted attribute value. 553/// 554/// ```markdown 555/// > | a <b c=d> e 556/// ^ 557/// ``` 558pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State { 559 match tokenizer.current { 560 None | Some(b'"' | b'\'' | b'<' | b'=' | b'`') => State::Nok, 561 Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween), 562 Some(_) => { 563 tokenizer.consume(); 564 State::Next(StateName::HtmlTextTagOpenAttributeValueUnquoted) 565 } 566 } 567} 568 569/// After double or single quoted attribute value, before whitespace or the end 570/// of the tag. 571/// 572/// ```markdown 573/// > | a <b c="d"> e 574/// ^ 575/// ``` 576pub fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State { 577 match tokenizer.current { 578 Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween), 579 _ => State::Nok, 580 } 581} 582 583/// In certain circumstances of a tag where only an `>` is allowed. 
584/// 585/// ```markdown 586/// > | a <b c="d"> e 587/// ^ 588/// ``` 589pub fn end(tokenizer: &mut Tokenizer) -> State { 590 match tokenizer.current { 591 Some(b'>') => { 592 tokenizer.consume(); 593 tokenizer.exit(Name::HtmlTextData); 594 tokenizer.exit(Name::HtmlText); 595 State::Ok 596 } 597 _ => State::Nok, 598 } 599} 600 601/// At eol. 602/// 603/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about 604/// > empty tokens. 605/// 606/// ```markdown 607/// > | a <!--a 608/// ^ 609/// | b--> 610/// ``` 611pub fn line_ending_before(tokenizer: &mut Tokenizer) -> State { 612 match tokenizer.current { 613 Some(b'\n') => { 614 tokenizer.exit(Name::HtmlTextData); 615 tokenizer.enter(Name::LineEnding); 616 tokenizer.consume(); 617 tokenizer.exit(Name::LineEnding); 618 State::Next(StateName::HtmlTextLineEndingAfter) 619 } 620 _ => unreachable!("expected eol"), 621 } 622} 623 624/// After eol, at optional whitespace. 625/// 626/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about 627/// > empty tokens. 628/// 629/// ```markdown 630/// | a <!--a 631/// > | b--> 632/// ^ 633/// ``` 634pub fn line_ending_after(tokenizer: &mut Tokenizer) -> State { 635 if matches!(tokenizer.current, Some(b'\t' | b' ')) { 636 tokenizer.attempt( 637 State::Next(StateName::HtmlTextLineEndingAfterPrefix), 638 State::Nok, 639 ); 640 State::Retry(space_or_tab(tokenizer)) 641 } else { 642 State::Retry(StateName::HtmlTextLineEndingAfterPrefix) 643 } 644} 645 646/// After eol, after optional whitespace. 647/// 648/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about 649/// > empty tokens. 650/// 651/// ```markdown 652/// | a <!--a 653/// > | b--> 654/// ^ 655/// ``` 656pub fn line_ending_after_prefix(tokenizer: &mut Tokenizer) -> State { 657 tokenizer.enter(Name::HtmlTextData); 658 State::Ok 659}