Markdown parser fork with extended syntax for personal use.
Source file: `document.rs` (628 lines, 19 kB), reproduced below.
//! The document content type.
//!
//! **Document** represents the containers, such as block quotes, list items,
//! or GFM footnotes, which structure the document and contain other sections.
//!
//! The constructs found in flow are:
//!
//! * [Block quote][crate::construct::block_quote]
//! * [List item][crate::construct::list_item]
//! * [GFM: Footnote definition][crate::construct::gfm_footnote_definition]

use crate::event::{Content, Event, Kind, Link, Name};
use crate::message;
use crate::state::{Name as StateName, State};
use crate::subtokenize::divide_events;
use crate::tokenizer::{Container, ContainerState, Tokenizer};
use crate::util::skip;
use alloc::{boxed::Box, vec::Vec};

/// Phases where we can exit containers.
#[derive(Debug, PartialEq)]
enum Phase {
    /// After parsing a line of lazy flow which resulted in something that
    /// exits containers before the line.
    ///
    /// ```markdown
    ///   | * a
    /// > | ```js
    ///     ^
    ///   | b
    ///   | ```
    /// ```
    After,
    /// When a new container replaces an existing container.
    ///
    /// ```markdown
    ///   | * a
    /// > | > b
    ///     ^
    /// ```
    Prefix,
    /// After everything.
    ///
    /// ```markdown
    /// > | * a
    ///        ^
    /// ```
    Eof,
}

/// Start of document, at an optional BOM.
///
/// ```markdown
/// > | a
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    // The document tokenizer feeds each line’s rest (after container
    // prefixes) into this child tokenizer, which parses flow content.
    tokenizer.tokenize_state.document_child = Some(Box::new(Tokenizer::new(
        tokenizer.point.clone(),
        tokenizer.parse_state,
    )));

    // Whether or not a BOM is found, continue at the frontmatter check.
    tokenizer.attempt(
        State::Next(StateName::DocumentBeforeFrontmatter),
        State::Next(StateName::DocumentBeforeFrontmatter),
    );

    State::Retry(StateName::BomStart)
}

/// At optional frontmatter.
///
/// ```markdown
/// > | ---
///     ^
///   | title: Venus
///   | ---
/// ```
pub fn before_frontmatter(tokenizer: &mut Tokenizer) -> State {
    // Whether or not frontmatter is found, continue at the containers.
    // At document start the container stack is empty, so new containers
    // are checked directly.
    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewBefore),
        State::Next(StateName::DocumentContainerNewBefore),
    );
    State::Retry(StateName::FrontmatterStart)
}

/// At optional existing containers.
///
/// ```markdown
///   | * a
/// > | > b
///     ^
/// ```
pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State {
    // If there are more existing containers, check whether the next one continues.
    if tokenizer.tokenize_state.document_continued
        < tokenizer.tokenize_state.document_container_stack.len()
    {
        let container = &tokenizer.tokenize_state.document_container_stack
            [tokenizer.tokenize_state.document_continued];

        // Map the container kind to its continuation construct.
        let name = match container.kind {
            Container::BlockQuote => StateName::BlockQuoteContStart,
            Container::GfmFootnoteDefinition => StateName::GfmFootnoteDefinitionContStart,
            Container::ListItem => StateName::ListItemContStart,
        };

        // If the continuation fails, fall through to checking new containers.
        tokenizer.attempt(
            State::Next(StateName::DocumentContainerExistingAfter),
            State::Next(StateName::DocumentContainerNewBefore),
        );

        State::Retry(name)
    }
    // Otherwise, check new containers.
    else {
        State::Retry(StateName::DocumentContainerNewBefore)
    }
}

/// After continued existing container.
///
/// ```markdown
///   | * a
/// > |   b
///     ^
/// ```
pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State {
    // One more container continued; loop back for the next one on the stack.
    tokenizer.tokenize_state.document_continued += 1;
    State::Retry(StateName::DocumentContainerExistingBefore)
}

/// At new containers.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
    // If we have completely continued, restore the flow’s past `interrupt`
    // status.
    if tokenizer.tokenize_state.document_continued
        == tokenizer.tokenize_state.document_container_stack.len()
    {
        let child = tokenizer.tokenize_state.document_child.as_ref().unwrap();

        tokenizer.interrupt = child.interrupt;

        // …and if we’re in a concrete construct, new containers can’t “pierce”
        // into them.
        if child.concrete {
            return State::Retry(StateName::DocumentContainersAfter);
        }
    }

    // Check for a new container.
    // Block quote?
    // Add a new container at the end of the stack.
    let tail = tokenizer.tokenize_state.document_container_stack.len();
    tokenizer
        .tokenize_state
        .document_container_stack
        .push(ContainerState {
            kind: Container::BlockQuote,
            blank_initial: false,
            size: 0,
        });
    // Swap the existing container with the new one.
    // The candidate now sits at `document_continued`; `container_new_after`
    // or `container_new_before_not_footnote_definition` undoes this swap.
    tokenizer
        .tokenize_state
        .document_container_stack
        .swap(tokenizer.tokenize_state.document_continued, tail);

    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewAfter),
        State::Next(StateName::DocumentContainerNewBeforeNotBlockQuote),
    );
    State::Retry(StateName::BlockQuoteStart)
}

/// At new container, but not a block quote.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State {
    // List item?
    // We replace the empty block quote container for this new list item one.
    tokenizer.tokenize_state.document_container_stack
        [tokenizer.tokenize_state.document_continued] = ContainerState {
        kind: Container::ListItem,
        blank_initial: false,
        size: 0,
    };

    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewAfter),
        State::Next(StateName::DocumentContainerNewBeforeNotList),
    );
    State::Retry(StateName::ListItemStart)
}

/// At new container, but not a block quote or list item.
///
/// ```markdown
/// > | a
///     ^
/// ```
pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State {
    // Footnote definition?
    // We replace the empty list item container for this new footnote
    // definition one.
    tokenizer.tokenize_state.document_container_stack
        [tokenizer.tokenize_state.document_continued] = ContainerState {
        kind: Container::GfmFootnoteDefinition,
        blank_initial: false,
        size: 0,
    };

    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewAfter),
        State::Next(StateName::DocumentContainerNewBeforeNotGfmFootnoteDefinition),
    );
    State::Retry(StateName::GfmFootnoteDefinitionStart)
}

/// At new container, but not a block quote, list item, or footnote definition.
///
/// ```markdown
/// > | a
///     ^
/// ```
pub fn container_new_before_not_footnote_definition(tokenizer: &mut Tokenizer) -> State {
    // It wasn’t a new block quote, list item, or footnote definition.
    // Swap the new container (in the middle) with the existing one (at the end).
    // Drop what was in the middle.
    tokenizer
        .tokenize_state
        .document_container_stack
        .swap_remove(tokenizer.tokenize_state.document_continued);

    State::Retry(StateName::DocumentContainersAfter)
}

/// After new container.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
    // It was a new block quote, list item, or footnote definition.
    // Swap the new container (in the middle) with the existing one (at the end).
    // Take the new container.
    let container = tokenizer
        .tokenize_state
        .document_container_stack
        .swap_remove(tokenizer.tokenize_state.document_continued);

    // If we did not continue all existing containers, and there is a new one,
    // close the flow and those containers.
    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        if let Err(message) = exit_containers(tokenizer, &Phase::Prefix) {
            return State::Error(message);
        }
    }

    // We are “piercing” into the flow with a new container.
    tokenizer
        .tokenize_state
        .document_child
        .as_mut()
        .unwrap()
        .pierce = true;

    // The new container now counts as continued too.
    tokenizer
        .tokenize_state
        .document_container_stack
        .push(container);
    tokenizer.tokenize_state.document_continued += 1;
    tokenizer.interrupt = false;
    State::Retry(StateName::DocumentContainerNewBefore)
}

/// After containers, at flow.
///
/// ```markdown
/// > | * a
///     ^
/// > | > b
///     ^
/// ```
pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    // The line is lazy if not all containers on the stack were continued.
    child.lazy = tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len();
    child.define_skip(tokenizer.point.clone());

    if tokenizer.current.is_none() {
        State::Retry(StateName::DocumentFlowEnd)
    } else {
        // Wrap the rest of the line in a `Data` event linked to the previous
        // one; the linked chain is retokenized as flow later (`divide_events`).
        let current = tokenizer.events.len();
        let previous = tokenizer.tokenize_state.document_data_index;
        if let Some(previous) = previous {
            tokenizer.events[previous].link.as_mut().unwrap().next = Some(current);
        }
        tokenizer.tokenize_state.document_data_index = Some(current);
        tokenizer.enter_link(
            Name::Data,
            Link {
                previous,
                next: None,
                content: Content::Flow,
            },
        );
        State::Retry(StateName::DocumentFlowInside)
    }
}

/// In flow.
///
/// ```markdown
/// > | * ab
///       ^
/// ```
pub fn flow_inside(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        None => {
            tokenizer.exit(Name::Data);
            State::Retry(StateName::DocumentFlowEnd)
        }
        // Note: EOL is part of data.
        Some(b'\n') => {
            tokenizer.consume();
            tokenizer.exit(Name::Data);
            State::Next(StateName::DocumentFlowEnd)
        }
        Some(_) => {
            tokenizer.consume();
            State::Next(StateName::DocumentFlowInside)
        }
    }
}

/// After flow (after eol or at eof).
///
/// ```markdown
///   | * a
/// > | > b
///     ^  ^
/// ```
pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();
    // Resume the child flow tokenizer where it was suspended (or start it).
    let state = tokenizer
        .tokenize_state
        .document_child_state
        .take()
        .unwrap_or(State::Next(StateName::FlowStart));

    // Reserve a slot for container exits belonging to this line; filled by
    // `exit_containers` and injected into the child events in `resolve`.
    tokenizer.tokenize_state.document_exits.push(None);

    // Feed this line’s rest into the child and remember where it suspends.
    let state = child.push(
        (child.point.index, child.point.vs),
        (tokenizer.point.index, tokenizer.point.vs),
        state,
    );

    tokenizer.tokenize_state.document_child_state = Some(state);

    // If we’re in a lazy line, and the previous (lazy or not) line is something
    // that can be lazy, and this line is that too, allow it.
    //
    // Accept:
    //
    // ```markdown
    //   | * a
    // > |   b
    //     ^
    //   | ```
    // ```
    //
    // Do not accept:
    //
    // ```markdown
    //   | * # a
    // > |   b
    //     ^
    //   | ```
    // ```
    //
    // Do not accept:
    //
    // ```markdown
    //   | * a
    // > |   # b
    //     ^
    //   | ```
    // ```
    let mut document_lazy_continuation_current = false;
    let mut stack_index = child.stack.len();

    // Use two algo’s: one for when we’re suspended or in multiline things
    // like definitions, another for when we fed the line ending and closed.
    while !document_lazy_continuation_current && stack_index > 0 {
        stack_index -= 1;
        let name = &child.stack[stack_index];
        if name == &Name::Content || name == &Name::GfmTableHead {
            document_lazy_continuation_current = true;
        }
    }

    // …another because we parse each “rest” line as a paragraph, and we passed
    // a EOL already.
    if !document_lazy_continuation_current && !child.events.is_empty() {
        let before = skip::opt_back(&child.events, child.events.len() - 1, &[Name::LineEnding]);
        let name = &child.events[before].name;
        if name == &Name::Content || name == &Name::HeadingSetextUnderline {
            document_lazy_continuation_current = true;
        }
    }

    // Reset “piercing”.
    child.pierce = false;

    // A lazy line between two lazily-continuable lines counts as continuing
    // every container on the stack.
    if child.lazy
        && tokenizer.tokenize_state.document_lazy_accepting_before
        && document_lazy_continuation_current
    {
        tokenizer.tokenize_state.document_continued =
            tokenizer.tokenize_state.document_container_stack.len();
    }

    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        let result = exit_containers(tokenizer, &Phase::After);
        // `Phase::After` doesn’t deal with flow: it only generates exits for
        // containers.
        // And that never errors.
        debug_assert!(result.is_ok(), "did not expect error when exiting");
    }

    if tokenizer.current.is_none() {
        // At eof: close all remaining containers, flush the child, and stitch
        // everything together.
        tokenizer.tokenize_state.document_continued = 0;
        if let Err(message) = exit_containers(tokenizer, &Phase::Eof) {
            return State::Error(message);
        }
        resolve(tokenizer);
        State::Ok
    } else {
        tokenizer.tokenize_state.document_continued = 0;
        tokenizer.tokenize_state.document_lazy_accepting_before =
            document_lazy_continuation_current;
        // Containers would only be interrupting if we’ve continued.
        tokenizer.interrupt = false;
        State::Retry(StateName::DocumentContainerExistingBefore)
    }
}

/// Close containers (and flow if needed).
///
/// Pops every container above `document_continued` off the stack, flushes the
/// child flow tokenizer unless `phase` is [`Phase::After`], and records the
/// generated exit events per line in `document_exits` for later injection.
fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) -> Result<(), message::Message> {
    let mut stack_close = tokenizer
        .tokenize_state
        .document_container_stack
        .split_off(tokenizer.tokenize_state.document_continued);

    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    // Flush if needed.
    if *phase != Phase::After {
        let state = tokenizer
            .tokenize_state
            .document_child_state
            .take()
            .unwrap_or(State::Next(StateName::FlowStart));

        child.flush(state, false)?;
    }

    if !stack_close.is_empty() {
        // `Phase::After` closes containers *before* the current line, so the
        // exits belong to the previous line’s slot (one further back).
        let index = tokenizer.tokenize_state.document_exits.len()
            - (if *phase == Phase::After { 2 } else { 1 });
        let mut exits = Vec::with_capacity(stack_close.len());

        // Close from the innermost container outwards.
        while let Some(container) = stack_close.pop() {
            let name = match container.kind {
                Container::BlockQuote => Name::BlockQuote,
                Container::GfmFootnoteDefinition => Name::GfmFootnoteDefinition,
                Container::ListItem => Name::ListItem,
            };

            exits.push(Event {
                kind: Kind::Exit,
                name: name.clone(),
                point: tokenizer.point.clone(),
                link: None,
            });

            // Remove the matching open event from the tokenizer’s own stack.
            let mut stack_index = tokenizer.stack.len();
            let mut found = false;

            while stack_index > 0 {
                stack_index -= 1;

                if tokenizer.stack[stack_index] == name {
                    tokenizer.stack.remove(stack_index);
                    found = true;
                    break;
                }
            }

            debug_assert!(found, "expected to find container event to exit");
        }

        debug_assert!(
            tokenizer.tokenize_state.document_exits[index].is_none(),
            "expected no exits yet"
        );
        tokenizer.tokenize_state.document_exits[index] = Some(exits);
    }

    child.interrupt = false;

    Ok(())
}

// Inject everything together.
fn resolve(tokenizer: &mut Tokenizer) {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    // First, add the container exits into `child`.
    let mut child_index = 0;
    let mut line = 0;

    while child_index < child.events.len() {
        if child.events[child_index].kind == Kind::Exit
            && matches!(
                child.events[child_index].name,
                Name::LineEnding | Name::BlankLineEnding
            )
        {
            // Inject before `Enter:LineEnding`.
            // Note: an `Exit` always has a preceding `Enter`, so
            // `child_index - 1` cannot underflow here.
            let mut inject_index = child_index - 1;
            let mut point = &child.events[inject_index].point;

            // Skip past any further exits that close at this line ending.
            while child_index + 1 < child.events.len()
                && child.events[child_index + 1].kind == Kind::Exit
            {
                child_index += 1;
                point = &child.events[child_index].point;
                // Inject after `Exit:*`.
                inject_index = child_index + 1;
            }

            if line < tokenizer.tokenize_state.document_exits.len() {
                if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
                    // Retarget the recorded exits to the position found above.
                    let mut exit_index = 0;
                    while exit_index < exits.len() {
                        exits[exit_index].point = point.clone();
                        exit_index += 1;
                    }

                    child.map.add(inject_index, 0, exits);
                }
            }

            line += 1;
        }

        child_index += 1;
    }

    child.map.consume(&mut child.events);

    // Find the first `Data` event that links flow content.
    let mut flow_index = skip::to(&tokenizer.events, 0, &[Name::Data]);
    while flow_index < tokenizer.events.len()
        // To do: use `!is_some_and()` when that’s stable.
        && (tokenizer.events[flow_index].link.is_none()
            || tokenizer.events[flow_index].link.as_ref().unwrap().content != Content::Flow)
    {
        flow_index = skip::to(&tokenizer.events, flow_index + 1, &[Name::Data]);
    }

    // Now, add all child events into our parent document tokenizer.
    divide_events(
        &mut tokenizer.map,
        &tokenizer.events,
        flow_index,
        &mut child.events,
        (0, 0),
    );

    // Replace the flow data with actual events.
    tokenizer.map.consume(&mut tokenizer.events);

    // Now, add some final container exits due to the EOF.
    // We can’t inject them into the child earlier, as they are “outside” its
    // linked data.
    if line < tokenizer.tokenize_state.document_exits.len() {
        if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
            let mut exit_index = 0;
            while exit_index < exits.len() {
                exits[exit_index].point = tokenizer.point.clone();
                exit_index += 1;
            }

            tokenizer.events.append(&mut exits);
        }
    }

    // Add the resolvers from child.
    tokenizer
        .resolvers
        .append(&mut child.resolvers.split_off(0));

    // …and carry over the definitions the child found, for reference matching.
    tokenizer
        .tokenize_state
        .definitions
        .append(&mut child.tokenize_state.definitions.split_off(0));
}