// Markdown parser fork with extended syntax for personal use.
1//! The document content type.
2//!
3//! **Document** represents the containers, such as block quotes, list items,
4//! or GFM footnotes, which structure the document and contain other sections.
5//!
6//! The constructs found in flow are:
7//!
8//! * [Block quote][crate::construct::block_quote]
9//! * [List item][crate::construct::list_item]
10//! * [GFM: Footnote definition][crate::construct::gfm_footnote_definition]
11
12use crate::event::{Content, Event, Kind, Link, Name};
13use crate::message;
14use crate::state::{Name as StateName, State};
15use crate::subtokenize::divide_events;
16use crate::tokenizer::{Container, ContainerState, Tokenizer};
17use crate::util::skip;
18use alloc::{boxed::Box, vec::Vec};
19
/// Phases where we can exit containers.
///
/// Used by [`exit_containers`] to decide whether the child (flow) tokenizer
/// must be flushed and which `document_exits` slot receives the exit events.
//
// `Eq` is derived alongside `PartialEq`: the enum is fieldless, so equality
// is a full equivalence relation (clippy: `derive_partial_eq_without_eq`).
#[derive(Debug, PartialEq, Eq)]
enum Phase {
    /// After parsing a line of lazy flow which resulted in something that
    /// exits containers before the line.
    ///
    /// ```markdown
    /// | * a
    /// > | ```js
    /// ^
    /// | b
    /// | ```
    /// ```
    After,
    /// When a new container replaces an existing container.
    ///
    /// ```markdown
    /// | * a
    /// > | > b
    /// ^
    /// ```
    Prefix,
    /// After everything.
    ///
    /// ```markdown
    /// > | * a
    /// ^
    /// ```
    Eof,
}
50
51/// Start of document, at an optional BOM.
52///
53/// ```markdown
54/// > | a
55/// ^
56/// ```
57pub fn start(tokenizer: &mut Tokenizer) -> State {
58 tokenizer.tokenize_state.document_child = Some(Box::new(Tokenizer::new(
59 tokenizer.point.clone(),
60 tokenizer.parse_state,
61 )));
62
63 tokenizer.attempt(
64 State::Next(StateName::DocumentBeforeFrontmatter),
65 State::Next(StateName::DocumentBeforeFrontmatter),
66 );
67
68 State::Retry(StateName::BomStart)
69}
70
/// At optional frontmatter.
///
/// ```markdown
/// > | ---
/// ^
/// | title: Venus
/// | ---
/// ```
pub fn before_frontmatter(tokenizer: &mut Tokenizer) -> State {
    // Whether or not frontmatter matched, continue with (new) containers:
    // both branches intentionally go to the same state.
    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewBefore),
        State::Next(StateName::DocumentContainerNewBefore),
    );
    State::Retry(StateName::FrontmatterStart)
}
86
87/// At optional existing containers.
88//
89/// ```markdown
90/// | * a
91/// > | > b
92/// ^
93/// ```
94pub fn container_existing_before(tokenizer: &mut Tokenizer) -> State {
95 // If there are more existing containers, check whether the next one continues.
96 if tokenizer.tokenize_state.document_continued
97 < tokenizer.tokenize_state.document_container_stack.len()
98 {
99 let container = &tokenizer.tokenize_state.document_container_stack
100 [tokenizer.tokenize_state.document_continued];
101
102 let name = match container.kind {
103 Container::BlockQuote => StateName::BlockQuoteContStart,
104 Container::GfmFootnoteDefinition => StateName::GfmFootnoteDefinitionContStart,
105 Container::ListItem => StateName::ListItemContStart,
106 };
107
108 tokenizer.attempt(
109 State::Next(StateName::DocumentContainerExistingAfter),
110 State::Next(StateName::DocumentContainerNewBefore),
111 );
112
113 State::Retry(name)
114 }
115 // Otherwise, check new containers.
116 else {
117 State::Retry(StateName::DocumentContainerNewBefore)
118 }
119}
120
/// After continued existing container.
///
/// ```markdown
/// | * a
/// > | b
/// ^
/// ```
pub fn container_existing_after(tokenizer: &mut Tokenizer) -> State {
    // This container continued: count it and check the next one in the stack.
    tokenizer.tokenize_state.document_continued += 1;
    State::Retry(StateName::DocumentContainerExistingBefore)
}
132
/// At new containers.
///
/// ```markdown
/// > | * a
/// ^
/// > | > b
/// ^
/// ```
pub fn container_new_before(tokenizer: &mut Tokenizer) -> State {
    // If we have completely continued, restore the flow’s past `interrupt`
    // status.
    if tokenizer.tokenize_state.document_continued
        == tokenizer.tokenize_state.document_container_stack.len()
    {
        let child = tokenizer.tokenize_state.document_child.as_ref().unwrap();

        tokenizer.interrupt = child.interrupt;

        // …and if we’re in a concrete construct, new containers can’t “pierce”
        // into them.
        if child.concrete {
            return State::Retry(StateName::DocumentContainersAfter);
        }
    }

    // Check for a new container.
    // Block quote?
    // Add a new container at the end of the stack.
    // This is a speculative entry: if the block quote doesn’t start, later
    // states replace or remove it at index `document_continued`.
    let tail = tokenizer.tokenize_state.document_container_stack.len();
    tokenizer
        .tokenize_state
        .document_container_stack
        .push(ContainerState {
            kind: Container::BlockQuote,
            blank_initial: false,
            size: 0,
        });
    // Swap the existing container with the new one, so the speculative
    // container sits at `document_continued` and the existing one at the end.
    tokenizer
        .tokenize_state
        .document_container_stack
        .swap(tokenizer.tokenize_state.document_continued, tail);

    tokenizer.attempt(
        State::Next(StateName::DocumentContainerNewAfter),
        State::Next(StateName::DocumentContainerNewBeforeNotBlockQuote),
    );
    State::Retry(StateName::BlockQuoteStart)
}
182
183/// At new container, but not a block quote.
184//
185/// ```markdown
186/// > | * a
187/// ^
188/// ```
189pub fn container_new_before_not_block_quote(tokenizer: &mut Tokenizer) -> State {
190 // List item?
191 // We replace the empty block quote container for this new list item one.
192 tokenizer.tokenize_state.document_container_stack
193 [tokenizer.tokenize_state.document_continued] = ContainerState {
194 kind: Container::ListItem,
195 blank_initial: false,
196 size: 0,
197 };
198
199 tokenizer.attempt(
200 State::Next(StateName::DocumentContainerNewAfter),
201 State::Next(StateName::DocumentContainerNewBeforeNotList),
202 );
203 State::Retry(StateName::ListItemStart)
204}
205
206/// At new container, but not a block quote or list item.
207//
208/// ```markdown
209/// > | a
210/// ^
211/// ```
212pub fn container_new_before_not_list(tokenizer: &mut Tokenizer) -> State {
213 // Footnote definition?
214 // We replace the empty list item container for this new footnote
215 // definition one.
216 tokenizer.tokenize_state.document_container_stack
217 [tokenizer.tokenize_state.document_continued] = ContainerState {
218 kind: Container::GfmFootnoteDefinition,
219 blank_initial: false,
220 size: 0,
221 };
222
223 tokenizer.attempt(
224 State::Next(StateName::DocumentContainerNewAfter),
225 State::Next(StateName::DocumentContainerNewBeforeNotGfmFootnoteDefinition),
226 );
227 State::Retry(StateName::GfmFootnoteDefinitionStart)
228}
229
230/// At new container, but not a block quote, list item, or footnote definition.
231//
232/// ```markdown
233/// > | a
234/// ^
235/// ```
236pub fn container_new_before_not_footnote_definition(tokenizer: &mut Tokenizer) -> State {
237 // It wasn’t a new block quote, list item, or footnote definition.
238 // Swap the new container (in the middle) with the existing one (at the end).
239 // Drop what was in the middle.
240 tokenizer
241 .tokenize_state
242 .document_container_stack
243 .swap_remove(tokenizer.tokenize_state.document_continued);
244
245 State::Retry(StateName::DocumentContainersAfter)
246}
247
/// After new container.
///
/// ```markdown
/// > | * a
/// ^
/// > | > b
/// ^
/// ```
pub fn container_new_after(tokenizer: &mut Tokenizer) -> State {
    // It was a new block quote, list item, or footnote definition.
    // Swap the new container (in the middle) with the existing one (at the end).
    // Take the new container.
    let container = tokenizer
        .tokenize_state
        .document_container_stack
        .swap_remove(tokenizer.tokenize_state.document_continued);

    // If we did not continue all existing containers, and there is a new one,
    // close the flow and those containers.
    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        if let Err(message) = exit_containers(tokenizer, &Phase::Prefix) {
            return State::Error(message);
        }
    }

    // We are “piercing” into the flow with a new container.
    tokenizer
        .tokenize_state
        .document_child
        .as_mut()
        .unwrap()
        .pierce = true;

    // Push the taken container back on as continued, and look for more new
    // containers (cleared again in `flow_end`).
    tokenizer
        .tokenize_state
        .document_container_stack
        .push(container);
    tokenizer.tokenize_state.document_continued += 1;
    tokenizer.interrupt = false;
    State::Retry(StateName::DocumentContainerNewBefore)
}
291
/// After containers, at flow.
///
/// ```markdown
/// > | * a
/// ^
/// > | > b
/// ^
/// ```
pub fn containers_after(tokenizer: &mut Tokenizer) -> State {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    // The line is lazy when not every existing container was continued.
    child.lazy = tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len();
    child.define_skip(tokenizer.point.clone());

    if tokenizer.current.is_none() {
        State::Retry(StateName::DocumentFlowEnd)
    } else {
        // Create a `Data` event for this line and chain it to the previous
        // line’s one, building the linked list of `Content::Flow` chunks
        // that `resolve` divides later.
        let current = tokenizer.events.len();
        let previous = tokenizer.tokenize_state.document_data_index;
        if let Some(previous) = previous {
            tokenizer.events[previous].link.as_mut().unwrap().next = Some(current);
        }
        tokenizer.tokenize_state.document_data_index = Some(current);
        tokenizer.enter_link(
            Name::Data,
            Link {
                previous,
                next: None,
                content: Content::Flow,
            },
        );
        State::Retry(StateName::DocumentFlowInside)
    }
}
327
328/// In flow.
329//
330/// ```markdown
331/// > | * ab
332/// ^
333/// ```
334pub fn flow_inside(tokenizer: &mut Tokenizer) -> State {
335 match tokenizer.current {
336 None => {
337 tokenizer.exit(Name::Data);
338 State::Retry(StateName::DocumentFlowEnd)
339 }
340 // Note: EOL is part of data.
341 Some(b'\n') => {
342 tokenizer.consume();
343 tokenizer.exit(Name::Data);
344 State::Next(StateName::DocumentFlowEnd)
345 }
346 Some(_) => {
347 tokenizer.consume();
348 State::Next(StateName::DocumentFlowInside)
349 }
350 }
351}
352
/// After flow (after eol or at eof).
///
/// ```markdown
/// | * a
/// > | > b
/// ^ ^
/// ```
pub fn flow_end(tokenizer: &mut Tokenizer) -> State {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();
    // Resume the child where it left off, or at the start of flow.
    let state = tokenizer
        .tokenize_state
        .document_child_state
        .take()
        .unwrap_or(State::Next(StateName::FlowStart));

    // Reserve this line’s exit slot; `exit_containers` fills it in.
    tokenizer.tokenize_state.document_exits.push(None);

    // Feed the span between the child’s point and our point to the child.
    let state = child.push(
        (child.point.index, child.point.vs),
        (tokenizer.point.index, tokenizer.point.vs),
        state,
    );

    tokenizer.tokenize_state.document_child_state = Some(state);

    // If we’re in a lazy line, and the previous (lazy or not) line is something
    // that can be lazy, and this line is that too, allow it.
    //
    // Accept:
    //
    // ```markdown
    // | * a
    // > | b
    // ^
    // | ```
    // ```
    //
    // Do not accept:
    //
    // ```markdown
    // | * # a
    // > | b
    // ^
    // | ```
    // ```
    //
    // Do not accept:
    //
    // ```markdown
    // | * a
    // > | # b
    // ^
    // | ```
    // ```
    let mut document_lazy_continuation_current = false;
    let mut stack_index = child.stack.len();

    // Use two algo’s: one for when we’re suspended or in multiline things
    // like definitions, another for when we fed the line ending and closed.
    while !document_lazy_continuation_current && stack_index > 0 {
        stack_index -= 1;
        let name = &child.stack[stack_index];
        if name == &Name::Content || name == &Name::GfmTableHead {
            document_lazy_continuation_current = true;
        }
    }

    // …another because we parse each “rest” line as a paragraph, and we passed
    // a EOL already.
    if !document_lazy_continuation_current && !child.events.is_empty() {
        let before = skip::opt_back(&child.events, child.events.len() - 1, &[Name::LineEnding]);
        let name = &child.events[before].name;
        if name == &Name::Content || name == &Name::HeadingSetextUnderline {
            document_lazy_continuation_current = true;
        }
    }

    // Reset “piercing”.
    child.pierce = false;

    // An accepted lazy line counts as continuing every container.
    if child.lazy
        && tokenizer.tokenize_state.document_lazy_accepting_before
        && document_lazy_continuation_current
    {
        tokenizer.tokenize_state.document_continued =
            tokenizer.tokenize_state.document_container_stack.len();
    }

    if tokenizer.tokenize_state.document_continued
        != tokenizer.tokenize_state.document_container_stack.len()
    {
        let result = exit_containers(tokenizer, &Phase::After);
        // `Phase::After` doesn’t deal with flow: it only generates exits for
        // containers.
        // And that never errors.
        debug_assert!(result.is_ok(), "did not expect error when exiting");
    }

    if tokenizer.current.is_none() {
        // At EOF: close everything that remains, then stitch child and
        // parent events together.
        tokenizer.tokenize_state.document_continued = 0;
        if let Err(message) = exit_containers(tokenizer, &Phase::Eof) {
            return State::Error(message);
        }
        resolve(tokenizer);
        State::Ok
    } else {
        tokenizer.tokenize_state.document_continued = 0;
        tokenizer.tokenize_state.document_lazy_accepting_before =
            document_lazy_continuation_current;
        // Containers would only be interrupting if we’ve continued.
        tokenizer.interrupt = false;
        State::Retry(StateName::DocumentContainerExistingBefore)
    }
}
467
/// Close containers (and flow if needed).
///
/// Splits the containers that were not continued off the stack, optionally
/// flushes the child (flow) tokenizer, and stores the generated container
/// exit events in `document_exits` so `resolve` can inject them later.
fn exit_containers(tokenizer: &mut Tokenizer, phase: &Phase) -> Result<(), message::Message> {
    // Everything past `document_continued` did not continue this line.
    let mut stack_close = tokenizer
        .tokenize_state
        .document_container_stack
        .split_off(tokenizer.tokenize_state.document_continued);

    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    // Flush if needed.
    if *phase != Phase::After {
        let state = tokenizer
            .tokenize_state
            .document_child_state
            .take()
            .unwrap_or(State::Next(StateName::FlowStart));

        child.flush(state, false)?;
    }

    if !stack_close.is_empty() {
        // NOTE(review): for `Phase::After` a slot for the new line was
        // already pushed in `flow_end`, so the exits go into the previous
        // line’s slot (`- 2`); otherwise into the current one (`- 1`).
        let index = tokenizer.tokenize_state.document_exits.len()
            - (if *phase == Phase::After { 2 } else { 1 });
        let mut exits = Vec::with_capacity(stack_close.len());

        // Close from the innermost container outwards.
        while let Some(container) = stack_close.pop() {
            let name = match container.kind {
                Container::BlockQuote => Name::BlockQuote,
                Container::GfmFootnoteDefinition => Name::GfmFootnoteDefinition,
                Container::ListItem => Name::ListItem,
            };

            exits.push(Event {
                kind: Kind::Exit,
                name: name.clone(),
                point: tokenizer.point.clone(),
                link: None,
            });

            // Remove the matching open event from the tokenizer’s stack,
            // searching from the top down.
            let mut stack_index = tokenizer.stack.len();
            let mut found = false;

            while stack_index > 0 {
                stack_index -= 1;

                if tokenizer.stack[stack_index] == name {
                    tokenizer.stack.remove(stack_index);
                    found = true;
                    break;
                }
            }

            debug_assert!(found, "expected to find container event to exit");
        }

        debug_assert!(
            tokenizer.tokenize_state.document_exits[index].is_none(),
            "expected no exits yet"
        );
        tokenizer.tokenize_state.document_exits[index] = Some(exits);
    }

    child.interrupt = false;

    Ok(())
}
534
/// Inject everything together.
///
/// Moves the per-line container exits (collected in `document_exits`) into
/// the child’s events, then divides the child events over the parent’s
/// linked `Content::Flow` data chunks.
fn resolve(tokenizer: &mut Tokenizer) {
    let child = tokenizer.tokenize_state.document_child.as_mut().unwrap();

    // First, add the container exits into `child`.
    let mut child_index = 0;
    let mut line = 0;

    while child_index < child.events.len() {
        // Each line ending (blank or not) marks the end of a line; its exits
        // (if any) are injected around it.
        if child.events[child_index].kind == Kind::Exit
            && matches!(
                child.events[child_index].name,
                Name::LineEnding | Name::BlankLineEnding
            )
        {
            // Inject before `Enter:LineEnding`.
            let mut inject_index = child_index - 1;
            let mut point = &child.events[inject_index].point;

            // If exits directly follow the line ending, move past them and
            // inject after the last one instead.
            while child_index + 1 < child.events.len()
                && child.events[child_index + 1].kind == Kind::Exit
            {
                child_index += 1;
                point = &child.events[child_index].point;
                // Inject after `Exit:*`.
                inject_index = child_index + 1;
            }

            if line < tokenizer.tokenize_state.document_exits.len() {
                if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
                    // Reposition the exits at the chosen injection point.
                    let mut exit_index = 0;
                    while exit_index < exits.len() {
                        exits[exit_index].point = point.clone();
                        exit_index += 1;
                    }

                    child.map.add(inject_index, 0, exits);
                }
            }

            line += 1;
        }

        child_index += 1;
    }

    child.map.consume(&mut child.events);

    // Find the first `Data` event that links flow content.
    let mut flow_index = skip::to(&tokenizer.events, 0, &[Name::Data]);
    while flow_index < tokenizer.events.len()
        // To do: use `!is_some_and()` when that’s stable.
        && (tokenizer.events[flow_index].link.is_none()
            || tokenizer.events[flow_index].link.as_ref().unwrap().content != Content::Flow)
    {
        flow_index = skip::to(&tokenizer.events, flow_index + 1, &[Name::Data]);
    }

    // Now, add all child events into our parent document tokenizer.
    divide_events(
        &mut tokenizer.map,
        &tokenizer.events,
        flow_index,
        &mut child.events,
        (0, 0),
    );

    // Replace the flow data with actual events.
    tokenizer.map.consume(&mut tokenizer.events);

    // Now, add some final container exits due to the EOF.
    // We can’t inject them into the child earlier, as they are “outside” its
    // linked data.
    if line < tokenizer.tokenize_state.document_exits.len() {
        if let Some(mut exits) = tokenizer.tokenize_state.document_exits[line].take() {
            let mut exit_index = 0;
            while exit_index < exits.len() {
                exits[exit_index].point = tokenizer.point.clone();
                exit_index += 1;
            }

            tokenizer.events.append(&mut exits);
        }
    }

    // Add the resolvers from child.
    tokenizer
        .resolvers
        .append(&mut child.resolvers.split_off(0));

    // Also adopt the definitions the child collected.
    tokenizer
        .tokenize_state
        .definitions
        .append(&mut child.tokenize_state.definitions.split_off(0));
628}