Markdown parser fork with extended syntax for personal use.
at hack 281 lines 10 kB view raw
//! Heading (setext) occurs in the [flow][] content type.
//!
//! ## Grammar
//!
//! Heading (setext) forms with the following BNF
//! (<small>see [construct][crate::construct] for character groups</small>):
//!
//! ```bnf
//! heading_setext ::= paragraph eol *space_or_tab (1*'-' | 1*'=') *space_or_tab
//!
//! ; See the `paragraph` construct for the BNF of that part.
//! ```
//!
//! As this construct occurs in flow, like all flow constructs, it must be
//! followed by an eol (line ending) or eof (end of file).
//!
//! See [`paragraph`][paragraph] for grammar, notes, and recommendations on
//! that part.
//!
//! In markdown, it is also possible to create headings with a
//! [heading (atx)][heading_atx] construct.
//! The benefit of setext headings is that their text can include line endings,
//! and by extension also hard breaks (e.g., with
//! [hard break (escape)][hard_break_escape]).
//! However, their limit is that they cannot form `<h3>` through `<h6>`
//! headings.
//!
//! [Thematic breaks][thematic_break] formed with dashes and without whitespace
//! could be interpreted as a heading (setext).
//! Which one forms depends on whether there is text directly in front of the
//! sequence.
//!
//! > 🏛 **Background**: the word *setext* originates from a small markup
//! > language by Ian Feldman from 1991.
//! > See [*§ Setext* on Wikipedia][wiki_setext] for more info.
//! > The word *atx* originates from a tiny markup language by Aaron Swartz
//! > from 2002.
//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for
//! > more info.
//!
//! ## HTML
//!
//! Heading (setext) in markdown relates to the `<h1>` and `<h2>` elements in
//! HTML.
//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the
//! HTML spec][html] for more info.
//!
//! ## Recommendation
//!
//! Always use heading (atx), never heading (setext).
//!
//!
## Tokens 53//! 54//! * [`HeadingSetext`][Name::HeadingSetext] 55//! * [`HeadingSetextText`][Name::HeadingSetextText] 56//! * [`HeadingSetextUnderline`][Name::HeadingSetextUnderline] 57//! * [`HeadingSetextUnderlineSequence`][Name::HeadingSetextUnderlineSequence] 58//! 59//! ## References 60//! 61//! * [`setext-underline.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/setext-underline.js) 62//! * [*§ 4.3 Setext headings* in `CommonMark`](https://spec.commonmark.org/0.31/#setext-headings) 63//! 64//! [flow]: crate::construct::flow 65//! [paragraph]: crate::construct::paragraph 66//! [heading_atx]: crate::construct::heading_atx 67//! [thematic_break]: crate::construct::thematic_break 68//! [hard_break_escape]: crate::construct::hard_break_escape 69//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements 70//! [wiki_setext]: https://en.wikipedia.org/wiki/Setext 71//! [atx]: http://www.aaronsw.com/2002/atx/ 72 73use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; 74use crate::event::{Content, Event, Kind, Link, Name}; 75use crate::resolve::Name as ResolveName; 76use crate::state::{Name as StateName, State}; 77use crate::subtokenize::Subresult; 78use crate::tokenizer::Tokenizer; 79use crate::util::{constant::TAB_SIZE, skip}; 80use alloc::vec; 81 82/// At start of heading (setext) underline. 83/// 84/// ```markdown 85/// | aa 86/// > | == 87/// ^ 88/// ``` 89pub fn start(tokenizer: &mut Tokenizer) -> State { 90 if tokenizer.parse_state.options.constructs.heading_setext 91 && !tokenizer.lazy 92 && !tokenizer.pierce 93 // Require a paragraph before. 
94 && (!tokenizer.events.is_empty() 95 && matches!(tokenizer.events[skip::opt_back( 96 &tokenizer.events, 97 tokenizer.events.len() - 1, 98 &[Name::LineEnding, Name::SpaceOrTab], 99 )] 100 .name, Name::Content | Name::HeadingSetextUnderline)) 101 { 102 tokenizer.enter(Name::HeadingSetextUnderline); 103 104 if matches!(tokenizer.current, Some(b'\t' | b' ')) { 105 tokenizer.attempt(State::Next(StateName::HeadingSetextBefore), State::Nok); 106 State::Retry(space_or_tab_min_max( 107 tokenizer, 108 0, 109 if tokenizer.parse_state.options.constructs.code_indented { 110 TAB_SIZE - 1 111 } else { 112 usize::MAX 113 }, 114 )) 115 } else { 116 State::Retry(StateName::HeadingSetextBefore) 117 } 118 } else { 119 State::Nok 120 } 121} 122 123/// After optional whitespace, at `-` or `=`. 124/// 125/// ```markdown 126/// | aa 127/// > | == 128/// ^ 129/// ``` 130pub fn before(tokenizer: &mut Tokenizer) -> State { 131 match tokenizer.current { 132 Some(b'-' | b'=') => { 133 tokenizer.tokenize_state.marker = tokenizer.current.unwrap(); 134 tokenizer.enter(Name::HeadingSetextUnderlineSequence); 135 State::Retry(StateName::HeadingSetextInside) 136 } 137 _ => State::Nok, 138 } 139} 140 141/// In sequence. 142/// 143/// ```markdown 144/// | aa 145/// > | == 146/// ^ 147/// ``` 148pub fn inside(tokenizer: &mut Tokenizer) -> State { 149 if tokenizer.current == Some(tokenizer.tokenize_state.marker) { 150 tokenizer.consume(); 151 State::Next(StateName::HeadingSetextInside) 152 } else { 153 tokenizer.tokenize_state.marker = 0; 154 tokenizer.exit(Name::HeadingSetextUnderlineSequence); 155 156 if matches!(tokenizer.current, Some(b'\t' | b' ')) { 157 tokenizer.attempt(State::Next(StateName::HeadingSetextAfter), State::Nok); 158 State::Retry(space_or_tab(tokenizer)) 159 } else { 160 State::Retry(StateName::HeadingSetextAfter) 161 } 162 } 163} 164 165/// After sequence, after optional whitespace. 
166/// 167/// ```markdown 168/// | aa 169/// > | == 170/// ^ 171/// ``` 172pub fn after(tokenizer: &mut Tokenizer) -> State { 173 match tokenizer.current { 174 None | Some(b'\n') => { 175 // Feel free to interrupt. 176 tokenizer.interrupt = false; 177 tokenizer.register_resolver(ResolveName::HeadingSetext); 178 tokenizer.exit(Name::HeadingSetextUnderline); 179 State::Ok 180 } 181 _ => State::Nok, 182 } 183} 184 185/// Resolve heading (setext). 186pub fn resolve(tokenizer: &mut Tokenizer) -> Option<Subresult> { 187 let mut enter = skip::to(&tokenizer.events, 0, &[Name::HeadingSetextUnderline]); 188 189 while enter < tokenizer.events.len() { 190 let exit = skip::to( 191 &tokenizer.events, 192 enter + 1, 193 &[Name::HeadingSetextUnderline], 194 ); 195 196 // Find paragraph before 197 let paragraph_exit_before = skip::opt_back( 198 &tokenizer.events, 199 enter - 1, 200 &[Name::SpaceOrTab, Name::LineEnding, Name::BlockQuotePrefix], 201 ); 202 203 // There’s a paragraph before: this is a setext heading. 204 if tokenizer.events[paragraph_exit_before].name == Name::Paragraph { 205 let paragraph_enter = skip::to_back( 206 &tokenizer.events, 207 paragraph_exit_before - 1, 208 &[Name::Paragraph], 209 ); 210 211 // Change types of Enter:Paragraph, Exit:Paragraph. 212 tokenizer.events[paragraph_enter].name = Name::HeadingSetextText; 213 tokenizer.events[paragraph_exit_before].name = Name::HeadingSetextText; 214 215 // Add Enter:HeadingSetext, Exit:HeadingSetext. 216 let mut heading_enter = tokenizer.events[paragraph_enter].clone(); 217 heading_enter.name = Name::HeadingSetext; 218 tokenizer.map.add(paragraph_enter, 0, vec![heading_enter]); 219 let mut heading_exit = tokenizer.events[exit].clone(); 220 heading_exit.name = Name::HeadingSetext; 221 tokenizer.map.add(exit + 1, 0, vec![heading_exit]); 222 } else { 223 // There’s a following paragraph, move this underline inside it. 
224 if exit + 3 < tokenizer.events.len() 225 && tokenizer.events[exit + 1].name == Name::LineEnding 226 && tokenizer.events[exit + 3].name == Name::Paragraph 227 { 228 // Swap type, HeadingSetextUnderline:Enter -> Paragraph:Enter. 229 tokenizer.events[enter].name = Name::Paragraph; 230 // Swap type, LineEnding -> Data. 231 tokenizer.events[exit + 1].name = Name::Data; 232 tokenizer.events[exit + 2].name = Name::Data; 233 // Move new data (was line ending) back to include whole line, 234 // and link data together. 235 tokenizer.events[exit + 1].point = tokenizer.events[enter].point.clone(); 236 tokenizer.events[exit + 1].link = Some(Link { 237 previous: None, 238 next: Some(exit + 4), 239 content: Content::Text, 240 }); 241 tokenizer.events[exit + 4].link.as_mut().unwrap().previous = Some(exit + 1); 242 // Remove *including* HeadingSetextUnderline:Exit, until the line ending. 243 tokenizer.map.add(enter + 1, exit - enter, vec![]); 244 // Remove old Paragraph:Enter. 245 tokenizer.map.add(exit + 3, 1, vec![]); 246 } else { 247 // Swap type. 248 tokenizer.events[enter].name = Name::Paragraph; 249 tokenizer.events[exit].name = Name::Paragraph; 250 // Replace what’s inside the underline (whitespace, sequence). 251 tokenizer.map.add( 252 enter + 1, 253 exit - enter - 1, 254 vec![ 255 Event { 256 name: Name::Data, 257 kind: Kind::Enter, 258 point: tokenizer.events[enter].point.clone(), 259 link: Some(Link { 260 previous: None, 261 next: None, 262 content: Content::Text, 263 }), 264 }, 265 Event { 266 name: Name::Data, 267 kind: Kind::Exit, 268 point: tokenizer.events[exit].point.clone(), 269 link: None, 270 }, 271 ], 272 ); 273 } 274 } 275 276 enter = skip::to(&tokenizer.events, exit + 1, &[Name::HeadingSetextUnderline]); 277 } 278 279 tokenizer.map.consume(&mut tokenizer.events); 280 None 281}