// Markdown parser fork with extended syntax for personal use.
//! Heading (atx) occurs in the [flow][] content type.
//!
//! ## Grammar
//!
//! Heading (atx) forms with the following BNF
//! (<small>see [construct][crate::construct] for character groups</small>):
//!
//! ```bnf
//! heading_atx ::= 1*6'#' [ 1*space_or_tab line [ 1*space_or_tab 1*'#' ] ] *space_or_tab
//! ```
//!
//! As this construct occurs in flow, like all flow constructs, it must be
//! followed by an eol (line ending) or eof (end of file).
//!
//! `CommonMark` introduced the requirement on whitespace existing after the
//! opening sequence and before text.
//! In older markdown versions, this was not required, and headings would form
//! without it.
//!
//! In markdown, it is also possible to create headings with a
//! [heading (setext)][heading_setext] construct.
//! The benefit of setext headings is that their text can include line endings,
//! and by extension also hard breaks (e.g., with
//! [hard break (escape)][hard_break_escape]).
//! However, their limit is that they cannot form `<h3>` through `<h6>`
//! headings.
//!
//! > 🏛 **Background**: the word *setext* originates from a small markup
//! > language by Ian Feldman from 1991.
//! > See [*§ Setext* on Wikipedia][wiki_setext] for more info.
//! > The word *atx* originates from a tiny markup language by Aaron Swartz
//! > from 2002.
//! > See [*§ atx, the true structured text format* on `aaronsw.com`][atx] for
//! > more info.
//!
//! ## HTML
//!
//! Headings in markdown relate to the `<h1>` through `<h6>` elements in HTML.
//! See [*§ 4.3.6 The `h1`, `h2`, `h3`, `h4`, `h5`, and `h6` elements* in the
//! HTML spec][html] for more info.
//!
//! ## Recommendation
//!
//! Always use heading (atx), never heading (setext).
//!
//! ## Tokens
//!
//! * [`HeadingAtx`][Name::HeadingAtx]
//! * [`HeadingAtxSequence`][Name::HeadingAtxSequence]
//! * [`HeadingAtxText`][Name::HeadingAtxText]
* [`SpaceOrTab`][Name::SpaceOrTab] 52//! 53//! ## References 54//! 55//! * [`heading-atx.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/heading-atx.js) 56//! * [*§ 4.2 ATX headings* in `CommonMark`](https://spec.commonmark.org/0.31/#atx-headings) 57//! 58//! [flow]: crate::construct::flow 59//! [heading_setext]: crate::construct::heading_setext 60//! [hard_break_escape]: crate::construct::hard_break_escape 61//! [html]: https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements 62//! [wiki_setext]: https://en.wikipedia.org/wiki/Setext 63//! [atx]: http://www.aaronsw.com/2002/atx/ 64 65use crate::construct::partial_space_or_tab::{space_or_tab, space_or_tab_min_max}; 66use crate::event::{Content, Event, Kind, Link, Name}; 67use crate::resolve::Name as ResolveName; 68use crate::state::{Name as StateName, State}; 69use crate::subtokenize::Subresult; 70use crate::tokenizer::Tokenizer; 71use crate::util::constant::{HEADING_ATX_OPENING_FENCE_SIZE_MAX, TAB_SIZE}; 72use alloc::vec; 73 74/// Start of a heading (atx). 75/// 76/// ```markdown 77/// > | ## aa 78/// ^ 79/// ``` 80pub fn start(tokenizer: &mut Tokenizer) -> State { 81 if tokenizer.parse_state.options.constructs.heading_atx { 82 tokenizer.enter(Name::HeadingAtx); 83 if matches!(tokenizer.current, Some(b'\t' | b' ')) { 84 tokenizer.attempt(State::Next(StateName::HeadingAtxBefore), State::Nok); 85 State::Retry(space_or_tab_min_max( 86 tokenizer, 87 0, 88 if tokenizer.parse_state.options.constructs.code_indented { 89 TAB_SIZE - 1 90 } else { 91 usize::MAX 92 }, 93 )) 94 } else { 95 State::Retry(StateName::HeadingAtxBefore) 96 } 97 } else { 98 State::Nok 99 } 100} 101 102/// After optional whitespace, at `#`. 
103/// 104/// ```markdown 105/// > | ## aa 106/// ^ 107/// ``` 108pub fn before(tokenizer: &mut Tokenizer) -> State { 109 if Some(b'#') == tokenizer.current { 110 tokenizer.enter(Name::HeadingAtxSequence); 111 State::Retry(StateName::HeadingAtxSequenceOpen) 112 } else { 113 State::Nok 114 } 115} 116 117/// In opening sequence. 118/// 119/// ```markdown 120/// > | ## aa 121/// ^ 122/// ``` 123pub fn sequence_open(tokenizer: &mut Tokenizer) -> State { 124 if tokenizer.current == Some(b'#') 125 && tokenizer.tokenize_state.size < HEADING_ATX_OPENING_FENCE_SIZE_MAX 126 { 127 tokenizer.tokenize_state.size += 1; 128 tokenizer.consume(); 129 State::Next(StateName::HeadingAtxSequenceOpen) 130 } 131 // Always at least one `#`. 132 else if matches!(tokenizer.current, None | Some(b'\t' | b'\n' | b' ')) { 133 tokenizer.tokenize_state.size = 0; 134 tokenizer.exit(Name::HeadingAtxSequence); 135 State::Retry(StateName::HeadingAtxAtBreak) 136 } else { 137 tokenizer.tokenize_state.size = 0; 138 State::Nok 139 } 140} 141 142/// After something, before something else. 143/// 144/// ```markdown 145/// > | ## aa 146/// ^ 147/// ``` 148pub fn at_break(tokenizer: &mut Tokenizer) -> State { 149 match tokenizer.current { 150 None | Some(b'\n') => { 151 tokenizer.exit(Name::HeadingAtx); 152 tokenizer.register_resolver(ResolveName::HeadingAtx); 153 // Feel free to interrupt. 154 tokenizer.interrupt = false; 155 State::Ok 156 } 157 Some(b'\t' | b' ') => { 158 tokenizer.attempt(State::Next(StateName::HeadingAtxAtBreak), State::Nok); 159 State::Retry(space_or_tab(tokenizer)) 160 } 161 Some(b'#') => { 162 tokenizer.enter(Name::HeadingAtxSequence); 163 State::Retry(StateName::HeadingAtxSequenceFurther) 164 } 165 Some(_) => { 166 tokenizer.enter_link( 167 Name::Data, 168 Link { 169 previous: None, 170 next: None, 171 content: Content::Text, 172 }, 173 ); 174 State::Retry(StateName::HeadingAtxData) 175 } 176 } 177} 178 179/// In further sequence (after whitespace). 
180/// 181/// Could be normal “visible” hashes in the heading or a final sequence. 182/// 183/// ```markdown 184/// > | ## aa ## 185/// ^ 186/// ``` 187pub fn sequence_further(tokenizer: &mut Tokenizer) -> State { 188 if let Some(b'#') = tokenizer.current { 189 tokenizer.consume(); 190 State::Next(StateName::HeadingAtxSequenceFurther) 191 } else { 192 tokenizer.exit(Name::HeadingAtxSequence); 193 State::Retry(StateName::HeadingAtxAtBreak) 194 } 195} 196 197/// In text. 198/// 199/// ```markdown 200/// > | ## aa 201/// ^ 202/// ``` 203pub fn data(tokenizer: &mut Tokenizer) -> State { 204 match tokenizer.current { 205 // Note: `#` for closing sequence must be preceded by whitespace, otherwise it’s just text. 206 None | Some(b'\t' | b'\n' | b' ') => { 207 tokenizer.exit(Name::Data); 208 State::Retry(StateName::HeadingAtxAtBreak) 209 } 210 _ => { 211 tokenizer.consume(); 212 State::Next(StateName::HeadingAtxData) 213 } 214 } 215} 216 217/// Resolve heading (atx). 218pub fn resolve(tokenizer: &mut Tokenizer) -> Option<Subresult> { 219 let mut index = 0; 220 let mut heading_inside = false; 221 let mut data_start: Option<usize> = None; 222 let mut data_end: Option<usize> = None; 223 224 while index < tokenizer.events.len() { 225 let event = &tokenizer.events[index]; 226 227 if event.name == Name::HeadingAtx { 228 if event.kind == Kind::Enter { 229 heading_inside = true; 230 } else { 231 if let Some(start) = data_start { 232 // If `start` is some, `end` is too. 233 let end = data_end.unwrap(); 234 235 tokenizer.map.add( 236 start, 237 0, 238 vec![Event { 239 kind: Kind::Enter, 240 name: Name::HeadingAtxText, 241 point: tokenizer.events[start].point.clone(), 242 link: None, 243 }], 244 ); 245 246 // Remove everything between the start and the end. 
247 tokenizer.map.add(start + 1, end - start - 1, vec![]); 248 249 tokenizer.map.add( 250 end + 1, 251 0, 252 vec![Event { 253 kind: Kind::Exit, 254 name: Name::HeadingAtxText, 255 point: tokenizer.events[end].point.clone(), 256 link: None, 257 }], 258 ); 259 } 260 261 heading_inside = false; 262 data_start = None; 263 data_end = None; 264 } 265 } else if heading_inside && event.name == Name::Data { 266 if event.kind == Kind::Enter { 267 if data_start.is_none() { 268 data_start = Some(index); 269 } 270 } else { 271 data_end = Some(index); 272 } 273 } 274 275 index += 1; 276 } 277 278 tokenizer.map.consume(&mut tokenizer.events); 279 None 280}