Markdown parser fork with extended syntax for personal use.
at hack 213 lines 7.1 kB view raw
1//! Trailing whitespace occurs in [string][] and [text][]. 2//! 3//! ## Grammar 4//! 5//! Trailing whitespace forms with the following BNF 6//! (<small>see [construct][crate::construct] for character groups</small>): 7//! 8//! ```bnf 9//! ; Restriction: the start and end here count as an eol in the case of `text`. 10//! whitespace ::= *space_or_tab eol *space_or_tab 11//! ``` 12//! 13//! It occurs around line endings and, in the case of text content, it also 14//! occurs at the start or end of the whole. 15//! 16//! Normally this whitespace is ignored. 17//! In the case of text content, whitespace before a line ending that 18//! consists solely of spaces, at least 2, forms a hard break (trailing). 19//! 20//! The minimum number of those spaces is defined in 21//! [`HARD_BREAK_PREFIX_SIZE_MIN`][]. 22//! 23//! It is also possible to create a hard break with a similar construct: a 24//! [hard break (escape)][hard_break_escape] is a backslash followed 25//! by a line ending. 26//! That construct is recommended because it is similar to a 27//! [character escape][character_escape] and similar to how line endings can be 28//! “escaped” in other languages. 29//! Trailing spaces are typically invisible in editors, or even automatically 30//! removed, making hard break (trailing) hard to use. 31//! 32//! ## HTML 33//! 34//! Hard breaks in markdown relate to the HTML element `<br>`. 35//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info. 36//! 37//! ## Recommendation 38//! 39//! Do not use trailing whitespace. 40//! It is never needed when using [hard break (escape)][hard_break_escape] 41//! to create hard breaks. 42//! 43//! ## Tokens 44//! 45//! * [`HardBreakTrailing`][Name::HardBreakTrailing] 46//! * [`SpaceOrTab`][Name::SpaceOrTab] 47//! 48//! ## References 49//! 50//! * [`initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js) 51//! 
* [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.31/#hard-line-breaks) 52//! 53//! [string]: crate::construct::string 54//! [text]: crate::construct::text 55//! [hard_break_escape]: crate::construct::hard_break_escape 56//! [character_escape]: crate::construct::character_escape 57//! [hard_break_prefix_size_min]: crate::util::constant::HARD_BREAK_PREFIX_SIZE_MIN 58//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element 59 60use crate::event::{Event, Kind, Name}; 61use crate::tokenizer::Tokenizer; 62use crate::util::{ 63 constant::HARD_BREAK_PREFIX_SIZE_MIN, 64 slice::{Position, Slice}, 65}; 66use alloc::vec; 67 68/// Resolve whitespace. 69pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) { 70 let mut index = 0; 71 72 while index < tokenizer.events.len() { 73 let event = &tokenizer.events[index]; 74 75 if event.kind == Kind::Exit && event.name == Name::Data { 76 let trim_start = (trim_whole && index == 1) 77 || (index > 1 && tokenizer.events[index - 2].name == Name::LineEnding); 78 let trim_end = (trim_whole && index == tokenizer.events.len() - 1) 79 || (index + 1 < tokenizer.events.len() 80 && tokenizer.events[index + 1].name == Name::LineEnding); 81 82 trim_data(tokenizer, index, trim_start, trim_end, hard_break); 83 } 84 85 index += 1; 86 } 87 88 tokenizer.map.consume(&mut tokenizer.events); 89} 90 91/// Trim a [`Data`][Name::Data] event. 
92fn trim_data( 93 tokenizer: &mut Tokenizer, 94 exit_index: usize, 95 trim_start: bool, 96 trim_end: bool, 97 hard_break: bool, 98) { 99 let mut slice = Slice::from_position( 100 tokenizer.parse_state.bytes, 101 &Position::from_exit_event(&tokenizer.events, exit_index), 102 ); 103 104 if trim_end { 105 let mut index = slice.bytes.len(); 106 let mut spaces_only = slice.after == 0; 107 while index > 0 { 108 match slice.bytes[index - 1] { 109 b' ' => {} 110 b'\t' => spaces_only = false, 111 _ => break, 112 } 113 114 index -= 1; 115 } 116 117 let diff = slice.bytes.len() - index; 118 let name = if hard_break 119 && spaces_only 120 && diff >= HARD_BREAK_PREFIX_SIZE_MIN 121 && exit_index + 1 < tokenizer.events.len() 122 { 123 Name::HardBreakTrailing 124 } else { 125 Name::SpaceOrTab 126 }; 127 128 // The whole data is whitespace. 129 // We can be very fast: we only change the event names. 130 if index == 0 { 131 tokenizer.events[exit_index - 1].name = name.clone(); 132 tokenizer.events[exit_index].name = name; 133 return; 134 } 135 136 if diff > 0 || slice.after > 0 { 137 let exit_point = tokenizer.events[exit_index].point.clone(); 138 let mut enter_point = exit_point.clone(); 139 enter_point.index -= diff; 140 enter_point.column -= diff; 141 enter_point.vs = 0; 142 143 tokenizer.map.add( 144 exit_index + 1, 145 0, 146 vec![ 147 Event { 148 kind: Kind::Enter, 149 name: name.clone(), 150 point: enter_point.clone(), 151 link: None, 152 }, 153 Event { 154 kind: Kind::Exit, 155 name, 156 point: exit_point, 157 link: None, 158 }, 159 ], 160 ); 161 162 tokenizer.events[exit_index].point = enter_point; 163 slice.bytes = &slice.bytes[..index]; 164 } 165 } 166 167 if trim_start { 168 let mut index = 0; 169 while index < slice.bytes.len() { 170 match slice.bytes[index] { 171 b' ' | b'\t' => index += 1, 172 _ => break, 173 } 174 } 175 176 // The whole data is whitespace. 177 // We can be very fast: we only change the event names. 
178 if index == slice.bytes.len() { 179 tokenizer.events[exit_index - 1].name = Name::SpaceOrTab; 180 tokenizer.events[exit_index].name = Name::SpaceOrTab; 181 return; 182 } 183 184 if index > 0 || slice.before > 0 { 185 let enter_point = tokenizer.events[exit_index - 1].point.clone(); 186 let mut exit_point = enter_point.clone(); 187 exit_point.index += index; 188 exit_point.column += index; 189 exit_point.vs = 0; 190 191 tokenizer.map.add( 192 exit_index - 1, 193 0, 194 vec![ 195 Event { 196 kind: Kind::Enter, 197 name: Name::SpaceOrTab, 198 point: enter_point, 199 link: None, 200 }, 201 Event { 202 kind: Kind::Exit, 203 name: Name::SpaceOrTab, 204 point: exit_point.clone(), 205 link: None, 206 }, 207 ], 208 ); 209 210 tokenizer.events[exit_index - 1].point = exit_point; 211 } 212 } 213}