Markdown parser fork with extended syntax for personal use.
1//! Trailing whitespace occurs in [string][] and [text][].
2//!
3//! ## Grammar
4//!
5//! Trailing whitespace forms with the following BNF
6//! (<small>see [construct][crate::construct] for character groups</small>):
7//!
8//! ```bnf
9//! ; Restriction: the start and end here count as an eol in the case of `text`.
10//! whitespace ::= *space_or_tab eol *space_or_tab
11//! ```
12//!
13//! It occurs around line endings and, in the case of text content, it also
14//! occurs at the start or end of the whole.
15//!
16//! Normally this whitespace is ignored.
17//! In the case of text content, whitespace before a line ending that
//! consists solely of spaces, at least 2, forms a hard break (trailing).
19//!
20//! The minimum number of those spaces is defined in
21//! [`HARD_BREAK_PREFIX_SIZE_MIN`][].
22//!
23//! It is also possible to create a hard break with a similar construct: a
24//! [hard break (escape)][hard_break_escape] is a backslash followed
25//! by a line ending.
26//! That construct is recommended because it is similar to a
27//! [character escape][character_escape] and similar to how line endings can be
28//! “escaped” in other languages.
29//! Trailing spaces are typically invisible in editors, or even automatically
30//! removed, making hard break (trailing) hard to use.
31//!
32//! ## HTML
33//!
34//! Hard breaks in markdown relate to the HTML element `<br>`.
35//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info.
36//!
37//! ## Recommendation
38//!
39//! Do not use trailing whitespace.
40//! It is never needed when using [hard break (escape)][hard_break_escape]
41//! to create hard breaks.
42//!
43//! ## Tokens
44//!
45//! * [`HardBreakTrailing`][Name::HardBreakTrailing]
46//! * [`SpaceOrTab`][Name::SpaceOrTab]
47//!
48//! ## References
49//!
50//! * [`initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js)
51//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.31/#hard-line-breaks)
52//!
53//! [string]: crate::construct::string
54//! [text]: crate::construct::text
55//! [hard_break_escape]: crate::construct::hard_break_escape
56//! [character_escape]: crate::construct::character_escape
57//! [hard_break_prefix_size_min]: crate::util::constant::HARD_BREAK_PREFIX_SIZE_MIN
58//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
59
60use crate::event::{Event, Kind, Name};
61use crate::tokenizer::Tokenizer;
62use crate::util::{
63 constant::HARD_BREAK_PREFIX_SIZE_MIN,
64 slice::{Position, Slice},
65};
66use alloc::vec;
67
68/// Resolve whitespace.
69pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) {
70 let mut index = 0;
71
72 while index < tokenizer.events.len() {
73 let event = &tokenizer.events[index];
74
75 if event.kind == Kind::Exit && event.name == Name::Data {
76 let trim_start = (trim_whole && index == 1)
77 || (index > 1 && tokenizer.events[index - 2].name == Name::LineEnding);
78 let trim_end = (trim_whole && index == tokenizer.events.len() - 1)
79 || (index + 1 < tokenizer.events.len()
80 && tokenizer.events[index + 1].name == Name::LineEnding);
81
82 trim_data(tokenizer, index, trim_start, trim_end, hard_break);
83 }
84
85 index += 1;
86 }
87
88 tokenizer.map.consume(&mut tokenizer.events);
89}
90
91/// Trim a [`Data`][Name::Data] event.
92fn trim_data(
93 tokenizer: &mut Tokenizer,
94 exit_index: usize,
95 trim_start: bool,
96 trim_end: bool,
97 hard_break: bool,
98) {
99 let mut slice = Slice::from_position(
100 tokenizer.parse_state.bytes,
101 &Position::from_exit_event(&tokenizer.events, exit_index),
102 );
103
104 if trim_end {
105 let mut index = slice.bytes.len();
106 let mut spaces_only = slice.after == 0;
107 while index > 0 {
108 match slice.bytes[index - 1] {
109 b' ' => {}
110 b'\t' => spaces_only = false,
111 _ => break,
112 }
113
114 index -= 1;
115 }
116
117 let diff = slice.bytes.len() - index;
118 let name = if hard_break
119 && spaces_only
120 && diff >= HARD_BREAK_PREFIX_SIZE_MIN
121 && exit_index + 1 < tokenizer.events.len()
122 {
123 Name::HardBreakTrailing
124 } else {
125 Name::SpaceOrTab
126 };
127
128 // The whole data is whitespace.
129 // We can be very fast: we only change the event names.
130 if index == 0 {
131 tokenizer.events[exit_index - 1].name = name.clone();
132 tokenizer.events[exit_index].name = name;
133 return;
134 }
135
136 if diff > 0 || slice.after > 0 {
137 let exit_point = tokenizer.events[exit_index].point.clone();
138 let mut enter_point = exit_point.clone();
139 enter_point.index -= diff;
140 enter_point.column -= diff;
141 enter_point.vs = 0;
142
143 tokenizer.map.add(
144 exit_index + 1,
145 0,
146 vec![
147 Event {
148 kind: Kind::Enter,
149 name: name.clone(),
150 point: enter_point.clone(),
151 link: None,
152 },
153 Event {
154 kind: Kind::Exit,
155 name,
156 point: exit_point,
157 link: None,
158 },
159 ],
160 );
161
162 tokenizer.events[exit_index].point = enter_point;
163 slice.bytes = &slice.bytes[..index];
164 }
165 }
166
167 if trim_start {
168 let mut index = 0;
169 while index < slice.bytes.len() {
170 match slice.bytes[index] {
171 b' ' | b'\t' => index += 1,
172 _ => break,
173 }
174 }
175
176 // The whole data is whitespace.
177 // We can be very fast: we only change the event names.
178 if index == slice.bytes.len() {
179 tokenizer.events[exit_index - 1].name = Name::SpaceOrTab;
180 tokenizer.events[exit_index].name = Name::SpaceOrTab;
181 return;
182 }
183
184 if index > 0 || slice.before > 0 {
185 let enter_point = tokenizer.events[exit_index - 1].point.clone();
186 let mut exit_point = enter_point.clone();
187 exit_point.index += index;
188 exit_point.column += index;
189 exit_point.vs = 0;
190
191 tokenizer.map.add(
192 exit_index - 1,
193 0,
194 vec![
195 Event {
196 kind: Kind::Enter,
197 name: Name::SpaceOrTab,
198 point: enter_point,
199 link: None,
200 },
201 Event {
202 kind: Kind::Exit,
203 name: Name::SpaceOrTab,
204 point: exit_point.clone(),
205 link: None,
206 },
207 ],
208 );
209
210 tokenizer.events[exit_index - 1].point = exit_point;
211 }
212 }
213}