src/construct/partial_whitespace.rs at hack · crashkeys.dev/markdown-rs

crashkeys.dev / markdown-rs
fork atom
Markdown parser fork with extended syntax for personal use.
fork atom
markdown-rs / src / construct / partial_whitespace.rs
at hack 213 lines 7.1 kB view raw
wrap content
Titus Wormer Refactor docs 11mo ago
e0ca3f6c
  1//! Trailing whitespace occurs in [string][] and [text][].
  2//!
  3//! ## Grammar
  4//!
  5//! Trailing whitespace forms with the following BNF
  6//! (<small>see [construct][crate::construct] for character groups</small>):
  7//!
  8//! ```bnf
  9//! ; Restriction: the start and end here count as an eol in the case of `text`.
 10//! whitespace ::= *space_or_tab eol *space_or_tab
 11//! ```
 12//!
 13//! It occurs around line endings and, in the case of text content, it also
 14//! occurs at the start or end of the whole.
 15//!
 16//! Normally this whitespace is ignored.
 17//! In the case of text content, whitespace before a line ending that
 18//! consists solely of spaces, at least 2, forms a hard break (trailing).
 19//!
 20//! The minimum number of those spaces is defined in
 21//! [`HARD_BREAK_PREFIX_SIZE_MIN`][].
 22//!
 23//! It is also possible to create a hard break with a similar construct: a
 24//! [hard break (escape)][hard_break_escape] is a backslash followed
 25//! by a line ending.
 26//! That construct is recommended because it is similar to a
 27//! [character escape][character_escape] and similar to how line endings can be
 28//! “escaped” in other languages.
 29//! Trailing spaces are typically invisible in editors, or even automatically
 30//! removed, making hard break (trailing) hard to use.
 31//!
 32//! ## HTML
 33//!
 34//! Hard breaks in markdown relate to the HTML element `<br>`.
 35//! See [*§ 4.5.27 The `br` element* in the HTML spec][html] for more info.
 36//!
 37//! ## Recommendation
 38//!
 39//! Do not use trailing whitespace.
 40//! It is never needed when using [hard break (escape)][hard_break_escape]
 41//! to create hard breaks.
 42//!
 43//! ## Tokens
 44//!
 45//! * [`HardBreakTrailing`][Name::HardBreakTrailing]
 46//! * [`SpaceOrTab`][Name::SpaceOrTab]
 47//!
 48//! ## References
 49//!
 50//! * [`initialize/text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark/dev/lib/initialize/text.js)
 51//! * [*§ 6.7 Hard line breaks* in `CommonMark`](https://spec.commonmark.org/0.31/#hard-line-breaks)
 52//!
 53//! [string]: crate::construct::string
 54//! [text]: crate::construct::text
 55//! [hard_break_escape]: crate::construct::hard_break_escape
 56//! [character_escape]: crate::construct::character_escape
 57//! [hard_break_prefix_size_min]: crate::util::constant::HARD_BREAK_PREFIX_SIZE_MIN
 58//! [html]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-br-element
 59
 60use crate::event::{Event, Kind, Name};
 61use crate::tokenizer::Tokenizer;
 62use crate::util::{
 63    constant::HARD_BREAK_PREFIX_SIZE_MIN,
 64    slice::{Position, Slice},
 65};
 66use alloc::vec;
 67
 68/// Resolve whitespace.
 69pub fn resolve_whitespace(tokenizer: &mut Tokenizer, hard_break: bool, trim_whole: bool) {
 70    let mut index = 0;
 71
 72    while index < tokenizer.events.len() {
 73        let event = &tokenizer.events[index];
 74
 75        if event.kind == Kind::Exit && event.name == Name::Data {
 76            let trim_start = (trim_whole && index == 1)
 77                || (index > 1 && tokenizer.events[index - 2].name == Name::LineEnding);
 78            let trim_end = (trim_whole && index == tokenizer.events.len() - 1)
 79                || (index + 1 < tokenizer.events.len()
 80                    && tokenizer.events[index + 1].name == Name::LineEnding);
 81
 82            trim_data(tokenizer, index, trim_start, trim_end, hard_break);
 83        }
 84
 85        index += 1;
 86    }
 87
 88    tokenizer.map.consume(&mut tokenizer.events);
 89}
 90
 91/// Trim a [`Data`][Name::Data] event.
 92fn trim_data(
 93    tokenizer: &mut Tokenizer,
 94    exit_index: usize,
 95    trim_start: bool,
 96    trim_end: bool,
 97    hard_break: bool,
 98) {
 99    let mut slice = Slice::from_position(
100        tokenizer.parse_state.bytes,
101        &Position::from_exit_event(&tokenizer.events, exit_index),
102    );
103
104    if trim_end {
105        let mut index = slice.bytes.len();
106        let mut spaces_only = slice.after == 0;
107        while index > 0 {
108            match slice.bytes[index - 1] {
109                b' ' => {}
110                b'\t' => spaces_only = false,
111                _ => break,
112            }
113
114            index -= 1;
115        }
116
117        let diff = slice.bytes.len() - index;
118        let name = if hard_break
119            && spaces_only
120            && diff >= HARD_BREAK_PREFIX_SIZE_MIN
121            && exit_index + 1 < tokenizer.events.len()
122        {
123            Name::HardBreakTrailing
124        } else {
125            Name::SpaceOrTab
126        };
127
128        // The whole data is whitespace.
129        // We can be very fast: we only change the event names.
130        if index == 0 {
131            tokenizer.events[exit_index - 1].name = name.clone();
132            tokenizer.events[exit_index].name = name;
133            return;
134        }
135
136        if diff > 0 || slice.after > 0 {
137            let exit_point = tokenizer.events[exit_index].point.clone();
138            let mut enter_point = exit_point.clone();
139            enter_point.index -= diff;
140            enter_point.column -= diff;
141            enter_point.vs = 0;
142
143            tokenizer.map.add(
144                exit_index + 1,
145                0,
146                vec![
147                    Event {
148                        kind: Kind::Enter,
149                        name: name.clone(),
150                        point: enter_point.clone(),
151                        link: None,
152                    },
153                    Event {
154                        kind: Kind::Exit,
155                        name,
156                        point: exit_point,
157                        link: None,
158                    },
159                ],
160            );
161
162            tokenizer.events[exit_index].point = enter_point;
163            slice.bytes = &slice.bytes[..index];
164        }
165    }
166
167    if trim_start {
168        let mut index = 0;
169        while index < slice.bytes.len() {
170            match slice.bytes[index] {
171                b' ' | b'\t' => index += 1,
172                _ => break,
173            }
174        }
175
176        // The whole data is whitespace.
177        // We can be very fast: we only change the event names.
178        if index == slice.bytes.len() {
179            tokenizer.events[exit_index - 1].name = Name::SpaceOrTab;
180            tokenizer.events[exit_index].name = Name::SpaceOrTab;
181            return;
182        }
183
184        if index > 0 || slice.before > 0 {
185            let enter_point = tokenizer.events[exit_index - 1].point.clone();
186            let mut exit_point = enter_point.clone();
187            exit_point.index += index;
188            exit_point.column += index;
189            exit_point.vs = 0;
190
191            tokenizer.map.add(
192                exit_index - 1,
193                0,
194                vec![
195                    Event {
196                        kind: Kind::Enter,
197                        name: Name::SpaceOrTab,
198                        point: enter_point,
199                        link: None,
200                    },
201                    Event {
202                        kind: Kind::Exit,
203                        name: Name::SpaceOrTab,
204                        point: exit_point.clone(),
205                        link: None,
206                    },
207                ],
208            );
209
210            tokenizer.events[exit_index - 1].point = exit_point;
211        }
212    }
213}