//! Destination occurs in [definition][] and [label end][label_end].
//!
//! ## Grammar
//!
//! Destination forms with the following BNF
//! (<small>see [construct][crate::construct] for character groups</small>):
//!
//! ```bnf
//! destination ::= destination_enclosed | destination_raw
//!
//! destination_enclosed ::= '<' *(destination_enclosed_byte | destination_enclosed_escape) '>'
//! destination_enclosed_byte ::= line - '<' - '\\' - '>'
//! destination_enclosed_escape ::= '\\' ['<' | '\\' | '>']
//!
//! destination_raw ::= 1*(destination_raw_byte | destination_raw_escape)
//! ; Restriction: unbalanced `)` characters are not allowed.
//! destination_raw_byte ::= text - '\\' - ascii_control
//! destination_raw_escape ::= '\\' ['(' | ')' | '\\']
//! ```
//!
//! Balanced parens are allowed in raw destinations.
//! They are counted with a counter that starts at `0`, and is incremented
//! every time `(` occurs and decremented every time `)` occurs.
//! If `)` is found when the counter is `0`, the destination closes immediately
//! before it.
//! Escaped parens do not count in balancing.
//!
//! The destination is interpreted as the [string][] content type.
//! That means that [character escapes][character_escape] and
//! [character references][character_reference] are allowed.
//!
//! The grammar for enclosed destinations (`<x>`) prohibits the use of `<`,
//! `>`, and line endings to form URLs.
//! The angle brackets can be encoded as a character reference, character
//! escape, or percent encoding:
//!
//! * `<` as `&lt;`, `\<`, or `%3c`
//! * `>` as `&gt;`, `\>`, or `%3e`
//!
//! The grammar for raw destinations (`x`) prohibits space (` `) and all
//! [ASCII control][u8::is_ascii_control] characters, which thus must be
//! encoded.
//! Unbalanced parens can be encoded as a character reference, character escape,
//! or percent encoding:
//!
//! * `(` as `&lpar;`, `\(`, or `%28`
//! * `)` as `&rpar;`, `\)`, or `%29`
//!
//! There are several cases where incorrect encoding of URLs would, in other
//! languages, result in a parse error.
//! In markdown, there are no errors, and URLs are normalized.
//! In addition, Unicode characters are percent encoded
//! ([`sanitize_uri`][sanitize_uri]).
//! For example:
//!
//! ```markdown
//! [x]
//!
//! [x]: <https://a👍b%>
//! ```
//!
//! Yields:
//!
//! ```html
//! <p><a href="https://a%F0%9F%91%8Db%25">x</a></p>
//! ```
//!
//! ## Recommendation
//!
//! It is recommended to use the enclosed variant of destinations, as it allows
//! the most characters, including arbitrary parens, in URLs.
//!
//! ## References
//!
//! * [`micromark-factory-destination/index.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-factory-destination/dev/index.js)
//!
//! [definition]: crate::construct::definition
//! [string]: crate::construct::string
//! [character_escape]: crate::construct::character_escape
//! [character_reference]: crate::construct::character_reference
//! [label_end]: crate::construct::label_end
//! [sanitize_uri]: crate::util::sanitize_uri

use crate::event::{Content, Link, Name};
use crate::state::{Name as StateName, State};
use crate::tokenizer::Tokenizer;

/// Start of destination.
///
/// ```markdown
/// > | <aa>
///     ^
/// > | aa
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(b'<') => {
            tokenizer.enter(tokenizer.tokenize_state.token_1.clone());
            tokenizer.enter(tokenizer.tokenize_state.token_2.clone());
            tokenizer.enter(tokenizer.tokenize_state.token_3.clone());
            tokenizer.consume();
            tokenizer.exit(tokenizer.tokenize_state.token_3.clone());
            State::Next(StateName::DestinationEnclosedBefore)
        }
        // ASCII control, space, closing paren, but *not* `\0`.
        None | Some(0x01..=0x1F | b' ' | b')' | 0x7F) => State::Nok,
        Some(_) => {
            tokenizer.enter(tokenizer.tokenize_state.token_1.clone());
            tokenizer.enter(tokenizer.tokenize_state.token_4.clone());
            tokenizer.enter(tokenizer.tokenize_state.token_5.clone());
            tokenizer.enter_link(
                Name::Data,
                Link {
                    previous: None,
                    next: None,
                    content: Content::String,
                },
            );
            State::Retry(StateName::DestinationRaw)
        }
    }
}

/// After `<`, at an enclosed destination.
///
/// ```markdown
/// > | <aa>
///      ^
/// ```
pub fn enclosed_before(tokenizer: &mut Tokenizer) -> State {
    if let Some(b'>') = tokenizer.current {
        tokenizer.enter(tokenizer.tokenize_state.token_3.clone());
        tokenizer.consume();
        tokenizer.exit(tokenizer.tokenize_state.token_3.clone());
        tokenizer.exit(tokenizer.tokenize_state.token_2.clone());
        tokenizer.exit(tokenizer.tokenize_state.token_1.clone());
        State::Ok
    } else {
        tokenizer.enter(tokenizer.tokenize_state.token_5.clone());
        tokenizer.enter_link(
            Name::Data,
            Link {
                previous: None,
                next: None,
                content: Content::String,
            },
        );
        State::Retry(StateName::DestinationEnclosed)
    }
}

/// In enclosed destination.
///
/// ```markdown
/// > | <aa>
///      ^
/// ```
pub fn enclosed(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        None | Some(b'\n' | b'<') => State::Nok,
        Some(b'>') => {
            tokenizer.exit(Name::Data);
            tokenizer.exit(tokenizer.tokenize_state.token_5.clone());
            State::Retry(StateName::DestinationEnclosedBefore)
        }
        Some(b'\\') => {
            tokenizer.consume();
            State::Next(StateName::DestinationEnclosedEscape)
        }
        _ => {
            tokenizer.consume();
            State::Next(StateName::DestinationEnclosed)
        }
    }
}

/// After `\`, at a special character.
///
/// ```markdown
/// > | <a\*a>
///        ^
/// ```
pub fn enclosed_escape(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(b'<' | b'>' | b'\\') => {
            tokenizer.consume();
            State::Next(StateName::DestinationEnclosed)
        }
        _ => State::Retry(StateName::DestinationEnclosed),
    }
}

/// In raw destination.
///
/// ```markdown
/// > | aa
///     ^
/// ```
pub fn raw(tokenizer: &mut Tokenizer) -> State {
    if tokenizer.tokenize_state.size == 0
        && matches!(tokenizer.current, None | Some(b'\t' | b'\n' | b' ' | b')'))
    {
        tokenizer.exit(Name::Data);
        tokenizer.exit(tokenizer.tokenize_state.token_5.clone());
        tokenizer.exit(tokenizer.tokenize_state.token_4.clone());
        tokenizer.exit(tokenizer.tokenize_state.token_1.clone());
        tokenizer.tokenize_state.size = 0;
        State::Ok
    } else if tokenizer.tokenize_state.size < tokenizer.tokenize_state.size_b
        && tokenizer.current == Some(b'(')
    {
        tokenizer.consume();
        tokenizer.tokenize_state.size += 1;
        State::Next(StateName::DestinationRaw)
    } else if tokenizer.current == Some(b')') {
        tokenizer.consume();
        tokenizer.tokenize_state.size -= 1;
        State::Next(StateName::DestinationRaw)
    }
    // ASCII control (but *not* `\0`) and space and `(`.
    else if matches!(
        tokenizer.current,
        None | Some(0x01..=0x1F | b' ' | b'(' | 0x7F)
    ) {
        tokenizer.tokenize_state.size = 0;
        State::Nok
    } else if tokenizer.current == Some(b'\\') {
        tokenizer.consume();
        State::Next(StateName::DestinationRawEscape)
    } else {
        tokenizer.consume();
        State::Next(StateName::DestinationRaw)
    }
}

/// After `\`, at special character.
///
/// ```markdown
/// > | a\*a
///      ^
/// ```
pub fn raw_escape(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(b'(' | b')' | b'\\') => {
            tokenizer.consume();
            State::Next(StateName::DestinationRaw)
        }
        _ => State::Retry(StateName::DestinationRaw),
    }
}
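The paren-balancing rule from the module docs can be sketched as a standalone scanner. This is an illustrative simplification, not part of the crate: the hypothetical `raw_destination_end` below ignores the `size_b` depth limit and the tokenizer state machine, and only handles the escapes relevant to balancing.

```rust
/// Return the length in bytes of the raw destination at the start of `bytes`.
/// A counter starts at `0`; `(` increments it and `)` decrements it. An
/// unmatched `)` (or whitespace) closes the destination immediately before
/// it, and escaped parens do not count in balancing.
fn raw_destination_end(bytes: &[u8]) -> usize {
    let mut depth: usize = 0;
    let mut index = 0;
    while index < bytes.len() {
        match bytes[index] {
            // `\(`, `\)`, and `\\` are escapes: skip both bytes, no counting.
            b'\\' if index + 1 < bytes.len()
                && matches!(bytes[index + 1], b'(' | b')' | b'\\') =>
            {
                index += 2;
            }
            b'(' => {
                depth += 1;
                index += 1;
            }
            // `)` while the counter is `0`: the destination ends here.
            b')' if depth == 0 => break,
            b')' => {
                depth -= 1;
                index += 1;
            }
            // Whitespace also ends a raw destination.
            b'\t' | b'\n' | b' ' => break,
            _ => index += 1,
        }
    }
    index
}
```

For example, `raw_destination_end(b"a(b)c")` consumes all five bytes because the parens balance, while `raw_destination_end(b"a)b")` stops after one byte, immediately before the unmatched `)`.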
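The normalization shown in the `<https://a👍b%>` example in the module docs can be approximated with a small sketch. The hypothetical `percent_encode_sketch` below is not the crate's actual `sanitize_uri`: it only percent-encodes the UTF-8 bytes of non-ASCII characters and escapes `%` itself, which is enough to reproduce that example, whereas the real routine also handles other unsafe characters and leaves valid percent-escapes intact.

```rust
/// Percent-encode non-ASCII bytes (and `%`) in a URL, as `%XX` with
/// uppercase hex digits. Multi-byte UTF-8 sequences become one `%XX`
/// escape per byte.
fn percent_encode_sketch(url: &str) -> String {
    let mut out = String::new();
    for &byte in url.as_bytes() {
        if byte.is_ascii() && byte != b'%' {
            out.push(byte as char);
        } else {
            out.push_str(&format!("%{:02X}", byte));
        }
    }
    out
}
```

Applied to the example above, `percent_encode_sketch("https://a👍b%")` yields `"https://a%F0%9F%91%8Db%25"`, matching the `href` in the generated HTML.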