// Markdown parser fork with extended syntax for personal use.
1//! Definition occurs in the [content] content type.
2//!
3//! ## Grammar
4//!
5//! Definition forms with the following BNF
6//! (<small>see [construct][crate::construct] for character groups</small>):
7//!
8//! ```bnf
9//! definition ::= label ':' [ space_or_tab_eol ] destination [ space_or_tab_eol title ] [ space_or_tab ]
10//!
11//! ; See the `destination`, `title`, and `label` constructs for the BNF of
12//! ; those parts.
13//! ```
14//!
15//! This construct must be followed by an eol (line ending) or eof (end of
16//! file), like flow constructs.
17//!
18//! See [`destination`][destination], [`label`][label], and [`title`][title]
19//! for grammar, notes, and recommendations on each part.
20//!
21//! The `destination`, `label`, and `title` parts are interpreted as the
22//! [string][] content type.
23//! That means that [character escapes][character_escape] and
24//! [character references][character_reference] are allowed.
25//!
26//! Definitions match to references through identifiers.
27//! To match, both labels must be equal after normalizing with
28//! [`normalize_identifier`][normalize_identifier].
29//! One definition can match to multiple references.
30//! Multiple definitions with the same, normalized, identifier are ignored: the
31//! first definition is preferred.
32//! To illustrate, the definition with a destination of `x` wins:
33//!
34//! ```markdown
35//! [a]: x
36//! [a]: y
37//!
38//! [a]
39//! ```
40//!
41//! Importantly, while labels *can* include [string][] content (character
42//! escapes and character references), these are not considered when matching.
43//! To illustrate, neither definition matches the reference:
44//!
45//! ```markdown
46//! [a&b]: x
47//! [a\&b]: y
48//!
49//! [a&b]
50//! ```
51//!
52//! For info on how to encode characters in URLs, see
53//! [`destination`][destination].
54//! For info on how characters are encoded as `href` on `<a>` or `src` on
55//! `<img>` when compiling, see
56//! [`sanitize_uri`][sanitize_uri].
57//!
58//! ## HTML
59//!
60//! Definitions in markdown do not, on their own, relate to anything in HTML.
61//! When matched with a [label end (reference)][label_end], they together
62//! relate to the `<a>` or `<img>` elements in HTML.
63//! The definition forms its `href` or `src`, and optionally `title`,
64//! attributes.
65//! See [*§ 4.5.1 The `a` element*][html_a] and
66//! [*§ 4.8.3 The `img` element*][html_img] in the HTML spec for more info.
67//!
68//! ## Tokens
69//!
70//! * [`Definition`][Name::Definition]
71//! * [`DefinitionDestination`][Name::DefinitionDestination]
72//! * [`DefinitionDestinationLiteral`][Name::DefinitionDestinationLiteral]
73//! * [`DefinitionDestinationLiteralMarker`][Name::DefinitionDestinationLiteralMarker]
74//! * [`DefinitionDestinationRaw`][Name::DefinitionDestinationRaw]
75//! * [`DefinitionDestinationString`][Name::DefinitionDestinationString]
76//! * [`DefinitionLabel`][Name::DefinitionLabel]
77//! * [`DefinitionLabelMarker`][Name::DefinitionLabelMarker]
78//! * [`DefinitionLabelString`][Name::DefinitionLabelString]
79//! * [`DefinitionMarker`][Name::DefinitionMarker]
80//! * [`DefinitionTitle`][Name::DefinitionTitle]
81//! * [`DefinitionTitleMarker`][Name::DefinitionTitleMarker]
82//! * [`DefinitionTitleString`][Name::DefinitionTitleString]
83//! * [`LineEnding`][Name::LineEnding]
84//! * [`SpaceOrTab`][Name::SpaceOrTab]
85//!
86//! ## References
87//!
88//! * [`definition.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/definition.js)
89//! * [*§ 4.7 Link reference definitions* in `CommonMark`](https://spec.commonmark.org/0.31/#link-reference-definitions)
90//!
91//! [content]: crate::construct::content
92//! [string]: crate::construct::string
93//! [character_escape]: crate::construct::character_escape
94//! [character_reference]: crate::construct::character_reference
95//! [destination]: crate::construct::partial_destination
96//! [label]: crate::construct::partial_label
97//! [label_end]: crate::construct::label_end
98//! [title]: crate::construct::partial_title
99//! [sanitize_uri]: crate::util::sanitize_uri::sanitize
100//! [normalize_identifier]: crate::util::normalize_identifier
101//! [html_a]: https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-a-element
102//! [html_img]: https://html.spec.whatwg.org/multipage/embedded-content.html#the-img-element
103
104use crate::construct::partial_space_or_tab::space_or_tab;
105use crate::construct::partial_space_or_tab_eol::space_or_tab_eol;
106use crate::event::Name;
107use crate::state::{Name as StateName, State};
108use crate::tokenizer::Tokenizer;
109use crate::util::{
110 normalize_identifier::normalize_identifier,
111 skip,
112 slice::{Position, Slice},
113};
114
/// At start of a definition.
///
/// ```markdown
/// > | [a]: b "c"
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    // Do not interrupt paragraphs (but do follow definitions).
    // When interrupting, look back past trailing `LineEnding`/`SpaceOrTab`
    // events: only proceed if the previous flow construct was itself a
    // definition (definitions may directly follow each other).
    if tokenizer.parse_state.options.constructs.definition
        && (!tokenizer.interrupt
            || (!tokenizer.events.is_empty()
                && tokenizer.events[skip::opt_back(
                    &tokenizer.events,
                    tokenizer.events.len() - 1,
                    &[Name::LineEnding, Name::SpaceOrTab],
                )]
                .name
                == Name::Definition))
    {
        tokenizer.enter(Name::Definition);

        if matches!(tokenizer.current, Some(b'\t' | b' ')) {
            // Note: arbitrary whitespace allowed even if code (indented) is on.
            tokenizer.attempt(State::Next(StateName::DefinitionBefore), State::Nok);
            State::Retry(space_or_tab(tokenizer))
        } else {
            State::Retry(StateName::DefinitionBefore)
        }
    } else {
        State::Nok
    }
}
147
148/// After optional whitespace, at `[`.
149///
150/// ```markdown
151/// > | [a]: b "c"
152/// ^
153/// ```
154pub fn before(tokenizer: &mut Tokenizer) -> State {
155 match tokenizer.current {
156 Some(b'[') => {
157 tokenizer.tokenize_state.token_1 = Name::DefinitionLabel;
158 tokenizer.tokenize_state.token_2 = Name::DefinitionLabelMarker;
159 tokenizer.tokenize_state.token_3 = Name::DefinitionLabelString;
160 tokenizer.attempt(
161 State::Next(StateName::DefinitionLabelAfter),
162 State::Next(StateName::DefinitionLabelNok),
163 );
164 State::Retry(StateName::LabelStart)
165 }
166 _ => State::Nok,
167 }
168}
169
170/// After label.
171///
172/// ```markdown
173/// > | [a]: b "c"
174/// ^
175/// ```
176pub fn label_after(tokenizer: &mut Tokenizer) -> State {
177 tokenizer.tokenize_state.token_1 = Name::Data;
178 tokenizer.tokenize_state.token_2 = Name::Data;
179 tokenizer.tokenize_state.token_3 = Name::Data;
180
181 match tokenizer.current {
182 Some(b':') => {
183 tokenizer.tokenize_state.end = skip::to_back(
184 &tokenizer.events,
185 tokenizer.events.len() - 1,
186 &[Name::DefinitionLabelString],
187 );
188
189 tokenizer.enter(Name::DefinitionMarker);
190 tokenizer.consume();
191 tokenizer.exit(Name::DefinitionMarker);
192 State::Next(StateName::DefinitionMarkerAfter)
193 }
194 _ => State::Nok,
195 }
196}
197
198/// At a non-label
199///
200/// ```markdown
201/// > | []
202/// ^
203/// ```
204pub fn label_nok(tokenizer: &mut Tokenizer) -> State {
205 tokenizer.tokenize_state.token_1 = Name::Data;
206 tokenizer.tokenize_state.token_2 = Name::Data;
207 tokenizer.tokenize_state.token_3 = Name::Data;
208 State::Nok
209}
210
/// After marker.
///
/// ```markdown
/// > | [a]: b "c"
///        ^
/// ```
pub fn marker_after(tokenizer: &mut Tokenizer) -> State {
    // Whitespace (possibly spanning an eol) between the `:` marker and the
    // destination is optional, so both the ok and the nok branch continue at
    // the destination.
    tokenizer.attempt(
        State::Next(StateName::DefinitionDestinationBefore),
        State::Next(StateName::DefinitionDestinationBefore),
    );
    State::Retry(space_or_tab_eol(tokenizer))
}
224
225/// Before destination.
226///
227/// ```markdown
228/// > | [a]: b "c"
229/// ^
230/// ```
231pub fn destination_before(tokenizer: &mut Tokenizer) -> State {
232 tokenizer.tokenize_state.token_1 = Name::DefinitionDestination;
233 tokenizer.tokenize_state.token_2 = Name::DefinitionDestinationLiteral;
234 tokenizer.tokenize_state.token_3 = Name::DefinitionDestinationLiteralMarker;
235 tokenizer.tokenize_state.token_4 = Name::DefinitionDestinationRaw;
236 tokenizer.tokenize_state.token_5 = Name::DefinitionDestinationString;
237 tokenizer.tokenize_state.size_b = usize::MAX;
238 tokenizer.attempt(
239 State::Next(StateName::DefinitionDestinationAfter),
240 State::Next(StateName::DefinitionDestinationMissing),
241 );
242 State::Retry(StateName::DestinationStart)
243}
244
245/// After destination.
246///
247/// ```markdown
248/// > | [a]: b "c"
249/// ^
250/// ```
251pub fn destination_after(tokenizer: &mut Tokenizer) -> State {
252 tokenizer.tokenize_state.token_1 = Name::Data;
253 tokenizer.tokenize_state.token_2 = Name::Data;
254 tokenizer.tokenize_state.token_3 = Name::Data;
255 tokenizer.tokenize_state.token_4 = Name::Data;
256 tokenizer.tokenize_state.token_5 = Name::Data;
257 tokenizer.tokenize_state.size_b = 0;
258 tokenizer.attempt(
259 State::Next(StateName::DefinitionAfter),
260 State::Next(StateName::DefinitionAfter),
261 );
262 State::Retry(StateName::DefinitionTitleBefore)
263}
264
265/// Without destination.
266pub fn destination_missing(tokenizer: &mut Tokenizer) -> State {
267 tokenizer.tokenize_state.token_1 = Name::Data;
268 tokenizer.tokenize_state.token_2 = Name::Data;
269 tokenizer.tokenize_state.token_3 = Name::Data;
270 tokenizer.tokenize_state.token_4 = Name::Data;
271 tokenizer.tokenize_state.token_5 = Name::Data;
272 tokenizer.tokenize_state.size_b = 0;
273 tokenizer.tokenize_state.end = 0;
274 State::Nok
275}
276
277/// After definition.
278///
279/// ```markdown
280/// > | [a]: b
281/// ^
282/// > | [a]: b "c"
283/// ^
284/// ```
285pub fn after(tokenizer: &mut Tokenizer) -> State {
286 if matches!(tokenizer.current, Some(b'\t' | b' ')) {
287 tokenizer.attempt(
288 State::Next(StateName::DefinitionAfterWhitespace),
289 State::Nok,
290 );
291 State::Retry(space_or_tab(tokenizer))
292 } else {
293 State::Retry(StateName::DefinitionAfterWhitespace)
294 }
295}
296
/// After definition, after optional whitespace.
///
/// ```markdown
/// > | [a]: b
///           ^
/// > | [a]: b "c"
///               ^
/// ```
pub fn after_whitespace(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        // Like other flow constructs, a definition must end at an eol or eof.
        None | Some(b'\n') => {
            tokenizer.exit(Name::Definition);

            // Note: we don’t care about uniqueness.
            // It’s likely that that doesn’t happen very frequently.
            // It is more likely that it wastes precious time.
            tokenizer.tokenize_state.definitions.push(
                // Note: we don’t care about virtual spaces, so `as_str` is fine.
                // `tokenize_state.end` is the index of the
                // `DefinitionLabelString` exit event recorded in `label_after`;
                // slice those bytes and normalize them into the identifier
                // that references are matched against.
                normalize_identifier(
                    Slice::from_position(
                        tokenizer.parse_state.bytes,
                        &Position::from_exit_event(&tokenizer.events, tokenizer.tokenize_state.end),
                    )
                    .as_str(),
                ),
            );

            tokenizer.tokenize_state.end = 0;

            // You’d be interrupting.
            tokenizer.interrupt = true;
            State::Ok
        }
        _ => {
            // Something else on the line: not a definition; clear the
            // remembered label position before bailing.
            tokenizer.tokenize_state.end = 0;
            State::Nok
        }
    }
}
336
337/// After destination, at whitespace.
338///
339/// ```markdown
340/// > | [a]: b
341/// ^
342/// > | [a]: b "c"
343/// ^
344/// ```
345pub fn title_before(tokenizer: &mut Tokenizer) -> State {
346 if matches!(tokenizer.current, Some(b'\t' | b'\n' | b' ')) {
347 tokenizer.attempt(
348 State::Next(StateName::DefinitionTitleBeforeMarker),
349 State::Nok,
350 );
351 State::Retry(space_or_tab_eol(tokenizer))
352 } else {
353 State::Nok
354 }
355}
356
357/// At title.
358///
359/// ```markdown
360/// | [a]: b
361/// > | "c"
362/// ^
363/// ```
364pub fn title_before_marker(tokenizer: &mut Tokenizer) -> State {
365 tokenizer.tokenize_state.token_1 = Name::DefinitionTitle;
366 tokenizer.tokenize_state.token_2 = Name::DefinitionTitleMarker;
367 tokenizer.tokenize_state.token_3 = Name::DefinitionTitleString;
368 tokenizer.attempt(State::Next(StateName::DefinitionTitleAfter), State::Nok);
369 State::Retry(StateName::TitleStart)
370}
371
372/// After title.
373///
374/// ```markdown
375/// > | [a]: b "c"
376/// ^
377/// ```
378pub fn title_after(tokenizer: &mut Tokenizer) -> State {
379 tokenizer.tokenize_state.token_1 = Name::Data;
380 tokenizer.tokenize_state.token_2 = Name::Data;
381 tokenizer.tokenize_state.token_3 = Name::Data;
382 if matches!(tokenizer.current, Some(b'\t' | b' ')) {
383 tokenizer.attempt(
384 State::Next(StateName::DefinitionTitleAfterOptionalWhitespace),
385 State::Nok,
386 );
387 State::Retry(space_or_tab(tokenizer))
388 } else {
389 State::Retry(StateName::DefinitionTitleAfterOptionalWhitespace)
390 }
391}
392
393/// After title, after optional whitespace.
394///
395/// ```markdown
396/// > | [a]: b "c"
397/// ^
398/// ```
399pub fn title_after_optional_whitespace(tokenizer: &mut Tokenizer) -> State {
400 match tokenizer.current {
401 None | Some(b'\n') => State::Ok,
402 _ => State::Nok,
403 }
404}