// Markdown parser fork with extended syntax for personal use.
//! List item occurs in the [document][] content type.
//!
//! ## Grammar
//!
//! List item forms with the following BNF
//! (<small>see [construct][crate::construct] for character groups</small>):
//!
//! ```bnf
//! ; Restriction: if there is no space after the marker, the start must be followed by an `eol`.
//! ; Restriction: if the first line after the marker is not blank and starts with `5(space_or_tab)`,
//! ; only the first `space_or_tab` is part of the start.
//! list_item_start ::= '*' | '+' | '-' | 1*9(ascii_decimal) ('.' | ')') [1*4 space_or_tab]
//!
//! ; Restriction: blank line allowed, except when this is the first continuation after a blank start.
//! ; Restriction: if not blank, the line must be indented, exactly `n` times.
//! list_item_cont ::= [n(space_or_tab)]
//! ```
//!
//! Further lines that are not prefixed with `list_item_cont` cause the list
//! item to be exited, except when those lines are lazy continuation or blank.
//! Like so many things in markdown, list items too are complex.
//! See [*§ Phase 1: block structure* in `CommonMark`][commonmark_block] for
//! more on parsing details.
//!
//! As list item is a container, it takes several bytes from the start of the
//! line, while the rest of the line includes more containers or flow.
//!
//! ## HTML
//!
//! List item relates to the `<li>`, `<ol>`, and `<ul>` elements in HTML.
//! See [*§ 4.4.8 The `li` element*][html_li],
//! [*§ 4.4.5 The `ol` element*][html_ol], and
//! [*§ 4.4.7 The `ul` element*][html_ul] in the HTML spec for more info.
//!
//! ## Recommendation
//!
//! Use a single space after a marker.
//! Never use lazy continuation.
//!
//! ## Tokens
//!
//! * [`ListItem`][Name::ListItem]
//! * [`ListItemMarker`][Name::ListItemMarker]
//! * [`ListItemPrefix`][Name::ListItemPrefix]
//! * [`ListItemValue`][Name::ListItemValue]
//! * [`ListOrdered`][Name::ListOrdered]
//! * [`ListUnordered`][Name::ListUnordered]
//!
//! ## References
//!
//! * [`list.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/list.js)
//! * [*§ 5.2 List items* in `CommonMark`](https://spec.commonmark.org/0.31/#list-items)
//! * [*§ 5.3 Lists* in `CommonMark`](https://spec.commonmark.org/0.31/#lists)
//!
//! [document]: crate::construct::document
//! [html_li]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-li-element
//! [html_ol]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ol-element
//! [html_ul]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ul-element
//! [commonmark_block]: https://spec.commonmark.org/0.31/#phase-1-block-structure

use crate::construct::partial_space_or_tab::space_or_tab_min_max;
use crate::event::{Kind, Name};
use crate::resolve::Name as ResolveName;
use crate::state::{Name as StateName, State};
use crate::subtokenize::Subresult;
use crate::tokenizer::Tokenizer;
use crate::util::{
    constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE},
    skip,
    slice::{Position, Slice},
};
use alloc::{vec, vec::Vec};

/// Start of list item.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    if tokenizer.parse_state.options.constructs.list_item {
        tokenizer.enter(Name::ListItem);

        // Optional whitespace before the marker.
        if matches!(tokenizer.current, Some(b'\t' | b' ')) {
            tokenizer.attempt(State::Next(StateName::ListItemBefore), State::Nok);
            // When indented code is on, allow at most `TAB_SIZE - 1` (3)
            // columns of initial whitespace — 4+ would be indented code.
            // Otherwise there is no limit.
            State::Retry(space_or_tab_min_max(
                tokenizer,
                0,
                if tokenizer.parse_state.options.constructs.code_indented {
                    TAB_SIZE - 1
                } else {
                    usize::MAX
                },
            ))
        } else {
            State::Retry(StateName::ListItemBefore)
        }
    } else {
        // Construct is turned off.
        State::Nok
    }
}

/// After optional whitespace, at list item prefix.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn before(tokenizer: &mut Tokenizer) -> State {
    // Unordered.
    if matches!(tokenizer.current, Some(b'*' | b'-')) {
        // `*` and `-` can also start a thematic break, which takes
        // precedence: check for one first; if the line is a thematic
        // break, this is not a list item (`Nok`).
        tokenizer.check(State::Nok, State::Next(StateName::ListItemBeforeUnordered));
        State::Retry(StateName::ThematicBreakStart)
    } else if tokenizer.current == Some(b'+') {
        State::Retry(StateName::ListItemBeforeUnordered)
    }
    // Ordered.
    else if tokenizer.current == Some(b'1')
        || (matches!(tokenizer.current, Some(b'0'..=b'9')) && !tokenizer.interrupt)
    {
        // Any digit may start a value, but when interrupting (say, a
        // paragraph) the first digit must be `1`.
        State::Retry(StateName::ListItemBeforeOrdered)
    } else {
        State::Nok
    }
}

/// At unordered list item marker.
///
/// The line is not a thematic break.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn before_unordered(tokenizer: &mut Tokenizer) -> State {
    tokenizer.enter(Name::ListItemPrefix);
    State::Retry(StateName::ListItemMarker)
}

/// At ordered list item value.
///
/// ```markdown
/// > | 1. a
///     ^
/// ```
pub fn before_ordered(tokenizer: &mut Tokenizer) -> State {
    tokenizer.enter(Name::ListItemPrefix);
    tokenizer.enter(Name::ListItemValue);
    State::Retry(StateName::ListItemValue)
}

/// In ordered list item value.
///
/// ```markdown
/// > | 1. a
///     ^
/// ```
pub fn value(tokenizer: &mut Tokenizer) -> State {
    // `.` or `)` ends the value; when interrupting, only a one-digit value
    // is allowed (combined with `before`, that digit is `1`).
    if matches!(tokenizer.current, Some(b'.' | b')'))
        && (!tokenizer.interrupt || tokenizer.tokenize_state.size < 2)
    {
        tokenizer.exit(Name::ListItemValue);
        State::Retry(StateName::ListItemMarker)
    } else if matches!(tokenizer.current, Some(b'0'..=b'9'))
        && tokenizer.tokenize_state.size + 1 < LIST_ITEM_VALUE_SIZE_MAX
    {
        // Another digit, within the size limit: keep counting.
        tokenizer.tokenize_state.size += 1;
        tokenizer.consume();
        State::Next(StateName::ListItemValue)
    } else {
        // Not a valid value: reset shared state before bailing.
        tokenizer.tokenize_state.size = 0;
        State::Nok
    }
}

/// At list item marker.
///
/// ```markdown
/// > | * a
///     ^
/// > | 1. b
///      ^
/// ```
pub fn marker(tokenizer: &mut Tokenizer) -> State {
    tokenizer.enter(Name::ListItemMarker);
    tokenizer.consume();
    tokenizer.exit(Name::ListItemMarker);
    State::Next(StateName::ListItemMarkerAfter)
}

/// After list item marker.
///
/// ```markdown
/// > | * a
///      ^
/// > | 1. b
///       ^
/// ```
pub fn marker_after(tokenizer: &mut Tokenizer) -> State {
    // `size == 1` is used as a flag meaning “the start was blank”; it is
    // read back (and reset) in `after`.
    tokenizer.tokenize_state.size = 1;
    tokenizer.check(
        State::Next(StateName::ListItemAfter),
        State::Next(StateName::ListItemMarkerAfterFilled),
    );
    State::Retry(StateName::BlankLineStart)
}

/// After list item marker.
///
/// The marker is not followed by a blank line.
///
/// ```markdown
/// > | * a
///      ^
/// ```
pub fn marker_after_filled(tokenizer: &mut Tokenizer) -> State {
    // Not blank: clear the flag set in `marker_after`.
    tokenizer.tokenize_state.size = 0;

    // Attempt to parse up to the largest allowed indent, `nok` if there is more whitespace.
    tokenizer.attempt(
        State::Next(StateName::ListItemAfter),
        State::Next(StateName::ListItemPrefixOther),
    );
    State::Retry(StateName::ListItemWhitespace)
}

/// After marker, at whitespace.
///
/// ```markdown
/// > | * a
///      ^
/// ```
pub fn whitespace(tokenizer: &mut Tokenizer) -> State {
    tokenizer.attempt(State::Next(StateName::ListItemWhitespaceAfter), State::Nok);
    // Between one space/tab and one tab stop (`TAB_SIZE`) of whitespace.
    State::Retry(space_or_tab_min_max(tokenizer, 1, TAB_SIZE))
}

/// After acceptable whitespace.
///
/// ```markdown
/// > | * a
///      ^
/// ```
pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
    // Still more whitespace after the maximum: the extra indent belongs to
    // the content, not the prefix — reject this attempt.
    if let Some(b'\t' | b' ') = tokenizer.current {
        State::Nok
    } else {
        State::Ok
    }
}

/// After marker, followed by no indent or more indent than needed.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn prefix_other(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(b'\t' | b' ') => {
            // Take exactly one space/tab as part of the prefix; the rest
            // of the whitespace belongs to the content.
            tokenizer.enter(Name::SpaceOrTab);
            tokenizer.consume();
            tokenizer.exit(Name::SpaceOrTab);
            State::Next(StateName::ListItemAfter)
        }
        _ => State::Nok,
    }
}

/// After list item prefix.
///
/// ```markdown
/// > | * a
///       ^
/// ```
pub fn after(tokenizer: &mut Tokenizer) -> State {
    // `size == 1` was set in `marker_after` when the start was blank.
    let blank = tokenizer.tokenize_state.size == 1;
    tokenizer.tokenize_state.size = 0;

    if blank && tokenizer.interrupt {
        // An item with a blank start cannot interrupt (say, a paragraph).
        State::Nok
    } else {
        // Measure the whole prefix, from the `ListItem` enter event to the
        // current point: continuation lines must be indented this far.
        let start = skip::to_back(
            &tokenizer.events,
            tokenizer.events.len() - 1,
            &[Name::ListItem],
        );
        let mut prefix = Slice::from_position(
            tokenizer.parse_state.bytes,
            &Position {
                start: &tokenizer.events[start].point,
                end: &tokenizer.point,
            },
        )
        .len();

        if blank {
            // NOTE(review): blank starts count one extra column toward the
            // prefix — presumably standing in for the missing space after
            // the marker; confirm against `cont_*` expectations.
            prefix += 1;
        }

        // Record size/blankness on the current container so document
        // parsing can check continuation lines against it.
        let container = &mut tokenizer.tokenize_state.document_container_stack
            [tokenizer.tokenize_state.document_continued];

        container.blank_initial = blank;
        container.size = prefix;

        tokenizer.exit(Name::ListItemPrefix);
        tokenizer.register_resolver_before(ResolveName::ListItem);
        State::Ok
    }
}

/// Start of list item continuation.
///
/// ```markdown
///   | * a
/// > |   b
///     ^
/// ```
pub fn cont_start(tokenizer: &mut Tokenizer) -> State {
    // Branch on whether the continuation line is blank.
    tokenizer.check(
        State::Next(StateName::ListItemContBlank),
        State::Next(StateName::ListItemContFilled),
    );
    State::Retry(StateName::BlankLineStart)
}

/// Start of blank list item continuation.
///
/// ```markdown
///   | * a
/// > |
///     ^
///   |   b
/// ```
pub fn cont_blank(tokenizer: &mut Tokenizer) -> State {
    let container = &mut tokenizer.tokenize_state.document_container_stack
        [tokenizer.tokenize_state.document_continued];
    let size = container.size;

    if container.blank_initial {
        // A blank line directly after a blank start ends the item.
        State::Nok
    } else if matches!(tokenizer.current, Some(b'\t' | b' ')) {
        // Consume, optionally, at most `size`.
        State::Retry(space_or_tab_min_max(tokenizer, 0, size))
    } else {
        State::Ok
    }
}

/// Start of non-blank list item continuation.
///
/// ```markdown
///   | * a
/// > |   b
///     ^
/// ```
pub fn cont_filled(tokenizer: &mut Tokenizer) -> State {
    let container = &mut tokenizer.tokenize_state.document_container_stack
        [tokenizer.tokenize_state.document_continued];
    let size = container.size;

    // A non-blank line clears the “blank start” state.
    container.blank_initial = false;

    if matches!(tokenizer.current, Some(b'\t' | b' ')) {
        // Consume exactly `size`.
        State::Retry(space_or_tab_min_max(tokenizer, size, size))
    } else {
        State::Nok
    }
}

/// Find adjacent list items with the same marker.
pub fn resolve(tokenizer: &mut Tokenizer) -> Option<Subresult> {
    // Tuples of `(marker byte, balance/depth, enter index, exit index)`.
    // `lists_wip` is a stack of lists still open for extension; `lists` is
    // the finished set.
    let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![];
    let mut lists: Vec<(u8, usize, usize, usize)> = vec![];
    let mut index = 0;
    let mut balance = 0;

    // Merge list items.
    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.name == Name::ListItem {
            if event.kind == Kind::Enter {
                let end = skip::opt(&tokenizer.events, index, &[Name::ListItem]) - 1;
                let marker = skip::to(&tokenizer.events, index, &[Name::ListItemMarker]);
                // Guaranteed to be a valid ASCII byte.
                let marker = tokenizer.parse_state.bytes[tokenizer.events[marker].point.index];
                let current = (marker, balance, index, end);

                let mut list_index = lists_wip.len();
                let mut matched = false;

                // Walk the stack looking for an open list this item extends:
                // same marker, same depth, and only inter-item events
                // (whitespace, line endings, block quote prefixes) between
                // that list's end and this item's start.
                while list_index > 0 {
                    list_index -= 1;
                    let previous = &lists_wip[list_index];
                    let before = skip::opt(
                        &tokenizer.events,
                        previous.3 + 1,
                        &[
                            Name::SpaceOrTab,
                            Name::LineEnding,
                            Name::BlankLineEnding,
                            Name::BlockQuotePrefix,
                        ],
                    );

                    if previous.0 == current.0 && previous.1 == current.1 && before == current.2 {
                        // Extend the matched list to this item's end, and
                        // finish everything that was stacked above it.
                        let previous_mut = &mut lists_wip[list_index];
                        previous_mut.3 = current.3;
                        lists.append(&mut lists_wip.split_off(list_index + 1));
                        matched = true;
                        break;
                    }
                }

                if !matched {
                    let mut index = lists_wip.len();
                    let mut exit = None;

                    while index > 0 {
                        index -= 1;

                        // If the current (new) item starts after where this
                        // item on the stack ends, we can remove it from the
                        // stack.
                        if current.2 > lists_wip[index].3 {
                            exit = Some(index);
                        } else {
                            break;
                        }
                    }

                    if let Some(exit) = exit {
                        lists.append(&mut lists_wip.split_off(exit));
                    }

                    // Start a new open list for this item.
                    lists_wip.push(current);
                }

                balance += 1;
            } else {
                balance -= 1;
            }
        }

        index += 1;
    }

    // Whatever is still open at the end is also done.
    lists.append(&mut lists_wip);

    // Inject events: wrap each merged run of items in a `ListOrdered` or
    // `ListUnordered` pair, chosen by the marker byte (`.`/`)` → ordered).
    let mut index = 0;
    while index < lists.len() {
        let list_item = &lists[index];
        let mut list_start = tokenizer.events[list_item.2].clone();
        let mut list_end = tokenizer.events[list_item.3].clone();
        let name = match list_item.0 {
            b'.' | b')' => Name::ListOrdered,
            _ => Name::ListUnordered,
        };
        list_start.name = name.clone();
        list_end.name = name;

        tokenizer.map.add(list_item.2, 0, vec![list_start]);
        tokenizer.map.add(list_item.3 + 1, 0, vec![list_end]);

        index += 1;
    }

    tokenizer.map.consume(&mut tokenizer.events);
    None
}