// Markdown parser fork with extended syntax for personal use.
//! List item occurs in the [document][] content type.
//!
//! ## Grammar
//!
//! List item forms with the following BNF
//! (<small>see [construct][crate::construct] for character groups</small>):
//!
//! ```bnf
//! ; Restriction: if there is no space after the marker, the start must be followed by an `eol`.
//! ; Restriction: if the first line after the marker is not blank and starts with `5(space_or_tab)`,
//! ; only the first `space_or_tab` is part of the start.
//! list_item_start ::= '*' | '+' | '-' | 1*9(ascii_decimal) ('.' | ')') [1*4 space_or_tab]
//!
//! ; Restriction: blank line allowed, except when this is the first continuation after a blank start.
//! ; Restriction: if not blank, the line must be indented, exactly `n` times.
//! list_item_cont ::= [n(space_or_tab)]
//! ```
//!
//! Further lines that are not prefixed with `list_item_cont` cause the list
//! item to be exited, except when those lines are lazy continuation or blank.
//! Like so many things in markdown, list items too are complex.
//! See [*§ Phase 1: block structure* in `CommonMark`][commonmark_block] for
//! more on parsing details.
//!
//! As list item is a container, it takes several bytes from the start of the
//! line, while the rest of the line includes more containers or flow.
//!
//! ## HTML
//!
//! List item relates to the `<li>`, `<ol>`, and `<ul>` elements in HTML.
//! See [*§ 4.4.8 The `li` element*][html_li],
//! [*§ 4.4.5 The `ol` element*][html_ol], and
//! [*§ 4.4.7 The `ul` element*][html_ul] in the HTML spec for more info.
//!
//! ## Recommendation
//!
//! Use a single space after a marker.
//! Never use lazy continuation.
//!
//! ## Tokens
//!
//! * [`ListItem`][Name::ListItem]
//! * [`ListItemMarker`][Name::ListItemMarker]
//! * [`ListItemPrefix`][Name::ListItemPrefix]
//! * [`ListItemValue`][Name::ListItemValue]
//! * [`ListOrdered`][Name::ListOrdered]
//! * [`ListUnordered`][Name::ListUnordered]
//!
//! ## References
//!
//! * [`list.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/list.js)
//! * [*§ 5.2 List items* in `CommonMark`](https://spec.commonmark.org/0.31/#list-items)
//! * [*§ 5.3 Lists* in `CommonMark`](https://spec.commonmark.org/0.31/#lists)
//!
//! [document]: crate::construct::document
//! [html_li]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-li-element
//! [html_ol]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ol-element
//! [html_ul]: https://html.spec.whatwg.org/multipage/grouping-content.html#the-ul-element
//! [commonmark_block]: https://spec.commonmark.org/0.31/#phase-1-block-structure

use crate::construct::partial_space_or_tab::space_or_tab_min_max;
use crate::event::{Kind, Name};
use crate::resolve::Name as ResolveName;
use crate::state::{Name as StateName, State};
use crate::subtokenize::Subresult;
use crate::tokenizer::Tokenizer;
use crate::util::{
    constant::{LIST_ITEM_VALUE_SIZE_MAX, TAB_SIZE},
    skip,
    slice::{Position, Slice},
};
use alloc::{vec, vec::Vec};

/// Start of list item.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn start(tokenizer: &mut Tokenizer) -> State {
    if tokenizer.parse_state.options.constructs.list_item {
        tokenizer.enter(Name::ListItem);

        // Optional whitespace before the marker.
        if matches!(tokenizer.current, Some(b'\t' | b' ')) {
            tokenizer.attempt(State::Next(StateName::ListItemBefore), State::Nok);
            // When indented code is on, allow at most `TAB_SIZE - 1` (3)
            // columns of initial whitespace — 4+ would be indented code.
            // Otherwise there is no limit.
            State::Retry(space_or_tab_min_max(
                tokenizer,
                0,
                if tokenizer.parse_state.options.constructs.code_indented {
                    TAB_SIZE - 1
                } else {
                    usize::MAX
                },
            ))
        } else {
            State::Retry(StateName::ListItemBefore)
        }
    } else {
        // Construct is turned off.
        State::Nok
    }
}

/// After optional whitespace, at list item prefix.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn before(tokenizer: &mut Tokenizer) -> State {
    // Unordered.
    if matches!(tokenizer.current, Some(b'*' | b'-')) {
        // `*` and `-` can also start a thematic break, which takes
        // precedence: check for one first; if the line is a thematic
        // break, this is not a list item (`Nok`).
        tokenizer.check(State::Nok, State::Next(StateName::ListItemBeforeUnordered));
        State::Retry(StateName::ThematicBreakStart)
    } else if tokenizer.current == Some(b'+') {
        State::Retry(StateName::ListItemBeforeUnordered)
    }
    // Ordered.
    else if tokenizer.current == Some(b'1')
        || (matches!(tokenizer.current, Some(b'0'..=b'9')) && !tokenizer.interrupt)
    {
        // Any digit may start a value, but when interrupting (say, a
        // paragraph) the first digit must be `1`.
        State::Retry(StateName::ListItemBeforeOrdered)
    } else {
        State::Nok
    }
}

/// At unordered list item marker.
///
/// The line is not a thematic break.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn before_unordered(tokenizer: &mut Tokenizer) -> State {
    tokenizer.enter(Name::ListItemPrefix);
    State::Retry(StateName::ListItemMarker)
}

/// At ordered list item value.
///
/// ```markdown
/// > | 1. a
///     ^
/// ```
pub fn before_ordered(tokenizer: &mut Tokenizer) -> State {
    tokenizer.enter(Name::ListItemPrefix);
    tokenizer.enter(Name::ListItemValue);
    State::Retry(StateName::ListItemValue)
}

/// In ordered list item value.
///
/// ```markdown
/// > | 1. a
///     ^
/// ```
pub fn value(tokenizer: &mut Tokenizer) -> State {
    // `.` or `)` ends the value; when interrupting, only a one-digit value
    // is allowed (combined with `before`, that digit is `1`).
    if matches!(tokenizer.current, Some(b'.' | b')'))
        && (!tokenizer.interrupt || tokenizer.tokenize_state.size < 2)
    {
        tokenizer.exit(Name::ListItemValue);
        State::Retry(StateName::ListItemMarker)
    } else if matches!(tokenizer.current, Some(b'0'..=b'9'))
        && tokenizer.tokenize_state.size + 1 < LIST_ITEM_VALUE_SIZE_MAX
    {
        // Another digit, within the size limit: keep counting.
        tokenizer.tokenize_state.size += 1;
        tokenizer.consume();
        State::Next(StateName::ListItemValue)
    } else {
        // Not a valid value: reset shared state before bailing.
        tokenizer.tokenize_state.size = 0;
        State::Nok
    }
}

/// At list item marker.
///
/// ```markdown
/// > | * a
///     ^
/// > | 1. b
///      ^
/// ```
pub fn marker(tokenizer: &mut Tokenizer) -> State {
    tokenizer.enter(Name::ListItemMarker);
    tokenizer.consume();
    tokenizer.exit(Name::ListItemMarker);
    State::Next(StateName::ListItemMarkerAfter)
}

/// After list item marker.
///
/// ```markdown
/// > | * a
///      ^
/// > | 1. b
///       ^
/// ```
pub fn marker_after(tokenizer: &mut Tokenizer) -> State {
    // `size == 1` is used as a flag meaning “the start was blank”; it is
    // read back (and reset) in `after`.
    tokenizer.tokenize_state.size = 1;
    tokenizer.check(
        State::Next(StateName::ListItemAfter),
        State::Next(StateName::ListItemMarkerAfterFilled),
    );
    State::Retry(StateName::BlankLineStart)
}

/// After list item marker.
///
/// The marker is not followed by a blank line.
///
/// ```markdown
/// > | * a
///      ^
/// ```
pub fn marker_after_filled(tokenizer: &mut Tokenizer) -> State {
    // Not blank: clear the flag set in `marker_after`.
    tokenizer.tokenize_state.size = 0;

    // Attempt to parse up to the largest allowed indent, `nok` if there is more whitespace.
    tokenizer.attempt(
        State::Next(StateName::ListItemAfter),
        State::Next(StateName::ListItemPrefixOther),
    );
    State::Retry(StateName::ListItemWhitespace)
}

/// After marker, at whitespace.
///
/// ```markdown
/// > | * a
///      ^
/// ```
pub fn whitespace(tokenizer: &mut Tokenizer) -> State {
    tokenizer.attempt(State::Next(StateName::ListItemWhitespaceAfter), State::Nok);
    // Between one space/tab and one tab stop (`TAB_SIZE`) of whitespace.
    State::Retry(space_or_tab_min_max(tokenizer, 1, TAB_SIZE))
}

/// After acceptable whitespace.
///
/// ```markdown
/// > | * a
///      ^
/// ```
pub fn whitespace_after(tokenizer: &mut Tokenizer) -> State {
    // Still more whitespace after the maximum: the extra indent belongs to
    // the content, not the prefix — reject this attempt.
    if let Some(b'\t' | b' ') = tokenizer.current {
        State::Nok
    } else {
        State::Ok
    }
}

/// After marker, followed by no indent or more indent than needed.
///
/// ```markdown
/// > | * a
///     ^
/// ```
pub fn prefix_other(tokenizer: &mut Tokenizer) -> State {
    match tokenizer.current {
        Some(b'\t' | b' ') => {
            // Take exactly one space/tab as part of the prefix; the rest
            // of the whitespace belongs to the content.
            tokenizer.enter(Name::SpaceOrTab);
            tokenizer.consume();
            tokenizer.exit(Name::SpaceOrTab);
            State::Next(StateName::ListItemAfter)
        }
        _ => State::Nok,
    }
}

/// After list item prefix.
///
/// ```markdown
/// > | * a
///       ^
/// ```
pub fn after(tokenizer: &mut Tokenizer) -> State {
    // `size == 1` was set in `marker_after` when the start was blank.
    let blank = tokenizer.tokenize_state.size == 1;
    tokenizer.tokenize_state.size = 0;

    if blank && tokenizer.interrupt {
        // An item with a blank start cannot interrupt (say, a paragraph).
        State::Nok
    } else {
        // Measure the whole prefix, from the `ListItem` enter event to the
        // current point: continuation lines must be indented this far.
        let start = skip::to_back(
            &tokenizer.events,
            tokenizer.events.len() - 1,
            &[Name::ListItem],
        );
        let mut prefix = Slice::from_position(
            tokenizer.parse_state.bytes,
            &Position {
                start: &tokenizer.events[start].point,
                end: &tokenizer.point,
            },
        )
        .len();

        if blank {
            // NOTE(review): blank starts count one extra column toward the
            // prefix — presumably standing in for the missing space after
            // the marker; confirm against `cont_*` expectations.
            prefix += 1;
        }

        // Record size/blankness on the current container so document
        // parsing can check continuation lines against it.
        let container = &mut tokenizer.tokenize_state.document_container_stack
            [tokenizer.tokenize_state.document_continued];

        container.blank_initial = blank;
        container.size = prefix;

        tokenizer.exit(Name::ListItemPrefix);
        tokenizer.register_resolver_before(ResolveName::ListItem);
        State::Ok
    }
}

/// Start of list item continuation.
///
/// ```markdown
///   | * a
/// > |   b
///     ^
/// ```
pub fn cont_start(tokenizer: &mut Tokenizer) -> State {
    // Branch on whether the continuation line is blank.
    tokenizer.check(
        State::Next(StateName::ListItemContBlank),
        State::Next(StateName::ListItemContFilled),
    );
    State::Retry(StateName::BlankLineStart)
}

/// Start of blank list item continuation.
///
/// ```markdown
///   | * a
/// > |
///     ^
///   |   b
/// ```
pub fn cont_blank(tokenizer: &mut Tokenizer) -> State {
    let container = &mut tokenizer.tokenize_state.document_container_stack
        [tokenizer.tokenize_state.document_continued];
    let size = container.size;

    if container.blank_initial {
        // A blank line directly after a blank start ends the item.
        State::Nok
    } else if matches!(tokenizer.current, Some(b'\t' | b' ')) {
        // Consume, optionally, at most `size`.
        State::Retry(space_or_tab_min_max(tokenizer, 0, size))
    } else {
        State::Ok
    }
}

/// Start of non-blank list item continuation.
///
/// ```markdown
///   | * a
/// > |   b
///     ^
/// ```
pub fn cont_filled(tokenizer: &mut Tokenizer) -> State {
    let container = &mut tokenizer.tokenize_state.document_container_stack
        [tokenizer.tokenize_state.document_continued];
    let size = container.size;

    // A non-blank line clears the “blank start” state.
    container.blank_initial = false;

    if matches!(tokenizer.current, Some(b'\t' | b' ')) {
        // Consume exactly `size`.
        State::Retry(space_or_tab_min_max(tokenizer, size, size))
    } else {
        State::Nok
    }
}

/// Find adjacent list items with the same marker.
pub fn resolve(tokenizer: &mut Tokenizer) -> Option<Subresult> {
    // Tuples of `(marker byte, balance/depth, enter index, exit index)`.
    // `lists_wip` is a stack of lists still open for extension; `lists` is
    // the finished set.
    let mut lists_wip: Vec<(u8, usize, usize, usize)> = vec![];
    let mut lists: Vec<(u8, usize, usize, usize)> = vec![];
    let mut index = 0;
    let mut balance = 0;

    // Merge list items.
    while index < tokenizer.events.len() {
        let event = &tokenizer.events[index];

        if event.name == Name::ListItem {
            if event.kind == Kind::Enter {
                let end = skip::opt(&tokenizer.events, index, &[Name::ListItem]) - 1;
                let marker = skip::to(&tokenizer.events, index, &[Name::ListItemMarker]);
                // Guaranteed to be a valid ASCII byte.
                let marker = tokenizer.parse_state.bytes[tokenizer.events[marker].point.index];
                let current = (marker, balance, index, end);

                let mut list_index = lists_wip.len();
                let mut matched = false;

                // Walk the stack looking for an open list this item extends:
                // same marker, same depth, and only inter-item events
                // (whitespace, line endings, block quote prefixes) between
                // that list's end and this item's start.
                while list_index > 0 {
                    list_index -= 1;
                    let previous = &lists_wip[list_index];
                    let before = skip::opt(
                        &tokenizer.events,
                        previous.3 + 1,
                        &[
                            Name::SpaceOrTab,
                            Name::LineEnding,
                            Name::BlankLineEnding,
                            Name::BlockQuotePrefix,
                        ],
                    );

                    if previous.0 == current.0 && previous.1 == current.1 && before == current.2 {
                        // Extend the matched list to this item's end, and
                        // finish everything that was stacked above it.
                        let previous_mut = &mut lists_wip[list_index];
                        previous_mut.3 = current.3;
                        lists.append(&mut lists_wip.split_off(list_index + 1));
                        matched = true;
                        break;
                    }
                }

                if !matched {
                    let mut index = lists_wip.len();
                    let mut exit = None;

                    while index > 0 {
                        index -= 1;

                        // If the current (new) item starts after where this
                        // item on the stack ends, we can remove it from the
                        // stack.
                        if current.2 > lists_wip[index].3 {
                            exit = Some(index);
                        } else {
                            break;
                        }
                    }

                    if let Some(exit) = exit {
                        lists.append(&mut lists_wip.split_off(exit));
                    }

                    // Start a new open list for this item.
                    lists_wip.push(current);
                }

                balance += 1;
            } else {
                balance -= 1;
            }
        }

        index += 1;
    }

    // Whatever is still open at the end is also done.
    lists.append(&mut lists_wip);

    // Inject events: wrap each merged run of items in a `ListOrdered` or
    // `ListUnordered` pair, chosen by the marker byte (`.`/`)` → ordered).
    let mut index = 0;
    while index < lists.len() {
        let list_item = &lists[index];
        let mut list_start = tokenizer.events[list_item.2].clone();
        let mut list_end = tokenizer.events[list_item.3].clone();
        let name = match list_item.0 {
            b'.' | b')' => Name::ListOrdered,
            _ => Name::ListUnordered,
        };
        list_start.name = name.clone();
        list_end.name = name;

        tokenizer.map.add(list_item.2, 0, vec![list_start]);
        tokenizer.map.add(list_item.3 + 1, 0, vec![list_end]);

        index += 1;
    }

    tokenizer.map.consume(&mut tokenizer.events);
    None
}