src/construct/mod.rs at hack · crashkeys.dev/markdown-rs

crashkeys.dev / markdown-rs
fork atom
Markdown parser fork with extended syntax for personal use.
fork atom
markdown-rs / src / construct / mod.rs
at hack 200 lines 7.2 kB view raw
wrap content
crashkeys.dev wikilink: passing tests. 5mo ago
9745af7b
  1//! Constructs found in markdown.
  2//!
  3//! Constructs are grouped by content type.
  4//! Which content type is allowed somewhere, prescribes which constructs are
  5//! allowed there.
  6//!
  7//! ## Content type
  8//!
  9//! The following content types are found in markdown:
 10//!
 11//! * [document][]
 12//! * [flow][]
 13//! * [string][]
 14//! * [text][]
 15//!
 16//! Content types also have a *rest* thing: whatever remains after all other
 17//! constructs are parsed.
 18//! In document, that is [flow][].
 19//! In flow, that is [content][].
 20//! In string and text, that is [data][partial_data].
 21//!
 22//! ## Construct
 23//!
 24//! There are several *things* found when parsing markdown, such as, say, a
 25//! thematic break.
 26//! These things are called constructs here.
 27//!
 28//! Sometimes, there are several constructs that result in an equivalent thing.
 29//! For example, [code (fenced)][raw_flow] and
 30//! [code (indented)][code_indented] are considered different constructs.
 31//! Sometimes, constructs on their own don’t result in anything.
 32//! For example, a `*` is parsed as an attention sequence, but if no matching
 33//! sequence is found later, it’s turned back into plain data.
 34//!
 35//! The following constructs are found in markdown (`CommonMark`):
 36//!
 37//! * [attention][] (strong, emphasis, extension: GFM strikethrough)
 38//! * [autolink][]
 39//! * [blank line][blank_line]
 40//! * [block quote][block_quote]
 41//! * [character escape][character_escape]
 42//! * [character reference][character_reference]
 43//! * [code (indented)][code_indented]
 44//! * [content][]
 45//! * [definition][]
 46//! * [hard break (escape)][hard_break_escape]
 47//! * [heading (atx)][heading_atx]
 48//! * [heading (setext)][heading_setext]
 49//! * [html (flow)][html_flow]
 50//! * [html (text)][html_text]
 51//! * [label end][label_end]
 52//! * [label start (image)][label_start_image]
 53//! * [label start (link)][label_start_link]
 54//! * [list item][list_item]
 55//! * [paragraph][]
 56//! * [raw (flow)][raw_flow] (code (fenced), extensions: math (flow))
 57//! * [raw (text)][raw_text] (code (text), extensions: math (text))
 58//! * [thematic break][thematic_break]
 59//!
 60//! > 👉 **Note**: for performance reasons, hard break (trailing) is formed by
 61//! > [whitespace][partial_whitespace].
 62//!
 63//! The following constructs are extensions found in markdown:
 64//!
 65//! * [frontmatter][]
 66//! * [gfm autolink literal][gfm_autolink_literal]
 67//! * [gfm footnote definition][gfm_footnote_definition]
 68//! * [gfm label start footnote][gfm_label_start_footnote]
 69//! * [gfm table][gfm_table]
 70//! * [gfm task list item check][gfm_task_list_item_check]
 71//! * [mdx esm][mdx_esm]
 72//! * [mdx expression (flow)][mdx_expression_flow]
 73//! * [mdx expression (text)][mdx_expression_text]
 74//! * [mdx jsx (flow)][mdx_jsx_flow]
 75//! * [mdx jsx (text)][mdx_jsx_text]
 76//!
 77//! There are also several small subroutines typically used in different places:
 78//!
 79//! * [bom][partial_bom]
 80//! * [data][partial_data]
 81//! * [destination][partial_destination]
 82//! * [label][partial_label]
 83//! * [mdx expression][partial_mdx_expression]
 84//! * [mdx jsx][partial_mdx_jsx]
 85//! * [non lazy continuation][partial_non_lazy_continuation]
 86//! * [space or tab][partial_space_or_tab]
 87//! * [space or tab, eol][partial_space_or_tab_eol]
 88//! * [title][partial_title]
 89//! * [whitespace][partial_whitespace]
 90//!
 91//! ## Grammar
 92//!
 93//! Each construct maintained here is explained with a BNF diagram.
 94//!
 95//! Such diagrams are considered to be *non-normative*.
 96//! That is to say, they form illustrative, imperfect, but useful, examples.
 97//! The code, in Rust, is considered to be normative.
 98//!
 99//! The actual syntax of markdown can be described in Backus–Naur form (BNF) as:
100//!
101//! ```bnf
102//! markdown = .*
103//! ```
104//!
105//! No, that’s [not a typo][bnf]: markdown has no syntax errors; anything
106//! thrown at it renders *something*.
107//!
108//! These diagrams contain references to character groups as defined by Rust on
109//! for example [char][], but also often on [u8][], which is what `markdown-rs`
110//! typically works on.
111//! So, for example, `ascii_punctuation` refers to
112//! [`u8::is_ascii_punctuation`][u8::is_ascii_punctuation].
113//!
114//! For clarity, the productions used throughout are:
115//!
116//! ```bnf
117//! ; Rust / ASCII groups:
118//! ; 'a'..='z'
119//! ascii_lowercase ::= 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'g' | 'h' | 'i' | 'j' | 'k' | 'l' | 'm' | 'n' | 'o' | 'p' | 'q' | 'r' | 's' | 't' | 'u' | 'v' | 'w' | 'x' | 'y' | 'z'
120//! ; 'A'..='Z'
121//! ascii_uppercase ::= 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L' | 'M' | 'N' | 'O' | 'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X' | 'Y' | 'Z'
122//! ; 'A'..='Z', 'a'..='z'
123//! ascii_alphabetic ::= ascii_lowercase | ascii_uppercase
124//! ; '0'..='9'
125//! ascii_digit ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
126//! ; '0'..='9', 'A'..='F', 'a'..='f'
127//! ascii_hexdigit ::= ascii_digit | 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'
128//! ; '0'..='9', 'A'..='Z', 'a'..='z'
129//! ascii_alphanumeric ::= ascii_digit | ascii_alphabetic
130//! ; '!'..='/', ':'..='@', '['..='`', '{'..='~'
131//! ascii_punctuation ::= '!' | '"' | '#' | '$' | '%' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '[' | '\' | ']' | '^' | '_' | '`' | '{' | '|' | '}' | '~'
132//! ; 0x00..=0x1F, 0x7F
133//! ascii_control ::= 0x00 | 0x01 | 0x02 | 0x03 | 0x04 | 0x05 | 0x06 | 0x07 | 0x08 | 0x09 | 0x0A | 0x0B | 0x0C | 0x0D | 0x0E | 0x0F | 0x10 | 0x11 | 0x12 | 0x13 | 0x14 | 0x15 | 0x16 | 0x17 | 0x18 | 0x19 | 0x1A | 0x1B | 0x1C | 0x1D | 0x1E | 0x1F | 0x7F
134//!
135//! ; Markdown groups:
136//! ; Any byte (u8)
137//! byte ::= 0x00..=0xFF
138//! space_or_tab ::= '\t' | ' '
139//! eol ::= '\n' | '\r' | '\r\n'
140//! line ::= byte - eol
141//! text ::= line - space_or_tab
142//! space_or_tab_eol ::= 1*space_or_tab | *space_or_tab eol *space_or_tab
143//!
144//! ; Unicode groups:
145//! unicode_whitespace ::= ? ; See `char::is_whitespace`.
146//! unicode_punctuation ::= ? ; See `src/unicode.rs`.
147//! ```
148//!
149//! [bnf]: http://trevorjim.com/a-specification-for-markdown/
150
151pub mod attention;
152pub mod autolink;
153pub mod blank_line;
154pub mod block_quote;
155pub mod character_escape;
156pub mod character_reference;
157pub mod code_indented;
158pub mod content;
159pub mod definition;
160pub mod document;
161pub mod flow;
162pub mod frontmatter;
163pub mod gfm_autolink_literal;
164pub mod gfm_footnote_definition;
165pub mod gfm_label_start_footnote;
166pub mod gfm_table;
167pub mod gfm_task_list_item_check;
168pub mod hard_break_escape;
169pub mod heading_atx;
170pub mod heading_setext;
171pub mod html_flow;
172pub mod html_text;
173pub mod label_end;
174pub mod label_start_image;
175pub mod label_start_link;
176pub mod list_item;
177pub mod mdx_esm;
178pub mod mdx_expression_flow;
179pub mod mdx_expression_text;
180pub mod mdx_jsx_flow;
181pub mod mdx_jsx_text;
182pub mod paragraph;
183pub mod partial_bom;
184pub mod partial_data;
185pub mod partial_destination;
186pub mod partial_label;
187pub mod partial_mdx_expression;
188pub mod partial_mdx_jsx;
189pub mod partial_non_lazy_continuation;
190pub mod partial_space_or_tab;
191pub mod partial_space_or_tab_eol;
192pub mod partial_title;
193pub mod partial_whitespace;
194pub mod raw_flow;
195pub mod raw_text;
196pub mod string;
197pub mod text;
198pub mod thematic_break;
199pub mod wikilink;
200pub mod wikilink_label;