Markdown parser fork with extended syntax for personal use.
at hack 204 lines 7.4 kB view raw
1//! Character references occur in the [string][] and [text][] content types. 2//! 3//! ## Grammar 4//! 5//! Character references form with the following BNF 6//! (<small>see [construct][crate::construct] for character groups</small>): 7//! 8//! ```bnf 9//! character_reference ::= '&' (numeric | named) ';' 10//! 11//! numeric ::= '#' (hexadecimal | decimal) 12//! ; Note: Limit of `6` imposed, as all bigger numbers are invalid. 13//! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit) 14//! ; Note: Limit of `7` imposed, as all bigger numbers are invalid. 15//! decimal ::= 1*7(ascii_digit) 16//! ; Note: Limit of `31` imposed, for `CounterClockwiseContourIntegral`. 17//! ; Note: Limited to any known named character reference (see `constants.rs`) 18//! named ::= 1*31(ascii_alphanumeric) 19//! ``` 20//! 21//! Like much of markdown, there are no “invalid” character references. 22//! However, for security reasons, several numeric character references parse 23//! fine but are not rendered as their corresponding character. 24//! They are instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`). 25//! See [`decode_numeric`][decode_numeric] for more info. 26//! 27//! To escape ASCII punctuation characters, use the terser 28//! [character escape][character_escape] construct instead (as in, `\&`). 29//! 30//! Character references in markdown are not the same as character references 31//! in HTML. 32//! Notably, HTML allows several character references without a closing 33//! semicolon. 34//! See [*§ 13.2.5.72 Character reference state* in the HTML spec][html] for more info. 35//! 36//! Character references are parsed insensitive to casing. 37//! The casing of hexadecimal numeric character references has no effect. 38//! The casing of named character references does not matter when parsing, but 39//! does affect whether they match. 40//! Depending on the name, one or more cases are allowed, such as that `AMP` 41//! and `amp` are both allowed but other cases are not. 42//! See [`CHARACTER_REFERENCES`][character_references] for which 43//! names match. 44//! 45//! ## Recommendation 46//! 47//! If possible, use a character escape. 48//! Otherwise, use a character reference. 49//! 50//! ## Tokens 51//! 52//! * [`CharacterReference`][Name::CharacterReference] 53//! * [`CharacterReferenceMarker`][Name::CharacterReferenceMarker] 54//! * [`CharacterReferenceMarkerHexadecimal`][Name::CharacterReferenceMarkerHexadecimal] 55//! * [`CharacterReferenceMarkerNumeric`][Name::CharacterReferenceMarkerNumeric] 56//! * [`CharacterReferenceMarkerSemi`][Name::CharacterReferenceMarkerSemi] 57//! * [`CharacterReferenceValue`][Name::CharacterReferenceValue] 58//! 59//! ## References 60//! 61//! * [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js) 62//! * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.31/#entity-and-numeric-character-references) 63//! 64//! [string]: crate::construct::string 65//! [text]: crate::construct::text 66//! [character_escape]: crate::construct::character_reference 67//! [decode_numeric]: crate::util::character_reference::decode_numeric 68//! [character_references]: crate::util::constant::CHARACTER_REFERENCES 69//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state 70 71use crate::event::Name; 72use crate::state::{Name as StateName, State}; 73use crate::tokenizer::Tokenizer; 74use crate::util::{ 75 character_reference::{decode_named, value_max, value_test}, 76 slice::Slice, 77}; 78 79/// Start of character reference. 80/// 81/// ```markdown 82/// > | a&amp;b 83/// ^ 84/// > | a&#123;b 85/// ^ 86/// > | a&#x9;b 87/// ^ 88/// ``` 89pub fn start(tokenizer: &mut Tokenizer) -> State { 90 if tokenizer.parse_state.options.constructs.character_reference 91 && tokenizer.current == Some(b'&') 92 { 93 tokenizer.enter(Name::CharacterReference); 94 tokenizer.enter(Name::CharacterReferenceMarker); 95 tokenizer.consume(); 96 tokenizer.exit(Name::CharacterReferenceMarker); 97 State::Next(StateName::CharacterReferenceOpen) 98 } else { 99 State::Nok 100 } 101} 102 103/// After `&`, at `#` for numeric references or alphanumeric for named 104/// references. 105/// 106/// ```markdown 107/// > | a&amp;b 108/// ^ 109/// > | a&#123;b 110/// ^ 111/// > | a&#x9;b 112/// ^ 113/// ``` 114pub fn open(tokenizer: &mut Tokenizer) -> State { 115 if let Some(b'#') = tokenizer.current { 116 tokenizer.enter(Name::CharacterReferenceMarkerNumeric); 117 tokenizer.consume(); 118 tokenizer.exit(Name::CharacterReferenceMarkerNumeric); 119 State::Next(StateName::CharacterReferenceNumeric) 120 } else { 121 tokenizer.tokenize_state.marker = b'&'; 122 tokenizer.enter(Name::CharacterReferenceValue); 123 State::Retry(StateName::CharacterReferenceValue) 124 } 125} 126 127/// After `#`, at `x` for hexadecimals or digit for decimals. 128/// 129/// ```markdown 130/// > | a&#123;b 131/// ^ 132/// > | a&#x9;b 133/// ^ 134/// ``` 135pub fn numeric(tokenizer: &mut Tokenizer) -> State { 136 if let Some(b'x' | b'X') = tokenizer.current { 137 tokenizer.enter(Name::CharacterReferenceMarkerHexadecimal); 138 tokenizer.consume(); 139 tokenizer.exit(Name::CharacterReferenceMarkerHexadecimal); 140 tokenizer.enter(Name::CharacterReferenceValue); 141 tokenizer.tokenize_state.marker = b'x'; 142 State::Next(StateName::CharacterReferenceValue) 143 } else { 144 tokenizer.enter(Name::CharacterReferenceValue); 145 tokenizer.tokenize_state.marker = b'#'; 146 State::Retry(StateName::CharacterReferenceValue) 147 } 148} 149 150/// After markers (`&#x`, `&#`, or `&`), in value, before `;`. 151/// 152/// The character reference kind defines what and how many characters are 153/// allowed. 154/// 155/// ```markdown 156/// > | a&amp;b 157/// ^^^ 158/// > | a&#123;b 159/// ^^^ 160/// > | a&#x9;b 161/// ^ 162/// ``` 163pub fn value(tokenizer: &mut Tokenizer) -> State { 164 if matches!(tokenizer.current, Some(b';')) && tokenizer.tokenize_state.size > 0 { 165 // Named. 166 if tokenizer.tokenize_state.marker == b'&' { 167 // Guaranteed to be valid ASCII bytes. 168 let slice = Slice::from_indices( 169 tokenizer.parse_state.bytes, 170 tokenizer.point.index - tokenizer.tokenize_state.size, 171 tokenizer.point.index, 172 ); 173 174 if decode_named(slice.as_str(), true).is_none() { 175 tokenizer.tokenize_state.marker = 0; 176 tokenizer.tokenize_state.size = 0; 177 return State::Nok; 178 } 179 } 180 181 tokenizer.exit(Name::CharacterReferenceValue); 182 tokenizer.enter(Name::CharacterReferenceMarkerSemi); 183 tokenizer.consume(); 184 tokenizer.exit(Name::CharacterReferenceMarkerSemi); 185 tokenizer.exit(Name::CharacterReference); 186 tokenizer.tokenize_state.marker = 0; 187 tokenizer.tokenize_state.size = 0; 188 return State::Ok; 189 } 190 191 if let Some(byte) = tokenizer.current { 192 if tokenizer.tokenize_state.size < value_max(tokenizer.tokenize_state.marker) 193 && value_test(tokenizer.tokenize_state.marker)(&byte) 194 { 195 tokenizer.tokenize_state.size += 1; 196 tokenizer.consume(); 197 return State::Next(StateName::CharacterReferenceValue); 198 } 199 } 200 201 tokenizer.tokenize_state.marker = 0; 202 tokenizer.tokenize_state.size = 0; 203 State::Nok 204}