Markdown parser fork with extended syntax for personal use.
1//! Character references occur in the [string][] and [text][] content types.
2//!
3//! ## Grammar
4//!
5//! Character references form with the following BNF
6//! (<small>see [construct][crate::construct] for character groups</small>):
7//!
8//! ```bnf
9//! character_reference ::= '&' (numeric | named) ';'
10//!
11//! numeric ::= '#' (hexadecimal | decimal)
12//! ; Note: Limit of `6` imposed, as all bigger numbers are invalid.
13//! hexadecimal ::= ('x' | 'X') 1*6(ascii_hexdigit)
14//! ; Note: Limit of `7` imposed, as all bigger numbers are invalid.
15//! decimal ::= 1*7(ascii_digit)
16//! ; Note: Limit of `31` imposed, for `CounterClockwiseContourIntegral`.
17//! ; Note: Limited to any known named character reference (see `constants.rs`)
18//! named ::= 1*31(ascii_alphanumeric)
19//! ```
20//!
21//! Like much of markdown, there are no “invalid” character references.
22//! However, for security reasons, several numeric character references parse
23//! fine but are not rendered as their corresponding character.
24//! They are instead replaced by a U+FFFD REPLACEMENT CHARACTER (`�`).
25//! See [`decode_numeric`][decode_numeric] for more info.
26//!
27//! To escape ASCII punctuation characters, use the terser
28//! [character escape][character_escape] construct instead (as in, `\&`).
29//!
30//! Character references in markdown are not the same as character references
31//! in HTML.
32//! Notably, HTML allows several character references without a closing
33//! semicolon.
34//! See [*§ 13.2.5.72 Character reference state* in the HTML spec][html] for more info.
35//!
36//! Character references are parsed insensitive to casing.
37//! The casing of hexadecimal numeric character references has no effect.
38//! The casing of named character references does not matter when parsing, but
39//! does affect whether they match.
40//! Depending on the name, one or more cases are allowed, such as that `AMP`
41//! and `amp` are both allowed but other cases are not.
42//! See [`CHARACTER_REFERENCES`][character_references] for which
43//! names match.
44//!
45//! ## Recommendation
46//!
47//! If possible, use a character escape.
48//! Otherwise, use a character reference.
49//!
50//! ## Tokens
51//!
52//! * [`CharacterReference`][Name::CharacterReference]
53//! * [`CharacterReferenceMarker`][Name::CharacterReferenceMarker]
54//! * [`CharacterReferenceMarkerHexadecimal`][Name::CharacterReferenceMarkerHexadecimal]
55//! * [`CharacterReferenceMarkerNumeric`][Name::CharacterReferenceMarkerNumeric]
56//! * [`CharacterReferenceMarkerSemi`][Name::CharacterReferenceMarkerSemi]
57//! * [`CharacterReferenceValue`][Name::CharacterReferenceValue]
58//!
59//! ## References
60//!
61//! * [`character-reference.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/character-reference.js)
62//! * [*§ 2.5 Entity and numeric character references* in `CommonMark`](https://spec.commonmark.org/0.31/#entity-and-numeric-character-references)
63//!
64//! [string]: crate::construct::string
65//! [text]: crate::construct::text
//! [character_escape]: crate::construct::character_escape
67//! [decode_numeric]: crate::util::character_reference::decode_numeric
68//! [character_references]: crate::util::constant::CHARACTER_REFERENCES
69//! [html]: https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
70
71use crate::event::Name;
72use crate::state::{Name as StateName, State};
73use crate::tokenizer::Tokenizer;
74use crate::util::{
75 character_reference::{decode_named, value_max, value_test},
76 slice::Slice,
77};
78
79/// Start of character reference.
80///
81/// ```markdown
82/// > | a&b
83/// ^
84/// > | a{b
85/// ^
86/// > | a	b
87/// ^
88/// ```
89pub fn start(tokenizer: &mut Tokenizer) -> State {
90 if tokenizer.parse_state.options.constructs.character_reference
91 && tokenizer.current == Some(b'&')
92 {
93 tokenizer.enter(Name::CharacterReference);
94 tokenizer.enter(Name::CharacterReferenceMarker);
95 tokenizer.consume();
96 tokenizer.exit(Name::CharacterReferenceMarker);
97 State::Next(StateName::CharacterReferenceOpen)
98 } else {
99 State::Nok
100 }
101}
102
103/// After `&`, at `#` for numeric references or alphanumeric for named
104/// references.
105///
106/// ```markdown
107/// > | a&b
108/// ^
109/// > | a{b
110/// ^
111/// > | a	b
112/// ^
113/// ```
114pub fn open(tokenizer: &mut Tokenizer) -> State {
115 if let Some(b'#') = tokenizer.current {
116 tokenizer.enter(Name::CharacterReferenceMarkerNumeric);
117 tokenizer.consume();
118 tokenizer.exit(Name::CharacterReferenceMarkerNumeric);
119 State::Next(StateName::CharacterReferenceNumeric)
120 } else {
121 tokenizer.tokenize_state.marker = b'&';
122 tokenizer.enter(Name::CharacterReferenceValue);
123 State::Retry(StateName::CharacterReferenceValue)
124 }
125}
126
127/// After `#`, at `x` for hexadecimals or digit for decimals.
128///
129/// ```markdown
130/// > | a{b
131/// ^
132/// > | a	b
133/// ^
134/// ```
135pub fn numeric(tokenizer: &mut Tokenizer) -> State {
136 if let Some(b'x' | b'X') = tokenizer.current {
137 tokenizer.enter(Name::CharacterReferenceMarkerHexadecimal);
138 tokenizer.consume();
139 tokenizer.exit(Name::CharacterReferenceMarkerHexadecimal);
140 tokenizer.enter(Name::CharacterReferenceValue);
141 tokenizer.tokenize_state.marker = b'x';
142 State::Next(StateName::CharacterReferenceValue)
143 } else {
144 tokenizer.enter(Name::CharacterReferenceValue);
145 tokenizer.tokenize_state.marker = b'#';
146 State::Retry(StateName::CharacterReferenceValue)
147 }
148}
149
150/// After markers (`&#x`, `&#`, or `&`), in value, before `;`.
151///
152/// The character reference kind defines what and how many characters are
153/// allowed.
154///
155/// ```markdown
156/// > | a&b
157/// ^^^
158/// > | a{b
159/// ^^^
160/// > | a	b
161/// ^
162/// ```
163pub fn value(tokenizer: &mut Tokenizer) -> State {
164 if matches!(tokenizer.current, Some(b';')) && tokenizer.tokenize_state.size > 0 {
165 // Named.
166 if tokenizer.tokenize_state.marker == b'&' {
167 // Guaranteed to be valid ASCII bytes.
168 let slice = Slice::from_indices(
169 tokenizer.parse_state.bytes,
170 tokenizer.point.index - tokenizer.tokenize_state.size,
171 tokenizer.point.index,
172 );
173
174 if decode_named(slice.as_str(), true).is_none() {
175 tokenizer.tokenize_state.marker = 0;
176 tokenizer.tokenize_state.size = 0;
177 return State::Nok;
178 }
179 }
180
181 tokenizer.exit(Name::CharacterReferenceValue);
182 tokenizer.enter(Name::CharacterReferenceMarkerSemi);
183 tokenizer.consume();
184 tokenizer.exit(Name::CharacterReferenceMarkerSemi);
185 tokenizer.exit(Name::CharacterReference);
186 tokenizer.tokenize_state.marker = 0;
187 tokenizer.tokenize_state.size = 0;
188 return State::Ok;
189 }
190
191 if let Some(byte) = tokenizer.current {
192 if tokenizer.tokenize_state.size < value_max(tokenizer.tokenize_state.marker)
193 && value_test(tokenizer.tokenize_state.marker)(&byte)
194 {
195 tokenizer.tokenize_state.size += 1;
196 tokenizer.consume();
197 return State::Next(StateName::CharacterReferenceValue);
198 }
199 }
200
201 tokenizer.tokenize_state.marker = 0;
202 tokenizer.tokenize_state.size = 0;
203 State::Nok
204}