Markdown parser fork with extended syntax for personal use.
1//! HTML (text) occurs in the [text][] content type.
2//!
3//! ## Grammar
4//!
5//! HTML (text) forms with the following BNF
6//! (<small>see [construct][crate::construct] for character groups</small>):
7//!
8//! ```bnf
9//! html_text ::= comment | instruction | declaration | cdata | tag_close | tag_open
10//!
//! ; Note: `<!-->` and `<!--->` are also complete comments, and the text may contain `--`.
12//! comment ::= '<!--' *byte '-->'
13//! instruction ::= '<?' *byte '?>'
14//! declaration ::= '<!' ascii_alphabetic *byte '>'
15//! ; Restriction: the text is not allowed to contain `]]`.
16//! cdata ::= '<![CDATA[' *byte ']]>'
17//! tag_close ::= '</' tag_name [space_or_tab_eol] '>'
//! tag_open ::= '<' tag_name *(space_or_tab_eol attribute) [[space_or_tab_eol] '/'] [space_or_tab_eol] '>'
19//!
20//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric )
21//! attribute ::= attribute_name [[space_or_tab_eol] '=' [space_or_tab_eol] attribute_value]
22//! attribute_name ::= (':' | '_' | ascii_alphabetic) *('-' | '.' | ':' | '_' | ascii_alphanumeric)
23//! attribute_value ::= '"' *(byte - '"') '"' | "'" *(byte - "'") "'" | 1*(text - '"' - "'" - '/' - '<' - '=' - '>' - '`')
24//! ```
25//!
26//! The grammar for HTML in markdown does not follow the rules of parsing
27//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML
28//! spec][html_parsing].
29//! See the related flow construct [HTML (flow)][html_flow] for more info.
30//!
31//! Because the **tag open** and **tag close** productions in the grammar form
32//! with just tags instead of complete elements, it is possible to interleave
33//! (a word for switching between languages) markdown and HTML together.
34//! For example:
35//!
36//! ```markdown
37//! This is equivalent to <code>*emphasised* code</code>.
38//! ```
39//!
40//! ## Tokens
41//!
42//! * [`HtmlText`][Name::HtmlText]
43//! * [`HtmlTextData`][Name::HtmlTextData]
44//!
45//! ## References
46//!
47//! * [`html-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-text.js)
48//! * [*§ 6.6 Raw HTML* in `CommonMark`](https://spec.commonmark.org/0.31/#raw-html)
49//!
50//! [text]: crate::construct::text
51//! [html_flow]: crate::construct::html_flow
52//! [html_parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
53
54use crate::construct::partial_space_or_tab::space_or_tab;
55use crate::event::Name;
56use crate::state::{Name as StateName, State};
57use crate::tokenizer::Tokenizer;
58use crate::util::constant::HTML_CDATA_PREFIX;
59
60/// Start of HTML (text).
61///
62/// ```markdown
63/// > | a <b> c
64/// ^
65/// ```
66pub fn start(tokenizer: &mut Tokenizer) -> State {
67 if Some(b'<') == tokenizer.current && tokenizer.parse_state.options.constructs.html_text {
68 tokenizer.enter(Name::HtmlText);
69 tokenizer.enter(Name::HtmlTextData);
70 tokenizer.consume();
71 State::Next(StateName::HtmlTextOpen)
72 } else {
73 State::Nok
74 }
75}
76
77/// After `<`, at tag name or other stuff.
78///
79/// ```markdown
80/// > | a <b> c
81/// ^
82/// > | a <!doctype> c
83/// ^
84/// > | a <!--b--> c
85/// ^
86/// ```
87pub fn open(tokenizer: &mut Tokenizer) -> State {
88 match tokenizer.current {
89 Some(b'!') => {
90 tokenizer.consume();
91 State::Next(StateName::HtmlTextDeclarationOpen)
92 }
93 Some(b'/') => {
94 tokenizer.consume();
95 State::Next(StateName::HtmlTextTagCloseStart)
96 }
97 Some(b'?') => {
98 tokenizer.consume();
99 State::Next(StateName::HtmlTextInstruction)
100 }
101 // ASCII alphabetical.
102 Some(b'A'..=b'Z' | b'a'..=b'z') => {
103 tokenizer.consume();
104 State::Next(StateName::HtmlTextTagOpen)
105 }
106 _ => State::Nok,
107 }
108}
109
110/// After `<!`, at declaration, comment, or CDATA.
111///
112/// ```markdown
113/// > | a <!doctype> c
114/// ^
115/// > | a <!--b--> c
116/// ^
117/// > | a <![CDATA[>&<]]> c
118/// ^
119/// ```
120pub fn declaration_open(tokenizer: &mut Tokenizer) -> State {
121 match tokenizer.current {
122 Some(b'-') => {
123 tokenizer.consume();
124 State::Next(StateName::HtmlTextCommentOpenInside)
125 }
126 // ASCII alphabetical.
127 Some(b'A'..=b'Z' | b'a'..=b'z') => {
128 tokenizer.consume();
129 State::Next(StateName::HtmlTextDeclaration)
130 }
131 Some(b'[') => {
132 tokenizer.consume();
133 State::Next(StateName::HtmlTextCdataOpenInside)
134 }
135 _ => State::Nok,
136 }
137}
138
139/// In a comment, after `<!-`, at another `-`.
140///
141/// ```markdown
142/// > | a <!--b--> c
143/// ^
144/// ```
145pub fn comment_open_inside(tokenizer: &mut Tokenizer) -> State {
146 match tokenizer.current {
147 Some(b'-') => {
148 tokenizer.consume();
149 State::Next(StateName::HtmlTextCommentEnd)
150 }
151 _ => State::Nok,
152 }
153}
154
155/// In comment.
156///
157/// ```markdown
158/// > | a <!--b--> c
159/// ^
160/// ```
161pub fn comment(tokenizer: &mut Tokenizer) -> State {
162 match tokenizer.current {
163 None => State::Nok,
164 Some(b'\n') => {
165 tokenizer.attempt(State::Next(StateName::HtmlTextComment), State::Nok);
166 State::Retry(StateName::HtmlTextLineEndingBefore)
167 }
168 Some(b'-') => {
169 tokenizer.consume();
170 State::Next(StateName::HtmlTextCommentClose)
171 }
172 _ => {
173 tokenizer.consume();
174 State::Next(StateName::HtmlTextComment)
175 }
176 }
177}
178
179/// In comment, after `-`.
180///
181/// ```markdown
182/// > | a <!--b--> c
183/// ^
184/// ```
185pub fn comment_close(tokenizer: &mut Tokenizer) -> State {
186 match tokenizer.current {
187 Some(b'-') => {
188 tokenizer.consume();
189 State::Next(StateName::HtmlTextCommentEnd)
190 }
191 _ => State::Retry(StateName::HtmlTextComment),
192 }
193}
194/// In comment, after `-`.
195///
196/// ```markdown
197/// > | a <!--b--> c
198/// ^
199/// ```
200pub fn comment_end(tokenizer: &mut Tokenizer) -> State {
201 match tokenizer.current {
202 Some(b'>') => State::Retry(StateName::HtmlTextEnd),
203 Some(b'-') => State::Retry(StateName::HtmlTextCommentClose),
204 _ => State::Retry(StateName::HtmlTextComment),
205 }
206}
207
208/// After `<![`, in CDATA, expecting `CDATA[`.
209///
210/// ```markdown
211/// > | a <![CDATA[>&<]]> b
212/// ^^^^^^
213/// ```
214pub fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State {
215 if tokenizer.current == Some(HTML_CDATA_PREFIX[tokenizer.tokenize_state.size]) {
216 tokenizer.tokenize_state.size += 1;
217 tokenizer.consume();
218
219 if tokenizer.tokenize_state.size == HTML_CDATA_PREFIX.len() {
220 tokenizer.tokenize_state.size = 0;
221 State::Next(StateName::HtmlTextCdata)
222 } else {
223 State::Next(StateName::HtmlTextCdataOpenInside)
224 }
225 } else {
226 State::Nok
227 }
228}
229
230/// In CDATA.
231///
232/// ```markdown
233/// > | a <![CDATA[>&<]]> b
234/// ^^^
235/// ```
236pub fn cdata(tokenizer: &mut Tokenizer) -> State {
237 match tokenizer.current {
238 None => State::Nok,
239 Some(b'\n') => {
240 tokenizer.attempt(State::Next(StateName::HtmlTextCdata), State::Nok);
241 State::Retry(StateName::HtmlTextLineEndingBefore)
242 }
243 Some(b']') => {
244 tokenizer.consume();
245 State::Next(StateName::HtmlTextCdataClose)
246 }
247 _ => {
248 tokenizer.consume();
249 State::Next(StateName::HtmlTextCdata)
250 }
251 }
252}
253
254/// In CDATA, after `]`, at another `]`.
255///
256/// ```markdown
257/// > | a <![CDATA[>&<]]> b
258/// ^
259/// ```
260pub fn cdata_close(tokenizer: &mut Tokenizer) -> State {
261 match tokenizer.current {
262 Some(b']') => {
263 tokenizer.consume();
264 State::Next(StateName::HtmlTextCdataEnd)
265 }
266 _ => State::Retry(StateName::HtmlTextCdata),
267 }
268}
269
270/// In CDATA, after `]]`, at `>`.
271///
272/// ```markdown
273/// > | a <![CDATA[>&<]]> b
274/// ^
275/// ```
276pub fn cdata_end(tokenizer: &mut Tokenizer) -> State {
277 match tokenizer.current {
278 Some(b'>') => State::Retry(StateName::HtmlTextEnd),
279 Some(b']') => State::Retry(StateName::HtmlTextCdataClose),
280 _ => State::Retry(StateName::HtmlTextCdata),
281 }
282}
283
284/// In declaration.
285///
286/// ```markdown
287/// > | a <!b> c
288/// ^
289/// ```
290pub fn declaration(tokenizer: &mut Tokenizer) -> State {
291 match tokenizer.current {
292 None | Some(b'>') => State::Retry(StateName::HtmlTextEnd),
293 Some(b'\n') => {
294 tokenizer.attempt(State::Next(StateName::HtmlTextDeclaration), State::Nok);
295 State::Retry(StateName::HtmlTextLineEndingBefore)
296 }
297 _ => {
298 tokenizer.consume();
299 State::Next(StateName::HtmlTextDeclaration)
300 }
301 }
302}
303
304/// In instruction.
305///
306/// ```markdown
307/// > | a <?b?> c
308/// ^
309/// ```
310pub fn instruction(tokenizer: &mut Tokenizer) -> State {
311 match tokenizer.current {
312 None => State::Nok,
313 Some(b'\n') => {
314 tokenizer.attempt(State::Next(StateName::HtmlTextInstruction), State::Nok);
315 State::Retry(StateName::HtmlTextLineEndingBefore)
316 }
317 Some(b'?') => {
318 tokenizer.consume();
319 State::Next(StateName::HtmlTextInstructionClose)
320 }
321 _ => {
322 tokenizer.consume();
323 State::Next(StateName::HtmlTextInstruction)
324 }
325 }
326}
327
328/// In instruction, after `?`, at `>`.
329///
330/// ```markdown
331/// > | a <?b?> c
332/// ^
333/// ```
334pub fn instruction_close(tokenizer: &mut Tokenizer) -> State {
335 match tokenizer.current {
336 Some(b'>') => State::Retry(StateName::HtmlTextEnd),
337 _ => State::Retry(StateName::HtmlTextInstruction),
338 }
339}
340
341/// After `</`, in closing tag, at tag name.
342///
343/// ```markdown
344/// > | a </b> c
345/// ^
346/// ```
347pub fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
348 match tokenizer.current {
349 // ASCII alphabetical.
350 Some(b'A'..=b'Z' | b'a'..=b'z') => {
351 tokenizer.consume();
352 State::Next(StateName::HtmlTextTagClose)
353 }
354 _ => State::Nok,
355 }
356}
357
358/// After `</x`, in a tag name.
359///
360/// ```markdown
361/// > | a </b> c
362/// ^
363/// ```
364pub fn tag_close(tokenizer: &mut Tokenizer) -> State {
365 match tokenizer.current {
366 // ASCII alphanumerical and `-`.
367 Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
368 tokenizer.consume();
369 State::Next(StateName::HtmlTextTagClose)
370 }
371 _ => State::Retry(StateName::HtmlTextTagCloseBetween),
372 }
373}
374
375/// In closing tag, after tag name.
376///
377/// ```markdown
378/// > | a </b> c
379/// ^
380/// ```
381pub fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
382 match tokenizer.current {
383 Some(b'\n') => {
384 tokenizer.attempt(State::Next(StateName::HtmlTextTagCloseBetween), State::Nok);
385 State::Retry(StateName::HtmlTextLineEndingBefore)
386 }
387 Some(b'\t' | b' ') => {
388 tokenizer.consume();
389 State::Next(StateName::HtmlTextTagCloseBetween)
390 }
391 _ => State::Retry(StateName::HtmlTextEnd),
392 }
393}
394
395/// After `<x`, in opening tag name.
396///
397/// ```markdown
398/// > | a <b> c
399/// ^
400/// ```
401pub fn tag_open(tokenizer: &mut Tokenizer) -> State {
402 match tokenizer.current {
403 // ASCII alphanumerical and `-`.
404 Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
405 tokenizer.consume();
406 State::Next(StateName::HtmlTextTagOpen)
407 }
408 Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween),
409 _ => State::Nok,
410 }
411}
412
413/// In opening tag, after tag name.
414///
415/// ```markdown
416/// > | a <b> c
417/// ^
418/// ```
419pub fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
420 match tokenizer.current {
421 Some(b'\n') => {
422 tokenizer.attempt(State::Next(StateName::HtmlTextTagOpenBetween), State::Nok);
423 State::Retry(StateName::HtmlTextLineEndingBefore)
424 }
425 Some(b'\t' | b' ') => {
426 tokenizer.consume();
427 State::Next(StateName::HtmlTextTagOpenBetween)
428 }
429 Some(b'/') => {
430 tokenizer.consume();
431 State::Next(StateName::HtmlTextEnd)
432 }
433 // ASCII alphabetical and `:` and `_`.
434 Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
435 tokenizer.consume();
436 State::Next(StateName::HtmlTextTagOpenAttributeName)
437 }
438 _ => State::Retry(StateName::HtmlTextEnd),
439 }
440}
441
442/// In attribute name.
443///
444/// ```markdown
445/// > | a <b c> d
446/// ^
447/// ```
448pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
449 match tokenizer.current {
450 // ASCII alphabetical and `-`, `.`, `:`, and `_`.
451 Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
452 tokenizer.consume();
453 State::Next(StateName::HtmlTextTagOpenAttributeName)
454 }
455 _ => State::Retry(StateName::HtmlTextTagOpenAttributeNameAfter),
456 }
457}
458
459/// After attribute name, before initializer, the end of the tag, or
460/// whitespace.
461///
462/// ```markdown
463/// > | a <b c> d
464/// ^
465/// ```
466pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
467 match tokenizer.current {
468 Some(b'\n') => {
469 tokenizer.attempt(
470 State::Next(StateName::HtmlTextTagOpenAttributeNameAfter),
471 State::Nok,
472 );
473 State::Retry(StateName::HtmlTextLineEndingBefore)
474 }
475 Some(b'\t' | b' ') => {
476 tokenizer.consume();
477 State::Next(StateName::HtmlTextTagOpenAttributeNameAfter)
478 }
479 Some(b'=') => {
480 tokenizer.consume();
481 State::Next(StateName::HtmlTextTagOpenAttributeValueBefore)
482 }
483 _ => State::Retry(StateName::HtmlTextTagOpenBetween),
484 }
485}
486
487/// Before unquoted, double quoted, or single quoted attribute value, allowing
488/// whitespace.
489///
490/// ```markdown
491/// > | a <b c=d> e
492/// ^
493/// ```
494pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
495 match tokenizer.current {
496 None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok,
497 Some(b'\n') => {
498 tokenizer.attempt(
499 State::Next(StateName::HtmlTextTagOpenAttributeValueBefore),
500 State::Nok,
501 );
502 State::Retry(StateName::HtmlTextLineEndingBefore)
503 }
504 Some(b'\t' | b' ') => {
505 tokenizer.consume();
506 State::Next(StateName::HtmlTextTagOpenAttributeValueBefore)
507 }
508 Some(b'"' | b'\'') => {
509 tokenizer.tokenize_state.marker = tokenizer.current.unwrap();
510 tokenizer.consume();
511 State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted)
512 }
513 Some(_) => {
514 tokenizer.consume();
515 State::Next(StateName::HtmlTextTagOpenAttributeValueUnquoted)
516 }
517 }
518}
519
520/// In double or single quoted attribute value.
521///
522/// ```markdown
523/// > | a <b c="d"> e
524/// ^
525/// ```
526pub fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
527 if tokenizer.current == Some(tokenizer.tokenize_state.marker) {
528 tokenizer.tokenize_state.marker = 0;
529 tokenizer.consume();
530 State::Next(StateName::HtmlTextTagOpenAttributeValueQuotedAfter)
531 } else {
532 match tokenizer.current {
533 None => {
534 tokenizer.tokenize_state.marker = 0;
535 State::Nok
536 }
537 Some(b'\n') => {
538 tokenizer.attempt(
539 State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted),
540 State::Nok,
541 );
542 State::Retry(StateName::HtmlTextLineEndingBefore)
543 }
544 _ => {
545 tokenizer.consume();
546 State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted)
547 }
548 }
549 }
550}
551
552/// In unquoted attribute value.
553///
554/// ```markdown
555/// > | a <b c=d> e
556/// ^
557/// ```
558pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
559 match tokenizer.current {
560 None | Some(b'"' | b'\'' | b'<' | b'=' | b'`') => State::Nok,
561 Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween),
562 Some(_) => {
563 tokenizer.consume();
564 State::Next(StateName::HtmlTextTagOpenAttributeValueUnquoted)
565 }
566 }
567}
568
569/// After double or single quoted attribute value, before whitespace or the end
570/// of the tag.
571///
572/// ```markdown
573/// > | a <b c="d"> e
574/// ^
575/// ```
576pub fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
577 match tokenizer.current {
578 Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween),
579 _ => State::Nok,
580 }
581}
582
583/// In certain circumstances of a tag where only an `>` is allowed.
584///
585/// ```markdown
586/// > | a <b c="d"> e
587/// ^
588/// ```
589pub fn end(tokenizer: &mut Tokenizer) -> State {
590 match tokenizer.current {
591 Some(b'>') => {
592 tokenizer.consume();
593 tokenizer.exit(Name::HtmlTextData);
594 tokenizer.exit(Name::HtmlText);
595 State::Ok
596 }
597 _ => State::Nok,
598 }
599}
600
601/// At eol.
602///
603/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about
604/// > empty tokens.
605///
606/// ```markdown
607/// > | a <!--a
608/// ^
609/// | b-->
610/// ```
611pub fn line_ending_before(tokenizer: &mut Tokenizer) -> State {
612 match tokenizer.current {
613 Some(b'\n') => {
614 tokenizer.exit(Name::HtmlTextData);
615 tokenizer.enter(Name::LineEnding);
616 tokenizer.consume();
617 tokenizer.exit(Name::LineEnding);
618 State::Next(StateName::HtmlTextLineEndingAfter)
619 }
620 _ => unreachable!("expected eol"),
621 }
622}
623
624/// After eol, at optional whitespace.
625///
626/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about
627/// > empty tokens.
628///
629/// ```markdown
630/// | a <!--a
631/// > | b-->
632/// ^
633/// ```
634pub fn line_ending_after(tokenizer: &mut Tokenizer) -> State {
635 if matches!(tokenizer.current, Some(b'\t' | b' ')) {
636 tokenizer.attempt(
637 State::Next(StateName::HtmlTextLineEndingAfterPrefix),
638 State::Nok,
639 );
640 State::Retry(space_or_tab(tokenizer))
641 } else {
642 State::Retry(StateName::HtmlTextLineEndingAfterPrefix)
643 }
644}
645
646/// After eol, after optional whitespace.
647///
648/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about
649/// > empty tokens.
650///
651/// ```markdown
652/// | a <!--a
653/// > | b-->
654/// ^
655/// ```
656pub fn line_ending_after_prefix(tokenizer: &mut Tokenizer) -> State {
657 tokenizer.enter(Name::HtmlTextData);
658 State::Ok
659}