src/construct/html_text.rs at hack · crashkeys.dev/markdown-rs

crashkeys.dev / markdown-rs
fork atom
Markdown parser fork with extended syntax for personal use.
fork atom
markdown-rs / src / construct / html_text.rs
at hack 659 lines 19 kB view raw
wrap content
Titus Wormer Refactor docs 11mo ago
e0ca3f6c
  1//! HTML (text) occurs in the [text][] content type.
  2//!
  3//! ## Grammar
  4//!
  5//! HTML (text) forms with the following BNF
  6//! (<small>see [construct][crate::construct] for character groups</small>):
  7//!
  8//! ```bnf
  9//! html_text ::= comment | instruction | declaration | cdata | tag_close | tag_open
 10//!
  11//! ; Note: `<!-->` and `<!--->` are also comments; otherwise a comment ends at the first `-->`.
 12//! comment ::= '<!--' *byte '-->'
 13//! instruction ::= '<?' *byte '?>'
 14//! declaration ::= '<!' ascii_alphabetic *byte '>'
 15//! ; Restriction: the text is not allowed to contain `]]`.
 16//! cdata ::= '<![CDATA[' *byte ']]>'
 17//! tag_close ::= '</' tag_name [space_or_tab_eol] '>'
  18//! tag_open ::= '<' tag_name *(space_or_tab_eol attribute) [[space_or_tab_eol] '/'] [space_or_tab_eol] '>'
 19//!
 20//! tag_name ::= ascii_alphabetic *( '-' | ascii_alphanumeric )
 21//! attribute ::= attribute_name [[space_or_tab_eol] '=' [space_or_tab_eol] attribute_value]
 22//! attribute_name ::= (':' | '_' | ascii_alphabetic) *('-' | '.' | ':' | '_' | ascii_alphanumeric)
 23//! attribute_value ::= '"' *(byte - '"') '"' | "'" *(byte - "'")  "'" | 1*(text - '"' - "'" - '/' - '<' - '=' - '>' - '`')
 24//! ```
 25//!
 26//! The grammar for HTML in markdown does not follow the rules of parsing
 27//! HTML according to the [*§ 13.2 Parsing HTML documents* in the HTML
 28//! spec][html_parsing].
 29//! See the related flow construct [HTML (flow)][html_flow] for more info.
 30//!
 31//! Because the **tag open** and **tag close** productions in the grammar form
 32//! with just tags instead of complete elements, it is possible to interleave
 33//! (a word for switching between languages) markdown and HTML together.
 34//! For example:
 35//!
 36//! ```markdown
 37//! This is equivalent to <code>*emphasised* code</code>.
 38//! ```
 39//!
 40//! ## Tokens
 41//!
 42//! * [`HtmlText`][Name::HtmlText]
 43//! * [`HtmlTextData`][Name::HtmlTextData]
 44//!
 45//! ## References
 46//!
 47//! * [`html-text.js` in `micromark`](https://github.com/micromark/micromark/blob/main/packages/micromark-core-commonmark/dev/lib/html-text.js)
 48//! * [*§ 6.6 Raw HTML* in `CommonMark`](https://spec.commonmark.org/0.31/#raw-html)
 49//!
 50//! [text]: crate::construct::text
 51//! [html_flow]: crate::construct::html_flow
 52//! [html_parsing]: https://html.spec.whatwg.org/multipage/parsing.html#parsing
 53
 54use crate::construct::partial_space_or_tab::space_or_tab;
 55use crate::event::Name;
 56use crate::state::{Name as StateName, State};
 57use crate::tokenizer::Tokenizer;
 58use crate::util::constant::HTML_CDATA_PREFIX;
 59
 60/// Start of HTML (text).
 61///
 62/// ```markdown
 63/// > | a <b> c
 64///       ^
 65/// ```
 66pub fn start(tokenizer: &mut Tokenizer) -> State {
 67    if Some(b'<') == tokenizer.current && tokenizer.parse_state.options.constructs.html_text {
 68        tokenizer.enter(Name::HtmlText);
 69        tokenizer.enter(Name::HtmlTextData);
 70        tokenizer.consume();
 71        State::Next(StateName::HtmlTextOpen)
 72    } else {
 73        State::Nok
 74    }
 75}
 76
 77/// After `<`, at tag name or other stuff.
 78///
 79/// ```markdown
 80/// > | a <b> c
 81///        ^
 82/// > | a <!doctype> c
 83///        ^
 84/// > | a <!--b--> c
 85///        ^
 86/// ```
 87pub fn open(tokenizer: &mut Tokenizer) -> State {
 88    match tokenizer.current {
 89        Some(b'!') => {
 90            tokenizer.consume();
 91            State::Next(StateName::HtmlTextDeclarationOpen)
 92        }
 93        Some(b'/') => {
 94            tokenizer.consume();
 95            State::Next(StateName::HtmlTextTagCloseStart)
 96        }
 97        Some(b'?') => {
 98            tokenizer.consume();
 99            State::Next(StateName::HtmlTextInstruction)
100        }
101        // ASCII alphabetical.
102        Some(b'A'..=b'Z' | b'a'..=b'z') => {
103            tokenizer.consume();
104            State::Next(StateName::HtmlTextTagOpen)
105        }
106        _ => State::Nok,
107    }
108}
109
110/// After `<!`, at declaration, comment, or CDATA.
111///
112/// ```markdown
113/// > | a <!doctype> c
114///         ^
115/// > | a <!--b--> c
116///         ^
117/// > | a <![CDATA[>&<]]> c
118///         ^
119/// ```
120pub fn declaration_open(tokenizer: &mut Tokenizer) -> State {
121    match tokenizer.current {
122        Some(b'-') => {
123            tokenizer.consume();
124            State::Next(StateName::HtmlTextCommentOpenInside)
125        }
126        // ASCII alphabetical.
127        Some(b'A'..=b'Z' | b'a'..=b'z') => {
128            tokenizer.consume();
129            State::Next(StateName::HtmlTextDeclaration)
130        }
131        Some(b'[') => {
132            tokenizer.consume();
133            State::Next(StateName::HtmlTextCdataOpenInside)
134        }
135        _ => State::Nok,
136    }
137}
138
139/// In a comment, after `<!-`, at another `-`.
140///
141/// ```markdown
142/// > | a <!--b--> c
143///          ^
144/// ```
145pub fn comment_open_inside(tokenizer: &mut Tokenizer) -> State {
146    match tokenizer.current {
147        Some(b'-') => {
148            tokenizer.consume();
149            State::Next(StateName::HtmlTextCommentEnd)
150        }
151        _ => State::Nok,
152    }
153}
154
155/// In comment.
156///
157/// ```markdown
158/// > | a <!--b--> c
159///           ^
160/// ```
161pub fn comment(tokenizer: &mut Tokenizer) -> State {
162    match tokenizer.current {
163        None => State::Nok,
164        Some(b'\n') => {
165            tokenizer.attempt(State::Next(StateName::HtmlTextComment), State::Nok);
166            State::Retry(StateName::HtmlTextLineEndingBefore)
167        }
168        Some(b'-') => {
169            tokenizer.consume();
170            State::Next(StateName::HtmlTextCommentClose)
171        }
172        _ => {
173            tokenizer.consume();
174            State::Next(StateName::HtmlTextComment)
175        }
176    }
177}
178
179/// In comment, after `-`.
180///
181/// ```markdown
182/// > | a <!--b--> c
183///             ^
184/// ```
185pub fn comment_close(tokenizer: &mut Tokenizer) -> State {
186    match tokenizer.current {
187        Some(b'-') => {
188            tokenizer.consume();
189            State::Next(StateName::HtmlTextCommentEnd)
190        }
191        _ => State::Retry(StateName::HtmlTextComment),
192    }
193}
194/// In comment, after `-`.
195///
196/// ```markdown
197/// > | a <!--b--> c
198///             ^
199/// ```
200pub fn comment_end(tokenizer: &mut Tokenizer) -> State {
201    match tokenizer.current {
202        Some(b'>') => State::Retry(StateName::HtmlTextEnd),
203        Some(b'-') => State::Retry(StateName::HtmlTextCommentClose),
204        _ => State::Retry(StateName::HtmlTextComment),
205    }
206}
207
208/// After `<![`, in CDATA, expecting `CDATA[`.
209///
210/// ```markdown
211/// > | a <![CDATA[>&<]]> b
212///          ^^^^^^
213/// ```
214pub fn cdata_open_inside(tokenizer: &mut Tokenizer) -> State {
215    if tokenizer.current == Some(HTML_CDATA_PREFIX[tokenizer.tokenize_state.size]) {
216        tokenizer.tokenize_state.size += 1;
217        tokenizer.consume();
218
219        if tokenizer.tokenize_state.size == HTML_CDATA_PREFIX.len() {
220            tokenizer.tokenize_state.size = 0;
221            State::Next(StateName::HtmlTextCdata)
222        } else {
223            State::Next(StateName::HtmlTextCdataOpenInside)
224        }
225    } else {
226        State::Nok
227    }
228}
229
230/// In CDATA.
231///
232/// ```markdown
233/// > | a <![CDATA[>&<]]> b
234///                ^^^
235/// ```
236pub fn cdata(tokenizer: &mut Tokenizer) -> State {
237    match tokenizer.current {
238        None => State::Nok,
239        Some(b'\n') => {
240            tokenizer.attempt(State::Next(StateName::HtmlTextCdata), State::Nok);
241            State::Retry(StateName::HtmlTextLineEndingBefore)
242        }
243        Some(b']') => {
244            tokenizer.consume();
245            State::Next(StateName::HtmlTextCdataClose)
246        }
247        _ => {
248            tokenizer.consume();
249            State::Next(StateName::HtmlTextCdata)
250        }
251    }
252}
253
254/// In CDATA, after `]`, at another `]`.
255///
256/// ```markdown
257/// > | a <![CDATA[>&<]]> b
258///                    ^
259/// ```
260pub fn cdata_close(tokenizer: &mut Tokenizer) -> State {
261    match tokenizer.current {
262        Some(b']') => {
263            tokenizer.consume();
264            State::Next(StateName::HtmlTextCdataEnd)
265        }
266        _ => State::Retry(StateName::HtmlTextCdata),
267    }
268}
269
270/// In CDATA, after `]]`, at `>`.
271///
272/// ```markdown
273/// > | a <![CDATA[>&<]]> b
274///                     ^
275/// ```
276pub fn cdata_end(tokenizer: &mut Tokenizer) -> State {
277    match tokenizer.current {
278        Some(b'>') => State::Retry(StateName::HtmlTextEnd),
279        Some(b']') => State::Retry(StateName::HtmlTextCdataClose),
280        _ => State::Retry(StateName::HtmlTextCdata),
281    }
282}
283
284/// In declaration.
285///
286/// ```markdown
287/// > | a <!b> c
288///          ^
289/// ```
290pub fn declaration(tokenizer: &mut Tokenizer) -> State {
291    match tokenizer.current {
292        None | Some(b'>') => State::Retry(StateName::HtmlTextEnd),
293        Some(b'\n') => {
294            tokenizer.attempt(State::Next(StateName::HtmlTextDeclaration), State::Nok);
295            State::Retry(StateName::HtmlTextLineEndingBefore)
296        }
297        _ => {
298            tokenizer.consume();
299            State::Next(StateName::HtmlTextDeclaration)
300        }
301    }
302}
303
304/// In instruction.
305///
306/// ```markdown
307/// > | a <?b?> c
308///         ^
309/// ```
310pub fn instruction(tokenizer: &mut Tokenizer) -> State {
311    match tokenizer.current {
312        None => State::Nok,
313        Some(b'\n') => {
314            tokenizer.attempt(State::Next(StateName::HtmlTextInstruction), State::Nok);
315            State::Retry(StateName::HtmlTextLineEndingBefore)
316        }
317        Some(b'?') => {
318            tokenizer.consume();
319            State::Next(StateName::HtmlTextInstructionClose)
320        }
321        _ => {
322            tokenizer.consume();
323            State::Next(StateName::HtmlTextInstruction)
324        }
325    }
326}
327
328/// In instruction, after `?`, at `>`.
329///
330/// ```markdown
331/// > | a <?b?> c
332///           ^
333/// ```
334pub fn instruction_close(tokenizer: &mut Tokenizer) -> State {
335    match tokenizer.current {
336        Some(b'>') => State::Retry(StateName::HtmlTextEnd),
337        _ => State::Retry(StateName::HtmlTextInstruction),
338    }
339}
340
341/// After `</`, in closing tag, at tag name.
342///
343/// ```markdown
344/// > | a </b> c
345///         ^
346/// ```
347pub fn tag_close_start(tokenizer: &mut Tokenizer) -> State {
348    match tokenizer.current {
349        // ASCII alphabetical.
350        Some(b'A'..=b'Z' | b'a'..=b'z') => {
351            tokenizer.consume();
352            State::Next(StateName::HtmlTextTagClose)
353        }
354        _ => State::Nok,
355    }
356}
357
358/// After `</x`, in a tag name.
359///
360/// ```markdown
361/// > | a </b> c
362///          ^
363/// ```
364pub fn tag_close(tokenizer: &mut Tokenizer) -> State {
365    match tokenizer.current {
366        // ASCII alphanumerical and `-`.
367        Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
368            tokenizer.consume();
369            State::Next(StateName::HtmlTextTagClose)
370        }
371        _ => State::Retry(StateName::HtmlTextTagCloseBetween),
372    }
373}
374
375/// In closing tag, after tag name.
376///
377/// ```markdown
378/// > | a </b> c
379///          ^
380/// ```
381pub fn tag_close_between(tokenizer: &mut Tokenizer) -> State {
382    match tokenizer.current {
383        Some(b'\n') => {
384            tokenizer.attempt(State::Next(StateName::HtmlTextTagCloseBetween), State::Nok);
385            State::Retry(StateName::HtmlTextLineEndingBefore)
386        }
387        Some(b'\t' | b' ') => {
388            tokenizer.consume();
389            State::Next(StateName::HtmlTextTagCloseBetween)
390        }
391        _ => State::Retry(StateName::HtmlTextEnd),
392    }
393}
394
395/// After `<x`, in opening tag name.
396///
397/// ```markdown
398/// > | a <b> c
399///         ^
400/// ```
401pub fn tag_open(tokenizer: &mut Tokenizer) -> State {
402    match tokenizer.current {
403        // ASCII alphanumerical and `-`.
404        Some(b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z') => {
405            tokenizer.consume();
406            State::Next(StateName::HtmlTextTagOpen)
407        }
408        Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween),
409        _ => State::Nok,
410    }
411}
412
413/// In opening tag, after tag name.
414///
415/// ```markdown
416/// > | a <b> c
417///         ^
418/// ```
419pub fn tag_open_between(tokenizer: &mut Tokenizer) -> State {
420    match tokenizer.current {
421        Some(b'\n') => {
422            tokenizer.attempt(State::Next(StateName::HtmlTextTagOpenBetween), State::Nok);
423            State::Retry(StateName::HtmlTextLineEndingBefore)
424        }
425        Some(b'\t' | b' ') => {
426            tokenizer.consume();
427            State::Next(StateName::HtmlTextTagOpenBetween)
428        }
429        Some(b'/') => {
430            tokenizer.consume();
431            State::Next(StateName::HtmlTextEnd)
432        }
433        // ASCII alphabetical and `:` and `_`.
434        Some(b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
435            tokenizer.consume();
436            State::Next(StateName::HtmlTextTagOpenAttributeName)
437        }
438        _ => State::Retry(StateName::HtmlTextEnd),
439    }
440}
441
442/// In attribute name.
443///
444/// ```markdown
445/// > | a <b c> d
446///          ^
447/// ```
448pub fn tag_open_attribute_name(tokenizer: &mut Tokenizer) -> State {
449    match tokenizer.current {
450        // ASCII alphabetical and `-`, `.`, `:`, and `_`.
451        Some(b'-' | b'.' | b'0'..=b'9' | b':' | b'A'..=b'Z' | b'_' | b'a'..=b'z') => {
452            tokenizer.consume();
453            State::Next(StateName::HtmlTextTagOpenAttributeName)
454        }
455        _ => State::Retry(StateName::HtmlTextTagOpenAttributeNameAfter),
456    }
457}
458
459/// After attribute name, before initializer, the end of the tag, or
460/// whitespace.
461///
462/// ```markdown
463/// > | a <b c> d
464///           ^
465/// ```
466pub fn tag_open_attribute_name_after(tokenizer: &mut Tokenizer) -> State {
467    match tokenizer.current {
468        Some(b'\n') => {
469            tokenizer.attempt(
470                State::Next(StateName::HtmlTextTagOpenAttributeNameAfter),
471                State::Nok,
472            );
473            State::Retry(StateName::HtmlTextLineEndingBefore)
474        }
475        Some(b'\t' | b' ') => {
476            tokenizer.consume();
477            State::Next(StateName::HtmlTextTagOpenAttributeNameAfter)
478        }
479        Some(b'=') => {
480            tokenizer.consume();
481            State::Next(StateName::HtmlTextTagOpenAttributeValueBefore)
482        }
483        _ => State::Retry(StateName::HtmlTextTagOpenBetween),
484    }
485}
486
487/// Before unquoted, double quoted, or single quoted attribute value, allowing
488/// whitespace.
489///
490/// ```markdown
491/// > | a <b c=d> e
492///            ^
493/// ```
494pub fn tag_open_attribute_value_before(tokenizer: &mut Tokenizer) -> State {
495    match tokenizer.current {
496        None | Some(b'<' | b'=' | b'>' | b'`') => State::Nok,
497        Some(b'\n') => {
498            tokenizer.attempt(
499                State::Next(StateName::HtmlTextTagOpenAttributeValueBefore),
500                State::Nok,
501            );
502            State::Retry(StateName::HtmlTextLineEndingBefore)
503        }
504        Some(b'\t' | b' ') => {
505            tokenizer.consume();
506            State::Next(StateName::HtmlTextTagOpenAttributeValueBefore)
507        }
508        Some(b'"' | b'\'') => {
509            tokenizer.tokenize_state.marker = tokenizer.current.unwrap();
510            tokenizer.consume();
511            State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted)
512        }
513        Some(_) => {
514            tokenizer.consume();
515            State::Next(StateName::HtmlTextTagOpenAttributeValueUnquoted)
516        }
517    }
518}
519
520/// In double or single quoted attribute value.
521///
522/// ```markdown
523/// > | a <b c="d"> e
524///             ^
525/// ```
526pub fn tag_open_attribute_value_quoted(tokenizer: &mut Tokenizer) -> State {
527    if tokenizer.current == Some(tokenizer.tokenize_state.marker) {
528        tokenizer.tokenize_state.marker = 0;
529        tokenizer.consume();
530        State::Next(StateName::HtmlTextTagOpenAttributeValueQuotedAfter)
531    } else {
532        match tokenizer.current {
533            None => {
534                tokenizer.tokenize_state.marker = 0;
535                State::Nok
536            }
537            Some(b'\n') => {
538                tokenizer.attempt(
539                    State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted),
540                    State::Nok,
541                );
542                State::Retry(StateName::HtmlTextLineEndingBefore)
543            }
544            _ => {
545                tokenizer.consume();
546                State::Next(StateName::HtmlTextTagOpenAttributeValueQuoted)
547            }
548        }
549    }
550}
551
552/// In unquoted attribute value.
553///
554/// ```markdown
555/// > | a <b c=d> e
556///            ^
557/// ```
558pub fn tag_open_attribute_value_unquoted(tokenizer: &mut Tokenizer) -> State {
559    match tokenizer.current {
560        None | Some(b'"' | b'\'' | b'<' | b'=' | b'`') => State::Nok,
561        Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween),
562        Some(_) => {
563            tokenizer.consume();
564            State::Next(StateName::HtmlTextTagOpenAttributeValueUnquoted)
565        }
566    }
567}
568
569/// After double or single quoted attribute value, before whitespace or the end
570/// of the tag.
571///
572/// ```markdown
573/// > | a <b c="d"> e
574///               ^
575/// ```
576pub fn tag_open_attribute_value_quoted_after(tokenizer: &mut Tokenizer) -> State {
577    match tokenizer.current {
578        Some(b'\t' | b'\n' | b' ' | b'/' | b'>') => State::Retry(StateName::HtmlTextTagOpenBetween),
579        _ => State::Nok,
580    }
581}
582
583/// In certain circumstances of a tag where only an `>` is allowed.
584///
585/// ```markdown
586/// > | a <b c="d"> e
587///               ^
588/// ```
589pub fn end(tokenizer: &mut Tokenizer) -> State {
590    match tokenizer.current {
591        Some(b'>') => {
592            tokenizer.consume();
593            tokenizer.exit(Name::HtmlTextData);
594            tokenizer.exit(Name::HtmlText);
595            State::Ok
596        }
597        _ => State::Nok,
598    }
599}
600
601/// At eol.
602///
603/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about
604/// > empty tokens.
605///
606/// ```markdown
607/// > | a <!--a
608///            ^
609///   | b-->
610/// ```
611pub fn line_ending_before(tokenizer: &mut Tokenizer) -> State {
612    match tokenizer.current {
613        Some(b'\n') => {
614            tokenizer.exit(Name::HtmlTextData);
615            tokenizer.enter(Name::LineEnding);
616            tokenizer.consume();
617            tokenizer.exit(Name::LineEnding);
618            State::Next(StateName::HtmlTextLineEndingAfter)
619        }
620        _ => unreachable!("expected eol"),
621    }
622}
623
624/// After eol, at optional whitespace.
625///
626/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about
627/// > empty tokens.
628///
629/// ```markdown
630///   | a <!--a
631/// > | b-->
632///     ^
633/// ```
634pub fn line_ending_after(tokenizer: &mut Tokenizer) -> State {
635    if matches!(tokenizer.current, Some(b'\t' | b' ')) {
636        tokenizer.attempt(
637            State::Next(StateName::HtmlTextLineEndingAfterPrefix),
638            State::Nok,
639        );
640        State::Retry(space_or_tab(tokenizer))
641    } else {
642        State::Retry(StateName::HtmlTextLineEndingAfterPrefix)
643    }
644}
645
646/// After eol, after optional whitespace.
647///
648/// > 👉 **Note**: we can’t have blank lines in text, so no need to worry about
649/// > empty tokens.
650///
651/// ```markdown
652///   | a <!--a
653/// > | b-->
654///     ^
655/// ```
656pub fn line_ending_after_prefix(tokenizer: &mut Tokenizer) -> State {
657    tokenizer.enter(Name::HtmlTextData);
658    State::Ok
659}