src/parser/syntax.ml at main · jon.recoil.org/odoc

this repo has no description
odoc / src / parser / syntax.ml
at main 1505 lines 60 kB view raw
wrap content
   1(* This module is a recursive descent parser for the ocamldoc syntax. The parser
   2   consumes a token stream of type [Token.t Stream.t], provided by the lexer,
   3   and produces a comment AST of the type defined in [Parser_.Ast].
   4
   5   The AST has two main levels: inline elements, which can appear inside
   6   paragraphs, and are spaced horizontally when presented, and block elements,
   7   such as paragraphs and lists, which are spaced vertically when presented.
   8   Block elements contain inline elements, but not vice versa.
   9
  10   Corresponding to this, the parser has three "main" functions:
  11
  12   - [delimited_inline_element_list] parses a run of inline elements that is
  13     delimited by curly brace markup ([{...}]).
  14   - [paragraph] parses a run of inline elements that make up a paragraph, and
  15     is not explicitly delimited with curly braces.
  16   - [block_element_list] parses a sequence of block elements. A comment is a
  17     sequence of block elements, so [block_element_list] is the top-level
  18     parser. It is also used for list item and tag content. *)
  19
  20open! Compat
  21
  22type 'a with_location = 'a Loc.with_location
  23
  24(* {2 Input} *)
  25
  26type input = {
  27  tokens : Token.t Loc.with_location Stream.t;
  28  warnings : Warning.t list ref;
  29}
  30
  31(* {2 Output} *)
  32
  33let add_warning input warning = input.warnings := warning :: !(input.warnings)
  34let junk input = Stream.junk input.tokens
  35
  36let peek input =
  37  match Stream.peek input.tokens with
  38  | Some token -> token
  39  | None -> assert false
  40
  41module Table = struct
  42  module Light_syntax = struct
  43    let valid_align = function
  44      | [ { Loc.value = `Word w; _ } ] -> (
  45          match String.length w with
  46          | 0 -> `Valid None
  47          | 1 -> (
  48              match w with
  49              | "-" -> `Valid None
  50              | ":" -> `Valid (Some `Center)
  51              | _ -> `Invalid)
  52          | len ->
  53              if String.for_all (Char.equal '-') (String.sub w 1 (len - 2)) then
  54                match (String.get w 0, String.get w (len - 1)) with
  55                | ':', ':' -> `Valid (Some `Center)
  56                | ':', '-' -> `Valid (Some `Left)
  57                | '-', ':' -> `Valid (Some `Right)
  58                | '-', '-' -> `Valid None
  59                | _ -> `Invalid
  60              else `Invalid)
  61      | _ -> `Invalid
  62
  63    let valid_align_row lx =
  64      let rec loop acc = function
  65        | [] -> Some (List.rev acc)
  66        | x :: q -> (
  67            match valid_align x with
  68            | `Invalid -> None
  69            | `Valid alignment -> loop (alignment :: acc) q)
  70      in
  71      loop [] lx
  72
  73    let create ~grid ~align : Ast.table =
  74      let cell_to_block (x, k) =
  75        let whole_loc = Loc.span (List.map (fun x -> x.Loc.location) x) in
  76        match x with
  77        | [] -> ([], k)
  78        | _ -> ([ Loc.at whole_loc (`Paragraph x) ], k)
  79      in
  80      let row_to_block = List.map cell_to_block in
  81      let grid_to_block = List.map row_to_block in
  82      ((grid_to_block grid, align), `Light)
  83
  84    let with_kind kind : 'a with_location list list -> 'a Ast.row =
  85      List.map (fun c -> (c, kind))
  86
  87    let from_raw_data grid : Ast.table =
  88      match grid with
  89      | [] -> create ~grid:[] ~align:None
  90      | row1 :: rows2_N -> (
  91          match valid_align_row row1 with
  92          (* If the first line is the align row, everything else is data. *)
  93          | Some _ as align ->
  94              create ~grid:(List.map (with_kind `Data) rows2_N) ~align
  95          | None -> (
  96              match rows2_N with
  97              (* Only 1 line, if this is not the align row this is data. *)
  98              | [] -> create ~grid:[ with_kind `Data row1 ] ~align:None
  99              | row2 :: rows3_N -> (
 100                  match valid_align_row row2 with
 101                  (* If the second line is the align row, the first one is the
 102                     header and the rest is data. *)
 103                  | Some _ as align ->
 104                      let header = with_kind `Header row1 in
 105                      let data = List.map (with_kind `Data) rows3_N in
 106                      create ~grid:(header :: data) ~align
 107                  (* No align row in the first 2 lines, everything is considered
 108                     data. *)
 109                  | None ->
 110                      create ~grid:(List.map (with_kind `Data) grid) ~align:None
 111                  )))
 112  end
 113
 114  module Heavy_syntax = struct
 115    let create ~grid : Ast.table = ((grid, None), `Heavy)
 116    let from_grid grid : Ast.table = create ~grid
 117  end
 118end
 119
 120module Reader = struct
 121  let until_rbrace_or_eof input acc =
 122    let rec consume () =
 123      let next_token = peek input in
 124      match next_token.value with
 125      | `Right_brace ->
 126          junk input;
 127          `End (acc, next_token.location)
 128      | `End ->
 129          Parse_error.end_not_allowed next_token.location ~in_what:"table"
 130          |> add_warning input;
 131          junk input;
 132          `End (acc, next_token.location)
 133      | `Space _ | `Single_newline _ | `Blank_line _ ->
 134          junk input;
 135          consume ()
 136      | _ -> `Token next_token
 137    in
 138    consume ()
 139
 140  module Infix = struct
 141    let ( >>> ) consume if_token =
 142      match consume with
 143      | `End (ret, loc) -> (ret, loc)
 144      | `Token t -> if_token t
 145  end
 146end
 147
 148open Reader.Infix
 149
 150(* The last token in the stream is always [`End], and it is never consumed by
 151   the parser, so the [None] case is impossible. *)
 152
 153let npeek n input = Stream.npeek n input.tokens
 154
 155(* {2 Non-link inline elements} *)
 156type style = [ `Bold | `Italic | `Emphasis | `Superscript | `Subscript ]
 157
 158(* Convenient abbreviation for use in patterns. *)
 159type token_that_always_begins_an_inline_element =
 160  [ `Word of string
 161  | `Code_span of string
 162  | `Raw_markup of string option * string
 163  | `Begin_style of style
 164  | `Simple_reference of string
 165  | `Begin_reference_with_replacement_text of string
 166  | `Simple_link of string
 167  | `Begin_link_with_replacement_text of string
 168  | `Math_span of string ]
 169
 170(* Check that the token constructors above actually are all in [Token.t]. *)
 171let _check_subset : token_that_always_begins_an_inline_element -> Token.t =
 172 fun t -> (t :> Token.t)
 173
 174(* Consumes tokens that make up a single non-link inline element:
 175
 176   - a horizontal space ([`Space], significant in inline elements),
 177   - a word (see [word]),
 178   - a code span ([...], [`Code_span _]), or
 179   - styled text ({e ...}).
 180
 181   The latter requires a recursive call to [delimited_inline_element_list],
 182   defined below.
 183
 184   This should be part of [delimited_inline_element_list]; however, it is also
 185   called by function [paragraph]. As a result, it is factored out, and made
 186   mutually-recursive with [delimited_inline_element_list].
 187
 188   This is called only when it is known that the first token in the list is the
 189   beginning of an inline element. In the case of [`Minus] and [`Plus], that
 190   means the caller has determined that they are not a list bullet (i.e., not
 191   the first non-whitespace tokens on their line).
 192
 193   This function consumes exactly the tokens that make up the element. *)
 194let rec inline_element :
 195    input -> Loc.span -> _ -> Ast.inline_element with_location =
 196 fun input location next_token ->
 197  match next_token with
 198  | `Space _ as token ->
 199      junk input;
 200      Loc.at location token
 201  | `Word _ as token ->
 202      junk input;
 203      Loc.at location token
 204      (* This is actually the same memory representation as the token, complete
 205         with location, and is probably the most common case. Perhaps the token
 206         can be reused somehow. The same is true of [`Space], [`Code_span]. *)
 207  | `Minus ->
 208      junk input;
 209      Loc.at location (`Word "-")
 210  | `Plus ->
 211      junk input;
 212      Loc.at location (`Word "+")
 213  | `Bar ->
 214      junk input;
 215      Loc.at location (`Word "|")
 216  | (`Code_span _ | `Math_span _ | `Raw_markup _) as token ->
 217      junk input;
 218      Loc.at location token
 219  | `Begin_style s as parent_markup ->
 220      junk input;
 221
 222      let requires_leading_whitespace =
 223        match s with
 224        | `Bold | `Italic | `Emphasis -> true
 225        | `Superscript | `Subscript -> false
 226      in
 227      let content, brace_location =
 228        delimited_inline_element_list ~parent_markup
 229          ~parent_markup_location:location ~requires_leading_whitespace input
 230      in
 231
 232      let location = Loc.span [ location; brace_location ] in
 233
 234      if content = [] then
 235        Parse_error.should_not_be_empty
 236          ~what:(Token.describe parent_markup)
 237          location
 238        |> add_warning input;
 239
 240      Loc.at location (`Styled (s, content))
 241  | `Simple_reference r ->
 242      junk input;
 243
 244      let r_location = Loc.nudge_start (String.length "{!") location in
 245      let r = Loc.at r_location r in
 246
 247      Loc.at location (`Reference (`Simple, r, []))
 248  | `Begin_reference_with_replacement_text r as parent_markup ->
 249      junk input;
 250
 251      let r_location = Loc.nudge_start (String.length "{{!") location in
 252      let r = Loc.at r_location r in
 253
 254      let content, brace_location =
 255        delimited_inline_element_list ~parent_markup
 256          ~parent_markup_location:location ~requires_leading_whitespace:false
 257          input
 258      in
 259
 260      let location = Loc.span [ location; brace_location ] in
 261
 262      if content = [] then
 263        Parse_error.should_not_be_empty
 264          ~what:(Token.describe parent_markup)
 265          location
 266        |> add_warning input;
 267
 268      Loc.at location (`Reference (`With_text, r, content))
 269  | `Simple_link u ->
 270      junk input;
 271
 272      let u = String.trim u in
 273
 274      if u = "" then
 275        Parse_error.should_not_be_empty
 276          ~what:(Token.describe next_token)
 277          location
 278        |> add_warning input;
 279
 280      Loc.at location (`Link (u, []))
 281  | `Begin_link_with_replacement_text u as parent_markup ->
 282      junk input;
 283
 284      let u = String.trim u in
 285
 286      if u = "" then
 287        Parse_error.should_not_be_empty
 288          ~what:(Token.describe parent_markup)
 289          location
 290        |> add_warning input;
 291
 292      let content, brace_location =
 293        delimited_inline_element_list ~parent_markup
 294          ~parent_markup_location:location ~requires_leading_whitespace:false
 295          input
 296      in
 297
 298      `Link (u, content) |> Loc.at (Loc.span [ location; brace_location ])
 299
 300(* Consumes tokens that make up a sequence of inline elements that is ended by
 301   a '}', a [`Right_brace] token. The brace token is also consumed.
 302
 303   The sequences are also preceded by some markup like '{b'. Some of these
 304   markup tokens require whitespace immediately after the token, and others not.
 305   The caller indicates which way that is through the
 306   [~requires_leading_whitespace] argument.
 307
 308   Whitespace is significant in inline element lists. In particular, "foo [bar]"
 309   is represented as [`Word "foo"; `Space; `Code_span "bar"], while "foo[bar]"
 310   is [`Word "foo"; `Code_span "bar"]. It doesn't matter how much whitespace is
 311   there, just whether it is present or not. Single newlines and horizontal
 312   space in any amount are allowed. Blank lines are not, as these are separators
 313   for {e block} elements.
 314
 315   In correct input, the first and last elements emitted will not be [`Space],
 316   i.e. [`Space] appears only between other non-link inline elements. In
 317   incorrect input, there might be [`Space] followed immediately by something
 318   like an @author tag.
 319
 320   The [~parent_markup] and [~parent_markup_location] arguments are used for
 321   generating error messages. *)
 322and delimited_inline_element_list :
 323    parent_markup:[< Token.t ] ->
 324    parent_markup_location:Loc.span ->
 325    requires_leading_whitespace:bool ->
 326    input ->
 327    Ast.inline_element with_location list * Loc.span =
 328 fun ~parent_markup ~parent_markup_location ~requires_leading_whitespace
 329     input ->
 330  (* [~at_start_of_line] is used to interpret [`Minus] and [`Plus]. These are
 331     word tokens if not the first non-whitespace tokens on their line. Then,
 332     they are allowed in a non-link element list. *)
 333  let rec consume_elements :
 334      at_start_of_line:bool ->
 335      Ast.inline_element with_location list ->
 336      Ast.inline_element with_location list * Loc.span =
 337   fun ~at_start_of_line acc ->
 338    let next_token = peek input in
 339    match next_token.value with
 340    | `Right_brace ->
 341        junk input;
 342        (List.rev acc, next_token.location)
 343    (* The [`Space] token is not space at the beginning or end of line, because
 344       that is combined into [`Single_newline] or [`Blank_line] tokens. It is
 345       also not at the beginning of markup (after e.g. '{b'), because that is
 346       handled separately before calling
 347       [consume_non_link_inline_elements], and not immediately before '}',
 348       because that is combined into the [`Right_brace] token by the lexer. So,
 349       it is an internal space, and we want to add it to the non-link inline
 350       element list. *)
 351    | (`Space _ | #token_that_always_begins_an_inline_element) as token ->
 352        let acc = inline_element input next_token.location token :: acc in
 353        consume_elements ~at_start_of_line:false acc
 354    | `Single_newline ws ->
 355        junk input;
 356        let element = Loc.same next_token (`Space ws) in
 357        consume_elements ~at_start_of_line:true (element :: acc)
 358    | `Blank_line ws as blank ->
 359        Parse_error.not_allowed ~what:(Token.describe blank)
 360          ~in_what:(Token.describe parent_markup)
 361          next_token.location
 362        |> add_warning input;
 363
 364        junk input;
 365        let element = Loc.same next_token (`Space ws) in
 366        consume_elements ~at_start_of_line:true (element :: acc)
 367    | `Bar as token ->
 368        let acc = inline_element input next_token.location token :: acc in
 369        consume_elements ~at_start_of_line:false acc
 370    | (`Minus | `Plus) as bullet ->
 371        (if at_start_of_line then
 372           let suggestion =
 373             Printf.sprintf "move %s so it isn't the first thing on the line."
 374               (Token.print bullet)
 375           in
 376           Parse_error.not_allowed ~what:(Token.describe bullet)
 377             ~in_what:(Token.describe parent_markup)
 378             ~suggestion next_token.location
 379           |> add_warning input);
 380
 381        let acc = inline_element input next_token.location bullet :: acc in
 382        consume_elements ~at_start_of_line:false acc
 383    | other_token ->
 384        Parse_error.not_allowed
 385          ~what:(Token.describe other_token)
 386          ~in_what:(Token.describe parent_markup)
 387          next_token.location
 388        |> add_warning input;
 389
 390        let last_location =
 391          match acc with
 392          | last_token :: _ -> last_token.location
 393          | [] -> parent_markup_location
 394        in
 395
 396        (List.rev acc, last_location)
 397  in
 398
 399  let first_token = peek input in
 400  match first_token.value with
 401  | `Space _ ->
 402      junk input;
 403      consume_elements ~at_start_of_line:false []
 404      (* [~at_start_of_line] is [false] here because the preceding token was some
 405         some markup like '{b', and we didn't move to the next line, so the next
 406         token will not be the first non-whitespace token on its line. *)
 407  | `Single_newline _ ->
 408      junk input;
 409      consume_elements ~at_start_of_line:true []
 410  | `Blank_line _ as blank ->
 411      (* In case the markup is immediately followed by a blank line, the error
 412         message printed by the catch-all case below can be confusing, as it will
 413         suggest that the markup must be followed by a newline (which it is). It
 414         just must not be followed by two newlines. To explain that clearly,
 415         handle that case specifically. *)
 416      Parse_error.not_allowed ~what:(Token.describe blank)
 417        ~in_what:(Token.describe parent_markup)
 418        first_token.location
 419      |> add_warning input;
 420
 421      junk input;
 422      consume_elements ~at_start_of_line:true []
 423  | `Right_brace ->
 424      junk input;
 425      ([], first_token.location)
 426  | _ ->
 427      if requires_leading_whitespace then
 428        Parse_error.should_be_followed_by_whitespace
 429          ~what:(Token.print parent_markup)
 430          parent_markup_location
 431        |> add_warning input;
 432      consume_elements ~at_start_of_line:false []
 433
 434(* {2 Paragraphs} *)
 435
 436(* Consumes tokens that make up a paragraph.
 437
 438   A paragraph is a sequence of inline elements that ends on a blank line, or
 439   explicit block markup such as a verbatim block on a new line.
 440
 441   Because of the significance of newlines, paragraphs are parsed line-by-line.
 442   The function [paragraph] is called only when the current token is the first
 443   non-whitespace token on its line, and begins an inline element. [paragraph]
 444   then parses a line of inline elements. Afterwards, it looks ahead to the next
 445   line. If that line also begins with an inline element, it parses that line,
 446   and so on. *)
 447let paragraph : input -> Ast.nestable_block_element with_location =
 448 fun input ->
 449  (* Parses a single line of a paragraph, consisting of inline elements. The
 450     only valid ways to end a paragraph line are with [`End], [`Single_newline],
 451     [`Blank_line], and [`Right_brace]. Everything else either belongs in the
 452     paragraph, or signifies an attempt to begin a block element inside a
 453     paragraph line, which is an error. These errors are caught elsewhere; the
 454     paragraph parser just stops. *)
 455  let rec paragraph_line :
 456      Ast.inline_element with_location list ->
 457      Ast.inline_element with_location list =
 458   fun acc ->
 459    let next_token = peek input in
 460    match next_token.value with
 461    | ( `Space _ | `Minus | `Plus | `Bar
 462      | #token_that_always_begins_an_inline_element ) as token ->
 463        let element = inline_element input next_token.location token in
 464        paragraph_line (element :: acc)
 465    | _ -> acc
 466  in
 467
 468  (* After each line is parsed, decides whether to parse more lines. *)
 469  let rec additional_lines :
 470      Ast.inline_element with_location list ->
 471      Ast.inline_element with_location list =
 472   fun acc ->
 473    match npeek 2 input with
 474    | { value = `Single_newline ws; location }
 475      :: { value = #token_that_always_begins_an_inline_element | `Bar; _ }
 476      :: _ ->
 477        junk input;
 478        let acc = Loc.at location (`Space ws) :: acc in
 479        let acc = paragraph_line acc in
 480        additional_lines acc
 481    | _ -> List.rev acc
 482  in
 483
 484  let elements = paragraph_line [] |> additional_lines in
 485  `Paragraph elements |> Loc.at (Loc.span (List.map Loc.location elements))
 486
 487(* {2 Block elements} *)
 488
 489(* {3 Helper types} *)
 490
 491(* The interpretation of tokens in the block parser depends on where on a line
 492    each token appears. The six possible "locations" are:
 493
 494    - [`At_start_of_line], when only whitespace has been read on the current
 495      line.
 496    - [`After_tag], when a valid tag token, such as [@deprecated], has been read,
 497      and only whitespace has been read since.
 498    - [`After_shorthand_bullet], when a valid shorthand list item bullet, such as
 499      [-], has been read, and only whitespace has been read since.
 500    - [`After_explicit_list_bullet], when a valid explicit bullet, such as [{li],
 501      has been read, and only whitespace has been read since.
 502    - [`After_table_cell], when a table cell opening markup ('{th' or '{td') has been read.
 503    - [`After_text], when any other valid non-whitespace token has already been
 504      read on the current line.
 505
 506    Here are some examples of how this affects the interpretation of tokens:
 507
 508    - A paragraph can start anywhere except [`After_text] (two paragraphs cannot
 509      be on the same line, but paragraphs can be nested in just about anything).
 510    - [`Minus] is interpreted as a list item bullet [`At_start_of_line],
 511      [`After_tag], and [`After_explicit_list_bullet].
 512    - Tags are only allowed [`At_start_of_line].
 513
 514   To track the location accurately, the functions that make up the block parser
 515   pass explicit [where_in_line] values around and return them.
 516
 517   In a few cases, [where_in_line] can be inferred from what helper was called.
 518   For example, the [paragraph] parser always stops on the same line as the last
 519   significant token that is in the paragraph it consumed, so the location must
 520   be [`After_text]. *)
 521type where_in_line =
 522  [ `At_start_of_line
 523  | `After_tag
 524  | `After_shorthand_bullet
 525  | `After_explicit_list_bullet
 526  | `After_table_cell
 527  | `After_text ]
 528
 529(* The block parsing loop, function [block_element_list], stops when it
 530   encounters certain tokens.
 531
 532   When it is called for the whole comment, or for in explicit list item
 533   ([{li foo}]), it can only stop on end of input or a right brace.
 534
 535   When it is called inside a shorthand list item ([- foo]), it stops on end of
 536   input, right brace, a blank line (indicating end of shorthand list), plus or
 537   minus (indicating the start of the next list item), or a section heading or
 538   tag, which cannot be nested in list markup.
 539
 540   The block parser [block_element_list] explicitly returns the token that
 541   stopped it, with a type more precise than [Token.t stream_head]: if it was
 542   called for the whole comment or an explicit list item, the stop token will
 543   have type [stops_at_delimiters stream_head], and if it was called for a
 544   shorthand list item, the stop token will have type
 545   [implicit_stop stream_head]. This allows the calling parsers to write precise
 546   cases for exactly the tokens that might be at the front of the stream after
 547   the block parser returns. *)
 548type stops_at_delimiters = [ `End | `Right_brace ]
 549type code_stop = [ `End | `Right_code_delimiter ]
 550
 551type stopped_implicitly =
 552  [ `End
 553  | `Blank_line of string
 554  | `Right_brace
 555  | `Minus
 556  | `Plus
 557  | Token.section_heading
 558  | Token.media_markup
 559  | Token.tag ]
 560
 561(* Ensure that the above two types are really subsets of [Token.t]. *)
 562let _check_subset : stops_at_delimiters -> Token.t = fun t -> (t :> Token.t)
 563let _check_subset : stopped_implicitly -> Token.t = fun t -> (t :> Token.t)
 564
 565(* The different contexts in which the block parser [block_element_list] can be
 566   called. The block parser's behavior depends somewhat on the context. For
 567   example, while paragraphs are allowed anywhere, shorthand lists are not
 568   allowed immediately inside other shorthand lists, while tags are not allowed
 569   anywhere except at the comment top level.
 570
 571   Besides telling the block parser how to behave, each context also carries two
 572   types, which determine the return type of the block parser:
 573
 574   - The type of blocks the parser returns. Note that [nestable_block_element]
 575     is included in [block_element]. However, the extra block kinds in
 576     [block_element] are only allowed at the comment top level.
 577   - The type of token that the block parser stops at. See discussion above. *)
 578type ('block, 'stops_at_which_tokens) context =
 579  | Top_level : (Ast.block_element, stops_at_delimiters) context
 580  | In_implicitly_ended :
 581      [ `Tag | `Shorthand_list ]
 582      -> (Ast.nestable_block_element, stopped_implicitly) context
 583  | In_explicit_list : (Ast.nestable_block_element, stops_at_delimiters) context
 584  | In_table_cell : (Ast.nestable_block_element, stops_at_delimiters) context
 585  | In_code_results : (Ast.nestable_block_element, code_stop) context
 586
 587(* This is a no-op. It is needed to prove to the type system that nestable block
 588   elements are acceptable block elements in all contexts. *)
 589let accepted_in_all_contexts : type block stops_at_which_tokens.
 590    (block, stops_at_which_tokens) context ->
 591    Ast.nestable_block_element ->
 592    block =
 593 fun context block ->
 594  match context with
 595  | Top_level -> (block :> Ast.block_element)
 596  | In_implicitly_ended (`Tag | `Shorthand_list) -> block
 597  | In_explicit_list -> block
 598  | In_table_cell -> block
 599  | In_code_results -> block
 600
 601(* Converts a tag to a series of words. This is used in error recovery, when a
 602   tag cannot be generated. *)
 603let tag_to_words = function
 604  | `Author s -> [ `Word "@author"; `Space " "; `Word s ]
 605  | `Before s -> [ `Word "@before"; `Space " "; `Word s ]
 606  | `Canonical s -> [ `Word "@canonical"; `Space " "; `Word s ]
 607  | `Deprecated -> [ `Word "@deprecated" ]
 608  | `Inline -> [ `Word "@inline" ]
 609  | `Open -> [ `Word "@open" ]
 610  | `Closed -> [ `Word "@closed" ]
 611  | `Hidden -> [ `Word "@hidden" ]
 612  | `Param s -> [ `Word "@param"; `Space " "; `Word s ]
 613  | `Raise s -> [ `Word "@raise"; `Space " "; `Word s ]
 614  | `Return -> [ `Word "@return" ]
 615  | `See (`Document, s) -> [ `Word "@see"; `Space " "; `Word ("\"" ^ s ^ "\"") ]
 616  | `See (`File, s) -> [ `Word "@see"; `Space " "; `Word ("'" ^ s ^ "'") ]
 617  | `See (`Url, s) -> [ `Word "@see"; `Space " "; `Word ("<" ^ s ^ ">") ]
 618  | `Since s -> [ `Word "@since"; `Space " "; `Word s ]
 619  | `Version s -> [ `Word "@version"; `Space " "; `Word s ]
 620  | `Children_order -> [ `Word "@children_order" ]
 621  | `Toc_status -> [ `Word "@toc_status" ]
 622  | `Order_category -> [ `Word "@order_category" ]
 623  | `Short_title -> [ `Word "@short_title" ]
 624  | `Custom tag -> [ `Word ("@" ^ tag) ]
 625
 626(* {3 Block element lists} *)
 627
 628(* Consumes tokens making up a sequence of block elements. These are:
 629
 630   - paragraphs,
 631   - code blocks,
 632   - verbatim text blocks,
 633   - tables,
 634   - lists, and
 635   - section headings. *)
 636let rec block_element_list : type block stops_at_which_tokens.
 637    (block, stops_at_which_tokens) context ->
 638    parent_markup:[< Token.t | `Comment ] ->
 639    input ->
 640    block with_location list
 641    * stops_at_which_tokens with_location
 642    * where_in_line =
 643 fun context ~parent_markup input ->
 644  let rec consume_block_elements :
 645      where_in_line ->
 646      block with_location list ->
 647      block with_location list
 648      * stops_at_which_tokens with_location
 649      * where_in_line =
 650   fun where_in_line acc ->
 651    let describe token =
 652      match token with
 653      | #token_that_always_begins_an_inline_element -> "paragraph"
 654      | _ -> Token.describe token
 655    in
 656
 657    let warn_if_after_text { Loc.location; value = token } =
 658      if where_in_line = `After_text then
 659        Parse_error.should_begin_on_its_own_line ~what:(describe token) location
 660        |> add_warning input
 661    in
 662
 663    let warn_because_not_at_top_level { Loc.location; value = token } =
 664      let suggestion =
 665        Printf.sprintf "move %s outside of any other markup."
 666          (Token.print token)
 667      in
 668      Parse_error.not_allowed ~what:(Token.describe token)
 669        ~in_what:(Token.describe parent_markup)
 670        ~suggestion location
 671      |> add_warning input
 672    in
 673
 674    match peek input with
 675    (* Terminators: the two tokens that terminate anything. *)
 676    | { value = `End; _ } as next_token -> (
 677        match context with
 678        | Top_level -> (List.rev acc, next_token, where_in_line)
 679        | In_implicitly_ended (`Tag | `Shorthand_list) ->
 680            (List.rev acc, next_token, where_in_line)
 681        | In_explicit_list -> (List.rev acc, next_token, where_in_line)
 682        | In_table_cell -> (List.rev acc, next_token, where_in_line)
 683        | In_code_results -> (List.rev acc, next_token, where_in_line))
 684    | { value = `Right_brace; _ } as next_token -> (
 685        (* This little absurdity is needed to satisfy the type system. Without it,
 686           OCaml is unable to prove that [stream_head] has the right type for all
 687           possible values of [context]. *)
 688        match context with
 689        | Top_level -> (List.rev acc, next_token, where_in_line)
 690        | In_implicitly_ended (`Tag | `Shorthand_list) ->
 691            (List.rev acc, next_token, where_in_line)
 692        | In_explicit_list -> (List.rev acc, next_token, where_in_line)
 693        | In_table_cell -> (List.rev acc, next_token, where_in_line)
 694        | In_code_results ->
 695            junk input;
 696            consume_block_elements where_in_line acc)
 697    | { value = `Right_code_delimiter; _ } as next_token -> (
 698        match context with
 699        | In_code_results -> (List.rev acc, next_token, where_in_line)
 700        | _ ->
 701            junk input;
 702            consume_block_elements where_in_line acc)
 703    (* Whitespace. This can terminate some kinds of block elements. It is also
 704       necessary to track it to interpret [`Minus] and [`Plus] correctly, as
 705       well as to ensure that all block elements begin on their own line. *)
 706    | { value = `Space _; _ } ->
 707        junk input;
 708        consume_block_elements where_in_line acc
 709    | { value = `Single_newline _; _ } ->
 710        junk input;
 711        consume_block_elements `At_start_of_line acc
 712    | { value = `Blank_line _; _ } as next_token -> (
 713        match context with
 714        (* Blank lines terminate shorthand lists ([- foo]) and tags. They also
 715           terminate paragraphs, but the paragraph parser is aware of that
 716           internally. *)
 717        | In_implicitly_ended (`Tag | `Shorthand_list) ->
 718            (List.rev acc, next_token, where_in_line)
 719        (* Otherwise, blank lines are pretty much like single newlines. *)
 720        | _ ->
 721            junk input;
 722            consume_block_elements `At_start_of_line acc)
 723    (* Explicit list items ([{li ...}] and [{- ...}]) can never appear directly
 724       in block content. They can only appear inside [{ul ...}] and [{ol ...}].
 725       So, catch those. *)
 726    | { value = `Begin_list_item _ as token; location } ->
 727        let suggestion =
 728          Printf.sprintf "move %s into %s, or use %s." (Token.print token)
 729            (Token.describe (`Begin_list `Unordered))
 730            (Token.describe `Minus)
 731        in
 732        Parse_error.not_allowed ~what:(Token.describe token)
 733          ~in_what:(Token.describe parent_markup)
 734          ~suggestion location
 735        |> add_warning input;
 736
 737        junk input;
 738        consume_block_elements where_in_line acc
 739    (* Table rows ([{tr ...}]) can never appear directly
 740       in block content. They can only appear inside [{table ...}]. *)
 741    | { value = `Begin_table_row as token; location } ->
 742        let suggestion =
 743          Printf.sprintf "move %s into %s." (Token.print token)
 744            (Token.describe `Begin_table_heavy)
 745        in
 746        Parse_error.not_allowed ~what:(Token.describe token)
 747          ~in_what:(Token.describe parent_markup)
 748          ~suggestion location
 749        |> add_warning input;
 750        junk input;
 751        consume_block_elements where_in_line acc
 752    (* Table cells ([{th ...}] and [{td ...}]) can never appear directly
 753       in block content. They can only appear inside [{tr ...}]. *)
 754    | { value = `Begin_table_cell _ as token; location } ->
 755        let suggestion =
 756          Printf.sprintf "move %s into %s." (Token.print token)
 757            (Token.describe `Begin_table_row)
 758        in
 759        Parse_error.not_allowed ~what:(Token.describe token)
 760          ~in_what:(Token.describe parent_markup)
 761          ~suggestion location
 762        |> add_warning input;
 763        junk input;
 764        consume_block_elements where_in_line acc
 765    (* Tags. These can appear at the top level only. *)
 766    | { value = `Tag tag as token; location } as next_token -> (
 767        let recover_when_not_at_top_level context =
 768          warn_because_not_at_top_level next_token;
 769          junk input;
 770          let words = List.map (Loc.at location) (tag_to_words tag) in
 771          let paragraph =
 772            `Paragraph words
 773            |> accepted_in_all_contexts context
 774            |> Loc.at location
 775          in
 776          consume_block_elements `At_start_of_line (paragraph :: acc)
 777        in
 778
 779        match context with
 780        (* Tags cannot make sense in an explicit list ([{ul {li ...}}]). *)
 781        | In_explicit_list -> recover_when_not_at_top_level context
 782        (* If a tag starts at the beginning of a line, it terminates the preceding
 783           tag and/or the current shorthand list. In this case, return to the
 784           caller, and let the caller decide how to interpret the tag token. *)
 785        | In_implicitly_ended (`Tag | `Shorthand_list) ->
 786            if where_in_line = `At_start_of_line then
 787              (List.rev acc, next_token, where_in_line)
 788            else recover_when_not_at_top_level context
 789        | In_table_cell -> recover_when_not_at_top_level context
 790        | In_code_results -> recover_when_not_at_top_level context
 791        (* If this is the top-level call to [block_element_list], parse the
 792           tag. *)
 793        | Top_level -> (
 794            if where_in_line <> `At_start_of_line then
 795              Parse_error.should_begin_on_its_own_line
 796                ~what:(Token.describe token) location
 797              |> add_warning input;
 798
 799            junk input;
 800
 801            match tag with
 802            | (`Author s | `Since s | `Version s | `Canonical s) as tag ->
 803                let s = String.trim s in
 804                if s = "" then
 805                  Parse_error.should_not_be_empty ~what:(Token.describe token)
 806                    location
 807                  |> add_warning input;
 808                let tag =
 809                  match tag with
 810                  | `Author _ -> `Author s
 811                  | `Since _ -> `Since s
 812                  | `Version _ -> `Version s
 813                  | `Canonical _ ->
 814                      (* TODO The location is only approximate, as we need lexer
 815                         cooperation to get the real location. *)
 816                      let r_location =
 817                        Loc.nudge_start (String.length "@canonical ") location
 818                      in
 819                      `Canonical (Loc.at r_location s)
 820                in
 821
 822                let tag = Loc.at location (`Tag tag) in
 823                consume_block_elements `After_text (tag :: acc)
 824            | ( `Deprecated | `Return | `Children_order | `Short_title
 825              | `Toc_status | `Order_category | `Custom _ ) as tag ->
 826                let content, _stream_head, where_in_line =
 827                  block_element_list (In_implicitly_ended `Tag)
 828                    ~parent_markup:token input
 829                in
 830                let tag =
 831                  match tag with
 832                  | `Deprecated -> `Deprecated content
 833                  | `Toc_status -> `Toc_status content
 834                  | `Return -> `Return content
 835                  | `Children_order -> `Children_order content
 836                  | `Short_title -> `Short_title content
 837                  | `Order_category -> `Order_category content
 838                  | `Custom s -> `Custom (s, content)
 839                in
 840                let location =
 841                  location :: List.map Loc.location content |> Loc.span
 842                in
 843                let tag = Loc.at location (`Tag tag) in
 844                consume_block_elements where_in_line (tag :: acc)
 845            | (`Param _ | `Raise _ | `Before _) as tag ->
 846                let content, _stream_head, where_in_line =
 847                  block_element_list (In_implicitly_ended `Tag)
 848                    ~parent_markup:token input
 849                in
 850                let tag =
 851                  match tag with
 852                  | `Param s -> `Param (s, content)
 853                  | `Raise s -> `Raise (s, content)
 854                  | `Before s -> `Before (s, content)
 855                in
 856                let location =
 857                  location :: List.map Loc.location content |> Loc.span
 858                in
 859                let tag = Loc.at location (`Tag tag) in
 860                consume_block_elements where_in_line (tag :: acc)
 861            | `See (kind, target) ->
 862                let content, _next_token, where_in_line =
 863                  block_element_list (In_implicitly_ended `Tag)
 864                    ~parent_markup:token input
 865                in
 866                let location =
 867                  location :: List.map Loc.location content |> Loc.span
 868                in
 869                let tag = `Tag (`See (kind, target, content)) in
 870                let tag = Loc.at location tag in
 871                consume_block_elements where_in_line (tag :: acc)
 872            | (`Inline | `Open | `Closed | `Hidden) as tag ->
 873                let tag = Loc.at location (`Tag tag) in
 874                consume_block_elements `After_text (tag :: acc)))
 875    | ( { value = #token_that_always_begins_an_inline_element; _ }
 876      | { value = `Bar; _ } ) as next_token ->
 877        warn_if_after_text next_token;
 878
 879        let block = paragraph input in
 880        let block = Loc.map (accepted_in_all_contexts context) block in
 881        let acc = block :: acc in
 882        consume_block_elements `After_text acc
 883    | { value = `Verbatim s as token; location } as next_token ->
 884        warn_if_after_text next_token;
 885        if s = "" then
 886          Parse_error.should_not_be_empty ~what:(Token.describe token) location
 887          |> add_warning input;
 888
 889        junk input;
 890        let block = accepted_in_all_contexts context token in
 891        let block = Loc.at location block in
 892        let acc = block :: acc in
 893        consume_block_elements `After_text acc
 894    | { value = `Math_block s as token; location } as next_token ->
 895        warn_if_after_text next_token;
 896        if s = "" then
 897          Parse_error.should_not_be_empty ~what:(Token.describe token) location
 898          |> add_warning input;
 899
 900        junk input;
 901        let block = accepted_in_all_contexts context token in
 902        let block = Loc.at location block in
 903        let acc = block :: acc in
 904        consume_block_elements `After_text acc
 905    | {
 906        value =
 907          `Code_block (meta, delim, { value = s; location = v_loc }, has_outputs)
 908          as token;
 909        location;
 910      } as next_token ->
 911        warn_if_after_text next_token;
 912        junk input;
 913        let delimiter = if delim = "" then None else Some delim in
 914        let output, location =
 915          if not has_outputs then (None, location)
 916          else
 917            let content, next_token, _where_in_line =
 918              block_element_list In_code_results ~parent_markup:token input
 919            in
 920            junk input;
 921            let locations =
 922              location :: List.map (fun content -> content.Loc.location) content
 923            in
 924            let location = Loc.span locations in
 925            let location = { location with end_ = next_token.location.end_ } in
 926            (Some content, location)
 927        in
 928
 929        if s = "" then
 930          Parse_error.should_not_be_empty ~what:(Token.describe token) location
 931          |> add_warning input;
 932
 933        let meta =
 934          match meta with
 935          | None -> None
 936          | Some (language, tags) -> Some { Ast.language; tags }
 937        in
 938        let block =
 939          accepted_in_all_contexts context
 940            (`Code_block
 941               {
 942                 Ast.meta;
 943                 delimiter;
 944                 content = { value = s; location = v_loc };
 945                 output;
 946               })
 947        in
 948        let block = Loc.at location block in
 949        let acc = block :: acc in
 950        consume_block_elements `After_text acc
 951    | { value = `Modules s as token; location } as next_token ->
 952        warn_if_after_text next_token;
 953
 954        junk input;
 955
 956        (* TODO Use some library for a splitting function, or move this out into a
 957           Util module. *)
 958        let split_string delimiters s =
 959          let rec scan_delimiters acc index =
 960            if index >= String.length s then List.rev acc
 961            else if String.contains delimiters s.[index] then
 962              scan_delimiters acc (index + 1)
 963            else scan_word acc index (index + 1)
 964          and scan_word acc start_index index =
 965            if index >= String.length s then
 966              let word = String.sub s start_index (index - start_index) in
 967              List.rev (word :: acc)
 968            else if String.contains delimiters s.[index] then
 969              let word = String.sub s start_index (index - start_index) in
 970              scan_delimiters (word :: acc) (index + 1)
 971            else scan_word acc start_index (index + 1)
 972          in
 973
 974          scan_delimiters [] 0
 975        in
 976
 977        (* TODO Correct locations await a full implementation of {!modules}
 978           parsing. *)
 979        let modules =
 980          split_string " \t\r\n" s |> List.map (fun r -> Loc.at location r)
 981        in
 982
 983        if modules = [] then
 984          Parse_error.should_not_be_empty ~what:(Token.describe token) location
 985          |> add_warning input;
 986
 987        let block = accepted_in_all_contexts context (`Modules modules) in
 988        let block = Loc.at location block in
 989        let acc = block :: acc in
 990        consume_block_elements `After_text acc
 991    | { value = `Begin_list kind as token; location } as next_token ->
 992        warn_if_after_text next_token;
 993
 994        junk input;
 995
 996        let items, brace_location =
 997          explicit_list_items ~parent_markup:token input
 998        in
 999        if items = [] then
1000          Parse_error.should_not_be_empty ~what:(Token.describe token) location
1001          |> add_warning input;
1002
1003        let location = Loc.span [ location; brace_location ] in
1004        let block = `List (kind, `Heavy, items) in
1005        let block = accepted_in_all_contexts context block in
1006        let block = Loc.at location block in
1007        let acc = block :: acc in
1008        consume_block_elements `After_text acc
1009    | { value = (`Begin_table_light | `Begin_table_heavy) as token; location }
1010      as next_token ->
1011        warn_if_after_text next_token;
1012        junk input;
1013        let block, brace_location =
1014          let parent_markup = token in
1015          let parent_markup_location = location in
1016          match token with
1017          | `Begin_table_light ->
1018              light_table input ~parent_markup ~parent_markup_location
1019          | `Begin_table_heavy ->
1020              heavy_table input ~parent_markup ~parent_markup_location
1021        in
1022        let location = Loc.span [ location; brace_location ] in
1023        let block = accepted_in_all_contexts context (`Table block) in
1024        let block = Loc.at location block in
1025        let acc = block :: acc in
1026        consume_block_elements `After_text acc
1027    | { value = (`Minus | `Plus) as token; location } as next_token -> (
1028        (match where_in_line with
1029        | `After_text | `After_shorthand_bullet ->
1030            Parse_error.should_begin_on_its_own_line
1031              ~what:(Token.describe token) location
1032            |> add_warning input
1033        | _ -> ());
1034
1035        match context with
1036        | In_implicitly_ended `Shorthand_list ->
1037            (List.rev acc, next_token, where_in_line)
1038        | _ ->
1039            let items, where_in_line =
1040              shorthand_list_items next_token where_in_line input
1041            in
1042            let kind =
1043              match token with `Minus -> `Unordered | `Plus -> `Ordered
1044            in
1045            let location =
1046              location :: List.map Loc.location (List.flatten items) |> Loc.span
1047            in
1048            let block = `List (kind, `Light, items) in
1049            let block = accepted_in_all_contexts context block in
1050            let block = Loc.at location block in
1051            let acc = block :: acc in
1052            consume_block_elements where_in_line acc)
1053    | { value = `Begin_section_heading (level, label) as token; location } as
1054      next_token -> (
1055        let recover_when_not_at_top_level context =
1056          warn_because_not_at_top_level next_token;
1057          junk input;
1058          let content, brace_location =
1059            delimited_inline_element_list ~parent_markup:token
1060              ~parent_markup_location:location ~requires_leading_whitespace:true
1061              input
1062          in
1063          let location = Loc.span [ location; brace_location ] in
1064          let paragraph =
1065            `Paragraph content
1066            |> accepted_in_all_contexts context
1067            |> Loc.at location
1068          in
1069          consume_block_elements `At_start_of_line (paragraph :: acc)
1070        in
1071
1072        match context with
1073        | In_implicitly_ended (`Tag | `Shorthand_list) ->
1074            if where_in_line = `At_start_of_line then
1075              (List.rev acc, next_token, where_in_line)
1076            else recover_when_not_at_top_level context
1077        | In_explicit_list -> recover_when_not_at_top_level context
1078        | In_table_cell -> recover_when_not_at_top_level context
1079        | In_code_results -> recover_when_not_at_top_level context
1080        | Top_level ->
1081            if where_in_line <> `At_start_of_line then
1082              Parse_error.should_begin_on_its_own_line
1083                ~what:(Token.describe token) location
1084              |> add_warning input;
1085
1086            let label =
1087              match label with
1088              | Some "" ->
1089                  Parse_error.should_not_be_empty ~what:"heading label" location
1090                  |> add_warning input;
1091                  None
1092              | _ -> label
1093            in
1094
1095            junk input;
1096
1097            let content, brace_location =
1098              delimited_inline_element_list ~parent_markup:token
1099                ~parent_markup_location:location
1100                ~requires_leading_whitespace:true input
1101            in
1102            if content = [] then
1103              Parse_error.should_not_be_empty ~what:(Token.describe token)
1104                location
1105              |> add_warning input;
1106
1107            let location = Loc.span [ location; brace_location ] in
1108            let heading = `Heading (level, label, content) in
1109            let heading = Loc.at location heading in
1110            let acc = heading :: acc in
1111            consume_block_elements `After_text acc)
1112    | { value = `Begin_paragraph_style _ as token; location } ->
1113        junk input;
1114        let content, brace_location =
1115          delimited_inline_element_list ~parent_markup:token
1116            ~parent_markup_location:location ~requires_leading_whitespace:true
1117            input
1118        in
1119        let location = Loc.span [ location; brace_location ] in
1120
1121        Parse_error.markup_should_not_be_used ~what:(Token.describe token)
1122          location
1123        |> add_warning input;
1124
1125        let paragraph =
1126          `Paragraph content
1127          |> accepted_in_all_contexts context
1128          |> Loc.at location
1129        in
1130        consume_block_elements `At_start_of_line (paragraph :: acc)
1131    | {
1132     location;
1133     value = `Media_with_replacement_text (href, media, content) as token;
1134    } ->
1135        junk input;
1136
1137        let r_location =
1138          Loc.nudge_start
1139            (String.length @@ Token.s_of_media `Replaced media)
1140            location
1141          |> Loc.nudge_end (String.length content + 1)
1142          (* +1 for closing character *)
1143        in
1144        let c_location =
1145          Loc.nudge_start
1146            (String.length (Token.s_of_media `Replaced media)
1147            + String.length (match href with `Reference s | `Link s -> s))
1148            location
1149          |> Loc.nudge_end 1
1150        in
1151        let content = String.trim content in
1152        let href = href |> Loc.at r_location in
1153
1154        if content = "" then
1155          Parse_error.should_not_be_empty ~what:(Token.describe token)
1156            c_location
1157          |> add_warning input;
1158
1159        let block = `Media (`Simple, href, content, media) in
1160        let block = accepted_in_all_contexts context block in
1161        let block = Loc.at location block in
1162        let acc = block :: acc in
1163        consume_block_elements `After_text acc
1164    | { location; value = `Simple_media (href, media) } ->
1165        junk input;
1166
1167        let r_location =
1168          Loc.nudge_start
1169            (String.length @@ Token.s_of_media `Simple media)
1170            location
1171          |> Loc.nudge_end 1
1172        in
1173        let href = href |> Loc.at r_location in
1174        let block = `Media (`Simple, href, "", media) in
1175        let block = accepted_in_all_contexts context block in
1176        let block = Loc.at location block in
1177        let acc = block :: acc in
1178        consume_block_elements `After_text acc
1179  in
1180
1181  let where_in_line =
1182    match context with
1183    | Top_level -> `At_start_of_line
1184    | In_implicitly_ended `Shorthand_list -> `After_shorthand_bullet
1185    | In_explicit_list -> `After_explicit_list_bullet
1186    | In_table_cell -> `After_table_cell
1187    | In_code_results -> `After_tag
1188    | In_implicitly_ended `Tag -> `After_tag
1189  in
1190
1191  consume_block_elements where_in_line []
1192
1193(* {3 Lists} *)
1194
1195(* Consumes a sequence of implicit list items. Each one consists of a [`Minus]
1196   or [`Plus] token, followed by block elements until:
1197
1198   - a blank line, or
1199   - a list bullet of the opposite kind (e.g. [`Plus] for a [`Minus] list).
1200
1201   This function is called when the next token is known to be [`Minus] or
1202   [`Plus]. It consumes that token, and calls the block element parser (see
1203   above). That parser returns to [implicit_list_items] only on [`Blank_line],
1204   [`End], [`Minus] or [`Plus] at the start of a line, or [`Right_brace]. *)
1205and shorthand_list_items :
1206    [ `Minus | `Plus ] with_location ->
1207    where_in_line ->
1208    input ->
1209    Ast.nestable_block_element with_location list list * where_in_line =
1210 fun first_token where_in_line input ->
1211  let bullet_token = first_token.value in
1212
1213  let rec consume_list_items :
1214      [> ] with_location ->
1215      where_in_line ->
1216      Ast.nestable_block_element with_location list list ->
1217      Ast.nestable_block_element with_location list list * where_in_line =
1218   fun next_token where_in_line acc ->
1219    match next_token.value with
1220    | `End | `Right_brace | `Blank_line _ | `Tag _ | `Begin_section_heading _
1221    | `Simple_media _ | `Media_with_replacement_text _ ->
1222        (List.rev acc, where_in_line)
1223    | (`Minus | `Plus) as bullet ->
1224        if bullet = bullet_token then (
1225          junk input;
1226
1227          let content, stream_head, where_in_line =
1228            block_element_list (In_implicitly_ended `Shorthand_list)
1229              ~parent_markup:bullet input
1230          in
1231          if content = [] then
1232            Parse_error.should_not_be_empty ~what:(Token.describe bullet)
1233              next_token.location
1234            |> add_warning input;
1235
1236          let acc = content :: acc in
1237          consume_list_items stream_head where_in_line acc)
1238        else (List.rev acc, where_in_line)
1239  in
1240
1241  consume_list_items
1242    (first_token :> stopped_implicitly with_location)
1243    where_in_line []
1244
1245(* Consumes a sequence of explicit list items (starting with '{li ...}' and
1246   '{-...}', which are represented by [`Begin_list_item _] tokens).
1247
1248   This function is called immediately after '{ul' or '{ol' ([`Begin_list _]) is
1249   read. The only "valid" way to exit is by reading a [`Right_brace] token,
1250   which is consumed.
1251
1252   Whitespace inside the list, but outside list items, is not significant – this
1253   parsing function consumes all of it. Otherwise, only list item start tokens
1254   are accepted. Everything else is an error. *)
1255and explicit_list_items :
1256    parent_markup:[< Token.t ] ->
1257    input ->
1258    Ast.nestable_block_element with_location list list * Loc.span =
1259 fun ~parent_markup input ->
1260  let rec consume_list_items :
1261      Ast.nestable_block_element with_location list list ->
1262      Ast.nestable_block_element with_location list list * Loc.span =
1263   fun acc ->
1264    let next_token = peek input in
1265    match next_token.value with
1266    | `End ->
1267        Parse_error.end_not_allowed next_token.location
1268          ~in_what:(Token.describe parent_markup)
1269        |> add_warning input;
1270        (List.rev acc, next_token.location)
1271    | `Right_brace ->
1272        junk input;
1273        (List.rev acc, next_token.location)
1274    | `Space _ | `Single_newline _ | `Blank_line _ ->
1275        junk input;
1276        consume_list_items acc
1277    | `Begin_list_item kind as token ->
1278        junk input;
1279
1280        (* '{li', represented by [`Begin_list_item `Li], must be followed by
1281           whitespace. *)
1282        (if kind = `Li then
1283           match (peek input).value with
1284           | `Space _ | `Single_newline _ | `Blank_line _ | `Right_brace ->
1285               ()
1286               (* The presence of [`Right_brace] above requires some explanation:
1287
1288                  - It is better to be silent about missing whitespace if the next
1289                    token is [`Right_brace], because the error about an empty list
1290                    item will be generated below, and that error is more important to
1291                    the user.
1292                  - The [`Right_brace] token also happens to include all whitespace
1293                    before it, as a convenience for the rest of the parser. As a
1294                    result, not ignoring it could be wrong: there could in fact be
1295                    whitespace in the concrete syntax immediately after '{li', just
1296                    it is not represented as [`Space], [`Single_newline], or
1297                    [`Blank_line]. *)
1298           | _ ->
1299               Parse_error.should_be_followed_by_whitespace next_token.location
1300                 ~what:(Token.print token)
1301               |> add_warning input);
1302
1303        let content, token_after_list_item, _where_in_line =
1304          block_element_list In_explicit_list ~parent_markup:token input
1305        in
1306
1307        if content = [] then
1308          Parse_error.should_not_be_empty next_token.location
1309            ~what:(Token.describe token)
1310          |> add_warning input;
1311
1312        (match token_after_list_item.value with
1313        | `Right_brace -> junk input
1314        | `End ->
1315            Parse_error.end_not_allowed token_after_list_item.location
1316              ~in_what:(Token.describe token)
1317            |> add_warning input);
1318
1319        let acc = content :: acc in
1320        consume_list_items acc
1321    | token ->
1322        let suggestion =
1323          match token with
1324          | `Begin_section_heading _ | `Tag _ ->
1325              Printf.sprintf "move %s outside the list." (Token.describe token)
1326          | _ ->
1327              Printf.sprintf "move %s into a list item, %s or %s."
1328                (Token.describe token)
1329                (Token.print (`Begin_list_item `Li))
1330                (Token.print (`Begin_list_item `Dash))
1331        in
1332        Parse_error.not_allowed next_token.location ~what:(Token.describe token)
1333          ~in_what:(Token.describe parent_markup)
1334          ~suggestion
1335        |> add_warning input;
1336
1337        junk input;
1338        consume_list_items acc
1339  in
1340
1341  consume_list_items []
1342
1343(* Consumes a sequence of table rows that might start with [`Bar].
1344
1345   This function is called immediately after '{t' ([`Begin_table `Light]) is
1346   read. The only "valid" way to exit is by reading a [`Right_brace] token,
1347   which is consumed. *)
1348and light_table ~parent_markup ~parent_markup_location input =
1349  let rec consume_rows acc ~last_loc =
1350    Reader.until_rbrace_or_eof input acc >>> fun next_token ->
1351    match next_token.Loc.value with
1352    | `Bar | #token_that_always_begins_an_inline_element -> (
1353        let next, row, last_loc =
1354          light_table_row ~parent_markup ~last_loc input
1355        in
1356        match next with
1357        | `Continue -> consume_rows (row :: acc) ~last_loc
1358        | `Stop -> (row :: acc, last_loc))
1359    | other_token ->
1360        Parse_error.not_allowed next_token.location
1361          ~what:(Token.describe other_token)
1362          ~in_what:(Token.describe parent_markup)
1363        |> add_warning input;
1364        junk input;
1365        consume_rows acc ~last_loc
1366  in
1367  let rows, brace_location = consume_rows [] ~last_loc:parent_markup_location in
1368  let grid = List.rev rows in
1369  (Table.Light_syntax.from_raw_data grid, brace_location)
1370
1371(* Consumes a table row that might start with [`Bar]. *)
1372and light_table_row ~parent_markup ~last_loc input =
1373  let rec consume_row acc_row acc_cell acc_space ~new_line ~last_loc =
1374    let push_cells row cell =
1375      match cell with [] -> row | _ -> List.rev cell :: row
1376    in
1377    let return row cell = List.rev (push_cells row cell) in
1378    let next_token = peek input in
1379    match next_token.value with
1380    | `End ->
1381        Parse_error.end_not_allowed next_token.location ~in_what:"table"
1382        |> add_warning input;
1383        junk input;
1384        (`Stop, return acc_row acc_cell, next_token.location)
1385    | `Right_brace ->
1386        junk input;
1387        (`Stop, return acc_row acc_cell, next_token.location)
1388    | `Space _ as token ->
1389        junk input;
1390        let i = Loc.at next_token.location token in
1391        consume_row acc_row acc_cell (i :: acc_space) ~new_line ~last_loc
1392    | `Single_newline _ | `Blank_line _ ->
1393        junk input;
1394        (`Continue, return acc_row acc_cell, last_loc)
1395    | `Bar ->
1396        junk input;
1397        let acc_row = if new_line then [] else List.rev acc_cell :: acc_row in
1398        consume_row acc_row [] [] ~new_line:false ~last_loc
1399    | #token_that_always_begins_an_inline_element as token ->
1400        let i = inline_element input next_token.location token in
1401        if Loc.spans_multiple_lines i then
1402          Parse_error.not_allowed
1403            ~what:(Token.describe (`Single_newline ""))
1404            ~in_what:(Token.describe `Begin_table_light)
1405            i.location
1406          |> add_warning input;
1407        let acc_cell =
1408          if acc_cell = [] then [ i ] else (i :: acc_space) @ acc_cell
1409        in
1410        consume_row acc_row acc_cell [] ~new_line:false
1411          ~last_loc:next_token.location
1412    | other_token ->
1413        Parse_error.not_allowed next_token.location
1414          ~what:(Token.describe other_token)
1415          ~in_what:(Token.describe parent_markup)
1416        |> add_warning input;
1417        junk input;
1418        consume_row acc_row acc_cell acc_space ~new_line ~last_loc
1419  in
1420  consume_row [] [] [] ~new_line:true ~last_loc
1421
1422(* Consumes a sequence of table rows (starting with '{tr ...}', which are
1423   represented by [`Begin_table_row] tokens).
1424
1425   This function is called immediately after '{table' ([`Begin_table `Heavy]) is
1426   read. The only "valid" way to exit is by reading a [`Right_brace] token,
1427   which is consumed. *)
1428and heavy_table ~parent_markup ~parent_markup_location input =
1429  let rec consume_rows acc ~last_loc =
1430    Reader.until_rbrace_or_eof input acc >>> fun next_token ->
1431    match next_token.Loc.value with
1432    | `Begin_table_row as token ->
1433        junk input;
1434        let items, last_loc = heavy_table_row ~parent_markup:token input in
1435        consume_rows (List.rev items :: acc) ~last_loc
1436    | token ->
1437        Parse_error.not_allowed next_token.location ~what:(Token.describe token)
1438          ~in_what:(Token.describe parent_markup)
1439          ~suggestion:"Move outside of {table ...}, or inside {tr ...}"
1440        |> add_warning input;
1441        junk input;
1442        consume_rows acc ~last_loc
1443  in
1444  let rows, brace_location = consume_rows [] ~last_loc:parent_markup_location in
1445  let grid = List.rev rows in
1446  (Table.Heavy_syntax.from_grid grid, brace_location)
1447
1448(* Consumes a sequence of table cells (starting with '{th ...}' or '{td ... }',
1449   which are represented by [`Begin_table_cell] tokens).
1450
1451   This function is called immediately after '{tr' ([`Begin_table_row]) is
1452   read. The only "valid" way to exit is by reading a [`Right_brace] token,
1453   which is consumed. *)
1454and heavy_table_row ~parent_markup input =
1455  let rec consume_cell_items acc =
1456    Reader.until_rbrace_or_eof input acc >>> fun next_token ->
1457    match next_token.Loc.value with
1458    | `Begin_table_cell kind as token ->
1459        junk input;
1460        let content, token_after_list_item, _where_in_line =
1461          block_element_list In_table_cell ~parent_markup:token input
1462        in
1463        (match token_after_list_item.value with
1464        | `Right_brace -> junk input
1465        | `End ->
1466            Parse_error.not_allowed token_after_list_item.location
1467              ~what:(Token.describe `End) ~in_what:(Token.describe token)
1468            |> add_warning input);
1469        consume_cell_items ((content, kind) :: acc)
1470    | token ->
1471        Parse_error.not_allowed next_token.location ~what:(Token.describe token)
1472          ~in_what:(Token.describe parent_markup)
1473          ~suggestion:
1474            "Move outside of {table ...}, or inside {td ...} or {th ...}"
1475        |> add_warning input;
1476        junk input;
1477        consume_cell_items acc
1478  in
1479  consume_cell_items []
1480
1481(* {2 Entry point} *)
1482
1483let parse warnings tokens =
1484  let input : input = { tokens; warnings } in
1485
1486  let rec parse_block_elements () =
1487    let elements, last_token, _where_in_line =
1488      block_element_list Top_level ~parent_markup:`Comment input
1489    in
1490
1491    match last_token.value with
1492    | `End -> elements
1493    | `Right_brace ->
1494        Parse_error.unpaired_right_brace last_token.location
1495        |> add_warning input;
1496
1497        let block =
1498          Loc.same last_token (`Paragraph [ Loc.same last_token (`Word "}") ])
1499        in
1500
1501        junk input;
1502        elements @ (block :: parse_block_elements ())
1503  in
1504  let ast = parse_block_elements () in
1505  (ast, List.rev !(input.warnings))