src/parser/odoc_parser.ml at main · jon.recoil.org/odoc

this repo has no description
odoc / src / parser / odoc_parser.ml
at main 213 lines 8.5 kB view raw
wrap content
  1module Ast = Ast
  2module Loc = Loc
  3module Warning = Warning
  4
  5type t = {
  6  ast : Ast.t;
  7  warnings : Warning.t list;
  8  reversed_newlines : (int * int) list;
  9  original_pos : Lexing.position;
 10}
 11
 12(* odoc uses an ocamllex lexer. The "engine" for such lexers is the standard
 13   [Lexing] module.
 14
 15   As the [Lexing] module reads the input, it keeps track of only the byte
 16   offset into the input. It is normally the job of each particular lexer
 17   implementation to decide which character sequences count as newlines, and
 18   keep track of line/column locations. This is usually done by writing several
 19   extra regular expressions, and calling [Lexing.new_line] at the right time.
 20
 21   Keeping track of newlines like this makes the odoc lexer somewhat too
 22   diffiult to read, however. To factor the aspect of keeping track of newlines
 23   fully out of the odoc lexer, instead of having it keep track of newlines as
 24   it's scanning the input, the input is pre-scanned before feeding it into the
 25   lexer. A table of all the newlines is assembled, and used to convert offsets
 26   into line/column pairs after the lexer emits tokens.
 27
 28   [reversed_newlines ~input ~comment_location offset] returns a list of pairs
 29   of (line number * offset), allowing the easy conversion from the byte
 30   [offset], relative to the beginning of a comment, into a location, relative
 31   to the beginning of the file containing the comment. This can then be used
 32   to convert from byte offset to line number / column number - a Loc.point,
 33   and additionally for converting back from a Loc.point to a Lexing.position.
 34*)
 35
 36let reversed_newlines : input:string -> (int * int) list =
 37 fun ~input ->
 38  let rec find_newlines line_number input_index newlines_accumulator =
 39    if input_index >= String.length input then newlines_accumulator
 40    else if
 41      (* This is good enough to detect CR-LF also. *)
 42      input.[input_index] = '\n'
 43    then
 44      find_newlines (line_number + 1) (input_index + 1)
 45        ((line_number + 1, input_index + 1) :: newlines_accumulator)
 46    else find_newlines line_number (input_index + 1) newlines_accumulator
 47  in
 48  find_newlines 1 0 [ (1, 0) ]
 49
 50(* [offset_to_location] converts from an offset within the comment text, where
 51   [reversed_newlines] is the result of the above function and [comment_location]
 52   is the location of the comment within its file. The function is meant to be
 53   partially applied to its first two arguments, at which point it is passed to
 54   the lexer, so it can apply the table to its emitted tokens. *)
 55
 56let offset_to_location :
 57    reversed_newlines:(int * int) list ->
 58    comment_location:Lexing.position ->
 59    int ->
 60    Loc.point =
 61 fun ~reversed_newlines ~comment_location byte_offset ->
 62  let rec scan_to_last_newline reversed_newlines_prefix =
 63    match reversed_newlines_prefix with
 64    | [] -> assert false
 65    | (line_in_comment, line_start_offset) :: prefix ->
 66        if line_start_offset > byte_offset then scan_to_last_newline prefix
 67        else
 68          let column_in_comment = byte_offset - line_start_offset in
 69          let line_in_file =
 70            line_in_comment + comment_location.Lexing.pos_lnum - 1
 71          in
 72          let column_in_file =
 73            if line_in_comment = 1 then
 74              column_in_comment + comment_location.Lexing.pos_cnum
 75              - comment_location.Lexing.pos_bol
 76            else column_in_comment
 77          in
 78          { Loc.line = line_in_file; column = column_in_file }
 79  in
 80  scan_to_last_newline reversed_newlines
 81
 82(* Given a Loc.point and the result of [parse_comment], this function returns
 83   a valid Lexing.position *)
 84let position_of_point : t -> Loc.point -> Lexing.position =
 85 fun v point ->
 86  let { reversed_newlines; original_pos; _ } = v in
 87  let line_in_comment = point.Loc.line - original_pos.pos_lnum + 1 in
 88  let rec find_pos_bol reversed_newlines_prefix =
 89    match reversed_newlines_prefix with
 90    | [] -> assert false
 91    | [ _ ] -> original_pos.pos_bol
 92    | (line_number, line_start_offset) :: prefix ->
 93        if line_number > line_in_comment then find_pos_bol prefix
 94        else line_start_offset + original_pos.pos_cnum
 95  in
 96  let pos_bol = find_pos_bol reversed_newlines in
 97  let pos_lnum = point.Loc.line in
 98  let pos_cnum = point.column + pos_bol in
 99  let pos_fname = original_pos.pos_fname in
100  { Lexing.pos_bol; pos_lnum; pos_cnum; pos_fname }
101
102(* The main entry point for this module *)
103let parse_comment ~location ~text =
104  let warnings = ref [] in
105  let reversed_newlines = reversed_newlines ~input:text in
106  let string_buffer = Buffer.create 256 in
107  let token_stream =
108    let lexbuf = Lexing.from_string text in
109    let offset_to_location =
110      offset_to_location ~reversed_newlines ~comment_location:location
111    in
112    let input : Lexer.input =
113      {
114        file = location.Lexing.pos_fname;
115        offset_to_location;
116        warnings;
117        lexbuf;
118        string_buffer;
119      }
120    in
121    Stream.from (fun _token_index -> Some (Lexer.token input lexbuf))
122  in
123  let ast, warnings = Syntax.parse warnings token_stream in
124  { ast; warnings; reversed_newlines; original_pos = location }
125
126(* Accessor functions, as [t] is opaque *)
127let warnings t = t.warnings
128let ast t = t.ast
129
130(** [deindent ~what input ~start_offset s] "deindents" [s] by an offset computed
131    from [start_offset] and [input], corresponding to the begining of a code
132    block or verbatim. If that is not possible (eg there is a non-whitespace
133    line starting with less than [offset] whitespaces), it unindents as much as
134    possible and raises a warning. *)
135let deindent : what:string -> loc:Loc.span -> string -> string * Warning.t list
136    =
137 fun ~what ~loc s ->
138  let offset = loc.start.column in
139  (* Whitespace-only lines do not count, so they return [None]. *)
140  let count_leading_whitespace line =
141    let rec count_leading_whitespace' index len =
142      if index = len then None
143      else
144        match line.[index] with
145        | ' ' | '\t' -> count_leading_whitespace' (index + 1) len
146        | _ -> Some index
147    in
148    let len = String.length line in
149    (* '\r' may remain because we only split on '\n' below. This is important
150       for the first line, which would be considered not empty without this check. *)
151    let len = if len > 0 && line.[len - 1] = '\r' then len - 1 else len in
152    count_leading_whitespace' 0 len
153  in
154
155  let lines = Astring.String.cuts ~sep:"\n" s in
156
157  let least_amount_of_whitespace =
158    List.fold_left
159      (fun least_so_far line ->
160        match (count_leading_whitespace line, least_so_far) with
161        | Some n, least when n < least -> n
162        | _ -> least_so_far)
163      offset lines
164  in
165  let warning =
166    if least_amount_of_whitespace < offset then
167      [ Parse_error.not_enough_indentation_in_code_block ~what loc ]
168    else []
169  in
170  let drop n line =
171    (* Since blank lines were ignored when calculating
172       [least_amount_of_whitespace], their length might be less than the
173       amount. *)
174    if String.length line < n then ""
175    else String.sub line n (String.length line - n)
176  in
177  let lines = List.map (drop least_amount_of_whitespace) lines in
178  (String.concat "\n" lines, warning)
179
180(** Implements the rules for code block as specified in [odoc_for_authors],
181    section on code blocks and indentation. *)
182let code_block_content ~what ~loc s =
183  let indent = loc.Loc.start.column in
184  (* Remove the first line (to first \n char, included) if it's whitespace only.
185     Otherwise, indent at [indent] level to account for offset. *)
186  let rec handle_first_newline index =
187    if index >= String.length s then String.make indent ' ' ^ s
188    else
189      match s.[index] with
190      | ' ' | '\t' | '\r' -> handle_first_newline (index + 1)
191      | '\n' -> String.sub s (index + 1) (String.length s - index - 1)
192      | _ -> String.make indent ' ' ^ s
193  in
194  let s = handle_first_newline 0 in
195  (* Remove the last line (from last \n char, included) if it's whitespace
196     only. *)
197  let rec handle_last_newline index =
198    if index < 0 then s
199    else
200      match s.[index] with
201      | ' ' | '\t' | '\r' -> handle_last_newline (index - 1)
202      | '\n' -> String.sub s 0 index
203      | _ -> s
204  in
205  let s = handle_last_newline (String.length s - 1) in
206  deindent ~what ~loc s
207
208let verbatim_content loc c =
209  let what = "verbatim" in
210  code_block_content ~what ~loc c
211let codeblock_content loc c =
212  let what = "code block" in
213  code_block_content ~what ~loc c