OCaml HTML5 parser/serialiser based on Python's JustHTML

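more errors: add a unified Html5rw.Error type and replace string-based selector errors with typed Selector.Error_code.t codes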

+45
lib/html5rw/html5rw.ml
··· 34 34 ]} 35 35 *) 36 36 37 + (** {1 Error Handling} *) 38 + 39 + (** Global error type that wraps all errors raised by the Html5rw library. 40 + 41 + This provides a unified error type for all parsing and selector errors, 42 + along with printers for display and debugging. 43 + *) 44 + module Error = struct 45 + (** The unified error type for the Html5rw library. *) 46 + type t = 47 + | Parse_error of { 48 + code : Parse_error_code.t; 49 + line : int; 50 + column : int; 51 + } 52 + (** An HTML parse error, including location information. *) 53 + | Selector_error of Selector.Error_code.t 54 + (** A CSS selector parse error. *) 55 + 56 + let of_parse_error (err : Parser.parse_error) : t = 57 + Parse_error { 58 + code = Parser.error_code err; 59 + line = Parser.error_line err; 60 + column = Parser.error_column err; 61 + } 62 + 63 + let of_selector_error (code : Selector.Error_code.t) : t = 64 + Selector_error code 65 + 66 + let to_string = function 67 + | Parse_error { code; line; column } -> 68 + Printf.sprintf "Parse error at %d:%d: %s" line column 69 + (Parse_error_code.to_string code) 70 + | Selector_error code -> 71 + Printf.sprintf "Selector error: %s" 72 + (Selector.Error_code.to_human_string code) 73 + 74 + let pp fmt err = Format.pp_print_string fmt (to_string err) 75 + 76 + (** Get the error code as a kebab-case string. *) 77 + let code_string = function 78 + | Parse_error { code; _ } -> Parse_error_code.to_string code 79 + | Selector_error code -> Selector.Error_code.to_string code 80 + end 81 + 37 82 (** {1 Sub-modules} *) 38 83 39 84 (** Parse error code types *)
+89
lib/html5rw/html5rw.mli
··· 372 372 Column numbers count from 1 and reset at each newline. *) 373 373 val error_column : parse_error -> int 374 374 375 + (** {1 Error Handling} *) 376 + 377 + (** Global error type that wraps all errors raised by the Html5rw library. 378 + 379 + This module provides a unified error type for all parsing and selector 380 + errors, along with printers and conversion functions. Use this when you 381 + want to handle all possible errors from the library in a uniform way. 382 + 383 + {2 Usage} 384 + 385 + {[ 386 + (* Converting parse errors *) 387 + let errors = Html5rw.errors result in 388 + List.iter (fun err -> 389 + let unified = Html5rw.Error.of_parse_error err in 390 + Printf.eprintf "%s\n" (Html5rw.Error.to_string unified) 391 + ) errors 392 + 393 + (* Catching selector errors *) 394 + match Html5rw.query result selector with 395 + | nodes -> ignore nodes (* success: use the matched nodes here *) 396 + | exception Html5rw.Selector.Selector_error code -> 397 + let unified = Html5rw.Error.of_selector_error code in 398 + Printf.eprintf "%s\n" (Html5rw.Error.to_string unified) 399 + ]} 400 + *) 401 + module Error : sig 402 + (** The unified error type for the Html5rw library. *) 403 + type t = 404 + | Parse_error of { 405 + code : Parse_error_code.t; 406 + line : int; 407 + column : int; 408 + } 409 + (** An HTML parse error, including location information. 410 + 411 + Parse errors occur during HTML tokenization and tree construction. 412 + The location indicates where in the input the error was detected. 413 + 414 + @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> 415 + WHATWG: Parse errors *) 416 + 417 + | Selector_error of Selector.Error_code.t 418 + (** A CSS selector parse error. 419 + 420 + Selector errors occur when parsing malformed CSS selectors passed 421 + to {!query} or {!matches}. *) 422 + 423 + val of_parse_error : parse_error -> t 424 + (** Convert a parse error to the unified error type. 
425 + 426 + {[ 427 + let errors = Html5rw.errors result in 428 + let unified_errors = List.map Html5rw.Error.of_parse_error errors 429 + ]} *) 430 + 431 + val of_selector_error : Selector.Error_code.t -> t 432 + (** Convert a selector error code to the unified error type. 433 + 434 + {[ 435 + match Html5rw.query result "invalid[" with 436 + | _ -> () 437 + | exception Html5rw.Selector.Selector_error code -> 438 + let err = Html5rw.Error.of_selector_error code in 439 + Printf.eprintf "%s\n" (Html5rw.Error.to_string err) 440 + ]} *) 441 + 442 + val to_string : t -> string 443 + (** Convert to a human-readable error message. Parse errors include their [line:column] location; selector errors carry no location. 444 + 445 + Examples: 446 + - ["Parse error at 5:12: unexpected-null-character"] 447 + - ["Selector error: Expected \]"] *) 448 + 449 + val pp : Format.formatter -> t -> unit 450 + (** Pretty-printer for use with [Format] functions. *) 451 + 452 + val code_string : t -> string 453 + (** Get just the error code as a kebab-case string (without location). 454 + 455 + This is useful for programmatic error handling or logging. 456 + 457 + Examples: 458 + - ["unexpected-null-character"] 459 + - ["expected-closing-bracket"] *) 460 + end 461 + 462 + (** {1 Fragment Parsing} *) 463 + 375 464 (** Context element for HTML fragment parsing (innerHTML). 376 465 377 466 When parsing HTML fragments (like the [innerHTML] of an element), you
+6 -1
lib/html5rw/selector/selector.ml
··· 58 58 ]} 59 59 *) 60 60 61 + (** {1 Error Types} *) 62 + 63 + (** CSS selector error codes. *) 64 + module Error_code = Selector_error_code 65 + 61 66 (** {1 Exceptions} *) 62 67 63 68 (** Raised when a selector string is malformed. 64 69 65 - The exception contains an error message describing the parse error. 70 + The exception contains a typed error code describing the parse error. 66 71 *) 67 72 exception Selector_error = Selector_lexer.Selector_error 68 73
+55 -2
lib/html5rw/selector/selector.mli
··· 58 58 ]} 59 59 *) 60 60 61 + (** {1 Error Types} *) 62 + 63 + (** CSS selector error codes. 64 + 65 + This module provides the {!Error_code.t} variant type that represents 66 + all possible errors when parsing CSS selectors. 67 + *) 68 + module Error_code : sig 69 + type t = 70 + | Empty_selector 71 + (** The selector string was empty or contained only whitespace. *) 72 + | Unterminated_string 73 + (** A quoted string was not closed before end of input. *) 74 + | Unterminated_escape 75 + (** An escape sequence was not completed before end of input. *) 76 + | Expected_identifier_after_hash 77 + (** Expected an identifier after [#] for ID selector. *) 78 + | Expected_identifier_after_dot 79 + (** Expected an identifier after [.] for class selector. *) 80 + | Expected_attribute_name 81 + (** Expected an attribute name inside an attribute selector. *) 82 + | Expected_closing_bracket 83 + (** Expected [\]] to close an attribute selector. *) 84 + | Expected_equals_after_operator of char 85 + (** Expected [=] after an attribute operator like [~], [|], [^], [$], or [*]. *) 86 + | Unexpected_character_in_attribute_selector 87 + (** Found an unexpected character inside an attribute selector. *) 88 + | Expected_pseudo_class_name 89 + (** Expected a pseudo-class name after [:]. *) 90 + | Expected_closing_paren 91 + (** Expected [)] to close a pseudo-class argument. *) 92 + | Unexpected_character of char 93 + (** Found an unexpected character in the selector. *) 94 + | Expected_attribute_value 95 + (** Expected a value after the attribute operator. *) 96 + | Expected_closing_bracket_or_operator 97 + (** Expected [\]] or an attribute operator like [=]. *) 98 + | Expected_selector_after_combinator 99 + (** Expected a selector after a combinator ([>], [+], [~], or space). *) 100 + | Unexpected_token 101 + (** Found an unexpected token in the selector. *) 102 + | Expected_end_of_selector 103 + (** Expected end of selector but found more tokens. 
*) 104 + 105 + val to_string : t -> string 106 + (** Convert to a kebab-case string identifier suitable for programmatic use. *) 107 + 108 + val to_human_string : t -> string 109 + (** Convert to a human-readable error message. *) 110 + end 111 + 61 112 (** {1 Exceptions} *) 62 113 63 - exception Selector_error of string 114 + exception Selector_error of Error_code.t 64 115 (** Raised when a selector string is malformed. 65 116 66 - The exception contains an error message describing the parse error. 117 + The exception contains a typed error code describing the parse error. 118 + Use {!Error_code.to_string} or {!Error_code.to_human_string} to get 119 + a string representation. 67 120 *) 68 121 69 122 (** {1 Sub-modules} *)
+13 -13
lib/html5rw/selector/selector_lexer.ml
··· 1 1 (* CSS selector lexer *) 2 2 3 - exception Selector_error of string 3 + exception Selector_error of Selector_error_code.t 4 4 5 5 type t = { 6 6 input : string; ··· 47 47 let buf = Buffer.create 32 in 48 48 let rec loop () = 49 49 match peek t with 50 - | None -> raise (Selector_error "Unterminated string") 50 + | None -> raise (Selector_error Selector_error_code.Unterminated_string) 51 51 | Some c when c = quote -> advance t 52 52 | Some '\\' -> 53 53 advance t; 54 54 (match peek t with 55 55 | Some c -> Buffer.add_char buf c; advance t; loop () 56 - | None -> raise (Selector_error "Unterminated escape")) 56 + | None -> raise (Selector_error Selector_error_code.Unterminated_escape)) 57 57 | Some c -> 58 58 Buffer.add_char buf c; 59 59 advance t; ··· 99 99 | '#' -> 100 100 advance t; 101 101 let name = read_name t in 102 - if name = "" then raise (Selector_error "Expected identifier after #"); 102 + if name = "" then raise (Selector_error Selector_error_code.Expected_identifier_after_hash); 103 103 tokens := Selector_token.Id name :: !tokens 104 104 | '.' 
-> 105 105 advance t; 106 106 let name = read_name t in 107 - if name = "" then raise (Selector_error "Expected identifier after ."); 107 + if name = "" then raise (Selector_error Selector_error_code.Expected_identifier_after_dot); 108 108 tokens := Selector_token.Class name :: !tokens 109 109 | '[' -> 110 110 advance t; 111 111 tokens := Selector_token.Attr_start :: !tokens; 112 112 skip_whitespace t; 113 113 let attr_name = read_name t in 114 - if attr_name = "" then raise (Selector_error "Expected attribute name"); 114 + if attr_name = "" then raise (Selector_error Selector_error_code.Expected_attribute_name); 115 115 tokens := Selector_token.Tag attr_name :: !tokens; 116 116 skip_whitespace t; 117 117 ··· 130 130 in 131 131 tokens := Selector_token.String value :: !tokens; 132 132 skip_whitespace t; 133 - if peek t <> Some ']' then raise (Selector_error "Expected ]"); 133 + if peek t <> Some ']' then raise (Selector_error Selector_error_code.Expected_closing_bracket); 134 134 advance t; 135 135 tokens := Selector_token.Attr_end :: !tokens 136 136 | Some ('~' | '|' | '^' | '$' | '*') as op_char -> 137 137 let op_c = Option.get op_char in 138 138 advance t; 139 139 if peek t <> Some '=' then 140 - raise (Selector_error ("Expected = after " ^ String.make 1 op_c)); 140 + raise (Selector_error (Selector_error_code.Expected_equals_after_operator op_c)); 141 141 advance t; 142 142 tokens := Selector_token.Attr_op (String.make 1 op_c ^ "=") :: !tokens; 143 143 skip_whitespace t; ··· 148 148 in 149 149 tokens := Selector_token.String value :: !tokens; 150 150 skip_whitespace t; 151 - if peek t <> Some ']' then raise (Selector_error "Expected ]"); 151 + if peek t <> Some ']' then raise (Selector_error Selector_error_code.Expected_closing_bracket); 152 152 advance t; 153 153 tokens := Selector_token.Attr_end :: !tokens 154 - | _ -> raise (Selector_error "Unexpected character in attribute selector")) 154 + | _ -> raise (Selector_error 
Selector_error_code.Unexpected_character_in_attribute_selector)) 155 155 156 156 | ',' -> 157 157 advance t; ··· 161 161 advance t; 162 162 tokens := Selector_token.Colon :: !tokens; 163 163 let name = read_name t in 164 - if name = "" then raise (Selector_error "Expected pseudo-class name"); 164 + if name = "" then raise (Selector_error Selector_error_code.Expected_pseudo_class_name); 165 165 tokens := Selector_token.Tag name :: !tokens; 166 166 167 167 if peek t = Some '(' then begin ··· 179 179 done; 180 180 let arg = String.trim (String.sub t.input start (t.pos - start)) in 181 181 if arg <> "" then tokens := Selector_token.String arg :: !tokens; 182 - if peek t <> Some ')' then raise (Selector_error "Expected )"); 182 + if peek t <> Some ')' then raise (Selector_error Selector_error_code.Expected_closing_paren); 183 183 advance t; 184 184 tokens := Selector_token.Paren_close :: !tokens 185 185 end ··· 187 187 let name = read_name t in 188 188 tokens := Selector_token.Tag (String.lowercase_ascii name) :: !tokens 189 189 | _ -> 190 - raise (Selector_error ("Unexpected character: " ^ String.make 1 c)) 190 + raise (Selector_error (Selector_error_code.Unexpected_character c)) 191 191 end 192 192 done; 193 193
+14 -13
lib/html5rw/selector/selector_parser.ml
··· 3 3 open Selector_ast 4 4 open Selector_token 5 5 6 - exception Parse_error of string 6 + (* Re-use the Selector_error exception from the lexer for consistency *) 7 + let raise_error code = raise (Selector_lexer.Selector_error code) 7 8 8 9 type t = { 9 10 tokens : Selector_token.t list; ··· 29 30 let expect t expected = 30 31 let tok = peek t in 31 32 if tok <> expected then 32 - raise (Parse_error ("Expected " ^ (match expected with EOF -> "EOF" | _ -> "token"))) 33 + raise_error (match expected with EOF -> Selector_error_code.Expected_end_of_selector | _ -> Selector_error_code.Unexpected_token) 33 34 else 34 35 advance t 35 36 ··· 51 52 advance t; 52 53 let attr_name = match peek t with 53 54 | Tag name -> advance t; name 54 - | _ -> raise (Parse_error "Expected attribute name") 55 + | _ -> raise_error Selector_error_code.Expected_attribute_name 55 56 in 56 57 (match peek t with 57 58 | Attr_end -> ··· 61 62 advance t; 62 63 let value = match peek t with 63 64 | String v -> advance t; v 64 - | _ -> raise (Parse_error "Expected attribute value") 65 + | _ -> raise_error Selector_error_code.Expected_attribute_value 65 66 in 66 67 (match peek t with 67 68 | Attr_end -> advance t 68 - | _ -> raise (Parse_error "Expected ]")); 69 + | _ -> raise_error Selector_error_code.Expected_closing_bracket); 69 70 Some (make_simple Type_attr ~name:attr_name ~operator:op ~value ()) 70 - | _ -> raise (Parse_error "Expected ] or attribute operator")) 71 + | _ -> raise_error Selector_error_code.Expected_closing_bracket_or_operator) 71 72 | Colon -> 72 73 advance t; 73 74 let name = match peek t with 74 75 | Tag n -> advance t; n 75 - | _ -> raise (Parse_error "Expected pseudo-class name") 76 + | _ -> raise_error Selector_error_code.Expected_pseudo_class_name 76 77 in 77 78 let arg = match peek t with 78 79 | Paren_open -> ··· 84 85 in 85 86 (match peek t with 86 87 | Paren_close -> advance t 87 - | _ -> raise (Parse_error "Expected )")); 88 + | _ -> raise_error 
Selector_error_code.Expected_closing_paren); 88 89 a 89 90 | _ -> None 90 91 in ··· 111 112 | Combinator comb -> 112 113 advance t; 113 114 (match parse_compound_selector t with 114 - | None -> raise (Parse_error "Expected selector after combinator") 115 + | None -> raise_error Selector_error_code.Expected_selector_after_combinator 115 116 | Some compound -> 116 117 parts := (Some comb, compound) :: !parts; 117 118 loop ()) ··· 131 132 advance t; 132 133 loop (sel :: acc) 133 134 | EOF -> sel :: acc 134 - | _ -> raise (Parse_error "Unexpected token")) 135 + | _ -> raise_error Selector_error_code.Unexpected_token) 135 136 in 136 137 let selectors = List.rev (loop []) in 137 138 (match peek t with 138 139 | EOF -> () 139 - | _ -> raise (Parse_error "Expected end of selector")); 140 + | _ -> raise_error Selector_error_code.Expected_end_of_selector); 140 141 match selectors with 141 - | [] -> raise (Parse_error "Empty selector") 142 + | [] -> raise_error Selector_error_code.Empty_selector 142 143 | [sel] -> Complex sel 143 144 | sels -> List (make_list sels) 144 145 145 146 let parse_selector input = 146 147 if String.trim input = "" then 147 - raise (Selector_lexer.Selector_error "Empty selector"); 148 + raise_error Selector_error_code.Empty_selector; 148 149 let tokens = Selector_lexer.tokenize input in 149 150 parse tokens