OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 7.0 kB view raw
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

(** HTML5 Conformance Checker

    Validates HTML5 documents against the
    {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard}.

    {2 Quick Start}

    {[
      let result = Htmlrw_check.check_string "<html><body><img></body></html>" in

      if Htmlrw_check.has_errors result then
        print_endline (Htmlrw_check.to_text result)
      else
        print_endline "Valid HTML5!"
    ]}

    {2 Handling Specific Errors}

    Use pattern matching on [error_code] for fine-grained control:

    {[
      List.iter (fun msg ->
        match msg.Htmlrw_check.error_code with
        | Parse code ->
            Printf.printf "Syntax error: %s\n"
              (Html5rw.Parse_error_code.to_string code)
        | Conformance code ->
            (* Parenthesised so that arms added to the outer match later
               are not captured by the inner one. *)
            (match code with
             | `Img `Missing_alt ->
                 Printf.printf "Accessibility: %s needs alt text\n"
                   (Option.value ~default:"image" msg.element)
             | `Attr (`Duplicate_id _) ->
                 Printf.printf "Duplicate ID found\n"
             | _ ->
                 Printf.printf "Error: %s\n" msg.text)
      ) (Htmlrw_check.errors result)
    ]}

    {2 CI Integration}

    {[
      let validate_file path =
        let ic = open_in path in
        (* Close the channel even if checking raises. *)
        let result =
          Fun.protect ~finally:(fun () -> close_in ic) (fun () ->
            let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
            Htmlrw_check.check ~system_id:path reader)
        in
        if Htmlrw_check.has_errors result then begin
          print_string (Htmlrw_check.to_gnu result);
          exit 1
        end
    ]}

    {2 What Gets Checked}

    - {b Parse errors}: Malformed syntax per WHATWG parsing specification
    - {b Content model}: Invalid element nesting (e.g., [<div>] inside [<p>])
    - {b Attributes}: Missing required, disallowed, or invalid attributes
    - {b Accessibility}: ARIA misuse, missing alt text, form labeling
    - {b Structure}: Missing DOCTYPE, duplicate IDs, heading hierarchy
    - {b Internationalization}: Missing or mismatched lang attributes

    @see <https://html.spec.whatwg.org/> WHATWG HTML Living Standard
    @see <https://validator.w3.org/nu/> Nu HTML Checker *)


(** {1:types Types} *)

(** Message severity level. *)
type severity =
  | Error    (** Conformance violation - document is invalid *)
  | Warning  (** Likely problem - may be intentional *)
  | Info     (** Suggestion for improvement *)

(** Source location in the document. Line and column are 1-indexed. *)
type location = {
  line : int;                  (** Line of the reported position *)
  column : int;                (** Column of the reported position *)
  end_line : int option;       (** End line, when available *)
  end_column : int option;     (** End column, when available *)
  system_id : string option;   (** File path or URL if provided *)
}

(** Typed error code. Pattern match to handle specific errors.

    {[
      match msg.error_code with
      | Parse Html5rw.Parse_error_code.Eof_in_tag ->
          () (* Unclosed tag at end of file *)
      | Conformance (`Img `Missing_alt) ->
          () (* Image without alt attribute *)
      | _ -> ()
    ]} *)
type error_code =
  | Parse of Html5rw.Parse_error_code.t
      (** Syntax error from the HTML5 parser.
          @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> *)
  | Conformance of Error_code.t
      (** Semantic error from conformance checking. *)

(** A validation message. *)
type message = {
  severity : severity;          (** Severity level of this message *)
  text : string;                (** Human-readable description *)
  error_code : error_code;      (** Typed code for pattern matching *)
  location : location option;   (** Source location if available *)
  element : string option;      (** Relevant element (lowercase) *)
  attribute : string option;    (** Relevant attribute (lowercase) *)
  extract : string option;      (** Source excerpt for context *)
}

(** Validation result. Use accessors below to inspect. *)
type t


(** {1:validation Validation} *)

(** Validate HTML from a string.

    {[
      let result = Htmlrw_check.check_string html in
      if Htmlrw_check.has_errors result then
        prerr_endline (Htmlrw_check.to_text result)
    ]}

    @param system_id File path or URL for error messages. *)
val check_string : ?system_id:string -> string -> t

(** Validate HTML from a reader.

    @param collect_parse_errors Include syntax errors (default: [true]).
    @param system_id File path or URL for error messages. *)
val check :
  ?collect_parse_errors:bool ->
  ?system_id:string ->
  Bytesrw.Bytes.Reader.t ->
  t

(** Validate an already-parsed document.

    Useful when you've parsed the HTML separately and want to run
    conformance checks without re-parsing.

    @param collect_parse_errors Include syntax errors (see {!check}).
    @param system_id File path or URL for error messages. *)
val check_parsed :
  ?collect_parse_errors:bool ->
  ?system_id:string ->
  Html5rw.t ->
  t


(** {1:results Results} *)

(** All messages in document order. *)
val messages : t -> message list

(** Only error-severity messages. *)
val errors : t -> message list

(** Only warning-severity messages. *)
val warnings : t -> message list

(** Only info-severity messages. *)
val infos : t -> message list

(** Only syntax errors from the parser. *)
val parse_errors : t -> message list

(** Only semantic errors from conformance checking. *)
val conformance_errors : t -> message list

(** [true] if any errors were found. *)
val has_errors : t -> bool

(** [true] if any warnings were found. *)
val has_warnings : t -> bool

(** The parsed document. *)
val document : t -> Html5rw.t

(** The system identifier (file path or URL) if provided. *)
val system_id : t -> string option


(** {1:formatting Output Formatting} *)

(** Human-readable text format.

{v
file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt".
v} *)
val to_text : t -> string

(** JSON format compatible with the Nu HTML Checker.

{v
{"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]}
v} *)
val to_json : t -> string

(** GNU error format for IDE integration.

{v
file.html:5:3: error: Element "img" is missing required attribute "alt".
v} *)
val to_gnu : t -> string


(** {1:utilities Utilities} *)

(** ["error"], ["warning"], or ["info"]. *)
val severity_to_string : severity -> string

(** String representation of an error code. *)
val error_code_to_string : error_code -> string

(** Pretty-printer for severity. *)
val pp_severity : Format.formatter -> severity -> unit

(** Pretty-printer for location. *)
val pp_location : Format.formatter -> location -> unit

(** Pretty-printer for message. *)
val pp_message : Format.formatter -> message -> unit


(** {1:error_codes Error Code Types}

    For pattern matching on conformance errors. Parse errors use
    {!Html5rw.Parse_error_code}.

    {[
      match code with
      | `Attr (`Missing_required_attr _) -> ...
      | `Img `Missing_alt -> ...
      | `Aria _ -> ...  (* Any ARIA error *)
      | _ -> ...
    ]} *)
module Error_code = Error_code