OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+34 -11
lib/htmlrw_check/htmlrw_check.ml
··· 5 5 6 6 module Error_code = Error_code 7 7 8 - (* Public types - defined here to avoid re-exporting internal modules *) 8 + (* Public types *) 9 9 10 10 type severity = Error | Warning | Info 11 11 ··· 17 17 system_id : string option; 18 18 } 19 19 20 + type error_code = 21 + | Parse of Html5rw.Parse_error_code.t 22 + | Conformance of Error_code.t 23 + 20 24 type message = { 21 25 severity : severity; 22 26 text : string; 23 - code : string; 24 - error_code : Error_code.t option; 27 + error_code : error_code; 25 28 location : location option; 26 29 element : string option; 27 30 attribute : string option; ··· 49 52 system_id = loc.system_id; 50 53 } 51 54 55 + let convert_error_code = function 56 + | Message.Parse_error code -> Parse code 57 + | Message.Conformance_error code -> Conformance code 58 + 52 59 let convert_message (m : Message.t) : message = { 53 60 severity = convert_severity m.severity; 54 61 text = m.message; 55 - code = m.code; 56 - error_code = m.error_code; 62 + error_code = convert_error_code m.error_code; 57 63 location = Option.map convert_location m.location; 58 64 element = m.element; 59 65 attribute = m.attribute; ··· 77 83 78 84 match Xhtml_parser.parse_xhtml content with 79 85 | Ok root -> 80 - (* Run all registered checkers via DOM traversal *) 81 86 let registry = Checker_registry.default () in 82 87 Dom_walker.walk_registry registry collector root; 83 88 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in ··· 111 116 { doc; msgs; sys_id = system_id } 112 117 end 113 118 119 + let check_string ?system_id html = 120 + let reader = Bytesrw.Bytes.Reader.of_string html in 121 + check ?system_id reader 122 + 114 123 let check_parsed ?(collect_parse_errors = true) ?system_id doc = 115 124 let collector = Message_collector.create () in 116 125 ··· 138 147 let infos t = 139 148 List.filter (fun msg -> msg.severity = Info) t.msgs 140 149 150 + let parse_errors t = 151 + List.filter (fun msg -> 152 + match msg.error_code with Parse _ -> true | Conformance _ -> false 153 + ) t.msgs 154 + 155 + let conformance_errors t = 156 + List.filter (fun msg -> 157 + match msg.error_code with Parse _ -> false | Conformance _ -> true 158 + ) t.msgs 159 + 141 160 let has_errors t = 142 161 List.exists (fun msg -> msg.severity = Error) t.msgs 143 162 ··· 163 182 system_id = loc.system_id; 164 183 } 165 184 185 + let unconvert_error_code = function 186 + | Parse code -> Message.Parse_error code 187 + | Conformance code -> Message.Conformance_error code 188 + 166 189 let unconvert_message (m : message) : Message.t = { 167 190 severity = unconvert_severity m.severity; 168 191 message = m.text; 169 - code = m.code; 170 - error_code = m.error_code; 192 + error_code = unconvert_error_code m.error_code; 171 193 location = Option.map unconvert_location m.location; 172 194 element = m.element; 173 195 attribute = m.attribute; ··· 193 215 | Warning -> "warning" 194 216 | Info -> "info" 195 217 218 + let error_code_to_string = function 219 + | Parse code -> Html5rw.Parse_error_code.to_string code 220 + | Conformance code -> Error_code.code_string code 221 + 196 222 let pp_severity fmt sev = 197 223 Format.pp_print_string fmt (severity_to_string sev) 198 224 ··· 207 233 match msg.location with 208 234 | Some loc -> Format.fprintf fmt " (at %a)" pp_location loc 209 235 | None -> () 210 - 211 - let message_to_string msg = 212 - Format.asprintf "%a" pp_message msg
+135 -247
lib/htmlrw_check/htmlrw_check.mli
··· 5 5 6 6 (** HTML5 Conformance Checker 7 7 8 - This module validates HTML5 documents against the 9 - {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard}, 10 - reporting conformance errors, warnings, and suggestions. 8 + Validates HTML5 documents against the 9 + {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard}. 11 10 12 11 {2 Quick Start} 13 12 14 13 {[ 15 - (* Validate HTML from a string *) 16 - let html = "<html><body><img></body></html>" in 17 - let reader = Bytesrw.Bytes.Reader.of_string html in 18 - let result = Htmlrw_check.check reader in 14 + let result = Htmlrw_check.check_string "<html><body><img></body></html>" in 19 15 20 - if Htmlrw_check.has_errors result then begin 21 - List.iter (fun msg -> 22 - Printf.printf "%s: %s\n" 23 - (Htmlrw_check.severity_to_string msg.Htmlrw_check.severity) 24 - msg.Htmlrw_check.text 25 - ) (Htmlrw_check.errors result) 26 - end 16 + if Htmlrw_check.has_errors result then 17 + print_endline (Htmlrw_check.to_text result) 18 + else 19 + print_endline "Valid HTML5!" 27 20 ]} 28 21 29 - {2 What Gets Checked} 22 + {2 Handling Specific Errors} 30 23 31 - The checker validates: 24 + Use pattern matching on {!field-message.error_code} for fine-grained control: 32 25 33 - - {b Parse errors}: Malformed HTML syntax (missing end tags, invalid 34 - nesting, etc.) per the WHATWG parsing specification 35 - - {b Content model}: Elements appearing in contexts where they're not 36 - allowed (e.g., [<div>] inside [<p>]) 37 - - {b Attributes}: Missing required attributes, disallowed attributes, 38 - and invalid attribute values 39 - - {b Accessibility}: ARIA role/attribute misuse, missing alt text on 40 - images, form labeling issues 41 - - {b Document structure}: Missing DOCTYPE, duplicate IDs, heading 42 - hierarchy issues 43 - - {b Internationalization}: Missing or mismatched lang attributes 26 + {[ 27 + List.iter (fun msg -> 28 + match msg.Htmlrw_check.error_code with 29 + | Parse code -> 30 + Printf.printf "Syntax error: %s\n" 31 + (Html5rw.Parse_error_code.to_string code) 32 + | Conformance code -> 33 + match code with 34 + | `Img `Missing_alt -> 35 + Printf.printf "Accessibility: %s needs alt text\n" 36 + (Option.value ~default:"image" msg.element) 37 + | `Attr (`Duplicate_id _) -> 38 + Printf.printf "Duplicate ID found\n" 39 + | _ -> 40 + Printf.printf "Error: %s\n" msg.text 41 + ) (Htmlrw_check.errors result) 42 + ]} 44 43 45 - {2 Output Formats} 44 + {2 CI Integration} 46 45 47 - Results can be formatted as: 48 - - {b Text}: Human-readable messages for terminal output 49 - - {b JSON}: Machine-readable format compatible with Nu HTML Validator 50 - - {b GNU}: Error format for IDE integration 46 + {[ 47 + let validate_file path = 48 + let ic = open_in path in 49 + let reader = Bytesrw.Bytes.Reader.of_in_channel ic in 50 + let result = Htmlrw_check.check ~system_id:path reader in 51 + close_in ic; 52 + if Htmlrw_check.has_errors result then begin 53 + print_string (Htmlrw_check.to_gnu result); 54 + exit 1 55 + end 56 + ]} 51 57 52 - @see <https://html.spec.whatwg.org/> 53 - WHATWG HTML Living Standard 54 - @see <https://validator.w3.org/nu/> 55 - Nu HTML Checker (reference validator) *) 58 + {2 What Gets Checked} 56 59 57 - (** {1:types Types} *) 60 + - {b Parse errors}: Malformed syntax per WHATWG parsing specification 61 + - {b Content model}: Invalid element nesting (e.g., [<div>] inside [<p>]) 62 + - {b Attributes}: Missing required, disallowed, or invalid attributes 63 + - {b Accessibility}: ARIA misuse, missing alt text, form labeling 64 + - {b Structure}: Missing DOCTYPE, duplicate IDs, heading hierarchy 65 + - {b Internationalization}: Missing or mismatched lang attributes 58 66 59 - (** Message severity level. 67 + @see <https://html.spec.whatwg.org/> WHATWG HTML Living Standard 68 + @see <https://validator.w3.org/nu/> Nu HTML Checker *) 60 69 61 - - [Error]: Conformance error - the document violates the HTML5 spec 62 - - [Warning]: Likely problem - should be reviewed but may be intentional 63 - - [Info]: Suggestion - best practice recommendation *) 64 - type severity = Error | Warning | Info 65 70 66 - (** Source location of a validation issue. 71 + (** {1:types Types} *) 72 + 73 + (** Message severity level. *) 74 + type severity = 75 + | Error (** Conformance violation - document is invalid *) 76 + | Warning (** Likely problem - may be intentional *) 77 + | Info (** Suggestion for improvement *) 67 78 68 - Locations use 1-based line and column numbers matching typical editor 69 - conventions. The [system_id] field contains the file path or URL if one 70 - was provided to the checker. *) 79 + (** Source location in the document. Line and column are 1-indexed. *) 71 80 type location = { 72 81 line : int; 73 - (** Line number (1-indexed) where the issue was found. *) 74 - 75 82 column : int; 76 - (** Column number (1-indexed) within the line. *) 77 - 78 83 end_line : int option; 79 - (** End line for issues spanning multiple lines. *) 80 - 81 84 end_column : int option; 82 - (** End column for range-based issues. *) 83 - 84 - system_id : string option; 85 - (** File path or URL, if provided to the checker. *) 85 + system_id : string option; (** File path or URL if provided *) 86 86 } 87 87 88 - (** A validation message describing a conformance issue. 88 + (** Typed error code. Pattern match to handle specific errors. 89 89 90 - Each message contains: 91 - - The {!field-severity} indicating how serious the issue is 92 - - Human-readable {!field-text} explaining the problem 93 - - Machine-readable {!field-code} for programmatic handling 94 - - Optional {!field-error_code} for fine-grained pattern matching 95 - - Source {!field-location} when available 96 - - Context ({!field-element}, {!field-attribute}) when relevant *) 90 + {[ 91 + match msg.error_code with 92 + | Parse Html5rw.Parse_error_code.Eof_in_tag -> 93 + (* Unclosed tag at end of file *) 94 + | Conformance (`Img `Missing_alt) -> 95 + (* Image without alt attribute *) 96 + | _ -> () 97 + ]} *) 98 + type error_code = 99 + | Parse of Html5rw.Parse_error_code.t 100 + (** Syntax error from the HTML5 parser. 101 + @see <https://html.spec.whatwg.org/multipage/parsing.html#parse-errors> *) 102 + | Conformance of Error_code.t 103 + (** Semantic error from conformance checking. *) 104 + 105 + (** A validation message. *) 97 106 type message = { 98 107 severity : severity; 99 - (** Severity level of this message. *) 100 - 101 - text : string; 102 - (** Human-readable description of the issue. 103 - 104 - The text follows Nu HTML Validator message conventions, using 105 - Unicode quotes around element/attribute names: 106 - ["Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child..."] *) 107 - 108 - code : string; 109 - (** Machine-readable error code in kebab-case. 110 - 111 - Examples: ["missing-alt"], ["duplicate-id"], ["unexpected-end-tag"]. 112 - Useful for filtering or categorizing errors programmatically. *) 113 - 114 - error_code : Error_code.t option; 115 - (** Typed error code for pattern matching. 116 - 117 - When present, allows fine-grained handling of specific errors: 118 - {[ 119 - match msg.error_code with 120 - | Some (`Img `Missing_alt) -> suggest_alt_text () 121 - | Some (`Attr (`Duplicate_id (`Id id))) -> highlight_duplicate id 122 - | _ -> show_generic_error msg 123 - ]} *) 124 - 125 - location : location option; 126 - (** Source location where the issue was detected. 127 - 128 - [None] for document-level issues or when location tracking is 129 - unavailable (e.g., for some content model errors). *) 130 - 131 - element : string option; 132 - (** Element name relevant to this message (e.g., ["img"], ["div"]). 133 - 134 - Lowercase, without angle brackets. *) 135 - 136 - attribute : string option; 137 - (** Attribute name relevant to this message (e.g., ["alt"], ["href"]). 138 - 139 - Lowercase. Only present for attribute-related errors. *) 140 - 141 - extract : string option; 142 - (** Source excerpt showing context around the error. 143 - 144 - Typically a few characters before and after the problematic location. 145 - Useful for displaying the error in context. *) 108 + text : string; (** Human-readable description *) 109 + error_code : error_code; (** Typed code for pattern matching *) 110 + location : location option; (** Source location if available *) 111 + element : string option; (** Relevant element (lowercase) *) 112 + attribute : string option; (** Relevant attribute (lowercase) *) 113 + extract : string option; (** Source excerpt for context *) 146 114 } 147 115 148 - (** Validation result containing all messages and the parsed document. 149 - 150 - Use {!messages}, {!errors}, {!warnings}, and {!infos} to access 151 - the validation messages. Use {!document} to access the parsed DOM. *) 116 + (** Validation result. Use accessors below to inspect. *) 152 117 type t 153 118 154 - (** {1:validation Validation Functions} *) 155 119 156 - (** Validate HTML from a reader. 120 + (** {1:validation Validation} *) 157 121 158 - Parses the HTML input and runs all conformance checks, returning 159 - a result containing any validation messages. 122 + (** Validate HTML from a string. 160 123 161 - {b Example:} 162 124 {[ 163 - let ic = open_in "page.html" in 164 - let reader = Bytesrw.Bytes.Reader.of_in_channel ic in 165 - let result = Htmlrw_check.check ~system_id:"page.html" reader in 166 - close_in ic; 167 - 125 + let result = Htmlrw_check.check_string html in 168 126 if Htmlrw_check.has_errors result then 169 - print_endline (Htmlrw_check.to_text result) 127 + prerr_endline (Htmlrw_check.to_text result) 170 128 ]} 171 129 172 - @param collect_parse_errors If [true] (default), include HTML parse 173 - errors in the results. Set to [false] to only get conformance 174 - checker errors (content model, attributes, etc.). 175 - @param system_id File path or URL for the document. Used in error 176 - messages and the {!location} field. Does not affect validation. *) 130 + @param system_id File path or URL for error messages. *) 131 + val check_string : ?system_id:string -> string -> t 132 + 133 + (** Validate HTML from a reader. 134 + 135 + @param collect_parse_errors Include syntax errors (default: [true]). 136 + @param system_id File path or URL for error messages. *) 177 137 val check : 178 138 ?collect_parse_errors:bool -> 179 139 ?system_id:string -> 180 140 Bytesrw.Bytes.Reader.t -> 181 141 t 182 142 183 - (** Validate an already-parsed HTML document. 184 - 185 - Runs conformance checks on an existing {!Html5rw.t} parse result. 186 - Useful when you've already parsed the document and want to validate 187 - it without re-parsing. 188 - 189 - {b Example:} 190 - {[ 191 - let doc = Html5rw.parse reader in 192 - (* ... manipulate the DOM ... *) 193 - let result = Htmlrw_check.check_parsed doc in 194 - ]} 143 + (** Validate an already-parsed document. 195 144 196 - @param collect_parse_errors If [true] (default), include any parse 197 - errors that were collected during the original parse. 198 - @param system_id File path or URL for error reporting. *) 145 + Useful when you've parsed the HTML separately and want to run 146 + conformance checks without re-parsing. *) 199 147 val check_parsed : 200 148 ?collect_parse_errors:bool -> 201 149 ?system_id:string -> 202 150 Html5rw.t -> 203 151 t 204 152 205 - (** {1:results Result Accessors} *) 206 153 207 - (** Get all validation messages. 154 + (** {1:results Results} *) 208 155 209 - Returns messages in the order they were generated, which roughly 210 - corresponds to document order for element-related errors. *) 156 + (** All messages in document order. *) 211 157 val messages : t -> message list 212 158 213 - (** Get only error messages. 214 - 215 - Errors indicate conformance violations - the document does not 216 - comply with the HTML5 specification. *) 159 + (** Only error-severity messages. *) 217 160 val errors : t -> message list 218 161 219 - (** Get only warning messages. 220 - 221 - Warnings indicate likely problems that may be intentional in 222 - some cases (e.g., deprecated features still in use). *) 162 + (** Only warning-severity messages. *) 223 163 val warnings : t -> message list 224 164 225 - (** Get only informational messages. 226 - 227 - Info messages are suggestions for best practices that don't 228 - affect conformance. *) 165 + (** Only info-severity messages. *) 229 166 val infos : t -> message list 230 167 231 - (** Test if any errors were found. 168 + (** Only syntax errors from the parser. *) 169 + val parse_errors : t -> message list 232 170 233 - Equivalent to [errors result <> []] but more efficient. *) 234 - val has_errors : t -> bool 171 + (** Only semantic errors from conformance checking. *) 172 + val conformance_errors : t -> message list 235 173 236 - (** Test if any warnings were found. 174 + (** [true] if any errors were found. *) 175 + val has_errors : t -> bool 237 176 238 - Equivalent to [warnings result <> []] but more efficient. *) 177 + (** [true] if any warnings were found. *) 239 178 val has_warnings : t -> bool 240 179 241 - (** Get the parsed document. 242 - 243 - Returns the DOM tree that was validated. For {!check}, this is the 244 - newly parsed document. For {!check_parsed}, this is the document 245 - that was passed in. *) 180 + (** The parsed document. *) 246 181 val document : t -> Html5rw.t 247 182 248 - (** Get the system identifier. 249 - 250 - Returns the file path or URL that was passed to {!check} or 251 - {!check_parsed}, or [None] if not provided. *) 183 + (** The system identifier (file path or URL) if provided. *) 252 184 val system_id : t -> string option 253 185 186 + 254 187 (** {1:formatting Output Formatting} *) 255 188 256 - (** Format messages as human-readable text. 189 + (** Human-readable text format. 257 190 258 - Produces multi-line output suitable for terminal display: 259 191 {v 260 - Error: Element "img" is missing required attribute "alt". 261 - At line 5, column 3 262 - <img src="photo.jpg"> 263 - v} 264 - 265 - Messages are formatted with severity, description, location, 266 - and source excerpt when available. *) 192 + file.html:5.3: error [missing-alt]: Element "img" is missing required attribute "alt". 193 + v} *) 267 194 val to_text : t -> string 268 195 269 - (** Format messages as JSON. 196 + (** JSON format compatible with Nu HTML Validator. 270 197 271 - Produces JSON output compatible with the Nu HTML Validator format: 272 198 {v 273 - { 274 - "messages": [ 275 - { 276 - "type": "error", 277 - "message": "Element \"img\" is missing required attribute \"alt\".", 278 - "lastLine": 5, 279 - "lastColumn": 3 280 - } 281 - ] 282 - } 283 - v} 284 - 285 - Useful for machine processing and integration with other tools. *) 199 + {"messages":[{"type":"error","message":"...","firstLine":5,"firstColumn":3}]} 200 + v} *) 286 201 val to_json : t -> string 287 202 288 - (** Format messages in GNU error format. 203 + (** GNU error format for IDE integration. 289 204 290 - Produces one-line-per-error output for IDE integration: 291 205 {v 292 - page.html:5:3: error: Element "img" is missing required attribute "alt". 293 - v} 294 - 295 - This format is recognized by many editors and build tools. *) 206 + file.html:5:3: error: Element "img" is missing required attribute "alt". 207 + v} *) 296 208 val to_gnu : t -> string 297 209 298 - (** {1:utilities Utility Functions} *) 299 210 300 - (** Convert severity to lowercase string. 211 + (** {1:utilities Utilities} *) 301 212 302 - Returns ["error"], ["warning"], or ["info"]. *) 213 + (** ["error"], ["warning"], or ["info"]. *) 303 214 val severity_to_string : severity -> string 304 215 305 - (** Pretty-print a severity value. *) 216 + (** String representation of an error code. *) 217 + val error_code_to_string : error_code -> string 218 + 219 + (** Pretty-printer for severity. *) 306 220 val pp_severity : Format.formatter -> severity -> unit 307 221 308 - (** Pretty-print a location. *) 222 + (** Pretty-printer for location. *) 309 223 val pp_location : Format.formatter -> location -> unit 310 224 311 - (** Pretty-print a message. 312 - 313 - Includes severity, text, and location if available. *) 225 + (** Pretty-printer for message. *) 314 226 val pp_message : Format.formatter -> message -> unit 315 227 316 - (** Convert a message to a single-line string. 317 228 318 - Includes severity and message text. *) 319 - val message_to_string : message -> string 320 - 321 - (** {1:error_codes Error Codes} 229 + (** {1:error_codes Error Code Types} 322 230 323 - The {!Error_code} module provides typed error codes for programmatic 324 - handling of validation issues. Use pattern matching to handle specific 325 - errors: 231 + For pattern matching on conformance errors. Parse errors use 232 + {!Html5rw.Parse_error_code}. 326 233 327 234 {[ 328 - let handle_message msg = 329 - match msg.Htmlrw_check.error_code with 330 - | Some (`Img `Missing_alt) -> 331 - (* Image accessibility issue *) 332 - suggest_alt_text msg 333 - | Some (`Attr (`Duplicate_id (`Id id))) -> 334 - (* Duplicate ID found *) 335 - highlight_all_with_id id 336 - | Some (`Aria _) -> 337 - (* Any ARIA-related error *) 338 - show_aria_help () 339 - | _ -> 340 - (* Generic handling *) 341 - display_error msg 342 - ]} 343 - 344 - The error codes are organized into categories: 345 - - [`Attr _]: Attribute errors (missing, invalid, duplicate) 346 - - [`Element _]: Element/content model errors 347 - - [`Aria _]: ARIA accessibility errors 348 - - [`Img _]: Image-related errors 349 - - [`Table _]: Table structure errors 350 - - And more... 351 - 352 - See {!Error_code} for the complete type definition. *) 235 + match code with 236 + | `Attr (`Missing_required_attr _) -> ... 237 + | `Img `Missing_alt -> ... 238 + | `Aria _ -> ... (* Any ARIA error *) 239 + | _ -> ... 240 + ]} *) 353 241 module Error_code = Error_code
+23 -18
lib/htmlrw_check/message.ml
··· 10 10 system_id : string option; 11 11 } 12 12 13 + type error_code = 14 + | Parse_error of Html5rw.Parse_error_code.t 15 + | Conformance_error of Error_code.t 16 + 13 17 type t = { 14 18 severity : severity; 15 19 message : string; 16 - code : string; 17 - error_code : Error_code.t option; 20 + error_code : error_code; 18 21 location : location option; 19 22 element : string option; 20 23 attribute : string option; ··· 24 27 let make_location ~line ~column ?end_line ?end_column ?system_id () = 25 28 { line; column; end_line; end_column; system_id } 26 29 27 - (** Create a message from a typed error code *) 28 - let of_error_code ?location ?element ?attribute ?extract error_code = 30 + (** Create a message from a conformance error code *) 31 + let of_conformance_error ?location ?element ?attribute ?extract error_code = 29 32 let severity = match Error_code.severity error_code with 30 33 | Error_code.Error -> Error 31 34 | Error_code.Warning -> Warning ··· 34 37 { 35 38 severity; 36 39 message = Error_code.to_message error_code; 37 - code = Error_code.code_string error_code; 38 - error_code = Some error_code; 40 + error_code = Conformance_error error_code; 39 41 location; 40 42 element; 41 43 attribute; 42 44 extract; 43 45 } 44 46 45 - (** Create a message with manual message text (for backwards compatibility during migration) *) 46 - let make ~severity ~message ?(code="generic") ?location ?element ?attribute ?extract () = 47 - { severity; message; code; error_code = None; location; element; attribute; extract } 48 - 49 - let error ~message ?(code="generic") ?location ?element ?attribute ?extract () = 50 - make ~severity:Error ~message ~code ?location ?element ?attribute ?extract () 51 - 52 - let warning ~message ?(code="generic") ?location ?element ?attribute ?extract () = 53 - make ~severity:Warning ~message ~code ?location ?element ?attribute ?extract () 47 + (** Create a message from a parse error code *) 48 + let of_parse_error ?location ?element ?attribute ?extract ~message code = 49 + { 50 + severity = Error; (* Parse errors are always errors *) 51 + message; 52 + error_code = Parse_error code; 53 + location; 54 + element; 55 + attribute; 56 + extract; 57 + } 54 58 55 - let info ~message ?(code="generic") ?location ?element ?attribute ?extract () = 56 - make ~severity:Info ~message ~code ?location ?element ?attribute ?extract () 59 + let error_code_to_string = function 60 + | Parse_error code -> Html5rw.Parse_error_code.to_string code 61 + | Conformance_error code -> Error_code.code_string code 57 62 58 63 let severity_to_string = function 59 64 | Error -> "error" ··· 82 87 Format.fprintf fmt ": " 83 88 | None -> ()); 84 89 pp_severity fmt msg.severity; 85 - Format.fprintf fmt " [%s]" msg.code; 90 + Format.fprintf fmt " [%s]" (error_code_to_string msg.error_code); 86 91 Format.fprintf fmt ": %s" msg.message; 87 92 (match msg.element with 88 93 | Some elem -> Format.fprintf fmt " (element: %s)" elem
+18 -42
lib/htmlrw_check/message.mli
··· 18 18 system_id : string option; (** File path or URL *) 19 19 } 20 20 21 + (** Unified error code type covering both parse errors and conformance errors. *) 22 + type error_code = 23 + | Parse_error of Html5rw.Parse_error_code.t 24 + (** Parse error from the HTML5 tokenizer/parser *) 25 + | Conformance_error of Error_code.t 26 + (** Conformance error from semantic validation *) 27 + 21 28 (** A validation message. *) 22 29 type t = { 23 30 severity : severity; 24 31 message : string; (** Human-readable description *) 25 - code : string; (** Machine-readable error code *) 26 - error_code : Error_code.t option; (** Typed error code if available *) 32 + error_code : error_code; (** Typed error code *) 27 33 location : location option; 28 34 element : string option; (** Element name if relevant *) 29 35 attribute : string option; (** Attribute name if relevant *) ··· 32 38 33 39 (** {1 Constructors} *) 34 40 35 - (** Create a message from a typed error code (preferred method). *) 36 - val of_error_code : 41 + (** Create a message from a conformance error code. *) 42 + val of_conformance_error : 37 43 ?location:location -> 38 44 ?element:string -> 39 45 ?attribute:string -> ··· 41 47 Error_code.t -> 42 48 t 43 49 44 - (** Create a validation message with specified severity (legacy). *) 45 - val make : 46 - severity:severity -> 47 - message:string -> 48 - ?code:string -> 49 - ?location:location -> 50 - ?element:string -> 51 - ?attribute:string -> 52 - ?extract:string -> 53 - unit -> 54 - t 55 - 56 - (** Create an error message (legacy). *) 57 - val error : 58 - message:string -> 59 - ?code:string -> 50 + (** Create a message from a parse error code. *) 51 + val of_parse_error : 60 52 ?location:location -> 61 53 ?element:string -> 62 54 ?attribute:string -> 63 55 ?extract:string -> 64 - unit -> 65 - t 66 - 67 - (** Create a warning message (legacy). *) 68 - val warning : 69 56 message:string -> 70 - ?code:string -> 71 - ?location:location -> 72 - ?element:string -> 73 - ?attribute:string -> 74 - ?extract:string -> 75 - unit -> 76 - t 77 - 78 - (** Create an informational message (legacy). *) 79 - val info : 80 - message:string -> 81 - ?code:string -> 82 - ?location:location -> 83 - ?element:string -> 84 - ?attribute:string -> 85 - ?extract:string -> 86 - unit -> 57 + Html5rw.Parse_error_code.t -> 87 58 t 88 59 89 60 (** Create a location record. *) ··· 95 66 ?system_id:string -> 96 67 unit -> 97 68 location 69 + 70 + (** {1 Utilities} *) 71 + 72 + (** Get the string representation of an error code. *) 73 + val error_code_to_string : error_code -> string 98 74 99 75 (** {1 Formatting} *) 100 76
+2 -27
lib/htmlrw_check/message_collector.ml
··· 13 13 14 14 let add t msg = t.messages <- msg :: t.messages 15 15 16 - (** Add a message from a typed error code *) 16 + (** Add a message from a typed conformance error code *) 17 17 let add_typed t ?location ?element ?attribute ?extract error_code = 18 18 (* Use provided location, or fall back to current_location *) 19 19 let loc = match location with 20 20 | Some _ -> location 21 21 | None -> t.current_location 22 22 in 23 - let msg = Message.of_error_code ?location:loc ?element ?attribute ?extract error_code in 24 - add t msg 25 - 26 - (** Add an error from a typed error code *) 27 - let add_error_code t ?location ?element ?attribute ?extract error_code = 28 - add_typed t ?location ?element ?attribute ?extract error_code 29 - 30 - (** Legacy: Add an error with manual message text *) 31 - let add_error t ~message ?code ?location ?element ?attribute ?extract () = 32 - let msg = 33 - Message.error ~message ?code ?location ?element ?attribute ?extract () 34 - in 35 - add t msg 36 - 37 - (** Legacy: Add a warning with manual message text *) 38 - let add_warning t ~message ?code ?location ?element ?attribute ?extract () = 39 - let msg = 40 - Message.warning ~message ?code ?location ?element ?attribute ?extract () 41 - in 42 - add t msg 43 - 44 - (** Legacy: Add an info message with manual message text *) 45 - let add_info t ~message ?code ?location ?element ?attribute ?extract () = 46 - let msg = 47 - Message.info ~message ?code ?location ?element ?attribute ?extract () 48 - in 23 + let msg = Message.of_conformance_error ?location:loc ?element ?attribute ?extract error_code in 49 24 add t msg 50 25 51 26 let messages t = List.rev t.messages
+5 -52
lib/htmlrw_check/message_collector.mli
··· 20 20 (** Get the current location. *) 21 21 val get_current_location : t -> Message.location option 22 22 23 - (** {1 Adding Messages - Typed Error Codes (Preferred)} *) 24 - 25 - (** Add a message from a typed error code. *) 26 - val add_typed : 27 - t -> 28 - ?location:Message.location -> 29 - ?element:string -> 30 - ?attribute:string -> 31 - ?extract:string -> 32 - Error_code.t -> 33 - unit 34 - 35 - (** Add an error from a typed error code. Alias for add_typed. *) 36 - val add_error_code : 37 - t -> 38 - ?location:Message.location -> 39 - ?element:string -> 40 - ?attribute:string -> 41 - ?extract:string -> 42 - Error_code.t -> 43 - unit 44 - 45 - (** {1 Adding Messages - Legacy (for migration)} *) 23 + (** {1 Adding Messages} *) 46 24 47 25 (** Add a message to the collector. *) 48 26 val add : t -> Message.t -> unit 49 27 50 - (** Add an error message to the collector (legacy). *) 51 - val add_error : 28 + (** Add a message from a typed conformance error code. 29 + Uses the current location if no explicit location is provided. *) 30 + val add_typed : 52 31 t -> 53 - message:string -> 54 - ?code:string -> 55 - ?location:Message.location -> 56 - ?element:string -> 57 - ?attribute:string -> 58 - ?extract:string -> 59 - unit -> 60 - unit 61 - 62 - (** Add a warning message to the collector (legacy). *) 63 - val add_warning : 64 - t -> 65 - message:string -> 66 - ?code:string -> 67 32 ?location:Message.location -> 68 33 ?element:string -> 69 34 ?attribute:string -> 70 35 ?extract:string -> 71 - unit -> 72 - unit 73 - 74 - (** Add an info message to the collector (legacy). *) 75 - val add_info : 76 - t -> 77 - message:string -> 78 - ?code:string -> 79 - ?location:Message.location -> 80 - ?element:string -> 81 - ?attribute:string -> 82 - ?extract:string -> 83 - unit -> 36 + Error_code.t -> 84 37 unit 85 38 86 39 (** {1 Retrieving Messages} *)
+3 -3
lib/htmlrw_check/message_format.ml
··· 24 24 match system_id with Some s -> s | None -> "input") 25 25 in 26 26 let severity_str = Message.severity_to_string msg.Message.severity in 27 - let code_str = " [" ^ msg.Message.code ^ "]" in 27 + let code_str = " [" ^ Message.error_code_to_string msg.Message.error_code ^ "]" in 28 28 let elem_str = 29 29 match msg.Message.element with 30 30 | Some e -> " (element: " ^ e ^ ")" ··· 59 59 match system_id with Some s -> s ^ ":0:0" | None -> "input:0:0") 60 60 in 61 61 let severity_str = Message.severity_to_string msg.Message.severity in 62 - let code_str = " [" ^ msg.Message.code ^ "]" in 62 + let code_str = " [" ^ Message.error_code_to_string msg.Message.error_code ^ "]" in 63 63 Buffer.add_string buf 64 64 (Printf.sprintf "%s: %s%s: %s\n" loc_str severity_str code_str 65 65 msg.Message.message)) ··· 72 72 let message_text = String (msg.Message.message, Meta.none) in 73 73 let base = [ (("type", Meta.none), severity); (("message", Meta.none), message_text) ] in 74 74 let with_code = 75 - (("subType", Meta.none), String (msg.Message.code, Meta.none)) :: base 75 + (("subType", Meta.none), String (Message.error_code_to_string msg.Message.error_code, Meta.none)) :: base 76 76 in 77 77 let with_location = 78 78 match msg.Message.location with
+79 -84
lib/htmlrw_check/parse_error_bridge.ml
··· 3 3 SPDX-License-Identifier: MIT 4 4 ---------------------------------------------------------------------------*) 5 5 6 + (** Generate human-readable message for a parse error code *) 7 + let message_of_parse_error code = 8 + let code_str = Html5rw.Parse_error_code.to_string code in 9 + match code with 10 + | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> 11 + "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag." 12 + | Html5rw.Parse_error_code.Null_character_reference -> 13 + "Character reference expands to zero." 14 + | Html5rw.Parse_error_code.Tree_construction_error s -> 15 + (* Check for control-character/noncharacter/surrogate with codepoint info *) 16 + (try 17 + if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then 18 + let colon_pos = String.index s ':' in 19 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 20 + let cp = int_of_string ("0x" ^ cp_str) in 21 + Printf.sprintf "Forbidden code point U+%04x." cp 22 + else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then 23 + let colon_pos = String.index s ':' in 24 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 25 + let cp = int_of_string ("0x" ^ cp_str) in 26 + Printf.sprintf "Forbidden code point U+%04x." cp 27 + else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then 28 + let colon_pos = String.index s ':' in 29 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 30 + let cp = int_of_string ("0x" ^ cp_str) in 31 + Printf.sprintf "Forbidden code point U+%04x." cp 32 + (* Character reference errors *) 33 + else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then 34 + let cp_str = String.sub s 28 (String.length s - 28) in 35 + let cp = int_of_string ("0x" ^ cp_str) in 36 + if cp = 0x0D then 37 + "A numeric character reference expanded to carriage return." 38 + else 39 + Printf.sprintf "Character reference expands to a control character (U+%04x)." cp 40 + else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then 41 + let colon_pos = String.index s ':' in 42 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 43 + let cp = int_of_string ("0x" ^ cp_str) in 44 + (* U+FDD0-U+FDEF are "permanently unassigned" *) 45 + if cp >= 0xFDD0 && cp <= 0xFDEF then 46 + "Character reference expands to a permanently unassigned code point." 47 + (* Astral noncharacters (planes 1-16) *) 48 + else if cp >= 0x10000 then 49 + Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp 50 + else 51 + Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp 52 + else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then 53 + "Character reference outside the permissible Unicode range." 54 + else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then 55 + let colon_pos = String.index s ':' in 56 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 57 + let cp = int_of_string ("0x" ^ cp_str) in 58 + Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp 59 + else if s = "no-p-element-in-scope" then 60 + "No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen." 61 + else if s = "end-tag-p-implied-but-open-elements" then 62 + "End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements." 63 + else if s = "end-tag-br" then 64 + "End tag \xe2\x80\x9cbr\xe2\x80\x9d." 65 + else if s = "expected-closing-tag-but-got-eof" then 66 + "End of file seen and there were open elements." 67 + else if String.length s > 28 && String.sub s 0 28 = "bad-start-tag-in-head-noscri" then 68 + let colon_pos = String.index s ':' in 69 + let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 70 + Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element 71 + else if String.length s > 19 && String.sub s 0 19 = "unexpected-end-tag:" then 72 + let element = String.sub s 19 (String.length s - 19) in 73 + Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element 74 + else if String.length s > 19 && String.sub s 0 19 = "start-tag-in-table:" then 75 + let tag = String.sub s 19 (String.length s - 19) in 76 + Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag 77 + else 78 + Printf.sprintf "Parse error: %s" s 79 + with _ -> Printf.sprintf "Parse error: %s" s) 80 + | _ -> Printf.sprintf "Parse error: %s" code_str 81 + 6 82 let of_parse_error ?system_id err = 7 83 let code = Html5rw.error_code err in 8 84 let line = Html5rw.error_line err in 9 85 let column = Html5rw.error_column err in 10 - let location = 11 - Message.make_location ~line ~column ?system_id () 12 - in 13 - let code_str = Html5rw.Parse_error_code.to_string code in 14 - let (message, final_code) = match code with 15 - | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> 16 - ("Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag.", code_str) 17 - | Html5rw.Parse_error_code.Null_character_reference -> 18 - ("Character reference expands to zero.", "null-character-reference") 19 - | Html5rw.Parse_error_code.Tree_construction_error s -> 20 - (* Check for control-character/noncharacter/surrogate with codepoint info *) 21 - (try 22 - if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then 23 - let colon_pos = String.index s ':' in 24 - let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 25 - let cp = int_of_string ("0x" ^ cp_str) in 26 - (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 27 - else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then 28 - let colon_pos = String.index s ':' in 29 - let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 30 - let cp = int_of_string ("0x" ^ cp_str) in 31 - (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 32 - else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then 33 - let colon_pos = String.index s ':' in 34 - let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 35 - let cp = int_of_string ("0x" ^ cp_str) in 36 - (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 37 - (* Character reference errors *) 38 - else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then 39 - let cp_str = String.sub s 28 (String.length s - 28) in 40 - let cp = int_of_string ("0x" ^ cp_str) in 41 - if cp = 0x0D then 42 - ("A numeric character reference expanded to carriage return.", "control-character-reference") 43 - else 44 - (Printf.sprintf "Character reference expands to a control character (U+%04x)." cp, "control-character-reference") 45 - else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then 46 - let colon_pos = String.index s ':' in 47 - let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 48 - let cp = int_of_string ("0x" ^ cp_str) in 49 - (* U+FDD0-U+FDEF are "permanently unassigned" *) 50 - if cp >= 0xFDD0 && cp <= 0xFDEF then 51 - ("Character reference expands to a permanently unassigned code point.", "noncharacter-character-reference") 52 - (* Astral noncharacters (planes 1-16) *) 53 - else if cp >= 0x10000 then 54 - (Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp, "noncharacter-character-reference") 55 - else 56 - (Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp, "noncharacter-character-reference") 57 - else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then 58 - let colon_pos = String.index s ':' in 59 - let _ = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 60 - ("Character reference outside the permissible Unicode range.", "character-reference-outside-unicode-range") 61 - else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then 62 - let colon_pos = String.index s ':' in 63 - let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 64 - let cp = int_of_string ("0x" ^ cp_str) in 65 - (Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp, "surrogate-character-reference") 66 - else if s = "no-p-element-in-scope" then 67 - ("No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen.", "no-p-element-in-scope") 68 - else if s = "end-tag-p-implied-but-open-elements" then 69 - ("End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements.", "end-tag-p-implied") 70 - else if s = "end-tag-br" then 71 - ("End tag \xe2\x80\x9cbr\xe2\x80\x9d.", "end-tag-br") 72 - else if s = "expected-closing-tag-but-got-eof" then 73 - ("End of file seen and there were open elements.", "eof-in-open-element") 74 - else if String.length s > 28 && String.sub s 0 28 = "bad-start-tag-in-head-noscri" then 75 - let colon_pos = String.index s ':' in 76 - let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 77 - (Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element, "bad-start-tag-in-head-noscript") 78 - else if String.length s > 19 && String.sub s 0 19 = "unexpected-end-tag:" then 79 - let element = String.sub s 19 (String.length s - 19) in 80 - (Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element, "stray-end-tag") 81 - else if String.length s > 19 && String.sub s 0 19 = "start-tag-in-table:" then 82 - let tag = String.sub s 19 (String.length s - 19) in 83 - (Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag, "start-tag-in-table") 84 - else 85 - (Printf.sprintf "Parse error: %s" s, s) 86 - with _ -> (Printf.sprintf "Parse error: %s" s, s)) 87 - | _ -> (Printf.sprintf "Parse error: %s" code_str, code_str) 88 - in 89 - Message.error 90 - ~message 91 - ~code:final_code 92 - ~location 93 - () 86 + let location = Message.make_location ~line ~column ?system_id () in 87 + let message = message_of_parse_error code in 88 + Message.of_parse_error ~location ~message code 94 89 95 90 let collect_parse_errors ?system_id result = 96 91 let errors = Html5rw.errors result in
+2 -2
test/expected_message.ml
··· 360 360 let code_matches = 361 361 match (expected.error_code, actual.Htmlrw_check.error_code) with 362 362 | (None, _) -> true (* No expected code to match *) 363 - | (Some ec, Some ac) -> error_codes_match ec ac 364 - | (Some _, None) -> false (* Expected typed but got untyped *) 363 + | (Some ec, Htmlrw_check.Conformance ac) -> error_codes_match ec ac 364 + | (Some _, Htmlrw_check.Parse _) -> false (* Expected conformance but got parse error *) 365 365 in 366 366 367 367 (* Check message text *)