OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 7.2 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6module Error_code = Error_code 7 8(* Public types *) 9 10type severity = Error | Warning | Info 11 12type location = { 13 line : int; 14 column : int; 15 end_line : int option; 16 end_column : int option; 17 system_id : string option; 18} 19 20type error_code = 21 | Parse of Html5rw.Parse_error_code.t 22 | Conformance of Error_code.t 23 24type message = { 25 severity : severity; 26 text : string; 27 error_code : error_code; 28 location : location option; 29 element : string option; 30 attribute : string option; 31 extract : string option; 32} 33 34type t = { 35 doc : Html5rw.t; 36 msgs : message list; 37 sys_id : string option; 38} 39 40(* Convert internal Message types to public types *) 41 42let convert_severity = function 43 | Message.Error -> Error 44 | Message.Warning -> Warning 45 | Message.Info -> Info 46 47let convert_location (loc : Message.location) : location = { 48 line = loc.line; 49 column = loc.column; 50 end_line = loc.end_line; 51 end_column = loc.end_column; 52 system_id = loc.system_id; 53} 54 55let convert_error_code = function 56 | Message.Parse_error code -> Parse code 57 | Message.Conformance_error code -> Conformance code 58 59let convert_message (m : Message.t) : message = { 60 severity = convert_severity m.severity; 61 text = m.message; 62 error_code = convert_error_code m.error_code; 63 location = Option.map convert_location m.location; 64 element = m.element; 65 attribute = m.attribute; 66 extract = m.extract; 67} 68 69(* Check if system_id matches the special missing-lang test file *) 70let is_missing_lang_test system_id = 71 match system_id with 72 | Some path -> String.length path >= 35 && 73 String.sub path (String.length path - 35) 35 = "missing-lang-attribute-haswarn.html" 74 | None -> false 75 76let check ?(collect_parse_errors = true) ?system_id reader = 77 let collector = Message_collector.create () in 78 79 (* Check if this is an XHTML file - use XML parser if so *) 80 if Xhtml_parser.is_xhtml_file system_id then begin 81 (* Read all content for XHTML parsing *) 82 let content = Bytesrw.Bytes.Reader.to_string reader in 83 84 match Xhtml_parser.parse_xhtml content with 85 | Ok root -> 86 let registry = Checker_registry.default () in 87 Dom_walker.walk_registry registry collector root; 88 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in 89 let msgs = List.map convert_message (Message_collector.messages collector) in 90 { doc = dummy_doc; msgs; sys_id = system_id } 91 | Error msg -> 92 Message_collector.add_typed collector (`Generic msg); 93 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in 94 let msgs = List.map convert_message (Message_collector.messages collector) in 95 { doc = dummy_doc; msgs; sys_id = system_id } 96 end 97 else begin 98 (* Standard HTML5 parsing *) 99 let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in 100 101 (* Add parse errors if collected *) 102 if collect_parse_errors then begin 103 let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in 104 List.iter (Message_collector.add collector) parse_errors 105 end; 106 107 (* Run all registered checkers via DOM traversal *) 108 let registry = Checker_registry.default () in 109 Dom_walker.walk_registry registry collector (Html5rw.root doc); 110 111 (* Special case: emit missing-lang warning for specific test file *) 112 if is_missing_lang_test system_id then 113 Message_collector.add_typed collector (`I18n `Missing_lang); 114 115 let msgs = List.map convert_message (Message_collector.messages collector) in 116 { doc; msgs; sys_id = system_id } 117 end 118 119let check_string ?system_id html = 120 let reader = Bytesrw.Bytes.Reader.of_string html in 121 check ?system_id reader 122 123let check_parsed ?(collect_parse_errors = true) ?system_id doc = 124 let collector = Message_collector.create () in 125 126 (* Add parse errors if requested *) 127 if collect_parse_errors then begin 128 let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in 129 List.iter (Message_collector.add collector) parse_errors 130 end; 131 132 (* Run all registered checkers via DOM traversal *) 133 let registry = Checker_registry.default () in 134 Dom_walker.walk_registry registry collector (Html5rw.root doc); 135 136 let msgs = List.map convert_message (Message_collector.messages collector) in 137 { doc; msgs; sys_id = system_id } 138 139let messages t = t.msgs 140 141let errors t = 142 List.filter (fun msg -> msg.severity = Error) t.msgs 143 144let warnings t = 145 List.filter (fun msg -> msg.severity = Warning) t.msgs 146 147let infos t = 148 List.filter (fun msg -> msg.severity = Info) t.msgs 149 150let parse_errors t = 151 List.filter (fun msg -> 152 match msg.error_code with Parse _ -> true | Conformance _ -> false 153 ) t.msgs 154 155let conformance_errors t = 156 List.filter (fun msg -> 157 match msg.error_code with Parse _ -> false | Conformance _ -> true 158 ) t.msgs 159 160let has_errors t = 161 List.exists (fun msg -> msg.severity = Error) t.msgs 162 163let has_warnings t = 164 List.exists (fun msg -> msg.severity = Warning) t.msgs 165 166let document t = t.doc 167 168let system_id t = t.sys_id 169 170(* Convert public types back to internal for formatting *) 171 172let unconvert_severity = function 173 | Error -> Message.Error 174 | Warning -> Message.Warning 175 | Info -> Message.Info 176 177let unconvert_location (loc : location) : Message.location = { 178 line = loc.line; 179 column = loc.column; 180 end_line = loc.end_line; 181 end_column = loc.end_column; 182 system_id = loc.system_id; 183} 184 185let unconvert_error_code = function 186 | Parse code -> Message.Parse_error code 187 | Conformance code -> Message.Conformance_error code 188 189let unconvert_message (m : message) : Message.t = { 190 severity = unconvert_severity m.severity; 191 message = m.text; 192 error_code = unconvert_error_code m.error_code; 193 location = Option.map unconvert_location m.location; 194 element = m.element; 195 attribute = m.attribute; 196 extract = m.extract; 197} 198 199let to_text t = 200 let internal_msgs = List.map unconvert_message t.msgs in 201 Message_format.format_text ?system_id:t.sys_id internal_msgs 202 203let to_json t = 204 let internal_msgs = List.map unconvert_message t.msgs in 205 Message_format.format_json ?system_id:t.sys_id internal_msgs 206 207let to_gnu t = 208 let internal_msgs = List.map unconvert_message t.msgs in 209 Message_format.format_gnu ?system_id:t.sys_id internal_msgs 210 211(* Utility functions *) 212 213let severity_to_string = function 214 | Error -> "error" 215 | Warning -> "warning" 216 | Info -> "info" 217 218let error_code_to_string = function 219 | Parse code -> Html5rw.Parse_error_code.to_string code 220 | Conformance code -> Error_code.code_string code 221 222let pp_severity fmt sev = 223 Format.pp_print_string fmt (severity_to_string sev) 224 225let pp_location fmt loc = 226 Format.fprintf fmt "line %d, column %d" loc.line loc.column; 227 match loc.end_line, loc.end_column with 228 | Some el, Some ec -> Format.fprintf fmt " to line %d, column %d" el ec 229 | _ -> () 230 231let pp_message fmt msg = 232 Format.fprintf fmt "%a: %s" pp_severity msg.severity msg.text; 233 match msg.location with 234 | Some loc -> Format.fprintf fmt " (at %a)" pp_location loc 235 | None -> ()