OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Structured expected messages from Nu validator. *) 2 3type t = { 4 message: string; 5 error_code: Html5_checker.Error_code.t option; 6 line: int option; 7 column: int option; 8 element: string option; 9 attribute: string option; 10 severity: [`Error | `Warning | `Info] option; 11} 12 13type match_quality = 14 | Exact_match 15 | Code_match 16 | Message_match 17 | Substring_match 18 | Severity_mismatch 19 | No_match 20 21type strictness = { 22 require_exact_message: bool; 23 require_error_code: bool; 24 require_location: bool; 25 require_severity: bool; 26} 27 28let lenient = { 29 require_exact_message = false; 30 require_error_code = false; 31 require_location = false; 32 require_severity = false; 33} 34 35(** Practical strict mode: requires exact message text but not typed error codes *) 36let exact_message = { 37 require_exact_message = true; 38 require_error_code = false; 39 require_location = false; 40 require_severity = false; 41} 42 43(** Full strict mode: all checks enabled (requires typed error code migration) *) 44let strict = { 45 require_exact_message = true; 46 require_error_code = true; 47 require_location = true; 48 require_severity = true; 49} 50 51(** Normalize Unicode curly quotes to ASCII for comparison *) 52let normalize_quotes s = 53 let buf = Buffer.create (String.length s) in 54 let i = ref 0 in 55 while !i < String.length s do 56 let c = s.[!i] in 57 if !i + 2 < String.length s && c = '\xe2' then begin 58 let c1 = s.[!i + 1] in 59 let c2 = s.[!i + 2] in 60 if c1 = '\x80' && (c2 = '\x9c' || c2 = '\x9d') then begin 61 Buffer.add_char buf '"'; 62 i := !i + 3 63 end else begin 64 Buffer.add_char buf c; 65 incr i 66 end 67 end else begin 68 Buffer.add_char buf c; 69 incr i 70 end 71 done; 72 Buffer.contents buf 73 74(** Pattern matchers for Nu validator messages. 75 Each returns (error_code option, element option, attribute option) *) 76 77let pattern_element_not_allowed msg = 78 (* "Element "X" not allowed as child of element "Y"..." *) 79 let re = Str.regexp {|Element "\([^"]+\)" not allowed as child of element "\([^"]+\)"|} in 80 if Str.string_match re msg 0 then 81 let child = Str.matched_group 1 msg in 82 let parent = Str.matched_group 2 msg in 83 Some (Html5_checker.Error_code.Element_not_allowed_as_child { child; parent }, 84 Some child, None) 85 else None 86 87let pattern_attr_not_allowed_on_element msg = 88 (* "Attribute "X" not allowed on element "Y"..." *) 89 let re = Str.regexp {|Attribute "\([^"]+\)" not allowed on element "\([^"]+\)"|} in 90 if Str.string_match re msg 0 then 91 let attr = Str.matched_group 1 msg in 92 let element = Str.matched_group 2 msg in 93 Some (Html5_checker.Error_code.Attr_not_allowed_on_element { attr; element }, 94 Some element, Some attr) 95 else None 96 97let pattern_attr_not_allowed_here msg = 98 (* "Attribute "X" not allowed here." *) 99 let re = Str.regexp {|Attribute "\([^"]+\)" not allowed here|} in 100 if Str.string_match re msg 0 then 101 let attr = Str.matched_group 1 msg in 102 Some (Html5_checker.Error_code.Attr_not_allowed_here { attr }, 103 None, Some attr) 104 else None 105 106let pattern_missing_required_attr msg = 107 (* "Element "X" is missing required attribute "Y"." *) 108 let re = Str.regexp {|Element "\([^"]+\)" is missing required attribute "\([^"]+\)"|} in 109 if Str.string_match re msg 0 then 110 let element = Str.matched_group 1 msg in 111 let attr = Str.matched_group 2 msg in 112 Some (Html5_checker.Error_code.Missing_required_attr { element; attr }, 113 Some element, Some attr) 114 else None 115 116let pattern_missing_required_child msg = 117 (* "Element "X" is missing required child element "Y"." *) 118 let re = Str.regexp {|Element "\([^"]+\)" is missing required child element "\([^"]+\)"|} in 119 if Str.string_match re msg 0 then 120 let parent = Str.matched_group 1 msg in 121 let child = Str.matched_group 2 msg in 122 Some (Html5_checker.Error_code.Missing_required_child { parent; child }, 123 Some parent, None) 124 else None 125 126let pattern_duplicate_id msg = 127 (* "Duplicate ID "X"." *) 128 let re = Str.regexp {|Duplicate ID "\([^"]+\)"|} in 129 if Str.string_match re msg 0 then 130 let id = Str.matched_group 1 msg in 131 Some (Html5_checker.Error_code.Duplicate_id { id }, 132 None, None) 133 else None 134 135let pattern_obsolete_element msg = 136 (* "The "X" element is obsolete." *) 137 let re = Str.regexp {|The "\([^"]+\)" element is obsolete|} in 138 if Str.string_match re msg 0 then 139 let element = Str.matched_group 1 msg in 140 Some (Html5_checker.Error_code.Obsolete_element { element; suggestion = "" }, 141 Some element, None) 142 else None 143 144let pattern_obsolete_attr msg = 145 (* "The "X" attribute on the "Y" element is obsolete." *) 146 let re = Str.regexp {|The "\([^"]+\)" attribute on the "\([^"]+\)" element is obsolete|} in 147 if Str.string_match re msg 0 then 148 let attr = Str.matched_group 1 msg in 149 let element = Str.matched_group 2 msg in 150 Some (Html5_checker.Error_code.Obsolete_attr { attr; element; suggestion = None }, 151 Some element, Some attr) 152 else None 153 154let pattern_stray_end_tag msg = 155 (* "Stray end tag "X"." *) 156 let re = Str.regexp {|Stray end tag "\([^"]+\)"|} in 157 if Str.string_match re msg 0 then 158 let tag = Str.matched_group 1 msg in 159 Some (Html5_checker.Error_code.Stray_end_tag { tag }, 160 Some tag, None) 161 else None 162 163let pattern_stray_start_tag msg = 164 (* "Stray start tag "X"." *) 165 let re = Str.regexp {|Stray start tag "\([^"]+\)"|} in 166 if Str.string_match re msg 0 then 167 let tag = Str.matched_group 1 msg in 168 Some (Html5_checker.Error_code.Stray_start_tag { tag }, 169 Some tag, None) 170 else None 171 172let pattern_unnecessary_role msg = 173 (* "The "X" role is unnecessary for..." *) 174 let re = Str.regexp {|The "\([^"]+\)" role is unnecessary for \(.*\)|} in 175 if Str.string_match re msg 0 then 176 let role = Str.matched_group 1 msg in 177 let reason = Str.matched_group 2 msg in 178 Some (Html5_checker.Error_code.Unnecessary_role { role; element = ""; reason }, 179 None, None) 180 else None 181 182let pattern_bad_role msg = 183 (* "Bad value "X" for attribute "role" on element "Y"." *) 184 let re = Str.regexp {|Bad value "\([^"]+\)" for attribute "role" on element "\([^"]+\)"|} in 185 if Str.string_match re msg 0 then 186 let role = Str.matched_group 1 msg in 187 let element = Str.matched_group 2 msg in 188 Some (Html5_checker.Error_code.Bad_role { element; role }, 189 Some element, Some "role") 190 else None 191 192let pattern_aria_must_not_be_specified msg = 193 (* "The "X" attribute must not be specified on any "Y" element unless..." *) 194 let re = Str.regexp {|The "\([^"]+\)" attribute must not be specified on any "\([^"]+\)" element unless \(.*\)|} in 195 if Str.string_match re msg 0 then 196 let attr = Str.matched_group 1 msg in 197 let element = Str.matched_group 2 msg in 198 let condition = Str.matched_group 3 msg in 199 Some (Html5_checker.Error_code.Aria_must_not_be_specified { attr; element; condition }, 200 Some element, Some attr) 201 else None 202 203let pattern_aria_must_not_be_used msg = 204 (* "The "X" attribute must not be used on an "Y" element which has..." *) 205 let re = Str.regexp {|The "\([^"]+\)" attribute must not be used on an "\([^"]+\)" element which has \(.*\)|} in 206 if Str.string_match re msg 0 then 207 let attr = Str.matched_group 1 msg in 208 let element = Str.matched_group 2 msg in 209 let condition = Str.matched_group 3 msg in 210 Some (Html5_checker.Error_code.Aria_must_not_be_used { attr; element; condition }, 211 Some element, Some attr) 212 else None 213 214let pattern_bad_attr_value msg = 215 (* "Bad value "X" for attribute "Y" on element "Z": ..." *) 216 let re = Str.regexp {|Bad value "\([^"]*\)" for attribute "\([^"]+\)" on element "\([^"]+\)"|} in 217 if Str.string_match re msg 0 then 218 let value = Str.matched_group 1 msg in 219 let attr = Str.matched_group 2 msg in 220 let element = Str.matched_group 3 msg in 221 (* Extract reason after the colon if present *) 222 let reason = 223 try 224 let colon_pos = String.index_from msg (Str.match_end ()) ':' in 225 String.trim (String.sub msg (colon_pos + 1) (String.length msg - colon_pos - 1)) 226 with Not_found -> "" 227 in 228 Some (Html5_checker.Error_code.Bad_attr_value { element; attr; value; reason }, 229 Some element, Some attr) 230 else None 231 232let pattern_end_tag_implied msg = 233 (* "End tag "X" implied, but there were open elements." *) 234 let re = Str.regexp {|End tag "\([^"]+\)" implied, but there were open elements|} in 235 if Str.string_match re msg 0 then 236 let tag = Str.matched_group 1 msg in 237 Some (Html5_checker.Error_code.End_tag_implied_open_elements { tag }, 238 Some tag, None) 239 else None 240 241let pattern_no_element_in_scope msg = 242 (* "No "X" element in scope but a "X" end tag seen." *) 243 let re = Str.regexp {|No "\([^"]+\)" element in scope but a "\([^"]+\)" end tag seen|} in 244 if Str.string_match re msg 0 then 245 let tag = Str.matched_group 1 msg in 246 Some (Html5_checker.Error_code.No_element_in_scope { tag }, 247 Some tag, None) 248 else None 249 250let pattern_start_tag_in_table msg = 251 (* "Start tag "X" seen in "table"." *) 252 let re = Str.regexp {|Start tag "\([^"]+\)" seen in "table"|} in 253 if Str.string_match re msg 0 then 254 let tag = Str.matched_group 1 msg in 255 Some (Html5_checker.Error_code.Start_tag_in_table { tag }, 256 Some tag, None) 257 else None 258 259(** All pattern matchers in priority order *) 260let patterns = [ 261 pattern_element_not_allowed; 262 pattern_attr_not_allowed_on_element; 263 pattern_attr_not_allowed_here; 264 pattern_missing_required_attr; 265 pattern_missing_required_child; 266 pattern_duplicate_id; 267 pattern_obsolete_element; 268 pattern_obsolete_attr; 269 pattern_stray_end_tag; 270 pattern_stray_start_tag; 271 pattern_unnecessary_role; 272 pattern_bad_role; 273 pattern_aria_must_not_be_specified; 274 pattern_aria_must_not_be_used; 275 pattern_bad_attr_value; 276 pattern_end_tag_implied; 277 pattern_no_element_in_scope; 278 pattern_start_tag_in_table; 279] 280 281(** Try to recognize the error code from a message *) 282let recognize_error_code msg = 283 let normalized = normalize_quotes msg in 284 let rec try_patterns = function 285 | [] -> (None, None, None) 286 | p :: rest -> 287 match p normalized with 288 | Some (code, elem, attr) -> (Some code, elem, attr) 289 | None -> try_patterns rest 290 in 291 try_patterns patterns 292 293(** Infer severity from message patterns *) 294let infer_severity msg = 295 let normalized = String.lowercase_ascii msg in 296 if String.sub normalized 0 (min 8 (String.length normalized)) = "consider" then 297 Some `Info 298 else if String.sub normalized 0 (min 3 (String.length normalized)) = "the" 299 && (try let _ = Str.search_forward (Str.regexp_string "is unnecessary") normalized 0 in true 300 with Not_found -> false) then 301 Some `Warning 302 else 303 Some `Error 304 305let parse message = 306 let (error_code, element, attribute) = recognize_error_code message in 307 let severity = infer_severity message in 308 { 309 message; 310 error_code; 311 line = None; 312 column = None; 313 element; 314 attribute; 315 severity; 316 } 317 318let parse_json_value ~get_string ~get_int ~message_field = 319 let message = match message_field with 320 | Some m -> m 321 | None -> match get_string "message" with Some m -> m | None -> "" 322 in 323 let base = parse message in 324 { base with 325 line = (match get_int "line" with Some l -> Some l | None -> base.line); 326 column = (match get_int "column" with Some c -> Some c | None -> base.column); 327 element = (match get_string "element" with Some e -> Some e | None -> base.element); 328 attribute = (match get_string "attribute" with Some a -> Some a | None -> base.attribute); 329 } 330 331(** Compare error codes for semantic equality *) 332let error_codes_match code1 code2 = 333 match (code1, code2) with 334 | (Html5_checker.Error_code.Element_not_allowed_as_child { child = c1; parent = p1 }, 335 Html5_checker.Error_code.Element_not_allowed_as_child { child = c2; parent = p2 }) -> 336 String.lowercase_ascii c1 = String.lowercase_ascii c2 && 337 String.lowercase_ascii p1 = String.lowercase_ascii p2 338 | (Html5_checker.Error_code.Attr_not_allowed_on_element { attr = a1; element = e1 }, 339 Html5_checker.Error_code.Attr_not_allowed_on_element { attr = a2; element = e2 }) -> 340 String.lowercase_ascii a1 = String.lowercase_ascii a2 && 341 String.lowercase_ascii e1 = String.lowercase_ascii e2 342 | (Html5_checker.Error_code.Missing_required_attr { element = e1; attr = a1 }, 343 Html5_checker.Error_code.Missing_required_attr { element = e2; attr = a2 }) -> 344 String.lowercase_ascii e1 = String.lowercase_ascii e2 && 345 String.lowercase_ascii a1 = String.lowercase_ascii a2 346 | (Html5_checker.Error_code.Duplicate_id { id = i1 }, 347 Html5_checker.Error_code.Duplicate_id { id = i2 }) -> 348 i1 = i2 349 | (Html5_checker.Error_code.Stray_end_tag { tag = t1 }, 350 Html5_checker.Error_code.Stray_end_tag { tag = t2 }) -> 351 String.lowercase_ascii t1 = String.lowercase_ascii t2 352 | (Html5_checker.Error_code.Stray_start_tag { tag = t1 }, 353 Html5_checker.Error_code.Stray_start_tag { tag = t2 }) -> 354 String.lowercase_ascii t1 = String.lowercase_ascii t2 355 (* For other cases, fall back to structural equality *) 356 | (c1, c2) -> c1 = c2 357 358let matches ~strictness ~expected ~actual = 359 let expected_norm = normalize_quotes expected.message in 360 let actual_norm = normalize_quotes actual.Html5_checker.Message.message in 361 362 (* Check severity match *) 363 let severity_matches = 364 match (expected.severity, actual.Html5_checker.Message.severity) with 365 | (None, _) -> true 366 | (Some `Error, Html5_checker.Message.Error) -> true 367 | (Some `Warning, Html5_checker.Message.Warning) -> true 368 | (Some `Info, Html5_checker.Message.Info) -> true 369 | _ -> false 370 in 371 372 (* Check location match *) 373 let location_matches = 374 match (expected.line, expected.column, actual.Html5_checker.Message.location) with 375 | (None, None, _) -> true 376 | (Some el, Some ec, Some loc) -> loc.line = el && loc.column = ec 377 | (Some el, None, Some loc) -> loc.line = el 378 | _ -> false 379 in 380 381 (* Check error code match *) 382 let code_matches = 383 match (expected.error_code, actual.Html5_checker.Message.error_code) with 384 | (None, _) -> true (* No expected code to match *) 385 | (Some ec, Some ac) -> error_codes_match ec ac 386 | (Some _, None) -> false (* Expected typed but got untyped *) 387 in 388 389 (* Check message text *) 390 let exact_text_match = actual_norm = expected_norm in 391 let substring_match = 392 try let _ = Str.search_forward (Str.regexp_string expected_norm) actual_norm 0 in true 393 with Not_found -> false 394 in 395 396 (* Determine match quality *) 397 if not severity_matches && strictness.require_severity then 398 Severity_mismatch 399 else if exact_text_match && code_matches && (location_matches || not strictness.require_location) then 400 Exact_match 401 else if code_matches && expected.error_code <> None then 402 Code_match 403 else if exact_text_match then 404 Message_match 405 else if substring_match && not strictness.require_exact_message then 406 Substring_match 407 else 408 No_match 409 410let is_acceptable ~strictness quality = 411 match quality with 412 | Exact_match -> true 413 | Code_match -> not strictness.require_exact_message 414 | Message_match -> not strictness.require_error_code 415 | Substring_match -> not strictness.require_exact_message 416 | Severity_mismatch -> not strictness.require_severity 417 | No_match -> false 418 419let match_quality_to_string = function 420 | Exact_match -> "exact" 421 | Code_match -> "code" 422 | Message_match -> "message" 423 | Substring_match -> "substring" 424 | Severity_mismatch -> "severity-mismatch" 425 | No_match -> "no-match"