OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 18 kB view raw
1(** Structured expected messages from Nu validator. *) 2 3type t = { 4 message: string; 5 error_code: Htmlrw_check.Error_code.t option; 6 line: int option; 7 column: int option; 8 element: string option; 9 attribute: string option; 10 severity: [`Error | `Warning | `Info] option; 11} 12 13type match_quality = 14 | Exact_match 15 | Code_match 16 | Message_match 17 | Substring_match 18 | Severity_mismatch 19 | No_match 20 21type strictness = { 22 require_exact_message: bool; 23 require_error_code: bool; 24 require_location: bool; 25 require_severity: bool; 26} 27 28let lenient = { 29 require_exact_message = false; 30 require_error_code = false; 31 require_location = false; 32 require_severity = false; 33} 34 35(** Practical strict mode: requires exact message text but not typed error codes *) 36let exact_message = { 37 require_exact_message = true; 38 require_error_code = false; 39 require_location = false; 40 require_severity = false; 41} 42 43(** Full strict mode: all checks enabled (requires typed error code migration) *) 44let strict = { 45 require_exact_message = true; 46 require_error_code = true; 47 require_location = true; 48 require_severity = true; 49} 50 51(** Unicode ellipsis character *) 52let ellipsis = "\xe2\x80\xa6" 53 54(** Normalize Unicode curly quotes to ASCII for comparison *) 55let normalize_quotes s = 56 let buf = Buffer.create (String.length s) in 57 let i = ref 0 in 58 while !i < String.length s do 59 let c = s.[!i] in 60 if !i + 2 < String.length s && c = '\xe2' then begin 61 let c1 = s.[!i + 1] in 62 let c2 = s.[!i + 2] in 63 if c1 = '\x80' && (c2 = '\x9c' || c2 = '\x9d') then begin 64 Buffer.add_char buf '"'; 65 i := !i + 3 66 end else begin 67 Buffer.add_char buf c; 68 incr i 69 end 70 end else begin 71 Buffer.add_char buf c; 72 incr i 73 end 74 done; 75 Buffer.contents buf 76 77(** Unicode curly quotes *) 78let left_curly_quote = "\xe2\x80\x9c" 79let right_curly_quote = "\xe2\x80\x9d" 80 81(** Check if expected message (with potential ellipsis truncation) matches actual. 82 When expected has ellipsis followed by text in curly quotes, we check if actual 83 has a value that ends with that text. 84 This handles Nu validator's message truncation for long attribute values. *) 85let truncation_aware_match expected actual = 86 (* Look for pattern: left_curly_quote + ellipsis in expected *) 87 let quote_ellipsis = left_curly_quote ^ ellipsis in 88 try 89 let pos = Str.search_forward (Str.regexp_string quote_ellipsis) expected 0 in 90 (* Found quote+ellipsis pattern - extract what comes after ellipsis until closing curly quote *) 91 let start_after_ellipsis = pos + String.length quote_ellipsis in 92 let end_quote_pos = 93 try Str.search_forward (Str.regexp_string right_curly_quote) expected start_after_ellipsis 94 with Not_found -> String.length expected 95 in 96 let truncated_suffix = String.sub expected start_after_ellipsis (end_quote_pos - start_after_ellipsis) in 97 98 (* Build expected prefix (everything before the truncated quote) and suffix (everything after) *) 99 let prefix = String.sub expected 0 pos in 100 let suffix_start = end_quote_pos + String.length right_curly_quote in 101 let suffix = 102 if suffix_start < String.length expected then 103 String.sub expected suffix_start (String.length expected - suffix_start) 104 else "" 105 in 106 107 (* Check if actual starts with prefix and ends with suffix *) 108 let actual_starts_with_prefix = 109 String.length actual >= String.length prefix && 110 String.sub actual 0 (String.length prefix) = prefix 111 in 112 let actual_ends_with_suffix = 113 String.length actual >= String.length suffix && 114 String.sub actual (String.length actual - String.length suffix) (String.length suffix) = suffix 115 in 116 117 (* If prefix and suffix match, extract the middle (the quoted value in actual) *) 118 if actual_starts_with_prefix && actual_ends_with_suffix then begin 119 (* Find the quoted value in actual at the same position *) 120 let actual_quote_start = String.length prefix in 121 try 122 (* Check actual has left curly quote at expected position *) 123 if String.sub actual actual_quote_start (String.length left_curly_quote) = left_curly_quote then begin 124 let actual_value_start = actual_quote_start + String.length left_curly_quote in 125 let actual_value_end = 126 Str.search_forward (Str.regexp_string right_curly_quote) actual actual_value_start 127 in 128 let actual_value = String.sub actual actual_value_start (actual_value_end - actual_value_start) in 129 (* Check if actual value ends with the truncated suffix from expected *) 130 String.length actual_value >= String.length truncated_suffix && 131 String.sub actual_value (String.length actual_value - String.length truncated_suffix) (String.length truncated_suffix) = truncated_suffix 132 end else false 133 with _ -> false 134 end else false 135 with Not_found -> 136 (* No ellipsis truncation pattern found *) 137 false 138 139(** Pattern matchers for Nu validator messages. 140 Each returns (error_code option, element option, attribute option) *) 141 142let pattern_element_not_allowed msg = 143 (* "Element "X" not allowed as child of element "Y"..." *) 144 let re = Str.regexp {|Element "\([^"]+\)" not allowed as child of element "\([^"]+\)"|} in 145 if Str.string_match re msg 0 then 146 let child = Str.matched_group 1 msg in 147 let parent = Str.matched_group 2 msg in 148 Some ((`Element (`Not_allowed_as_child (`Child child, `Parent parent)) : Htmlrw_check.Error_code.t), 149 Some child, None) 150 else None 151 152let pattern_attr_not_allowed_on_element msg = 153 (* "Attribute "X" not allowed on element "Y"..." *) 154 let re = Str.regexp {|Attribute "\([^"]+\)" not allowed on element "\([^"]+\)"|} in 155 if Str.string_match re msg 0 then 156 let attr = Str.matched_group 1 msg in 157 let element = Str.matched_group 2 msg in 158 Some ((`Attr (`Not_allowed (`Attr attr, `Elem element)) : Htmlrw_check.Error_code.t), 159 Some element, Some attr) 160 else None 161 162let pattern_attr_not_allowed_here msg = 163 (* "Attribute "X" not allowed here." *) 164 let re = Str.regexp {|Attribute "\([^"]+\)" not allowed here|} in 165 if Str.string_match re msg 0 then 166 let attr = Str.matched_group 1 msg in 167 Some ((`Attr (`Not_allowed_here (`Attr attr)) : Htmlrw_check.Error_code.t), 168 None, Some attr) 169 else None 170 171let pattern_missing_required_attr msg = 172 (* "Element "X" is missing required attribute "Y"." *) 173 let re = Str.regexp {|Element "\([^"]+\)" is missing required attribute "\([^"]+\)"|} in 174 if Str.string_match re msg 0 then 175 let element = Str.matched_group 1 msg in 176 let attr = Str.matched_group 2 msg in 177 Some ((`Attr (`Missing (`Elem element, `Attr attr)) : Htmlrw_check.Error_code.t), 178 Some element, Some attr) 179 else None 180 181let pattern_missing_required_child msg = 182 (* "Element "X" is missing required child element "Y"." *) 183 let re = Str.regexp {|Element "\([^"]+\)" is missing required child element "\([^"]+\)"|} in 184 if Str.string_match re msg 0 then 185 let parent = Str.matched_group 1 msg in 186 let child = Str.matched_group 2 msg in 187 Some ((`Element (`Missing_child (`Parent parent, `Child child)) : Htmlrw_check.Error_code.t), 188 Some parent, None) 189 else None 190 191let pattern_duplicate_id msg = 192 (* "Duplicate ID "X"." *) 193 let re = Str.regexp {|Duplicate ID "\([^"]+\)"|} in 194 if Str.string_match re msg 0 then 195 let id = Str.matched_group 1 msg in 196 Some ((`Attr (`Duplicate_id (`Id id)) : Htmlrw_check.Error_code.t), 197 None, None) 198 else None 199 200let pattern_obsolete_element msg = 201 (* "The "X" element is obsolete." *) 202 let re = Str.regexp {|The "\([^"]+\)" element is obsolete|} in 203 if Str.string_match re msg 0 then 204 let element = Str.matched_group 1 msg in 205 Some ((`Element (`Obsolete (`Elem element, `Suggestion "")) : Htmlrw_check.Error_code.t), 206 Some element, None) 207 else None 208 209let pattern_obsolete_attr msg = 210 (* "The "X" attribute on the "Y" element is obsolete." *) 211 let re = Str.regexp {|The "\([^"]+\)" attribute on the "\([^"]+\)" element is obsolete|} in 212 if Str.string_match re msg 0 then 213 let attr = Str.matched_group 1 msg in 214 let element = Str.matched_group 2 msg in 215 Some ((`Element (`Obsolete_attr (`Elem element, `Attr attr, `Suggestion None)) : Htmlrw_check.Error_code.t), 216 Some element, Some attr) 217 else None 218 219let pattern_stray_end_tag msg = 220 (* "Stray end tag "X"." *) 221 let re = Str.regexp {|Stray end tag "\([^"]+\)"|} in 222 if Str.string_match re msg 0 then 223 let tag = Str.matched_group 1 msg in 224 Some ((`Tag (`Stray_end (`Tag tag)) : Htmlrw_check.Error_code.t), 225 Some tag, None) 226 else None 227 228let pattern_stray_start_tag msg = 229 (* "Stray start tag "X"." *) 230 let re = Str.regexp {|Stray start tag "\([^"]+\)"|} in 231 if Str.string_match re msg 0 then 232 let tag = Str.matched_group 1 msg in 233 Some ((`Tag (`Stray_start (`Tag tag)) : Htmlrw_check.Error_code.t), 234 Some tag, None) 235 else None 236 237let pattern_unnecessary_role msg = 238 (* "The "X" role is unnecessary for..." *) 239 let re = Str.regexp {|The "\([^"]+\)" role is unnecessary for \(.*\)|} in 240 if Str.string_match re msg 0 then 241 let role = Str.matched_group 1 msg in 242 let reason = Str.matched_group 2 msg in 243 Some ((`Aria (`Unnecessary_role (`Role role, `Elem "", `Reason reason)) : Htmlrw_check.Error_code.t), 244 None, None) 245 else None 246 247let pattern_bad_role msg = 248 (* "Bad value "X" for attribute "role" on element "Y"." *) 249 let re = Str.regexp {|Bad value "\([^"]+\)" for attribute "role" on element "\([^"]+\)"|} in 250 if Str.string_match re msg 0 then 251 let role = Str.matched_group 1 msg in 252 let element = Str.matched_group 2 msg in 253 Some ((`Aria (`Bad_role (`Elem element, `Role role)) : Htmlrw_check.Error_code.t), 254 Some element, Some "role") 255 else None 256 257let pattern_aria_must_not_be_specified msg = 258 (* "The "X" attribute must not be specified on any "Y" element unless..." *) 259 let re = Str.regexp {|The "\([^"]+\)" attribute must not be specified on any "\([^"]+\)" element unless \(.*\)|} in 260 if Str.string_match re msg 0 then 261 let attr = Str.matched_group 1 msg in 262 let element = Str.matched_group 2 msg in 263 let condition = Str.matched_group 3 msg in 264 Some ((`Aria (`Must_not_specify (`Attr attr, `Elem element, `Condition condition)) : Htmlrw_check.Error_code.t), 265 Some element, Some attr) 266 else None 267 268let pattern_aria_must_not_be_used msg = 269 (* "The "X" attribute must not be used on an "Y" element which has..." *) 270 let re = Str.regexp {|The "\([^"]+\)" attribute must not be used on an "\([^"]+\)" element which has \(.*\)|} in 271 if Str.string_match re msg 0 then 272 let attr = Str.matched_group 1 msg in 273 let element = Str.matched_group 2 msg in 274 let condition = Str.matched_group 3 msg in 275 Some ((`Aria (`Must_not_use (`Attr attr, `Elem element, `Condition condition)) : Htmlrw_check.Error_code.t), 276 Some element, Some attr) 277 else None 278 279let pattern_bad_attr_value msg = 280 (* "Bad value "X" for attribute "Y" on element "Z": ..." *) 281 let re = Str.regexp {|Bad value "\([^"]*\)" for attribute "\([^"]+\)" on element "\([^"]+\)"|} in 282 if Str.string_match re msg 0 then 283 let value = Str.matched_group 1 msg in 284 let attr = Str.matched_group 2 msg in 285 let element = Str.matched_group 3 msg in 286 (* Extract reason after the colon if present *) 287 let reason = 288 try 289 let colon_pos = String.index_from msg (Str.match_end ()) ':' in 290 String.trim (String.sub msg (colon_pos + 1) (String.length msg - colon_pos - 1)) 291 with Not_found -> "" 292 in 293 Some ((`Attr (`Bad_value (`Elem element, `Attr attr, `Value value, `Reason reason)) : Htmlrw_check.Error_code.t), 294 Some element, Some attr) 295 else None 296 297let pattern_end_tag_implied msg = 298 (* "End tag "X" implied, but there were open elements." *) 299 let re = Str.regexp {|End tag "\([^"]+\)" implied, but there were open elements|} in 300 if Str.string_match re msg 0 then 301 let tag = Str.matched_group 1 msg in 302 Some ((`Tag (`End_implied_open (`Tag tag)) : Htmlrw_check.Error_code.t), 303 Some tag, None) 304 else None 305 306let pattern_no_element_in_scope msg = 307 (* "No "X" element in scope but a "X" end tag seen." *) 308 let re = Str.regexp {|No "\([^"]+\)" element in scope but a "\([^"]+\)" end tag seen|} in 309 if Str.string_match re msg 0 then 310 let tag = Str.matched_group 1 msg in 311 Some ((`Tag (`Not_in_scope (`Tag tag)) : Htmlrw_check.Error_code.t), 312 Some tag, None) 313 else None 314 315let pattern_start_tag_in_table msg = 316 (* "Start tag "X" seen in "table"." *) 317 let re = Str.regexp {|Start tag "\([^"]+\)" seen in "table"|} in 318 if Str.string_match re msg 0 then 319 let tag = Str.matched_group 1 msg in 320 Some ((`Tag (`Start_in_table (`Tag tag)) : Htmlrw_check.Error_code.t), 321 Some tag, None) 322 else None 323 324(** All pattern matchers in priority order *) 325let patterns = [ 326 pattern_element_not_allowed; 327 pattern_attr_not_allowed_on_element; 328 pattern_attr_not_allowed_here; 329 pattern_missing_required_attr; 330 pattern_missing_required_child; 331 pattern_duplicate_id; 332 pattern_obsolete_element; 333 pattern_obsolete_attr; 334 pattern_stray_end_tag; 335 pattern_stray_start_tag; 336 pattern_unnecessary_role; 337 pattern_bad_role; 338 pattern_aria_must_not_be_specified; 339 pattern_aria_must_not_be_used; 340 pattern_bad_attr_value; 341 pattern_end_tag_implied; 342 pattern_no_element_in_scope; 343 pattern_start_tag_in_table; 344] 345 346(** Try to recognize the error code from a message *) 347let recognize_error_code msg = 348 let normalized = normalize_quotes msg in 349 let rec try_patterns = function 350 | [] -> (None, None, None) 351 | p :: rest -> 352 match p normalized with 353 | Some (code, elem, attr) -> (Some code, elem, attr) 354 | None -> try_patterns rest 355 in 356 try_patterns patterns 357 358(** Infer severity from message patterns *) 359let infer_severity msg = 360 let normalized = String.lowercase_ascii msg in 361 if String.sub normalized 0 (min 8 (String.length normalized)) = "consider" then 362 Some `Info 363 else if String.sub normalized 0 (min 3 (String.length normalized)) = "the" 364 && (try let _ = Str.search_forward (Str.regexp_string "is unnecessary") normalized 0 in true 365 with Not_found -> false) then 366 Some `Warning 367 else 368 Some `Error 369 370let parse message = 371 let (error_code, element, attribute) = recognize_error_code message in 372 let severity = infer_severity message in 373 { 374 message; 375 error_code; 376 line = None; 377 column = None; 378 element; 379 attribute; 380 severity; 381 } 382 383let parse_json_value ~get_string ~get_int ~message_field = 384 let message = match message_field with 385 | Some m -> m 386 | None -> match get_string "message" with Some m -> m | None -> "" 387 in 388 let base = parse message in 389 { base with 390 line = (match get_int "line" with Some l -> Some l | None -> base.line); 391 column = (match get_int "column" with Some c -> Some c | None -> base.column); 392 element = (match get_string "element" with Some e -> Some e | None -> base.element); 393 attribute = (match get_string "attribute" with Some a -> Some a | None -> base.attribute); 394 } 395 396(** Compare error codes for semantic equality *) 397let error_codes_match code1 code2 = 398 (* Use structural equality for all polymorphic variant error codes *) 399 code1 = code2 400 401let matches ~strictness ~expected ~actual = 402 let expected_norm = normalize_quotes expected.message in 403 let actual_norm = normalize_quotes actual.Htmlrw_check.text in 404 405 (* Check severity match *) 406 let severity_matches = 407 match (expected.severity, actual.Htmlrw_check.severity) with 408 | (None, _) -> true 409 | (Some `Error, Htmlrw_check.Error) -> true 410 | (Some `Warning, Htmlrw_check.Warning) -> true 411 | (Some `Info, Htmlrw_check.Info) -> true 412 | _ -> false 413 in 414 415 (* Check location match *) 416 let location_matches = 417 match (expected.line, expected.column, actual.Htmlrw_check.location) with 418 | (None, None, _) -> true 419 | (Some el, Some ec, Some loc) -> loc.line = el && loc.column = ec 420 | (Some el, None, Some loc) -> loc.line = el 421 | _ -> false 422 in 423 424 (* Check error code match *) 425 let code_matches = 426 match (expected.error_code, actual.Htmlrw_check.error_code) with 427 | (None, _) -> true (* No expected code to match *) 428 | (Some ec, Htmlrw_check.Conformance ac) -> error_codes_match ec ac 429 | (Some _, Htmlrw_check.Parse _) -> false (* Expected conformance but got parse error *) 430 in 431 432 (* Check message text *) 433 let exact_text_match = actual_norm = expected_norm in 434 (* Truncation-aware match: expected may have ellipsis where actual has full value *) 435 let truncation_match = truncation_aware_match expected.message actual.Htmlrw_check.text in 436 let substring_match = 437 try let _ = Str.search_forward (Str.regexp_string expected_norm) actual_norm 0 in true 438 with Not_found -> false 439 in 440 441 (* Determine match quality *) 442 if not severity_matches && strictness.require_severity then 443 Severity_mismatch 444 else if exact_text_match && code_matches && (location_matches || not strictness.require_location) then 445 Exact_match 446 else if code_matches && expected.error_code <> None then 447 Code_match 448 else if exact_text_match then 449 Message_match 450 else if truncation_match then 451 Message_match (* Treat truncation match same as message match *) 452 else if substring_match && not strictness.require_exact_message then 453 Substring_match 454 else 455 No_match 456 457let is_acceptable ~strictness quality = 458 match quality with 459 | Exact_match -> true 460 | Code_match -> not strictness.require_exact_message 461 | Message_match -> not strictness.require_error_code 462 | Substring_match -> not strictness.require_exact_message 463 | Severity_mismatch -> not strictness.require_severity 464 | No_match -> false 465 466let match_quality_to_string = function 467 | Exact_match -> "exact" 468 | Code_match -> "code" 469 | Message_match -> "message" 470 | Substring_match -> "substring" 471 | Severity_mismatch -> "severity-mismatch" 472 | No_match -> "no-match"