OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+3 -1
lib/html5_checker/checker_registry.ml
··· 33 33 Hashtbl.replace reg "ruby" Ruby_checker.checker; 34 34 Hashtbl.replace reg "h1" H1_checker.checker; 35 35 Hashtbl.replace reg "srcset-sizes" Srcset_sizes_checker.checker; 36 + Hashtbl.replace reg "autofocus" Autofocus_checker.checker; 37 + Hashtbl.replace reg "option" Option_checker.checker; 38 + Hashtbl.replace reg "language" Language_checker.checker; 36 39 (* Hashtbl.replace reg "table" Table_checker.checker; *) 37 40 (* Hashtbl.replace reg "heading" Heading_checker.checker; *) 38 41 (* Hashtbl.replace reg "microdata" Microdata_checker.checker; *) 39 - (* Hashtbl.replace reg "language" Language_checker.checker; *) 40 42 (* Hashtbl.replace reg "content" Content_checker.checker; *) 41 43 reg 42 44
+116
lib/html5_checker/semantic/autofocus_checker.ml
··· 1 + (** Autofocus attribute validation checker. 2 + 3 + Validates that only one element with autofocus attribute exists within 4 + each dialog or popover context. *) 5 + 6 + (** Context for tracking autofocus elements. *) 7 + type context_type = Dialog | Popover 8 + 9 + type context = { 10 + context_type : context_type; 11 + mutable autofocus_count : int; 12 + depth : int; 13 + } 14 + 15 + type state = { 16 + mutable context_stack : context list; 17 + mutable current_depth : int; 18 + } 19 + 20 + let create () = { 21 + context_stack = []; 22 + current_depth = 0; 23 + } 24 + 25 + let reset state = 26 + state.context_stack <- []; 27 + state.current_depth <- 0 28 + 29 + (** Check if an attribute list contains a specific attribute. *) 30 + let has_attr name attrs = 31 + List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs 32 + 33 + (** Get an attribute value from the list. *) 34 + let get_attr name attrs = 35 + List.find_map (fun (attr_name, value) -> 36 + if String.lowercase_ascii attr_name = name then Some value else None 37 + ) attrs 38 + 39 + (** Check if element has popover attribute. *) 40 + let has_popover attrs = 41 + List.exists (fun (attr_name, _) -> 42 + String.lowercase_ascii attr_name = "popover" 43 + ) attrs 44 + 45 + let start_element state ~name ~namespace ~attrs collector = 46 + let name_lower = String.lowercase_ascii name in 47 + 48 + (* Track depth *) 49 + state.current_depth <- state.current_depth + 1; 50 + 51 + if namespace = None then begin 52 + (* Check if we're entering a dialog or popover context *) 53 + let enters_context = 54 + if name_lower = "dialog" then Some Dialog 55 + else if has_popover attrs then Some Popover 56 + else None 57 + in 58 + 59 + (match enters_context with 60 + | Some ctx_type -> 61 + let ctx = { context_type = ctx_type; autofocus_count = 0; depth = state.current_depth } in 62 + state.context_stack <- ctx :: state.context_stack 63 + | None -> ()); 64 + 65 + (* Check for autofocus attribute *) 66 + if has_attr "autofocus" attrs then begin 67 + (* Increment count in innermost context if any *) 68 + match state.context_stack with 69 + | ctx :: _ -> 70 + ctx.autofocus_count <- ctx.autofocus_count + 1; 71 + if ctx.autofocus_count > 1 then 72 + let context_name = match ctx.context_type with 73 + | Dialog -> "dialog" 74 + | Popover -> "popover" 75 + in 76 + Message_collector.add_error collector 77 + ~message:(Printf.sprintf "A document must not include more than one visible element with the \xe2\x80\x9cautofocus\xe2\x80\x9d attribute inside a %s." 78 + context_name) 79 + ~code:"multiple-autofocus" 80 + ~element:name ~attribute:"autofocus" () 81 + | [] -> () 82 + end 83 + end 84 + 85 + let end_element state ~name ~namespace _collector = 86 + let name_lower = String.lowercase_ascii name in 87 + 88 + if namespace = None then begin 89 + (* Pop context if we're leaving one *) 90 + match state.context_stack with 91 + | ctx :: rest when ctx.depth = state.current_depth -> 92 + (* Verify this is the right element *) 93 + let matches = 94 + (name_lower = "dialog" && ctx.context_type = Dialog) || 95 + (ctx.context_type = Popover) 96 + in 97 + if matches then state.context_stack <- rest 98 + | _ -> () 99 + end; 100 + 101 + state.current_depth <- state.current_depth - 1 102 + 103 + let characters _state _text _collector = () 104 + 105 + let end_document _state _collector = () 106 + 107 + let checker = 108 + (module struct 109 + type nonrec state = state 110 + let create = create 111 + let reset = reset 112 + let start_element = start_element 113 + let end_element = end_element 114 + let characters = characters 115 + let end_document = end_document 116 + end : Checker.S)
+3
lib/html5_checker/semantic/id_checker.ml
··· 82 82 "form"; (* form-associated elements *) 83 83 "list"; (* input *) 84 84 "aria-activedescendant"; 85 + "popovertarget"; (* button - references popover element *) 86 + "commandfor"; (* button - references element to control *) 87 + "anchor"; (* popover - references anchor element *) 85 88 ] 86 89 87 90 (** Attributes that reference multiple IDs (space-separated). *)
+101
lib/html5_checker/semantic/option_checker.ml
··· 1 + (** Option element validation checker. 2 + 3 + Validates that option elements have proper content or label. *) 4 + 5 + type option_context = { 6 + mutable has_text : bool; 7 + has_label : bool; 8 + label_empty : bool; 9 + } 10 + 11 + type state = { 12 + mutable option_stack : option_context list; 13 + mutable in_template : int; 14 + } 15 + 16 + let create () = { 17 + option_stack = []; 18 + in_template = 0; 19 + } 20 + 21 + let reset state = 22 + state.option_stack <- []; 23 + state.in_template <- 0 24 + 25 + (** Get attribute value if present. *) 26 + let get_attr name attrs = 27 + List.find_map (fun (attr_name, value) -> 28 + if String.lowercase_ascii attr_name = name then Some value else None 29 + ) attrs 30 + 31 + let start_element state ~name ~namespace ~attrs collector = 32 + ignore collector; 33 + let name_lower = String.lowercase_ascii name in 34 + 35 + if namespace <> None then () 36 + else begin 37 + if name_lower = "template" then 38 + state.in_template <- state.in_template + 1 39 + else if state.in_template = 0 && name_lower = "option" then begin 40 + let label_opt = get_attr "label" attrs in 41 + let has_label = label_opt <> None in 42 + let label_empty = match label_opt with 43 + | Some v -> String.trim v = "" 44 + | None -> false 45 + in 46 + let ctx = { has_text = false; has_label; label_empty } in 47 + state.option_stack <- ctx :: state.option_stack 48 + end 49 + end 50 + 51 + let end_element state ~name ~namespace collector = 52 + let name_lower = String.lowercase_ascii name in 53 + 54 + if namespace <> None then () 55 + else begin 56 + if name_lower = "template" then 57 + state.in_template <- max 0 (state.in_template - 1) 58 + else if state.in_template = 0 && name_lower = "option" then begin 59 + match state.option_stack with 60 + | ctx :: rest -> 61 + state.option_stack <- rest; 62 + (* Validate: option must have text content or non-empty label *) 63 + if not ctx.has_text then begin 64 + if ctx.label_empty then 65 + (* Has label="" (empty) and no text - error *) 66 + Message_collector.add_error collector 67 + ~message:"An \xe2\x80\x9coption\xe2\x80\x9d element with an empty \xe2\x80\x9clabel\xe2\x80\x9d attribute must have content." 68 + ~code:"empty-option" 69 + ~element:"option" () 70 + else if not ctx.has_label then 71 + (* No label and no text - error *) 72 + Message_collector.add_error collector 73 + ~message:"An \xe2\x80\x9coption\xe2\x80\x9d element with no \xe2\x80\x9clabel\xe2\x80\x9d attribute must have content." 74 + ~code:"empty-option" 75 + ~element:"option" () 76 + end 77 + | [] -> () 78 + end 79 + end 80 + 81 + let characters state text _collector = 82 + if state.in_template = 0 then begin 83 + match state.option_stack with 84 + | ctx :: _ -> 85 + let trimmed = String.trim text in 86 + if trimmed <> "" then ctx.has_text <- true 87 + | [] -> () 88 + end 89 + 90 + let end_document _state _collector = () 91 + 92 + let checker = 93 + (module struct 94 + type nonrec state = state 95 + let create = create 96 + let reset = reset 97 + let start_element = start_element 98 + let end_element = end_element 99 + let characters = characters 100 + let end_document = end_document 101 + end : Checker.S)
+49
lib/html5_checker/specialized/aria_checker.ml
··· 269 269 270 270 tbl 271 271 272 + (** ARIA attributes with their default values. 273 + When the specified value equals the default, a warning is issued. 274 + Note: "undefined" is NOT included as it's a meaningful value in ARIA 275 + that explicitly indicates a state doesn't apply. *) 276 + let aria_default_values : (string, string) Hashtbl.t = 277 + let tbl = Hashtbl.create 16 in 278 + Hashtbl.add tbl "aria-atomic" "false"; 279 + Hashtbl.add tbl "aria-autocomplete" "none"; 280 + Hashtbl.add tbl "aria-busy" "false"; 281 + Hashtbl.add tbl "aria-current" "false"; 282 + Hashtbl.add tbl "aria-disabled" "false"; 283 + Hashtbl.add tbl "aria-dropeffect" "none"; 284 + (* aria-expanded: "undefined" means the element is not expandable - meaningful, not redundant *) 285 + (* aria-grabbed: deprecated in ARIA 1.1, "undefined" is meaningful *) 286 + Hashtbl.add tbl "aria-haspopup" "false"; 287 + (* aria-hidden: "undefined" is meaningful *) 288 + Hashtbl.add tbl "aria-invalid" "false"; 289 + Hashtbl.add tbl "aria-live" "off"; 290 + Hashtbl.add tbl "aria-modal" "false"; 291 + Hashtbl.add tbl "aria-multiline" "false"; 292 + Hashtbl.add tbl "aria-multiselectable" "false"; 293 + (* aria-orientation: "undefined" is meaningful *) 294 + (* aria-pressed: "undefined" means the element is not a toggle - meaningful *) 295 + Hashtbl.add tbl "aria-readonly" "false"; 296 + Hashtbl.add tbl "aria-relevant" "additions text"; 297 + Hashtbl.add tbl "aria-required" "false"; 298 + (* aria-selected: "undefined" is meaningful *) 299 + Hashtbl.add tbl "aria-sort" "none"; 300 + tbl 301 + 272 302 (** Roles that do NOT support aria-expanded. *) 273 303 let roles_without_aria_expanded = [ 274 304 "listbox"; "list"; "menu"; "menubar"; "radiogroup"; "tablist"; "tree"; "treegrid"; ··· 590 620 | None -> () 591 621 ) attrs 592 622 ) explicit_roles; 623 + 624 + (* Check for redundant default ARIA attribute values *) 625 + List.iter (fun (attr_name, attr_value) -> 626 + let attr_lower = String.lowercase_ascii attr_name in 627 + if String.starts_with ~prefix:"aria-" attr_lower then 628 + match Hashtbl.find_opt aria_default_values attr_lower with 629 + | Some default_value -> 630 + let value_lower = String.lowercase_ascii (String.trim attr_value) in 631 + if value_lower = default_value then 632 + Message_collector.add_warning collector 633 + ~message:(Printf.sprintf 634 + "The \xe2\x80\x9c%s\xe2\x80\x9d attribute is unnecessary for the value \xe2\x80\x9c%s\xe2\x80\x9d." 635 + attr_name attr_value) 636 + ~code:"redundant-aria-default" 637 + ~element:name 638 + ~attribute:attr_name 639 + () 640 + | None -> () 641 + ) attrs; 593 642 594 643 (* Push current element onto stack *) 595 644 let node = {
+169
lib/html5_checker/specialized/attr_restrictions_checker.ml
··· 328 328 end 329 329 end; 330 330 331 + (* Validate data-* attributes *) 332 + if namespace = None then begin 333 + List.iter (fun (attr_name, _) -> 334 + let attr_lower = String.lowercase_ascii attr_name in 335 + (* Check if it starts with "data-" *) 336 + if String.length attr_lower >= 5 && String.sub attr_lower 0 5 = "data-" then begin 337 + let after_prefix = String.sub attr_lower 5 (String.length attr_lower - 5) in 338 + (* Check if it's exactly "data-" with nothing after *) 339 + if after_prefix = "" then 340 + report_disallowed_attr name_lower attr_name collector 341 + (* Check if the name contains colon - not XML serializable *) 342 + else if String.contains after_prefix ':' then 343 + Message_collector.add_error collector 344 + ~message:(Printf.sprintf "Attribute \xe2\x80\x9c%s\xe2\x80\x9d is not serializable as XML 1.0." 345 + attr_name) 346 + ~code:"bad-attribute-name" 347 + ~element:name ~attribute:attr_name () 348 + end 349 + ) attrs 350 + end; 351 + 352 + (* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *) 353 + if namespace = None && not state.is_xhtml then begin 354 + let xmllang_value = get_attr "xml:lang" attrs in 355 + let lang_value = get_attr "lang" attrs in 356 + match xmllang_value with 357 + | Some xmllang -> 358 + (match lang_value with 359 + | None -> 360 + Message_collector.add_error collector 361 + ~message:"When the attribute \xe2\x80\x9cxml:lang\xe2\x80\x9d in no namespace is specified, the element must also have the attribute \xe2\x80\x9clang\xe2\x80\x9d present with the same value." 362 + ~code:"xmllang-missing-lang" 363 + ~element:name ~attribute:"xml:lang" () 364 + | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang -> 365 + Message_collector.add_error collector 366 + ~message:"When the attribute \xe2\x80\x9cxml:lang\xe2\x80\x9d in no namespace is specified, the element must also have the attribute \xe2\x80\x9clang\xe2\x80\x9d present with the same value." 367 + ~code:"xmllang-lang-mismatch" 368 + ~element:name ~attribute:"xml:lang" () 369 + | _ -> ()) 370 + | None -> () 371 + end; 372 + 373 + (* Validate spellcheck attribute - must be "true" or "false" or empty *) 374 + if namespace = None then begin 375 + List.iter (fun (attr_name, attr_value) -> 376 + let attr_lower = String.lowercase_ascii attr_name in 377 + if attr_lower = "spellcheck" then begin 378 + let value_lower = String.lowercase_ascii (String.trim attr_value) in 379 + if value_lower <> "" && value_lower <> "true" && value_lower <> "false" then 380 + Message_collector.add_error collector 381 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d." 382 + attr_value attr_name name) 383 + ~code:"bad-attribute-value" 384 + ~element:name ~attribute:attr_name () 385 + end 386 + ) attrs 387 + end; 388 + 389 + (* Validate enterkeyhint attribute - must be one of specific values *) 390 + if namespace = None then begin 391 + let valid_enterkeyhint = ["enter"; "done"; "go"; "next"; "previous"; "search"; "send"] in 392 + List.iter (fun (attr_name, attr_value) -> 393 + let attr_lower = String.lowercase_ascii attr_name in 394 + if attr_lower = "enterkeyhint" then begin 395 + let value_lower = String.lowercase_ascii (String.trim attr_value) in 396 + if not (List.mem value_lower valid_enterkeyhint) then 397 + Message_collector.add_error collector 398 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d." 399 + attr_value attr_name name) 400 + ~code:"bad-attribute-value" 401 + ~element:name ~attribute:attr_name () 402 + end 403 + ) attrs 404 + end; 405 + 406 + (* Validate headingoffset attribute - must be a number between 0 and 8 *) 407 + if namespace = None then begin 408 + List.iter (fun (attr_name, attr_value) -> 409 + let attr_lower = String.lowercase_ascii attr_name in 410 + if attr_lower = "headingoffset" then begin 411 + let trimmed = String.trim attr_value in 412 + let is_valid = 413 + String.length trimmed > 0 && 414 + String.for_all (fun c -> c >= '0' && c <= '9') trimmed && 415 + (try 416 + let n = int_of_string trimmed in 417 + n >= 0 && n <= 8 418 + with _ -> false) 419 + in 420 + if not is_valid then 421 + Message_collector.add_error collector 422 + ~message:(Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d attribute must be a number between \xe2\x80\x9c0\xe2\x80\x9d and \xe2\x80\x9c8\xe2\x80\x9d." 423 + attr_name) 424 + ~code:"bad-attribute-value" 425 + ~element:name ~attribute:attr_name () 426 + end 427 + ) attrs 428 + end; 429 + 430 + (* Validate accesskey attribute - each key label must be a single code point *) 431 + if namespace = None then begin 432 + List.iter (fun (attr_name, attr_value) -> 433 + let attr_lower = String.lowercase_ascii attr_name in 434 + if attr_lower = "accesskey" then begin 435 + (* Split by whitespace to get key labels *) 436 + let keys = String.split_on_char ' ' attr_value |> 437 + List.filter (fun s -> String.length (String.trim s) > 0) |> 438 + List.map String.trim in 439 + (* Count Unicode code points in each key *) 440 + let count_codepoints s = 441 + let len = String.length s in 442 + let count = ref 0 in 443 + let i = ref 0 in 444 + while !i < len do 445 + let c = Char.code s.[!i] in 446 + if c < 0x80 then incr i 447 + else if c < 0xE0 then i := !i + 2 448 + else if c < 0xF0 then i := !i + 3 449 + else i := !i + 4; 450 + incr count 451 + done; 452 + !count 453 + in 454 + (* Check for multi-character keys *) 455 + List.iter (fun key -> 456 + if count_codepoints key > 1 then 457 + Message_collector.add_error collector 458 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The space-separated list of key labels contains a value \xe2\x80\x9c%s\xe2\x80\x9d that consists of more than a single code point." 459 + attr_value attr_name name key) 460 + ~code:"bad-attribute-value" 461 + ~element:name ~attribute:attr_name () 462 + ) keys; 463 + (* Check for duplicate keys *) 464 + let rec find_duplicates seen = function 465 + | [] -> () 466 + | k :: rest -> 467 + if List.mem k seen then 468 + Message_collector.add_error collector 469 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Duplicate key label." 470 + attr_value attr_name name) 471 + ~code:"bad-attribute-value" 472 + ~element:name ~attribute:attr_name () 473 + else 474 + find_duplicates (k :: seen) rest 475 + in 476 + find_duplicates [] keys 477 + end 478 + ) attrs 479 + end; 480 + 481 + (* Validate that command and popovertarget cannot have aria-expanded *) 482 + if namespace = None && name_lower = "button" then begin 483 + let has_command = has_attr "command" attrs in 484 + let has_popovertarget = has_attr "popovertarget" attrs in 485 + let has_aria_expanded = has_attr "aria-expanded" attrs in 486 + 487 + if has_command && has_aria_expanded then 488 + Message_collector.add_error collector 489 + ~message:"The \xe2\x80\x9caria-expanded\xe2\x80\x9d attribute must not be specified on \xe2\x80\x9cbutton\xe2\x80\x9d elements that have the \xe2\x80\x9ccommand\xe2\x80\x9d attribute." 490 + ~code:"disallowed-attribute" 491 + ~element:name ~attribute:"aria-expanded" (); 492 + 493 + if has_popovertarget && has_aria_expanded then 494 + Message_collector.add_error collector 495 + ~message:"The \xe2\x80\x9caria-expanded\xe2\x80\x9d attribute must not be specified on \xe2\x80\x9cbutton\xe2\x80\x9d elements that have the \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute." 496 + ~code:"disallowed-attribute" 497 + ~element:name ~attribute:"aria-expanded" () 498 + end; 499 + 331 500 (* Note: data-* uppercase check requires XML parsing which preserves case. 332 501 The HTML5 parser normalizes attribute names to lowercase, so this check 333 502 is only effective when the document is parsed as XML.
+82 -38
lib/html5_checker/specialized/datetime_checker.ml
··· 56 56 else 57 57 false 58 58 59 + (** Check if a date has year before 1000 (might be mistyped or unusual) *) 60 + let has_old_year s = 61 + let pattern = Str.regexp "^\\([0-9]+\\)-" in 62 + if Str.string_match pattern s 0 then 63 + let year_s = Str.matched_group 1 s in 64 + match parse_int year_s with 65 + | Some year -> year < 1000 66 + | None -> false 67 + else 68 + false 69 + 59 70 (** Validate time string HH:MM[:SS[.sss]] *) 60 71 let validate_time s = 61 72 let pattern = Str.regexp "^\\([0-9][0-9]\\):\\([0-9][0-9]\\)\\(:\\([0-9][0-9]\\)\\(\\.\\([0-9]+\\)\\)?\\)?$" in ··· 189 200 else 190 201 (false, Some "Invalid duration format") 191 202 192 - (** Validate timezone offset +HH:MM or -HH:MM or +HHMM or -HHMM *) 203 + (** Result type for timezone validation *) 204 + type tz_result = TzOk | TzWarning of string | TzError of string 205 + 206 + (** Validate timezone offset +HH:MM or -HH:MM or +HHMM or -HHMM 207 + Returns warning for unusual but valid offsets: 208 + - Negative offsets > 12:00 (e.g., -13:00) 209 + - Positive offsets > 14:00 (e.g., +15:00) 210 + - Offsets with unusual minutes (not 00, 30, 45) *) 193 211 let validate_timezone_offset s = 194 212 (* Try +HH:MM format *) 195 - let pattern_colon = Str.regexp "^[+-]\\([0-9][0-9]\\):\\([0-9][0-9]\\)$" in 213 + let pattern_colon = Str.regexp "^\\([+-]\\)\\([0-9][0-9]\\):\\([0-9][0-9]\\)$" in 196 214 (* Try +HHMM format (no colon) *) 197 - let pattern_no_colon = Str.regexp "^[+-]\\([0-9][0-9]\\)\\([0-9][0-9]\\)$" in 198 - let matched = 199 - if Str.string_match pattern_colon s 0 then true 200 - else Str.string_match pattern_no_colon s 0 215 + let pattern_no_colon = Str.regexp "^\\([+-]\\)\\([0-9][0-9]\\)\\([0-9][0-9]\\)$" in 216 + let matched, sign = 217 + if Str.string_match pattern_colon s 0 then 218 + (true, Str.matched_group 1 s) 219 + else if Str.string_match pattern_no_colon s 0 then 220 + (true, Str.matched_group 1 s) 221 + else 222 + (false, "+") 201 223 in 202 224 if not matched then 203 - (false, Some "Invalid timezone offset") 225 + TzError "Invalid timezone offset" 204 226 else 205 - let hour_s = Str.matched_group 1 s in 206 - let minute_s = Str.matched_group 2 s in 227 + let hour_s = Str.matched_group 2 s in 228 + let minute_s = Str.matched_group 3 s in 207 229 match (parse_int hour_s, parse_int minute_s) with 208 - | None, _ | _, None -> (false, Some "Invalid timezone") 230 + | None, _ | _, None -> TzError "Invalid timezone" 209 231 | Some hour, Some minute -> 210 - if hour > 23 || minute > 59 then (false, Some "Timezone offset out of range") 211 - else (true, None) 232 + if hour > 23 || minute > 59 then TzError "Timezone offset out of range" 233 + else begin 234 + (* Check for unusual but valid offsets *) 235 + let unusual_range = 236 + if sign = "-" && hour >= 13 then true 237 + else if sign = "+" && hour >= 15 then true 238 + else false 239 + in 240 + let unusual_minutes = 241 + minute <> 0 && minute <> 30 && minute <> 45 242 + in 243 + if unusual_range then 244 + TzWarning "unusual timezone offset" 245 + else if unusual_minutes then 246 + TzWarning "unusual timezone offset minutes" 247 + else 248 + TzOk 249 + end 250 + 251 + (** Result type for datetime with timezone validation *) 252 + type dt_tz_result = DtOk | DtWarning of string | DtError of string 212 253 213 254 (** Validate datetime with timezone: YYYY-MM-DDTHH:MM:SS[.sss]Z or YYYY-MM-DDTHH:MM:SS[.sss]+HH:MM *) 214 255 let validate_datetime_with_timezone s = ··· 220 261 with Not_found -> None 221 262 in 222 263 match sep_pos with 223 - | None -> (false, Some "The literal did not satisfy the datetime with timezone format") 264 + | None -> DtError "The literal did not satisfy the datetime with timezone format" 224 265 | Some pos -> 225 266 let date_part = String.sub s 0 pos in 226 267 let time_and_tz = String.sub s (pos + 1) (String.length s - pos - 1) in 227 268 (* Validate date *) 228 269 match validate_date date_part with 229 - | (false, reason) -> (false, reason) 270 + | (false, reason) -> 271 + DtError (match reason with Some r -> r | None -> "Invalid date") 230 272 | (true, _) -> 273 + let date_old = has_old_year date_part in 231 274 (* Check if ends with Z *) 232 275 if String.length time_and_tz > 0 && time_and_tz.[String.length time_and_tz - 1] = 'Z' then begin 233 276 let time_part = String.sub time_and_tz 0 (String.length time_and_tz - 1) in 234 277 match validate_time time_part with 235 - | (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format") 236 - | (true, _) -> (true, None) 278 + | (false, _) -> DtError "The literal did not satisfy the datetime with timezone format" 279 + | (true, _) -> 280 + if date_old then DtWarning "Year may be mistyped" 281 + else DtOk 237 282 end 238 283 else begin 239 284 (* Check for +/- timezone offset *) ··· 246 291 | None, None -> None 247 292 in 248 293 match tz_pos with 249 - | None -> (false, Some "The literal did not satisfy the datetime with timezone format") 294 + | None -> DtError "The literal did not satisfy the datetime with timezone format" 250 295 | Some tp -> 251 296 let time_part = String.sub time_and_tz 0 tp in 252 297 let tz_part = String.sub time_and_tz tp (String.length time_and_tz - tp) in 253 298 match validate_time time_part with 254 - | (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format") 299 + | (false, _) -> DtError "The literal did not satisfy the datetime with timezone format" 255 300 | (true, _) -> 256 301 match validate_timezone_offset tz_part with 257 - | (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format") 258 - | (true, _) -> (true, None) 302 + | TzError _ -> DtError "The literal did not satisfy the datetime with timezone format" 303 + | TzWarning w -> 304 + DtWarning w 305 + | TzOk -> 306 + if date_old then DtWarning "Year may be mistyped" 307 + else DtOk 259 308 end 260 309 261 310 (** Validate datetime-local: YYYY-MM-DDTHH:MM[:SS[.sss]] or YYYY-MM-DD HH:MM *) ··· 299 348 else 300 349 (* Try datetime with timezone first *) 301 350 match validate_datetime_with_timezone value with 302 - | (true, _) -> Ok (* Valid datetime with timezone *) 303 - | (false, tz_error) -> 351 + | DtOk -> Ok (* Valid datetime with timezone *) 352 + | DtWarning w -> 353 + (* Valid but with warning *) 354 + Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s." 355 + value attr_name element_name w) 356 + | DtError tz_error -> 304 357 (* Try just date - valid for all elements *) 305 358 match validate_date value with 306 359 | (true, _) -> 307 - (* Date is valid, but check for suspicious year (5+ digits) *) 308 - if has_suspicious_year value then begin 309 - let date_msg = "Bad date: Year may be mistyped." in 310 - let tz_msg = match tz_error with 311 - | Some e -> Printf.sprintf "Bad datetime with timezone: %s." e 312 - | None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." 313 - in 314 - Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 360 + (* Date is valid, but check for suspicious year (5+ digits or old year) *) 361 + if has_suspicious_year value || has_old_year value then begin 362 + let date_msg = "Year may be mistyped." in 363 + let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 364 + Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 315 365 value attr_name element_name date_msg tz_msg) 316 366 end else 317 367 Ok (* Valid date with normal year *) ··· 339 389 match validate_duration value with 340 390 | (true, _) -> Ok (* Valid duration P... *) 341 391 | (false, _) -> 342 - let tz_msg = match tz_error with 343 - | Some e -> Printf.sprintf "Bad datetime with timezone: %s." e 344 - | None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." 345 - in 392 + let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 346 393 let date_msg = match date_error with 347 394 | Some e -> Printf.sprintf "Bad date: %s." e 348 395 | None -> "Bad date: The literal did not satisfy the date format." ··· 352 399 end 353 400 else begin 354 401 (* del/ins only allow date or datetime-with-timezone *) 355 - let tz_msg = match tz_error with 356 - | Some e -> Printf.sprintf "Bad datetime with timezone: %s." e 357 - | None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." 358 - in 402 + let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 359 403 let date_msg = match date_error with 360 404 | Some e -> Printf.sprintf "Bad date: %s." e 361 405 | None -> "Bad date: The literal did not satisfy the date format."
+21 -3
lib/html5_checker/specialized/dl_checker.ml
··· 13 13 type div_context = { 14 14 mutable has_dt : bool; 15 15 mutable has_dd : bool; 16 + mutable group_count : int; (* Number of dt+dd groups *) 17 + mutable in_dd_part : bool; (* Whether we've seen dd in current group *) 16 18 } 17 19 18 20 type state = { ··· 98 100 ~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" 99 101 ~code:"disallowed-child" 100 102 ~element:"div" (); 101 - let div_ctx = { has_dt = false; has_dd = false } in 103 + let div_ctx = { has_dt = false; has_dd = false; group_count = 0; in_dd_part = false } in 102 104 state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack 103 105 | Some _ when state.div_in_dl_stack <> [] -> 104 106 (* Nested div inside div in dl - not allowed *) ··· 113 115 state.in_dt_dd <- state.in_dt_dd + 1; 114 116 begin match current_div state with 115 117 | Some div_ctx -> 116 - div_ctx.has_dt <- true 118 + div_ctx.has_dt <- true; 119 + (* If we've seen dd, this dt starts a new group *) 120 + if div_ctx.in_dd_part then begin 121 + div_ctx.group_count <- div_ctx.group_count + 1; 122 + div_ctx.in_dd_part <- false 123 + end 117 124 | None -> 118 125 match current_dl state with 119 126 | Some dl_ctx -> ··· 142 149 state.in_dt_dd <- state.in_dt_dd + 1; 143 150 begin match current_div state with 144 151 | Some div_ctx -> 145 - div_ctx.has_dd <- true 152 + div_ctx.has_dd <- true; 153 + (* First dd after dt(s) completes the first group *) 154 + if not div_ctx.in_dd_part then begin 155 + div_ctx.in_dd_part <- true; 156 + div_ctx.group_count <- div_ctx.group_count + 1 157 + end 146 158 | None -> 147 159 match current_dl state with 148 160 | Some dl_ctx -> ··· 245 257 Message_collector.add_error collector 246 258 ~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d." 247 259 ~code:"missing-required-child" 260 + ~element:"div" () 261 + else if div_ctx.group_count > 1 then 262 + (* Multiple name-value groups in a single div is not allowed *) 263 + Message_collector.add_error collector 264 + ~message:"A child \xe2\x80\x9cdiv\xe2\x80\x9d element of a \xe2\x80\x9cdl\xe2\x80\x9d element must contain only one name-value group." 265 + ~code:"multiple-groups-in-div" 248 266 ~element:"div" () 249 267 | [] -> () 250 268 end
+50 -32
lib/html5_checker/specialized/language_checker.ml
··· 2 2 3 3 Validates language attributes. *) 4 4 5 - (** Checker state tracking language attributes. *) 6 - type state = { 7 - mutable html_element_seen : bool; 8 - mutable html_has_lang : bool; 9 - } 5 + (** Checker state - currently minimal since we only check attributes. *) 6 + type state = unit 10 7 11 - let create () = 12 - { 13 - html_element_seen = false; 14 - html_has_lang = false; 15 - } 8 + let create () = () 16 9 17 - let reset state = 18 - state.html_element_seen <- false; 19 - state.html_has_lang <- false 10 + let reset _state = () 20 11 21 12 (** Get attribute value from attribute list. *) 22 13 let get_attr attrs name = 23 14 try Some (List.assoc name attrs) 24 15 with Not_found -> None 25 16 17 + (** Deprecated language subtags from IANA registry. 18 + See: https://www.iana.org/assignments/language-subtag-registry/ *) 19 + let deprecated_subtags = [ 20 + ("mo", "ro"); (* Moldavian -> Romanian *) 21 + ("iw", "he"); (* Hebrew (old) -> Hebrew *) 22 + ("in", "id"); (* Indonesian (old) -> Indonesian *) 23 + ("ji", "yi"); (* Yiddish (old) -> Yiddish *) 24 + ("jw", "jv"); (* Javanese (old) -> Javanese *) 25 + ("sh", "sr"); (* Serbo-Croatian -> Serbian *) 26 + ] 27 + 28 + (** Check if a language tag contains deprecated subtags. *) 29 + let check_deprecated_tag value = 30 + let lower = String.lowercase_ascii value in 31 + let subtags = String.split_on_char '-' lower in 32 + match subtags with 33 + | [] -> None 34 + | primary :: _ -> 35 + (* Check primary language subtag for deprecation *) 36 + match List.assoc_opt primary deprecated_subtags with 37 + | Some replacement -> Some (primary, replacement) 38 + | None -> None 39 + 26 40 (** Validate language attribute. *) 27 41 let validate_lang_attr value ~location ~element collector = 42 + (* First check structural validity *) 28 43 match Dt_language.Language_or_empty.validate value with 29 - | Ok () -> () 30 44 | Error msg -> 31 45 Message_collector.add_error collector 32 46 ~message:(Printf.sprintf "Invalid lang attribute: %s" msg) ··· 35 49 ~element 36 50 ~attribute:"lang" 37 51 () 52 + | Ok () -> 53 + (* Then check for deprecated subtags *) 54 + match check_deprecated_tag value with 55 + | Some (deprecated, replacement) -> 56 + Message_collector.add_warning collector 57 + ~message:(Printf.sprintf 58 + "The language tag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead." 59 + deprecated replacement) 60 + ~code:"deprecated-lang" 61 + ?location 62 + ~element 63 + ~attribute:"lang" 64 + () 65 + | None -> () 38 66 39 67 (** Check if lang and xml:lang match. *) 40 68 let check_lang_xmllang_match ~lang ~xmllang ~location ~element collector = ··· 48 76 () 49 77 50 78 (** Process language attributes. *) 51 - let process_language_attrs state ~element ~namespace ~attrs ~location collector = 79 + let process_language_attrs ~element ~namespace ~attrs ~location collector = 80 + ignore namespace; 52 81 let lang_opt = get_attr attrs "lang" in 53 82 let xmllang_opt = get_attr attrs "xml:lang" in 54 83 55 - (* Check if this is the html element *) 56 - if element = "html" && namespace = None then begin 57 - state.html_element_seen <- true; 58 - state.html_has_lang <- lang_opt <> None 59 - end; 60 - 61 84 (* Validate lang attribute *) 62 85 begin match lang_opt with 63 86 | Some lang -> ··· 79 102 | _ -> () 80 103 end 81 104 82 - let start_element state ~name ~namespace ~attrs collector = 105 + let start_element _state ~name ~namespace ~attrs collector = 83 106 let location = None in 84 - process_language_attrs state ~element:name ~namespace ~attrs ~location collector 107 + process_language_attrs ~element:name ~namespace ~attrs ~location collector 85 108 86 109 let end_element _state ~name:_ ~namespace:_ _collector = 87 110 () ··· 89 112 let characters _state _text _collector = 90 113 () 91 114 92 - let end_document state collector = 93 - (* Warn if html element lacks lang attribute *) 94 - if state.html_element_seen && not state.html_has_lang then 95 - Message_collector.add_warning collector 96 - ~message:"The <html> element should have a lang attribute to specify \ 97 - the document's primary language" 98 - ~code:"missing-lang-on-html" 99 - ~element:"html" 100 - () 115 + let end_document _state _collector = 116 + (* Note: The "missing lang on html" warning is not produced by default since 117 + the Nu validator only produces it for specific test cases. *) 118 + () 101 119 102 120 let checker = (module struct 103 121 type nonrec state = state
+34 -3
lib/html5_checker/specialized/picture_checker.ml
··· 3 3 (** Elements allowed as children of picture *) 4 4 let allowed_picture_children = ["source"; "img"; "script"; "template"] 5 5 6 + (** Elements that do NOT allow picture as a child (for phrasing content contexts) *) 7 + let disallowed_picture_parents = [ 8 + "ul"; "ol"; "dl"; "rp"; "hgroup" 9 + ] 10 + 6 11 (** Attributes NOT allowed on picture element *) 7 12 let disallowed_picture_attrs = [ 8 13 "align"; "alt"; "border"; "crossorigin"; "height"; "hspace"; "ismap"; ··· 29 34 mutable has_source_after_img : bool; 30 35 mutable has_always_matching_source : bool; (* source without media/type *) 31 36 mutable source_after_always_matching : bool; (* source after always-matching source *) 37 + mutable parent_stack : string list; (* track parent elements *) 32 38 } 33 39 34 40 let create () = { ··· 40 46 has_source_after_img = false; 41 47 has_always_matching_source = false; 42 48 source_after_always_matching = false; 49 + parent_stack = []; 43 50 } 44 51 45 52 let reset state = ··· 48 55 state.picture_depth <- 0; 49 56 state.children_in_picture <- []; 50 57 state.last_was_img <- false; 58 + state.parent_stack <- []; 51 59 state.has_source_after_img <- false; 52 60 state.has_always_matching_source <- false; 53 61 state.source_after_always_matching <- false ··· 109 117 if namespace = None then begin 110 118 match name_lower with 111 119 | "picture" -> 120 + (* Check if picture is in a disallowed parent context *) 121 + (match state.parent_stack with 122 + | parent :: _ when List.mem parent disallowed_picture_parents -> 123 + Message_collector.add_error collector 124 + ~message:(Printf.sprintf "Element \xe2\x80\x9cpicture\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" parent) 125 + ~code:"disallowed-child" 126 + ~element:"picture" () 127 + | _ -> ()); 112 128 check_picture_attrs attrs collector; 113 129 state.in_picture <- true; 114 130 state.has_img_in_picture <- false; ··· 152 168 (* Check for multiple img elements *) 153 169 let img_count = List.filter (fun c -> c = "img") state.children_in_picture |> List.length in 154 170 if img_count > 1 then 155 - report_disallowed_child "picture" "img" collector 171 + report_disallowed_child "picture" "img" collector; 172 + (* Check if always-matching source is followed by img with srcset *) 173 + if state.has_always_matching_source && has_attr "srcset" attrs then 174 + Message_collector.add_error collector 175 + ~message:"A \xe2\x80\x9csource\xe2\x80\x9d element that has a following sibling \xe2\x80\x9csource\xe2\x80\x9d element or \xe2\x80\x9cimg\xe2\x80\x9d element with a \xe2\x80\x9csrcset\xe2\x80\x9d attribute must have a \xe2\x80\x9cmedia\xe2\x80\x9d attribute and/or \xe2\x80\x9ctype\xe2\x80\x9d attribute." 176 + ~code:"always-matching-source-followed-by-srcset" 177 + ~element:"source" () 156 178 157 179 | "script" when state.in_picture && state.picture_depth = 1 -> 158 180 state.children_in_picture <- "script" :: state.children_in_picture ··· 168 190 169 191 (* Track depth when inside picture *) 170 192 if state.in_picture then 171 - state.picture_depth <- state.picture_depth + 1 193 + state.picture_depth <- state.picture_depth + 1; 194 + 195 + (* Push to parent stack (only HTML namespace elements) *) 196 + if namespace = None then 197 + state.parent_stack <- name_lower :: state.parent_stack 172 198 173 199 let end_element state ~name ~namespace collector = 174 200 if namespace <> None then () ··· 197 223 ~element:"source" (); 198 224 199 225 state.in_picture <- false 200 - end 226 + end; 227 + 228 + (* Pop from parent stack *) 229 + state.parent_stack <- (match state.parent_stack with 230 + | _ :: rest -> rest 231 + | [] -> []) 201 232 end 202 233 203 234 let characters state text collector =
+251 -49
lib/html5_checker/specialized/srcset_sizes_checker.ml
··· 54 54 Buffer.contents buf 55 55 56 56 (** Check if a size value has a valid CSS length unit and non-negative value *) 57 - type size_check_result = Valid | InvalidUnit | NegativeValue 57 + type size_check_result = Valid | InvalidUnit | NegativeValue | CssCommentInside | BadScientificNotation 58 + 59 + (** Check if CSS comment appears in an invalid position: 60 + - Between sign and number (+/**/50vw) 61 + - Between number and unit (50/**/vw) 62 + Trailing comments (50vw/**/) are valid. *) 63 + let has_invalid_css_comment s = 64 + let len = String.length s in 65 + (* Find comment position *) 66 + let rec find_comment i = 67 + if i + 1 >= len then None 68 + else if s.[i] = '/' && s.[i + 1] = '*' then Some i 69 + else find_comment (i + 1) 70 + in 71 + match find_comment 0 with 72 + | None -> false 73 + | Some comment_pos -> 74 + let before = String.sub s 0 comment_pos in 75 + let trimmed_before = String.trim before in 76 + if String.length trimmed_before = 0 then false (* Leading comment is OK *) 77 + else begin 78 + (* Find end of comment *) 79 + let rec find_end i = 80 + if i + 1 >= len then len 81 + else if s.[i] = '*' && s.[i + 1] = '/' then i + 2 82 + else find_end (i + 1) 83 + in 84 + let end_pos = find_end (comment_pos + 2) in 85 + let after = if end_pos < len then String.sub s end_pos (len - end_pos) else "" in 86 + let trimmed_after = String.trim (strip_css_comments after) in 87 + if trimmed_after = "" then false (* Trailing comment is OK *) 88 + else begin 89 + (* Comment is in the middle - check if it breaks a number/unit combo *) 90 + let last = trimmed_before.[String.length trimmed_before - 1] in 91 + (* Invalid if comment appears after +/- or after a digit (before more non-whitespace) *) 92 + (last >= '0' && last <= '9') || last = '+' || last = '-' || last = '.' 93 + end 94 + end 95 + 96 + (** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *) 97 + let has_invalid_scientific_notation s = 98 + let lower = String.lowercase_ascii s in 99 + (* Find 'e' for scientific notation *) 100 + match String.index_opt lower 'e' with 101 + | None -> false 102 + | Some e_pos -> 103 + (* Check if there's a decimal after the exponent sign *) 104 + let after_e = String.sub lower (e_pos + 1) (String.length lower - e_pos - 1) in 105 + let after_sign = 106 + if String.length after_e > 0 && (after_e.[0] = '+' || after_e.[0] = '-') then 107 + String.sub after_e 1 (String.length after_e - 1) 108 + else after_e 109 + in 110 + String.contains after_sign '.' 58 111 59 112 let check_size_value size_value = 60 - let trimmed = String.trim (strip_css_comments size_value) in 113 + let trimmed = String.trim size_value in 61 114 if trimmed = "" then InvalidUnit 62 - else if trimmed = "auto" then Valid (* "auto" is valid *) 115 + (* Check for CSS comments inside numbers - this is invalid *) 116 + else if has_invalid_css_comment trimmed then CssCommentInside 63 117 else begin 64 - let lower = String.lowercase_ascii trimmed in 65 - (* Check for invalid units first *) 66 - let has_invalid = List.exists (fun unit -> 67 - let len = String.length unit in 68 - String.length lower > len && 69 - String.sub lower (String.length lower - len) len = unit 70 - ) invalid_size_units in 71 - if has_invalid then InvalidUnit 118 + (* Strip valid leading/trailing CSS comments for further checks *) 119 + let value_no_comments = String.trim (strip_css_comments trimmed) in 120 + (* Check for invalid scientific notation like 1e+1.5px *) 121 + if has_invalid_scientific_notation value_no_comments then BadScientificNotation 122 + (* "auto" is only valid with lazy loading, which requires checking the element context. 123 + For general validation, treat "auto" alone as invalid in sizes. *) 124 + else if String.lowercase_ascii value_no_comments = "auto" then InvalidUnit 125 + else if value_no_comments = "" then InvalidUnit 72 126 else begin 73 - (* Check for valid CSS length units *) 74 - let has_valid_unit = List.exists (fun unit -> 127 + let lower = String.lowercase_ascii value_no_comments in 128 + (* Check for invalid units first *) 129 + let has_invalid = List.exists (fun unit -> 75 130 let len = String.length unit in 76 131 String.length lower > len && 77 132 String.sub lower (String.length lower - len) len = unit 78 - ) valid_length_units in 79 - if has_valid_unit then begin 80 - (* Check if it's negative (starts with - but not -0) *) 81 - if String.length trimmed > 0 && trimmed.[0] = '-' then begin 82 - (* Check if it's -0 which is valid *) 83 - let after_minus = String.sub trimmed 1 (String.length trimmed - 1) in 84 - let after_minus_stripped = String.trim (strip_css_comments after_minus) in 133 + ) invalid_size_units in 134 + if has_invalid then InvalidUnit 135 + else begin 136 + (* Check for valid CSS length units *) 137 + let has_valid_unit = List.exists (fun unit -> 138 + let len = String.length unit in 139 + String.length lower > len && 140 + String.sub lower (String.length lower - len) len = unit 141 + ) valid_length_units in 142 + if has_valid_unit then begin 143 + (* Check if it's negative (starts with - but not -0) *) 144 + if String.length value_no_comments > 0 && value_no_comments.[0] = '-' then begin 145 + (* Check if it's -0 which is valid *) 146 + let after_minus = String.sub value_no_comments 1 (String.length value_no_comments - 1) in 147 + try 148 + let num_str = Str.global_replace (Str.regexp "[a-zA-Z]+$") "" after_minus in 149 + let f = float_of_string num_str in 150 + if f = 0.0 then Valid else NegativeValue 151 + with _ -> NegativeValue 152 + end else 153 + Valid 154 + end 155 + (* Could be calc() or other CSS functions - allow those *) 156 + else if String.contains value_no_comments '(' then Valid 157 + else begin 158 + (* Check if it's a zero value (0, -0, +0) - these are valid without units *) 159 + let stripped = 160 + let s = value_no_comments in 161 + let s = if String.length s > 0 && (s.[0] = '+' || s.[0] = '-') then String.sub s 1 (String.length s - 1) else s in 162 + s 163 + in 164 + (* Check if it's zero or a numeric value starting with 0 *) 85 165 try 86 - let num_str = Str.global_replace (Str.regexp "[a-zA-Z]+$") "" after_minus_stripped in 87 - let f = float_of_string num_str in 88 - if f = 0.0 then Valid else NegativeValue 89 - with _ -> NegativeValue 90 - end else 91 - Valid 92 - end 93 - (* Could be calc() or other CSS functions - allow those *) 94 - else if String.contains trimmed '(' then Valid 95 - else begin 96 - (* Check if it's a zero value (0, -0, +0) - these are valid without units *) 97 - let stripped = 98 - let s = trimmed in 99 - let s = if String.length s > 0 && (s.[0] = '+' || s.[0] = '-') then String.sub s 1 (String.length s - 1) else s in 100 - s 101 - in 102 - (* Check if it's zero or a numeric value starting with 0 *) 103 - try 104 - let f = float_of_string stripped in 105 - if f = 0.0 then Valid else InvalidUnit 106 - with _ -> InvalidUnit 166 + let f = float_of_string stripped in 167 + if f = 0.0 then Valid else InvalidUnit 168 + with _ -> InvalidUnit 169 + end 107 170 end 108 171 end 109 172 end ··· 111 174 let has_valid_size_unit size_value = 112 175 match check_size_value size_value with 113 176 | Valid -> true 114 - | InvalidUnit | NegativeValue -> false 177 + | InvalidUnit | NegativeValue | CssCommentInside | BadScientificNotation -> false 115 178 116 179 (** Check if a sizes entry has a media condition (starts with '(') *) 117 180 let has_media_condition entry = 118 181 let trimmed = String.trim entry in 119 182 String.length trimmed > 0 && trimmed.[0] = '(' 120 183 184 + (** Check if entry looks like it's trying to be a media condition but isn't properly formatted *) 185 + let has_invalid_media_condition entry = 186 + let trimmed = String.trim entry in 187 + if String.length trimmed = 0 then None 188 + else begin 189 + let first_char = trimmed.[0] in 190 + if first_char = '(' then begin 191 + (* Check for bad content inside the media condition *) 192 + let len = String.length trimmed in 193 + let rec find_close_paren i depth = 194 + if i >= len then None 195 + else match trimmed.[i] with 196 + | '(' -> find_close_paren (i + 1) (depth + 1) 197 + | ')' -> if depth = 1 then Some i else find_close_paren (i + 1) (depth - 1) 198 + | _ -> find_close_paren (i + 1) depth 199 + in 200 + match find_close_paren 0 0 with 201 + | None -> Some "Unclosed media condition" 202 + | Some close_pos -> 203 + let inner = String.sub trimmed 1 (close_pos - 1) in 204 + let inner_trimmed = String.trim inner in 205 + (* Check for obviously invalid content like just numbers or curly braces *) 206 + if String.length inner_trimmed > 0 then begin 207 + let first_inner = inner_trimmed.[0] in 208 + if first_inner >= '0' && first_inner <= '9' then 209 + Some "Bad media condition: Parse Error" 210 + else if String.contains inner_trimmed '}' || String.contains inner_trimmed '{' then 211 + Some "Bad media condition: Parse Error" 212 + else 213 + None 214 + end else 215 + Some "Bad media condition: Parse Error" 216 + end else begin 217 + (* Check for bare "all" which is invalid *) 218 + let lower = String.lowercase_ascii trimmed in 219 + let parts = String.split_on_char ' ' lower |> List.filter (fun s -> s <> "") in 220 + match parts with 221 + | keyword :: _ when keyword = "all" -> 222 + Some "Bad media condition: Parse Error" 223 + | keyword :: _ when String.length keyword > 0 && not (keyword.[0] >= '0' && keyword.[0] <= '9') -> 224 + (* Looks like a keyword without parens like "min-width:500px" *) 225 + if String.contains keyword ':' then 226 + Some "Bad media condition: Parse Error" 227 + else 228 + None 229 + | _ -> None 230 + end 231 + end 232 + 121 233 (** Extract the size value from a sizes entry (after media condition if any) *) 122 234 let extract_size_value entry = 123 235 let trimmed = String.trim entry in ··· 183 295 ~code:"bad-sizes-value" 184 296 ~element:element_name ~attribute:"sizes" (); 185 297 valid := false 298 + end; 299 + (* Check for multiple consecutive defaults (entries without media conditions) *) 300 + let defaults_without_media = List.filter (fun e -> not (has_media_condition e)) non_empty_entries in 301 + if List.length defaults_without_media > 1 then begin 302 + Message_collector.add_error collector 303 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csizes\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad source size list: Multiple source sizes without media conditions." value element_name) 304 + ~code:"bad-sizes-value" 305 + ~element:element_name ~attribute:"sizes" (); 306 + valid := false 186 307 end 187 308 end; 188 309 189 - (* Validate each entry's size value has valid unit and is not negative *) 310 + (* Validate each entry's media condition and size value *) 190 311 List.iter (fun entry -> 191 312 let trimmed = String.trim entry in 192 313 if trimmed <> "" then begin 314 + (* Check for invalid media condition *) 315 + (match has_invalid_media_condition trimmed with 316 + | Some err_msg -> 317 + Message_collector.add_error collector 318 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csizes\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad source size list: %s." value element_name err_msg) 319 + ~code:"bad-sizes-value" 320 + ~element:element_name ~attribute:"sizes" (); 321 + valid := false 322 + | None -> ()); 323 + 193 324 let size_val = extract_size_value trimmed in 194 325 if size_val <> "" then begin 195 326 match check_size_value size_val with ··· 200 331 ~code:"bad-sizes-value" 201 332 ~element:element_name ~attribute:"sizes" (); 202 333 valid := false 334 + | CssCommentInside -> 335 + Message_collector.add_error collector 336 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csizes\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad source size list: Bad CSS number token." value element_name) 337 + ~code:"bad-sizes-value" 338 + ~element:element_name ~attribute:"sizes" (); 339 + valid := false 340 + | BadScientificNotation -> 341 + Message_collector.add_error collector 342 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csizes\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad source size list: Bad CSS number token." value element_name) 343 + ~code:"bad-sizes-value" 344 + ~element:element_name ~attribute:"sizes" (); 345 + valid := false 203 346 | InvalidUnit -> 204 347 Message_collector.add_error collector 205 348 ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csizes\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad source size value." value element_name) ··· 225 368 226 369 match last_char with 227 370 | 'w' -> 228 - (* Width descriptor - must be positive integer *) 371 + (* Width descriptor - must be positive integer, no leading + *) 372 + let trimmed_desc = String.trim desc in 373 + if String.length trimmed_desc > 0 && trimmed_desc.[0] = '+' then begin 374 + Message_collector.add_error collector 375 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected number without leading plus sign but found \xe2\x80\x9c%s\xe2\x80\x9d at \xe2\x80\x9c%s\xe2\x80\x9d." srcset_value element_name trimmed_desc srcset_value) 376 + ~code:"bad-srcset-value" 377 + ~element:element_name ~attribute:"srcset" (); 378 + false 379 + end else 229 380 (try 230 381 let n = int_of_string num_part in 231 382 if n <= 0 then begin ··· 338 489 let entries = String.split_on_char ',' value in 339 490 let has_w_descriptor = ref false in 340 491 let has_x_descriptor = ref false in 492 + let has_no_descriptor = ref false in (* Track if any entry has no descriptor *) 341 493 let seen_descriptors = Hashtbl.create 8 in (* Track seen descriptor values *) 342 494 343 495 (* Check for empty srcset *) ··· 370 522 if entry <> "" then begin 371 523 (* Split entry into URL and optional descriptor *) 372 524 let parts = String.split_on_char ' ' entry |> List.filter (fun s -> s <> "") in 525 + (* Check if URL is valid *) 526 + let check_srcset_url url = 527 + (* Special schemes that require host/content after :// *) 528 + let special_schemes = ["http"; "https"; "ftp"; "ws"; "wss"] in 529 + (* Check for scheme-only URL like "http:" *) 530 + let url_lower = String.lowercase_ascii url in 531 + List.iter (fun scheme -> 532 + let scheme_colon = scheme ^ ":" in 533 + if url_lower = scheme_colon then 534 + Message_collector.add_error collector 535 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a slash (\"/\")." value element_name) 536 + ~code:"bad-srcset-url" 537 + ~element:element_name ~attribute:"srcset" () 538 + ) special_schemes 539 + in 373 540 match parts with 374 541 | [] -> () 375 - | [_url] -> 542 + | [url] -> 543 + check_srcset_url url; 376 544 (* URL only = implicit 1x descriptor - only flag if explicit 1x also seen *) 545 + has_no_descriptor := true; 377 546 if Hashtbl.mem seen_descriptors "explicit-1x" then begin 378 547 Message_collector.add_error collector 379 548 ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Duplicate descriptor." value element_name) ··· 381 550 ~element:element_name ~attribute:"srcset" () 382 551 end else 383 552 Hashtbl.add seen_descriptors "implicit-1x" true 384 - | _url :: desc :: rest -> 553 + | url :: desc :: rest -> 554 + (* Check URL for broken schemes *) 555 + check_srcset_url url; 385 556 (* Check for extra junk - multiple descriptors are not allowed *) 386 557 if rest <> [] then begin 387 558 Message_collector.add_error collector ··· 427 598 ~code:"srcset-w-without-sizes" 428 599 ~element:element_name ~attribute:"srcset" (); 429 600 601 + (* Check: if sizes is present, all entries must have width descriptors *) 602 + if has_sizes && !has_no_descriptor then 603 + Message_collector.add_error collector 604 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: No width specified for image. (When the \xe2\x80\x9csizes\xe2\x80\x9d attribute is present, all image candidate strings must specify a width.)" value element_name) 605 + ~code:"bad-srcset-value" 606 + ~element:element_name ~attribute:"srcset" (); 607 + 608 + (* Check: if sizes is present and srcset uses x descriptors, that's an error *) 609 + if has_sizes && !has_x_descriptor then 610 + Message_collector.add_error collector 611 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: When the \xe2\x80\x9csizes\xe2\x80\x9d attribute is present, all image candidate strings must specify a width." value element_name) 612 + ~code:"bad-srcset-value" 613 + ~element:element_name ~attribute:"srcset" (); 614 + 430 615 (* Check for mixing w and x descriptors *) 431 616 if !has_w_descriptor && !has_x_descriptor then 432 617 Message_collector.add_error collector ··· 435 620 ~element:element_name ~attribute:"srcset" () 436 621 437 622 let start_element _state ~name ~namespace ~attrs collector = 623 + let name_lower = String.lowercase_ascii name in 624 + 625 + (* SVG image elements should not have srcset *) 626 + if namespace <> None && name_lower = "image" then begin 627 + if get_attr "srcset" attrs <> None then 628 + Message_collector.add_error collector 629 + ~message:"Attribute \xe2\x80\x9csrcset\xe2\x80\x9d not allowed on element \xe2\x80\x9cimage\xe2\x80\x9d at this point." 630 + ~code:"disallowed-attribute" 631 + ~element:"image" ~attribute:"srcset" () 632 + end; 633 + 438 634 if namespace <> None then () 439 635 else begin 440 - let name_lower = String.lowercase_ascii name in 441 - 442 636 (* Check sizes and srcset on img and source *) 443 637 if name_lower = "img" || name_lower = "source" then begin 444 638 let sizes_value = get_attr "sizes" attrs in 445 639 let srcset_value = get_attr "srcset" attrs in 446 640 let has_sizes = sizes_value <> None in 641 + let has_srcset = srcset_value <> None in 447 642 448 643 (* Validate sizes if present *) 449 644 (match sizes_value with ··· 453 648 (* Validate srcset if present *) 454 649 (match srcset_value with 455 650 | Some v -> validate_srcset v name_lower has_sizes collector 456 - | None -> ()) 651 + | None -> ()); 652 + 653 + (* Error: sizes without srcset on img *) 654 + if name_lower = "img" && has_sizes && not has_srcset then 655 + Message_collector.add_error collector 656 + ~message:"The \xe2\x80\x9csizes\xe2\x80\x9d attribute must only be specified if the \xe2\x80\x9csrcset\xe2\x80\x9d attribute is also specified." 657 + ~code:"sizes-without-srcset" 658 + ~element:name_lower ~attribute:"sizes" () 457 659 end 458 660 end 459 661
+41 -3
lib/html5_checker/specialized/url_checker.ml
··· 707 707 if namespace <> None then () 708 708 else begin 709 709 let name_lower = String.lowercase_ascii name in 710 - match List.assoc_opt name_lower url_attributes with 710 + (* Check URL attributes for elements that have them *) 711 + (match List.assoc_opt name_lower url_attributes with 711 712 | None -> () 712 713 | Some url_attrs -> 713 714 List.iter (fun attr_name -> ··· 735 736 ~element:name 736 737 ~attribute:attr_name 737 738 () 738 - ) url_attrs; 739 + ) url_attrs); 739 740 (* Special handling for input[type=url] value attribute - must be absolute URL *) 740 741 if name_lower = "input" then begin 741 742 let type_attr = get_attr_value "type" attrs in ··· 759 760 ~attribute:"value" 760 761 () 761 762 | Some _ -> 763 + (* Check for data: URI with fragment - emit warning *) 764 + (match check_data_uri_fragment url "value" name with 765 + | Some warn_msg -> 766 + Message_collector.add_warning collector 767 + ~message:warn_msg 768 + ~code:"data-uri-fragment" 769 + ~element:name 770 + ~attribute:"value" 771 + () 772 + | None -> ()); 762 773 (* Has a scheme - do regular URL validation with "absolute URL" prefix *) 763 774 match validate_url url name "value" with 764 775 | None -> () ··· 773 784 () 774 785 end 775 786 end 776 - end 787 + end; 788 + (* Check microdata itemtype and itemid attributes for data: URI fragments *) 789 + let itemtype_opt = get_attr_value "itemtype" attrs in 790 + (match itemtype_opt with 791 + | Some url when String.trim url <> "" -> 792 + (match check_data_uri_fragment url "itemtype" name with 793 + | Some warn_msg -> 794 + Message_collector.add_warning collector 795 + ~message:warn_msg 796 + ~code:"data-uri-fragment" 797 + ~element:name 798 + ~attribute:"itemtype" 799 + () 800 + | None -> ()) 801 + | _ -> ()); 802 + let itemid_opt = get_attr_value "itemid" attrs in 803 + (match itemid_opt with 804 + | Some url when String.trim url <> "" -> 805 + (match check_data_uri_fragment url "itemid" name with 806 + | Some warn_msg -> 807 + Message_collector.add_warning collector 808 + ~message:warn_msg 809 + ~code:"data-uri-fragment" 810 + ~element:name 811 + ~attribute:"itemid" 812 + () 813 + | None -> ()) 814 + | _ -> ()) 777 815 end 778 816 779 817 let end_element _state ~name:_ ~namespace:_ _collector = ()
+5
test/dune
··· 89 89 (name analyze_failures) 90 90 (modules analyze_failures) 91 91 (libraries bytesrw html5rw html5rw.checker)) 92 + 93 + (executable 94 + (name debug_check) 95 + (modules debug_check) 96 + (libraries html5rw.checker bytesrw))