OCaml HTML5 parser/serialiser based on Python's JustHTML

fix

+5 -4
lib/html5_checker/content_model/content_checker.ml
··· 57 57 | Content_model.Nothing -> false 58 58 | Content_model.Text -> true 59 59 | Content_model.Transparent -> true (* Inherits parent *) 60 - | Content_model.Categories _ -> false (* Elements only *) 60 + | Content_model.Categories cats -> 61 + (* Phrasing and Flow content include text *) 62 + List.mem Content_category.Phrasing cats || List.mem Content_category.Flow cats 61 63 | Content_model.Elements _ -> false (* Specific elements only *) 62 64 | Content_model.Mixed _ -> true (* Text + elements *) 63 65 | Content_model.One_or_more model -> allows_text model ··· 96 98 97 99 match spec_opt with 98 100 | None -> 99 - (* Unknown element - emit warning *) 100 - Message_collector.add_typed collector 101 - (Error_code.Unknown_element { name }) 101 + (* Unknown element - first check if it's allowed in current context *) 102 + validate_child_element state name collector 102 103 | Some spec -> 103 104 (* Check prohibited ancestors *) 104 105 check_prohibited_ancestors state name spec collector;
+23 -2
lib/html5_checker/error_code.ml
··· 135 135 (** The "id" attribute on a "map" element must have the same value as the "name" attribute. *) 136 136 | List_attr_requires_datalist 137 137 (** The "list" attribute of "input" must refer to a "datalist" element. *) 138 + | Input_list_not_allowed 139 + (** Attribute "list" is only allowed on certain input types. *) 138 140 | Label_too_many_labelable 139 141 (** The "label" element may contain at most one labelable descendant. *) 140 142 | Label_for_id_mismatch ··· 151 153 (** Element "summary" is missing required attribute "role". *) 152 154 | Summary_missing_attrs 153 155 (** Element "summary" is missing one or more of [aria-checked, aria-level, role]. *) 156 + | Summary_role_not_allowed 157 + (** The "role" attribute must not be used on any "summary" for its parent "details". *) 154 158 | Autocomplete_webauthn_on_select 155 159 (** The value of "autocomplete" for "select" must not contain "webauthn". *) 156 160 | Commandfor_invalid_target ··· 234 238 | Importmap_scopes_values_not_object 235 239 (** The value of "scopes" property values must also be JSON objects. *) 236 240 | Importmap_scopes_invalid_url 237 - (** The "scopes" property must only contain valid URL values. *) 241 + (** The "scopes" property keys must be valid URL strings. *) 242 + | Importmap_scopes_value_invalid_url 243 + (** The specifier map within "scopes" must only contain valid URL values. *) 238 244 239 245 (* ===== Style Element ===== *) 240 246 | Style_type_invalid ··· 270 276 | Unnecessary_role _ -> Warning 271 277 | Aria_should_not_be_used _ -> Warning 272 278 | Unknown_element _ -> Warning 279 + | Not_nfc _ -> Warning 273 280 | _ -> Error 274 281 275 282 (** Get a short code string for categorization *) ··· 333 340 | Picture_missing_img -> "picture-missing-img" 334 341 | Map_id_name_mismatch -> "map-id-name" 335 342 | List_attr_requires_datalist -> "list-datalist" 343 + | Input_list_not_allowed -> "list-not-allowed" 336 344 | Label_too_many_labelable -> "label-multiple" 337 345 | Label_for_id_mismatch -> "label-for-mismatch" 338 346 | Role_on_label_ancestor -> "role-on-label" ··· 341 349 | Input_value_constraint _ -> "input-value" 342 350 | Summary_missing_role -> "summary-role" 343 351 | Summary_missing_attrs -> "summary-attrs" 352 + | Summary_role_not_allowed -> "summary-role" 344 353 | Autocomplete_webauthn_on_select -> "autocomplete" 345 354 | Commandfor_invalid_target -> "commandfor" 346 355 | Forbidden_codepoint _ -> "forbidden-codepoint" ··· 377 386 | Importmap_scopes_not_object -> "importmap" 378 387 | Importmap_scopes_values_not_object -> "importmap" 379 388 | Importmap_scopes_invalid_url -> "importmap" 389 + | Importmap_scopes_value_invalid_url -> "importmap" 380 390 | Style_type_invalid -> "style-type" 381 391 | Headingoffset_invalid -> "headingoffset" 382 392 | Media_empty -> "media-empty" ··· 410 420 | Bad_attr_value_generic { message } -> message 411 421 | Duplicate_id { id } -> 412 422 Printf.sprintf "Duplicate ID %s." (q id) 413 - | Data_attr_invalid_name { reason } -> reason 423 + | Data_attr_invalid_name { reason } -> 424 + Printf.sprintf "%s attribute names %s." (q "data-*") reason 414 425 | Data_attr_uppercase -> 415 426 Printf.sprintf "%s attributes must not have characters from the range %s\xe2\x80\xa6%s in the name." 416 427 (q "data-*") (q "A") (q "Z") ··· 566 577 | List_attr_requires_datalist -> 567 578 Printf.sprintf "The %s attribute of the %s element must refer to a %s element." 568 579 (q "list") (q "input") (q "datalist") 580 + | Input_list_not_allowed -> 581 + Printf.sprintf "Attribute %s is only allowed when the input type is %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, or %s." 582 + (q "list") (q "color") (q "date") (q "datetime-local") (q "email") (q "month") 583 + (q "number") (q "range") (q "search") (q "tel") (q "text") (q "time") (q "url") (q "week") 569 584 | Label_too_many_labelable -> 570 585 Printf.sprintf "The %s element may contain at most one %s, %s, %s, %s, %s, %s, or %s descendant." 571 586 (q "label") (q "button") (q "input") (q "meter") (q "output") (q "progress") (q "select") (q "textarea") ··· 588 603 | Summary_missing_attrs -> 589 604 Printf.sprintf "Element %s is missing one or more of the following attributes: [aria-checked, aria-level, role]." 590 605 (q "summary") 606 + | Summary_role_not_allowed -> 607 + Printf.sprintf "The %s attribute must not be used on any %s element that is a summary for its parent %s element." 608 + (q "role") (q "summary") (q "details") 591 609 | Autocomplete_webauthn_on_select -> 592 610 Printf.sprintf "The value of the %s attribute for the %s element must not contain %s." 593 611 (q "autocomplete") (q "select") (q "webauthn") ··· 692 710 Printf.sprintf "The value of the %s property within the content of a %s element with a %s attribute whose value is %s must be a JSON object whose values are also JSON objects." 693 711 (q "scopes") (q "script") (q "type") (q "importmap") 694 712 | Importmap_scopes_invalid_url -> 713 + Printf.sprintf "The value of the %s property within the content of a %s element with a %s attribute whose value is %s must be a JSON object whose keys are valid URL strings." 714 + (q "scopes") (q "script") (q "type") (q "importmap") 715 + | Importmap_scopes_value_invalid_url -> 695 716 Printf.sprintf "A specifier map defined in a %s property within the content of a %s element with a %s attribute whose value is %s must only contain valid URL values." 696 717 (q "scopes") (q "script") (q "type") (q "importmap") 697 718
+3
lib/html5_checker/error_code.mli
··· 76 76 | Picture_missing_img 77 77 | Map_id_name_mismatch 78 78 | List_attr_requires_datalist 79 + | Input_list_not_allowed 79 80 | Label_too_many_labelable 80 81 | Label_for_id_mismatch 81 82 | Role_on_label_ancestor ··· 84 85 | Input_value_constraint of { constraint_type: string } 85 86 | Summary_missing_role 86 87 | Summary_missing_attrs 88 + | Summary_role_not_allowed 87 89 | Autocomplete_webauthn_on_select 88 90 | Commandfor_invalid_target 89 91 ··· 132 134 | Importmap_scopes_not_object 133 135 | Importmap_scopes_values_not_object 134 136 | Importmap_scopes_invalid_url 137 + | Importmap_scopes_value_invalid_url 135 138 136 139 (* Style Element *) 137 140 | Style_type_invalid
+14 -8
lib/html5_checker/specialized/aria_checker.ml
··· 691 691 if name_lower = "summary" then begin 692 692 let parent = get_parent_element state in 693 693 let is_in_details = parent = Some "details" in 694 + let has_role_attr = List.exists (fun (k, _) -> String.lowercase_ascii k = "role") attrs in 695 + let has_aria_expanded = List.assoc_opt "aria-expanded" attrs <> None in 696 + let has_aria_pressed = List.assoc_opt "aria-pressed" attrs <> None in 694 697 if is_in_details then begin 695 698 (* summary that is the first child of details *) 696 - (* Cannot have role=paragraph (or other non-button roles) *) 697 - if explicit_roles <> [] then begin 698 - let first_role = List.hd explicit_roles in 699 - if first_role <> "button" && first_role <> "none" && first_role <> "presentation" then 700 - Message_collector.add_typed collector Error_code.Summary_missing_role 701 - end; 699 + if has_role_attr then 700 + (* Must not have role attribute *) 701 + Message_collector.add_typed collector Error_code.Summary_role_not_allowed 702 + else if has_aria_pressed then 703 + (* aria-pressed without role requires role *) 704 + Message_collector.add_typed collector Error_code.Summary_missing_role 705 + else if has_aria_expanded then 706 + (* aria-expanded without role requires role *) 707 + Message_collector.add_typed collector Error_code.Summary_missing_attrs 708 + end else begin 709 + (* summary NOT in details context - different rules apply *) 702 710 (* If has aria-expanded or aria-pressed, must have role *) 703 - let has_aria_expanded = List.assoc_opt "aria-expanded" attrs <> None in 704 - let has_aria_pressed = List.assoc_opt "aria-pressed" attrs <> None in 705 711 if (has_aria_expanded || has_aria_pressed) && explicit_roles = [] then begin 706 712 if has_aria_pressed then 707 713 Message_collector.add_typed collector Error_code.Summary_missing_role
+4 -2
lib/html5_checker/specialized/attr_restrictions_checker.ml
··· 287 287 | None -> "text" (* default type is text *) 288 288 in 289 289 if not (List.mem input_type input_types_allowing_list) then 290 - Message_collector.add_typed collector Error_code.List_attr_requires_datalist 290 + Message_collector.add_typed collector Error_code.Input_list_not_allowed 291 291 end 292 292 end; 293 293 ··· 317 317 | Some xmllang -> 318 318 (match lang_value with 319 319 | None -> 320 + (* xml:lang without lang attribute *) 320 321 Message_collector.add_typed collector Error_code.Xml_lang_without_lang 321 322 | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang -> 322 - Message_collector.add_typed collector Error_code.Xml_lang_lang_mismatch 323 + (* xml:lang and lang have different values - "lang present with same value" message *) 324 + Message_collector.add_typed collector Error_code.Xml_lang_without_lang 323 325 | _ -> ()) 324 326 | None -> () 325 327 end;
+1 -1
lib/html5_checker/specialized/importmap_checker.ml
··· 292 292 | ForbiddenProperty _ -> Error_code.Importmap_invalid_root 293 293 | SlashKeyWithoutSlashValue _ -> Error_code.Importmap_key_trailing_slash 294 294 | InvalidScopeKey -> Error_code.Importmap_scopes_invalid_url 295 - | InvalidScopeValue _ -> Error_code.Importmap_scopes_invalid_url 295 + | InvalidScopeValue _ -> Error_code.Importmap_scopes_value_invalid_url 296 296 | ScopeValueNotObject -> Error_code.Importmap_scopes_values_not_object 297 297 298 298 let end_element state ~name ~namespace collector =