OCaml HTML5 parser/serialiser based on Python's JustHTML

fix

+5 -4
lib/html5_checker/content_model/content_checker.ml
··· 57 | Content_model.Nothing -> false 58 | Content_model.Text -> true 59 | Content_model.Transparent -> true (* Inherits parent *) 60 - | Content_model.Categories _ -> false (* Elements only *) 61 | Content_model.Elements _ -> false (* Specific elements only *) 62 | Content_model.Mixed _ -> true (* Text + elements *) 63 | Content_model.One_or_more model -> allows_text model ··· 96 97 match spec_opt with 98 | None -> 99 - (* Unknown element - emit warning *) 100 - Message_collector.add_typed collector 101 - (Error_code.Unknown_element { name }) 102 | Some spec -> 103 (* Check prohibited ancestors *) 104 check_prohibited_ancestors state name spec collector;
··· 57 | Content_model.Nothing -> false 58 | Content_model.Text -> true 59 | Content_model.Transparent -> true (* Inherits parent *) 60 + | Content_model.Categories cats -> 61 + (* Phrasing and Flow content include text *) 62 + List.mem Content_category.Phrasing cats || List.mem Content_category.Flow cats 63 | Content_model.Elements _ -> false (* Specific elements only *) 64 | Content_model.Mixed _ -> true (* Text + elements *) 65 | Content_model.One_or_more model -> allows_text model ··· 98 99 match spec_opt with 100 | None -> 101 + (* Unknown element - first check if it's allowed in current context *) 102 + validate_child_element state name collector 103 | Some spec -> 104 (* Check prohibited ancestors *) 105 check_prohibited_ancestors state name spec collector;
+23 -2
lib/html5_checker/error_code.ml
··· 135 (** The "id" attribute on a "map" element must have the same value as the "name" attribute. *) 136 | List_attr_requires_datalist 137 (** The "list" attribute of "input" must refer to a "datalist" element. *) 138 | Label_too_many_labelable 139 (** The "label" element may contain at most one labelable descendant. *) 140 | Label_for_id_mismatch ··· 151 (** Element "summary" is missing required attribute "role". *) 152 | Summary_missing_attrs 153 (** Element "summary" is missing one or more of [aria-checked, aria-level, role]. *) 154 | Autocomplete_webauthn_on_select 155 (** The value of "autocomplete" for "select" must not contain "webauthn". *) 156 | Commandfor_invalid_target ··· 234 | Importmap_scopes_values_not_object 235 (** The value of "scopes" property values must also be JSON objects. *) 236 | Importmap_scopes_invalid_url 237 - (** The "scopes" property must only contain valid URL values. *) 238 239 (* ===== Style Element ===== *) 240 | Style_type_invalid ··· 270 | Unnecessary_role _ -> Warning 271 | Aria_should_not_be_used _ -> Warning 272 | Unknown_element _ -> Warning 273 | _ -> Error 274 275 (** Get a short code string for categorization *) ··· 333 | Picture_missing_img -> "picture-missing-img" 334 | Map_id_name_mismatch -> "map-id-name" 335 | List_attr_requires_datalist -> "list-datalist" 336 | Label_too_many_labelable -> "label-multiple" 337 | Label_for_id_mismatch -> "label-for-mismatch" 338 | Role_on_label_ancestor -> "role-on-label" ··· 341 | Input_value_constraint _ -> "input-value" 342 | Summary_missing_role -> "summary-role" 343 | Summary_missing_attrs -> "summary-attrs" 344 | Autocomplete_webauthn_on_select -> "autocomplete" 345 | Commandfor_invalid_target -> "commandfor" 346 | Forbidden_codepoint _ -> "forbidden-codepoint" ··· 377 | Importmap_scopes_not_object -> "importmap" 378 | Importmap_scopes_values_not_object -> "importmap" 379 | Importmap_scopes_invalid_url -> "importmap" 380 | Style_type_invalid -> "style-type" 381 | Headingoffset_invalid -> "headingoffset" 382 | Media_empty -> "media-empty" ··· 410 | Bad_attr_value_generic { message } -> message 411 | Duplicate_id { id } -> 412 Printf.sprintf "Duplicate ID %s." (q id) 413 - | Data_attr_invalid_name { reason } -> reason 414 | Data_attr_uppercase -> 415 Printf.sprintf "%s attributes must not have characters from the range %s\xe2\x80\xa6%s in the name." 416 (q "data-*") (q "A") (q "Z") ··· 566 | List_attr_requires_datalist -> 567 Printf.sprintf "The %s attribute of the %s element must refer to a %s element." 568 (q "list") (q "input") (q "datalist") 569 | Label_too_many_labelable -> 570 Printf.sprintf "The %s element may contain at most one %s, %s, %s, %s, %s, %s, or %s descendant." 571 (q "label") (q "button") (q "input") (q "meter") (q "output") (q "progress") (q "select") (q "textarea") ··· 588 | Summary_missing_attrs -> 589 Printf.sprintf "Element %s is missing one or more of the following attributes: [aria-checked, aria-level, role]." 590 (q "summary") 591 | Autocomplete_webauthn_on_select -> 592 Printf.sprintf "The value of the %s attribute for the %s element must not contain %s." 593 (q "autocomplete") (q "select") (q "webauthn") ··· 692 Printf.sprintf "The value of the %s property within the content of a %s element with a %s attribute whose value is %s must be a JSON object whose values are also JSON objects." 693 (q "scopes") (q "script") (q "type") (q "importmap") 694 | Importmap_scopes_invalid_url -> 695 Printf.sprintf "A specifier map defined in a %s property within the content of a %s element with a %s attribute whose value is %s must only contain valid URL values." 696 (q "scopes") (q "script") (q "type") (q "importmap") 697
··· 135 (** The "id" attribute on a "map" element must have the same value as the "name" attribute. *) 136 | List_attr_requires_datalist 137 (** The "list" attribute of "input" must refer to a "datalist" element. *) 138 + | Input_list_not_allowed 139 + (** Attribute "list" is only allowed on certain input types. *) 140 | Label_too_many_labelable 141 (** The "label" element may contain at most one labelable descendant. *) 142 | Label_for_id_mismatch ··· 153 (** Element "summary" is missing required attribute "role". *) 154 | Summary_missing_attrs 155 (** Element "summary" is missing one or more of [aria-checked, aria-level, role]. *) 156 + | Summary_role_not_allowed 157 + (** The "role" attribute must not be used on any "summary" for its parent "details". *) 158 | Autocomplete_webauthn_on_select 159 (** The value of "autocomplete" for "select" must not contain "webauthn". *) 160 | Commandfor_invalid_target ··· 238 | Importmap_scopes_values_not_object 239 (** The value of "scopes" property values must also be JSON objects. *) 240 | Importmap_scopes_invalid_url 241 + (** The "scopes" property keys must be valid URL strings. *) 242 + | Importmap_scopes_value_invalid_url 243 + (** The specifier map within "scopes" must only contain valid URL values. *) 244 245 (* ===== Style Element ===== *) 246 | Style_type_invalid ··· 276 | Unnecessary_role _ -> Warning 277 | Aria_should_not_be_used _ -> Warning 278 | Unknown_element _ -> Warning 279 + | Not_nfc _ -> Warning 280 | _ -> Error 281 282 (** Get a short code string for categorization *) ··· 340 | Picture_missing_img -> "picture-missing-img" 341 | Map_id_name_mismatch -> "map-id-name" 342 | List_attr_requires_datalist -> "list-datalist" 343 + | Input_list_not_allowed -> "list-not-allowed" 344 | Label_too_many_labelable -> "label-multiple" 345 | Label_for_id_mismatch -> "label-for-mismatch" 346 | Role_on_label_ancestor -> "role-on-label" ··· 349 | Input_value_constraint _ -> "input-value" 350 | Summary_missing_role -> "summary-role" 351 | Summary_missing_attrs -> "summary-attrs" 352 + | Summary_role_not_allowed -> "summary-role" 353 | Autocomplete_webauthn_on_select -> "autocomplete" 354 | Commandfor_invalid_target -> "commandfor" 355 | Forbidden_codepoint _ -> "forbidden-codepoint" ··· 386 | Importmap_scopes_not_object -> "importmap" 387 | Importmap_scopes_values_not_object -> "importmap" 388 | Importmap_scopes_invalid_url -> "importmap" 389 + | Importmap_scopes_value_invalid_url -> "importmap" 390 | Style_type_invalid -> "style-type" 391 | Headingoffset_invalid -> "headingoffset" 392 | Media_empty -> "media-empty" ··· 420 | Bad_attr_value_generic { message } -> message 421 | Duplicate_id { id } -> 422 Printf.sprintf "Duplicate ID %s." (q id) 423 + | Data_attr_invalid_name { reason } -> 424 + Printf.sprintf "%s attribute names %s." (q "data-*") reason 425 | Data_attr_uppercase -> 426 Printf.sprintf "%s attributes must not have characters from the range %s\xe2\x80\xa6%s in the name." 427 (q "data-*") (q "A") (q "Z") ··· 577 | List_attr_requires_datalist -> 578 Printf.sprintf "The %s attribute of the %s element must refer to a %s element." 579 (q "list") (q "input") (q "datalist") 580 + | Input_list_not_allowed -> 581 + Printf.sprintf "Attribute %s is only allowed when the input type is %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, or %s." 582 + (q "list") (q "color") (q "date") (q "datetime-local") (q "email") (q "month") 583 + (q "number") (q "range") (q "search") (q "tel") (q "text") (q "time") (q "url") (q "week") 584 | Label_too_many_labelable -> 585 Printf.sprintf "The %s element may contain at most one %s, %s, %s, %s, %s, %s, or %s descendant." 586 (q "label") (q "button") (q "input") (q "meter") (q "output") (q "progress") (q "select") (q "textarea") ··· 603 | Summary_missing_attrs -> 604 Printf.sprintf "Element %s is missing one or more of the following attributes: [aria-checked, aria-level, role]." 605 (q "summary") 606 + | Summary_role_not_allowed -> 607 + Printf.sprintf "The %s attribute must not be used on any %s element that is a summary for its parent %s element." 608 + (q "role") (q "summary") (q "details") 609 | Autocomplete_webauthn_on_select -> 610 Printf.sprintf "The value of the %s attribute for the %s element must not contain %s." 611 (q "autocomplete") (q "select") (q "webauthn") ··· 710 Printf.sprintf "The value of the %s property within the content of a %s element with a %s attribute whose value is %s must be a JSON object whose values are also JSON objects." 711 (q "scopes") (q "script") (q "type") (q "importmap") 712 | Importmap_scopes_invalid_url -> 713 + Printf.sprintf "The value of the %s property within the content of a %s element with a %s attribute whose value is %s must be a JSON object whose keys are valid URL strings." 714 + (q "scopes") (q "script") (q "type") (q "importmap") 715 + | Importmap_scopes_value_invalid_url -> 716 Printf.sprintf "A specifier map defined in a %s property within the content of a %s element with a %s attribute whose value is %s must only contain valid URL values." 717 (q "scopes") (q "script") (q "type") (q "importmap") 718
+3
lib/html5_checker/error_code.mli
··· 76 | Picture_missing_img 77 | Map_id_name_mismatch 78 | List_attr_requires_datalist 79 | Label_too_many_labelable 80 | Label_for_id_mismatch 81 | Role_on_label_ancestor ··· 84 | Input_value_constraint of { constraint_type: string } 85 | Summary_missing_role 86 | Summary_missing_attrs 87 | Autocomplete_webauthn_on_select 88 | Commandfor_invalid_target 89 ··· 132 | Importmap_scopes_not_object 133 | Importmap_scopes_values_not_object 134 | Importmap_scopes_invalid_url 135 136 (* Style Element *) 137 | Style_type_invalid
··· 76 | Picture_missing_img 77 | Map_id_name_mismatch 78 | List_attr_requires_datalist 79 + | Input_list_not_allowed 80 | Label_too_many_labelable 81 | Label_for_id_mismatch 82 | Role_on_label_ancestor ··· 85 | Input_value_constraint of { constraint_type: string } 86 | Summary_missing_role 87 | Summary_missing_attrs 88 + | Summary_role_not_allowed 89 | Autocomplete_webauthn_on_select 90 | Commandfor_invalid_target 91 ··· 134 | Importmap_scopes_not_object 135 | Importmap_scopes_values_not_object 136 | Importmap_scopes_invalid_url 137 + | Importmap_scopes_value_invalid_url 138 139 (* Style Element *) 140 | Style_type_invalid
+14 -8
lib/html5_checker/specialized/aria_checker.ml
··· 691 if name_lower = "summary" then begin 692 let parent = get_parent_element state in 693 let is_in_details = parent = Some "details" in 694 if is_in_details then begin 695 (* summary that is the first child of details *) 696 - (* Cannot have role=paragraph (or other non-button roles) *) 697 - if explicit_roles <> [] then begin 698 - let first_role = List.hd explicit_roles in 699 - if first_role <> "button" && first_role <> "none" && first_role <> "presentation" then 700 - Message_collector.add_typed collector Error_code.Summary_missing_role 701 - end; 702 (* If has aria-expanded or aria-pressed, must have role *) 703 - let has_aria_expanded = List.assoc_opt "aria-expanded" attrs <> None in 704 - let has_aria_pressed = List.assoc_opt "aria-pressed" attrs <> None in 705 if (has_aria_expanded || has_aria_pressed) && explicit_roles = [] then begin 706 if has_aria_pressed then 707 Message_collector.add_typed collector Error_code.Summary_missing_role
··· 691 if name_lower = "summary" then begin 692 let parent = get_parent_element state in 693 let is_in_details = parent = Some "details" in 694 + let has_role_attr = List.exists (fun (k, _) -> String.lowercase_ascii k = "role") attrs in 695 + let has_aria_expanded = List.assoc_opt "aria-expanded" attrs <> None in 696 + let has_aria_pressed = List.assoc_opt "aria-pressed" attrs <> None in 697 if is_in_details then begin 698 (* summary that is the first child of details *) 699 + if has_role_attr then 700 + (* Must not have role attribute *) 701 + Message_collector.add_typed collector Error_code.Summary_role_not_allowed 702 + else if has_aria_pressed then 703 + (* aria-pressed without role requires role *) 704 + Message_collector.add_typed collector Error_code.Summary_missing_role 705 + else if has_aria_expanded then 706 + (* aria-expanded without role requires role *) 707 + Message_collector.add_typed collector Error_code.Summary_missing_attrs 708 + end else begin 709 + (* summary NOT in details context - different rules apply *) 710 (* If has aria-expanded or aria-pressed, must have role *) 711 if (has_aria_expanded || has_aria_pressed) && explicit_roles = [] then begin 712 if has_aria_pressed then 713 Message_collector.add_typed collector Error_code.Summary_missing_role
+4 -2
lib/html5_checker/specialized/attr_restrictions_checker.ml
··· 287 | None -> "text" (* default type is text *) 288 in 289 if not (List.mem input_type input_types_allowing_list) then 290 - Message_collector.add_typed collector Error_code.List_attr_requires_datalist 291 end 292 end; 293 ··· 317 | Some xmllang -> 318 (match lang_value with 319 | None -> 320 Message_collector.add_typed collector Error_code.Xml_lang_without_lang 321 | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang -> 322 - Message_collector.add_typed collector Error_code.Xml_lang_lang_mismatch 323 | _ -> ()) 324 | None -> () 325 end;
··· 287 | None -> "text" (* default type is text *) 288 in 289 if not (List.mem input_type input_types_allowing_list) then 290 + Message_collector.add_typed collector Error_code.Input_list_not_allowed 291 end 292 end; 293 ··· 317 | Some xmllang -> 318 (match lang_value with 319 | None -> 320 + (* xml:lang without lang attribute *) 321 Message_collector.add_typed collector Error_code.Xml_lang_without_lang 322 | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang -> 323 + (* xml:lang and lang have different values - "lang present with same value" message *) 324 + Message_collector.add_typed collector Error_code.Xml_lang_without_lang 325 | _ -> ()) 326 | None -> () 327 end;
+1 -1
lib/html5_checker/specialized/importmap_checker.ml
··· 292 | ForbiddenProperty _ -> Error_code.Importmap_invalid_root 293 | SlashKeyWithoutSlashValue _ -> Error_code.Importmap_key_trailing_slash 294 | InvalidScopeKey -> Error_code.Importmap_scopes_invalid_url 295 - | InvalidScopeValue _ -> Error_code.Importmap_scopes_invalid_url 296 | ScopeValueNotObject -> Error_code.Importmap_scopes_values_not_object 297 298 let end_element state ~name ~namespace collector =
··· 292 | ForbiddenProperty _ -> Error_code.Importmap_invalid_root 293 | SlashKeyWithoutSlashValue _ -> Error_code.Importmap_key_trailing_slash 294 | InvalidScopeKey -> Error_code.Importmap_scopes_invalid_url 295 + | InvalidScopeValue _ -> Error_code.Importmap_scopes_value_invalid_url 296 | ScopeValueNotObject -> Error_code.Importmap_scopes_values_not_object 297 298 let end_element state ~name ~namespace collector =