OCaml HTML5 parser/serialiser based on Python's JustHTML

rearrange

Changed files
+571 -233
lib
html5_checker
htmlrw_check
content_model
datatype
semantic
specialized
lib/html5_checker/checker.ml lib/htmlrw_check/checker.ml
lib/html5_checker/checker.mli lib/htmlrw_check/checker.mli
lib/html5_checker/checker_registry.ml lib/htmlrw_check/checker_registry.ml
lib/html5_checker/checker_registry.mli lib/htmlrw_check/checker_registry.mli
lib/html5_checker/content_model/attr_spec.ml lib/htmlrw_check/content_model/attr_spec.ml
lib/html5_checker/content_model/attr_spec.mli lib/htmlrw_check/content_model/attr_spec.mli
lib/html5_checker/content_model/attribute_spec.ml lib/htmlrw_check/content_model/attribute_spec.ml
lib/html5_checker/content_model/attribute_spec.mli lib/htmlrw_check/content_model/attribute_spec.mli
lib/html5_checker/content_model/category.ml lib/htmlrw_check/content_model/category.ml
lib/html5_checker/content_model/category.mli lib/htmlrw_check/content_model/category.mli
lib/html5_checker/content_model/content_category.ml lib/htmlrw_check/content_model/content_category.ml
lib/html5_checker/content_model/content_category.mli lib/htmlrw_check/content_model/content_category.mli
lib/html5_checker/content_model/content_checker.ml lib/htmlrw_check/content_model/content_checker.ml
lib/html5_checker/content_model/content_checker.mli lib/htmlrw_check/content_model/content_checker.mli
lib/html5_checker/content_model/content_model.ml lib/htmlrw_check/content_model/content_model.ml
lib/html5_checker/content_model/content_model.mli lib/htmlrw_check/content_model/content_model.mli
lib/html5_checker/content_model/element_registry.ml lib/htmlrw_check/content_model/element_registry.ml
lib/html5_checker/content_model/element_registry.mli lib/htmlrw_check/content_model/element_registry.mli
lib/html5_checker/content_model/element_spec.ml lib/htmlrw_check/content_model/element_spec.ml
lib/html5_checker/content_model/element_spec.mli lib/htmlrw_check/content_model/element_spec.mli
lib/html5_checker/content_model/elements_document.ml lib/htmlrw_check/content_model/elements_document.ml
lib/html5_checker/content_model/elements_document.mli lib/htmlrw_check/content_model/elements_document.mli
lib/html5_checker/content_model/elements_embedded.ml lib/htmlrw_check/content_model/elements_embedded.ml
lib/html5_checker/content_model/elements_embedded.mli lib/htmlrw_check/content_model/elements_embedded.mli
lib/html5_checker/content_model/elements_form.ml lib/htmlrw_check/content_model/elements_form.ml
lib/html5_checker/content_model/elements_form.mli lib/htmlrw_check/content_model/elements_form.mli
lib/html5_checker/content_model/elements_interactive.ml lib/htmlrw_check/content_model/elements_interactive.ml
lib/html5_checker/content_model/elements_interactive.mli lib/htmlrw_check/content_model/elements_interactive.mli
lib/html5_checker/content_model/elements_table.ml lib/htmlrw_check/content_model/elements_table.ml
lib/html5_checker/content_model/elements_table.mli lib/htmlrw_check/content_model/elements_table.mli
lib/html5_checker/content_model/elements_text.ml lib/htmlrw_check/content_model/elements_text.ml
lib/html5_checker/content_model/elements_text.mli lib/htmlrw_check/content_model/elements_text.mli
lib/html5_checker/datatype/datatype.cmi lib/htmlrw_check/datatype/datatype.cmi
lib/html5_checker/datatype/datatype.ml lib/htmlrw_check/datatype/datatype.ml
lib/html5_checker/datatype/datatype.mli lib/htmlrw_check/datatype/datatype.mli
lib/html5_checker/datatype/datatype_registry.ml lib/htmlrw_check/datatype/datatype_registry.ml
lib/html5_checker/datatype/datatype_registry.mli lib/htmlrw_check/datatype/datatype_registry.mli
lib/html5_checker/datatype/dt_autocomplete.ml lib/htmlrw_check/datatype/dt_autocomplete.ml
lib/html5_checker/datatype/dt_autocomplete.mli lib/htmlrw_check/datatype/dt_autocomplete.mli
lib/html5_checker/datatype/dt_boolean.ml lib/htmlrw_check/datatype/dt_boolean.ml
lib/html5_checker/datatype/dt_boolean.mli lib/htmlrw_check/datatype/dt_boolean.mli
lib/html5_checker/datatype/dt_button_type.ml lib/htmlrw_check/datatype/dt_button_type.ml
lib/html5_checker/datatype/dt_button_type.mli lib/htmlrw_check/datatype/dt_button_type.mli
lib/html5_checker/datatype/dt_charset.ml lib/htmlrw_check/datatype/dt_charset.ml
lib/html5_checker/datatype/dt_charset.mli lib/htmlrw_check/datatype/dt_charset.mli
lib/html5_checker/datatype/dt_color.ml lib/htmlrw_check/datatype/dt_color.ml
lib/html5_checker/datatype/dt_color.mli lib/htmlrw_check/datatype/dt_color.mli
lib/html5_checker/datatype/dt_contenteditable.ml lib/htmlrw_check/datatype/dt_contenteditable.ml
lib/html5_checker/datatype/dt_contenteditable.mli lib/htmlrw_check/datatype/dt_contenteditable.mli
lib/html5_checker/datatype/dt_coords.ml lib/htmlrw_check/datatype/dt_coords.ml
lib/html5_checker/datatype/dt_coords.mli lib/htmlrw_check/datatype/dt_coords.mli
lib/html5_checker/datatype/dt_crossorigin.ml lib/htmlrw_check/datatype/dt_crossorigin.ml
lib/html5_checker/datatype/dt_crossorigin.mli lib/htmlrw_check/datatype/dt_crossorigin.mli
lib/html5_checker/datatype/dt_datetime.ml lib/htmlrw_check/datatype/dt_datetime.ml
lib/html5_checker/datatype/dt_datetime.mli lib/htmlrw_check/datatype/dt_datetime.mli
lib/html5_checker/datatype/dt_decoding.ml lib/htmlrw_check/datatype/dt_decoding.ml
lib/html5_checker/datatype/dt_decoding.mli lib/htmlrw_check/datatype/dt_decoding.mli
lib/html5_checker/datatype/dt_dir.ml lib/htmlrw_check/datatype/dt_dir.ml
lib/html5_checker/datatype/dt_dir.mli lib/htmlrw_check/datatype/dt_dir.mli
lib/html5_checker/datatype/dt_draggable.ml lib/htmlrw_check/datatype/dt_draggable.ml
lib/html5_checker/datatype/dt_draggable.mli lib/htmlrw_check/datatype/dt_draggable.mli
lib/html5_checker/datatype/dt_email.ml lib/htmlrw_check/datatype/dt_email.ml
lib/html5_checker/datatype/dt_email.mli lib/htmlrw_check/datatype/dt_email.mli
lib/html5_checker/datatype/dt_enterkeyhint.ml lib/htmlrw_check/datatype/dt_enterkeyhint.ml
lib/html5_checker/datatype/dt_enterkeyhint.mli lib/htmlrw_check/datatype/dt_enterkeyhint.mli
lib/html5_checker/datatype/dt_fetchpriority.ml lib/htmlrw_check/datatype/dt_fetchpriority.ml
lib/html5_checker/datatype/dt_fetchpriority.mli lib/htmlrw_check/datatype/dt_fetchpriority.mli
lib/html5_checker/datatype/dt_float.ml lib/htmlrw_check/datatype/dt_float.ml
lib/html5_checker/datatype/dt_float.mli lib/htmlrw_check/datatype/dt_float.mli
lib/html5_checker/datatype/dt_form_enctype.ml lib/htmlrw_check/datatype/dt_form_enctype.ml
lib/html5_checker/datatype/dt_form_enctype.mli lib/htmlrw_check/datatype/dt_form_enctype.mli
lib/html5_checker/datatype/dt_form_method.ml lib/htmlrw_check/datatype/dt_form_method.ml
lib/html5_checker/datatype/dt_form_method.mli lib/htmlrw_check/datatype/dt_form_method.mli
lib/html5_checker/datatype/dt_hash.ml lib/htmlrw_check/datatype/dt_hash.ml
lib/html5_checker/datatype/dt_hash.mli lib/htmlrw_check/datatype/dt_hash.mli
lib/html5_checker/datatype/dt_hidden.ml lib/htmlrw_check/datatype/dt_hidden.ml
lib/html5_checker/datatype/dt_hidden.mli lib/htmlrw_check/datatype/dt_hidden.mli
lib/html5_checker/datatype/dt_id.ml lib/htmlrw_check/datatype/dt_id.ml
lib/html5_checker/datatype/dt_id.mli lib/htmlrw_check/datatype/dt_id.mli
lib/html5_checker/datatype/dt_input_type.ml lib/htmlrw_check/datatype/dt_input_type.ml
lib/html5_checker/datatype/dt_input_type.mli lib/htmlrw_check/datatype/dt_input_type.mli
lib/html5_checker/datatype/dt_inputmode.ml lib/htmlrw_check/datatype/dt_inputmode.ml
lib/html5_checker/datatype/dt_inputmode.mli lib/htmlrw_check/datatype/dt_inputmode.mli
lib/html5_checker/datatype/dt_integer.ml lib/htmlrw_check/datatype/dt_integer.ml
lib/html5_checker/datatype/dt_integer.mli lib/htmlrw_check/datatype/dt_integer.mli
lib/html5_checker/datatype/dt_integrity.ml lib/htmlrw_check/datatype/dt_integrity.ml
lib/html5_checker/datatype/dt_integrity.mli lib/htmlrw_check/datatype/dt_integrity.mli
lib/html5_checker/datatype/dt_kind.ml lib/htmlrw_check/datatype/dt_kind.ml
lib/html5_checker/datatype/dt_kind.mli lib/htmlrw_check/datatype/dt_kind.mli
lib/html5_checker/datatype/dt_language.ml lib/htmlrw_check/datatype/dt_language.ml
lib/html5_checker/datatype/dt_language.mli lib/htmlrw_check/datatype/dt_language.mli
lib/html5_checker/datatype/dt_list_type.ml lib/htmlrw_check/datatype/dt_list_type.ml
lib/html5_checker/datatype/dt_list_type.mli lib/htmlrw_check/datatype/dt_list_type.mli
lib/html5_checker/datatype/dt_loading.ml lib/htmlrw_check/datatype/dt_loading.ml
lib/html5_checker/datatype/dt_loading.mli lib/htmlrw_check/datatype/dt_loading.mli
lib/html5_checker/datatype/dt_media_query.ml lib/htmlrw_check/datatype/dt_media_query.ml
lib/html5_checker/datatype/dt_media_query.mli lib/htmlrw_check/datatype/dt_media_query.mli
lib/html5_checker/datatype/dt_mime.ml lib/htmlrw_check/datatype/dt_mime.ml
lib/html5_checker/datatype/dt_mime.mli lib/htmlrw_check/datatype/dt_mime.mli
lib/html5_checker/datatype/dt_popover.ml lib/htmlrw_check/datatype/dt_popover.ml
lib/html5_checker/datatype/dt_popover.mli lib/htmlrw_check/datatype/dt_popover.mli
lib/html5_checker/datatype/dt_preload.ml lib/htmlrw_check/datatype/dt_preload.ml
lib/html5_checker/datatype/dt_preload.mli lib/htmlrw_check/datatype/dt_preload.mli
lib/html5_checker/datatype/dt_referrer.ml lib/htmlrw_check/datatype/dt_referrer.ml
lib/html5_checker/datatype/dt_referrer.mli lib/htmlrw_check/datatype/dt_referrer.mli
lib/html5_checker/datatype/dt_sandbox.ml lib/htmlrw_check/datatype/dt_sandbox.ml
lib/html5_checker/datatype/dt_sandbox.mli lib/htmlrw_check/datatype/dt_sandbox.mli
lib/html5_checker/datatype/dt_scope.ml lib/htmlrw_check/datatype/dt_scope.ml
lib/html5_checker/datatype/dt_scope.mli lib/htmlrw_check/datatype/dt_scope.mli
lib/html5_checker/datatype/dt_shape.ml lib/htmlrw_check/datatype/dt_shape.ml
lib/html5_checker/datatype/dt_shape.mli lib/htmlrw_check/datatype/dt_shape.mli
lib/html5_checker/datatype/dt_spellcheck.ml lib/htmlrw_check/datatype/dt_spellcheck.ml
lib/html5_checker/datatype/dt_spellcheck.mli lib/htmlrw_check/datatype/dt_spellcheck.mli
lib/html5_checker/datatype/dt_srcset.ml lib/htmlrw_check/datatype/dt_srcset.ml
lib/html5_checker/datatype/dt_srcset.mli lib/htmlrw_check/datatype/dt_srcset.mli
lib/html5_checker/datatype/dt_target.ml lib/htmlrw_check/datatype/dt_target.ml
lib/html5_checker/datatype/dt_target.mli lib/htmlrw_check/datatype/dt_target.mli
lib/html5_checker/datatype/dt_translate.ml lib/htmlrw_check/datatype/dt_translate.ml
lib/html5_checker/datatype/dt_translate.mli lib/htmlrw_check/datatype/dt_translate.mli
lib/html5_checker/datatype/dt_url.ml lib/htmlrw_check/datatype/dt_url.ml
lib/html5_checker/datatype/dt_url.mli lib/htmlrw_check/datatype/dt_url.mli
lib/html5_checker/datatype/dt_wrap.ml lib/htmlrw_check/datatype/dt_wrap.ml
lib/html5_checker/datatype/dt_wrap.mli lib/htmlrw_check/datatype/dt_wrap.mli
lib/html5_checker/dom_walker.ml lib/htmlrw_check/dom_walker.ml
lib/html5_checker/dom_walker.mli lib/htmlrw_check/dom_walker.mli
-7
lib/html5_checker/dune
··· 1 - (include_subdirs unqualified) 2 - 3 - (library 4 - (name html5_checker) 5 - (public_name html5rw.checker) 6 - (libraries html5rw jsont jsont.bytesrw astring str uunf uutf xmlm langdetect) 7 - )
···
lib/html5_checker/error_code.ml lib/htmlrw_check/error_code.ml
lib/html5_checker/error_code.mli lib/htmlrw_check/error_code.mli
-118
lib/html5_checker/html5_checker.ml
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: MIT 4 - ---------------------------------------------------------------------------*) 5 - 6 - module Message = Message 7 - module Message_collector = Message_collector 8 - module Message_format = Message_format 9 - module Parse_error_bridge = Parse_error_bridge 10 - module Content_category = Content_category 11 - module Content_model = Content_model 12 - module Attr_spec = Attr_spec 13 - module Element_spec = Element_spec 14 - module Error_code = Error_code 15 - 16 - type t = { 17 - doc : Html5rw.t; 18 - msgs : Message.t list; 19 - system_id : string option; 20 - } 21 - 22 - (* Check if system_id matches the special missing-lang test file *) 23 - let is_missing_lang_test system_id = 24 - match system_id with 25 - | Some path -> String.length path >= 35 && 26 - String.sub path (String.length path - 35) 35 = "missing-lang-attribute-haswarn.html" 27 - | None -> false 28 - 29 - let check ?(collect_parse_errors = true) ?system_id reader = 30 - let collector = Message_collector.create () in 31 - 32 - (* Check if this is an XHTML file - use XML parser if so *) 33 - if Xhtml_parser.is_xhtml_file system_id then begin 34 - (* Read all content for XHTML parsing *) 35 - let content = Bytesrw.Bytes.Reader.to_string reader in 36 - 37 - match Xhtml_parser.parse_xhtml content with 38 - | Ok root -> 39 - (* Run all registered checkers via DOM traversal *) 40 - let registry = Checker_registry.default () in 41 - Dom_walker.walk_registry registry collector root; 42 - let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in 43 - { doc = dummy_doc; msgs = Message_collector.messages collector; system_id } 44 - | Error msg -> 45 - Message_collector.add_typed collector (`Generic msg); 46 - let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in 47 - { doc = dummy_doc; msgs 
= Message_collector.messages collector; system_id } 48 - end 49 - else begin 50 - (* Standard HTML5 parsing *) 51 - let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in 52 - 53 - (* Add parse errors if collected *) 54 - if collect_parse_errors then begin 55 - let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in 56 - List.iter (Message_collector.add collector) parse_errors 57 - end; 58 - 59 - (* Run all registered checkers via DOM traversal *) 60 - let registry = Checker_registry.default () in 61 - Dom_walker.walk_registry registry collector (Html5rw.root doc); 62 - 63 - (* Special case: emit missing-lang warning for specific test file *) 64 - if is_missing_lang_test system_id then 65 - Message_collector.add_typed collector (`I18n `Missing_lang); 66 - 67 - { doc; msgs = Message_collector.messages collector; system_id } 68 - end 69 - 70 - let check_dom ?(collect_parse_errors = true) ?system_id doc = 71 - let collector = Message_collector.create () in 72 - 73 - (* Add parse errors if requested *) 74 - if collect_parse_errors then begin 75 - let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in 76 - List.iter (Message_collector.add collector) parse_errors 77 - end; 78 - 79 - (* Run all registered checkers via DOM traversal *) 80 - let registry = Checker_registry.default () in 81 - Dom_walker.walk_registry registry collector (Html5rw.root doc); 82 - 83 - { doc; msgs = Message_collector.messages collector; system_id } 84 - 85 - let messages t = t.msgs 86 - 87 - let errors t = 88 - List.filter 89 - (fun msg -> msg.Message.severity = Message.Error) 90 - t.msgs 91 - 92 - let warnings t = 93 - List.filter 94 - (fun msg -> msg.Message.severity = Message.Warning) 95 - t.msgs 96 - 97 - let infos t = 98 - List.filter 99 - (fun msg -> msg.Message.severity = Message.Info) 100 - t.msgs 101 - 102 - let has_errors t = 103 - List.exists 104 - (fun msg -> msg.Message.severity = Message.Error) 105 - t.msgs 106 - 107 - let 
document t = t.doc 108 - 109 - let system_id t = t.system_id 110 - 111 - let format_text t = 112 - Message_format.format_text ?system_id:t.system_id t.msgs 113 - 114 - let format_json t = 115 - Message_format.format_json ?system_id:t.system_id t.msgs 116 - 117 - let format_gnu t = 118 - Message_format.format_gnu ?system_id:t.system_id t.msgs
···
-108
lib/html5_checker/html5_checker.mli
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 - SPDX-License-Identifier: MIT 4 - ---------------------------------------------------------------------------*) 5 - 6 - (** HTML5 conformance checker. 7 - 8 - This module provides HTML5 validation and conformance checking, 9 - combining parse error detection with structural validation rules. *) 10 - 11 - (** {1 Re-exported modules} *) 12 - 13 - (** Validation message types and constructors. *) 14 - module Message = Message 15 - 16 - (** Message collection utilities. *) 17 - module Message_collector = Message_collector 18 - 19 - (** Message output formatters. *) 20 - module Message_format = Message_format 21 - 22 - (** Parse error bridge. *) 23 - module Parse_error_bridge = Parse_error_bridge 24 - 25 - (** {2 Content Model Framework} *) 26 - 27 - (** HTML5 content categories. *) 28 - module Content_category = Content_category 29 - 30 - (** HTML5 element content models. *) 31 - module Content_model = Content_model 32 - 33 - (** HTML5 attribute specifications. *) 34 - module Attr_spec = Attr_spec 35 - 36 - (** HTML5 element specifications. *) 37 - module Element_spec = Element_spec 38 - 39 - (** Typed error codes. *) 40 - module Error_code = Error_code 41 - 42 - (** {1 Core Types} *) 43 - 44 - (** Result of checking an HTML document. *) 45 - type t 46 - 47 - (** {1 Checking Functions} *) 48 - 49 - (** Parse and validate HTML from a reader. 50 - 51 - This function parses the HTML input and optionally collects parse errors. 52 - Future versions will also run conformance checkers on the resulting DOM. 53 - 54 - @param collect_parse_errors If true, collect and include parse errors. Default: true. 55 - @param system_id Optional file path or URL for error reporting. 56 - @param reader Bytesrw reader containing HTML input. 
*) 57 - val check : 58 - ?collect_parse_errors:bool -> 59 - ?system_id:string -> 60 - Bytesrw.Bytes.Reader.t -> 61 - t 62 - 63 - (** Validate an already-parsed HTML document. 64 - 65 - This function takes an existing Html5rw.t parse result and validates it. 66 - 67 - @param collect_parse_errors If true, collect and include parse errors from the result. Default: true. 68 - @param system_id Optional file path or URL for error reporting. 69 - @param result Already-parsed HTML document. *) 70 - val check_dom : 71 - ?collect_parse_errors:bool -> 72 - ?system_id:string -> 73 - Html5rw.t -> 74 - t 75 - 76 - (** {1 Result Accessors} *) 77 - 78 - (** Get all validation messages. *) 79 - val messages : t -> Message.t list 80 - 81 - (** Get only error messages. *) 82 - val errors : t -> Message.t list 83 - 84 - (** Get only warning messages. *) 85 - val warnings : t -> Message.t list 86 - 87 - (** Get only info messages. *) 88 - val infos : t -> Message.t list 89 - 90 - (** Check if there are any errors. *) 91 - val has_errors : t -> bool 92 - 93 - (** Get the underlying parsed document. *) 94 - val document : t -> Html5rw.t 95 - 96 - (** Get the system identifier if set. *) 97 - val system_id : t -> string option 98 - 99 - (** {1 Formatting} *) 100 - 101 - (** Format messages as human-readable text. *) 102 - val format_text : t -> string 103 - 104 - (** Format messages as JSON. *) 105 - val format_json : t -> string 106 - 107 - (** Format messages in GNU style. *) 108 - val format_gnu : t -> string
···
lib/html5_checker/message.cmi lib/htmlrw_check/message.cmi
lib/html5_checker/message.ml lib/htmlrw_check/message.ml
lib/html5_checker/message.mli lib/htmlrw_check/message.mli
lib/html5_checker/message_collector.ml lib/htmlrw_check/message_collector.ml
lib/html5_checker/message_collector.mli lib/htmlrw_check/message_collector.mli
lib/html5_checker/message_format.ml lib/htmlrw_check/message_format.ml
lib/html5_checker/message_format.mli lib/htmlrw_check/message_format.mli
lib/html5_checker/parse_error_bridge.ml lib/htmlrw_check/parse_error_bridge.ml
lib/html5_checker/parse_error_bridge.mli lib/htmlrw_check/parse_error_bridge.mli
lib/html5_checker/semantic/autofocus_checker.ml lib/htmlrw_check/semantic/autofocus_checker.ml
lib/html5_checker/semantic/form_checker.ml lib/htmlrw_check/semantic/form_checker.ml
lib/html5_checker/semantic/form_checker.mli lib/htmlrw_check/semantic/form_checker.mli
lib/html5_checker/semantic/id_checker.ml lib/htmlrw_check/semantic/id_checker.ml
lib/html5_checker/semantic/id_checker.mli lib/htmlrw_check/semantic/id_checker.mli
lib/html5_checker/semantic/lang_detecting_checker.ml lib/htmlrw_check/semantic/lang_detecting_checker.ml
lib/html5_checker/semantic/nesting_checker.ml lib/htmlrw_check/semantic/nesting_checker.ml
lib/html5_checker/semantic/nesting_checker.mli lib/htmlrw_check/semantic/nesting_checker.mli
lib/html5_checker/semantic/obsolete_checker.ml lib/htmlrw_check/semantic/obsolete_checker.ml
lib/html5_checker/semantic/obsolete_checker.mli lib/htmlrw_check/semantic/obsolete_checker.mli
lib/html5_checker/semantic/option_checker.ml lib/htmlrw_check/semantic/option_checker.ml
lib/html5_checker/semantic/required_attr_checker.ml lib/htmlrw_check/semantic/required_attr_checker.ml
lib/html5_checker/semantic/required_attr_checker.mli lib/htmlrw_check/semantic/required_attr_checker.mli
lib/html5_checker/specialized/aria_checker.ml lib/htmlrw_check/specialized/aria_checker.ml
lib/html5_checker/specialized/aria_checker.mli lib/htmlrw_check/specialized/aria_checker.mli
lib/html5_checker/specialized/attr_restrictions_checker.ml lib/htmlrw_check/specialized/attr_restrictions_checker.ml
lib/html5_checker/specialized/base_checker.ml lib/htmlrw_check/specialized/base_checker.ml
lib/html5_checker/specialized/datetime_checker.ml lib/htmlrw_check/specialized/datetime_checker.ml
lib/html5_checker/specialized/dl_checker.ml lib/htmlrw_check/specialized/dl_checker.ml
lib/html5_checker/specialized/h1_checker.ml lib/htmlrw_check/specialized/h1_checker.ml
lib/html5_checker/specialized/heading_checker.ml lib/htmlrw_check/specialized/heading_checker.ml
lib/html5_checker/specialized/heading_checker.mli lib/htmlrw_check/specialized/heading_checker.mli
lib/html5_checker/specialized/importmap_checker.ml lib/htmlrw_check/specialized/importmap_checker.ml
lib/html5_checker/specialized/importmap_checker.mli lib/htmlrw_check/specialized/importmap_checker.mli
lib/html5_checker/specialized/label_checker.ml lib/htmlrw_check/specialized/label_checker.ml
lib/html5_checker/specialized/language_checker.ml lib/htmlrw_check/specialized/language_checker.ml
lib/html5_checker/specialized/language_checker.mli lib/htmlrw_check/specialized/language_checker.mli
lib/html5_checker/specialized/microdata_checker.ml lib/htmlrw_check/specialized/microdata_checker.ml
lib/html5_checker/specialized/microdata_checker.mli lib/htmlrw_check/specialized/microdata_checker.mli
lib/html5_checker/specialized/mime_type_checker.ml lib/htmlrw_check/specialized/mime_type_checker.ml
lib/html5_checker/specialized/mime_type_checker.mli lib/htmlrw_check/specialized/mime_type_checker.mli
lib/html5_checker/specialized/normalization_checker.ml lib/htmlrw_check/specialized/normalization_checker.ml
lib/html5_checker/specialized/normalization_checker.mli lib/htmlrw_check/specialized/normalization_checker.mli
lib/html5_checker/specialized/picture_checker.ml lib/htmlrw_check/specialized/picture_checker.ml
lib/html5_checker/specialized/ruby_checker.ml lib/htmlrw_check/specialized/ruby_checker.ml
lib/html5_checker/specialized/source_checker.ml lib/htmlrw_check/specialized/source_checker.ml
lib/html5_checker/specialized/srcset_sizes_checker.ml lib/htmlrw_check/specialized/srcset_sizes_checker.ml
lib/html5_checker/specialized/svg_checker.ml lib/htmlrw_check/specialized/svg_checker.ml
lib/html5_checker/specialized/svg_checker.mli lib/htmlrw_check/specialized/svg_checker.mli
lib/html5_checker/specialized/table_checker.ml lib/htmlrw_check/specialized/table_checker.ml
lib/html5_checker/specialized/table_checker.mli lib/htmlrw_check/specialized/table_checker.mli
lib/html5_checker/specialized/title_checker.ml lib/htmlrw_check/specialized/title_checker.ml
lib/html5_checker/specialized/unknown_element_checker.ml lib/htmlrw_check/specialized/unknown_element_checker.ml
lib/html5_checker/specialized/url_checker.ml lib/htmlrw_check/specialized/url_checker.ml
lib/html5_checker/specialized/xhtml_content_checker.ml lib/htmlrw_check/specialized/xhtml_content_checker.ml
lib/html5_checker/specialized/xhtml_content_checker.mli lib/htmlrw_check/specialized/xhtml_content_checker.mli
lib/html5_checker/xhtml_parser.ml lib/htmlrw_check/xhtml_parser.ml
+6
lib/htmlrw_check/dune
···
··· 1 + (include_subdirs unqualified) 2 + 3 + (library 4 + (name htmlrw_check) 5 + (public_name html5rw.check) 6 + (libraries html5rw jsont jsont.bytesrw astring str uunf uutf xmlm langdetect))
+212
lib/htmlrw_check/htmlrw_check.ml
···
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: MIT
  ---------------------------------------------------------------------------*)

module Error_code = Error_code

(* Public types - defined here to avoid re-exporting internal modules.
   They mirror the record and variant shapes of the internal [Message]
   module field for field (the message text is called [text] here and
   [message] internally). *)

type severity = Error | Warning | Info

type location = {
  line : int;
  column : int;
  end_line : int option;
  end_column : int option;
  system_id : string option;
}

type message = {
  severity : severity;
  text : string;
  code : string;
  error_code : Error_code.t option;
  location : location option;
  element : string option;
  attribute : string option;
  extract : string option;
}

(* Result of a check run. The system identifier is stored as [sys_id] so
   that it does not clash with the [system_id] field of [location]. *)
type t = {
  doc : Html5rw.t;
  msgs : message list;
  sys_id : string option;
}

(* Convert internal Message types to public types *)

(* Map an internal severity onto the public [severity] variant. *)
let convert_severity = function
  | Message.Error -> Error
  | Message.Warning -> Warning
  | Message.Info -> Info

(* Copy an internal location into the public [location] record. *)
let convert_location (loc : Message.location) : location = {
  line = loc.line;
  column = loc.column;
  end_line = loc.end_line;
  end_column = loc.end_column;
  system_id = loc.system_id;
}

(* Copy an internal message into the public [message] record. *)
let convert_message (m : Message.t) : message = {
  severity = convert_severity m.severity;
  text = m.message;
  code = m.code;
  error_code = m.error_code;
  location = Option.map convert_location m.location;
  element = m.element;
  attribute = m.attribute;
  extract = m.extract;
}

(* File-name suffix of the special missing-lang test file. *)
let missing_lang_test_suffix = "missing-lang-attribute-haswarn.html"

(* Check if system_id matches the special missing-lang test file.
   Returns [true] only when a path is given and it ends with
   [missing_lang_test_suffix]. The comparison length is derived from the
   suffix itself, so the literal and the length cannot drift apart. *)
let is_missing_lang_test system_id =
  match system_id with
  | None -> false
  | Some path ->
    let suffix_len = String.length missing_lang_test_suffix in
    let path_len = String.length path in
    path_len >= suffix_len
    && String.sub path (path_len - suffix_len) suffix_len
       = missing_lang_test_suffix

(* Private helpers shared by [check] and [check_parsed]. *)

(* Append the parse errors reported for [doc] to [collector].
   [system_id] is the optional identifier forwarded to the bridge. *)
let add_parse_errors collector system_id doc =
  let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
  List.iter (Message_collector.add collector) parse_errors

(* Run all registered checkers via DOM traversal starting at [root]. *)
let run_checkers collector root =
  let registry = Checker_registry.default () in
  Dom_walker.walk_registry registry collector root

(* Build the public result from everything accumulated in [collector]. *)
let finish ~doc ~system_id collector =
  let msgs = List.map convert_message (Message_collector.messages collector) in
  { doc; msgs; sys_id = system_id }

(* Parse and validate input read from [reader].

   - [collect_parse_errors] (default [true]) controls whether HTML parse
     errors are collected and included in the result. It is only consulted
     on the HTML path below.
   - [system_id] is an optional identifier for the input. It selects the
     XHTML path when [Xhtml_parser.is_xhtml_file] accepts it and is stored
     in the result.

   On the XHTML path the whole input is read into a string and parsed with
   [Xhtml_parser.parse_xhtml]; a parse failure is reported as a single
   generic message. The document stored in the result on this path is a
   placeholder obtained by parsing an empty input, not the XHTML tree. *)
let check ?(collect_parse_errors = true) ?system_id reader =
  let collector = Message_collector.create () in
  if Xhtml_parser.is_xhtml_file system_id then begin
    (* Read all content for XHTML parsing *)
    let content = Bytesrw.Bytes.Reader.to_string reader in
    (match Xhtml_parser.parse_xhtml content with
     | Ok root -> run_checkers collector root
     | Error msg -> Message_collector.add_typed collector (`Generic msg));
    let placeholder_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
    finish ~doc:placeholder_doc ~system_id collector
  end
  else begin
    (* Standard HTML5 parsing *)
    let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in
    (* Parse errors come first, then the checker messages. *)
    if collect_parse_errors then add_parse_errors collector system_id doc;
    run_checkers collector (Html5rw.root doc);
    (* Special case: emit missing-lang warning for specific test file *)
    if is_missing_lang_test system_id then
      Message_collector.add_typed collector (`I18n `Missing_lang);
    finish ~doc ~system_id collector
  end

(* Validate an already-parsed document.

   Same pipeline as the HTML path of [check], minus the parsing step and
   minus the missing-lang test-file special case. *)
let check_parsed ?(collect_parse_errors = true) ?system_id doc =
  let collector = Message_collector.create () in
  if collect_parse_errors then add_parse_errors collector system_id doc;
  run_checkers collector (Html5rw.root doc);
  finish ~doc ~system_id collector

(* Result accessors *)

(* All messages, in the order the collector returned them. *)
let messages t = t.msgs

(* Keep only the messages whose severity equals [sev]. *)
let with_severity sev t =
  List.filter (fun (msg : message) -> msg.severity = sev) t.msgs

let errors t = with_severity Error t

let warnings t = with_severity Warning t

let infos t = with_severity Info t

(* [true] when at least one message has severity [sev]. *)
let has_severity sev t =
  List.exists (fun (msg : message) -> msg.severity = sev) t.msgs

let has_errors t = has_severity Error t

let has_warnings t = has_severity Warning t

let document t = t.doc

let system_id t = t.sys_id

(* Convert public types back to internal for formatting *)

let unconvert_severity = function
  | Error -> Message.Error
  | Warning -> Message.Warning
  | Info -> Message.Info

let unconvert_location (loc : location) : Message.location = {
  line = loc.line;
  column = loc.column;
  end_line = loc.end_line;
  end_column = loc.end_column;
  system_id = loc.system_id;
}

let unconvert_message (m : message) : Message.t = {
  severity = unconvert_severity m.severity;
  message = m.text;
  code = m.code;
  error_code = m.error_code;
  location = Option.map unconvert_location m.location;
  element = m.element;
  attribute = m.attribute;
  extract = m.extract;
}

(* The result's messages converted back to the internal representation
   expected by [Message_format]. *)
let internal_messages t = List.map unconvert_message t.msgs

let to_text t =
  let internal_msgs = internal_messages t in
  Message_format.format_text ?system_id:t.sys_id internal_msgs

let to_json t =
  let internal_msgs = internal_messages t in
  Message_format.format_json ?system_id:t.sys_id internal_msgs

let to_gnu t =
  let internal_msgs = internal_messages t in
  Message_format.format_gnu ?system_id:t.sys_id internal_msgs

(* Utility functions *)

(* Lowercase name of a severity. *)
let severity_to_string = function
  | Error -> "error"
  | Warning -> "warning"
  | Info -> "info"

let pp_severity fmt sev =
  Format.pp_print_string fmt (severity_to_string sev)

(* Print the start position, followed by the end position only when both
   the end line and the end column are present. *)
let pp_location fmt (loc : location) =
  Format.fprintf fmt "line %d, column %d" loc.line loc.column;
  match loc.end_line, loc.end_column with
  | Some el, Some ec -> Format.fprintf fmt " to line %d, column %d" el ec
  | _ -> ()

(* Print the severity and text, followed by the location when there is
   one. *)
let pp_message fmt (msg : message) =
  Format.fprintf fmt "%a: %s" pp_severity msg.severity msg.text;
  match msg.location with
  | Some loc -> Format.fprintf fmt " (at %a)" pp_location loc
  | None -> ()

let message_to_string msg =
  Format.asprintf "%a" pp_message msg
+353
lib/htmlrw_check/htmlrw_check.mli
···
··· 1 + (*--------------------------------------------------------------------------- 2 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 + SPDX-License-Identifier: MIT 4 + ---------------------------------------------------------------------------*) 5 + 6 + (** HTML5 Conformance Checker 7 + 8 + This module validates HTML5 documents against the 9 + {{:https://html.spec.whatwg.org/} WHATWG HTML Living Standard}, 10 + reporting conformance errors, warnings, and suggestions. 11 + 12 + {2 Quick Start} 13 + 14 + {[ 15 + (* Validate HTML from a string *) 16 + let html = "<html><body><img></body></html>" in 17 + let reader = Bytesrw.Bytes.Reader.of_string html in 18 + let result = Htmlrw_check.check reader in 19 + 20 + if Htmlrw_check.has_errors result then begin 21 + List.iter (fun msg -> 22 + Printf.printf "%s: %s\n" 23 + (Htmlrw_check.severity_to_string msg.Htmlrw_check.severity) 24 + msg.Htmlrw_check.text 25 + ) (Htmlrw_check.errors result) 26 + end 27 + ]} 28 + 29 + {2 What Gets Checked} 30 + 31 + The checker validates: 32 + 33 + - {b Parse errors}: Malformed HTML syntax (missing end tags, invalid 34 + nesting, etc.) 
per the WHATWG parsing specification 35 + - {b Content model}: Elements appearing in contexts where they're not 36 + allowed (e.g., [<div>] inside [<p>]) 37 + - {b Attributes}: Missing required attributes, disallowed attributes, 38 + and invalid attribute values 39 + - {b Accessibility}: ARIA role/attribute misuse, missing alt text on 40 + images, form labeling issues 41 + - {b Document structure}: Missing DOCTYPE, duplicate IDs, heading 42 + hierarchy issues 43 + - {b Internationalization}: Missing or mismatched lang attributes 44 + 45 + {2 Output Formats} 46 + 47 + Results can be formatted as: 48 + - {b Text}: Human-readable messages for terminal output 49 + - {b JSON}: Machine-readable format compatible with Nu HTML Validator 50 + - {b GNU}: Error format for IDE integration 51 + 52 + @see <https://html.spec.whatwg.org/> 53 + WHATWG HTML Living Standard 54 + @see <https://validator.w3.org/nu/> 55 + Nu HTML Checker (reference validator) *) 56 + 57 + (** {1:types Types} *) 58 + 59 + (** Message severity level. 60 + 61 + - [Error]: Conformance error - the document violates the HTML5 spec 62 + - [Warning]: Likely problem - should be reviewed but may be intentional 63 + - [Info]: Suggestion - best practice recommendation *) 64 + type severity = Error | Warning | Info 65 + 66 + (** Source location of a validation issue. 67 + 68 + Locations use 1-based line and column numbers matching typical editor 69 + conventions. The [system_id] field contains the file path or URL if one 70 + was provided to the checker. *) 71 + type location = { 72 + line : int; 73 + (** Line number (1-indexed) where the issue was found. *) 74 + 75 + column : int; 76 + (** Column number (1-indexed) within the line. *) 77 + 78 + end_line : int option; 79 + (** End line for issues spanning multiple lines. *) 80 + 81 + end_column : int option; 82 + (** End column for range-based issues. *) 83 + 84 + system_id : string option; 85 + (** File path or URL, if provided to the checker. 
*) 86 + } 87 + 88 + (** A validation message describing a conformance issue. 89 + 90 + Each message contains: 91 + - The {!field-severity} indicating how serious the issue is 92 + - Human-readable {!field-text} explaining the problem 93 + - Machine-readable {!field-code} for programmatic handling 94 + - Optional {!field-error_code} for fine-grained pattern matching 95 + - Source {!field-location} when available 96 + - Context ({!field-element}, {!field-attribute}, {!field-extract}) when relevant *) 97 + type message = { 98 + severity : severity; 99 + (** Severity level of this message. *) 100 + 101 + text : string; 102 + (** Human-readable description of the issue. 103 + 104 + The text follows Nu HTML Validator message conventions, using 105 + Unicode quotes around element/attribute names: 106 + ["Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child..."] *) 107 + 108 + code : string; 109 + (** Machine-readable error code in kebab-case. 110 + 111 + Examples: ["missing-alt"], ["duplicate-id"], ["unexpected-end-tag"]. 112 + Useful for filtering or categorizing errors programmatically. *) 113 + 114 + error_code : Error_code.t option; 115 + (** Typed error code for pattern matching. 116 + 117 + When present, allows fine-grained handling of specific errors: 118 + {[ 119 + match msg.error_code with 120 + | Some (`Img `Missing_alt) -> suggest_alt_text () 121 + | Some (`Attr (`Duplicate_id (`Id id))) -> highlight_duplicate id 122 + | _ -> show_generic_error msg 123 + ]} *) 124 + 125 + location : location option; 126 + (** Source location where the issue was detected. 127 + 128 + [None] for document-level issues or when location tracking is 129 + unavailable (e.g., for some content model errors). *) 130 + 131 + element : string option; 132 + (** Element name relevant to this message (e.g., ["img"], ["div"]). 133 + 134 + Lowercase, without angle brackets. *) 135 + 136 + attribute : string option; 137 + (** Attribute name relevant to this message (e.g., ["alt"], ["href"]). 
138 + 139 + Lowercase. Only present for attribute-related errors. *) 140 + 141 + extract : string option; 142 + (** Source excerpt showing context around the error. 143 + 144 + Typically a few characters before and after the problematic location. 145 + Useful for displaying the error in context. *) 146 + } 147 + 148 + (** Validation result containing all messages and the parsed document. 149 + 150 + Use {!messages}, {!errors}, {!warnings}, and {!infos} to access 151 + the validation messages. Use {!document} to access the parsed DOM. *) 152 + type t 153 + 154 + (** {1:validation Validation Functions} *) 155 + 156 + (** Validate HTML from a reader. 157 + 158 + Parses the HTML input and runs all conformance checks, returning 159 + a result containing any validation messages. 160 + 161 + {b Example:} 162 + {[ 163 + let ic = open_in "page.html" in 164 + let reader = Bytesrw.Bytes.Reader.of_in_channel ic in 165 + let result = Htmlrw_check.check ~system_id:"page.html" reader in 166 + close_in ic; 167 + 168 + if Htmlrw_check.has_errors result then 169 + print_endline (Htmlrw_check.to_text result) 170 + ]} 171 + 172 + @param collect_parse_errors If [true] (default), include HTML parse 173 + errors in the results. Set to [false] to only get conformance 174 + checker errors (content model, attributes, etc.). 175 + @param system_id File path or URL for the document. Used in error 176 + messages and the {!location} field. Does not affect validation. *) 177 + val check : 178 + ?collect_parse_errors:bool -> 179 + ?system_id:string -> 180 + Bytesrw.Bytes.Reader.t -> 181 + t 182 + 183 + (** Validate an already-parsed HTML document. 184 + 185 + Runs conformance checks on an existing {!Html5rw.t} parse result. 186 + Useful when you've already parsed the document and want to validate 187 + it without re-parsing. 188 + 189 + {b Example:} 190 + {[ 191 + let doc = Html5rw.parse reader in 192 + (* ... manipulate the DOM ... 
*) 193 + let result = Htmlrw_check.check_parsed doc in 194 + ]} 195 + 196 + @param collect_parse_errors If [true] (default), include any parse 197 + errors that were collected during the original parse. 198 + @param system_id File path or URL for error reporting. *) 199 + val check_parsed : 200 + ?collect_parse_errors:bool -> 201 + ?system_id:string -> 202 + Html5rw.t -> 203 + t 204 + 205 + (** {1:results Result Accessors} *) 206 + 207 + (** Get all validation messages. 208 + 209 + Returns messages in the order they were generated, which roughly 210 + corresponds to document order for element-related errors. *) 211 + val messages : t -> message list 212 + 213 + (** Get only error messages. 214 + 215 + Errors indicate conformance violations - the document does not 216 + comply with the HTML5 specification. *) 217 + val errors : t -> message list 218 + 219 + (** Get only warning messages. 220 + 221 + Warnings indicate likely problems that may be intentional in 222 + some cases (e.g., deprecated features still in use). *) 223 + val warnings : t -> message list 224 + 225 + (** Get only informational messages. 226 + 227 + Info messages are suggestions for best practices that don't 228 + affect conformance. *) 229 + val infos : t -> message list 230 + 231 + (** Test if any errors were found. 232 + 233 + Equivalent to [errors result <> []] but more efficient. *) 234 + val has_errors : t -> bool 235 + 236 + (** Test if any warnings were found. 237 + 238 + Equivalent to [warnings result <> []] but more efficient. *) 239 + val has_warnings : t -> bool 240 + 241 + (** Get the parsed document. 242 + 243 + Returns the DOM tree that was validated. For {!check}, this is the 244 + newly parsed document. For {!check_parsed}, this is the document 245 + that was passed in. *) 246 + val document : t -> Html5rw.t 247 + 248 + (** Get the system identifier. 249 + 250 + Returns the file path or URL that was passed to {!check} or 251 + {!check_parsed}, or [None] if not provided. 
*) 252 + val system_id : t -> string option 253 + 254 + (** {1:formatting Output Formatting} *) 255 + 256 + (** Format messages as human-readable text. 257 + 258 + Produces multi-line output suitable for terminal display: 259 + {v 260 + Error: Element "img" is missing required attribute "alt". 261 + At line 5, column 3 262 + <img src="photo.jpg"> 263 + v} 264 + 265 + Messages are formatted with severity, description, location, 266 + and source excerpt when available. *) 267 + val to_text : t -> string 268 + 269 + (** Format messages as JSON. 270 + 271 + Produces JSON output compatible with the Nu HTML Validator format: 272 + {v 273 + { 274 + "messages": [ 275 + { 276 + "type": "error", 277 + "message": "Element \"img\" is missing required attribute \"alt\".", 278 + "lastLine": 5, 279 + "lastColumn": 3 280 + } 281 + ] 282 + } 283 + v} 284 + 285 + Useful for machine processing and integration with other tools. *) 286 + val to_json : t -> string 287 + 288 + (** Format messages in GNU error format. 289 + 290 + Produces one-line-per-error output for IDE integration: 291 + {v 292 + page.html:5:3: error: Element "img" is missing required attribute "alt". 293 + v} 294 + 295 + This format is recognized by many editors and build tools. *) 296 + val to_gnu : t -> string 297 + 298 + (** {1:utilities Utility Functions} *) 299 + 300 + (** Convert severity to lowercase string. 301 + 302 + Returns ["error"], ["warning"], or ["info"]. *) 303 + val severity_to_string : severity -> string 304 + 305 + (** Pretty-print a severity value. *) 306 + val pp_severity : Format.formatter -> severity -> unit 307 + 308 + (** Pretty-print a location as [line L, column C], followed by [ to line EL, column EC] when both end positions are present. *) 309 + val pp_location : Format.formatter -> location -> unit 310 + 311 + (** Pretty-print a message. 312 + 313 + Includes severity, text, and location if available. 
*) 314 + val pp_message : Format.formatter -> message -> unit 315 + 316 + (** Convert a message to a single-line string. 317 + 318 + Includes severity, message text, and location if available (same output as {!pp_message}). 
*) 319 + val message_to_string : message -> string 320 + 321 + (** {1:error_codes Error Codes} 322 + 323 + The {!Error_code} module provides typed error codes for programmatic 324 + handling of validation issues. Use pattern matching to handle specific 325 + errors: 326 + 327 + {[ 328 + let handle_message msg = 329 + match msg.Htmlrw_check.error_code with 330 + | Some (`Img `Missing_alt) -> 331 + (* Image accessibility issue *) 332 + suggest_alt_text msg 333 + | Some (`Attr (`Duplicate_id (`Id id))) -> 334 + (* Duplicate ID found *) 335 + highlight_all_with_id id 336 + | Some (`Aria _) -> 337 + (* Any ARIA-related error *) 338 + show_aria_help () 339 + | _ -> 340 + (* Generic handling *) 341 + display_error msg 342 + ]} 343 + 344 + The error codes are organized into categories: 345 + - [`Attr _]: Attribute errors (missing, invalid, duplicate) 346 + - [`Element _]: Element/content model errors 347 + - [`Aria _]: ARIA accessibility errors 348 + - [`Img _]: Image-related errors 349 + - [`Table _]: Table structure errors 350 + - And more... 351 + 352 + See {!Error_code} for the complete type definition. *) 353 + module Error_code = Error_code