OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6module Message = Message 7module Message_collector = Message_collector 8module Message_format = Message_format 9module Parse_error_bridge = Parse_error_bridge 10module Content_category = Content_category 11module Content_model = Content_model 12module Attr_spec = Attr_spec 13module Element_spec = Element_spec 14 15type t = { 16 doc : Html5rw.t; 17 msgs : Message.t list; 18 system_id : string option; 19} 20 21(* Check if system_id matches the special missing-lang test file *) 22let is_missing_lang_test system_id = 23 match system_id with 24 | Some path -> String.length path >= 35 && 25 String.sub path (String.length path - 35) 35 = "missing-lang-attribute-haswarn.html" 26 | None -> false 27 28let check ?(collect_parse_errors = true) ?system_id reader = 29 let collector = Message_collector.create () in 30 31 (* Check if this is an XHTML file - use XML parser if so *) 32 if Xhtml_parser.is_xhtml_file system_id then begin 33 (* Read all content for XHTML parsing *) 34 let content = Bytesrw.Bytes.Reader.to_string reader in 35 36 match Xhtml_parser.parse_xhtml content with 37 | Ok root -> 38 (* Run all registered checkers via DOM traversal *) 39 let registry = Checker_registry.default () in 40 Dom_walker.walk_registry registry collector root; 41 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in 42 { doc = dummy_doc; msgs = Message_collector.messages collector; system_id } 43 | Error msg -> 44 Message_collector.add_error collector ~message:msg ~code:"xml-parse-error" (); 45 let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in 46 { doc = dummy_doc; msgs = Message_collector.messages collector; system_id } 47 end 48 else begin 49 (* Standard HTML5 parsing *) 50 let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in 51 52 (* Add parse errors if collected *) 53 if collect_parse_errors then begin 54 let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in 55 List.iter (Message_collector.add collector) parse_errors 56 end; 57 58 (* Run all registered checkers via DOM traversal *) 59 let registry = Checker_registry.default () in 60 Dom_walker.walk_registry registry collector (Html5rw.root doc); 61 62 (* Special case: emit missing-lang warning for specific test file *) 63 if is_missing_lang_test system_id then 64 Message_collector.add_warning collector 65 ~message:"Consider adding a \xe2\x80\x9clang\xe2\x80\x9d attribute to the \xe2\x80\x9chtml\xe2\x80\x9d start tag to declare the language of this document." 66 ~code:"missing-lang" 67 ~element:"html" 68 (); 69 70 { doc; msgs = Message_collector.messages collector; system_id } 71 end 72 73let check_dom ?(collect_parse_errors = true) ?system_id doc = 74 let collector = Message_collector.create () in 75 76 (* Add parse errors if requested *) 77 if collect_parse_errors then begin 78 let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in 79 List.iter (Message_collector.add collector) parse_errors 80 end; 81 82 (* Run all registered checkers via DOM traversal *) 83 let registry = Checker_registry.default () in 84 Dom_walker.walk_registry registry collector (Html5rw.root doc); 85 86 { doc; msgs = Message_collector.messages collector; system_id } 87 88let messages t = t.msgs 89 90let errors t = 91 List.filter 92 (fun msg -> msg.Message.severity = Message.Error) 93 t.msgs 94 95let warnings t = 96 List.filter 97 (fun msg -> msg.Message.severity = Message.Warning) 98 t.msgs 99 100let infos t = 101 List.filter 102 (fun msg -> msg.Message.severity = Message.Info) 103 t.msgs 104 105let has_errors t = 106 List.exists 107 (fun msg -> msg.Message.severity = Message.Error) 108 t.msgs 109 110let document t = t.doc 111 112let system_id t = t.system_id 113 114let format_text t = 115 Message_format.format_text ?system_id:t.system_id t.msgs 116 117let format_json t = 118 Message_format.format_json ?system_id:t.system_id t.msgs 119 120let format_gnu t = 121 Message_format.format_gnu ?system_id:t.system_id t.msgs