OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4---------------------------------------------------------------------------*) 5 6(** html5check - HTML5 conformance checker CLI 7 8 Command line interface for validating HTML5 documents. *) 9 10open Cmdliner 11 12let version = "0.1.0" 13 14(** Exit codes *) 15module Exit_code = struct 16 let ok = Cmd.Exit.ok 17 let validation_errors = 1 18 let io_error = 2 19end 20 21(** Read input from file or stdin *) 22let read_input file = 23 try 24 let ic = 25 if file = "-" then stdin 26 else open_in file 27 in 28 let reader = Bytesrw.Bytes.Reader.of_in_channel ic in 29 Ok (reader, ic, file) 30 with 31 | Sys_error msg -> 32 Error (`Io_error (Printf.sprintf "Cannot read file '%s': %s" file msg)) 33 34(** Format output based on the requested format *) 35let format_output format result = 36 match format with 37 | `Text -> Htmlrw_check.to_text result 38 | `Json -> Htmlrw_check.to_json result 39 | `Gnu -> Htmlrw_check.to_gnu result 40 41(** Run the validation *) 42let run format errors_only exit_zero quiet verbose file = 43 match read_input file with 44 | Error (`Io_error msg) -> 45 if not quiet then Printf.eprintf "Error: %s\n" msg; 46 Exit_code.io_error 47 | Ok (reader, ic, system_id) -> 48 (* Run validation *) 49 let result = Htmlrw_check.check ~system_id reader in 50 (* Close input if it's not stdin *) 51 if file <> "-" then close_in ic; 52 53 (* Get messages based on filtering *) 54 let messages = 55 if errors_only then Htmlrw_check.errors result 56 else Htmlrw_check.messages result 57 in 58 59 (* Output based on mode *) 60 if quiet then begin 61 (* Only show counts *) 62 let error_count = List.length (Htmlrw_check.errors result) in 63 let warning_count = List.length (Htmlrw_check.warnings result) in 64 if errors_only then 65 Printf.printf "%d error%s\n" error_count (if error_count = 1 then "" else "s") 66 else 67 Printf.printf "%d error%s, %d warning%s\n" 68 error_count (if error_count = 1 then "" else "s") 69 warning_count (if warning_count = 1 then "" else "s") 70 end else begin 71 (* Format and print messages *) 72 let output = format_output format result in 73 if output <> "" then print_string output; 74 75 (* Show summary if verbose *) 76 if verbose && messages <> [] then begin 77 let error_count = List.length (Htmlrw_check.errors result) in 78 let warning_count = List.length (Htmlrw_check.warnings result) in 79 Printf.eprintf "\nSummary: %d error%s, %d warning%s\n" 80 error_count (if error_count = 1 then "" else "s") 81 warning_count (if warning_count = 1 then "" else "s") 82 end 83 end; 84 85 (* Determine exit code *) 86 if exit_zero || not (Htmlrw_check.has_errors result) then 87 Exit_code.ok 88 else 89 Exit_code.validation_errors 90 91(** Command line argument definitions *) 92 93let format_arg = 94 let formats = [("text", `Text); ("json", `Json); ("gnu", `Gnu)] in 95 let doc = 96 "Output format. $(docv) must be one of $(b,text) (human-readable, default), \ 97 $(b,json) (Nu validator compatible JSON), or $(b,gnu) (GNU-style for IDE integration)." 98 in 99 Arg.(value & opt (enum formats) `Text & info ["format"] ~docv:"FORMAT" ~doc) 100 101let errors_only_arg = 102 let doc = "Only show errors (suppress warnings)." in 103 Arg.(value & flag & info ["errors-only"] ~doc) 104 105let exit_zero_arg = 106 let doc = 107 "Always exit with status code 0, even if validation errors are found. \ 108 Useful for CI pipelines where you want to collect validation results \ 109 but not fail the build." 110 in 111 Arg.(value & flag & info ["exit-zero"] ~doc) 112 113let quiet_arg = 114 let doc = "Quiet mode - only show error and warning counts, no details." in 115 Arg.(value & flag & info ["q"; "quiet"] ~doc) 116 117let verbose_arg = 118 let doc = "Verbose mode - show additional information including summary." in 119 Arg.(value & flag & info ["v"; "verbose"] ~doc) 120 121let file_arg = 122 let doc = 123 "HTML file to validate. Use $(b,-) to read from standard input. \ 124 If no file is specified, reads from stdin." 125 in 126 Arg.(value & pos 0 string "-" & info [] ~docv:"FILE" ~doc) 127 128let cmd = 129 let doc = "validate HTML5 documents for conformance" in 130 let man = [ 131 `S Manpage.s_description; 132 `P "$(tname) validates HTML5 documents against the WHATWG HTML5 specification. \ 133 It reports parse errors, structural validation issues, and conformance problems."; 134 `P "The validator checks for:"; 135 `I ("Parse errors", "Malformed HTML syntax according to the WHATWG specification"); 136 `I ("Content model violations", "Elements in invalid parent/child relationships"); 137 `I ("Attribute errors", "Invalid or missing required attributes"); 138 `I ("Structural issues", "Other conformance problems"); 139 `S Manpage.s_options; 140 `S "OUTPUT FORMATS"; 141 `P "The validator supports three output formats:"; 142 `I ("$(b,text)", "Human-readable format showing file:line:col: severity: message"); 143 `I ("$(b,json)", "JSON format compatible with the Nu Html Checker (v.Nu)"); 144 `I ("$(b,gnu)", "GNU-style format for IDE integration (file:line:column: message)"); 145 `S "EXIT STATUS"; 146 `P "The validator exits with one of the following status codes:"; 147 `I ("0", "No validation errors found (or --exit-zero was specified)"); 148 `I ("1", "Validation errors were found"); 149 `I ("2", "File not found or I/O error"); 150 `S Manpage.s_examples; 151 `P "Validate a file:"; 152 `Pre " $(mname) index.html"; 153 `P "Validate from stdin:"; 154 `Pre " cat page.html | $(mname) -"; 155 `P "Show only errors in JSON format:"; 156 `Pre " $(mname) --format=json --errors-only page.html"; 157 `P "Quiet mode for CI:"; 158 `Pre " $(mname) --quiet --exit-zero index.html"; 159 `S Manpage.s_bugs; 160 `P "Report bugs at https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues"; 161 ] in 162 let info = Cmd.info "html5check" ~version ~doc ~man in 163 Cmd.v info Term.(const run $ format_arg $ errors_only_arg $ exit_zero_arg 164 $ quiet_arg $ verbose_arg $ file_arg) 165 166let main () = Cmd.eval' cmd 167let () = Stdlib.exit (main ())