OCaml HTML5 parser/serialiser based on Python's JustHTML
at validator 6.3 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved. 3 SPDX-License-Identifier: MIT 4---------------------------------------------------------------------------*) 5 6(** html5check - HTML5 conformance checker CLI 7 8 Command line interface for validating HTML5 documents. *) 9 10open Cmdliner 11 12let version = "0.1.0" 13 14(** Exit codes *) 15module Exit_code = struct 16 let ok = Cmd.Exit.ok 17 let validation_errors = 1 18 let io_error = 2 19end 20 21(** Read input from file or stdin *) 22let read_input file = 23 try 24 let ic = 25 if file = "-" then stdin 26 else open_in file 27 in 28 let reader = Bytesrw.Bytes.Reader.of_in_channel ic in 29 Ok (reader, ic, file) 30 with 31 | Sys_error msg -> 32 Error (`Io_error (Printf.sprintf "Cannot read file '%s': %s" file msg)) 33 34(** Format output based on the requested format *) 35let format_output format result = 36 match format with 37 | `Text -> Htmlrw_check.to_text result 38 | `Json -> Htmlrw_check.to_json result 39 | `Gnu -> Htmlrw_check.to_gnu result 40 41(** Run the validation *) 42let run format errors_only exit_zero quiet verbose file = 43 match read_input file with 44 | Error (`Io_error msg) -> 45 if not quiet then Printf.eprintf "Error: %s\n" msg; 46 Exit_code.io_error 47 | Ok (reader, ic, system_id) -> 48 (* Run validation *) 49 let result = Htmlrw_check.check ~system_id reader in 50 51 (* Close input if it's not stdin *) 52 if file <> "-" then close_in ic; 53 54 (* Get messages based on filtering *) 55 let messages = 56 if errors_only then Htmlrw_check.errors result 57 else Htmlrw_check.messages result 58 in 59 60 (* Output based on mode *) 61 if quiet then begin 62 (* Only show counts *) 63 let error_count = List.length (Htmlrw_check.errors result) in 64 let warning_count = List.length (Htmlrw_check.warnings result) in 65 if errors_only then 66 Printf.printf "%d error%s\n" error_count (if error_count = 1 then "" else "s") 67 else 68 Printf.printf "%d error%s, %d warning%s\n" 69 error_count (if error_count = 1 then "" else "s") 70 warning_count (if warning_count = 1 then "" else "s") 71 end else begin 72 (* Format and print messages *) 73 let output = format_output format result in 74 if output <> "" then print_string output; 75 76 (* Show summary if verbose *) 77 if verbose && messages <> [] then begin 78 let error_count = List.length (Htmlrw_check.errors result) in 79 let warning_count = List.length (Htmlrw_check.warnings result) in 80 Printf.eprintf "\nSummary: %d error%s, %d warning%s\n" 81 error_count (if error_count = 1 then "" else "s") 82 warning_count (if warning_count = 1 then "" else "s") 83 end 84 end; 85 86 (* Determine exit code *) 87 if exit_zero || not (Htmlrw_check.has_errors result) then 88 Exit_code.ok 89 else 90 Exit_code.validation_errors 91 92(** Command line argument definitions *) 93 94let format_arg = 95 let formats = [("text", `Text); ("json", `Json); ("gnu", `Gnu)] in 96 let doc = 97 "Output format. $(docv) must be one of $(b,text) (human-readable, default), \ 98 $(b,json) (Nu validator compatible JSON), or $(b,gnu) (GNU-style for IDE integration)." 99 in 100 Arg.(value & opt (enum formats) `Text & info ["format"] ~docv:"FORMAT" ~doc) 101 102let errors_only_arg = 103 let doc = "Only show errors (suppress warnings)." in 104 Arg.(value & flag & info ["errors-only"] ~doc) 105 106let exit_zero_arg = 107 let doc = 108 "Always exit with status code 0, even if validation errors are found. \ 109 Useful for CI pipelines where you want to collect validation results \ 110 but not fail the build." 111 in 112 Arg.(value & flag & info ["exit-zero"] ~doc) 113 114let quiet_arg = 115 let doc = "Quiet mode - only show error and warning counts, no details." in 116 Arg.(value & flag & info ["q"; "quiet"] ~doc) 117 118let verbose_arg = 119 let doc = "Verbose mode - show additional information including summary." in 120 Arg.(value & flag & info ["v"; "verbose"] ~doc) 121 122let file_arg = 123 let doc = 124 "HTML file to validate. Use $(b,-) to read from standard input. \ 125 If no file is specified, reads from stdin." 126 in 127 Arg.(value & pos 0 string "-" & info [] ~docv:"FILE" ~doc) 128 129let cmd = 130 let doc = "validate HTML5 documents for conformance" in 131 let man = [ 132 `S Manpage.s_description; 133 `P "$(tname) validates HTML5 documents against the WHATWG HTML5 specification. \ 134 It reports parse errors, structural validation issues, and conformance problems."; 135 `P "The validator checks for:"; 136 `I ("Parse errors", "Malformed HTML syntax according to the WHATWG specification"); 137 `I ("Content model violations", "Elements in invalid parent/child relationships"); 138 `I ("Attribute errors", "Invalid or missing required attributes"); 139 `I ("Structural issues", "Other conformance problems"); 140 `S Manpage.s_options; 141 `S "OUTPUT FORMATS"; 142 `P "The validator supports three output formats:"; 143 `I ("$(b,text)", "Human-readable format showing file:line:col: severity: message"); 144 `I ("$(b,json)", "JSON format compatible with the Nu Html Checker (v.Nu)"); 145 `I ("$(b,gnu)", "GNU-style format for IDE integration (file:line:column: message)"); 146 `S "EXIT STATUS"; 147 `P "The validator exits with one of the following status codes:"; 148 `I ("0", "No validation errors found (or --exit-zero was specified)"); 149 `I ("1", "Validation errors were found"); 150 `I ("2", "File not found or I/O error"); 151 `S Manpage.s_examples; 152 `P "Validate a file:"; 153 `Pre " $(mname) index.html"; 154 `P "Validate from stdin:"; 155 `Pre " cat page.html | $(mname) -"; 156 `P "Show only errors in JSON format:"; 157 `Pre " $(mname) --format=json --errors-only page.html"; 158 `P "Quiet mode for CI:"; 159 `Pre " $(mname) --quiet --exit-zero index.html"; 160 `S Manpage.s_bugs; 161 `P "Report bugs at https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues"; 162 ] in 163 let info = Cmd.info "html5check" ~version ~doc ~man in 164 Cmd.v info Term.(const run $ format_arg $ errors_only_arg $ exit_zero_arg 165 $ quiet_arg $ verbose_arg $ file_arg) 166 167let main () = Cmd.eval' cmd 168let () = Stdlib.exit (main ())