OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4---------------------------------------------------------------------------*)
5
6(** html5check - HTML5 conformance checker CLI
7
8 Command line interface for validating HTML5 documents. *)
9
10open Cmdliner
11
12let version = "0.1.0"
13
14(** Exit codes *)
15module Exit_code = struct
16 let ok = Cmd.Exit.ok
17 let validation_errors = 1
18 let io_error = 2
19end
20
21(** Read input from file or stdin *)
22let read_input file =
23 try
24 let ic =
25 if file = "-" then stdin
26 else open_in file
27 in
28 let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
29 Ok (reader, ic, file)
30 with
31 | Sys_error msg ->
32 Error (`Io_error (Printf.sprintf "Cannot read file '%s': %s" file msg))
33
34(** Format output based on the requested format *)
35let format_output format result =
36 match format with
37 | `Text -> Htmlrw_check.to_text result
38 | `Json -> Htmlrw_check.to_json result
39 | `Gnu -> Htmlrw_check.to_gnu result
40
41(** Run the validation *)
42let run format errors_only exit_zero quiet verbose file =
43 match read_input file with
44 | Error (`Io_error msg) ->
45 if not quiet then Printf.eprintf "Error: %s\n" msg;
46 Exit_code.io_error
47 | Ok (reader, ic, system_id) ->
48 (* Run validation *)
49 let result = Htmlrw_check.check ~system_id reader in
50
51 (* Close input if it's not stdin *)
52 if file <> "-" then close_in ic;
53
54 (* Get messages based on filtering *)
55 let messages =
56 if errors_only then Htmlrw_check.errors result
57 else Htmlrw_check.messages result
58 in
59
60 (* Output based on mode *)
61 if quiet then begin
62 (* Only show counts *)
63 let error_count = List.length (Htmlrw_check.errors result) in
64 let warning_count = List.length (Htmlrw_check.warnings result) in
65 if errors_only then
66 Printf.printf "%d error%s\n" error_count (if error_count = 1 then "" else "s")
67 else
68 Printf.printf "%d error%s, %d warning%s\n"
69 error_count (if error_count = 1 then "" else "s")
70 warning_count (if warning_count = 1 then "" else "s")
71 end else begin
72 (* Format and print messages *)
73 let output = format_output format result in
74 if output <> "" then print_string output;
75
76 (* Show summary if verbose *)
77 if verbose && messages <> [] then begin
78 let error_count = List.length (Htmlrw_check.errors result) in
79 let warning_count = List.length (Htmlrw_check.warnings result) in
80 Printf.eprintf "\nSummary: %d error%s, %d warning%s\n"
81 error_count (if error_count = 1 then "" else "s")
82 warning_count (if warning_count = 1 then "" else "s")
83 end
84 end;
85
86 (* Determine exit code *)
87 if exit_zero || not (Htmlrw_check.has_errors result) then
88 Exit_code.ok
89 else
90 Exit_code.validation_errors
91
92(** Command line argument definitions *)
93
94let format_arg =
95 let formats = [("text", `Text); ("json", `Json); ("gnu", `Gnu)] in
96 let doc =
97 "Output format. $(docv) must be one of $(b,text) (human-readable, default), \
98 $(b,json) (Nu validator compatible JSON), or $(b,gnu) (GNU-style for IDE integration)."
99 in
100 Arg.(value & opt (enum formats) `Text & info ["format"] ~docv:"FORMAT" ~doc)
101
102let errors_only_arg =
103 let doc = "Only show errors (suppress warnings)." in
104 Arg.(value & flag & info ["errors-only"] ~doc)
105
106let exit_zero_arg =
107 let doc =
108 "Always exit with status code 0, even if validation errors are found. \
109 Useful for CI pipelines where you want to collect validation results \
110 but not fail the build."
111 in
112 Arg.(value & flag & info ["exit-zero"] ~doc)
113
114let quiet_arg =
115 let doc = "Quiet mode - only show error and warning counts, no details." in
116 Arg.(value & flag & info ["q"; "quiet"] ~doc)
117
118let verbose_arg =
119 let doc = "Verbose mode - show additional information including summary." in
120 Arg.(value & flag & info ["v"; "verbose"] ~doc)
121
122let file_arg =
123 let doc =
124 "HTML file to validate. Use $(b,-) to read from standard input. \
125 If no file is specified, reads from stdin."
126 in
127 Arg.(value & pos 0 string "-" & info [] ~docv:"FILE" ~doc)
128
129let cmd =
130 let doc = "validate HTML5 documents for conformance" in
131 let man = [
132 `S Manpage.s_description;
133 `P "$(tname) validates HTML5 documents against the WHATWG HTML5 specification. \
134 It reports parse errors, structural validation issues, and conformance problems.";
135 `P "The validator checks for:";
136 `I ("Parse errors", "Malformed HTML syntax according to the WHATWG specification");
137 `I ("Content model violations", "Elements in invalid parent/child relationships");
138 `I ("Attribute errors", "Invalid or missing required attributes");
139 `I ("Structural issues", "Other conformance problems");
140 `S Manpage.s_options;
141 `S "OUTPUT FORMATS";
142 `P "The validator supports three output formats:";
143 `I ("$(b,text)", "Human-readable format showing file:line:col: severity: message");
144 `I ("$(b,json)", "JSON format compatible with the Nu Html Checker (v.Nu)");
145 `I ("$(b,gnu)", "GNU-style format for IDE integration (file:line:column: message)");
146 `S "EXIT STATUS";
147 `P "The validator exits with one of the following status codes:";
148 `I ("0", "No validation errors found (or --exit-zero was specified)");
149 `I ("1", "Validation errors were found");
150 `I ("2", "File not found or I/O error");
151 `S Manpage.s_examples;
152 `P "Validate a file:";
153 `Pre " $(mname) index.html";
154 `P "Validate from stdin:";
155 `Pre " cat page.html | $(mname) -";
156 `P "Show only errors in JSON format:";
157 `Pre " $(mname) --format=json --errors-only page.html";
158 `P "Quiet mode for CI:";
159 `Pre " $(mname) --quiet --exit-zero index.html";
160 `S Manpage.s_bugs;
161 `P "Report bugs at https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues";
162 ] in
163 let info = Cmd.info "html5check" ~version ~doc ~man in
164 Cmd.v info Term.(const run $ format_arg $ errors_only_arg $ exit_zero_arg
165 $ quiet_arg $ verbose_arg $ file_arg)
166
167let main () = Cmd.eval' cmd
168let () = Stdlib.exit (main ())