OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4---------------------------------------------------------------------------*)
5
6(** html5check - HTML5 conformance checker CLI
7
8 Command line interface for validating HTML5 documents. *)
9
10open Cmdliner
11
12let version = "0.1.0"
13
14(** Exit codes *)
15module Exit_code = struct
16 let ok = Cmd.Exit.ok
17 let validation_errors = 1
18 let io_error = 2
19end
20
21(** Read input from file or stdin *)
22let read_input file =
23 try
24 let ic =
25 if file = "-" then stdin
26 else open_in file
27 in
28 let reader = Bytesrw.Bytes.Reader.of_in_channel ic in
29 Ok (reader, ic, file)
30 with
31 | Sys_error msg ->
32 Error (`Io_error (Printf.sprintf "Cannot read file '%s': %s" file msg))
33
34(** Format output based on the requested format *)
35let format_output format result =
36 match format with
37 | `Text -> Htmlrw_check.to_text result
38 | `Json -> Htmlrw_check.to_json result
39 | `Gnu -> Htmlrw_check.to_gnu result
40
41(** Run the validation *)
42let run format errors_only exit_zero quiet verbose file =
43 match read_input file with
44 | Error (`Io_error msg) ->
45 if not quiet then Printf.eprintf "Error: %s\n" msg;
46 Exit_code.io_error
47 | Ok (reader, ic, system_id) ->
48 (* Run validation *)
49 let result = Htmlrw_check.check ~system_id reader in
50 (* Close input if it's not stdin *)
51 if file <> "-" then close_in ic;
52
53 (* Get messages based on filtering *)
54 let messages =
55 if errors_only then Htmlrw_check.errors result
56 else Htmlrw_check.messages result
57 in
58
59 (* Output based on mode *)
60 if quiet then begin
61 (* Only show counts *)
62 let error_count = List.length (Htmlrw_check.errors result) in
63 let warning_count = List.length (Htmlrw_check.warnings result) in
64 if errors_only then
65 Printf.printf "%d error%s\n" error_count (if error_count = 1 then "" else "s")
66 else
67 Printf.printf "%d error%s, %d warning%s\n"
68 error_count (if error_count = 1 then "" else "s")
69 warning_count (if warning_count = 1 then "" else "s")
70 end else begin
71 (* Format and print messages *)
72 let output = format_output format result in
73 if output <> "" then print_string output;
74
75 (* Show summary if verbose *)
76 if verbose && messages <> [] then begin
77 let error_count = List.length (Htmlrw_check.errors result) in
78 let warning_count = List.length (Htmlrw_check.warnings result) in
79 Printf.eprintf "\nSummary: %d error%s, %d warning%s\n"
80 error_count (if error_count = 1 then "" else "s")
81 warning_count (if warning_count = 1 then "" else "s")
82 end
83 end;
84
85 (* Determine exit code *)
86 if exit_zero || not (Htmlrw_check.has_errors result) then
87 Exit_code.ok
88 else
89 Exit_code.validation_errors
90
91(** Command line argument definitions *)
92
93let format_arg =
94 let formats = [("text", `Text); ("json", `Json); ("gnu", `Gnu)] in
95 let doc =
96 "Output format. $(docv) must be one of $(b,text) (human-readable, default), \
97 $(b,json) (Nu validator compatible JSON), or $(b,gnu) (GNU-style for IDE integration)."
98 in
99 Arg.(value & opt (enum formats) `Text & info ["format"] ~docv:"FORMAT" ~doc)
100
101let errors_only_arg =
102 let doc = "Only show errors (suppress warnings)." in
103 Arg.(value & flag & info ["errors-only"] ~doc)
104
105let exit_zero_arg =
106 let doc =
107 "Always exit with status code 0, even if validation errors are found. \
108 Useful for CI pipelines where you want to collect validation results \
109 but not fail the build."
110 in
111 Arg.(value & flag & info ["exit-zero"] ~doc)
112
113let quiet_arg =
114 let doc = "Quiet mode - only show error and warning counts, no details." in
115 Arg.(value & flag & info ["q"; "quiet"] ~doc)
116
117let verbose_arg =
118 let doc = "Verbose mode - show additional information including summary." in
119 Arg.(value & flag & info ["v"; "verbose"] ~doc)
120
121let file_arg =
122 let doc =
123 "HTML file to validate. Use $(b,-) to read from standard input. \
124 If no file is specified, reads from stdin."
125 in
126 Arg.(value & pos 0 string "-" & info [] ~docv:"FILE" ~doc)
127
128let cmd =
129 let doc = "validate HTML5 documents for conformance" in
130 let man = [
131 `S Manpage.s_description;
132 `P "$(tname) validates HTML5 documents against the WHATWG HTML5 specification. \
133 It reports parse errors, structural validation issues, and conformance problems.";
134 `P "The validator checks for:";
135 `I ("Parse errors", "Malformed HTML syntax according to the WHATWG specification");
136 `I ("Content model violations", "Elements in invalid parent/child relationships");
137 `I ("Attribute errors", "Invalid or missing required attributes");
138 `I ("Structural issues", "Other conformance problems");
139 `S Manpage.s_options;
140 `S "OUTPUT FORMATS";
141 `P "The validator supports three output formats:";
142 `I ("$(b,text)", "Human-readable format showing file:line:col: severity: message");
143 `I ("$(b,json)", "JSON format compatible with the Nu Html Checker (v.Nu)");
144 `I ("$(b,gnu)", "GNU-style format for IDE integration (file:line:column: message)");
145 `S "EXIT STATUS";
146 `P "The validator exits with one of the following status codes:";
147 `I ("0", "No validation errors found (or --exit-zero was specified)");
148 `I ("1", "Validation errors were found");
149 `I ("2", "File not found or I/O error");
150 `S Manpage.s_examples;
151 `P "Validate a file:";
152 `Pre " $(mname) index.html";
153 `P "Validate from stdin:";
154 `Pre " cat page.html | $(mname) -";
155 `P "Show only errors in JSON format:";
156 `Pre " $(mname) --format=json --errors-only page.html";
157 `P "Quiet mode for CI:";
158 `Pre " $(mname) --quiet --exit-zero index.html";
159 `S Manpage.s_bugs;
160 `P "Report bugs at https://tangled.org/@anil.recoil.org/ocaml-html5rw/issues";
161 ] in
162 let info = Cmd.info "html5check" ~version ~doc ~man in
163 Cmd.v info Term.(const run $ format_arg $ errors_only_arg $ exit_zero_arg
164 $ quiet_arg $ verbose_arg $ file_arg)
165
166let main () = Cmd.eval' cmd
167let () = Stdlib.exit (main ())