OCaml HTML5 parser/serialiser based on Python's JustHTML
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: MIT
4 ---------------------------------------------------------------------------*)
5
6let q = Error_code.q
7
8(** Generate human-readable message for a parse error code *)
9let message_of_parse_error code =
10 let code_str = Html5rw.Parse_error_code.to_string code in
11 match code with
12 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
13 "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag."
14 | Html5rw.Parse_error_code.Null_character_reference ->
15 "Character reference expands to zero."
16 | Html5rw.Parse_error_code.Tree_construction_error s ->
17 (* Check for control-character/noncharacter/surrogate with codepoint info *)
18 (try
19 if String.starts_with ~prefix:"control-character-in-input-s" s then
20 let colon_pos = String.index s ':' in
21 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
22 let cp = int_of_string ("0x" ^ cp_str) in
23 Printf.sprintf "Forbidden code point U+%04x." cp
24 else if String.starts_with ~prefix:"noncharacter-in-input-str" s then
25 let colon_pos = String.index s ':' in
26 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
27 let cp = int_of_string ("0x" ^ cp_str) in
28 Printf.sprintf "Forbidden code point U+%04x." cp
29 else if String.starts_with ~prefix:"surrogate-in-input-str" s then
30 let colon_pos = String.index s ':' in
31 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
32 let cp = int_of_string ("0x" ^ cp_str) in
33 Printf.sprintf "Forbidden code point U+%04x." cp
34 (* Character reference errors *)
35 else if String.starts_with ~prefix:"control-character-reference:" s then
36 let cp_str = String.sub s 28 (String.length s - 28) in
37 let cp = int_of_string ("0x" ^ cp_str) in
38 if cp = 0x0D then
39 "A numeric character reference expanded to carriage return."
40 else
41 Printf.sprintf "Character reference expands to a control character (U+%04x)." cp
42 else if String.starts_with ~prefix:"noncharacter-character-referenc" s then
43 let colon_pos = String.index s ':' in
44 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
45 let cp = int_of_string ("0x" ^ cp_str) in
46 (* U+FDD0-U+FDEF are "permanently unassigned" *)
47 if cp >= 0xFDD0 && cp <= 0xFDEF then
48 "Character reference expands to a permanently unassigned code point."
49 (* Astral noncharacters (planes 1-16) *)
50 else if cp >= 0x10000 then
51 Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp
52 else
53 Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp
54 else if String.starts_with ~prefix:"character-reference-outside-unicode-" s then
55 "Character reference outside the permissible Unicode range."
56 else if String.starts_with ~prefix:"surrogate-character-referen" s then
57 let colon_pos = String.index s ':' in
58 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
59 let cp = int_of_string ("0x" ^ cp_str) in
60 Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp
61 else if s = "no-p-element-in-scope" then
62 Printf.sprintf "No %s element in scope but a %s end tag seen." (q "p") (q "p")
63 else if s = "end-tag-p-implied-but-open-elements" then
64 Printf.sprintf "End tag %s implied, but there were open elements." (q "p")
65 else if s = "end-tag-br" then
66 Printf.sprintf "End tag %s." (q "br")
67 else if s = "expected-closing-tag-but-got-eof" then
68 "End of file seen and there were open elements."
69 else if String.starts_with ~prefix:"bad-start-tag-in-head-noscri" s then
70 let colon_pos = String.index s ':' in
71 let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
72 Printf.sprintf "Bad start tag in %s in %s in %s." (q element) (q "noscript") (q "head")
73 else if String.starts_with ~prefix:"unexpected-end-tag:" s then
74 let element = String.sub s 19 (String.length s - 19) in
75 Printf.sprintf "Stray end tag %s." (q element)
76 else if String.starts_with ~prefix:"start-tag-in-table:" s then
77 let tag = String.sub s 19 (String.length s - 19) in
78 Printf.sprintf "Start tag %s seen in %s." (q tag) (q "table")
79 else
80 Printf.sprintf "Parse error: %s" s
81 with _ -> Printf.sprintf "Parse error: %s" s)
82 | _ -> Printf.sprintf "Parse error: %s" code_str
83
84let of_parse_error ?system_id err =
85 let code = Html5rw.error_code err in
86 let line = Html5rw.error_line err in
87 let column = Html5rw.error_column err in
88 let location = Message.make_location ~line ~column ?system_id () in
89 let message = message_of_parse_error code in
90 Message.of_parse_error ~location ~message code
91
92let collect_parse_errors ?system_id result =
93 let errors = Html5rw.errors result in
94 let is_xhtml = match system_id with
95 | Some s -> String.length s > 6 && String.sub s (String.length s - 6) 6 = ".xhtml"
96 | None -> false
97 in
98 let filtered_errors =
99 if is_xhtml then
100 (* XHTML has different requirements than HTML:
101 - No DOCTYPE required
102 - Self-closing syntax is valid for all elements *)
103 List.filter (fun err ->
104 match Html5rw.error_code err with
105 | Html5rw.Parse_error_code.Tree_construction_error "expected-doctype-but-got-other" -> false
106 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> false
107 | _ -> true
108 ) errors
109 else errors
110 in
111 List.map (of_parse_error ?system_id) filtered_errors