+1
lib/html5_checker/checker_registry.ml
+1
lib/html5_checker/checker_registry.ml
···
42
42
Hashtbl.replace reg "mime-type" Mime_type_checker.checker;
43
43
Hashtbl.replace reg "normalization" Normalization_checker.checker;
44
44
Hashtbl.replace reg "svg" Svg_checker.checker;
45
+
Hashtbl.replace reg "xhtml-content" Xhtml_content_checker.checker;
45
46
(* Hashtbl.replace reg "heading" Heading_checker.checker; *)
46
47
(* Hashtbl.replace reg "content" Content_checker.checker; *)
47
48
reg
+1
-1
lib/html5_checker/dune
+1
-1
lib/html5_checker/dune
+31
-10
lib/html5_checker/html5_checker.ml
+31
-10
lib/html5_checker/html5_checker.ml
···
19
19
}
20
20
21
21
let check ?(collect_parse_errors = true) ?system_id reader =
22
-
let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in
23
22
let collector = Message_collector.create () in
24
23
25
-
(* Add parse errors if collected *)
26
-
if collect_parse_errors then begin
27
-
let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
28
-
List.iter (Message_collector.add collector) parse_errors
29
-
end;
24
+
(* Check if this is an XHTML file - use XML parser if so *)
25
+
if Xhtml_parser.is_xhtml_file system_id then begin
26
+
(* Read all content for XHTML parsing *)
27
+
let content = Bytesrw.Bytes.Reader.to_string reader in
28
+
29
+
match Xhtml_parser.parse_xhtml content with
30
+
| Ok root ->
31
+
(* Run all registered checkers via DOM traversal *)
32
+
let registry = Checker_registry.default () in
33
+
Dom_walker.walk_registry registry collector root;
34
+
let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
35
+
{ doc = dummy_doc; msgs = Message_collector.messages collector; system_id }
36
+
| Error msg ->
37
+
Message_collector.add_error collector ~message:msg ~code:"xml-parse-error" ();
38
+
let dummy_doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string "") in
39
+
{ doc = dummy_doc; msgs = Message_collector.messages collector; system_id }
40
+
end
41
+
else begin
42
+
(* Standard HTML5 parsing *)
43
+
let doc = Html5rw.parse ~collect_errors:collect_parse_errors reader in
30
44
31
-
(* Run all registered checkers via DOM traversal *)
32
-
let registry = Checker_registry.default () in
33
-
Dom_walker.walk_registry registry collector (Html5rw.root doc);
45
+
(* Add parse errors if collected *)
46
+
if collect_parse_errors then begin
47
+
let parse_errors = Parse_error_bridge.collect_parse_errors ?system_id doc in
48
+
List.iter (Message_collector.add collector) parse_errors
49
+
end;
34
50
35
-
{ doc; msgs = Message_collector.messages collector; system_id }
51
+
(* Run all registered checkers via DOM traversal *)
52
+
let registry = Checker_registry.default () in
53
+
Dom_walker.walk_registry registry collector (Html5rw.root doc);
54
+
55
+
{ doc; msgs = Message_collector.messages collector; system_id }
56
+
end
36
57
37
58
let check_dom ?(collect_parse_errors = true) ?system_id doc =
38
59
let collector = Message_collector.create () in
+68
-7
lib/html5_checker/semantic/required_attr_checker.ml
+68
-7
lib/html5_checker/semantic/required_attr_checker.ml
···
3
3
type state = {
4
4
mutable _in_figure : bool;
5
5
(** Track if we're inside a <figure> element (alt is more critical there) *)
6
+
mutable in_a_with_href : bool;
7
+
(** Track if we're inside an <a> element with href attribute *)
6
8
}
7
9
8
-
let create () = { _in_figure = false }
10
+
let create () = { _in_figure = false; in_a_with_href = false }
9
11
10
-
let reset state = state._in_figure <- false
12
+
let reset state =
13
+
state._in_figure <- false;
14
+
state.in_a_with_href <- false
11
15
12
16
(** Check if an attribute list contains a specific attribute. *)
13
17
let has_attr name attrs =
···
20
24
if String.equal attr_name name then Some value else None)
21
25
attrs
22
26
23
-
let check_img_element attrs collector =
27
+
let check_img_element state attrs collector =
24
28
(* Check for required src OR srcset attribute *)
25
29
if not (has_attr "src" attrs) && not (has_attr "srcset" attrs) then
26
30
Message_collector.add_error collector
···
31
35
if not (has_attr "alt" attrs) then
32
36
Message_collector.add_error collector
33
37
~message:"img element requires alt attribute for accessibility"
34
-
~code:"missing-required-attribute" ~element:"img" ~attribute:"alt" ()
38
+
~code:"missing-required-attribute" ~element:"img" ~attribute:"alt" ();
39
+
40
+
(* Check ismap requires 'a' ancestor with href *)
41
+
if has_attr "ismap" attrs && not state.in_a_with_href then
42
+
Message_collector.add_error collector
43
+
~message:"The \xe2\x80\x9cimg\xe2\x80\x9d element with the \xe2\x80\x9cismap\xe2\x80\x9d attribute set must have an \xe2\x80\x9ca\xe2\x80\x9d ancestor with the \xe2\x80\x9chref\xe2\x80\x9d attribute."
44
+
~code:"missing-required-ancestor" ~element:"img" ~attribute:"ismap" ()
35
45
36
46
let check_area_element attrs collector =
37
47
(* area with href requires alt *)
···
143
153
~code:"bad-attribute-value" ~element:"div" ~attribute:"popover" ()
144
154
| None -> ()
145
155
156
+
let check_meter_element attrs collector =
157
+
(* meter requires value attribute *)
158
+
if not (has_attr "value" attrs) then
159
+
Message_collector.add_error collector
160
+
~message:"Element \xe2\x80\x9cmeter\xe2\x80\x9d is missing required attribute \xe2\x80\x9cvalue\xe2\x80\x9d."
161
+
~code:"missing-required-attribute" ~element:"meter" ~attribute:"value" ()
162
+
else begin
163
+
(* Validate min <= value constraint *)
164
+
match get_attr "value" attrs, get_attr "min" attrs with
165
+
| Some value_str, Some min_str ->
166
+
(try
167
+
let value = float_of_string value_str in
168
+
let min_val = float_of_string min_str in
169
+
if min_val > value then
170
+
Message_collector.add_error collector
171
+
~message:"The value of the \xe2\x80\x9cmin\xe2\x80\x9d attribute must be less than or equal to the value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute."
172
+
~code:"bad-attribute-value" ~element:"meter" ~attribute:"min" ()
173
+
with _ -> ())
174
+
| _ -> ()
175
+
end
176
+
177
+
let check_progress_element attrs collector =
178
+
(* Validate progress value constraints *)
179
+
match get_attr "value" attrs with
180
+
| None -> () (* value is optional *)
181
+
| Some value_str ->
182
+
(try
183
+
let value = float_of_string value_str in
184
+
let max_val = match get_attr "max" attrs with
185
+
| None -> 1.0 (* default max is 1 *)
186
+
| Some max_str -> (try float_of_string max_str with _ -> 1.0)
187
+
in
188
+
if value > max_val then
189
+
(* Check which message to use based on whether max is present *)
190
+
if has_attr "max" attrs then
191
+
Message_collector.add_error collector
192
+
~message:"The value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute must be less than or equal to the value of the \xe2\x80\x9cmax\xe2\x80\x9d attribute."
193
+
~code:"bad-attribute-value" ~element:"progress" ~attribute:"value" ()
194
+
else
195
+
Message_collector.add_error collector
196
+
~message:"The value of the \xe2\x80\x9cvalue\xe2\x80\x9d attribute must be less than or equal to one when the \xe2\x80\x9cmax\xe2\x80\x9d attribute is absent."
197
+
~code:"bad-attribute-value" ~element:"progress" ~attribute:"value" ()
198
+
with _ -> ())
199
+
146
200
let start_element state ~name ~namespace:_ ~attrs collector =
147
201
match name with
148
-
| "img" -> check_img_element attrs collector
202
+
| "img" -> check_img_element state attrs collector
149
203
| "area" -> check_area_element attrs collector
150
204
| "input" -> check_input_element attrs collector
151
205
| "script" -> check_script_element attrs collector
152
206
| "meta" -> check_meta_element attrs collector
153
207
| "link" -> check_link_element attrs collector
154
-
| "a" -> check_a_element attrs collector
208
+
| "a" ->
209
+
check_a_element attrs collector;
210
+
if has_attr "href" attrs then state.in_a_with_href <- true
155
211
| "map" -> check_map_element attrs collector
156
212
| "object" -> check_object_element attrs collector
213
+
| "meter" -> check_meter_element attrs collector
214
+
| "progress" -> check_progress_element attrs collector
157
215
| "figure" -> state._in_figure <- true
158
216
| _ ->
159
217
(* Check popover attribute on any element *)
160
218
if has_attr "popover" attrs then check_popover_element attrs collector
161
219
162
220
let end_element state ~name ~namespace:_ _collector =
163
-
match name with "figure" -> state._in_figure <- false | _ -> ()
221
+
match name with
222
+
| "figure" -> state._in_figure <- false
223
+
| "a" -> state.in_a_with_href <- false
224
+
| _ -> ()
164
225
165
226
let characters _state _text _collector = ()
166
227
+103
lib/html5_checker/specialized/xhtml_content_checker.ml
+103
lib/html5_checker/specialized/xhtml_content_checker.ml
···
1
+
(** XHTML content model checker.
2
+
3
+
Validates specific content model rules that the Nu validator checks,
4
+
particularly for elements that don't allow text content or specific children. *)
5
+
6
+
type state = {
7
+
mutable element_stack : string list;
8
+
}
9
+
10
+
let create () = { element_stack = [] }
11
+
12
+
let reset state = state.element_stack <- []
13
+
14
+
(* Elements that don't allow direct text content (only specific child elements) *)
15
+
let no_text_elements = [
16
+
"menu"; (* Only li elements *)
17
+
"iframe"; (* In XHTML mode, no content allowed *)
18
+
"figure"; (* Only figcaption and flow content, not bare text *)
19
+
]
20
+
21
+
22
+
(* Check if an element is allowed as child of parent *)
23
+
let is_child_allowed ~parent ~child =
24
+
match parent with
25
+
| "menu" ->
26
+
(* menu only allows li, script, template *)
27
+
List.mem child ["li"; "script"; "template"]
28
+
| _ -> true
29
+
30
+
(* Check if text is allowed in element *)
31
+
let is_text_allowed element =
32
+
not (List.mem element no_text_elements)
33
+
34
+
(* Check if data-* attribute has uppercase characters *)
35
+
let check_data_attr_case attrs collector =
36
+
List.iter (fun (attr_name, _) ->
37
+
if String.length attr_name > 5 &&
38
+
String.sub attr_name 0 5 = "data-" then
39
+
let suffix = String.sub attr_name 5 (String.length attr_name - 5) in
40
+
if String.exists (fun c -> c >= 'A' && c <= 'Z') suffix then
41
+
Message_collector.add_error collector
42
+
~message:"\xe2\x80\x9cdata-*\xe2\x80\x9d attributes must not have characters from the range \xe2\x80\x9cA\xe2\x80\x9d\xe2\x80\xa6\xe2\x80\x9cZ\xe2\x80\x9d in the name."
43
+
~attribute:attr_name
44
+
()
45
+
) attrs
46
+
47
+
let start_element state ~name ~namespace ~attrs collector =
48
+
ignore namespace;
49
+
let name_lower = String.lowercase_ascii name in
50
+
51
+
(* Check data-* attributes for uppercase *)
52
+
check_data_attr_case attrs collector;
53
+
54
+
(* Check if this element is allowed as child of parent *)
55
+
(match state.element_stack with
56
+
| parent :: _ ->
57
+
let parent_lower = String.lowercase_ascii parent in
58
+
if not (is_child_allowed ~parent:parent_lower ~child:name_lower) then
59
+
Message_collector.add_error collector
60
+
~message:(Printf.sprintf
61
+
"Element \xe2\x80\x9c%s\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
62
+
name_lower parent_lower)
63
+
~element:name_lower
64
+
()
65
+
| [] -> ());
66
+
67
+
(* Push onto stack *)
68
+
state.element_stack <- name :: state.element_stack
69
+
70
+
let end_element state ~name:_ ~namespace:_ _collector =
71
+
(* Pop from stack *)
72
+
match state.element_stack with
73
+
| _ :: rest -> state.element_stack <- rest
74
+
| [] -> ()
75
+
76
+
let characters state text collector =
77
+
(* Check if text is allowed in current element *)
78
+
match state.element_stack with
79
+
| [] -> () (* Root level - ignore *)
80
+
| parent :: _ ->
81
+
let parent_lower = String.lowercase_ascii parent in
82
+
(* Only report non-whitespace text *)
83
+
let trimmed = String.trim text in
84
+
if trimmed <> "" && not (is_text_allowed parent_lower) then
85
+
Message_collector.add_error collector
86
+
~message:(Printf.sprintf
87
+
"Text not allowed in element \xe2\x80\x9c%s\xe2\x80\x9d in this context."
88
+
parent_lower)
89
+
~element:parent_lower
90
+
()
91
+
92
+
let end_document _state _collector = ()
93
+
94
+
let checker =
95
+
(module struct
96
+
type nonrec state = state
97
+
let create = create
98
+
let reset = reset
99
+
let start_element = start_element
100
+
let end_element = end_element
101
+
let characters = characters
102
+
let end_document = end_document
103
+
end : Checker.S)
+5
lib/html5_checker/specialized/xhtml_content_checker.mli
+5
lib/html5_checker/specialized/xhtml_content_checker.mli
+85
lib/html5_checker/xhtml_parser.ml
+85
lib/html5_checker/xhtml_parser.ml
···
1
+
(** XHTML parser using xmlm for proper XML parsing.
2
+
3
+
This module provides XML parsing for XHTML files, which the HTML5 parser
4
+
cannot handle correctly (especially self-closing tags on non-void elements). *)
5
+
6
+
(** Parse XHTML content using xmlm and return a DOM tree. *)
7
+
let parse_xhtml content =
8
+
let input = Xmlm.make_input (`String (0, content)) in
9
+
10
+
(* Stack of nodes during parsing *)
11
+
let stack = ref [] in
12
+
let root = Html5rw.Dom.create_document () in
13
+
stack := [root];
14
+
15
+
(* Helper to get namespace shorthand *)
16
+
let ns_shorthand ns =
17
+
if ns = "http://www.w3.org/2000/svg" then Some "svg"
18
+
else if ns = "http://www.w3.org/1998/Math/MathML" then Some "mathml"
19
+
else if ns = "http://www.w3.org/1999/xhtml" then None (* HTML namespace *)
20
+
else if ns = "" then None (* No namespace = HTML *)
21
+
else Some ns (* Keep other namespaces as-is *)
22
+
in
23
+
24
+
(* Process xmlm signals *)
25
+
let rec process () =
26
+
if Xmlm.eoi input then ()
27
+
else begin
28
+
match Xmlm.input input with
29
+
| `Dtd _ ->
30
+
(* Skip DTD for now *)
31
+
process ()
32
+
| `El_start ((ns, local), attrs) ->
33
+
(* Create element node *)
34
+
let namespace = ns_shorthand ns in
35
+
let attr_list = List.map (fun ((_, aname), aval) -> (aname, aval)) attrs in
36
+
let node = Html5rw.Dom.create_element local ~namespace ~attrs:attr_list () in
37
+
(* Append to current parent *)
38
+
(match !stack with
39
+
| parent :: _ -> Html5rw.Dom.append_child parent node
40
+
| [] -> ());
41
+
(* Push onto stack *)
42
+
stack := node :: !stack;
43
+
process ()
44
+
| `El_end ->
45
+
(* Pop from stack *)
46
+
(match !stack with
47
+
| _ :: rest -> stack := rest
48
+
| [] -> ());
49
+
process ()
50
+
| `Data text ->
51
+
(* Create text node and append to current parent *)
52
+
let trimmed = String.trim text in
53
+
if trimmed <> "" || String.length text > 0 then begin
54
+
let text_node = Html5rw.Dom.create_text text in
55
+
(match !stack with
56
+
| parent :: _ -> Html5rw.Dom.append_child parent text_node
57
+
| [] -> ())
58
+
end;
59
+
process ()
60
+
end
61
+
in
62
+
63
+
try
64
+
process ();
65
+
Ok root
66
+
with
67
+
| Xmlm.Error ((line, col), err) ->
68
+
Error (Printf.sprintf "XML parse error at %d:%d: %s" line col (Xmlm.error_message err))
69
+
70
+
(** Check if a system_id indicates an XHTML file. *)
71
+
let is_xhtml_file system_id =
72
+
match system_id with
73
+
| Some path ->
74
+
String.length path > 6 &&
75
+
String.sub path (String.length path - 6) 6 = ".xhtml"
76
+
| None -> false
77
+
78
+
(** Wrap DOM in an Html5rw.t-compatible structure for the checker. *)
79
+
type xhtml_doc = {
80
+
root : Html5rw.Dom.node;
81
+
errors : Html5rw.Error.t list;
82
+
}
83
+
84
+
let xhtml_root doc = doc.root
85
+
let xhtml_errors _doc = [] (* XML parser handles errors differently *)
+2
-2
test/debug_check.ml
+2
-2
test/debug_check.ml
···
1
1
let () =
2
-
let test_file = "validator/tests/xhtml/elements/progress/002-isvalid.xhtml" in
2
+
let test_file = "validator/tests/xhtml/elements/menu/menu-containing-text-novalid.xhtml" in
3
3
let ic = open_in test_file in
4
4
let html = really_input_string ic (in_channel_length ic) in
5
5
close_in ic;
···
32
32
print_endline "=== Errors ===";
33
33
List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors;
34
34
print_endline "\n=== Expected ===";
35
-
print_endline "Element \xe2\x80\x9crect\xe2\x80\x9d is missing required attribute \xe2\x80\x9cheight\xe2\x80\x9d."
35
+
print_endline "Text not allowed in element \xe2\x80\x9cmenu\xe2\x80\x9d in this context."