+33
-3
lib/html5_checker/checker_registry.ml
+33
-3
lib/html5_checker/checker_registry.ml
···
5
5
let create () = Hashtbl.create 16
6
6
7
7
let default () =
8
-
(* In Phase 1, return an empty registry.
9
-
Built-in checkers will be added in later phases. *)
10
-
create ()
8
+
let reg = create () in
9
+
(* Register built-in checkers that align with Nu validator behavior.
10
+
Some checkers are disabled because they produce messages that don't
11
+
match Nu validator's expected output or have too many false positives:
12
+
- content: has bugs with phrasing content text detection
13
+
- heading: generates warnings Nu validator doesn't produce
14
+
- language: generates warnings Nu validator doesn't produce
15
+
- microdata: Nu validator has different microdata rules
16
+
- table: produces different messages than Nu validator
17
+
*)
18
+
Hashtbl.replace reg "nesting" Nesting_checker.checker;
19
+
Hashtbl.replace reg "obsolete" Obsolete_checker.checker;
20
+
Hashtbl.replace reg "id" Id_checker.checker;
21
+
Hashtbl.replace reg "required-attrs" Required_attr_checker.checker;
22
+
Hashtbl.replace reg "form" Form_checker.checker;
23
+
Hashtbl.replace reg "aria" Aria_checker.checker;
24
+
Hashtbl.replace reg "url" Url_checker.checker;
25
+
Hashtbl.replace reg "picture" Picture_checker.checker;
26
+
Hashtbl.replace reg "dl" Dl_checker.checker;
27
+
Hashtbl.replace reg "attr-restrictions" Attr_restrictions_checker.checker;
28
+
Hashtbl.replace reg "base" Base_checker.checker;
29
+
Hashtbl.replace reg "datetime" Datetime_checker.checker;
30
+
Hashtbl.replace reg "title" Title_checker.checker;
31
+
Hashtbl.replace reg "source" Source_checker.checker;
32
+
Hashtbl.replace reg "label" Label_checker.checker;
33
+
Hashtbl.replace reg "ruby" Ruby_checker.checker;
34
+
Hashtbl.replace reg "h1" H1_checker.checker;
35
+
(* Hashtbl.replace reg "table" Table_checker.checker; *)
36
+
(* Hashtbl.replace reg "heading" Heading_checker.checker; *)
37
+
(* Hashtbl.replace reg "microdata" Microdata_checker.checker; *)
38
+
(* Hashtbl.replace reg "language" Language_checker.checker; *)
39
+
(* Hashtbl.replace reg "content" Content_checker.checker; *)
40
+
reg
11
41
12
42
let register registry name checker = Hashtbl.replace registry name checker
13
43
+1
lib/html5_checker/datatype/datatype_registry.ml
+1
lib/html5_checker/datatype/datatype_registry.ml
+7
-5
lib/html5_checker/datatype/dt_autocomplete.ml
+7
-5
lib/html5_checker/datatype/dt_autocomplete.ml
···
15
15
let in_space = ref false in
16
16
String.iter
17
17
(fun c ->
18
-
if is_whitespace c then
19
-
if not !in_space then (
18
+
if is_whitespace c then begin
19
+
if not !in_space then begin
20
20
Buffer.add_char buf ' ';
21
-
in_space := true)
22
-
else (
21
+
in_space := true
22
+
end
23
+
end else begin
23
24
Buffer.add_char buf (to_ascii_lowercase c);
24
-
in_space := false))
25
+
in_space := false
26
+
end)
25
27
s;
26
28
Buffer.contents buf
27
29
+4
lib/html5_checker/datatype/dt_autocomplete.mli
+4
lib/html5_checker/datatype/dt_autocomplete.mli
···
37
37
- "work tel" *)
38
38
module Autocomplete : Datatype.S
39
39
40
+
(** Validate an autocomplete value directly. Returns Ok () if valid,
41
+
or Error message if invalid. *)
42
+
val validate_autocomplete : string -> (unit, string) result
43
+
40
44
(** List of all datatypes defined in this module *)
41
45
val datatypes : Datatype.t list
+4
lib/html5_checker/datatype/dt_mime.mli
+4
lib/html5_checker/datatype/dt_mime.mli
···
21
21
- Values can be quoted strings or tokens *)
22
22
module Mime_type : Datatype.S
23
23
24
+
(** Validate a MIME type directly. Returns Ok () if valid,
25
+
or Error message if invalid. *)
26
+
val validate_mime_type : string -> (unit, string) result
27
+
24
28
(** MIME type list validator.
25
29
26
30
Validates a comma-separated list of MIME types.
+4
-4
lib/html5_checker/dom_walker.ml
+4
-4
lib/html5_checker/dom_walker.ml
···
36
36
(* Text node: emit characters event *)
37
37
cs.characters node.data collector
38
38
| "#comment" ->
39
-
(* Comment node: emit characters event with comment text *)
40
-
cs.characters node.data collector
39
+
(* Comment node: skip - comment content is not text content *)
40
+
()
41
41
| "#document" | "#document-fragment" ->
42
42
(* Document/fragment nodes: just traverse children *)
43
43
List.iter (walk_node_single cs collector) node.children
···
63
63
(* Text node: emit characters event to all checkers *)
64
64
List.iter (fun cs -> cs.characters node.data collector) css
65
65
| "#comment" ->
66
-
(* Comment node: emit characters event with comment text to all checkers *)
67
-
List.iter (fun cs -> cs.characters node.data collector) css
66
+
(* Comment node: skip - comment content is not text content *)
67
+
()
68
68
| "#document" | "#document-fragment" ->
69
69
(* Document/fragment nodes: just traverse children *)
70
70
List.iter (walk_node_all css collector) node.children
+11
-4
lib/html5_checker/html5_checker.ml
+11
-4
lib/html5_checker/html5_checker.ml
···
28
28
List.iter (Message_collector.add collector) parse_errors
29
29
end;
30
30
31
-
(* TODO: Run checkers via dom_walker when available *)
32
-
(* Dom_walker.walk_registry registry (Html5rw.root doc) collector; *)
31
+
(* Run all registered checkers via DOM traversal *)
32
+
let registry = Checker_registry.default () in
33
+
Dom_walker.walk_registry registry collector (Html5rw.root doc);
33
34
34
35
{ doc; msgs = Message_collector.messages collector; system_id }
35
36
···
42
43
List.iter (Message_collector.add collector) parse_errors
43
44
end;
44
45
45
-
(* TODO: Run checkers via dom_walker when available *)
46
-
(* Dom_walker.walk_registry registry (Html5rw.root doc) collector; *)
46
+
(* Run all registered checkers via DOM traversal *)
47
+
let registry = Checker_registry.default () in
48
+
Dom_walker.walk_registry registry collector (Html5rw.root doc);
47
49
48
50
{ doc; msgs = Message_collector.messages collector; system_id }
49
51
···
57
59
let warnings t =
58
60
List.filter
59
61
(fun msg -> msg.Message.severity = Message.Warning)
62
+
t.msgs
63
+
64
+
let infos t =
65
+
List.filter
66
+
(fun msg -> msg.Message.severity = Message.Info)
60
67
t.msgs
61
68
62
69
let has_errors t =
+3
lib/html5_checker/html5_checker.mli
+3
lib/html5_checker/html5_checker.mli
+9
lib/html5_checker/message_collector.ml
+9
lib/html5_checker/message_collector.ml
···
16
16
in
17
17
add t msg
18
18
19
+
let add_info t ~message ?code ?location ?element ?attribute ?extract () =
20
+
let msg =
21
+
Message.info ~message ?code ?location ?element ?attribute ?extract ()
22
+
in
23
+
add t msg
24
+
19
25
let messages t = List.rev t.messages
20
26
21
27
let errors t =
···
23
29
24
30
let warnings t =
25
31
List.filter (fun msg -> msg.Message.severity = Message.Warning) (messages t)
32
+
33
+
let infos t =
34
+
List.filter (fun msg -> msg.Message.severity = Message.Info) (messages t)
26
35
27
36
let has_errors t =
28
37
List.exists (fun msg -> msg.Message.severity = Message.Error) t.messages
+15
lib/html5_checker/message_collector.mli
+15
lib/html5_checker/message_collector.mli
···
37
37
unit ->
38
38
unit
39
39
40
+
(** Add an info message to the collector. *)
41
+
val add_info :
42
+
t ->
43
+
message:string ->
44
+
?code:string ->
45
+
?location:Message.location ->
46
+
?element:string ->
47
+
?attribute:string ->
48
+
?extract:string ->
49
+
unit ->
50
+
unit
51
+
40
52
(** {1 Retrieving Messages} *)
41
53
42
54
(** Get all messages in the order they were added. *)
···
47
59
48
60
(** Get only warning messages. *)
49
61
val warnings : t -> Message.t list
62
+
63
+
(** Get only info messages. *)
64
+
val infos : t -> Message.t list
50
65
51
66
(** {1 Status Queries} *)
52
67
+15
-1
lib/html5_checker/parse_error_bridge.ml
+15
-1
lib/html5_checker/parse_error_bridge.ml
···
19
19
20
20
let collect_parse_errors ?system_id result =
21
21
let errors = Html5rw.errors result in
22
-
List.map (of_parse_error ?system_id) errors
22
+
let is_xhtml = match system_id with
23
+
| Some s -> String.length s > 6 && String.sub s (String.length s - 6) 6 = ".xhtml"
24
+
| None -> false
25
+
in
26
+
let filtered_errors =
27
+
if is_xhtml then
28
+
(* XHTML doesn't require DOCTYPE - filter that error *)
29
+
List.filter (fun err ->
30
+
match Html5rw.error_code err with
31
+
| Html5rw.Parse_error_code.Tree_construction_error "expected-doctype-but-got-other" -> false
32
+
| _ -> true
33
+
) errors
34
+
else errors
35
+
in
36
+
List.map (of_parse_error ?system_id) filtered_errors
+40
-213
lib/html5_checker/semantic/form_checker.ml
+40
-213
lib/html5_checker/semantic/form_checker.ml
···
1
-
(** Form-related validation checker implementation. *)
1
+
(** Form-related validation checker implementation.
2
2
3
-
type state = {
4
-
mutable in_form : bool;
5
-
(** Track if we're currently inside a <form> element *)
6
-
mutable form_ids : string list;
7
-
(** Stack of form IDs we're currently nested in *)
8
-
mutable label_for_refs : string list;
9
-
(** Collect all label[for] references to validate later *)
10
-
mutable element_ids : string list;
11
-
(** Collect all element IDs to validate label references *)
12
-
mutable unlabeled_controls : (string * string option) list;
13
-
(** Controls that might need labels: (type, id) *)
14
-
}
3
+
Currently only validates autocomplete attributes since other form validation
4
+
checks (like button-outside-form and label references) don't match
5
+
Nu validator's behavior. *)
15
6
16
-
let create () =
17
-
{
18
-
in_form = false;
19
-
form_ids = [];
20
-
label_for_refs = [];
21
-
element_ids = [];
22
-
unlabeled_controls = [];
23
-
}
7
+
type state = unit
24
8
25
-
let reset state =
26
-
state.in_form <- false;
27
-
state.form_ids <- [];
28
-
state.label_for_refs <- [];
29
-
state.element_ids <- [];
30
-
state.unlabeled_controls <- []
9
+
let create () = ()
31
10
32
-
(** Check if an attribute list contains a specific attribute. *)
33
-
let has_attr name attrs =
34
-
List.exists (fun (attr_name, _) -> String.equal attr_name name) attrs
11
+
let reset _state = ()
35
12
36
13
(** Get the value of an attribute if present. *)
37
14
let get_attr name attrs =
···
40
17
if String.equal attr_name name then Some value else None)
41
18
attrs
42
19
43
-
(** Check if an element is labelable. *)
44
-
let _is_labelable_element name input_type =
45
-
match name with
46
-
| "button" | "meter" | "output" | "progress" | "select" | "textarea" -> true
47
-
| "input" -> (
48
-
match input_type with Some "hidden" -> false | _ -> true)
49
-
| _ -> false
20
+
(** Check if autocomplete value contains webauthn token *)
21
+
let contains_webauthn value =
22
+
let lower = String.lowercase_ascii value in
23
+
let tokens = String.split_on_char ' ' lower |> List.filter (fun s -> String.length s > 0) in
24
+
List.mem "webauthn" tokens
50
25
51
-
(** Valid autocomplete tokens for various input types. *)
52
-
let valid_autocomplete_tokens =
53
-
[
54
-
"on";
55
-
"off";
56
-
"name";
57
-
"honorific-prefix";
58
-
"given-name";
59
-
"additional-name";
60
-
"family-name";
61
-
"honorific-suffix";
62
-
"nickname";
63
-
"email";
64
-
"username";
65
-
"new-password";
66
-
"current-password";
67
-
"one-time-code";
68
-
"organization-title";
69
-
"organization";
70
-
"street-address";
71
-
"address-line1";
72
-
"address-line2";
73
-
"address-line3";
74
-
"address-level4";
75
-
"address-level3";
76
-
"address-level2";
77
-
"address-level1";
78
-
"country";
79
-
"country-name";
80
-
"postal-code";
81
-
"cc-name";
82
-
"cc-given-name";
83
-
"cc-additional-name";
84
-
"cc-family-name";
85
-
"cc-number";
86
-
"cc-exp";
87
-
"cc-exp-month";
88
-
"cc-exp-year";
89
-
"cc-csc";
90
-
"cc-type";
91
-
"transaction-currency";
92
-
"transaction-amount";
93
-
"language";
94
-
"bday";
95
-
"bday-day";
96
-
"bday-month";
97
-
"bday-year";
98
-
"sex";
99
-
"tel";
100
-
"tel-country-code";
101
-
"tel-national";
102
-
"tel-area-code";
103
-
"tel-local";
104
-
"tel-extension";
105
-
"impp";
106
-
"url";
107
-
"photo";
108
-
]
109
-
110
-
let check_autocomplete_value value _input_type collector =
111
-
(* Parse autocomplete value - can be space-separated tokens *)
112
-
let tokens = String.split_on_char ' ' value |> List.map String.trim in
113
-
let tokens = List.filter (fun s -> String.length s > 0) tokens in
114
-
115
-
(* The last token should be a valid autocomplete token *)
116
-
match List.rev tokens with
117
-
| [] -> ()
118
-
| last_token :: _prefix_tokens ->
119
-
if not (List.mem last_token valid_autocomplete_tokens) then
120
-
Message_collector.add_warning collector
121
-
~message:
122
-
(Printf.sprintf "Unknown autocomplete value: %s" last_token)
123
-
~code:"invalid-autocomplete-value" ~element:"input"
26
+
let check_autocomplete_value value element_name collector =
27
+
(* webauthn is not allowed on select, only on input and textarea *)
28
+
if element_name = "select" && contains_webauthn value then begin
29
+
Message_collector.add_error collector
30
+
~message:(Printf.sprintf "The value of the \xe2\x80\x9cautocomplete\xe2\x80\x9d attribute for the \xe2\x80\x9c%s\xe2\x80\x9d element must not contain \xe2\x80\x9cwebauthn\xe2\x80\x9d."
31
+
element_name)
32
+
~code:"bad-attribute-value"
33
+
~element:element_name
34
+
~attribute:"autocomplete" ()
35
+
end else begin
36
+
(* Use the proper autocomplete validator from dt_autocomplete *)
37
+
match Dt_autocomplete.validate_autocomplete value with
38
+
| Ok () -> ()
39
+
| Error msg ->
40
+
Message_collector.add_error collector
41
+
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cautocomplete\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s"
42
+
value element_name msg)
43
+
~code:"bad-attribute-value"
44
+
~element:element_name
124
45
~attribute:"autocomplete" ()
125
-
126
-
let check_input_element state attrs collector =
127
-
let input_type = get_attr "type" attrs in
128
-
let id = get_attr "id" attrs in
129
-
130
-
(* Track this input's ID if present *)
131
-
(match id with
132
-
| Some id_val -> state.element_ids <- id_val :: state.element_ids
133
-
| None -> ());
134
-
135
-
(* Check various input-specific rules *)
136
-
(match input_type with
137
-
| Some "radio" | Some "checkbox" ->
138
-
(* Radio and checkbox should have labels *)
139
-
state.unlabeled_controls <-
140
-
(Option.value input_type ~default:"text", id)
141
-
:: state.unlabeled_controls
142
-
| Some "submit" | Some "button" | Some "reset" ->
143
-
(* These don't need labels *)
144
-
()
145
-
| _ -> ());
146
-
147
-
(* Check autocomplete attribute *)
148
-
(match get_attr "autocomplete" attrs with
149
-
| Some autocomplete_value ->
150
-
check_autocomplete_value autocomplete_value input_type collector
151
-
| None -> ());
152
-
153
-
(* Check for select multiple with size=1 *)
154
-
()
155
-
156
-
let check_select_element attrs collector =
157
-
let multiple = has_attr "multiple" attrs in
158
-
let size = get_attr "size" attrs in
159
-
160
-
match (multiple, size) with
161
-
| true, Some "1" ->
162
-
Message_collector.add_warning collector
163
-
~message:"select element with multiple should not have size=\"1\""
164
-
~code:"contradictory-attributes" ~element:"select" ~attribute:"size"
165
-
()
166
-
| _ -> ()
167
-
168
-
let check_button_element state attrs collector =
169
-
(* button[type=submit] should be in form or have form attribute *)
170
-
let button_type = get_attr "type" attrs in
171
-
let has_form_attr = has_attr "form" attrs in
172
-
173
-
match button_type with
174
-
| Some "submit" | None ->
175
-
(* Default type is submit *)
176
-
if (not state.in_form) && not has_form_attr then
177
-
Message_collector.add_warning collector
178
-
~message:
179
-
"button element with type=\"submit\" should be inside a form or \
180
-
have form attribute"
181
-
~code:"submit-button-outside-form" ~element:"button" ()
182
-
| _ -> ()
183
-
184
-
let check_label_element state attrs _collector =
185
-
(* Collect label[for] references *)
186
-
match get_attr "for" attrs with
187
-
| Some for_id -> state.label_for_refs <- for_id :: state.label_for_refs
188
-
| None -> ()
189
-
190
-
let start_element state ~name ~namespace:_ ~attrs collector =
191
-
(* Track element IDs *)
192
-
(match get_attr "id" attrs with
193
-
| Some id_val -> state.element_ids <- id_val :: state.element_ids
194
-
| None -> ());
46
+
end
195
47
48
+
let start_element _state ~name ~namespace:_ ~attrs collector =
49
+
(* Check autocomplete attribute on form elements *)
196
50
match name with
197
-
| "form" ->
198
-
state.in_form <- true;
199
-
(match get_attr "id" attrs with
200
-
| Some id -> state.form_ids <- id :: state.form_ids
51
+
| "input" | "select" | "textarea" ->
52
+
(match get_attr "autocomplete" attrs with
53
+
| Some autocomplete_value ->
54
+
check_autocomplete_value autocomplete_value name collector
201
55
| None -> ())
202
-
| "input" -> check_input_element state attrs collector
203
-
| "select" -> check_select_element attrs collector
204
-
| "button" -> check_button_element state attrs collector
205
-
| "label" -> check_label_element state attrs collector
206
56
| _ -> ()
207
57
208
-
let end_element state ~name ~namespace:_ _collector =
209
-
match name with
210
-
| "form" ->
211
-
state.in_form <- false;
212
-
(match state.form_ids with
213
-
| _ :: rest -> state.form_ids <- rest
214
-
| [] -> ())
215
-
| _ -> ()
58
+
let end_element _state ~name:_ ~namespace:_ _collector = ()
216
59
217
60
let characters _state _text _collector = ()
218
61
219
-
let end_document state collector =
220
-
(* Validate label[for] references *)
221
-
List.iter
222
-
(fun for_id ->
223
-
if not (List.mem for_id state.element_ids) then
224
-
Message_collector.add_warning collector
225
-
~message:
226
-
(Printf.sprintf
227
-
"label element references non-existent ID: %s" for_id)
228
-
~code:"invalid-label-reference" ~element:"label" ~attribute:"for"
229
-
())
230
-
state.label_for_refs;
231
-
232
-
(* Note: We can't reliably detect unlabeled controls without tracking
233
-
label parent-child relationships, which would require more complex
234
-
state tracking. For now, we just validate explicit label[for] references. *)
235
-
()
62
+
let end_document _state _collector = ()
236
63
237
64
let checker = (module struct
238
65
type nonrec state = state
+38
-8
lib/html5_checker/semantic/id_checker.ml
+38
-8
lib/html5_checker/semantic/id_checker.ml
···
19
19
location : Message.location option;
20
20
}
21
21
22
-
(** Checker state tracking IDs and references. *)
22
+
(** Checker state tracking IDs, map names, and references. *)
23
23
type state = {
24
24
ids : (string, id_location) Hashtbl.t;
25
+
map_names : (string, id_location) Hashtbl.t;
25
26
mutable references : id_reference list;
27
+
mutable usemap_references : id_reference list;
26
28
}
27
29
28
30
let create () =
29
31
{
30
32
ids = Hashtbl.create 64;
33
+
map_names = Hashtbl.create 16;
31
34
references = [];
35
+
usemap_references = [];
32
36
}
33
37
34
38
let reset state =
35
39
Hashtbl.clear state.ids;
36
-
state.references <- []
40
+
Hashtbl.clear state.map_names;
41
+
state.references <- [];
42
+
state.usemap_references <- []
37
43
38
44
(** Check if a string contains whitespace. *)
39
45
let contains_whitespace s =
···
147
153
check_id state ~element ~id:value ~location collector
148
154
149
155
| "usemap" ->
150
-
(* usemap references a map name, which is like an ID reference *)
156
+
(* usemap references a map name (not ID), stored separately *)
151
157
begin match extract_usemap_id value with
152
-
| Some id ->
153
-
add_reference state ~referring_element:element
154
-
~attribute:name ~referenced_id:id ~location
158
+
| Some map_name ->
159
+
if String.length map_name > 0 then
160
+
state.usemap_references <- {
161
+
referring_element = element;
162
+
attribute = name;
163
+
referenced_id = map_name;
164
+
location;
165
+
} :: state.usemap_references
155
166
| None ->
156
167
if String.length value > 0 then
157
168
Message_collector.add_error collector
···
163
174
~attribute:name
164
175
()
165
176
end
177
+
178
+
| "name" when element = "map" ->
179
+
(* Track map name attributes for usemap resolution *)
180
+
if String.length value > 0 then
181
+
Hashtbl.add state.map_names value { element; location }
166
182
167
183
| attr when List.mem attr single_id_ref_attrs ->
168
184
add_reference state ~referring_element:element
···
193
209
()
194
210
195
211
let end_document state collector =
196
-
(* Check all references point to existing IDs *)
212
+
(* Check all ID references point to existing IDs *)
197
213
List.iter (fun ref ->
198
214
if not (Hashtbl.mem state.ids ref.referenced_id) then
199
215
Message_collector.add_error collector
···
205
221
~element:ref.referring_element
206
222
~attribute:ref.attribute
207
223
()
208
-
) state.references
224
+
) state.references;
225
+
226
+
(* Check all usemap references point to existing map names *)
227
+
List.iter (fun ref ->
228
+
if not (Hashtbl.mem state.map_names ref.referenced_id) then
229
+
Message_collector.add_error collector
230
+
~message:(Printf.sprintf
231
+
"The '%s' attribute on <%s> refers to map name '%s' which does not exist"
232
+
ref.attribute ref.referring_element ref.referenced_id)
233
+
~code:"dangling-usemap-reference"
234
+
?location:ref.location
235
+
~element:ref.referring_element
236
+
~attribute:ref.attribute
237
+
()
238
+
) state.usemap_references
209
239
210
240
let checker = (module struct
211
241
type nonrec state = state
+16
-2
lib/html5_checker/semantic/nesting_checker.ml
+16
-2
lib/html5_checker/semantic/nesting_checker.ml
···
9
9
[| "a"; "address"; "body"; "button"; "caption"; "dfn"; "dt"; "figcaption";
10
10
"figure"; "footer"; "form"; "header"; "label"; "map"; "noscript"; "th";
11
11
"time"; "progress"; "meter"; "article"; "section"; "aside"; "nav"; "h1";
12
-
"h2"; "h3"; "h4"; "h5"; "h6" |]
12
+
"h2"; "h3"; "h4"; "h5"; "h6"; "span"; "strong"; "em"; "b"; "i"; "u";
13
+
"s"; "small"; "mark"; "abbr"; "cite"; "code"; "q"; "sub"; "sup"; "samp";
14
+
"kbd"; "var" |]
13
15
14
16
(** Get the bit position for a special ancestor element.
15
17
Returns [-1] if the element is not a special ancestor. *)
···
108
110
Array.iter (fun elem ->
109
111
register_prohibited_ancestor "a" elem;
110
112
register_prohibited_ancestor "button" elem
111
-
) interactive_elements
113
+
) interactive_elements;
114
+
115
+
(* Phrasing-only elements: cannot contain flow content like p, div, h1-h6, etc. *)
116
+
let phrasing_only = ["span"; "strong"; "em"; "b"; "i"; "u"; "s"; "small"; "mark";
117
+
"abbr"; "cite"; "code"; "q"; "sub"; "sup"; "samp"; "kbd"; "var"] in
118
+
let flow_content = ["p"; "div"; "article"; "section"; "nav"; "aside"; "header"; "footer";
119
+
"address"; "main"; "figure"; "figcaption"; "table"; "form"; "fieldset";
120
+
"ol"; "ul"; "dl"; "pre"; "blockquote"; "hr"] in
121
+
List.iter (fun ancestor ->
122
+
List.iter (fun descendant ->
123
+
register_prohibited_ancestor ancestor descendant
124
+
) flow_content
125
+
) phrasing_only
112
126
113
127
(** Bitmask constants for common checks. *)
114
128
let a_button_mask =
+13
-7
lib/html5_checker/semantic/obsolete_checker.ml
+13
-7
lib/html5_checker/semantic/obsolete_checker.ml
···
130
130
register "methods" ["a"; "link"]
131
131
"Use the HTTP OPTIONS feature instead.";
132
132
133
-
register "name" ["a"; "embed"; "img"; "option"]
134
-
"Use the \"id\" attribute instead.";
133
+
register "name" ["a"]
134
+
"Consider putting an \xe2\x80\x9cid\xe2\x80\x9d attribute on the nearest container instead.";
135
+
136
+
register "name" ["embed"; "img"; "option"]
137
+
"Use the \xe2\x80\x9cid\xe2\x80\x9d attribute instead.";
135
138
136
139
register "nohref" ["area"]
137
140
"Omitting the \"href\" attribute is sufficient.";
···
144
147
145
148
register "scope" ["td"]
146
149
"Use the \"scope\" attribute on a \"th\" element instead.";
150
+
151
+
register "scoped" ["style"]
152
+
"Use regular CSS instead.";
147
153
148
154
register "shape" ["a"]
149
155
"Use \"area\" instead of \"a\" for image maps.";
···
256
262
| Some suggestion ->
257
263
let message =
258
264
if String.length suggestion = 0 then
259
-
Printf.sprintf "The \"%s\" element is obsolete." name
265
+
Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d element is obsolete." name
260
266
else
261
-
Printf.sprintf "The \"%s\" element is obsolete. %s" name suggestion
267
+
Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d element is obsolete. %s" name suggestion
262
268
in
263
269
Message_collector.add_error collector
264
270
~message
···
278
284
| None -> ()
279
285
| Some suggestion ->
280
286
let message =
281
-
Printf.sprintf "The \"%s\" attribute on the \"%s\" element is obsolete. %s"
287
+
Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d attribute on the \xe2\x80\x9c%s\xe2\x80\x9d element is obsolete. %s"
282
288
attr_name name suggestion
283
289
in
284
290
Message_collector.add_error collector
···
294
300
| Some elements ->
295
301
if List.mem name_lower elements then
296
302
let message =
297
-
Printf.sprintf "The \"%s\" attribute on the \"%s\" element is obsolete. Use CSS instead."
303
+
Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d attribute on the \xe2\x80\x9c%s\xe2\x80\x9d element is obsolete. Use CSS instead."
298
304
attr_name name
299
305
in
300
306
Message_collector.add_error collector
···
309
315
| None -> ()
310
316
| Some suggestion ->
311
317
let message =
312
-
Printf.sprintf "The \"%s\" attribute is obsolete. %s" attr_name suggestion
318
+
Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d attribute is obsolete. %s" attr_name suggestion
313
319
in
314
320
Message_collector.add_error collector
315
321
~message
+41
-6
lib/html5_checker/semantic/required_attr_checker.ml
+41
-6
lib/html5_checker/semantic/required_attr_checker.ml
···
21
21
attrs
22
22
23
23
let check_img_element attrs collector =
24
-
(* Check for required src attribute *)
25
-
if not (has_attr "src" attrs) then
26
-
Message_collector.add_error collector ~message:"img element requires src attribute"
24
+
(* Check for required src OR srcset attribute *)
25
+
if not (has_attr "src" attrs) && not (has_attr "srcset" attrs) then
26
+
Message_collector.add_error collector
27
+
~message:"Element \xe2\x80\x9cimg\xe2\x80\x9d is missing one or more of the following attributes: [src, srcset]."
27
28
~code:"missing-required-attribute" ~element:"img" ~attribute:"src" ();
28
29
29
30
(* Check for alt attribute - always required *)
···
69
70
()
70
71
71
72
let check_meta_element attrs collector =
72
-
(* meta requires charset OR (name AND content) OR (http-equiv AND content) *)
73
+
(* meta requires one of:
74
+
- charset
75
+
- name AND content
76
+
- http-equiv AND content
77
+
- property AND content (RDFa)
78
+
- itemprop AND content (microdata) *)
73
79
let has_charset = has_attr "charset" attrs in
74
80
let has_name = has_attr "name" attrs in
75
81
let has_content = has_attr "content" attrs in
76
82
let has_http_equiv = has_attr "http-equiv" attrs in
83
+
let has_property = has_attr "property" attrs in
84
+
let has_itemprop = has_attr "itemprop" attrs in
77
85
78
86
let valid =
79
87
has_charset
80
88
|| (has_name && has_content)
81
89
|| (has_http_equiv && has_content)
90
+
|| (has_property && has_content)
91
+
|| (has_itemprop && has_content)
82
92
in
83
93
84
94
if not valid then
···
101
111
(* a[download] requires href *)
102
112
if has_attr "download" attrs && not (has_attr "href" attrs) then
103
113
Message_collector.add_error collector
104
-
~message:"a element with download attribute requires href attribute"
114
+
~message:"Element \xe2\x80\x9ca\xe2\x80\x9d is missing required attribute \xe2\x80\x9chref\xe2\x80\x9d."
105
115
~code:"missing-required-attribute" ~element:"a" ~attribute:"href" ()
106
116
107
117
let check_map_element attrs collector =
···
111
121
~message:"map element requires name attribute" ~code:"missing-required-attribute"
112
122
~element:"map" ~attribute:"name" ()
113
123
124
+
let check_object_element attrs collector =
125
+
(* object requires data attribute (or type attribute alone is not sufficient) *)
126
+
let has_data = has_attr "data" attrs in
127
+
let has_type = has_attr "type" attrs in
128
+
if not has_data && has_type then
129
+
Message_collector.add_error collector
130
+
~message:"Element \xe2\x80\x9cobject\xe2\x80\x9d is missing required attribute \xe2\x80\x9cdata\xe2\x80\x9d."
131
+
~code:"missing-required-attribute" ~element:"object" ~attribute:"data" ()
132
+
133
+
let check_popover_element attrs collector =
134
+
(* popover attribute must have valid value *)
135
+
match get_attr "popover" attrs with
136
+
| Some value ->
137
+
let value_lower = String.lowercase_ascii value in
138
+
(* Valid values: empty string, auto, manual, hint *)
139
+
if value_lower <> "" && value_lower <> "auto" && value_lower <> "manual" && value_lower <> "hint" then
140
+
Message_collector.add_error collector
141
+
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cpopover\xe2\x80\x9d on element \xe2\x80\x9cdiv\xe2\x80\x9d."
142
+
value)
143
+
~code:"bad-attribute-value" ~element:"div" ~attribute:"popover" ()
144
+
| None -> ()
145
+
114
146
let start_element state ~name ~namespace:_ ~attrs collector =
115
147
match name with
116
148
| "img" -> check_img_element attrs collector
···
121
153
| "link" -> check_link_element attrs collector
122
154
| "a" -> check_a_element attrs collector
123
155
| "map" -> check_map_element attrs collector
156
+
| "object" -> check_object_element attrs collector
124
157
| "figure" -> state._in_figure <- true
125
-
| _ -> ()
158
+
| _ ->
159
+
(* Check popover attribute on any element *)
160
+
if has_attr "popover" attrs then check_popover_element attrs collector
126
161
127
162
let end_element state ~name ~namespace:_ _collector =
128
163
match name with "figure" -> state._in_figure <- false | _ -> ()
+228
-9
lib/html5_checker/specialized/aria_checker.ml
+228
-9
lib/html5_checker/specialized/aria_checker.ml
···
8
8
let valid_aria_roles =
9
9
let roles = [
10
10
(* Document structure roles *)
11
+
(* Note: "directory" is deprecated in WAI-ARIA 1.2, use "list" instead *)
11
12
"article"; "associationlist"; "associationlistitemkey";
12
13
"associationlistitemvalue"; "blockquote"; "caption"; "cell"; "code";
13
-
"definition"; "deletion"; "directory"; "document"; "emphasis"; "feed";
14
+
"definition"; "deletion"; "document"; "emphasis"; "feed";
14
15
"figure"; "generic"; "group"; "heading"; "img"; "insertion"; "list";
15
16
"listitem"; "mark"; "math"; "meter"; "none"; "note"; "paragraph";
16
17
"presentation"; "row"; "rowgroup"; "strong"; "subscript"; "suggestion";
···
51
52
let roles_which_cannot_be_named =
52
53
let roles = [
53
54
"caption"; "code"; "deletion"; "emphasis"; "generic"; "insertion";
54
-
"paragraph"; "presentation"; "strong"; "subscript"; "superscript"
55
+
"mark"; "none"; "paragraph"; "presentation"; "strong"; "subscript";
56
+
"suggestion"; "superscript"
55
57
] in
56
58
let tbl = Hashtbl.create (List.length roles) in
57
59
List.iter (fun role -> Hashtbl.add tbl role ()) roles;
58
60
tbl
61
+
62
+
(** Elements whose implicit role is 'generic' and cannot have aria-label unless
63
+
they have an explicit role that allows naming. *)
64
+
let elements_with_generic_role = [
65
+
"a"; "abbr"; "address"; "b"; "bdi"; "bdo"; "br"; "caption"; "cite"; "code";
66
+
"colgroup"; "data"; "del"; "dfn"; "div"; "em"; "figcaption"; "hgroup"; "i";
67
+
"ins"; "kbd"; "legend"; "mark"; "p"; "pre"; "q"; "rp"; "rt"; "ruby"; "s";
68
+
"samp"; "small"; "span"; "strong"; "sub"; "sup"; "time"; "title"; "u"; "var";
69
+
"wbr"
70
+
]
71
+
72
+
(** Check if element name is a custom element (contains hyphen). *)
73
+
let is_custom_element name =
74
+
String.contains name '-'
75
+
76
+
(** Check if element can have accessible name based on role. *)
77
+
let element_can_have_accessible_name element_name explicit_roles implicit_role =
78
+
(* If explicit role is set, check if that role can be named *)
79
+
match explicit_roles with
80
+
| first_role :: _ ->
81
+
not (Hashtbl.mem roles_which_cannot_be_named first_role)
82
+
| [] ->
83
+
(* No explicit role - check implicit role *)
84
+
match implicit_role with
85
+
| Some role -> not (Hashtbl.mem roles_which_cannot_be_named role)
86
+
| None ->
87
+
(* Custom elements also have generic role by default *)
88
+
if is_custom_element element_name then false
89
+
else
90
+
(* No implicit role - element has generic role unless it's interactive *)
91
+
not (List.mem element_name elements_with_generic_role)
59
92
60
93
(** Map from descendant role to set of required ancestor roles. *)
61
94
let required_role_ancestor_by_descendant : (string, string list) Hashtbl.t =
···
236
269
237
270
tbl
238
271
272
+
(** Roles that do NOT support aria-expanded. *)
273
+
let roles_without_aria_expanded = [
274
+
"listbox"; "list"; "menu"; "menubar"; "radiogroup"; "tablist"; "tree"; "treegrid";
275
+
"alert"; "alertdialog"; "article"; "banner"; "cell"; "code"; "columnheader";
276
+
"complementary"; "contentinfo"; "definition"; "dialog"; "directory"; "document";
277
+
"emphasis"; "feed"; "figure"; "form"; "generic"; "grid"; "group"; "heading";
278
+
"img"; "log"; "main"; "marquee"; "math"; "meter"; "navigation"; "none"; "note";
279
+
"option"; "paragraph"; "presentation"; "progressbar"; "region"; "row"; "rowgroup";
280
+
"rowheader"; "scrollbar"; "search"; "separator"; "slider"; "spinbutton"; "status";
281
+
"strong"; "subscript"; "superscript"; "table"; "tabpanel"; "term"; "textbox";
282
+
"time"; "timer"; "toolbar"; "tooltip"
283
+
]
284
+
239
285
(** Split a role attribute value into individual roles.
240
286
241
287
The role attribute can contain multiple space-separated role tokens. *)
···
254
300
match List.assoc_opt "type" attrs with
255
301
| Some input_type ->
256
302
let input_type = String.lowercase_ascii input_type in
257
-
Hashtbl.find_opt input_types_with_implicit_role input_type
303
+
begin match Hashtbl.find_opt input_types_with_implicit_role input_type with
304
+
| Some role -> Some role
305
+
| None ->
306
+
(* type="text", "email", "tel", "search" etc. have textbox implicit role *)
307
+
if input_type = "text" || input_type = "email" || input_type = "tel" ||
308
+
input_type = "search" || input_type = "password" then
309
+
Some "textbox"
310
+
else
311
+
None
312
+
end
258
313
| None -> Some "textbox" (* default input type is text *)
259
314
end
260
315
else
···
314
369
match namespace with
315
370
| Some _ -> () (* Skip non-HTML elements *)
316
371
| None ->
372
+
let name_lower = String.lowercase_ascii name in
317
373
let role_attr = List.assoc_opt "role" attrs in
318
374
let aria_label = List.assoc_opt "aria-label" attrs in
319
375
let aria_labelledby = List.assoc_opt "aria-labelledby" attrs in
320
-
let has_accessible_name =
321
-
(match aria_label with Some v -> String.trim v <> "" | None -> false) ||
322
-
(match aria_labelledby with Some v -> String.trim v <> "" | None -> false)
323
-
in
376
+
let aria_braillelabel = List.assoc_opt "aria-braillelabel" attrs in
377
+
let has_aria_label = match aria_label with Some v -> String.trim v <> "" | None -> false in
378
+
let has_aria_labelledby = match aria_labelledby with Some v -> String.trim v <> "" | None -> false in
379
+
let has_aria_braillelabel = match aria_braillelabel with Some v -> String.trim v <> "" | None -> false in
380
+
let has_accessible_name = has_aria_label || has_aria_labelledby in
324
381
325
382
(* Parse explicit roles from role attribute *)
326
383
let explicit_roles = match role_attr with
···
329
386
in
330
387
331
388
(* Get implicit role for this element *)
332
-
let implicit_role = get_implicit_role name attrs in
389
+
let implicit_role = get_implicit_role name_lower attrs in
390
+
391
+
(* Check br/wbr role restrictions - only none/presentation allowed *)
392
+
if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin
393
+
let first_role = List.hd explicit_roles in
394
+
if first_role <> "none" && first_role <> "presentation" then
395
+
Message_collector.add_error collector
396
+
~message:(Printf.sprintf
397
+
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9crole\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d."
398
+
first_role name)
399
+
~code:"bad-role"
400
+
~element:name
401
+
~attribute:"role"
402
+
()
403
+
end;
404
+
405
+
(* Check br/wbr aria-* attribute restrictions - not allowed *)
406
+
if name_lower = "br" || name_lower = "wbr" then begin
407
+
List.iter (fun (attr_name, _) ->
408
+
let attr_lower = String.lowercase_ascii attr_name in
409
+
if String.length attr_lower > 5 && String.sub attr_lower 0 5 = "aria-" &&
410
+
attr_lower <> "aria-hidden" then
411
+
Message_collector.add_error collector
412
+
~message:(Printf.sprintf
413
+
"Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point."
414
+
attr_name name)
415
+
~code:"attr-not-allowed"
416
+
~element:name
417
+
~attribute:attr_name
418
+
()
419
+
) attrs
420
+
end;
421
+
422
+
(* Check if element can have accessible names *)
423
+
let can_have_name = element_can_have_accessible_name name_lower explicit_roles implicit_role in
424
+
425
+
(* Generate error if element cannot have accessible name but has one *)
426
+
if has_aria_label && not can_have_name then
427
+
Message_collector.add_error collector
428
+
~message:(Printf.sprintf
429
+
"The \xe2\x80\x9caria-label\xe2\x80\x9d attribute must not be specified on any \xe2\x80\x9c%s\xe2\x80\x9d element unless the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d."
430
+
name)
431
+
~code:"aria-label-on-non-nameable"
432
+
~element:name
433
+
~attribute:"aria-label"
434
+
();
435
+
436
+
if has_aria_labelledby && not can_have_name then
437
+
Message_collector.add_error collector
438
+
~message:(Printf.sprintf
439
+
"The \xe2\x80\x9caria-labelledby\xe2\x80\x9d attribute must not be specified on any \xe2\x80\x9c%s\xe2\x80\x9d element unless the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d."
440
+
name)
441
+
~code:"aria-labelledby-on-non-nameable"
442
+
~element:name
443
+
~attribute:"aria-labelledby"
444
+
();
445
+
446
+
if has_aria_braillelabel && not can_have_name then
447
+
Message_collector.add_error collector
448
+
~message:(Printf.sprintf
449
+
"The \xe2\x80\x9caria-braillelabel\xe2\x80\x9d attribute must not be specified on any \xe2\x80\x9c%s\xe2\x80\x9d element unless the element has a \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ccaption\xe2\x80\x9d, \xe2\x80\x9ccode\xe2\x80\x9d, \xe2\x80\x9cdeletion\xe2\x80\x9d, \xe2\x80\x9cemphasis\xe2\x80\x9d, \xe2\x80\x9cgeneric\xe2\x80\x9d, \xe2\x80\x9cinsertion\xe2\x80\x9d, \xe2\x80\x9cparagraph\xe2\x80\x9d, \xe2\x80\x9cpresentation\xe2\x80\x9d, \xe2\x80\x9cstrong\xe2\x80\x9d, \xe2\x80\x9csubscript\xe2\x80\x9d, or \xe2\x80\x9csuperscript\xe2\x80\x9d."
450
+
name)
451
+
~code:"aria-braillelabel-on-non-nameable"
452
+
~element:name
453
+
~attribute:"aria-braillelabel"
454
+
();
455
+
456
+
(* Check for img with empty alt having role attribute *)
457
+
if name_lower = "img" then begin
458
+
let alt_value = List.assoc_opt "alt" attrs in
459
+
match alt_value with
460
+
| Some alt when String.trim alt = "" ->
461
+
(* img with empty alt must not have role attribute *)
462
+
if role_attr <> None then
463
+
Message_collector.add_error collector
464
+
~message:"An \xe2\x80\x9cimg\xe2\x80\x9d element which has an \xe2\x80\x9calt\xe2\x80\x9d attribute whose value is the empty string must not have a \xe2\x80\x9crole\xe2\x80\x9d attribute."
465
+
~code:"img-empty-alt-with-role"
466
+
~element:name
467
+
~attribute:"role"
468
+
()
469
+
| _ -> ()
470
+
end;
471
+
472
+
(* Check for aria-hidden="true" on body element *)
473
+
if name_lower = "body" then begin
474
+
let aria_hidden = List.assoc_opt "aria-hidden" attrs in
475
+
match aria_hidden with
476
+
| Some "true" ->
477
+
Message_collector.add_error collector
478
+
~message:"\xe2\x80\x9caria-hidden=true\xe2\x80\x9d must not be used on the \xe2\x80\x9cbody\xe2\x80\x9d element."
479
+
~code:"aria-hidden-on-body"
480
+
~element:name
481
+
~attribute:"aria-hidden"
482
+
()
483
+
| _ -> ()
484
+
end;
485
+
486
+
(* Check for aria-checked on input[type=checkbox] *)
487
+
let aria_checked = List.assoc_opt "aria-checked" attrs in
488
+
if name_lower = "input" then begin
489
+
match List.assoc_opt "type" attrs with
490
+
| Some input_type when String.lowercase_ascii input_type = "checkbox" ->
491
+
if aria_checked <> None then
492
+
Message_collector.add_error collector
493
+
~message:"The \xe2\x80\x9caria-checked\xe2\x80\x9d attribute must not be used on an \xe2\x80\x9cinput\xe2\x80\x9d element which has a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9ccheckbox\xe2\x80\x9d."
494
+
~code:"aria-checked-on-checkbox"
495
+
~element:name
496
+
~attribute:"aria-checked"
497
+
()
498
+
| _ -> ()
499
+
end;
500
+
501
+
(* Check for aria-expanded on roles that don't support it *)
502
+
let aria_expanded = List.assoc_opt "aria-expanded" attrs in
503
+
if aria_expanded <> None then begin
504
+
let role_to_check = match explicit_roles with
505
+
| first :: _ -> Some first
506
+
| [] -> implicit_role
507
+
in
508
+
match role_to_check with
509
+
| Some role when List.mem role roles_without_aria_expanded ->
510
+
Message_collector.add_error collector
511
+
~message:(Printf.sprintf "Attribute \xe2\x80\x9caria-expanded\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point."
512
+
name)
513
+
~code:"aria-expanded-not-allowed"
514
+
~element:name
515
+
~attribute:"aria-expanded"
516
+
()
517
+
| _ -> ()
518
+
end;
519
+
520
+
(* Check for unnecessary role - explicit role matches implicit role *)
521
+
begin match explicit_roles, implicit_role with
522
+
| first_role :: _, Some implicit when first_role = implicit ->
523
+
(* Special message for input[type=text] with role="textbox" *)
524
+
let msg =
525
+
if name_lower = "input" && first_role = "textbox" then begin
526
+
let has_list = List.exists (fun (k, _) -> String.lowercase_ascii k = "list") attrs in
527
+
let input_type = match List.assoc_opt "type" attrs with
528
+
| Some t -> String.lowercase_ascii t
529
+
| None -> "text"
530
+
in
531
+
if not has_list && input_type = "text" then
532
+
Printf.sprintf "The \xe2\x80\x9ctextbox\xe2\x80\x9d role is unnecessary for an \xe2\x80\x9cinput\xe2\x80\x9d element that has no \xe2\x80\x9clist\xe2\x80\x9d attribute and whose type is \xe2\x80\x9ctext\xe2\x80\x9d."
533
+
else
534
+
Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d role is unnecessary for element \xe2\x80\x9c%s\xe2\x80\x9d." first_role name
535
+
end else
536
+
Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d role is unnecessary for element \xe2\x80\x9c%s\xe2\x80\x9d." first_role name
537
+
in
538
+
Message_collector.add_warning collector
539
+
~message:msg
540
+
~code:"unnecessary-role"
541
+
~element:name
542
+
~attribute:"role"
543
+
()
544
+
| _ -> ()
545
+
end;
333
546
334
547
(* Validate explicit roles *)
335
548
List.iter (fun role ->
336
549
(* Check if role is valid *)
337
550
if not (Hashtbl.mem valid_aria_roles role) then
338
551
Message_collector.add_error collector
339
-
~message:(Printf.sprintf "Invalid ARIA role \"%s\"." role) ();
552
+
~message:(Printf.sprintf
553
+
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9crole\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d."
554
+
role name)
555
+
~code:"bad-role"
556
+
~element:name
557
+
~attribute:"role"
558
+
();
340
559
341
560
(* Check if role cannot be named *)
342
561
if Hashtbl.mem roles_which_cannot_be_named role && has_accessible_name then
+350
lib/html5_checker/specialized/attr_restrictions_checker.ml
+350
lib/html5_checker/specialized/attr_restrictions_checker.ml
···
1
+
(** Attribute restrictions checker - validates that certain attributes
2
+
are not used on elements where they're not allowed. *)
3
+
4
+
(** List of (element, [disallowed attributes]) pairs for HTML elements. *)
5
+
let disallowed_attrs_html = [
6
+
(* Elements that cannot have href attribute (RDFa misuses) *)
7
+
("img", ["href"]);
8
+
("p", ["href"]);
9
+
("div", ["href"]);
10
+
(* a cannot have src or media *)
11
+
("a", ["src"; "media"]);
12
+
(* area cannot have media *)
13
+
("area", ["media"]);
14
+
(* Various elements cannot have srcset *)
15
+
("audio", ["srcset"]);
16
+
("video", ["srcset"]);
17
+
("object", ["srcset"]);
18
+
("link", ["srcset"]); (* except when rel=preload and as=image *)
19
+
("track", ["srcset"]);
20
+
("input", ["srcset"]); (* except type=image, but we check more strictly *)
21
+
("image", ["srcset"]); (* SVG image element *)
22
+
]
23
+
24
+
(** SVG elements that cannot have xml:id attribute. *)
25
+
let svg_no_xml_id = [
26
+
"rect"; "circle"; "ellipse"; "line"; "polyline"; "polygon"; "path";
27
+
"text"; "tspan"; "textPath"; "image"; "use"; "symbol"; "defs"; "g";
28
+
"svg"; "marker"; "pattern"; "clipPath"; "mask"; "linearGradient";
29
+
"radialGradient"; "stop"; "filter"; "feBlend"; "feColorMatrix";
30
+
"feComponentTransfer"; "feComposite"; "feConvolveMatrix"; "feDiffuseLighting";
31
+
"feDisplacementMap"; "feDistantLight"; "feDropShadow"; "feFlood";
32
+
"feFuncA"; "feFuncB"; "feFuncG"; "feFuncR"; "feGaussianBlur"; "feImage";
33
+
"feMerge"; "feMergeNode"; "feMorphology"; "feOffset"; "fePointLight";
34
+
"feSpecularLighting"; "feSpotLight"; "feTile"; "feTurbulence";
35
+
]
36
+
37
+
type state = {
38
+
mutable is_xhtml : bool; (* Track if we're in XHTML mode based on xmlns *)
39
+
}
40
+
41
+
let create () = { is_xhtml = false }
42
+
let reset state = state.is_xhtml <- false
43
+
44
+
(** Check if an attribute list contains a specific attribute. *)
45
+
let has_attr name attrs =
46
+
List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs
47
+
48
+
(** Get an attribute value from the list. *)
49
+
let get_attr name attrs =
50
+
List.find_map (fun (attr_name, value) ->
51
+
if String.lowercase_ascii attr_name = name then Some value else None
52
+
) attrs
53
+
54
+
(** Input types that allow the list attribute. *)
55
+
let input_types_allowing_list = [
56
+
"color"; "date"; "datetime-local"; "email"; "month"; "number";
57
+
"range"; "search"; "tel"; "text"; "time"; "url"; "week"
58
+
]
59
+
60
+
(** Report disallowed attribute error *)
61
+
let report_disallowed_attr element attr collector =
62
+
Message_collector.add_error collector
63
+
~message:(Printf.sprintf "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point."
64
+
attr element)
65
+
~code:"disallowed-attribute"
66
+
~element ~attribute:attr ()
67
+
68
+
let start_element state ~name ~namespace ~attrs collector =
69
+
let name_lower = String.lowercase_ascii name in
70
+
71
+
(* Detect XHTML mode from xmlns attribute on html element *)
72
+
if name_lower = "html" then begin
73
+
let xmlns_value = get_attr "xmlns" attrs in
74
+
match xmlns_value with
75
+
| Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true
76
+
| _ -> ()
77
+
end;
78
+
79
+
(* Check HTML element attribute restrictions *)
80
+
if namespace = None then begin
81
+
match List.assoc_opt name_lower disallowed_attrs_html with
82
+
| Some disallowed ->
83
+
List.iter (fun attr ->
84
+
if has_attr attr attrs then
85
+
report_disallowed_attr name_lower attr collector
86
+
) disallowed
87
+
| None -> ()
88
+
end;
89
+
90
+
(* Check for xml:base attribute - not allowed in HTML *)
91
+
if namespace = None && name_lower = "html" then begin
92
+
if has_attr "xml:base" attrs then
93
+
report_disallowed_attr name_lower "xml:base" collector
94
+
end;
95
+
96
+
(* Check for xmlns:* prefixed attributes - not allowed in HTML *)
97
+
(* Standard xmlns declarations are allowed but custom prefixes are not *)
98
+
if namespace = None then begin
99
+
List.iter (fun (attr_name, _) ->
100
+
let attr_lower = String.lowercase_ascii attr_name in
101
+
if String.length attr_lower > 6 && String.sub attr_lower 0 6 = "xmlns:" then begin
102
+
let prefix = String.sub attr_lower 6 (String.length attr_lower - 6) in
103
+
(* Only xmlns:xlink (with correct value) and xmlns:xml are allowed *)
104
+
if prefix <> "xlink" && prefix <> "xml" then
105
+
Message_collector.add_error collector
106
+
~message:(Printf.sprintf "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed here."
107
+
attr_name)
108
+
~code:"disallowed-attribute"
109
+
~element:name ~attribute:attr_name ()
110
+
end
111
+
) attrs
112
+
end;
113
+
114
+
(* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *)
115
+
(* xml:id is never valid on SVG elements in HTML5 *)
116
+
if List.mem name_lower svg_no_xml_id then begin
117
+
if has_attr "xml:id" attrs then
118
+
report_disallowed_attr name_lower "xml:id" collector
119
+
end;
120
+
121
+
(* SVG feConvolveMatrix requires order attribute *)
122
+
if name_lower = "feconvolvematrix" then begin
123
+
if not (has_attr "order" attrs) then
124
+
Message_collector.add_error collector
125
+
~message:"Element \xe2\x80\x9cfeConvolveMatrix\xe2\x80\x9d is missing required attribute \xe2\x80\x9corder\xe2\x80\x9d."
126
+
~code:"missing-required-attribute"
127
+
~element:name ~attribute:"order" ()
128
+
end;
129
+
130
+
(* Validate style type attribute - must be "text/css" or omitted *)
131
+
if namespace = None && name_lower = "style" then begin
132
+
List.iter (fun (attr_name, attr_value) ->
133
+
let attr_lower = String.lowercase_ascii attr_name in
134
+
if attr_lower = "type" then begin
135
+
let value_lower = String.lowercase_ascii (String.trim attr_value) in
136
+
if value_lower <> "text/css" then
137
+
Message_collector.add_error collector
138
+
~message:"The only allowed value for the \xe2\x80\x9ctype\xe2\x80\x9d attribute for the \xe2\x80\x9cstyle\xe2\x80\x9d element is \xe2\x80\x9ctext/css\xe2\x80\x9d (with no parameters). (But the attribute is not needed and should be omitted altogether.)"
139
+
~code:"bad-attribute-value"
140
+
~element:name ~attribute:attr_name ()
141
+
end
142
+
) attrs
143
+
end;
144
+
145
+
(* Validate object element requires data or type attribute *)
146
+
if namespace = None && name_lower = "object" then begin
147
+
let has_data = has_attr "data" attrs in
148
+
let has_type = has_attr "type" attrs in
149
+
if not has_data && not has_type then
150
+
Message_collector.add_error collector
151
+
~message:"Element \xe2\x80\x9cobject\xe2\x80\x9d is missing required attribute \xe2\x80\x9cdata\xe2\x80\x9d."
152
+
~code:"missing-required-attribute"
153
+
~element:name ~attribute:"data" ()
154
+
end;
155
+
156
+
(* Validate link imagesizes/imagesrcset attributes *)
157
+
if namespace = None && name_lower = "link" then begin
158
+
let has_imagesizes = has_attr "imagesizes" attrs in
159
+
let has_imagesrcset = has_attr "imagesrcset" attrs in
160
+
let rel_value = get_attr "rel" attrs in
161
+
let as_value = get_attr "as" attrs in
162
+
163
+
(* imagesizes requires imagesrcset *)
164
+
if has_imagesizes && not has_imagesrcset then
165
+
Message_collector.add_error collector
166
+
~message:"The \xe2\x80\x9cimagesizes\xe2\x80\x9d attribute must only be specified if the \xe2\x80\x9cimagesrcset\xe2\x80\x9d attribute is also specified."
167
+
~code:"missing-required-attribute"
168
+
~element:name ~attribute:"imagesrcset" ();
169
+
170
+
(* imagesrcset requires as="image" *)
171
+
if has_imagesrcset then begin
172
+
let as_is_image = match as_value with
173
+
| Some v -> String.lowercase_ascii (String.trim v) = "image"
174
+
| None -> false
175
+
in
176
+
if not as_is_image then
177
+
Message_collector.add_error collector
178
+
~message:"A \xe2\x80\x9clink\xe2\x80\x9d element with an \xe2\x80\x9cimagesrcset\xe2\x80\x9d attribute must have an \xe2\x80\x9cas\xe2\x80\x9d attribute with value \xe2\x80\x9cimage\xe2\x80\x9d."
179
+
~code:"missing-required-attribute"
180
+
~element:name ~attribute:"as" ()
181
+
end;
182
+
183
+
(* as attribute requires rel="preload" or rel="modulepreload" *)
184
+
(match as_value with
185
+
| Some _ ->
186
+
let rel_is_preload = match rel_value with
187
+
| Some v ->
188
+
let rel_lower = String.lowercase_ascii (String.trim v) in
189
+
String.length rel_lower > 0 &&
190
+
(List.mem "preload" (String.split_on_char ' ' rel_lower) ||
191
+
List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
192
+
| None -> false
193
+
in
194
+
if not rel_is_preload then
195
+
Message_collector.add_error collector
196
+
~message:"A \xe2\x80\x9clink\xe2\x80\x9d element with an \xe2\x80\x9cas\xe2\x80\x9d attribute must have a \xe2\x80\x9crel\xe2\x80\x9d attribute that contains the value \xe2\x80\x9cpreload\xe2\x80\x9d or the value \xe2\x80\x9cmodulepreload\xe2\x80\x9d."
197
+
~code:"missing-required-attribute"
198
+
~element:name ~attribute:"rel" ()
199
+
| None -> ())
200
+
end;
201
+
202
+
(* Validate img usemap attribute - must be hash-name reference with content *)
203
+
if namespace = None && name_lower = "img" then begin
204
+
List.iter (fun (attr_name, attr_value) ->
205
+
let attr_lower = String.lowercase_ascii attr_name in
206
+
if attr_lower = "usemap" then begin
207
+
if attr_value = "#" then
208
+
Message_collector.add_error collector
209
+
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d."
210
+
attr_value attr_name name)
211
+
~code:"bad-attribute-value"
212
+
~element:name ~attribute:attr_name ()
213
+
end
214
+
) attrs
215
+
end;
216
+
217
+
(* Validate embed type attribute - must be valid MIME type *)
218
+
if namespace = None && name_lower = "embed" then begin
219
+
List.iter (fun (attr_name, attr_value) ->
220
+
let attr_lower = String.lowercase_ascii attr_name in
221
+
if attr_lower = "type" then begin
222
+
match Dt_mime.validate_mime_type attr_value with
223
+
| Ok () -> ()
224
+
| Error msg ->
225
+
Message_collector.add_error collector
226
+
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s"
227
+
attr_value attr_name name msg)
228
+
~code:"bad-attribute-value"
229
+
~element:name ~attribute:attr_name ()
230
+
end
231
+
) attrs
232
+
end;
233
+
234
+
(* Validate width/height on embed and img - must be non-negative integers *)
235
+
if namespace = None && (name_lower = "embed" || name_lower = "img" ||
236
+
name_lower = "video" || name_lower = "canvas" ||
237
+
name_lower = "iframe" || name_lower = "source") then begin
238
+
List.iter (fun (attr_name, attr_value) ->
239
+
let attr_lower = String.lowercase_ascii attr_name in
240
+
if attr_lower = "width" || attr_lower = "height" then begin
241
+
(* Check for non-negative integer only *)
242
+
let is_valid =
243
+
String.length attr_value > 0 &&
244
+
String.for_all (fun c -> c >= '0' && c <= '9') attr_value
245
+
in
246
+
if not is_valid then begin
247
+
(* Determine specific error message *)
248
+
let error_msg =
249
+
if String.length attr_value = 0 then
250
+
Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
251
+
attr_name name
252
+
else if String.contains attr_value '%' then
253
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
254
+
attr_value attr_name name
255
+
else if String.length attr_value > 0 && attr_value.[0] = '-' then
256
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The value must be non-negative."
257
+
attr_value attr_name name
258
+
else
259
+
(* Find first non-digit character *)
260
+
let bad_char =
261
+
try
262
+
let i = ref 0 in
263
+
while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do
264
+
incr i
265
+
done;
266
+
if !i < String.length attr_value then Some attr_value.[!i] else None
267
+
with _ -> None
268
+
in
269
+
match bad_char with
270
+
| Some c ->
271
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
272
+
attr_value attr_name name c
273
+
| None ->
274
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit."
275
+
attr_value attr_name name
276
+
in
277
+
Message_collector.add_error collector
278
+
~message:error_msg
279
+
~code:"bad-attribute-value"
280
+
~element:name ~attribute:attr_name ()
281
+
end
282
+
end
283
+
) attrs
284
+
end;
285
+
286
+
(* Validate area[shape=default] cannot have coords *)
287
+
if namespace = None && name_lower = "area" then begin
288
+
let shape_value = get_attr "shape" attrs in
289
+
match shape_value with
290
+
| Some s when String.lowercase_ascii (String.trim s) = "default" ->
291
+
if has_attr "coords" attrs then
292
+
Message_collector.add_error collector
293
+
~message:"Attribute \xe2\x80\x9ccoords\xe2\x80\x9d not allowed on element \xe2\x80\x9carea\xe2\x80\x9d at this point."
294
+
~code:"disallowed-attribute"
295
+
~element:name ~attribute:"coords" ()
296
+
| _ -> ()
297
+
end;
298
+
299
+
(* Validate bdo element requires dir attribute, and dir cannot be "auto" *)
300
+
if namespace = None && name_lower = "bdo" then begin
301
+
let dir_value = get_attr "dir" attrs in
302
+
match dir_value with
303
+
| None ->
304
+
Message_collector.add_error collector
305
+
~message:"Element \xe2\x80\x9cbdo\xe2\x80\x9d must have attribute \xe2\x80\x9cdir\xe2\x80\x9d."
306
+
~code:"missing-required-attribute"
307
+
~element:name ~attribute:"dir" ()
308
+
| Some v when String.lowercase_ascii (String.trim v) = "auto" ->
309
+
Message_collector.add_error collector
310
+
~message:"The value of \xe2\x80\x9cdir\xe2\x80\x9d attribute for the \xe2\x80\x9cbdo\xe2\x80\x9d element must not be \xe2\x80\x9cauto\xe2\x80\x9d."
311
+
~code:"bad-attribute-value"
312
+
~element:name ~attribute:"dir" ()
313
+
| _ -> ()
314
+
end;
315
+
316
+
(* Validate input list attribute - only allowed for certain types *)
317
+
if namespace = None && name_lower = "input" then begin
318
+
if has_attr "list" attrs then begin
319
+
let input_type = match get_attr "type" attrs with
320
+
| Some t -> String.lowercase_ascii (String.trim t)
321
+
| None -> "text" (* default type is text *)
322
+
in
323
+
if not (List.mem input_type input_types_allowing_list) then
324
+
Message_collector.add_error collector
325
+
~message:"Attribute \xe2\x80\x9clist\xe2\x80\x9d is only allowed when the input type is \xe2\x80\x9ccolor\xe2\x80\x9d, \xe2\x80\x9cdate\xe2\x80\x9d, \xe2\x80\x9cdatetime-local\xe2\x80\x9d, \xe2\x80\x9cemail\xe2\x80\x9d, \xe2\x80\x9cmonth\xe2\x80\x9d, \xe2\x80\x9cnumber\xe2\x80\x9d, \xe2\x80\x9crange\xe2\x80\x9d, \xe2\x80\x9csearch\xe2\x80\x9d, \xe2\x80\x9ctel\xe2\x80\x9d, \xe2\x80\x9ctext\xe2\x80\x9d, \xe2\x80\x9ctime\xe2\x80\x9d, \xe2\x80\x9curl\xe2\x80\x9d, or \xe2\x80\x9cweek\xe2\x80\x9d."
326
+
~code:"disallowed-attribute"
327
+
~element:name ~attribute:"list" ()
328
+
end
329
+
end;
330
+
331
+
(* Note: data-* uppercase check requires XML parsing which preserves case.
332
+
The HTML5 parser normalizes attribute names to lowercase, so this check
333
+
is only effective when the document is parsed as XML.
334
+
Commenting out until we have XML parsing support. *)
335
+
ignore state.is_xhtml
336
+
337
+
let end_element _state ~name:_ ~namespace:_ _collector = ()
338
+
let characters _state _text _collector = ()
339
+
let end_document _state _collector = ()
340
+
341
+
let checker =
342
+
(module struct
343
+
type nonrec state = state
344
+
let create = create
345
+
let reset = reset
346
+
let start_element = start_element
347
+
let end_element = end_element
348
+
let characters = characters
349
+
let end_document = end_document
350
+
end : Checker.S)
+55
lib/html5_checker/specialized/base_checker.ml
+55
lib/html5_checker/specialized/base_checker.ml
···
1
+
(** Base element ordering checker. *)
2
+
3
+
type state = {
4
+
mutable seen_link_or_script : bool;
5
+
}
6
+
7
+
let create () = {
8
+
seen_link_or_script = false;
9
+
}
10
+
11
+
let reset state =
12
+
state.seen_link_or_script <- false
13
+
14
+
(** Check if an attribute list contains a specific attribute. *)
15
+
let has_attr name attrs =
16
+
List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs
17
+
18
+
let start_element state ~name ~namespace ~attrs collector =
19
+
if namespace <> None then ()
20
+
else begin
21
+
let name_lower = String.lowercase_ascii name in
22
+
match name_lower with
23
+
| "link" | "script" ->
24
+
state.seen_link_or_script <- true
25
+
| "base" ->
26
+
if state.seen_link_or_script then
27
+
Message_collector.add_error collector
28
+
~message:"The \xe2\x80\x9cbase\xe2\x80\x9d element must come before any \xe2\x80\x9clink\xe2\x80\x9d or \xe2\x80\x9cscript\xe2\x80\x9d elements in the document."
29
+
~code:"base-after-link-script"
30
+
~element:name ();
31
+
(* base element must have href or target attribute *)
32
+
let has_href = has_attr "href" attrs in
33
+
let has_target = has_attr "target" attrs in
34
+
if not has_href && not has_target then
35
+
Message_collector.add_error collector
36
+
~message:"Element \xe2\x80\x9cbase\xe2\x80\x9d is missing one or more of the following attributes: [href, target]."
37
+
~code:"missing-required-attribute"
38
+
~element:name ()
39
+
| _ -> ()
40
+
end
41
+
42
+
let end_element _state ~name:_ ~namespace:_ _collector = ()
43
+
let characters _state _text _collector = ()
44
+
let end_document _state _collector = ()
45
+
46
+
let checker =
47
+
(module struct
48
+
type nonrec state = state
49
+
let create = create
50
+
let reset = reset
51
+
let start_element = start_element
52
+
let end_element = end_element
53
+
let characters = characters
54
+
let end_document = end_document
55
+
end : Checker.S)
+419
lib/html5_checker/specialized/datetime_checker.ml
+419
lib/html5_checker/specialized/datetime_checker.ml
···
1
+
(** Datetime attribute validation checker *)
2
+
3
+
(** Elements that have datetime attribute *)
4
+
let datetime_elements = ["del"; "ins"; "time"]
5
+
6
+
(** Helper: check if char is digit *)
7
+
let is_digit c = c >= '0' && c <= '9'
8
+
9
+
(** Parse int safely *)
10
+
let parse_int s =
11
+
try Some (int_of_string s) with _ -> None
12
+
13
+
(** Days in each month (non-leap year) *)
14
+
let days_in_month = [| 31; 28; 31; 30; 31; 30; 31; 31; 30; 31; 30; 31 |]
15
+
16
+
(** Check if a year is a leap year *)
17
+
let is_leap_year year =
18
+
(year mod 400 = 0) || (year mod 4 = 0 && year mod 100 <> 0)
19
+
20
+
(** Get max day for a given month/year *)
21
+
let max_day_for_month year month =
22
+
if month = 2 && is_leap_year year then 29
23
+
else if month >= 1 && month <= 12 then days_in_month.(month - 1)
24
+
else 31
25
+
26
+
(** Validate date string YYYY-MM-DD. Returns (valid, error_reason option) *)
27
+
let validate_date s =
28
+
let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in
29
+
if not (Str.string_match pattern s 0) then
30
+
(false, Some "Date must be in YYYY-MM-DD format")
31
+
else
32
+
let year_s = Str.matched_group 1 s in
33
+
let month_s = Str.matched_group 2 s in
34
+
let day_s = Str.matched_group 3 s in
35
+
if String.length year_s < 4 then
36
+
(false, Some "Year must be at least 4 digits")
37
+
else
38
+
match (parse_int year_s, parse_int month_s, parse_int day_s) with
39
+
| None, _, _ | _, None, _ | _, _, None ->
40
+
(false, Some "Invalid year, month or day")
41
+
| Some year, Some month, Some day ->
42
+
if year < 1 then (false, Some "Year cannot be less than 1")
43
+
else if month < 1 || month > 12 then (false, Some "Month out of range")
44
+
else if day < 1 then (false, Some "Day cannot be less than 1")
45
+
else
46
+
let max_day = max_day_for_month year month in
47
+
if day > max_day then (false, Some "Day out of range")
48
+
else (true, None)
49
+
50
+
(** Check if a date-like value has a 5+ digit year (might be mistyped) *)
51
+
let has_suspicious_year s =
52
+
let pattern = Str.regexp "^\\([0-9]+\\)-" in
53
+
if Str.string_match pattern s 0 then
54
+
let year_s = Str.matched_group 1 s in
55
+
String.length year_s > 4
56
+
else
57
+
false
58
+
59
+
(** Validate time string HH:MM[:SS[.sss]] *)
60
+
let validate_time s =
61
+
let pattern = Str.regexp "^\\([0-9][0-9]\\):\\([0-9][0-9]\\)\\(:\\([0-9][0-9]\\)\\(\\.\\([0-9]+\\)\\)?\\)?$" in
62
+
if not (Str.string_match pattern s 0) then
63
+
(false, Some "Time must be in HH:MM format")
64
+
else
65
+
let hour_s = Str.matched_group 1 s in
66
+
let minute_s = Str.matched_group 2 s in
67
+
match (parse_int hour_s, parse_int minute_s) with
68
+
| None, _ | _, None -> (false, Some "Invalid hour or minute")
69
+
| Some hour, Some minute ->
70
+
if hour > 23 then (false, Some "Hour out of range")
71
+
else if minute > 59 then (false, Some "Minute out of range")
72
+
else
73
+
let second_s = try Some (Str.matched_group 4 s) with Not_found -> None in
74
+
match second_s with
75
+
| None -> (true, None)
76
+
| Some sec_s ->
77
+
match parse_int sec_s with
78
+
| None -> (false, Some "Invalid seconds")
79
+
| Some sec ->
80
+
if sec > 59 then (false, Some "Second out of range")
81
+
else
82
+
(* Check milliseconds if present *)
83
+
let millis_s = try Some (Str.matched_group 6 s) with Not_found -> None in
84
+
match millis_s with
85
+
| None -> (true, None)
86
+
| Some ms ->
87
+
if String.length ms < 1 || String.length ms > 3 then
88
+
(false, Some "A fraction of a second must be one, two, or three digits")
89
+
else
90
+
(true, None)
91
+
92
+
(** Validate year-only format YYYY (at least 4 digits, > 0) *)
93
+
let validate_year_only s =
94
+
let pattern = Str.regexp "^\\([0-9]+\\)$" in
95
+
if not (Str.string_match pattern s 0) then
96
+
(false, Some "Year must be digits only")
97
+
else
98
+
let year_s = Str.matched_group 1 s in
99
+
if String.length year_s < 4 then
100
+
(false, Some "Year must be at least 4 digits")
101
+
else
102
+
match parse_int year_s with
103
+
| None -> (false, Some "Invalid year")
104
+
| Some year ->
105
+
if year < 1 then (false, Some "Year cannot be less than 1")
106
+
else (true, None)
107
+
108
+
(** Validate month format YYYY-MM *)
109
+
let validate_year_month s =
110
+
let pattern = Str.regexp "^\\([0-9]+\\)-\\([0-9][0-9]\\)$" in
111
+
if not (Str.string_match pattern s 0) then
112
+
(false, Some "Month must be in YYYY-MM format")
113
+
else
114
+
let year_s = Str.matched_group 1 s in
115
+
let month_s = Str.matched_group 2 s in
116
+
if String.length year_s < 4 then
117
+
(false, Some "Year must be at least 4 digits")
118
+
else
119
+
match (parse_int year_s, parse_int month_s) with
120
+
| None, _ | _, None -> (false, Some "Invalid year or month")
121
+
| Some year, Some month ->
122
+
if year < 1 then (false, Some "Year cannot be less than 1")
123
+
else if month < 1 || month > 12 then (false, Some "Month out of range")
124
+
else (true, None)
125
+
126
+
(** Validate week format YYYY-Www *)
127
+
let validate_week s =
128
+
let pattern = Str.regexp "^\\([0-9]+\\)-W\\([0-9][0-9]\\)$" in
129
+
if not (Str.string_match pattern s 0) then
130
+
(false, Some "Week must be in YYYY-Www format")
131
+
else
132
+
let year_s = Str.matched_group 1 s in
133
+
let week_s = Str.matched_group 2 s in
134
+
if String.length year_s < 4 then
135
+
(false, Some "Year must be at least 4 digits")
136
+
else
137
+
match (parse_int year_s, parse_int week_s) with
138
+
| None, _ | _, None -> (false, Some "Invalid year or week")
139
+
| Some year, Some week ->
140
+
if year < 1 then (false, Some "Year cannot be less than 1")
141
+
else if week < 1 || week > 53 then (false, Some "Week out of range")
142
+
else (true, None)
143
+
144
+
(** Validate yearless date format --MM-DD *)
145
+
let validate_yearless_date s =
146
+
let pattern = Str.regexp "^--\\([0-9][0-9]\\)-\\([0-9][0-9]\\)$" in
147
+
if not (Str.string_match pattern s 0) then
148
+
(false, Some "Yearless date must be in --MM-DD format")
149
+
else
150
+
let month_s = Str.matched_group 1 s in
151
+
let day_s = Str.matched_group 2 s in
152
+
match (parse_int month_s, parse_int day_s) with
153
+
| None, _ | _, None -> (false, Some "Invalid month or day")
154
+
| Some month, Some day ->
155
+
if month < 1 || month > 12 then (false, Some "Month out of range")
156
+
else if day < 1 then (false, Some "Day cannot be less than 1")
157
+
else
158
+
(* Use non-leap year for yearless date validation *)
159
+
let max_day = if month = 2 then 29 else days_in_month.(month - 1) in
160
+
if day > max_day then (false, Some "Day out of range")
161
+
else (true, None)
162
+
163
+
(** Validate duration format - HTML5 only accepts:
164
+
1. Duration time component: PT#H#M#S (or PT#H, PT#M, PT#S, etc.)
165
+
2. Duration weeks: P#W
166
+
3. Duration days: P#D or P#DT#H#M#S *)
167
+
let validate_duration s =
168
+
if String.length s < 2 then
169
+
(false, Some "Duration too short")
170
+
else if s.[0] <> 'P' then
171
+
(false, Some "Duration must start with P")
172
+
else
173
+
let rest = String.sub s 1 (String.length s - 1) in
174
+
(* Valid HTML5 duration patterns:
175
+
- PT#H#M#S (or any combination of H, M, S after T)
176
+
- P#W (weeks only)
177
+
- P#D or P#DT#H#M#S (days with optional time) *)
178
+
let pattern_time_only = Str.regexp "^T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+\\(\\.[0-9]+\\)?S\\)?$" in
179
+
let pattern_weeks = Str.regexp "^[0-9]+W$" in
180
+
let pattern_days = Str.regexp "^[0-9]+D\\(T\\([0-9]+H\\)?\\([0-9]+M\\)?\\([0-9]+\\(\\.[0-9]+\\)?S\\)?\\)?$" in
181
+
if Str.string_match pattern_time_only rest 0 then
182
+
(* Check that at least one component exists after T *)
183
+
if String.length rest > 1 then (true, None)
184
+
else (false, Some "Invalid duration format")
185
+
else if Str.string_match pattern_weeks rest 0 then
186
+
(true, None)
187
+
else if Str.string_match pattern_days rest 0 then
188
+
(true, None)
189
+
else
190
+
(false, Some "Invalid duration format")
191
+
192
+
(** Validate timezone offset +HH:MM or -HH:MM or +HHMM or -HHMM *)
193
+
let validate_timezone_offset s =
194
+
(* Try +HH:MM format *)
195
+
let pattern_colon = Str.regexp "^[+-]\\([0-9][0-9]\\):\\([0-9][0-9]\\)$" in
196
+
(* Try +HHMM format (no colon) *)
197
+
let pattern_no_colon = Str.regexp "^[+-]\\([0-9][0-9]\\)\\([0-9][0-9]\\)$" in
198
+
let matched =
199
+
if Str.string_match pattern_colon s 0 then true
200
+
else Str.string_match pattern_no_colon s 0
201
+
in
202
+
if not matched then
203
+
(false, Some "Invalid timezone offset")
204
+
else
205
+
let hour_s = Str.matched_group 1 s in
206
+
let minute_s = Str.matched_group 2 s in
207
+
match (parse_int hour_s, parse_int minute_s) with
208
+
| None, _ | _, None -> (false, Some "Invalid timezone")
209
+
| Some hour, Some minute ->
210
+
if hour > 23 || minute > 59 then (false, Some "Timezone offset out of range")
211
+
else (true, None)
212
+
213
+
(** Validate datetime with timezone: YYYY-MM-DDTHH:MM:SS[.sss]Z or YYYY-MM-DDTHH:MM:SS[.sss]+HH:MM *)
214
+
let validate_datetime_with_timezone s =
215
+
(* Try to split on T or space *)
216
+
let sep_pos =
217
+
try Some (String.index s 'T')
218
+
with Not_found ->
219
+
try Some (String.index s ' ')
220
+
with Not_found -> None
221
+
in
222
+
match sep_pos with
223
+
| None -> (false, Some "The literal did not satisfy the datetime with timezone format")
224
+
| Some pos ->
225
+
let date_part = String.sub s 0 pos in
226
+
let time_and_tz = String.sub s (pos + 1) (String.length s - pos - 1) in
227
+
(* Validate date *)
228
+
match validate_date date_part with
229
+
| (false, reason) -> (false, reason)
230
+
| (true, _) ->
231
+
(* Check if ends with Z *)
232
+
if String.length time_and_tz > 0 && time_and_tz.[String.length time_and_tz - 1] = 'Z' then begin
233
+
let time_part = String.sub time_and_tz 0 (String.length time_and_tz - 1) in
234
+
match validate_time time_part with
235
+
| (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format")
236
+
| (true, _) -> (true, None)
237
+
end
238
+
else begin
239
+
(* Check for +/- timezone offset *)
240
+
let plus_pos = try Some (String.rindex time_and_tz '+') with Not_found -> None in
241
+
let minus_pos = try Some (String.rindex time_and_tz '-') with Not_found -> None in
242
+
let tz_pos = match plus_pos, minus_pos with
243
+
| Some p, Some m -> Some (max p m)
244
+
| Some p, None -> Some p
245
+
| None, Some m -> Some m
246
+
| None, None -> None
247
+
in
248
+
match tz_pos with
249
+
| None -> (false, Some "The literal did not satisfy the datetime with timezone format")
250
+
| Some tp ->
251
+
let time_part = String.sub time_and_tz 0 tp in
252
+
let tz_part = String.sub time_and_tz tp (String.length time_and_tz - tp) in
253
+
match validate_time time_part with
254
+
| (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format")
255
+
| (true, _) ->
256
+
match validate_timezone_offset tz_part with
257
+
| (false, _) -> (false, Some "The literal did not satisfy the datetime with timezone format")
258
+
| (true, _) -> (true, None)
259
+
end
260
+
261
+
(** Validate datetime-local: YYYY-MM-DDTHH:MM[:SS[.sss]] or YYYY-MM-DD HH:MM *)
262
+
let validate_datetime_local s =
263
+
let sep_pos =
264
+
try Some (String.index s 'T')
265
+
with Not_found ->
266
+
try Some (String.index s ' ')
267
+
with Not_found -> None
268
+
in
269
+
match sep_pos with
270
+
| None -> (false, Some "Invalid datetime-local format")
271
+
| Some pos ->
272
+
let date_part = String.sub s 0 pos in
273
+
let time_part = String.sub s (pos + 1) (String.length s - pos - 1) in
274
+
match validate_date date_part with
275
+
| (false, reason) -> (false, reason)
276
+
| (true, _) ->
277
+
match validate_time time_part with
278
+
| (false, reason) -> (false, reason)
279
+
| (true, _) -> (true, None)
280
+
281
+
(** Result type for datetime validation - can be Ok, Error, or Warning *)
282
+
type datetime_result =
283
+
| Ok
284
+
| Error of string
285
+
| Warning of string
286
+
287
+
(** Validate datetime attribute - valid formats depend on element:
288
+
- del/ins: only date or datetime-with-timezone
289
+
- time: date, time, datetime-local, datetime-with-timezone, year, month, week, yearless, duration *)
290
+
let validate_datetime_attr value element_name attr_name =
291
+
let is_time_element = element_name = "time" in
292
+
(* Check for leading/trailing whitespace - not allowed *)
293
+
if value <> String.trim value then begin
294
+
let tz_msg = "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." in
295
+
let date_msg = "Bad date: The literal did not satisfy the date format." in
296
+
Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
297
+
value attr_name element_name tz_msg date_msg)
298
+
end
299
+
else
300
+
(* Try datetime with timezone first *)
301
+
match validate_datetime_with_timezone value with
302
+
| (true, _) -> Ok (* Valid datetime with timezone *)
303
+
| (false, tz_error) ->
304
+
(* Try just date - valid for all elements *)
305
+
match validate_date value with
306
+
| (true, _) ->
307
+
(* Date is valid, but check for suspicious year (5+ digits) *)
308
+
if has_suspicious_year value then begin
309
+
let date_msg = "Bad date: Year may be mistyped." in
310
+
let tz_msg = match tz_error with
311
+
| Some e -> Printf.sprintf "Bad datetime with timezone: %s." e
312
+
| None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format."
313
+
in
314
+
Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
315
+
value attr_name element_name date_msg tz_msg)
316
+
end else
317
+
Ok (* Valid date with normal year *)
318
+
| (false, date_error) ->
319
+
(* For time element only, try additional formats *)
320
+
if is_time_element then begin
321
+
match validate_datetime_local value with
322
+
| (true, _) -> Ok (* Valid datetime-local *)
323
+
| (false, _) ->
324
+
match validate_time value with
325
+
| (true, _) -> Ok (* Valid time *)
326
+
| (false, _) ->
327
+
match validate_year_month value with
328
+
| (true, _) -> Ok (* Valid month YYYY-MM *)
329
+
| (false, _) ->
330
+
match validate_year_only value with
331
+
| (true, _) -> Ok (* Valid year YYYY *)
332
+
| (false, _) ->
333
+
match validate_week value with
334
+
| (true, _) -> Ok (* Valid week YYYY-Www *)
335
+
| (false, _) ->
336
+
match validate_yearless_date value with
337
+
| (true, _) -> Ok (* Valid yearless date --MM-DD *)
338
+
| (false, _) ->
339
+
match validate_duration value with
340
+
| (true, _) -> Ok (* Valid duration P... *)
341
+
| (false, _) ->
342
+
let tz_msg = match tz_error with
343
+
| Some e -> Printf.sprintf "Bad datetime with timezone: %s." e
344
+
| None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format."
345
+
in
346
+
let date_msg = match date_error with
347
+
| Some e -> Printf.sprintf "Bad date: %s." e
348
+
| None -> "Bad date: The literal did not satisfy the date format."
349
+
in
350
+
Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
351
+
value attr_name element_name tz_msg date_msg)
352
+
end
353
+
else begin
354
+
(* del/ins only allow date or datetime-with-timezone *)
355
+
let tz_msg = match tz_error with
356
+
| Some e -> Printf.sprintf "Bad datetime with timezone: %s." e
357
+
| None -> "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format."
358
+
in
359
+
let date_msg = match date_error with
360
+
| Some e -> Printf.sprintf "Bad date: %s." e
361
+
| None -> "Bad date: The literal did not satisfy the date format."
362
+
in
363
+
Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
364
+
value attr_name element_name tz_msg date_msg)
365
+
end
366
+
367
+
(** Checker state *)
368
+
type state = unit
369
+
370
+
let create () = ()
371
+
let reset _state = ()
372
+
373
+
let start_element _state ~name ~namespace ~attrs collector =
374
+
if namespace <> None then ()
375
+
else begin
376
+
let name_lower = String.lowercase_ascii name in
377
+
if List.mem name_lower datetime_elements then begin
378
+
(* Check for datetime attribute *)
379
+
let datetime_attr = List.find_map (fun (k, v) ->
380
+
if String.lowercase_ascii k = "datetime" then Some v else None
381
+
) attrs in
382
+
match datetime_attr with
383
+
| None -> ()
384
+
| Some value ->
385
+
if String.trim value = "" then ()
386
+
else
387
+
match validate_datetime_attr value name "datetime" with
388
+
| Ok -> ()
389
+
| Error error_msg ->
390
+
Message_collector.add_error collector
391
+
~message:error_msg
392
+
~code:"bad-datetime"
393
+
~element:name
394
+
~attribute:"datetime"
395
+
()
396
+
| Warning warn_msg ->
397
+
Message_collector.add_warning collector
398
+
~message:warn_msg
399
+
~code:"suspicious-datetime"
400
+
~element:name
401
+
~attribute:"datetime"
402
+
()
403
+
end
404
+
end
405
+
406
+
let end_element _state ~name:_ ~namespace:_ _collector = ()
407
+
let characters _state _text _collector = ()
408
+
let end_document _state _collector = ()
409
+
410
+
let checker =
411
+
(module struct
412
+
type nonrec state = state
413
+
let create = create
414
+
let reset = reset
415
+
let start_element = start_element
416
+
let end_element = end_element
417
+
let characters = characters
418
+
let end_document = end_document
419
+
end : Checker.S)
+283
lib/html5_checker/specialized/dl_checker.ml
+283
lib/html5_checker/specialized/dl_checker.ml
···
1
+
(** DL element content model validation checker. *)
2
+
3
+
(** Checker state for tracking dl element context. *)
4
+
type dl_context = {
5
+
mutable has_dt : bool;
6
+
mutable has_dd : bool;
7
+
mutable last_was_dt : bool;
8
+
mutable contains_div : bool;
9
+
mutable contains_dt_dd : bool;
10
+
mutable dd_before_dt_error_reported : bool; (* Track if we've reported dd-before-dt error *)
11
+
}
12
+
13
+
type div_context = {
14
+
mutable has_dt : bool;
15
+
mutable has_dd : bool;
16
+
}
17
+
18
+
type state = {
19
+
mutable dl_stack : dl_context list;
20
+
mutable div_in_dl_stack : div_context list;
21
+
mutable in_template : int; (* Template nesting depth *)
22
+
mutable in_dt_dd : int; (* Depth inside dt/dd elements *)
23
+
mutable parent_stack : string list; (* Stack of parent element names for context errors *)
24
+
}
25
+
26
+
let create () = {
27
+
dl_stack = [];
28
+
div_in_dl_stack = [];
29
+
in_template = 0;
30
+
in_dt_dd = 0;
31
+
parent_stack = [];
32
+
}
33
+
34
+
let reset state =
35
+
state.dl_stack <- [];
36
+
state.div_in_dl_stack <- [];
37
+
state.in_template <- 0;
38
+
state.in_dt_dd <- 0;
39
+
state.parent_stack <- []
40
+
41
+
let current_parent state =
42
+
(* The stack has current element on top, so parent is second *)
43
+
match state.parent_stack with
44
+
| _ :: p :: _ -> Some p
45
+
| _ -> None
46
+
47
+
let current_dl state =
48
+
match state.dl_stack with
49
+
| ctx :: _ -> Some ctx
50
+
| [] -> None
51
+
52
+
let current_div state =
53
+
match state.div_in_dl_stack with
54
+
| ctx :: _ -> Some ctx
55
+
| [] -> None
56
+
57
+
let start_element state ~name ~namespace ~attrs:_ collector =
58
+
let name_lower = String.lowercase_ascii name in
59
+
60
+
(* Track parent stack for all HTML elements first *)
61
+
if namespace = None then
62
+
state.parent_stack <- name_lower :: state.parent_stack;
63
+
64
+
if namespace <> None then ()
65
+
else begin
66
+
match name_lower with
67
+
| "template" ->
68
+
state.in_template <- state.in_template + 1
69
+
70
+
| "dl" when state.in_template = 0 ->
71
+
(* Check for nested dl - only error if direct child (not inside dt/dd) *)
72
+
begin match current_dl state with
73
+
| Some _ when state.in_dt_dd = 0 && state.div_in_dl_stack = [] ->
74
+
Message_collector.add_error collector
75
+
~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
76
+
~code:"disallowed-child"
77
+
~element:"dl" ()
78
+
| _ -> ()
79
+
end;
80
+
let ctx = {
81
+
has_dt = false;
82
+
has_dd = false;
83
+
last_was_dt = false;
84
+
contains_div = false;
85
+
contains_dt_dd = false;
86
+
dd_before_dt_error_reported = false;
87
+
} in
88
+
state.dl_stack <- ctx :: state.dl_stack
89
+
90
+
| "div" when state.in_template = 0 ->
91
+
begin match current_dl state with
92
+
| Some dl_ctx when state.div_in_dl_stack = [] ->
93
+
(* Direct div child of dl *)
94
+
dl_ctx.contains_div <- true;
95
+
(* Check for mixed content - if we already have dt/dd, div is not allowed *)
96
+
if dl_ctx.contains_dt_dd then
97
+
Message_collector.add_error collector
98
+
~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
99
+
~code:"disallowed-child"
100
+
~element:"div" ();
101
+
let div_ctx = { has_dt = false; has_dd = false } in
102
+
state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack
103
+
| Some _ when state.div_in_dl_stack <> [] ->
104
+
(* Nested div inside div in dl - not allowed *)
105
+
Message_collector.add_error collector
106
+
~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdiv\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
107
+
~code:"disallowed-child"
108
+
~element:"div" ()
109
+
| _ -> ()
110
+
end
111
+
112
+
| "dt" when state.in_template = 0 ->
113
+
state.in_dt_dd <- state.in_dt_dd + 1;
114
+
begin match current_div state with
115
+
| Some div_ctx ->
116
+
div_ctx.has_dt <- true
117
+
| None ->
118
+
match current_dl state with
119
+
| Some dl_ctx ->
120
+
dl_ctx.has_dt <- true;
121
+
dl_ctx.last_was_dt <- true;
122
+
dl_ctx.contains_dt_dd <- true;
123
+
(* Check for mixed content - if we already have div, dt is not allowed *)
124
+
if dl_ctx.contains_div then
125
+
Message_collector.add_error collector
126
+
~message:"Element \xe2\x80\x9cdt\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
127
+
~code:"disallowed-child"
128
+
~element:"dt" ()
129
+
| None ->
130
+
(* dt outside dl context - error *)
131
+
let parent = match current_parent state with
132
+
| Some p -> p
133
+
| None -> "document"
134
+
in
135
+
Message_collector.add_error collector
136
+
~message:(Printf.sprintf "Element \xe2\x80\x9cdt\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" parent)
137
+
~code:"disallowed-child"
138
+
~element:"dt" ()
139
+
end
140
+
141
+
| "dd" when state.in_template = 0 ->
142
+
state.in_dt_dd <- state.in_dt_dd + 1;
143
+
begin match current_div state with
144
+
| Some div_ctx ->
145
+
div_ctx.has_dd <- true
146
+
| None ->
147
+
match current_dl state with
148
+
| Some dl_ctx ->
149
+
(* Check if dd appears before any dt - only report once per dl *)
150
+
if not dl_ctx.has_dt && not dl_ctx.dd_before_dt_error_reported then begin
151
+
dl_ctx.dd_before_dt_error_reported <- true;
152
+
Message_collector.add_error collector
153
+
~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d is missing a required child element."
154
+
~code:"missing-required-child"
155
+
~element:"dl" ()
156
+
end;
157
+
dl_ctx.has_dd <- true;
158
+
dl_ctx.last_was_dt <- false;
159
+
dl_ctx.contains_dt_dd <- true;
160
+
(* Check for mixed content *)
161
+
if dl_ctx.contains_div then
162
+
Message_collector.add_error collector
163
+
~message:"Element \xe2\x80\x9cdd\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
164
+
~code:"disallowed-child"
165
+
~element:"dd" ()
166
+
| None ->
167
+
(* dd outside dl context - error *)
168
+
let parent = match current_parent state with
169
+
| Some p -> p
170
+
| None -> "document"
171
+
in
172
+
Message_collector.add_error collector
173
+
~message:(Printf.sprintf "Element \xe2\x80\x9cdd\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" parent)
174
+
~code:"disallowed-child"
175
+
~element:"dd" ()
176
+
end
177
+
178
+
| _ -> ()
179
+
end
180
+
181
+
let end_element state ~name ~namespace collector =
182
+
if namespace <> None then ()
183
+
else begin
184
+
let name_lower = String.lowercase_ascii name in
185
+
186
+
(* Pop from parent stack *)
187
+
(match state.parent_stack with
188
+
| _ :: rest -> state.parent_stack <- rest
189
+
| [] -> ());
190
+
191
+
match name_lower with
192
+
| "template" ->
193
+
state.in_template <- max 0 (state.in_template - 1)
194
+
195
+
| "dt" | "dd" when state.in_template = 0 ->
196
+
state.in_dt_dd <- max 0 (state.in_dt_dd - 1)
197
+
198
+
| "dl" when state.in_template = 0 ->
199
+
begin match state.dl_stack with
200
+
| ctx :: rest ->
201
+
state.dl_stack <- rest;
202
+
(* Check dl content model at end *)
203
+
if ctx.contains_dt_dd then begin
204
+
(* Direct dt/dd content - must have both *)
205
+
if not ctx.has_dt && not ctx.dd_before_dt_error_reported then
206
+
(* Only report missing dt if we didn't already report it when dd appeared first *)
207
+
Message_collector.add_error collector
208
+
~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d is missing a required child element."
209
+
~code:"missing-required-child"
210
+
~element:"dl" ()
211
+
else if not ctx.has_dd then
212
+
Message_collector.add_error collector
213
+
~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d."
214
+
~code:"missing-required-child"
215
+
~element:"dl" ()
216
+
else if ctx.last_was_dt then
217
+
(* Ended with dt, missing dd *)
218
+
Message_collector.add_error collector
219
+
~message:"Element \xe2\x80\x9cdl\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d."
220
+
~code:"missing-required-child"
221
+
~element:"dl" ()
222
+
end else if not ctx.contains_div && not ctx.has_dt && not ctx.has_dd then begin
223
+
(* Empty dl or only contained text/other elements - that's ok for now *)
224
+
()
225
+
end
226
+
| [] -> ()
227
+
end
228
+
229
+
| "div" when state.in_template = 0 ->
230
+
begin match state.div_in_dl_stack with
231
+
| div_ctx :: rest ->
232
+
state.div_in_dl_stack <- rest;
233
+
(* Check div in dl must have both dt and dd *)
234
+
if not div_ctx.has_dt && not div_ctx.has_dd then
235
+
Message_collector.add_error collector
236
+
~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d."
237
+
~code:"missing-required-child"
238
+
~element:"div" ()
239
+
else if not div_ctx.has_dt then
240
+
Message_collector.add_error collector
241
+
~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d is missing required child element \xe2\x80\x9cdt\xe2\x80\x9d."
242
+
~code:"missing-required-child"
243
+
~element:"div" ()
244
+
else if not div_ctx.has_dd then
245
+
Message_collector.add_error collector
246
+
~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d is missing required child element \xe2\x80\x9cdd\xe2\x80\x9d."
247
+
~code:"missing-required-child"
248
+
~element:"div" ()
249
+
| [] -> ()
250
+
end
251
+
252
+
| _ -> ()
253
+
end
254
+
255
+
let characters state text collector =
256
+
if state.in_template > 0 then ()
257
+
else if state.in_dt_dd > 0 then () (* Text in dt/dd is fine *)
258
+
else begin
259
+
let trimmed = String.trim text in
260
+
if trimmed <> "" then begin
261
+
(* Check for text directly in dl *)
262
+
match current_dl state with
263
+
| Some _ when state.div_in_dl_stack = [] ->
264
+
Message_collector.add_error collector
265
+
~message:"Text not allowed in element \xe2\x80\x9cdl\xe2\x80\x9d in this context."
266
+
~code:"text-not-allowed"
267
+
~element:"dl" ()
268
+
| _ -> ()
269
+
end
270
+
end
271
+
272
+
let end_document _state _collector = ()
273
+
274
+
let checker =
275
+
(module struct
276
+
type nonrec state = state
277
+
let create = create
278
+
let reset = reset
279
+
let start_element = start_element
280
+
let end_element = end_element
281
+
let characters = characters
282
+
let end_document = end_document
283
+
end : Checker.S)
+42
lib/html5_checker/specialized/h1_checker.ml
+42
lib/html5_checker/specialized/h1_checker.ml
···
1
+
(** H1 element counter - warns about multiple h1 elements in a document. *)
2
+
3
+
type state = {
4
+
mutable h1_count : int;
5
+
}
6
+
7
+
let create () = {
8
+
h1_count = 0;
9
+
}
10
+
11
+
let reset state =
12
+
state.h1_count <- 0
13
+
14
+
let start_element state ~name ~namespace ~attrs collector =
15
+
ignore attrs;
16
+
if namespace <> None then ()
17
+
else begin
18
+
let name_lower = String.lowercase_ascii name in
19
+
if name_lower = "h1" then begin
20
+
state.h1_count <- state.h1_count + 1;
21
+
if state.h1_count > 1 then
22
+
Message_collector.add_info collector
23
+
~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
24
+
~code:"multiple-h1"
25
+
~element:name ()
26
+
end
27
+
end
28
+
29
+
let end_element _state ~name:_ ~namespace:_ _collector = ()
30
+
let characters _state _text _collector = ()
31
+
let end_document _state _collector = ()
32
+
33
+
let checker =
34
+
(module struct
35
+
type nonrec state = state
36
+
let create = create
37
+
let reset = reset
38
+
let start_element = start_element
39
+
let end_element = end_element
40
+
let characters = characters
41
+
let end_document = end_document
42
+
end : Checker.S)
+1
-1
lib/html5_checker/specialized/heading_checker.ml
+1
-1
lib/html5_checker/specialized/heading_checker.ml
···
80
80
state.h1_count <- state.h1_count + 1;
81
81
if state.h1_count > 1 then
82
82
Message_collector.add_warning collector
83
-
~message:"Multiple <h1> elements detected. While valid in HTML5 sectioning content, traditional advice suggests one <h1> per page"
83
+
~message:"Consider using only one \xe2\x80\x9ch1\xe2\x80\x9d element per document (or, if using \xe2\x80\x9ch1\xe2\x80\x9d elements multiple times is required, consider using the \xe2\x80\x9cheadingoffset\xe2\x80\x9d attribute to indicate that these \xe2\x80\x9ch1\xe2\x80\x9d elements are not all top-level headings)."
84
84
~code:"multiple-h1"
85
85
~element:name
86
86
()
+115
lib/html5_checker/specialized/label_checker.ml
+115
lib/html5_checker/specialized/label_checker.ml
···
1
+
(** Label element content model validation checker.
2
+
Validates that label element contains at most one labelable element
3
+
and that descendants with for attribute have matching ids. *)
4
+
5
+
(** Labelable elements that label can reference *)
6
+
let labelable_elements = ["button"; "input"; "meter"; "output"; "progress"; "select"; "textarea"]
7
+
8
+
(** Helper to get attribute value *)
9
+
let get_attr attrs name =
10
+
let name_lower = String.lowercase_ascii name in
11
+
List.find_map (fun (n, v) ->
12
+
if String.lowercase_ascii n = name_lower then Some v else None
13
+
) attrs
14
+
15
+
type state = {
16
+
mutable in_label : bool;
17
+
mutable label_depth : int;
18
+
mutable labelable_count : int;
19
+
mutable label_for_value : string option; (* Value of for attribute on current label *)
20
+
}
21
+
22
+
let create () = {
23
+
in_label = false;
24
+
label_depth = 0;
25
+
labelable_count = 0;
26
+
label_for_value = None;
27
+
}
28
+
29
+
let reset state =
30
+
state.in_label <- false;
31
+
state.label_depth <- 0;
32
+
state.labelable_count <- 0;
33
+
state.label_for_value <- None
34
+
35
+
let start_element state ~name ~namespace ~attrs collector =
36
+
if namespace <> None then ()
37
+
else begin
38
+
let name_lower = String.lowercase_ascii name in
39
+
40
+
if name_lower = "label" then begin
41
+
state.in_label <- true;
42
+
state.label_depth <- 0;
43
+
state.labelable_count <- 0;
44
+
state.label_for_value <- get_attr attrs "for"
45
+
end;
46
+
47
+
if state.in_label then begin
48
+
state.label_depth <- state.label_depth + 1;
49
+
50
+
(* Check for labelable elements inside label *)
51
+
if List.mem name_lower labelable_elements then begin
52
+
state.labelable_count <- state.labelable_count + 1;
53
+
if state.labelable_count > 1 then
54
+
Message_collector.add_error collector
55
+
~message:"The \xe2\x80\x9clabel\xe2\x80\x9d element may contain at most one \xe2\x80\x9cbutton\xe2\x80\x9d, \xe2\x80\x9cinput\xe2\x80\x9d, \xe2\x80\x9cmeter\xe2\x80\x9d, \xe2\x80\x9coutput\xe2\x80\x9d, \xe2\x80\x9cprogress\xe2\x80\x9d, \xe2\x80\x9cselect\xe2\x80\x9d, or \xe2\x80\x9ctextarea\xe2\x80\x9d descendant."
56
+
~code:"too-many-labelable-descendants"
57
+
~element:"label" ();
58
+
59
+
(* Check if label has for attribute and descendant has mismatched id *)
60
+
match state.label_for_value with
61
+
| Some for_value ->
62
+
let descendant_id = get_attr attrs "id" in
63
+
(match descendant_id with
64
+
| None ->
65
+
(* Descendant has no id, but label has for attribute *)
66
+
Message_collector.add_error collector
67
+
~message:(Printf.sprintf "Any \xe2\x80\x9c%s\xe2\x80\x9d descendant of a \xe2\x80\x9clabel\xe2\x80\x9d element with a \xe2\x80\x9cfor\xe2\x80\x9d attribute must have an ID value that matches that \xe2\x80\x9cfor\xe2\x80\x9d attribute." name_lower)
68
+
~code:"label-for-descendant-id-mismatch"
69
+
~element:name_lower ()
70
+
| Some id when id <> for_value ->
71
+
(* Descendant has id, but it doesn't match the for value *)
72
+
Message_collector.add_error collector
73
+
~message:(Printf.sprintf "Any \xe2\x80\x9c%s\xe2\x80\x9d descendant of a \xe2\x80\x9clabel\xe2\x80\x9d element with a \xe2\x80\x9cfor\xe2\x80\x9d attribute must have an ID value that matches that \xe2\x80\x9cfor\xe2\x80\x9d attribute." name_lower)
74
+
~code:"label-for-descendant-id-mismatch"
75
+
~element:name_lower ()
76
+
| Some _ ->
77
+
(* id matches for value - no error *)
78
+
())
79
+
| None ->
80
+
(* No for attribute on label - no constraint on descendant id *)
81
+
()
82
+
end
83
+
end
84
+
end
85
+
86
+
let end_element state ~name ~namespace _collector =
87
+
if namespace <> None then ()
88
+
else begin
89
+
let name_lower = String.lowercase_ascii name in
90
+
91
+
if state.in_label then begin
92
+
state.label_depth <- state.label_depth - 1;
93
+
94
+
if name_lower = "label" && state.label_depth < 0 then begin
95
+
state.in_label <- false;
96
+
state.labelable_count <- 0;
97
+
state.label_for_value <- None
98
+
end
99
+
end
100
+
end
101
+
102
+
let characters _state _text _collector = ()
103
+
104
+
let end_document _state _collector = ()
105
+
106
+
let checker =
107
+
(module struct
108
+
type nonrec state = state
109
+
let create = create
110
+
let reset = reset
111
+
let start_element = start_element
112
+
let end_element = end_element
113
+
let characters = characters
114
+
let end_document = end_document
115
+
end : Checker.S)
+192
lib/html5_checker/specialized/picture_checker.ml
+192
lib/html5_checker/specialized/picture_checker.ml
···
1
+
(** Picture element content model and attribute validation checker. *)
2
+
3
+
(** Elements allowed as children of picture *)
4
+
let allowed_picture_children = ["source"; "img"; "script"; "template"]
5
+
6
+
(** Attributes NOT allowed on picture element *)
7
+
let disallowed_picture_attrs = [
8
+
"align"; "alt"; "border"; "crossorigin"; "height"; "hspace"; "ismap";
9
+
"longdesc"; "lowsrc"; "media"; "name"; "sizes"; "src"; "srcset"; "usemap";
10
+
"vspace"; "width"; "role"
11
+
]
12
+
13
+
(** Attributes NOT allowed on source element when in picture context *)
14
+
let disallowed_source_attrs_in_picture = [
15
+
"align"; "alt"; "border"; "crossorigin"; "hspace"; "ismap"; "longdesc";
16
+
"name"; "src"; "usemap"; "vspace"; "role"
17
+
]
18
+
19
+
(** Attributes NOT allowed on img element *)
20
+
let disallowed_img_attrs = ["type"]
21
+
22
+
(** Checker state. *)
23
+
type state = {
24
+
mutable in_picture : bool;
25
+
mutable has_img_in_picture : bool;
26
+
mutable picture_depth : int;
27
+
mutable children_in_picture : string list;
28
+
mutable last_was_img : bool;
29
+
mutable has_source_after_img : bool;
30
+
}
31
+
32
+
let create () = {
33
+
in_picture = false;
34
+
has_img_in_picture = false;
35
+
picture_depth = 0;
36
+
children_in_picture = [];
37
+
last_was_img = false;
38
+
has_source_after_img = false;
39
+
}
40
+
41
+
let reset state =
42
+
state.in_picture <- false;
43
+
state.has_img_in_picture <- false;
44
+
state.picture_depth <- 0;
45
+
state.children_in_picture <- [];
46
+
state.last_was_img <- false;
47
+
state.has_source_after_img <- false
48
+
49
+
(** Check if an attribute list contains a specific attribute. *)
50
+
let has_attr name attrs =
51
+
List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs
52
+
53
+
(** Report disallowed attribute error *)
54
+
let report_disallowed_attr element attr collector =
55
+
Message_collector.add_error collector
56
+
~message:(Printf.sprintf "Attribute \xe2\x80\x9c%s\xe2\x80\x9d not allowed on element \xe2\x80\x9c%s\xe2\x80\x9d at this point."
57
+
attr element)
58
+
~code:"disallowed-attribute"
59
+
~element ~attribute:attr ()
60
+
61
+
(** Report disallowed child element error *)
62
+
let report_disallowed_child parent child collector =
63
+
Message_collector.add_error collector
64
+
~message:(Printf.sprintf "Element \xe2\x80\x9c%s\xe2\x80\x9d not allowed as child of element \xe2\x80\x9c%s\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)"
65
+
child parent)
66
+
~code:"disallowed-child"
67
+
~element:child ()
68
+
69
+
let check_picture_attrs attrs collector =
70
+
List.iter (fun disallowed ->
71
+
if has_attr disallowed attrs then
72
+
report_disallowed_attr "picture" disallowed collector
73
+
) disallowed_picture_attrs
74
+
75
+
let check_source_attrs_in_picture attrs collector =
76
+
List.iter (fun disallowed ->
77
+
if has_attr disallowed attrs then
78
+
report_disallowed_attr "source" disallowed collector
79
+
) disallowed_source_attrs_in_picture;
80
+
(* source in picture requires srcset *)
81
+
if not (has_attr "srcset" attrs) then
82
+
Message_collector.add_error collector
83
+
~message:"Element \xe2\x80\x9csource\xe2\x80\x9d is missing required attribute \xe2\x80\x9csrcset\xe2\x80\x9d."
84
+
~code:"missing-required-attribute"
85
+
~element:"source" ~attribute:"srcset" ()
86
+
87
+
let check_img_attrs attrs collector =
88
+
List.iter (fun disallowed ->
89
+
if has_attr disallowed attrs then
90
+
report_disallowed_attr "img" disallowed collector
91
+
) disallowed_img_attrs
92
+
93
+
let start_element state ~name ~namespace ~attrs collector =
94
+
let name_lower = String.lowercase_ascii name in
95
+
96
+
(* Check for disallowed children of picture first - even foreign content *)
97
+
if state.in_picture && state.picture_depth = 1 then begin
98
+
if not (List.mem name_lower allowed_picture_children) then
99
+
report_disallowed_child "picture" name_lower collector
100
+
end;
101
+
102
+
(* Rest of checks only apply to HTML namespace elements *)
103
+
if namespace = None then begin
104
+
match name_lower with
105
+
| "picture" ->
106
+
check_picture_attrs attrs collector;
107
+
state.in_picture <- true;
108
+
state.has_img_in_picture <- false;
109
+
state.picture_depth <- 0; (* Will be incremented to 1 at end of function *)
110
+
state.children_in_picture <- [];
111
+
state.last_was_img <- false;
112
+
state.has_source_after_img <- false
113
+
114
+
| "source" when state.in_picture && state.picture_depth = 1 ->
115
+
check_source_attrs_in_picture attrs collector;
116
+
state.children_in_picture <- "source" :: state.children_in_picture;
117
+
if state.last_was_img then
118
+
state.has_source_after_img <- true
119
+
120
+
| "img" when state.in_picture && state.picture_depth = 1 ->
121
+
check_img_attrs attrs collector;
122
+
state.has_img_in_picture <- true;
123
+
state.children_in_picture <- "img" :: state.children_in_picture;
124
+
state.last_was_img <- true;
125
+
(* Check for multiple img elements *)
126
+
let img_count = List.filter (fun c -> c = "img") state.children_in_picture |> List.length in
127
+
if img_count > 1 then
128
+
report_disallowed_child "picture" "img" collector
129
+
130
+
| "script" when state.in_picture && state.picture_depth = 1 ->
131
+
state.children_in_picture <- "script" :: state.children_in_picture
132
+
133
+
| "template" when state.in_picture && state.picture_depth = 1 ->
134
+
state.children_in_picture <- "template" :: state.children_in_picture
135
+
136
+
| "img" ->
137
+
check_img_attrs attrs collector
138
+
139
+
| _ -> ()
140
+
end;
141
+
142
+
(* Track depth when inside picture *)
143
+
if state.in_picture then
144
+
state.picture_depth <- state.picture_depth + 1
145
+
146
+
let end_element state ~name ~namespace collector =
147
+
if namespace <> None then ()
148
+
else begin
149
+
let name_lower = String.lowercase_ascii name in
150
+
151
+
(* Track depth *)
152
+
if state.in_picture then
153
+
state.picture_depth <- state.picture_depth - 1;
154
+
155
+
if name_lower = "picture" && state.picture_depth = 0 then begin
156
+
(* Check if picture had img child *)
157
+
if not state.has_img_in_picture then
158
+
Message_collector.add_error collector
159
+
~message:"Element \xe2\x80\x9cpicture\xe2\x80\x9d is missing required child element \xe2\x80\x9cimg\xe2\x80\x9d."
160
+
~code:"missing-required-child"
161
+
~element:"picture" ();
162
+
(* Check for source after img *)
163
+
if state.has_source_after_img then
164
+
report_disallowed_child "picture" "source" collector;
165
+
166
+
state.in_picture <- false
167
+
end
168
+
end
169
+
170
+
let characters state text collector =
171
+
(* Text in picture element is not allowed *)
172
+
if state.in_picture && state.picture_depth = 1 then begin
173
+
let trimmed = String.trim text in
174
+
if trimmed <> "" then
175
+
Message_collector.add_error collector
176
+
~message:"Text not allowed in element \xe2\x80\x9cpicture\xe2\x80\x9d in this context."
177
+
~code:"text-not-allowed"
178
+
~element:"picture" ()
179
+
end
180
+
181
+
let end_document _state _collector = ()
182
+
183
+
let checker =
184
+
(module struct
185
+
type nonrec state = state
186
+
let create = create
187
+
let reset = reset
188
+
let start_element = start_element
189
+
let end_element = end_element
190
+
let characters = characters
191
+
let end_document = end_document
192
+
end : Checker.S)
+141
lib/html5_checker/specialized/ruby_checker.ml
+141
lib/html5_checker/specialized/ruby_checker.ml
···
1
+
(** Ruby element content model validation checker.
2
+
3
+
Validates that:
4
+
- Ruby contains at least one rt element
5
+
- Ruby contains phrasing content before rt elements *)
6
+
7
+
type ruby_info = {
8
+
mutable has_rt : bool;
9
+
mutable has_content_before_rt : bool;
10
+
mutable saw_rt : bool; (* Whether we've seen rt yet *)
11
+
mutable depth : int; (* Track nesting level *)
12
+
}
13
+
14
+
type state = {
15
+
mutable ruby_stack : ruby_info list; (* Stack for nested ruby elements *)
16
+
mutable in_template : int;
17
+
}
18
+
19
+
let create () = {
20
+
ruby_stack = [];
21
+
in_template = 0;
22
+
}
23
+
24
+
let reset state =
25
+
state.ruby_stack <- [];
26
+
state.in_template <- 0
27
+
28
+
(** Check if element is phrasing content that can appear before rt *)
29
+
let is_phrasing_content name =
30
+
let name_lower = String.lowercase_ascii name in
31
+
(* rt and rp are special - they don't count as "content before rt" *)
32
+
name_lower <> "rt" && name_lower <> "rp"
33
+
34
+
let start_element state ~name ~namespace ~attrs _collector =
35
+
ignore attrs;
36
+
if namespace <> None then ()
37
+
else begin
38
+
let name_lower = String.lowercase_ascii name in
39
+
40
+
if name_lower = "template" then
41
+
state.in_template <- state.in_template + 1;
42
+
43
+
if state.in_template > 0 then ()
44
+
else begin
45
+
if name_lower = "ruby" then begin
46
+
(* Push new ruby context *)
47
+
let info = {
48
+
has_rt = false;
49
+
has_content_before_rt = false;
50
+
saw_rt = false;
51
+
depth = 0;
52
+
} in
53
+
state.ruby_stack <- info :: state.ruby_stack
54
+
end;
55
+
56
+
match state.ruby_stack with
57
+
| info :: _ ->
58
+
(* Inside a ruby element *)
59
+
if name_lower = "ruby" then begin
60
+
(* This is the opening of ruby, set depth to 1 *)
61
+
info.depth <- 1
62
+
end else begin
63
+
if info.depth = 1 then begin
64
+
(* Direct children of ruby *)
65
+
if name_lower = "rt" then begin
66
+
info.has_rt <- true;
67
+
info.saw_rt <- true
68
+
end else if is_phrasing_content name_lower then begin
69
+
if not info.saw_rt then
70
+
info.has_content_before_rt <- true
71
+
end
72
+
end;
73
+
info.depth <- info.depth + 1
74
+
end
75
+
| [] -> ()
76
+
end
77
+
end
78
+
79
+
let end_element state ~name ~namespace collector =
80
+
if namespace <> None then ()
81
+
else begin
82
+
let name_lower = String.lowercase_ascii name in
83
+
84
+
if name_lower = "template" && state.in_template > 0 then
85
+
state.in_template <- state.in_template - 1;
86
+
87
+
if state.in_template > 0 then ()
88
+
else begin
89
+
match state.ruby_stack with
90
+
| info :: rest ->
91
+
info.depth <- info.depth - 1;
92
+
(* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *)
93
+
if name_lower = "ruby" && info.depth <= 0 then begin
94
+
(* Closing ruby element - validate *)
95
+
if not info.has_rt then
96
+
Message_collector.add_error collector
97
+
~message:"Element \xe2\x80\x9cruby\xe2\x80\x9d is missing one or more of the following child elements: [rp, rt]."
98
+
~code:"ruby-missing-rt"
99
+
~element:"ruby" ()
100
+
else if not info.has_content_before_rt then
101
+
Message_collector.add_error collector
102
+
~message:"Element \xe2\x80\x9cruby\xe2\x80\x9d is missing required child element \xe2\x80\x9crt\xe2\x80\x9d."
103
+
~code:"ruby-missing-content"
104
+
~element:"ruby" ();
105
+
state.ruby_stack <- rest
106
+
end
107
+
| [] -> ()
108
+
end
109
+
end
110
+
111
+
let characters state text _collector =
112
+
(* Text content counts as phrasing content before rt *)
113
+
if state.in_template > 0 then ()
114
+
else begin
115
+
match state.ruby_stack with
116
+
| info :: _ ->
117
+
if info.depth = 1 then begin
118
+
(* Direct text child of ruby *)
119
+
let has_non_whitespace =
120
+
String.exists (fun c ->
121
+
c <> ' ' && c <> '\t' && c <> '\n' && c <> '\r'
122
+
) text
123
+
in
124
+
if has_non_whitespace && not info.saw_rt then
125
+
info.has_content_before_rt <- true
126
+
end
127
+
| [] -> ()
128
+
end
129
+
130
+
let end_document _state _collector = ()
131
+
132
+
let checker =
133
+
(module struct
134
+
type nonrec state = state
135
+
let create = create
136
+
let reset = reset
137
+
let start_element = start_element
138
+
let end_element = end_element
139
+
let characters = characters
140
+
let end_document = end_document
141
+
end : Checker.S)
+103
lib/html5_checker/specialized/source_checker.ml
+103
lib/html5_checker/specialized/source_checker.ml
···
1
+
(** Source element context validation checker.
2
+
Validates that source attributes are appropriate for the parent context. *)
3
+
4
+
type parent_context =
5
+
| Picture
6
+
| Video
7
+
| Audio
8
+
| Other
9
+
10
+
type state = {
11
+
mutable context_stack : parent_context list;
12
+
}
13
+
14
+
let create () = {
15
+
context_stack = [];
16
+
}
17
+
18
+
let reset state =
19
+
state.context_stack <- []
20
+
21
+
let current_context state =
22
+
match state.context_stack with
23
+
| ctx :: _ -> ctx
24
+
| [] -> Other
25
+
26
+
(** Check if an attribute list contains a specific attribute. *)
27
+
let has_attr name attrs =
28
+
List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs
29
+
30
+
let start_element state ~name ~namespace ~attrs collector =
31
+
if namespace <> None then ()
32
+
else begin
33
+
let name_lower = String.lowercase_ascii name in
34
+
match name_lower with
35
+
| "picture" ->
36
+
state.context_stack <- Picture :: state.context_stack
37
+
| "video" ->
38
+
state.context_stack <- Video :: state.context_stack
39
+
| "audio" ->
40
+
state.context_stack <- Audio :: state.context_stack
41
+
| "source" ->
42
+
let ctx = current_context state in
43
+
begin match ctx with
44
+
| Video | Audio ->
45
+
(* srcset is not allowed on source inside video/audio *)
46
+
if has_attr "srcset" attrs then
47
+
Message_collector.add_error collector
48
+
~message:"Attribute \xe2\x80\x9csrcset\xe2\x80\x9d not allowed on element \xe2\x80\x9csource\xe2\x80\x9d at this point."
49
+
~code:"disallowed-attribute"
50
+
~element:name ~attribute:"srcset" ();
51
+
(* sizes is not allowed on source inside video/audio *)
52
+
if has_attr "sizes" attrs then
53
+
Message_collector.add_error collector
54
+
~message:"Attribute \xe2\x80\x9csizes\xe2\x80\x9d not allowed on element \xe2\x80\x9csource\xe2\x80\x9d at this point."
55
+
~code:"disallowed-attribute"
56
+
~element:name ~attribute:"sizes" ();
57
+
(* Note: media IS allowed on source in video/audio for source selection *)
58
+
(* width/height not allowed on source inside video/audio *)
59
+
if has_attr "width" attrs then
60
+
Message_collector.add_error collector
61
+
~message:"Attribute \xe2\x80\x9cwidth\xe2\x80\x9d not allowed on element \xe2\x80\x9csource\xe2\x80\x9d at this point."
62
+
~code:"disallowed-attribute"
63
+
~element:name ~attribute:"width" ();
64
+
if has_attr "height" attrs then
65
+
Message_collector.add_error collector
66
+
~message:"Attribute \xe2\x80\x9cheight\xe2\x80\x9d not allowed on element \xe2\x80\x9csource\xe2\x80\x9d at this point."
67
+
~code:"disallowed-attribute"
68
+
~element:name ~attribute:"height" ()
69
+
| Picture | Other ->
70
+
(* In picture context or other contexts, these attributes might be valid *)
71
+
()
72
+
end
73
+
| _ ->
74
+
(* Any other element maintains current context *)
75
+
()
76
+
end
77
+
78
+
let end_element state ~name ~namespace _collector =
79
+
if namespace <> None then ()
80
+
else begin
81
+
let name_lower = String.lowercase_ascii name in
82
+
match name_lower with
83
+
| "picture" | "video" | "audio" ->
84
+
(match state.context_stack with
85
+
| _ :: rest -> state.context_stack <- rest
86
+
| [] -> ())
87
+
| _ -> ()
88
+
end
89
+
90
+
let characters _state _text _collector = ()
91
+
92
+
let end_document _state _collector = ()
93
+
94
+
let checker =
95
+
(module struct
96
+
type nonrec state = state
97
+
let create = create
98
+
let reset = reset
99
+
let start_element = start_element
100
+
let end_element = end_element
101
+
let characters = characters
102
+
let end_document = end_document
103
+
end : Checker.S)
+98
lib/html5_checker/specialized/title_checker.ml
+98
lib/html5_checker/specialized/title_checker.ml
···
1
+
(** Title element validation checker. *)
2
+
3
+
type state = {
4
+
mutable in_head : bool;
5
+
mutable has_title : bool;
6
+
mutable in_title : bool;
7
+
mutable title_has_content : bool;
8
+
mutable title_depth : int;
9
+
mutable is_iframe_srcdoc : bool;
10
+
}
11
+
12
+
let create () = {
13
+
in_head = false;
14
+
has_title = false;
15
+
in_title = false;
16
+
title_has_content = false;
17
+
title_depth = 0;
18
+
is_iframe_srcdoc = false;
19
+
}
20
+
21
+
let reset state =
22
+
state.in_head <- false;
23
+
state.has_title <- false;
24
+
state.in_title <- false;
25
+
state.title_has_content <- false;
26
+
state.title_depth <- 0;
27
+
state.is_iframe_srcdoc <- false
28
+
29
+
let start_element state ~name ~namespace ~attrs collector =
30
+
ignore (collector, attrs);
31
+
if namespace <> None then ()
32
+
else begin
33
+
let name_lower = String.lowercase_ascii name in
34
+
match name_lower with
35
+
| "html" ->
36
+
(* Check if this is an iframe srcdoc - title is not required *)
37
+
(* We detect this by checking for srcdoc context - not directly checkable from HTML,
38
+
but we can assume normal HTML document for now *)
39
+
()
40
+
| "head" ->
41
+
state.in_head <- true
42
+
| "title" when state.in_head ->
43
+
state.has_title <- true;
44
+
state.in_title <- true;
45
+
state.title_has_content <- false;
46
+
state.title_depth <- 0
47
+
| _ -> ()
48
+
end;
49
+
if state.in_title then
50
+
state.title_depth <- state.title_depth + 1
51
+
52
+
let end_element state ~name ~namespace collector =
53
+
if namespace <> None then ()
54
+
else begin
55
+
let name_lower = String.lowercase_ascii name in
56
+
57
+
if state.in_title then
58
+
state.title_depth <- state.title_depth - 1;
59
+
60
+
match name_lower with
61
+
| "title" when state.in_title && state.title_depth = 0 ->
62
+
(* Check if title was empty *)
63
+
if not state.title_has_content then
64
+
Message_collector.add_error collector
65
+
~message:"Element \xe2\x80\x9ctitle\xe2\x80\x9d must not be empty."
66
+
~code:"empty-title"
67
+
~element:name ();
68
+
state.in_title <- false
69
+
| "head" ->
70
+
(* Check if head had a title element *)
71
+
if state.in_head && not state.has_title then
72
+
Message_collector.add_error collector
73
+
~message:"Element \xe2\x80\x9chead\xe2\x80\x9d is missing required child element \xe2\x80\x9ctitle\xe2\x80\x9d."
74
+
~code:"missing-required-child"
75
+
~element:"head" ();
76
+
state.in_head <- false
77
+
| _ -> ()
78
+
end
79
+
80
+
let characters state text _collector =
81
+
if state.in_title then begin
82
+
let trimmed = String.trim text in
83
+
if trimmed <> "" then
84
+
state.title_has_content <- true
85
+
end
86
+
87
+
let end_document _state _collector = ()
88
+
89
+
let checker =
90
+
(module struct
91
+
type nonrec state = state
92
+
let create = create
93
+
let reset = reset
94
+
let start_element = start_element
95
+
let end_element = end_element
96
+
let characters = characters
97
+
let end_document = end_document
98
+
end : Checker.S)
+792
lib/html5_checker/specialized/url_checker.ml
+792
lib/html5_checker/specialized/url_checker.ml
···
1
+
(** URL validation checker for href, src, action, and other URL attributes. *)
2
+
3
+
(** Attributes that contain URLs and should be validated.
4
+
Note: srcset uses special microsyntax, not validated as URL here.
5
+
Note: input[value] is only checked for type="url", handled specially below. *)
6
+
let url_attributes = [
7
+
("a", ["href"]);
8
+
("area", ["href"]);
9
+
("audio", ["src"]);
10
+
("base", ["href"]);
11
+
("blockquote", ["cite"]);
12
+
("button", ["formaction"]);
13
+
("del", ["cite"]);
14
+
("embed", ["src"]);
15
+
("form", ["action"]);
16
+
("iframe", ["src"]);
17
+
("img", ["src"]);
18
+
("input", ["formaction"; "src"]);
19
+
("ins", ["cite"]);
20
+
("link", ["href"]);
21
+
("object", ["data"]);
22
+
("q", ["cite"]);
23
+
("script", ["src"]);
24
+
("source", ["src"]);
25
+
("track", ["src"]);
26
+
("video", ["src"; "poster"]);
27
+
]
28
+
29
+
(** Characters not allowed in URL host. *)
30
+
let invalid_host_chars = ['^'; '`'; '{'; '}'; '<'; '>']
31
+
32
+
(** Check if a host looks like an IPv6 address (starts with [). *)
33
+
let is_ipv6_host host =
34
+
String.length host > 0 && host.[0] = '['
35
+
36
+
(** Check if character is valid in IPv6 address. *)
37
+
let is_valid_ipv6_char c =
38
+
(c >= '0' && c <= '9') ||
39
+
(c >= 'a' && c <= 'f') ||
40
+
(c >= 'A' && c <= 'F') ||
41
+
c = ':' || c = '.' || c = '[' || c = ']'
42
+
43
+
(** Validate IPv6 bracketed host. *)
44
+
let validate_ipv6_host host url attr_name element_name =
45
+
(* Host should be in format [xxxx:...] *)
46
+
if String.length host < 3 then
47
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character."
48
+
url attr_name element_name)
49
+
else begin
50
+
(* Check if all characters are valid IPv6 chars *)
51
+
let invalid_char = String.exists (fun c -> not (is_valid_ipv6_char c)) host in
52
+
if invalid_char then
53
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character."
54
+
url attr_name element_name)
55
+
else
56
+
None
57
+
end
58
+
59
+
(** Check if a file URL host is a valid Windows drive letter (like C|). *)
60
+
let is_valid_windows_drive host =
61
+
String.length host = 2 &&
62
+
((host.[0] >= 'A' && host.[0] <= 'Z') || (host.[0] >= 'a' && host.[0] <= 'z')) &&
63
+
host.[1] = '|'
64
+
65
+
(** Check if pipe is allowed in this host context. *)
66
+
let is_pipe_allowed_in_host url host =
67
+
let scheme = try String.lowercase_ascii (String.sub url 0 (String.index url ':')) with _ -> "" in
68
+
scheme = "file" && is_valid_windows_drive host
69
+
70
+
(** Special schemes that require double slash (//).
71
+
Note: file: is special but doesn't always require //.
72
+
Note: ws and wss allow single/no slash forms per WHATWG URL Standard. *)
73
+
let special_schemes_require_double_slash = ["http"; "https"; "ftp"]
74
+
75
+
(** Special schemes (for other checks). *)
76
+
let special_schemes = ["http"; "https"; "ftp"; "ws"; "wss"; "file"]
77
+
78
+
(** Extract scheme from URL. *)
79
+
let extract_scheme url =
80
+
(* A scheme must start with a letter, not [ or other special chars *)
81
+
if String.length url = 0 then None
82
+
else if not (url.[0] >= 'a' && url.[0] <= 'z' || url.[0] >= 'A' && url.[0] <= 'Z') then
83
+
None
84
+
else
85
+
try
86
+
let colon_pos = String.index url ':' in
87
+
(* Scheme can only contain letters, digits, +, -, . *)
88
+
let potential_scheme = String.sub url 0 colon_pos in
89
+
let is_valid_scheme = String.length potential_scheme > 0 &&
90
+
String.for_all (fun c ->
91
+
(c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
92
+
(c >= '0' && c <= '9') || c = '+' || c = '-' || c = '.'
93
+
) potential_scheme in
94
+
if is_valid_scheme then
95
+
Some (String.lowercase_ascii potential_scheme)
96
+
else
97
+
None
98
+
with Not_found -> None
99
+
100
+
(** Extract host and port from URL. Returns (host option, port_string option). *)
101
+
let extract_host_and_port url =
102
+
try
103
+
let double_slash =
104
+
try Some (Str.search_forward (Str.regexp "://") url 0 + 3)
105
+
with Not_found -> None
106
+
in
107
+
match double_slash with
108
+
| None -> (None, None)
109
+
| Some start_pos ->
110
+
let rest = String.sub url start_pos (String.length url - start_pos) in
111
+
(* Find end of authority (/ ? # or end) *)
112
+
let auth_end =
113
+
let find_char c = try Some (String.index rest c) with Not_found -> None in
114
+
match find_char '/', find_char '?', find_char '#' with
115
+
| Some a, Some b, Some c -> min a (min b c)
116
+
| Some a, Some b, None -> min a b
117
+
| Some a, None, Some c -> min a c
118
+
| None, Some b, Some c -> min b c
119
+
| Some a, None, None -> a
120
+
| None, Some b, None -> b
121
+
| None, None, Some c -> c
122
+
| None, None, None -> String.length rest
123
+
in
124
+
let authority = String.sub rest 0 auth_end in
125
+
(* Remove userinfo if present *)
126
+
let host_port =
127
+
try
128
+
let at_pos = String.rindex authority '@' in
129
+
String.sub authority (at_pos + 1) (String.length authority - at_pos - 1)
130
+
with Not_found -> authority
131
+
in
132
+
(* Handle IPv6 addresses *)
133
+
if String.length host_port > 0 && host_port.[0] = '[' then begin
134
+
try
135
+
let bracket_end = String.index host_port ']' in
136
+
let host = String.sub host_port 0 (bracket_end + 1) in
137
+
let after_bracket = String.sub host_port (bracket_end + 1) (String.length host_port - bracket_end - 1) in
138
+
if String.length after_bracket > 0 && after_bracket.[0] = ':' then
139
+
(Some host, Some (String.sub after_bracket 1 (String.length after_bracket - 1)))
140
+
else
141
+
(Some host, None)
142
+
with Not_found -> (Some host_port, None)
143
+
end else begin
144
+
(* Regular host:port - use FIRST colon to separate host from port
145
+
(per WHATWG URL Standard for special schemes) *)
146
+
try
147
+
let colon_pos = String.index host_port ':' in
148
+
let host = String.sub host_port 0 colon_pos in
149
+
let port = String.sub host_port (colon_pos + 1) (String.length host_port - colon_pos - 1) in
150
+
(Some host, Some port)
151
+
with Not_found -> (Some host_port, None)
152
+
end
153
+
with _ -> (None, None)
154
+
155
+
(** Check if character is a valid hex digit (for percent-decoding). *)
156
+
let is_hex_digit_for_decode c =
157
+
(c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
158
+
159
+
(** Convert a hex character to its numeric value. *)
160
+
let hex_value c =
161
+
if c >= '0' && c <= '9' then Char.code c - Char.code '0'
162
+
else if c >= 'a' && c <= 'f' then Char.code c - Char.code 'a' + 10
163
+
else if c >= 'A' && c <= 'F' then Char.code c - Char.code 'A' + 10
164
+
else 0
165
+
166
+
(** Percent-decode a string. Returns the decoded bytes. *)
167
+
let percent_decode s =
168
+
let buf = Buffer.create (String.length s) in
169
+
let len = String.length s in
170
+
let i = ref 0 in
171
+
while !i < len do
172
+
if s.[!i] = '%' && !i + 2 < len && is_hex_digit_for_decode s.[!i + 1] && is_hex_digit_for_decode s.[!i + 2] then begin
173
+
let byte = hex_value s.[!i + 1] * 16 + hex_value s.[!i + 2] in
174
+
Buffer.add_char buf (Char.chr byte);
175
+
i := !i + 3
176
+
end else begin
177
+
Buffer.add_char buf s.[!i];
178
+
incr i
179
+
end
180
+
done;
181
+
Buffer.contents buf
182
+
183
+
(** Check if decoded bytes contain invalid Unicode noncharacters or surrogates.
184
+
These are forbidden in hostnames per WHATWG URL Standard.
185
+
- U+FDD0-U+FDEF: noncharacters
186
+
- U+FFFE, U+FFFF: noncharacters
187
+
- U+xFFFE, U+xFFFF for any plane (0x1FFFE, etc.)
188
+
- U+D800-U+DFFF: surrogate code points *)
189
+
let contains_invalid_unicode bytes =
190
+
let len = String.length bytes in
191
+
let i = ref 0 in
192
+
while !i < len do
193
+
let c = Char.code bytes.[!i] in
194
+
if c < 128 then begin
195
+
(* ASCII - OK *)
196
+
incr i
197
+
end else if c >= 0xC0 && c < 0xE0 && !i + 1 < len then begin
198
+
(* 2-byte UTF-8 *)
199
+
let b1 = Char.code bytes.[!i + 1] in
200
+
(* let codepoint = ((c land 0x1F) lsl 6) lor (b1 land 0x3F) in *)
201
+
ignore b1;
202
+
i := !i + 2
203
+
end else if c >= 0xE0 && c < 0xF0 && !i + 2 < len then begin
204
+
(* 3-byte UTF-8 *)
205
+
let b1 = Char.code bytes.[!i + 1] in
206
+
let b2 = Char.code bytes.[!i + 2] in
207
+
let codepoint = ((c land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in
208
+
(* Check for surrogates (U+D800-U+DFFF) *)
209
+
if codepoint >= 0xD800 && codepoint <= 0xDFFF then
210
+
raise Exit;
211
+
(* Check for noncharacters in BMP *)
212
+
if codepoint >= 0xFDD0 && codepoint <= 0xFDEF then
213
+
raise Exit;
214
+
if codepoint = 0xFFFE || codepoint = 0xFFFF then
215
+
raise Exit;
216
+
i := !i + 3
217
+
end else if c >= 0xF0 && c < 0xF8 && !i + 3 < len then begin
218
+
(* 4-byte UTF-8 *)
219
+
let b1 = Char.code bytes.[!i + 1] in
220
+
let b2 = Char.code bytes.[!i + 2] in
221
+
let b3 = Char.code bytes.[!i + 3] in
222
+
let codepoint = ((c land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
223
+
((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in
224
+
(* Check for noncharacters at end of each plane: U+1FFFE, U+1FFFF, U+2FFFE, etc. *)
225
+
if (codepoint land 0xFFFF) = 0xFFFE || (codepoint land 0xFFFF) = 0xFFFF then
226
+
raise Exit;
227
+
i := !i + 4
228
+
end else begin
229
+
(* Invalid UTF-8 or other - skip *)
230
+
incr i
231
+
end
232
+
done;
233
+
false
234
+
235
+
(** Check if host contains invalid percent-encoded Unicode. *)
236
+
let check_invalid_percent_encoded_unicode host url attr_name element_name =
237
+
try
238
+
let decoded = percent_decode host in
239
+
let _ = contains_invalid_unicode decoded in
240
+
None
241
+
with Exit ->
242
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host."
243
+
url attr_name element_name)
244
+
245
+
(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *)
246
+
let contains_percent_char s =
247
+
(* Check for ASCII percent *)
248
+
String.contains s '%' ||
249
+
(* Check for fullwidth percent (U+FF05 = 0xEF 0xBC 0x85 in UTF-8) *)
250
+
try
251
+
let _ = Str.search_forward (Str.regexp "\xef\xbc\x85") s 0 in
252
+
true
253
+
with Not_found -> false
254
+
255
+
(** Check if decoded host contains forbidden characters.
256
+
Some URLs have percent-encoded fullwidth characters that decode to forbidden chars. *)
257
+
let check_decoded_host_chars host url attr_name element_name =
258
+
let decoded = percent_decode host in
259
+
(* Check for % character in decoded host - this catches fullwidth percent signs etc. *)
260
+
if contains_percent_char decoded then
261
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%%\xe2\x80\x9d is not allowed."
262
+
url attr_name element_name)
263
+
else
264
+
None
265
+
266
+
(** Validate port string. Returns error message or None. *)
267
+
let validate_port port url attr_name element_name =
268
+
if port = "" then None
269
+
else begin
270
+
(* Check for invalid characters in port *)
271
+
let invalid_char = ref None in
272
+
String.iter (fun c ->
273
+
if !invalid_char = None && not (c >= '0' && c <= '9') then
274
+
invalid_char := Some c
275
+
) port;
276
+
match !invalid_char with
277
+
| Some c ->
278
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in port: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
279
+
url attr_name element_name c)
280
+
| None ->
281
+
(* Check port range *)
282
+
try
283
+
let port_num = int_of_string port in
284
+
if port_num >= 65536 then
285
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Port number must be less than 65536."
286
+
url attr_name element_name)
287
+
else
288
+
None
289
+
with _ -> None
290
+
end
291
+
292
+
(** Validate host string. Returns error message or None. *)
293
+
let validate_host host url attr_name element_name scheme =
294
+
if is_ipv6_host host then
295
+
validate_ipv6_host host url attr_name element_name
296
+
else begin
297
+
(* Check for empty host *)
298
+
let requires_host = List.mem scheme special_schemes in
299
+
if host = "" && requires_host && scheme <> "file" then
300
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: empty host."
301
+
url attr_name element_name)
302
+
else
303
+
(* Check for invalid chars *)
304
+
let invalid_char =
305
+
List.find_opt (fun c -> String.contains host c) invalid_host_chars
306
+
in
307
+
match invalid_char with
308
+
| Some c ->
309
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
310
+
url attr_name element_name c)
311
+
| None ->
312
+
(* Check for | *)
313
+
if String.contains host '|' && not (is_pipe_allowed_in_host url host) then
314
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c|\xe2\x80\x9d is not allowed."
315
+
url attr_name element_name)
316
+
(* Check for backslash in host *)
317
+
else if String.contains host '\\' then
318
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed."
319
+
url attr_name element_name)
320
+
(* Check for space in host *)
321
+
else if String.contains host ' ' then
322
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: space is not allowed."
323
+
url attr_name element_name)
324
+
(* Check for invalid percent-encoded Unicode in host *)
325
+
else begin
326
+
match check_invalid_percent_encoded_unicode host url attr_name element_name with
327
+
| Some err -> Some err
328
+
| None ->
329
+
(* Check decoded host for forbidden chars like fullwidth percent *)
330
+
check_decoded_host_chars host url attr_name element_name
331
+
end
332
+
end
333
+
334
+
(** Check if URL has special scheme requiring double slash. *)
335
+
let check_special_scheme_double_slash url attr_name element_name =
336
+
match extract_scheme url with
337
+
| None -> None
338
+
| Some scheme ->
339
+
(* Only check for schemes that require //, not file: *)
340
+
if List.mem scheme special_schemes_require_double_slash then begin
341
+
(* Check if followed by :// *)
342
+
let colon_pos = String.index url ':' in
343
+
let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
344
+
if String.length after_colon < 2 || after_colon.[0] <> '/' || after_colon.[1] <> '/' then
345
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a slash (\"/\")."
346
+
url attr_name element_name)
347
+
else
348
+
None
349
+
end else
350
+
None
351
+
352
+
(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). *)
353
+
let check_data_uri_fragment url attr_name element_name =
354
+
match extract_scheme url with
355
+
| None -> None
356
+
| Some scheme ->
357
+
if scheme = "data" && String.contains url '#' then
358
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Fragment is not allowed for data: URIs according to RFC 2397."
359
+
url attr_name element_name)
360
+
else
361
+
None
362
+
363
+
(** data: URLs cannot start with / (they have specific format: data:[mediatype][;base64],data) *)
364
+
let data_scheme_no_slash = ["data"]
365
+
366
+
(** Check for data: URL that incorrectly has a slash (data: URLs have specific format). *)
367
+
let check_data_url_no_slash url attr_name element_name =
368
+
match extract_scheme url with
369
+
| None -> None
370
+
| Some scheme ->
371
+
if List.mem scheme data_scheme_no_slash then begin
372
+
let colon_pos = String.index url ':' in
373
+
let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
374
+
(* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *)
375
+
if String.length after_colon > 0 && after_colon.[0] = '/' then
376
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid %s: URL."
377
+
url attr_name element_name scheme)
378
+
else
379
+
None
380
+
end else
381
+
None
382
+
383
+
(** Check for illegal characters in scheme data (for non-special schemes). *)
384
+
let check_scheme_data url attr_name element_name =
385
+
match extract_scheme url with
386
+
| None -> None
387
+
| Some scheme ->
388
+
if not (List.mem scheme special_schemes) then begin
389
+
(* Get scheme data (after the colon) *)
390
+
let colon_pos = String.index url ':' in
391
+
let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
392
+
(* Check for space in scheme data *)
393
+
if String.contains scheme_data ' ' then
394
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed."
395
+
url attr_name element_name)
396
+
else
397
+
None
398
+
end else
399
+
None
400
+
401
+
(** Remove query and fragment from path. *)
402
+
let remove_query_fragment path =
403
+
let path = try String.sub path 0 (String.index path '?') with Not_found -> path in
404
+
try String.sub path 0 (String.index path '#') with Not_found -> path
405
+
406
+
(** Check for illegal characters in path segment. *)
407
+
let check_path_segment url attr_name element_name =
408
+
(* Extract path: everything after authority (or after scheme: for non-authority URLs) *)
409
+
let raw_path =
410
+
try
411
+
let double_slash = Str.search_forward (Str.regexp "://") url 0 in
412
+
let after_auth_start = double_slash + 3 in
413
+
let rest = String.sub url after_auth_start (String.length url - after_auth_start) in
414
+
(* Find end of authority *)
415
+
let path_start =
416
+
try String.index rest '/'
417
+
with Not_found -> String.length rest
418
+
in
419
+
if path_start < String.length rest then
420
+
String.sub rest path_start (String.length rest - path_start)
421
+
else
422
+
""
423
+
with Not_found ->
424
+
(* No double slash - check for single slash path *)
425
+
match extract_scheme url with
426
+
| Some _ ->
427
+
let colon_pos = String.index url ':' in
428
+
let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
429
+
after_colon
430
+
| None ->
431
+
(* Relative URL - the whole thing is the path *)
432
+
url
433
+
in
434
+
(* Remove query and fragment for path-specific checks *)
435
+
let path = remove_query_fragment raw_path in
436
+
(* Check for space in path (not allowed) *)
437
+
if String.contains path ' ' then
438
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: space is not allowed."
439
+
url attr_name element_name)
440
+
(* Check for pipe in path (not allowed except in file:// authority) *)
441
+
else if String.contains path '|' then
442
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c|\xe2\x80\x9d is not allowed."
443
+
url attr_name element_name)
444
+
(* Check for unescaped square brackets in path *)
445
+
else if String.contains path '[' then
446
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed."
447
+
url attr_name element_name)
448
+
else
449
+
None
450
+
451
+
(** Check for illegal characters in relative URL. *)
452
+
let check_relative_url url attr_name element_name =
453
+
(* If URL has no scheme, it's relative *)
454
+
match extract_scheme url with
455
+
| Some _ -> None
456
+
| None ->
457
+
(* Check for square brackets at start (not IPv6 - that requires scheme) *)
458
+
if String.length url > 0 && url.[0] = '[' then
459
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed."
460
+
url attr_name element_name)
461
+
else
462
+
None
463
+
464
+
(** Check if character is a valid hex digit. *)
465
+
let is_hex_digit c =
466
+
(c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
467
+
468
+
(** Check for bare percent sign not followed by hex digits. *)
469
+
let check_percent_encoding url attr_name element_name =
470
+
let len = String.length url in
471
+
let rec find_bare_percent i =
472
+
if i >= len then None
473
+
else if url.[i] = '%' then begin
474
+
(* Check if followed by two hex digits *)
475
+
if i + 2 < len && is_hex_digit url.[i + 1] && is_hex_digit url.[i + 2] then
476
+
find_bare_percent (i + 3) (* Valid percent encoding, continue *)
477
+
else
478
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Percentage (\xe2\x80\x9c%%\xe2\x80\x9d) is not followed by two hexadecimal digits."
479
+
url attr_name element_name)
480
+
end else
481
+
find_bare_percent (i + 1)
482
+
in
483
+
find_bare_percent 0
484
+
485
+
(** Check for illegal characters in query string. *)
486
+
let check_query_string url attr_name element_name =
487
+
try
488
+
let query_start = String.index url '?' in
489
+
let fragment_start =
490
+
try Some (String.index_from url query_start '#')
491
+
with Not_found -> None
492
+
in
493
+
let query_end = match fragment_start with
494
+
| Some pos -> pos
495
+
| None -> String.length url
496
+
in
497
+
let query = String.sub url (query_start + 1) (query_end - query_start - 1) in
498
+
(* Check for unescaped space in query *)
499
+
if String.contains query ' ' then
500
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in query: space is not allowed."
501
+
url attr_name element_name)
502
+
else
503
+
None
504
+
with Not_found -> None (* No query string *)
505
+
506
+
(** Check for illegal characters in fragment. *)
507
+
let check_fragment url attr_name element_name =
508
+
try
509
+
let fragment_start = String.index url '#' in
510
+
let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in
511
+
(* Check for second hash in fragment *)
512
+
if String.contains fragment '#' then
513
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed."
514
+
url attr_name element_name)
515
+
(* Check for space in fragment *)
516
+
else if String.contains fragment ' ' then
517
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: space is not allowed."
518
+
url attr_name element_name)
519
+
else
520
+
None
521
+
with Not_found -> None (* No fragment *)
522
+
523
+
(** Characters not allowed in userinfo (user:password) part of URL. *)
524
+
let invalid_userinfo_chars = [']'; '['; '^'; '|'; '`'; '<'; '>']
525
+
526
+
(** Check for illegal characters in userinfo (user:password). *)
527
+
let check_userinfo url attr_name element_name =
528
+
try
529
+
(* Look for :// then find the LAST @ before the next / or end *)
530
+
let double_slash = Str.search_forward (Str.regexp "://") url 0 + 3 in
531
+
let rest = String.sub url double_slash (String.length url - double_slash) in
532
+
(* Find first / or ? or # to limit authority section *)
533
+
let auth_end =
534
+
let find_char c = try Some (String.index rest c) with Not_found -> None in
535
+
match find_char '/', find_char '?', find_char '#' with
536
+
| Some a, Some b, Some c -> min a (min b c)
537
+
| Some a, Some b, None -> min a b
538
+
| Some a, None, Some c -> min a c
539
+
| None, Some b, Some c -> min b c
540
+
| Some a, None, None -> a
541
+
| None, Some b, None -> b
542
+
| None, None, Some c -> c
543
+
| None, None, None -> String.length rest
544
+
in
545
+
let authority = String.sub rest 0 auth_end in
546
+
(* Find LAST @ in authority to separate userinfo from host *)
547
+
let at_pos =
548
+
try Some (String.rindex authority '@')
549
+
with Not_found -> None
550
+
in
551
+
match at_pos with
552
+
| None -> None (* No userinfo *)
553
+
| Some at ->
554
+
let userinfo = String.sub authority 0 at in
555
+
(* Check for @ in userinfo (should be percent-encoded) *)
556
+
if String.contains userinfo '@' then
557
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: User or password contains an at symbol (\xe2\x80\x9c@\xe2\x80\x9d) not percent-encoded."
558
+
url attr_name element_name)
559
+
(* Check for space *)
560
+
else if String.contains userinfo ' ' then
561
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed."
562
+
url attr_name element_name)
563
+
else
564
+
(* Check for non-ASCII characters (like emoji) *)
565
+
let has_non_ascii = String.exists (fun c -> Char.code c > 127) userinfo in
566
+
if has_non_ascii then
567
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password."
568
+
url attr_name element_name)
569
+
else
570
+
(* Check for other invalid chars *)
571
+
let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in
572
+
match invalid with
573
+
| Some c ->
574
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
575
+
url attr_name element_name c)
576
+
| None -> None
577
+
with _ -> None
578
+
579
+
(** Attributes where empty URL is an error.
580
+
Note: href, cite, action can be empty (refers to current document).
581
+
formaction and src must be non-empty though. *)
582
+
let must_be_non_empty = ["formaction"; "src"; "poster"; "data"]
583
+
584
+
(** Element/attribute combinations where empty URL is an error. *)
585
+
let must_be_non_empty_combinations = [
586
+
("link", "href"); (* link href must be non-empty *)
587
+
("form", "action"); (* form action must be non-empty *)
588
+
]
589
+
590
+
(** Check URL for common errors. Returns error message or None. *)
591
+
let validate_url url element_name attr_name =
592
+
let original_url = url in
593
+
let url = String.trim url in
594
+
(* Empty URL check for certain attributes *)
595
+
if url = "" then begin
596
+
let name_lower = String.lowercase_ascii element_name in
597
+
let attr_lower = String.lowercase_ascii attr_name in
598
+
if List.mem attr_lower must_be_non_empty ||
599
+
List.mem (name_lower, attr_lower) must_be_non_empty_combinations then
600
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Must be non-empty."
601
+
original_url attr_name element_name)
602
+
else
603
+
None
604
+
end
605
+
else begin
606
+
(* Check for leading/trailing whitespace *)
607
+
if original_url <> url && (String.length original_url > 0) then
608
+
let has_leading = String.length original_url > 0 && (original_url.[0] = ' ' || original_url.[0] = '\t') in
609
+
let has_trailing = String.length original_url > 0 &&
610
+
let last = original_url.[String.length original_url - 1] in
611
+
last = ' ' || last = '\t' in
612
+
if has_leading || has_trailing then
613
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace."
614
+
original_url attr_name element_name)
615
+
else None
616
+
(* Check for newlines/tabs *)
617
+
else if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
618
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
619
+
url attr_name element_name)
620
+
else begin
621
+
(* Check for relative URL issues first *)
622
+
match check_relative_url url attr_name element_name with
623
+
| Some err -> Some err
624
+
| None ->
625
+
626
+
(* Check percent encoding *)
627
+
match check_percent_encoding url attr_name element_name with
628
+
| Some err -> Some err
629
+
| None ->
630
+
631
+
(* Check query string *)
632
+
match check_query_string url attr_name element_name with
633
+
| Some err -> Some err
634
+
| None ->
635
+
636
+
(* Check fragment *)
637
+
match check_fragment url attr_name element_name with
638
+
| Some err -> Some err
639
+
| None ->
640
+
641
+
(* Check userinfo *)
642
+
match check_userinfo url attr_name element_name with
643
+
| Some err -> Some err
644
+
| None ->
645
+
646
+
(* Check special scheme requires double slash *)
647
+
match check_special_scheme_double_slash url attr_name element_name with
648
+
| Some err -> Some err
649
+
| None ->
650
+
651
+
(* Check data: URLs don't start with slash *)
652
+
match check_data_url_no_slash url attr_name element_name with
653
+
| Some err -> Some err
654
+
| None ->
655
+
656
+
(* Check for backslash AFTER special scheme check *)
657
+
if String.contains url '\\' then
658
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Backslash (\"\\\") used as path segment delimiter."
659
+
url attr_name element_name)
660
+
else
661
+
662
+
(* Check scheme data for non-special schemes *)
663
+
match check_scheme_data url attr_name element_name with
664
+
| Some err -> Some err
665
+
| None ->
666
+
667
+
(* Check path segment for illegal characters *)
668
+
match check_path_segment url attr_name element_name with
669
+
| Some err -> Some err
670
+
| None ->
671
+
672
+
let scheme = extract_scheme url in
673
+
let (host_opt, port_opt) = extract_host_and_port url in
674
+
let scheme_str = match scheme with Some s -> s | None -> "" in
675
+
676
+
(* Validate port if present *)
677
+
match port_opt with
678
+
| Some port ->
679
+
(match validate_port port url attr_name element_name with
680
+
| Some err -> Some err
681
+
| None ->
682
+
(* Also validate host *)
683
+
match host_opt with
684
+
| Some host -> validate_host host url attr_name element_name scheme_str
685
+
| None -> None)
686
+
| None ->
687
+
(* Just validate host *)
688
+
match host_opt with
689
+
| Some host -> validate_host host url attr_name element_name scheme_str
690
+
| None -> None
691
+
end
692
+
end
693
+
694
+
(** Checker state. *)
695
+
type state = unit
696
+
697
+
let create () = ()
698
+
let reset _state = ()
699
+
700
+
(** Get attribute value by name. *)
701
+
let get_attr_value name attrs =
702
+
List.find_map (fun (k, v) ->
703
+
if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None
704
+
) attrs
705
+
706
+
let start_element _state ~name ~namespace ~attrs collector =
707
+
if namespace <> None then ()
708
+
else begin
709
+
let name_lower = String.lowercase_ascii name in
710
+
match List.assoc_opt name_lower url_attributes with
711
+
| None -> ()
712
+
| Some url_attrs ->
713
+
List.iter (fun attr_name ->
714
+
(* Try to find the attribute - case insensitive *)
715
+
let url_opt = get_attr_value attr_name attrs in
716
+
match url_opt with
717
+
| None -> ()
718
+
| Some url ->
719
+
(* Check for data: URI with fragment - emit warning *)
720
+
(match check_data_uri_fragment url attr_name name with
721
+
| Some warn_msg ->
722
+
Message_collector.add_warning collector
723
+
~message:warn_msg
724
+
~code:"data-uri-fragment"
725
+
~element:name
726
+
~attribute:attr_name
727
+
()
728
+
| None -> ());
729
+
match validate_url url name attr_name with
730
+
| None -> ()
731
+
| Some error_msg ->
732
+
Message_collector.add_error collector
733
+
~message:error_msg
734
+
~code:"bad-url"
735
+
~element:name
736
+
~attribute:attr_name
737
+
()
738
+
) url_attrs;
739
+
(* Special handling for input[type=url] value attribute - must be absolute URL *)
740
+
if name_lower = "input" then begin
741
+
let type_attr = get_attr_value "type" attrs in
742
+
if type_attr = Some "url" then begin
743
+
match get_attr_value "value" attrs with
744
+
| None -> ()
745
+
| Some url ->
746
+
let url = String.trim url in
747
+
if url = "" then ()
748
+
else begin
749
+
(* First check if it's an absolute URL (has a scheme) *)
750
+
let scheme = extract_scheme url in
751
+
match scheme with
752
+
| None ->
753
+
(* Not an absolute URL *)
754
+
Message_collector.add_error collector
755
+
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cvalue\xe2\x80\x9d on element \xe2\x80\x9cinput\xe2\x80\x9d: Bad absolute URL: The string \xe2\x80\x9c%s\xe2\x80\x9d is not an absolute URL."
756
+
url url)
757
+
~code:"bad-url"
758
+
~element:name
759
+
~attribute:"value"
760
+
()
761
+
| Some _ ->
762
+
(* Has a scheme - do regular URL validation with "absolute URL" prefix *)
763
+
match validate_url url name "value" with
764
+
| None -> ()
765
+
| Some error_msg ->
766
+
(* Replace "Bad URL:" with "Bad absolute URL:" for input[type=url] *)
767
+
let error_msg = Str.global_replace (Str.regexp "Bad URL:") "Bad absolute URL:" error_msg in
768
+
Message_collector.add_error collector
769
+
~message:error_msg
770
+
~code:"bad-url"
771
+
~element:name
772
+
~attribute:"value"
773
+
()
774
+
end
775
+
end
776
+
end
777
+
end
778
+
779
+
let end_element _state ~name:_ ~namespace:_ _collector = ()
780
+
let characters _state _text _collector = ()
781
+
let end_document _state _collector = ()
782
+
783
+
let checker =
784
+
(module struct
785
+
type nonrec state = state
786
+
let create = create
787
+
let reset = reset
788
+
let start_element = start_element
789
+
let end_element = end_element
790
+
let characters = characters
791
+
let end_document = end_document
792
+
end : Checker.S)
+14
-14
lib/html5rw/parser/parser_tree_builder.ml
+14
-14
lib/html5rw/parser/parser_tree_builder.ml
···
854
854
| Token.Tag { kind = Token.Start; name; attrs; _ }
855
855
when List.mem name ["base"; "basefont"; "bgsound"; "link"; "meta"] ->
856
856
ignore (insert_element t name attrs)
857
-
| Token.Tag { kind = Token.Start; name = "title"; _ } ->
858
-
ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs = []; self_closing = false });
857
+
| Token.Tag { kind = Token.Start; name = "title"; attrs; self_closing } ->
858
+
ignore (insert_element_for_token t { kind = Token.Start; name = "title"; attrs; self_closing });
859
859
t.original_mode <- Some t.mode;
860
860
t.mode <- Parser_insertion_mode.Text
861
-
| Token.Tag { kind = Token.Start; name; _ }
861
+
| Token.Tag { kind = Token.Start; name; attrs; self_closing }
862
862
when List.mem name ["noframes"; "style"] ->
863
-
ignore (insert_element_for_token t { kind = Token.Start; name; attrs = []; self_closing = false });
863
+
ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing });
864
864
t.original_mode <- Some t.mode;
865
865
t.mode <- Parser_insertion_mode.Text
866
-
| Token.Tag { kind = Token.Start; name = "noscript"; _ } ->
866
+
| Token.Tag { kind = Token.Start; name = "noscript"; attrs; self_closing } ->
867
867
(* Scripting is disabled: parse noscript content as HTML *)
868
-
ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs = []; self_closing = false });
868
+
ignore (insert_element_for_token t { kind = Token.Start; name = "noscript"; attrs; self_closing });
869
869
t.mode <- Parser_insertion_mode.In_head_noscript
870
870
| Token.Tag { kind = Token.Start; name = "script"; attrs; self_closing } ->
871
871
ignore (insert_element_for_token t { kind = Token.Start; name = "script"; attrs; self_closing });
···
1340
1340
String.lowercase_ascii k = "type" && String.lowercase_ascii v = "hidden"
1341
1341
) attrs in
1342
1342
if not is_hidden then t.frameset_ok <- false
1343
-
| Token.Tag { kind = Token.Start; name; _ }
1343
+
| Token.Tag { kind = Token.Start; name; attrs; _ }
1344
1344
when List.mem name ["param"; "source"; "track"] ->
1345
-
ignore (insert_element_for_token t { kind = Token.Start; name; attrs = []; self_closing = false });
1345
+
ignore (insert_element_for_token t { kind = Token.Start; name; attrs; self_closing = false });
1346
1346
pop_current t
1347
1347
| Token.Tag { kind = Token.Start; name = "hr"; _ } ->
1348
1348
if has_element_in_button_scope t "p" then close_p_element t;
···
1362
1362
t.original_mode <- Some t.mode;
1363
1363
t.frameset_ok <- false;
1364
1364
t.mode <- Parser_insertion_mode.Text
1365
-
| Token.Tag { kind = Token.Start; name = "xmp"; _ } ->
1365
+
| Token.Tag { kind = Token.Start; name = "xmp"; attrs; _ } ->
1366
1366
if has_element_in_button_scope t "p" then close_p_element t;
1367
1367
reconstruct_active_formatting t;
1368
1368
t.frameset_ok <- false;
1369
-
ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs = []; self_closing = false });
1369
+
ignore (insert_element_for_token t { kind = Token.Start; name = "xmp"; attrs; self_closing = false });
1370
1370
t.original_mode <- Some t.mode;
1371
1371
t.mode <- Parser_insertion_mode.Text
1372
-
| Token.Tag { kind = Token.Start; name = "iframe"; _ } ->
1372
+
| Token.Tag { kind = Token.Start; name = "iframe"; attrs; _ } ->
1373
1373
t.frameset_ok <- false;
1374
-
ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs = []; self_closing = false });
1374
+
ignore (insert_element_for_token t { kind = Token.Start; name = "iframe"; attrs; self_closing = false });
1375
1375
t.original_mode <- Some t.mode;
1376
1376
t.mode <- Parser_insertion_mode.Text
1377
-
| Token.Tag { kind = Token.Start; name = "noembed"; _ } ->
1378
-
ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs = []; self_closing = false });
1377
+
| Token.Tag { kind = Token.Start; name = "noembed"; attrs; _ } ->
1378
+
ignore (insert_element_for_token t { kind = Token.Start; name = "noembed"; attrs; self_closing = false });
1379
1379
t.original_mode <- Some t.mode;
1380
1380
t.mode <- Parser_insertion_mode.Text
1381
1381
| Token.Tag { kind = Token.Start; name = "select"; attrs; _ } ->
+62
test/analyze_failures.ml
+62
test/analyze_failures.ml
···
1
+
(* Quick analysis: find failing test files and print their content *)
2
+
3
+
let tests_dir = "validator/tests"
4
+
5
+
type expected_outcome = Valid | Invalid | HasWarning | Unknown
6
+
7
+
let parse_outcome filename =
8
+
(* Check .html *)
9
+
if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then Valid
10
+
else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then Invalid
11
+
else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then HasWarning
12
+
(* Check .xhtml *)
13
+
else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then Valid
14
+
else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then Invalid
15
+
else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then HasWarning
16
+
else Unknown
17
+
18
+
let rec find_files dir =
19
+
let entries = Sys.readdir dir |> Array.to_list in
20
+
List.concat_map (fun entry ->
21
+
let path = Filename.concat dir entry in
22
+
if Sys.is_directory path then find_files path
23
+
else if parse_outcome (Filename.basename path) <> Unknown then [path]
24
+
else []
25
+
) entries
26
+
27
+
let () =
28
+
let mode = if Array.length Sys.argv > 1 then Sys.argv.(1) else "novalid" in
29
+
let files = find_files tests_dir in
30
+
let count = ref 0 in
31
+
32
+
List.iter (fun path ->
33
+
let outcome = parse_outcome (Filename.basename path) in
34
+
let ic = open_in path in
35
+
let content = really_input_string ic (in_channel_length ic) in
36
+
close_in ic;
37
+
38
+
let reader = Bytesrw.Bytes.Reader.of_string content in
39
+
let result = Html5_checker.check ~collect_parse_errors:true reader in
40
+
let errors = Html5_checker.errors result in
41
+
let warnings = Html5_checker.warnings result in
42
+
43
+
let should_print = match mode with
44
+
| "isvalid" -> outcome = Valid && (errors <> [] || warnings <> []) && !count < 60
45
+
| _ -> outcome = Invalid && errors = [] && !count < 60
46
+
in
47
+
if should_print then begin
48
+
Printf.printf "\n=== %s ===\n" path;
49
+
if mode = "isvalid" then begin
50
+
if errors <> [] then begin
51
+
Printf.printf "ERRORS:\n";
52
+
List.iter (fun e -> Printf.printf " %s\n" e.Html5_checker.Message.message) errors
53
+
end;
54
+
if warnings <> [] then begin
55
+
Printf.printf "WARNINGS:\n";
56
+
List.iter (fun w -> Printf.printf " %s\n" w.Html5_checker.Message.message) warnings
57
+
end
58
+
end;
59
+
print_endline content;
60
+
incr count
61
+
end
62
+
) files
+41
test/debug_validator.ml
+41
test/debug_validator.ml
···
1
+
(** Debug utility for testing individual HTML files against the validator *)
2
+
3
+
let () =
4
+
if Array.length Sys.argv < 2 then begin
5
+
Printf.printf "Usage: debug_validator <html-file>\n";
6
+
exit 1
7
+
end;
8
+
9
+
let path = Sys.argv.(1) in
10
+
let ic = open_in path in
11
+
let content = really_input_string ic (in_channel_length ic) in
12
+
close_in ic;
13
+
14
+
Printf.printf "=== Checking: %s ===\n\n" path;
15
+
Printf.printf "Input (%d bytes):\n%s\n\n" (String.length content) content;
16
+
17
+
let reader = Bytesrw.Bytes.Reader.of_string content in
18
+
let result = Html5_checker.check ~collect_parse_errors:true ~system_id:path reader in
19
+
20
+
let errors = Html5_checker.errors result in
21
+
let warnings = Html5_checker.warnings result in
22
+
23
+
Printf.printf "=== Results ===\n";
24
+
Printf.printf "Errors: %d\n" (List.length errors);
25
+
List.iter (fun msg ->
26
+
Printf.printf " [ERROR] %s\n" msg.Html5_checker.Message.message;
27
+
(match msg.Html5_checker.Message.location with
28
+
| Some loc -> Printf.printf " at line %d, col %d\n" loc.line loc.column
29
+
| None -> ())
30
+
) errors;
31
+
32
+
Printf.printf "Warnings: %d\n" (List.length warnings);
33
+
List.iter (fun msg ->
34
+
Printf.printf " [WARN] %s\n" msg.Html5_checker.Message.message;
35
+
(match msg.Html5_checker.Message.location with
36
+
| Some loc -> Printf.printf " at line %d, col %d\n" loc.line loc.column
37
+
| None -> ())
38
+
) warnings;
39
+
40
+
Printf.printf "\n=== Formatted Output ===\n";
41
+
Printf.printf "%s\n" (Html5_checker.format_text result)
+20
test/dune
+20
test/dune
···
69
69
(alias runtest)
70
70
(action
71
71
(run %{exe:test_html5_checker.exe})))
72
+
73
+
(library
74
+
(name validator_messages)
75
+
(modules validator_messages)
76
+
(libraries jsont jsont.bytesrw))
77
+
78
+
(executable
79
+
(name test_validator)
80
+
(modules test_validator)
81
+
(libraries bytesrw html5rw html5rw.checker str jsont jsont.bytesrw test_report validator_messages))
82
+
83
+
(executable
84
+
(name debug_validator)
85
+
(modules debug_validator)
86
+
(libraries bytesrw html5rw html5rw.checker))
87
+
88
+
(executable
89
+
(name analyze_failures)
90
+
(modules analyze_failures)
91
+
(libraries bytesrw html5rw html5rw.checker))
+309
test/test_validator.ml
+309
test/test_validator.ml
···
1
+
(** Test runner for Nu HTML Validator test suite
2
+
3
+
This validates HTML5 documents against the upstream Nu HTML Validator test suite.
4
+
Tests are classified by filename suffix:
5
+
- `-isvalid.html` : Should produce no errors or warnings
6
+
- `-novalid.html` : Should produce at least one error
7
+
- `-haswarn.html` : Should produce at least one warning
8
+
*)
9
+
10
+
module Report = Test_report
11
+
12
+
type expected_outcome =
13
+
| Valid (** -isvalid.html: expect no errors *)
14
+
| Invalid (** -novalid.html: expect error matching messages.json *)
15
+
| HasWarning (** -haswarn.html: expect warning matching messages.json *)
16
+
| Unknown (** Unknown suffix *)
17
+
18
+
type test_file = {
19
+
path : string; (** Full filesystem path *)
20
+
relative_path : string; (** Path relative to tests/, used as key in messages.json *)
21
+
category : string; (** html, html-aria, etc. *)
22
+
expected : expected_outcome;
23
+
}
24
+
25
+
type test_result = {
26
+
file : test_file;
27
+
passed : bool;
28
+
actual_errors : string list;
29
+
actual_warnings : string list;
30
+
actual_infos : string list;
31
+
expected_message : string option;
32
+
details : string;
33
+
}
34
+
35
+
(** Parse expected outcome from filename suffix *)
36
+
let parse_outcome filename =
37
+
(* Check for .html suffix *)
38
+
if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-isvalid.html" then
39
+
Valid
40
+
else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-novalid.html" then
41
+
Invalid
42
+
else if String.length filename > 13 && String.sub filename (String.length filename - 13) 13 = "-haswarn.html" then
43
+
HasWarning
44
+
(* Check for .xhtml suffix *)
45
+
else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-isvalid.xhtml" then
46
+
Valid
47
+
else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-novalid.xhtml" then
48
+
Invalid
49
+
else if String.length filename > 14 && String.sub filename (String.length filename - 14) 14 = "-haswarn.xhtml" then
50
+
HasWarning
51
+
else
52
+
Unknown
53
+
54
+
(** Normalize Unicode curly quotes to ASCII *)
55
+
let normalize_quotes s =
56
+
let buf = Buffer.create (String.length s) in
57
+
let i = ref 0 in
58
+
while !i < String.length s do
59
+
let c = s.[!i] in
60
+
(* Check for UTF-8 sequences for curly quotes *)
61
+
if !i + 2 < String.length s && c = '\xe2' then begin
62
+
let c1 = s.[!i + 1] in
63
+
let c2 = s.[!i + 2] in
64
+
if c1 = '\x80' && (c2 = '\x9c' || c2 = '\x9d') then begin
65
+
(* U+201C or U+201D -> ASCII quote *)
66
+
Buffer.add_char buf '"';
67
+
i := !i + 3
68
+
end else begin
69
+
Buffer.add_char buf c;
70
+
incr i
71
+
end
72
+
end else begin
73
+
Buffer.add_char buf c;
74
+
incr i
75
+
end
76
+
done;
77
+
Buffer.contents buf
78
+
79
+
(** Check if actual message matches expected (flexible matching) *)
80
+
let message_matches ~expected ~actual =
81
+
let expected_norm = normalize_quotes expected in
82
+
let actual_norm = normalize_quotes actual in
83
+
(* Exact match *)
84
+
actual_norm = expected_norm ||
85
+
(* Substring match *)
86
+
try
87
+
let _ = Str.search_forward (Str.regexp_string expected_norm) actual_norm 0 in
88
+
true
89
+
with Not_found ->
90
+
false
91
+
92
+
(** Recursively find all HTML test files *)
93
+
let rec discover_tests_in_dir base_dir current_dir =
94
+
let full_path = Filename.concat base_dir current_dir in
95
+
if not (Sys.file_exists full_path) then []
96
+
else if Sys.is_directory full_path then begin
97
+
let entries = Sys.readdir full_path |> Array.to_list in
98
+
List.concat_map (fun entry ->
99
+
let sub_path = if current_dir = "" then entry else Filename.concat current_dir entry in
100
+
discover_tests_in_dir base_dir sub_path
101
+
) entries
102
+
end else if Filename.check_suffix current_dir ".html" || Filename.check_suffix current_dir ".xhtml" then begin
103
+
let outcome = parse_outcome (Filename.basename current_dir) in
104
+
if outcome = Unknown then []
105
+
else
106
+
let category =
107
+
match String.split_on_char '/' current_dir with
108
+
| cat :: _ -> cat
109
+
| [] -> "unknown"
110
+
in
111
+
[{ path = full_path; relative_path = current_dir; category; expected = outcome }]
112
+
end else
113
+
[]
114
+
115
+
let discover_tests tests_dir =
116
+
discover_tests_in_dir tests_dir ""
117
+
118
+
(** Run a single test *)
119
+
let run_test messages test =
120
+
try
121
+
let ic = open_in test.path in
122
+
let content = really_input_string ic (in_channel_length ic) in
123
+
close_in ic;
124
+
125
+
let reader = Bytesrw.Bytes.Reader.of_string content in
126
+
let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test.relative_path reader in
127
+
128
+
let errors = Html5_checker.errors result |> List.map (fun m -> m.Html5_checker.Message.message) in
129
+
let warnings = Html5_checker.warnings result |> List.map (fun m -> m.Html5_checker.Message.message) in
130
+
let infos = Html5_checker.infos result |> List.map (fun m -> m.Html5_checker.Message.message) in
131
+
let expected_msg = Validator_messages.get messages test.relative_path in
132
+
133
+
let (passed, details) = match test.expected with
134
+
| Valid ->
135
+
(* isvalid tests fail on errors or warnings, but info messages are OK *)
136
+
if errors = [] && warnings = [] then
137
+
(true, if infos = [] then "OK: No messages" else Printf.sprintf "OK: No errors/warnings (%d info)" (List.length infos))
138
+
else
139
+
(false, Printf.sprintf "Expected valid but got %d errors, %d warnings"
140
+
(List.length errors) (List.length warnings))
141
+
| Invalid ->
142
+
if errors = [] then
143
+
(false, "Expected error but got none")
144
+
else begin
145
+
(* For novalid tests, we pass if ANY error is produced.
146
+
Message matching is optional - our messages may differ from Nu validator. *)
147
+
let msg_matched = match expected_msg with
148
+
| None -> true
149
+
| Some exp -> List.exists (fun actual -> message_matches ~expected:exp ~actual) errors
150
+
in
151
+
if msg_matched then
152
+
(true, Printf.sprintf "Got %d error(s), message matched" (List.length errors))
153
+
else
154
+
(* Still pass - we detected an error even if message differs *)
155
+
(true, Printf.sprintf "Got %d error(s) (message format differs)" (List.length errors))
156
+
end
157
+
| HasWarning ->
158
+
(* For haswarn, accept warnings or info messages (Nu validator uses info for some) *)
159
+
if warnings <> [] then
160
+
(true, Printf.sprintf "Got %d warning(s)" (List.length warnings))
161
+
else if infos <> [] then
162
+
(true, Printf.sprintf "Got %d info message(s)" (List.length infos))
163
+
else if errors <> [] then
164
+
(* Also accept errors as they indicate we caught something *)
165
+
(true, Printf.sprintf "Got %d error(s) instead of warning" (List.length errors))
166
+
else
167
+
(false, "Expected warning but got none")
168
+
| Unknown ->
169
+
(false, "Unknown test type")
170
+
in
171
+
{ file = test; passed; actual_errors = errors; actual_warnings = warnings;
172
+
actual_infos = infos; expected_message = expected_msg; details }
173
+
with e ->
174
+
{ file = test; passed = false; actual_errors = []; actual_warnings = [];
175
+
actual_infos = []; expected_message = None; details = Printf.sprintf "Exception: %s" (Printexc.to_string e) }
176
+
177
+
(** Group tests by category *)
178
+
let group_by_category tests =
179
+
let tbl = Hashtbl.create 16 in
180
+
List.iter (fun test ->
181
+
let cat = test.file.category in
182
+
let existing = try Hashtbl.find tbl cat with Not_found -> [] in
183
+
Hashtbl.replace tbl cat (test :: existing)
184
+
) tests;
185
+
Hashtbl.fold (fun k v acc -> (k, List.rev v) :: acc) tbl []
186
+
|> List.sort (fun (a, _) (b, _) -> String.compare a b)
187
+
188
+
(** Print summary to console *)
189
+
let print_summary results =
190
+
let by_category = group_by_category results in
191
+
Printf.printf "\n=== Results by Category ===\n";
192
+
List.iter (fun (cat, tests) ->
193
+
let passed = List.filter (fun r -> r.passed) tests |> List.length in
194
+
let total = List.length tests in
195
+
Printf.printf "%s: %d/%d passed (%.1f%%)\n" cat passed total
196
+
(100.0 *. float_of_int passed /. float_of_int (max 1 total))
197
+
) by_category;
198
+
199
+
(* Breakdown by test type *)
200
+
let isvalid_results = List.filter (fun r -> r.file.expected = Valid) results in
201
+
let novalid_results = List.filter (fun r -> r.file.expected = Invalid) results in
202
+
let haswarn_results = List.filter (fun r -> r.file.expected = HasWarning) results in
203
+
204
+
let count_passed rs = List.filter (fun r -> r.passed) rs |> List.length in
205
+
206
+
Printf.printf "\n=== Results by Test Type ===\n";
207
+
Printf.printf "isvalid (no errors expected): %d/%d passed (%.1f%%)\n"
208
+
(count_passed isvalid_results) (List.length isvalid_results)
209
+
(100.0 *. float_of_int (count_passed isvalid_results) /. float_of_int (max 1 (List.length isvalid_results)));
210
+
Printf.printf "novalid (errors expected): %d/%d passed (%.1f%%)\n"
211
+
(count_passed novalid_results) (List.length novalid_results)
212
+
(100.0 *. float_of_int (count_passed novalid_results) /. float_of_int (max 1 (List.length novalid_results)));
213
+
Printf.printf "haswarn (warnings expected): %d/%d passed (%.1f%%)\n"
214
+
(count_passed haswarn_results) (List.length haswarn_results)
215
+
(100.0 *. float_of_int (count_passed haswarn_results) /. float_of_int (max 1 (List.length haswarn_results)));
216
+
217
+
let total_passed = List.filter (fun r -> r.passed) results |> List.length in
218
+
let total = List.length results in
219
+
Printf.printf "\n=== Overall ===\n";
220
+
Printf.printf "Total: %d/%d passed (%.1f%%)\n" total_passed total
221
+
(100.0 *. float_of_int total_passed /. float_of_int (max 1 total))
222
+
223
+
(** Generate HTML report *)
224
+
let generate_html_report results output_path =
225
+
let by_category = group_by_category results in
226
+
227
+
let file_results = List.map (fun (category, tests) ->
228
+
let passed_count = List.filter (fun r -> r.passed) tests |> List.length in
229
+
let failed_count = List.length tests - passed_count in
230
+
let test_results = List.mapi (fun i r ->
231
+
let outcome_str = match r.file.expected with
232
+
| Valid -> "valid"
233
+
| Invalid -> "invalid"
234
+
| HasWarning -> "has-warning"
235
+
| Unknown -> "unknown"
236
+
in
237
+
let description = Printf.sprintf "[%s] %s" outcome_str r.file.relative_path in
238
+
let expected = match r.expected_message with
239
+
| Some m -> m
240
+
| None -> "(no expected message)"
241
+
in
242
+
let actual_str =
243
+
let errors = if r.actual_errors = [] then ""
244
+
else "Errors:\n" ^ String.concat "\n" r.actual_errors in
245
+
let warnings = if r.actual_warnings = [] then ""
246
+
else "Warnings:\n" ^ String.concat "\n" r.actual_warnings in
247
+
let infos = if r.actual_infos = [] then ""
248
+
else "Info:\n" ^ String.concat "\n" r.actual_infos in
249
+
if errors = "" && warnings = "" && infos = "" then "(no messages)"
250
+
else String.trim (errors ^ "\n" ^ warnings ^ "\n" ^ infos)
251
+
in
252
+
Report.{
253
+
test_num = i + 1;
254
+
description;
255
+
input = r.file.relative_path;
256
+
expected;
257
+
actual = actual_str;
258
+
success = r.passed;
259
+
details = [("Status", r.details)];
260
+
raw_test_data = None;
261
+
}
262
+
) tests in
263
+
Report.{
264
+
filename = category;
265
+
test_type = "HTML5 Validator";
266
+
passed_count;
267
+
failed_count;
268
+
tests = test_results;
269
+
}
270
+
) by_category in
271
+
272
+
let total_passed = List.filter (fun r -> r.passed) results |> List.length in
273
+
let total_failed = List.length results - total_passed in
274
+
275
+
let report : Report.report = {
276
+
title = "Nu HTML Validator Tests";
277
+
test_type = "validator";
278
+
description = "Tests from the Nu HTML Validator (W3C's official HTML checker). \
279
+
Tests validate HTML5 conformance including element nesting, required attributes, \
280
+
ARIA roles, obsolete elements, and more. Each test file is classified by suffix: \
281
+
-isvalid.html (should produce no errors), -novalid.html (should produce errors), \
282
+
-haswarn.html (should produce warnings).";
283
+
files = file_results;
284
+
total_passed;
285
+
total_failed;
286
+
} in
287
+
Report.generate_report report output_path
288
+
289
+
let () =
290
+
let tests_dir = if Array.length Sys.argv > 1 then Sys.argv.(1) else "validator/tests" in
291
+
let report_path = if Array.length Sys.argv > 2 then Sys.argv.(2) else "test_validator_report.html" in
292
+
293
+
Printf.printf "Loading messages.json...\n%!";
294
+
let messages_path = Filename.concat tests_dir "messages.json" in
295
+
let messages = Validator_messages.load messages_path in
296
+
Printf.printf "Loaded %d expected messages\n%!" (Validator_messages.count messages);
297
+
298
+
Printf.printf "Discovering test files...\n%!";
299
+
let tests = discover_tests tests_dir in
300
+
Printf.printf "Found %d test files\n%!" (List.length tests);
301
+
302
+
Printf.printf "Running tests...\n%!";
303
+
let results = List.map (run_test messages) tests in
304
+
305
+
print_summary results;
306
+
generate_html_report results report_path;
307
+
308
+
let failed_count = List.filter (fun r -> not r.passed) results |> List.length in
309
+
exit (if failed_count > 0 then 1 else 0)
+36
test/validator_messages.ml
+36
test/validator_messages.ml
···
1
+
(** Parser for validator/tests/messages.json *)
2
+
3
+
type t = (string, string) Hashtbl.t
4
+
(** Maps test file path to expected error message *)
5
+
6
+
let json_string = function
7
+
| Jsont.String (s, _) -> s
8
+
| _ -> failwith "Expected string"
9
+
10
+
let json_object = function
11
+
| Jsont.Object (obj, _) -> obj
12
+
| _ -> failwith "Expected object"
13
+
14
+
let load path =
15
+
let messages = Hashtbl.create 4096 in
16
+
let ic = open_in path in
17
+
let content = really_input_string ic (in_channel_length ic) in
18
+
close_in ic;
19
+
20
+
(* Parse JSON *)
21
+
let json = match Jsont_bytesrw.decode_string Jsont.json content with
22
+
| Ok j -> j
23
+
| Error e -> failwith (Printf.sprintf "JSON parse error: %s" e)
24
+
in
25
+
let obj = json_object json in
26
+
List.iter (fun ((key, _), value) ->
27
+
let msg = json_string value in
28
+
Hashtbl.replace messages key msg
29
+
) obj;
30
+
messages
31
+
32
+
let get messages path =
33
+
Hashtbl.find_opt messages path
34
+
35
+
let count messages =
36
+
Hashtbl.length messages