+57
-16
lib/html5_checker/datatype/dt_autocomplete.ml
+57
-16
lib/html5_checker/datatype/dt_autocomplete.ml
···
147
is_contact_details := true
148
| _ -> ());
149
150
(* Process remaining tokens *)
151
-
let process_field_tokens = function
152
-
| [] -> Error "A list of autofill details tokens must contain an autofill field name"
153
| [ "webauthn" ] ->
154
Error
155
"The token \"webauthn\" must not be the only token in a list of \
156
-
autofill detail tokens"
157
| [ field_name ] ->
158
if not (List.mem field_name all_field_names) then
159
Error
160
(Printf.sprintf
161
-
"The string \"%s\" is not a valid autofill field name"
162
field_name)
163
else if !is_contact_details && not (List.mem field_name contact_field_names)
164
then
165
Error
166
(Printf.sprintf
167
"The autofill field name \"%s\" is not allowed in contact \
168
-
context"
169
field_name)
170
else Ok ()
171
| [ field_name; "webauthn" ] ->
172
if not (List.mem field_name all_field_names) then
173
Error
174
(Printf.sprintf
175
-
"The string \"%s\" is not a valid autofill field name"
176
field_name)
177
else if !is_contact_details && not (List.mem field_name contact_field_names)
178
then
179
Error
180
(Printf.sprintf
181
"The autofill field name \"%s\" is not allowed in contact \
182
-
context"
183
field_name)
184
else Ok ()
185
| token :: _ when List.mem token contact_types ->
186
Error
187
(Printf.sprintf
188
-
"The token \"%s\" must only appear before any autofill field names"
189
token)
190
| token :: _ when starts_with token "section-" ->
191
Error
192
"A \"section-*\" indicator must only appear as the first token in a \
193
-
list of autofill detail tokens"
194
| "shipping" :: _ | "billing" :: _ as toks ->
195
Error
196
(Printf.sprintf
197
"The token \"%s\" must only appear as either the first token in a \
198
list of autofill detail tokens, or, if the first token is a \
199
-
\"section-*\" indicator, as the second token"
200
(List.hd toks))
201
| _ :: "webauthn" :: _ :: _ ->
202
Error
203
"The token \"webauthn\" must only appear as the very last token in a \
204
-
list of autofill detail tokens"
205
-
| _ :: _ :: _ ->
206
-
Error
207
-
"A list of autofill details tokens must not contain more than one \
208
-
autofill field name"
209
in
210
process_field_tokens !tokens
211
212
(** Validate autocomplete value *)
213
let validate_autocomplete s =
214
let trimmed = trim_whitespace s in
215
-
if String.length trimmed = 0 then Error "Must not be empty"
216
else if trimmed = "on" || trimmed = "off" then Ok ()
217
else
218
let tokens = split_on_whitespace trimmed in
···
147
is_contact_details := true
148
| _ -> ());
149
150
+
(* Check if any token in the list is shipping/billing *)
151
+
let find_shipping_billing tokens =
152
+
List.find_opt (fun t -> t = "shipping" || t = "billing") tokens
153
+
in
154
+
155
+
(* Check if any token in the list is a contact type *)
156
+
let find_contact_type tokens =
157
+
List.find_opt (fun t -> List.mem t contact_types) tokens
158
+
in
159
+
160
+
(* Check if any token in the list is a section-* indicator *)
161
+
let find_section tokens =
162
+
List.find_opt (fun t -> starts_with t "section-") tokens
163
+
in
164
+
165
(* Process remaining tokens *)
166
+
let process_field_tokens tokens =
167
+
match tokens with
168
+
| [] -> Error "A list of autofill details tokens must contain an autofill field name."
169
| [ "webauthn" ] ->
170
Error
171
"The token \"webauthn\" must not be the only token in a list of \
172
+
autofill detail tokens."
173
| [ field_name ] ->
174
if not (List.mem field_name all_field_names) then
175
Error
176
(Printf.sprintf
177
+
"The string \"%s\" is not a valid autofill field name."
178
field_name)
179
else if !is_contact_details && not (List.mem field_name contact_field_names)
180
then
181
Error
182
(Printf.sprintf
183
"The autofill field name \"%s\" is not allowed in contact \
184
+
context."
185
field_name)
186
else Ok ()
187
| [ field_name; "webauthn" ] ->
188
if not (List.mem field_name all_field_names) then
189
Error
190
(Printf.sprintf
191
+
"The string \"%s\" is not a valid autofill field name."
192
field_name)
193
else if !is_contact_details && not (List.mem field_name contact_field_names)
194
then
195
Error
196
(Printf.sprintf
197
"The autofill field name \"%s\" is not allowed in contact \
198
+
context."
199
field_name)
200
else Ok ()
201
| token :: _ when List.mem token contact_types ->
202
Error
203
(Printf.sprintf
204
+
"The token \"%s\" must only appear before any autofill field names."
205
token)
206
| token :: _ when starts_with token "section-" ->
207
Error
208
"A \"section-*\" indicator must only appear as the first token in a \
209
+
list of autofill detail tokens."
210
| "shipping" :: _ | "billing" :: _ as toks ->
211
Error
212
(Printf.sprintf
213
"The token \"%s\" must only appear as either the first token in a \
214
list of autofill detail tokens, or, if the first token is a \
215
+
\"section-*\" indicator, as the second token."
216
(List.hd toks))
217
| _ :: "webauthn" :: _ :: _ ->
218
Error
219
"The token \"webauthn\" must only appear as the very last token in a \
220
+
list of autofill detail tokens."
221
+
| _ :: rest ->
222
+
(* Check if any remaining token is a section-* indicator - position error takes precedence *)
223
+
(match find_section rest with
224
+
| Some _ ->
225
+
Error
226
+
"A \"section-*\" indicator must only appear as the first token in a \
227
+
list of autofill detail tokens."
228
+
| None ->
229
+
(* Check if any remaining token is a contact type - position error takes precedence *)
230
+
match find_contact_type rest with
231
+
| Some ct_token ->
232
+
Error
233
+
(Printf.sprintf
234
+
"The token \"%s\" must only appear before any autofill field names."
235
+
ct_token)
236
+
| None ->
237
+
(* Check if any remaining token is shipping/billing - position error takes precedence *)
238
+
match find_shipping_billing rest with
239
+
| Some sb_token ->
240
+
Error
241
+
(Printf.sprintf
242
+
"The token \"%s\" must only appear as either the first token in a \
243
+
list of autofill detail tokens, or, if the first token is a \
244
+
\"section-*\" indicator, as the second token."
245
+
sb_token)
246
+
| None ->
247
+
Error
248
+
"A list of autofill details tokens must not contain more than one \
249
+
autofill field name.")
250
in
251
process_field_tokens !tokens
252
253
(** Validate autocomplete value *)
254
let validate_autocomplete s =
255
let trimmed = trim_whitespace s in
256
+
if String.length trimmed = 0 then Error "Must not be empty."
257
else if trimmed = "on" || trimmed = "off" then Ok ()
258
else
259
let tokens = split_on_whitespace trimmed in
+60
-4
lib/html5_checker/parse_error_bridge.ml
+60
-4
lib/html5_checker/parse_error_bridge.ml
···
11
Message.make_location ~line ~column ?system_id ()
12
in
13
let code_str = Html5rw.Parse_error_code.to_string code in
14
-
let message = match code with
15
| Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
16
-
"Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag."
17
-
| _ -> Printf.sprintf "Parse error: %s" code_str
18
in
19
Message.error
20
~message
21
-
~code:code_str
22
~location
23
()
24
···
11
Message.make_location ~line ~column ?system_id ()
12
in
13
let code_str = Html5rw.Parse_error_code.to_string code in
14
+
let (message, final_code) = match code with
15
| Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus ->
16
+
("Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag.", code_str)
17
+
| Html5rw.Parse_error_code.Tree_construction_error s ->
18
+
(* Check for control-character/noncharacter/surrogate with codepoint info *)
19
+
(try
20
+
if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then
21
+
let colon_pos = String.index s ':' in
22
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
23
+
let cp = int_of_string ("0x" ^ cp_str) in
24
+
(Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
25
+
else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then
26
+
let colon_pos = String.index s ':' in
27
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
28
+
let cp = int_of_string ("0x" ^ cp_str) in
29
+
(Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
30
+
else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then
31
+
let colon_pos = String.index s ':' in
32
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
33
+
let cp = int_of_string ("0x" ^ cp_str) in
34
+
(Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint")
35
+
(* Character reference errors *)
36
+
else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then
37
+
let cp_str = String.sub s 28 (String.length s - 28) in
38
+
let cp = int_of_string ("0x" ^ cp_str) in
39
+
if cp = 0x0D then
40
+
("A numeric character reference expanded to carriage return.", "control-character-reference")
41
+
else
42
+
(Printf.sprintf "Character reference expands to a control character (U+%04x)." cp, "control-character-reference")
43
+
else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then
44
+
let colon_pos = String.index s ':' in
45
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
46
+
let cp = int_of_string ("0x" ^ cp_str) in
47
+
(* U+FDD0-U+FDEF are "permanently unassigned" *)
48
+
if cp >= 0xFDD0 && cp <= 0xFDEF then
49
+
("Character reference expands to a permanently unassigned code point.", "noncharacter-character-reference")
50
+
(* Astral noncharacters (planes 1-16) *)
51
+
else if cp >= 0x10000 then
52
+
(Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp, "noncharacter-character-reference")
53
+
else
54
+
(Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp, "noncharacter-character-reference")
55
+
else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then
56
+
let colon_pos = String.index s ':' in
57
+
let _ = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
58
+
("Character reference outside the permissible Unicode range.", "character-reference-outside-unicode-range")
59
+
else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then
60
+
let colon_pos = String.index s ':' in
61
+
let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in
62
+
let cp = int_of_string ("0x" ^ cp_str) in
63
+
(Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp, "surrogate-character-reference")
64
+
else if s = "no-p-element-in-scope" then
65
+
("No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen.", "no-p-element-in-scope")
66
+
else if s = "end-tag-p-implied-but-open-elements" then
67
+
("End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements.", "end-tag-p-implied")
68
+
else if s = "end-tag-br" then
69
+
("End tag \xe2\x80\x9cbr\xe2\x80\x9d.", "end-tag-br")
70
+
else
71
+
(Printf.sprintf "Parse error: %s" s, s)
72
+
with _ -> (Printf.sprintf "Parse error: %s" s, s))
73
+
| _ -> (Printf.sprintf "Parse error: %s" code_str, code_str)
74
in
75
Message.error
76
~message
77
+
~code:final_code
78
~location
79
()
80
+3
-1
lib/html5_checker/semantic/form_checker.ml
+3
-1
lib/html5_checker/semantic/form_checker.ml
···
32
match Dt_autocomplete.validate_autocomplete value with
33
| Ok () -> ()
34
| Error msg ->
35
+
(* Nu validator prefixes autocomplete errors with "Bad autocomplete detail tokens (any): " for select/textarea, but not for input *)
36
+
let reason = if element_name = "input" then msg else "Bad autocomplete detail tokens (any): " ^ msg in
37
Message_collector.add_typed collector
38
(Error_code.Bad_attr_value {
39
element = element_name;
40
attr = "autocomplete";
41
value;
42
+
reason
43
})
44
end
45
+23
-8
lib/html5_checker/semantic/id_checker.ml
+23
-8
lib/html5_checker/semantic/id_checker.ml
···
193
so we pass None. In a full implementation, this would be passed
194
from the parser. *)
195
let location = None in
196
-
process_attrs state ~element:name ~attrs ~location collector
197
198
let end_element _state ~name:_ ~namespace:_ _collector =
199
()
···
204
let end_document state collector =
205
(* Check all ID references point to existing IDs *)
206
List.iter (fun ref ->
207
-
if not (Hashtbl.mem state.ids ref.referenced_id) then
208
-
(* Use generic for dangling references - format may vary *)
209
-
Message_collector.add_typed collector
210
-
(Error_code.Generic {
211
-
message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document."
212
-
(Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id)
213
-
})
214
) state.references;
215
216
(* Check all usemap references point to existing map names *)
···
193
so we pass None. In a full implementation, this would be passed
194
from the parser. *)
195
let location = None in
196
+
process_attrs state ~element:name ~attrs ~location collector;
197
+
198
+
(* Special check: map element must have matching id and name if both present *)
199
+
if name = "map" then begin
200
+
let id_opt = List.find_map (fun (n, v) -> if n = "id" then Some v else None) attrs in
201
+
let name_opt = List.find_map (fun (n, v) -> if n = "name" then Some v else None) attrs in
202
+
match id_opt, name_opt with
203
+
| Some id_val, Some name_val when id_val <> name_val ->
204
+
Message_collector.add_typed collector Error_code.Map_id_name_mismatch
205
+
| _ -> ()
206
+
end
207
208
let end_element _state ~name:_ ~namespace:_ _collector =
209
()
···
214
let end_document state collector =
215
(* Check all ID references point to existing IDs *)
216
List.iter (fun ref ->
217
+
if not (Hashtbl.mem state.ids ref.referenced_id) then begin
218
+
(* Use specific error for list attribute on input *)
219
+
if ref.attribute = "list" && ref.referring_element = "input" then
220
+
Message_collector.add_typed collector Error_code.List_attr_requires_datalist
221
+
else
222
+
(* Use generic for dangling references - format may vary *)
223
+
Message_collector.add_typed collector
224
+
(Error_code.Generic {
225
+
message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document."
226
+
(Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id)
227
+
})
228
+
end
229
) state.references;
230
231
(* Check all usemap references point to existing map names *)
+85
lib/html5_checker/semantic/lang_detecting_checker.ml
+85
lib/html5_checker/semantic/lang_detecting_checker.ml
···
141
| "zh-tw" -> "zh-hant"
142
| _ -> code
143
144
let start_element state ~name ~namespace ~attrs _collector =
145
let name_lower = String.lowercase_ascii name in
146
let ns = Option.value namespace ~default:"" in
···
226
let original_declared = match state.html_lang with
227
| Some l -> l
228
| None -> ""
229
in
230
let detected_code = detected_lang in (* Keep full code like zh-tw *)
231
let detected_name = get_language_name detected_lang in
···
141
| "zh-tw" -> "zh-hant"
142
| _ -> code
143
144
+
(* Traditional Chinese-only characters (simplified versions don't exist) *)
145
+
(* These are characters that were simplified in Simplified Chinese *)
146
+
let traditional_chars = [|
147
+
0x570B; (* 國 -> 国 *)
148
+
0x5B78; (* 學 -> 学 *)
149
+
0x8AAA; (* 說 -> 说 *)
150
+
0x66F8; (* 書 -> 书 *)
151
+
0x8A9E; (* 語 -> 语 *)
152
+
0x6642; (* 時 -> 时 *)
153
+
0x6703; (* 會 -> 会 *)
154
+
0x7D93; (* 經 -> 经 *)
155
+
0x6A5F; (* 機 -> 机 *)
156
+
0x767C; (* 發 -> 发 *)
157
+
0x554F; (* 問 -> 问 *)
158
+
0x6578; (* 數 -> 数 *)
159
+
0x5BE6; (* 實 -> 实 *)
160
+
0x958B; (* 開 -> 开 *)
161
+
0x95DC; (* 關 -> 关 *)
162
+
0x9577; (* 長 -> 长 *)
163
+
0x9AD4; (* 體 -> 体 *)
164
+
0x9EDE; (* 點 -> 点 *)
165
+
0x96FB; (* 電 -> 电 *)
166
+
0x8CC7; (* 資 -> 资 *)
167
+
0x7FA9; (* 義 -> 义 *)
168
+
0x8B93; (* 讓 -> 让 *)
169
+
0x9054; (* 達 -> 达 *)
170
+
0x71DF; (* 營 -> 营 *)
171
+
0x8655; (* 處 -> 处 *)
172
+
0x6771; (* 東 -> 东 *)
173
+
0x8209; (* 舉 -> 举 *)
174
+
0x8A18; (* 記 -> 记 *)
175
+
0x5099; (* 備 -> 备 *)
176
+
0x5354; (* 協 -> 协 *)
177
+
0x8FA6; (* 辦 -> 办 *)
178
+
0x8457; (* 著 -> 着 *)
179
+
0x8F09; (* 載 -> 载 *)
180
+
0x52D9; (* 務 -> 务 *)
181
+
0x7121; (* 無 -> 无 *)
182
+
0x5F9E; (* 從 -> 从 *)
183
+
0x8B58; (* 識 -> 识 *)
184
+
0x8207; (* 與 -> 与 *)
185
+
0x78BA; (* 確 -> 确 *)
186
+
0x904E; (* 過 -> 过 *)
187
+
0x8A72; (* 該 -> 该 *)
188
+
0x9810; (* 預 -> 预 *)
189
+
0x7576; (* 當 -> 当 *)
190
+
0x5831; (* 報 -> 报 *)
191
+
0x9054; (* 達 -> 达 *)
192
+
0x91AB; (* 醫 -> 医 *)
193
+
0x5718; (* 團 -> 团 *)
194
+
0x8B70; (* 議 -> 议 *)
195
+
0x7D71; (* 統 -> 统 *)
196
+
0x898F; (* 規 -> 规 *)
197
+
|]
198
+
199
+
(* Check if text contains enough Traditional Chinese characters *)
200
+
let is_traditional_chinese text =
201
+
let count = ref 0 in
202
+
let total = ref 0 in
203
+
let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
204
+
let rec process () =
205
+
if !total >= 1000 then () (* Sample first 1000 chars *)
206
+
else match Uutf.decode decoder with
207
+
| `Await | `End -> ()
208
+
| `Malformed _ -> process ()
209
+
| `Uchar uchar ->
210
+
let code = Uchar.to_int uchar in
211
+
(* Count CJK characters *)
212
+
if code >= 0x4E00 && code <= 0x9FFF then begin
213
+
incr total;
214
+
(* Check if it's a Traditional-only character *)
215
+
if Array.exists (fun c -> c = code) traditional_chars then
216
+
incr count
217
+
end;
218
+
process ()
219
+
in
220
+
process ();
221
+
(* If > 2% are Traditional-only characters, it's Traditional Chinese *)
222
+
!total > 100 && (float_of_int !count /. float_of_int !total) > 0.02
223
+
224
let start_element state ~name ~namespace ~attrs _collector =
225
let name_lower = String.lowercase_ascii name in
226
let ns = Option.value namespace ~default:"" in
···
306
let original_declared = match state.html_lang with
307
| Some l -> l
308
| None -> ""
309
+
in
310
+
(* Correct for Traditional vs Simplified Chinese misdetection *)
311
+
let detected_lang =
312
+
if detected_lang = "zh-cn" && is_traditional_chinese text then "zh-tw"
313
+
else detected_lang
314
in
315
let detected_code = detected_lang in (* Keep full code like zh-tw *)
316
let detected_name = get_language_name detected_lang in
+23
-3
lib/html5_checker/specialized/aria_checker.ml
+23
-3
lib/html5_checker/specialized/aria_checker.ml
···
368
mutable stack : stack_node list;
369
mutable has_active_tab : bool; (* Whether document has role=tab with aria-selected=true *)
370
mutable has_tabpanel : bool; (* Whether document has role=tabpanel elements *)
371
}
372
373
-
let create () = { stack = []; has_active_tab = false; has_tabpanel = false }
374
375
let reset state =
376
state.stack <- [];
377
state.has_active_tab <- false;
378
-
state.has_tabpanel <- false
379
380
(** Check if any ancestor has one of the required roles. *)
381
let has_required_ancestor_role state required_roles =
···
451
if aria_selected = Some "true" then state.has_active_tab <- true
452
end;
453
if List.mem "tabpanel" explicit_roles then state.has_tabpanel <- true;
454
455
(* Check br/wbr role restrictions - only none/presentation allowed *)
456
if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin
···
784
Message_collector.add_error collector
785
~message:"Every active \xe2\x80\x9crole=tab\xe2\x80\x9d element must have a corresponding \xe2\x80\x9crole=tabpanel\xe2\x80\x9d element."
786
~code:"tab-without-tabpanel"
787
-
()
788
789
let checker = (module struct
790
type nonrec state = state
···
368
mutable stack : stack_node list;
369
mutable has_active_tab : bool; (* Whether document has role=tab with aria-selected=true *)
370
mutable has_tabpanel : bool; (* Whether document has role=tabpanel elements *)
371
+
mutable visible_main_count : int; (* Count of visible elements with role=main *)
372
}
373
374
+
let create () = { stack = []; has_active_tab = false; has_tabpanel = false; visible_main_count = 0 }
375
376
let reset state =
377
state.stack <- [];
378
state.has_active_tab <- false;
379
+
state.has_tabpanel <- false;
380
+
state.visible_main_count <- 0
381
382
(** Check if any ancestor has one of the required roles. *)
383
let has_required_ancestor_role state required_roles =
···
453
if aria_selected = Some "true" then state.has_active_tab <- true
454
end;
455
if List.mem "tabpanel" explicit_roles then state.has_tabpanel <- true;
456
+
457
+
(* Track visible main elements (explicit role=main or implicit main role) *)
458
+
let is_hidden =
459
+
let aria_hidden = List.assoc_opt "aria-hidden" attrs in
460
+
aria_hidden = Some "true"
461
+
in
462
+
if not is_hidden then begin
463
+
(* Check explicit role *)
464
+
if List.mem "main" explicit_roles then
465
+
state.visible_main_count <- state.visible_main_count + 1
466
+
(* Check implicit role from <main> element *)
467
+
else if name_lower = "main" then
468
+
state.visible_main_count <- state.visible_main_count + 1
469
+
end;
470
471
(* Check br/wbr role restrictions - only none/presentation allowed *)
472
if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin
···
800
Message_collector.add_error collector
801
~message:"Every active \xe2\x80\x9crole=tab\xe2\x80\x9d element must have a corresponding \xe2\x80\x9crole=tabpanel\xe2\x80\x9d element."
802
~code:"tab-without-tabpanel"
803
+
();
804
+
805
+
(* Check for multiple visible main elements *)
806
+
if state.visible_main_count > 1 then
807
+
Message_collector.add_typed collector Error_code.Multiple_main_visible
808
809
let checker = (module struct
810
type nonrec state = state
+7
-7
lib/html5_checker/specialized/attr_restrictions_checker.ml
+7
-7
lib/html5_checker/specialized/attr_restrictions_checker.ml
···
250
Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
251
attr_name name
252
else if String.contains attr_value '%' then
253
-
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
254
attr_value attr_name name
255
else if String.length attr_value > 0 && attr_value.[0] = '-' then
256
-
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The value must be non-negative."
257
attr_value attr_name name
258
else
259
(* Find first non-digit character *)
···
268
in
269
match bad_char with
270
| Some c ->
271
-
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
272
attr_value attr_name name c
273
| None ->
274
-
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit."
275
attr_value attr_name name
276
in
277
Message_collector.add_error collector
···
455
List.iter (fun key ->
456
if count_codepoints key > 1 then
457
Message_collector.add_error collector
458
-
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The space-separated list of key labels contains a value \xe2\x80\x9c%s\xe2\x80\x9d that consists of more than a single code point."
459
-
attr_value attr_name name key)
460
~code:"bad-attribute-value"
461
~element:name ~attribute:attr_name ()
462
) keys;
···
466
| k :: rest ->
467
if List.mem k seen then
468
Message_collector.add_error collector
469
-
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Duplicate key label."
470
attr_value attr_name name)
471
~code:"bad-attribute-value"
472
~element:name ~attribute:attr_name ()
···
250
Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
251
attr_name name
252
else if String.contains attr_value '%' then
253
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
254
attr_value attr_name name
255
else if String.length attr_value > 0 && attr_value.[0] = '-' then
256
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead."
257
attr_value attr_name name
258
else
259
(* Find first non-digit character *)
···
268
in
269
match bad_char with
270
| Some c ->
271
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
272
attr_value attr_name name c
273
| None ->
274
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit."
275
attr_value attr_name name
276
in
277
Message_collector.add_error collector
···
455
List.iter (fun key ->
456
if count_codepoints key > 1 then
457
Message_collector.add_error collector
458
+
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character."
459
+
attr_value attr_name name)
460
~code:"bad-attribute-value"
461
~element:name ~attribute:attr_name ()
462
) keys;
···
466
| k :: rest ->
467
if List.mem k seen then
468
Message_collector.add_error collector
469
+
~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique."
470
attr_value attr_name name)
471
~code:"bad-attribute-value"
472
~element:name ~attribute:attr_name ()
+9
-13
lib/html5_checker/specialized/datetime_checker.ml
+9
-13
lib/html5_checker/specialized/datetime_checker.ml
···
241
minute <> 0 && minute <> 30 && minute <> 45
242
in
243
if unusual_range then
244
-
TzWarning "unusual timezone offset"
245
else if unusual_minutes then
246
-
TzWarning "unusual timezone offset minutes"
247
else
248
TzOk
249
end
···
350
match validate_datetime_with_timezone value with
351
| DtOk -> Ok (* Valid datetime with timezone *)
352
| DtWarning w ->
353
-
(* Valid but with warning *)
354
-
Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s."
355
value attr_name element_name w)
356
| DtError tz_error ->
357
(* Try just date - valid for all elements *)
···
359
| (true, _) ->
360
(* Date is valid, but check for suspicious year (5+ digits or old year) *)
361
if has_suspicious_year value || has_old_year value then begin
362
-
let date_msg = "Year may be mistyped." in
363
let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
364
-
Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
365
value attr_name element_name date_msg tz_msg)
366
end else
367
Ok (* Valid date with normal year *)
···
389
match validate_duration value with
390
| (true, _) -> Ok (* Valid duration P... *)
391
| (false, _) ->
392
-
let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
393
-
let date_msg = match date_error with
394
-
| Some e -> Printf.sprintf "Bad date: %s." e
395
-
| None -> "Bad date: The literal did not satisfy the date format."
396
-
in
397
-
Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
398
-
value attr_name element_name tz_msg date_msg)
399
end
400
else begin
401
(* del/ins only allow date or datetime-with-timezone *)
···
241
minute <> 0 && minute <> 30 && minute <> 45
242
in
243
if unusual_range then
244
+
TzWarning "Hours in time zone designator should be from \"-12:00\" to \"+14:00\""
245
else if unusual_minutes then
246
+
TzWarning "Minutes in time zone designator should be either \"00\", \"30\", or \"45\"."
247
else
248
TzOk
249
end
···
350
match validate_datetime_with_timezone value with
351
| DtOk -> Ok (* Valid datetime with timezone *)
352
| DtWarning w ->
353
+
(* Valid but with warning - format matches Nu validator *)
354
+
Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format."
355
value attr_name element_name w)
356
| DtError tz_error ->
357
(* Try just date - valid for all elements *)
···
359
| (true, _) ->
360
(* Date is valid, but check for suspicious year (5+ digits or old year) *)
361
if has_suspicious_year value || has_old_year value then begin
362
+
let date_msg = "Bad date: Year may be mistyped." in
363
let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in
364
+
Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s"
365
value attr_name element_name date_msg tz_msg)
366
end else
367
Ok (* Valid date with normal year *)
···
389
match validate_duration value with
390
| (true, _) -> Ok (* Valid duration P... *)
391
| (false, _) ->
392
+
(* Use simplified message for time element matching Nu validator format *)
393
+
Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad time-datetime: The literal did not satisfy the time-datetime format."
394
+
value attr_name element_name)
395
end
396
else begin
397
(* del/ins only allow date or datetime-with-timezone *)
+6
-3
lib/html5_checker/specialized/importmap_checker.ml
+6
-3
lib/html5_checker/specialized/importmap_checker.ml
···
175
| SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *)
176
| InvalidScopeKey (* scope key is not a valid URL *)
177
| InvalidScopeValue of string (* scope value is not a valid URL *)
178
179
(** Check if a string looks like a valid URL-like specifier for importmaps *)
180
let is_valid_url_like s =
···
255
| JNull -> ()
256
| _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]"))
257
) scope_imports
258
-
| _ -> add_error (NotObject ("scopes[" ^ skey ^ "]"))
259
) scope_members
260
| _ -> add_error (NotObject "scopes")
261
end
···
290
Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop
291
| NotString _ ->
292
"A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values."
293
-
| ForbiddenProperty prop ->
294
-
Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d is not an allowed property." prop
295
| SlashKeyWithoutSlashValue prop ->
296
Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop
297
| InvalidScopeKey ->
298
"The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings."
299
| InvalidScopeValue _ ->
300
"A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values."
301
302
let end_element state ~name ~namespace collector =
303
if namespace <> None then ()
···
175
| SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *)
176
| InvalidScopeKey (* scope key is not a valid URL *)
177
| InvalidScopeValue of string (* scope value is not a valid URL *)
178
+
| ScopeValueNotObject (* a value inside scopes is not a JSON object *)
179
180
(** Check if a string looks like a valid URL-like specifier for importmaps *)
181
let is_valid_url_like s =
···
256
| JNull -> ()
257
| _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]"))
258
) scope_imports
259
+
| _ -> add_error ScopeValueNotObject
260
) scope_members
261
| _ -> add_error (NotObject "scopes")
262
end
···
291
Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop
292
| NotString _ ->
293
"A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values."
294
+
| ForbiddenProperty _ ->
295
+
"A \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must contain a JSON object with no properties other than \xe2\x80\x9cimports\xe2\x80\x9d, \xe2\x80\x9cscopes\xe2\x80\x9d, and \xe2\x80\x9cintegrity\xe2\x80\x9d."
296
| SlashKeyWithoutSlashValue prop ->
297
Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop
298
| InvalidScopeKey ->
299
"The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings."
300
| InvalidScopeValue _ ->
301
"A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values."
302
+
| ScopeValueNotObject ->
303
+
"The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose values are also JSON objects."
304
305
let end_element state ~name ~namespace collector =
306
if namespace <> None then ()
+3
-3
lib/html5_checker/specialized/language_checker.ml
+3
-3
lib/html5_checker/specialized/language_checker.ml
···
57
| Some (deprecated, replacement) ->
58
Message_collector.add_warning collector
59
~message:(Printf.sprintf
60
-
"The language tag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead."
61
-
deprecated replacement)
62
~code:"deprecated-lang"
63
?location
64
~element
65
-
~attribute:"lang"
66
()
67
| None -> ()
68
···
57
| Some (deprecated, replacement) ->
58
Message_collector.add_warning collector
59
~message:(Printf.sprintf
60
+
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad language tag: The language subtag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead."
61
+
value attribute element deprecated replacement)
62
~code:"deprecated-lang"
63
?location
64
~element
65
+
~attribute
66
()
67
| None -> ()
68
+68
-27
lib/html5_checker/specialized/url_checker.ml
+68
-27
lib/html5_checker/specialized/url_checker.ml
···
239
let _ = contains_invalid_unicode decoded in
240
None
241
with Exit ->
242
-
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host."
243
url attr_name element_name)
244
245
(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *)
···
349
end else
350
None
351
352
-
(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). *)
353
-
let check_data_uri_fragment url attr_name element_name =
354
match extract_scheme url with
355
| None -> None
356
| Some scheme ->
357
if scheme = "data" && String.contains url '#' then
358
-
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Fragment is not allowed for data: URIs according to RFC 2397."
359
-
url attr_name element_name)
360
else
361
None
362
···
373
let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
374
(* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *)
375
if String.length after_colon > 0 && after_colon.[0] = '/' then
376
-
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid %s: URL."
377
-
url attr_name element_name scheme)
378
else
379
None
380
end else
···
389
(* Get scheme data (after the colon) *)
390
let colon_pos = String.index url ':' in
391
let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
392
(* Check for space in scheme data *)
393
-
if String.contains scheme_data ' ' then
394
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed."
395
url attr_name element_name)
396
else
···
508
try
509
let fragment_start = String.index url '#' in
510
let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in
511
(* Check for second hash in fragment *)
512
-
if String.contains fragment '#' then
513
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed."
514
url attr_name element_name)
515
(* Check for space in fragment *)
···
560
else if String.contains userinfo ' ' then
561
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed."
562
url attr_name element_name)
563
-
else
564
-
(* Check for non-ASCII characters (like emoji) *)
565
-
let has_non_ascii = String.exists (fun c -> Char.code c > 127) userinfo in
566
-
if has_non_ascii then
567
-
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password."
568
-
url attr_name element_name)
569
-
else
570
(* Check for other invalid chars *)
571
let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in
572
match invalid with
···
574
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
575
url attr_name element_name c)
576
| None -> None
577
with _ -> None
578
579
(** Attributes where empty URL is an error.
···
613
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace."
614
original_url attr_name element_name)
615
else None
616
-
(* Check for newlines/tabs *)
617
-
else if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
618
-
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
619
-
url attr_name element_name)
620
else begin
621
(* Check for relative URL issues first *)
622
match check_relative_url url attr_name element_name with
623
| Some err -> Some err
···
659
url attr_name element_name)
660
else
661
662
-
(* Check scheme data for non-special schemes *)
663
-
match check_scheme_data url attr_name element_name with
664
-
| Some err -> Some err
665
-
| None ->
666
-
667
(* Check path segment for illegal characters *)
668
match check_path_segment url attr_name element_name with
669
| Some err -> Some err
···
688
match host_opt with
689
| Some host -> validate_host host url attr_name element_name scheme_str
690
| None -> None
691
end
692
end
693
···
761
()
762
| Some _ ->
763
(* Check for data: URI with fragment - emit warning *)
764
-
(match check_data_uri_fragment url "value" name with
765
| Some warn_msg ->
766
Message_collector.add_warning collector
767
~message:warn_msg
···
786
end
787
end;
788
(* Check microdata itemtype and itemid attributes for data: URI fragments *)
789
let itemtype_opt = get_attr_value "itemtype" attrs in
790
(match itemtype_opt with
791
| Some url when String.trim url <> "" ->
792
-
(match check_data_uri_fragment url "itemtype" name with
793
| Some warn_msg ->
794
Message_collector.add_warning collector
795
~message:warn_msg
···
799
()
800
| None -> ())
801
| _ -> ());
802
let itemid_opt = get_attr_value "itemid" attrs in
803
(match itemid_opt with
804
| Some url when String.trim url <> "" ->
···
239
let _ = contains_invalid_unicode decoded in
240
None
241
with Exit ->
242
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.."
243
url attr_name element_name)
244
245
(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *)
···
349
end else
350
None
351
352
+
(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments).
353
+
The is_absolute_url parameter controls whether to use "Bad URL:" or "Bad absolute URL:" in the message. *)
354
+
let check_data_uri_fragment ?(is_absolute_url=false) url attr_name element_name =
355
match extract_scheme url with
356
| None -> None
357
| Some scheme ->
358
if scheme = "data" && String.contains url '#' then
359
+
let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in
360
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397."
361
+
url attr_name element_name url_type)
362
else
363
None
364
···
375
let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
376
(* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *)
377
if String.length after_colon > 0 && after_colon.[0] = '/' then
378
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead."
379
+
url attr_name element_name)
380
else
381
None
382
end else
···
391
(* Get scheme data (after the colon) *)
392
let colon_pos = String.index url ':' in
393
let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
394
+
(* Check for tab in scheme data *)
395
+
if String.contains scheme_data '\t' then
396
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed."
397
+
url attr_name element_name)
398
+
(* Check for newline in scheme data *)
399
+
else if String.contains scheme_data '\n' then
400
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed."
401
+
url attr_name element_name)
402
+
(* Check for carriage return in scheme data *)
403
+
else if String.contains scheme_data '\r' then
404
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed."
405
+
url attr_name element_name)
406
(* Check for space in scheme data *)
407
+
else if String.contains scheme_data ' ' then
408
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed."
409
url attr_name element_name)
410
else
···
522
try
523
let fragment_start = String.index url '#' in
524
let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in
525
+
(* Check for backslash in fragment *)
526
+
if String.contains fragment '\\' then
527
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed."
528
+
url attr_name element_name)
529
(* Check for second hash in fragment *)
530
+
else if String.contains fragment '#' then
531
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed."
532
url attr_name element_name)
533
(* Check for space in fragment *)
···
578
else if String.contains userinfo ' ' then
579
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed."
580
url attr_name element_name)
581
+
else begin
582
+
(* Check for non-ASCII characters (like emoji) using UTF-8 decoding *)
583
+
let find_non_ascii_char userinfo =
584
+
let decoder = Uutf.decoder ~encoding:`UTF_8 (`String userinfo) in
585
+
let rec find () =
586
+
match Uutf.decode decoder with
587
+
| `End | `Await -> None
588
+
| `Malformed _ -> find ()
589
+
| `Uchar uchar ->
590
+
let code = Uchar.to_int uchar in
591
+
(* Check if character is not allowed in userinfo *)
592
+
(* Per URL Standard: only ASCII letters, digits, and certain symbols allowed *)
593
+
if code > 127 then begin
594
+
let buf = Buffer.create 8 in
595
+
Buffer.add_utf_8_uchar buf uchar;
596
+
Some (Buffer.contents buf)
597
+
end else find ()
598
+
in
599
+
find ()
600
+
in
601
+
match find_non_ascii_char userinfo with
602
+
| Some bad_char ->
603
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed."
604
+
url attr_name element_name bad_char)
605
+
| None ->
606
(* Check for other invalid chars *)
607
let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in
608
match invalid with
···
610
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
611
url attr_name element_name c)
612
| None -> None
613
+
end
614
with _ -> None
615
616
(** Attributes where empty URL is an error.
···
650
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace."
651
original_url attr_name element_name)
652
else None
653
+
(* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *)
654
else begin
655
+
match check_scheme_data url attr_name element_name with
656
+
| Some err -> Some err
657
+
| None ->
658
+
(* Check for newlines/tabs in special scheme URLs *)
659
+
if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
660
+
Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
661
+
url attr_name element_name)
662
+
else begin
663
(* Check for relative URL issues first *)
664
match check_relative_url url attr_name element_name with
665
| Some err -> Some err
···
701
url attr_name element_name)
702
else
703
704
(* Check path segment for illegal characters *)
705
match check_path_segment url attr_name element_name with
706
| Some err -> Some err
···
725
match host_opt with
726
| Some host -> validate_host host url attr_name element_name scheme_str
727
| None -> None
728
+
end
729
end
730
end
731
···
799
()
800
| Some _ ->
801
(* Check for data: URI with fragment - emit warning *)
802
+
(* input[type=url] uses "Bad absolute URL:" format *)
803
+
(match check_data_uri_fragment ~is_absolute_url:true url "value" name with
804
| Some warn_msg ->
805
Message_collector.add_warning collector
806
~message:warn_msg
···
825
end
826
end;
827
(* Check microdata itemtype and itemid attributes for data: URI fragments *)
828
+
(* Microdata uses "Bad absolute URL:" format *)
829
let itemtype_opt = get_attr_value "itemtype" attrs in
830
(match itemtype_opt with
831
| Some url when String.trim url <> "" ->
832
+
(match check_data_uri_fragment ~is_absolute_url:true url "itemtype" name with
833
| Some warn_msg ->
834
Message_collector.add_warning collector
835
~message:warn_msg
···
839
()
840
| None -> ())
841
| _ -> ());
842
+
(* itemid uses "Bad URL:" format (not "Bad absolute URL:") *)
843
let itemid_opt = get_attr_value "itemid" attrs in
844
(match itemid_opt with
845
| Some url when String.trim url <> "" ->
+3
-3
lib/html5rw/parser/parser_tree_builder.ml
+3
-3
lib/html5rw/parser/parser_tree_builder.ml
···
664
let close_p_element t =
665
generate_implied_end_tags t ~except:"p" ();
666
(match current_node t with
667
-
| Some n when n.Dom.name <> "p" -> parse_error t "expected-p"
668
| _ -> ());
669
pop_until_tag t "p"
670
···
1215
end
1216
| Token.Tag { kind = Token.End; name = "p"; _ } ->
1217
if not (has_element_in_button_scope t "p") then begin
1218
-
parse_error t "unexpected-end-tag";
1219
ignore (insert_element t "p" ~push:true [])
1220
end;
1221
close_p_element t
···
1321
t.frameset_ok <- false;
1322
t.mode <- Parser_insertion_mode.In_table
1323
| Token.Tag { kind = Token.End; name = "br"; _ } ->
1324
-
parse_error t "unexpected-end-tag";
1325
reconstruct_active_formatting t;
1326
ignore (insert_element t "br" ~push:true []);
1327
pop_current t;
···
664
let close_p_element t =
665
generate_implied_end_tags t ~except:"p" ();
666
(match current_node t with
667
+
| Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements"
668
| _ -> ());
669
pop_until_tag t "p"
670
···
1215
end
1216
| Token.Tag { kind = Token.End; name = "p"; _ } ->
1217
if not (has_element_in_button_scope t "p") then begin
1218
+
parse_error t "no-p-element-in-scope";
1219
ignore (insert_element t "p" ~push:true [])
1220
end;
1221
close_p_element t
···
1321
t.frameset_ok <- false;
1322
t.mode <- Parser_insertion_mode.In_table
1323
| Token.Tag { kind = Token.End; name = "br"; _ } ->
1324
+
parse_error t "end-tag-br";
1325
reconstruct_active_formatting t;
1326
ignore (insert_element t "br" ~push:true []);
1327
pop_current t;
+5
-5
lib/html5rw/tokenizer/tokenizer_impl.ml
+5
-5
lib/html5rw/tokenizer/tokenizer_impl.ml
···
264
code = 0x0B ||
265
(code >= 0x0E && code <= 0x1F) ||
266
code = 0x7F then
267
-
error t "control-character-in-input-stream"
268
in
269
270
···
1937
error t "null-character-reference";
1938
replacement_char
1939
end else if code > 0x10FFFF then begin
1940
-
error t "character-reference-outside-unicode-range";
1941
replacement_char
1942
end else if code >= 0xD800 && code <= 0xDFFF then begin
1943
-
error t "surrogate-character-reference";
1944
replacement_char
1945
end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1946
List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
···
1949
0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
1950
0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
1951
0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
1952
-
error t "noncharacter-character-reference";
1953
Entities.Numeric_ref.codepoint_to_utf8 code
1954
end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
1955
(code >= 0x0D && code <= 0x1F) ||
1956
(code >= 0x7F && code <= 0x9F) then begin
1957
-
error t "control-character-reference";
1958
(* Apply Windows-1252 replacement table for 0x80-0x9F *)
1959
match Entities.Numeric_ref.find_replacement code with
1960
| Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
···
264
code = 0x0B ||
265
(code >= 0x0E && code <= 0x1F) ||
266
code = 0x7F then
267
+
error t (Printf.sprintf "control-character-in-input-stream:%04x" code)
268
in
269
270
···
1937
error t "null-character-reference";
1938
replacement_char
1939
end else if code > 0x10FFFF then begin
1940
+
error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code);
1941
replacement_char
1942
end else if code >= 0xD800 && code <= 0xDFFF then begin
1943
+
error t (Printf.sprintf "surrogate-character-reference:%04x" code);
1944
replacement_char
1945
end else if (code >= 0xFDD0 && code <= 0xFDEF) ||
1946
List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF;
···
1949
0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF;
1950
0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF;
1951
0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin
1952
+
error t (Printf.sprintf "noncharacter-character-reference:%05x" code);
1953
Entities.Numeric_ref.codepoint_to_utf8 code
1954
end else if (code >= 0x01 && code <= 0x08) || code = 0x0B ||
1955
(code >= 0x0D && code <= 0x1F) ||
1956
(code >= 0x7F && code <= 0x9F) then begin
1957
+
error t (Printf.sprintf "control-character-reference:%04x" code);
1958
(* Apply Windows-1252 replacement table for 0x80-0x9F *)
1959
match Entities.Numeric_ref.find_replacement code with
1960
| Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
+5
-5
lib/html5rw/tokenizer/tokenizer_stream.ml
+5
-5
lib/html5rw/tokenizer/tokenizer_stream.ml
···
99
let check_utf8_codepoint t lead_byte =
100
let b0 = Char.code lead_byte in
101
if b0 < 0x80 then
102
-
(* ASCII - no surrogates or noncharacters possible in this range except control chars *)
103
()
104
else if b0 >= 0xC2 && b0 <= 0xDF then begin
105
(* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
···
112
(* C1 controls: U+0080 to U+009F *)
113
if cp >= 0x80 && cp <= 0x9F then
114
(match t.error_callback with
115
-
| Some cb -> cb "control-character-in-input-stream"
116
| None -> ())
117
| Some c1 ->
118
push_back_char t c1
···
132
(* Check for surrogates and noncharacters *)
133
(match t.error_callback with
134
| Some cb ->
135
-
if is_surrogate cp then cb "surrogate-in-input-stream"
136
-
else if is_noncharacter cp then cb "noncharacter-in-input-stream"
137
| None -> ())
138
| Some c2 ->
139
push_back_char t c2;
···
162
(* Check for noncharacters (no surrogates in 4-byte range) *)
163
(match t.error_callback with
164
| Some cb ->
165
-
if is_noncharacter cp then cb "noncharacter-in-input-stream"
166
| None -> ())
167
| Some c3 ->
168
push_back_char t c3;
···
99
let check_utf8_codepoint t lead_byte =
100
let b0 = Char.code lead_byte in
101
if b0 < 0x80 then
102
+
(* ASCII - control characters are handled in tokenizer_impl.ml *)
103
()
104
else if b0 >= 0xC2 && b0 <= 0xDF then begin
105
(* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *)
···
112
(* C1 controls: U+0080 to U+009F *)
113
if cp >= 0x80 && cp <= 0x9F then
114
(match t.error_callback with
115
+
| Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp)
116
| None -> ())
117
| Some c1 ->
118
push_back_char t c1
···
132
(* Check for surrogates and noncharacters *)
133
(match t.error_callback with
134
| Some cb ->
135
+
if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp)
136
+
else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp)
137
| None -> ())
138
| Some c2 ->
139
push_back_char t c2;
···
162
(* Check for noncharacters (no surrogates in 4-byte range) *)
163
(match t.error_callback with
164
| Some cb ->
165
+
if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp)
166
| None -> ())
167
| Some c3 ->
168
push_back_char t c3;