OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** URL validation checker for href, src, action, and other URL attributes. *)
2
3(** Attributes that contain URLs and should be validated.
4 Note: srcset uses special microsyntax, not validated as URL here.
5 Note: input[value] is only checked for type="url", handled specially below. *)
6let url_attributes = [
7 ("a", ["href"]);
8 ("area", ["href"]);
9 ("audio", ["src"]);
10 ("base", ["href"]);
11 ("blockquote", ["cite"]);
12 ("button", ["formaction"]);
13 ("del", ["cite"]);
14 ("embed", ["src"]);
15 ("form", ["action"]);
16 ("iframe", ["src"]);
17 ("img", ["src"]);
18 ("input", ["formaction"; "src"]);
19 ("ins", ["cite"]);
20 ("link", ["href"]);
21 ("object", ["data"]);
22 ("q", ["cite"]);
23 ("script", ["src"]);
24 ("source", ["src"]);
25 ("track", ["src"]);
26 ("video", ["src"; "poster"]);
27]
28
29(** Characters not allowed in URL host. *)
30let invalid_host_chars = ['^'; '`'; '{'; '}'; '<'; '>']
31
32(** Check if a host looks like an IPv6 address (starts with [). *)
33let is_ipv6_host host =
34 String.length host > 0 && host.[0] = '['
35
36(** Check if character is valid in IPv6 address. *)
37let is_valid_ipv6_char c =
38 (c >= '0' && c <= '9') ||
39 (c >= 'a' && c <= 'f') ||
40 (c >= 'A' && c <= 'F') ||
41 c = ':' || c = '.' || c = '[' || c = ']'
42
43(** Validate IPv6 bracketed host. *)
44let validate_ipv6_host host url attr_name element_name =
45 (* Host should be in format [xxxx:...] *)
46 if String.length host < 3 then
47 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character."
48 url attr_name element_name)
49 else begin
50 (* Check if all characters are valid IPv6 chars *)
51 let invalid_char = String.exists (fun c -> not (is_valid_ipv6_char c)) host in
52 if invalid_char then
53 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character."
54 url attr_name element_name)
55 else
56 None
57 end
58
59(** Check if a file URL host is a valid Windows drive letter (like C|). *)
60let is_valid_windows_drive host =
61 String.length host = 2 &&
62 ((host.[0] >= 'A' && host.[0] <= 'Z') || (host.[0] >= 'a' && host.[0] <= 'z')) &&
63 host.[1] = '|'
64
65(** Check if pipe is allowed in this host context. *)
66let is_pipe_allowed_in_host url host =
67 let scheme = try String.lowercase_ascii (String.sub url 0 (String.index url ':')) with _ -> "" in
68 scheme = "file" && is_valid_windows_drive host
69
70(** Special schemes that require double slash (//).
71 Note: file: is special but doesn't always require //.
72 Note: ws and wss allow single/no slash forms per WHATWG URL Standard. *)
73let special_schemes_require_double_slash = ["http"; "https"; "ftp"]
74
75(** Special schemes (for other checks). *)
76let special_schemes = ["http"; "https"; "ftp"; "ws"; "wss"; "file"]
77
78(** Extract scheme from URL. *)
79let extract_scheme url =
80 (* A scheme must start with a letter, not [ or other special chars *)
81 if String.length url = 0 then None
82 else if not (url.[0] >= 'a' && url.[0] <= 'z' || url.[0] >= 'A' && url.[0] <= 'Z') then
83 None
84 else
85 try
86 let colon_pos = String.index url ':' in
87 (* Scheme can only contain letters, digits, +, -, . *)
88 let potential_scheme = String.sub url 0 colon_pos in
89 let is_valid_scheme = String.length potential_scheme > 0 &&
90 String.for_all (fun c ->
91 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
92 (c >= '0' && c <= '9') || c = '+' || c = '-' || c = '.'
93 ) potential_scheme in
94 if is_valid_scheme then
95 Some (String.lowercase_ascii potential_scheme)
96 else
97 None
98 with Not_found -> None
99
100(** Extract host and port from URL. Returns (host option, port_string option). *)
101let extract_host_and_port url =
102 try
103 let double_slash =
104 try Some (Str.search_forward (Str.regexp "://") url 0 + 3)
105 with Not_found -> None
106 in
107 match double_slash with
108 | None -> (None, None)
109 | Some start_pos ->
110 let rest = String.sub url start_pos (String.length url - start_pos) in
111 (* Find end of authority (/ ? # or end) *)
112 let auth_end =
113 let find_char c = try Some (String.index rest c) with Not_found -> None in
114 match find_char '/', find_char '?', find_char '#' with
115 | Some a, Some b, Some c -> min a (min b c)
116 | Some a, Some b, None -> min a b
117 | Some a, None, Some c -> min a c
118 | None, Some b, Some c -> min b c
119 | Some a, None, None -> a
120 | None, Some b, None -> b
121 | None, None, Some c -> c
122 | None, None, None -> String.length rest
123 in
124 let authority = String.sub rest 0 auth_end in
125 (* Remove userinfo if present *)
126 let host_port =
127 try
128 let at_pos = String.rindex authority '@' in
129 String.sub authority (at_pos + 1) (String.length authority - at_pos - 1)
130 with Not_found -> authority
131 in
132 (* Handle IPv6 addresses *)
133 if String.length host_port > 0 && host_port.[0] = '[' then begin
134 try
135 let bracket_end = String.index host_port ']' in
136 let host = String.sub host_port 0 (bracket_end + 1) in
137 let after_bracket = String.sub host_port (bracket_end + 1) (String.length host_port - bracket_end - 1) in
138 if String.length after_bracket > 0 && after_bracket.[0] = ':' then
139 (Some host, Some (String.sub after_bracket 1 (String.length after_bracket - 1)))
140 else
141 (Some host, None)
142 with Not_found -> (Some host_port, None)
143 end else begin
144 (* Regular host:port - use FIRST colon to separate host from port
145 (per WHATWG URL Standard for special schemes) *)
146 try
147 let colon_pos = String.index host_port ':' in
148 let host = String.sub host_port 0 colon_pos in
149 let port = String.sub host_port (colon_pos + 1) (String.length host_port - colon_pos - 1) in
150 (Some host, Some port)
151 with Not_found -> (Some host_port, None)
152 end
153 with _ -> (None, None)
154
155(** Check if character is a valid hex digit (for percent-decoding). *)
156let is_hex_digit_for_decode c =
157 (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
158
159(** Convert a hex character to its numeric value. *)
160let hex_value c =
161 if c >= '0' && c <= '9' then Char.code c - Char.code '0'
162 else if c >= 'a' && c <= 'f' then Char.code c - Char.code 'a' + 10
163 else if c >= 'A' && c <= 'F' then Char.code c - Char.code 'A' + 10
164 else 0
165
166(** Percent-decode a string. Returns the decoded bytes. *)
167let percent_decode s =
168 let buf = Buffer.create (String.length s) in
169 let len = String.length s in
170 let i = ref 0 in
171 while !i < len do
172 if s.[!i] = '%' && !i + 2 < len && is_hex_digit_for_decode s.[!i + 1] && is_hex_digit_for_decode s.[!i + 2] then begin
173 let byte = hex_value s.[!i + 1] * 16 + hex_value s.[!i + 2] in
174 Buffer.add_char buf (Char.chr byte);
175 i := !i + 3
176 end else begin
177 Buffer.add_char buf s.[!i];
178 incr i
179 end
180 done;
181 Buffer.contents buf
182
183(** Check if decoded bytes contain invalid Unicode noncharacters or surrogates.
184 These are forbidden in hostnames per WHATWG URL Standard.
185 - U+FDD0-U+FDEF: noncharacters
186 - U+FFFE, U+FFFF: noncharacters
187 - U+xFFFE, U+xFFFF for any plane (0x1FFFE, etc.)
188 - U+D800-U+DFFF: surrogate code points *)
189let contains_invalid_unicode bytes =
190 let len = String.length bytes in
191 let i = ref 0 in
192 while !i < len do
193 let c = Char.code bytes.[!i] in
194 if c < 128 then begin
195 (* ASCII - OK *)
196 incr i
197 end else if c >= 0xC0 && c < 0xE0 && !i + 1 < len then begin
198 (* 2-byte UTF-8 *)
199 let b1 = Char.code bytes.[!i + 1] in
200 (* let codepoint = ((c land 0x1F) lsl 6) lor (b1 land 0x3F) in *)
201 ignore b1;
202 i := !i + 2
203 end else if c >= 0xE0 && c < 0xF0 && !i + 2 < len then begin
204 (* 3-byte UTF-8 *)
205 let b1 = Char.code bytes.[!i + 1] in
206 let b2 = Char.code bytes.[!i + 2] in
207 let codepoint = ((c land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in
208 (* Check for surrogates (U+D800-U+DFFF) *)
209 if codepoint >= 0xD800 && codepoint <= 0xDFFF then
210 raise Exit;
211 (* Check for noncharacters in BMP *)
212 if codepoint >= 0xFDD0 && codepoint <= 0xFDEF then
213 raise Exit;
214 if codepoint = 0xFFFE || codepoint = 0xFFFF then
215 raise Exit;
216 i := !i + 3
217 end else if c >= 0xF0 && c < 0xF8 && !i + 3 < len then begin
218 (* 4-byte UTF-8 *)
219 let b1 = Char.code bytes.[!i + 1] in
220 let b2 = Char.code bytes.[!i + 2] in
221 let b3 = Char.code bytes.[!i + 3] in
222 let codepoint = ((c land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor
223 ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in
224 (* Check for noncharacters at end of each plane: U+1FFFE, U+1FFFF, U+2FFFE, etc. *)
225 if (codepoint land 0xFFFF) = 0xFFFE || (codepoint land 0xFFFF) = 0xFFFF then
226 raise Exit;
227 i := !i + 4
228 end else begin
229 (* Invalid UTF-8 or other - skip *)
230 incr i
231 end
232 done;
233 false
234
235(** Check if host contains invalid percent-encoded Unicode. *)
236let check_invalid_percent_encoded_unicode host url attr_name element_name =
237 try
238 let decoded = percent_decode host in
239 let _ = contains_invalid_unicode decoded in
240 None
241 with Exit ->
242 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.."
243 url attr_name element_name)
244
245(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *)
246let contains_percent_char s =
247 (* Check for ASCII percent *)
248 String.contains s '%' ||
249 (* Check for fullwidth percent (U+FF05 = 0xEF 0xBC 0x85 in UTF-8) *)
250 try
251 let _ = Str.search_forward (Str.regexp "\xef\xbc\x85") s 0 in
252 true
253 with Not_found -> false
254
255(** Check if decoded host contains forbidden characters.
256 Some URLs have percent-encoded fullwidth characters that decode to forbidden chars. *)
257let check_decoded_host_chars host url attr_name element_name =
258 let decoded = percent_decode host in
259 (* Check for % character in decoded host - this catches fullwidth percent signs etc. *)
260 if contains_percent_char decoded then
261 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%%\xe2\x80\x9d is not allowed."
262 url attr_name element_name)
263 else
264 None
265
266(** Validate port string. Returns error message or None. *)
267let validate_port port url attr_name element_name =
268 if port = "" then None
269 else begin
270 (* Check for invalid characters in port *)
271 let invalid_char = ref None in
272 String.iter (fun c ->
273 if !invalid_char = None && not (c >= '0' && c <= '9') then
274 invalid_char := Some c
275 ) port;
276 match !invalid_char with
277 | Some c ->
278 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in port: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
279 url attr_name element_name c)
280 | None ->
281 (* Check port range *)
282 try
283 let port_num = int_of_string port in
284 if port_num >= 65536 then
285 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Port number must be less than 65536."
286 url attr_name element_name)
287 else
288 None
289 with _ -> None
290 end
291
292(** Validate host string. Returns error message or None. *)
293let validate_host host url attr_name element_name scheme =
294 if is_ipv6_host host then
295 validate_ipv6_host host url attr_name element_name
296 else begin
297 (* Check for empty host *)
298 let requires_host = List.mem scheme special_schemes in
299 if host = "" && requires_host && scheme <> "file" then
300 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: empty host."
301 url attr_name element_name)
302 else
303 (* Check for invalid chars *)
304 let invalid_char =
305 List.find_opt (fun c -> String.contains host c) invalid_host_chars
306 in
307 match invalid_char with
308 | Some c ->
309 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
310 url attr_name element_name c)
311 | None ->
312 (* Check for | *)
313 if String.contains host '|' && not (is_pipe_allowed_in_host url host) then
314 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c|\xe2\x80\x9d is not allowed."
315 url attr_name element_name)
316 (* Check for backslash in host *)
317 else if String.contains host '\\' then
318 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed."
319 url attr_name element_name)
320 (* Check for space in host *)
321 else if String.contains host ' ' then
322 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: space is not allowed."
323 url attr_name element_name)
324 (* Check for invalid percent-encoded Unicode in host *)
325 else begin
326 match check_invalid_percent_encoded_unicode host url attr_name element_name with
327 | Some err -> Some err
328 | None ->
329 (* Check decoded host for forbidden chars like fullwidth percent *)
330 check_decoded_host_chars host url attr_name element_name
331 end
332 end
333
334(** Check if URL has special scheme requiring double slash. *)
335let check_special_scheme_double_slash url attr_name element_name =
336 match extract_scheme url with
337 | None -> None
338 | Some scheme ->
339 (* Only check for schemes that require //, not file: *)
340 if List.mem scheme special_schemes_require_double_slash then begin
341 (* Check if followed by :// *)
342 let colon_pos = String.index url ':' in
343 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
344 if String.length after_colon < 2 || after_colon.[0] <> '/' || after_colon.[1] <> '/' then
345 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a slash (\"/\")."
346 url attr_name element_name)
347 else
348 None
349 end else
350 None
351
352(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments).
353 The is_absolute_url parameter controls whether to use "Bad URL:" or "Bad absolute URL:" in the message. *)
354let check_data_uri_fragment ?(is_absolute_url=false) url attr_name element_name =
355 match extract_scheme url with
356 | None -> None
357 | Some scheme ->
358 if scheme = "data" && String.contains url '#' then
359 let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in
360 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397."
361 url attr_name element_name url_type)
362 else
363 None
364
365(** data: URLs cannot start with / (they have specific format: data:[mediatype][;base64],data) *)
366let data_scheme_no_slash = ["data"]
367
368(** Check for data: URL that incorrectly has a slash (data: URLs have specific format). *)
369let check_data_url_no_slash url attr_name element_name =
370 match extract_scheme url with
371 | None -> None
372 | Some scheme ->
373 if List.mem scheme data_scheme_no_slash then begin
374 let colon_pos = String.index url ':' in
375 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
376 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *)
377 if String.length after_colon > 0 && after_colon.[0] = '/' then
378 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead."
379 url attr_name element_name)
380 else
381 None
382 end else
383 None
384
385(** Check for illegal characters in scheme data (for non-special schemes). *)
386let check_scheme_data url attr_name element_name =
387 match extract_scheme url with
388 | None -> None
389 | Some scheme ->
390 if not (List.mem scheme special_schemes) then begin
391 (* Get scheme data (after the colon) *)
392 let colon_pos = String.index url ':' in
393 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
394 (* Check for tab in scheme data *)
395 if String.contains scheme_data '\t' then
396 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed."
397 url attr_name element_name)
398 (* Check for newline in scheme data *)
399 else if String.contains scheme_data '\n' then
400 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed."
401 url attr_name element_name)
402 (* Check for carriage return in scheme data *)
403 else if String.contains scheme_data '\r' then
404 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed."
405 url attr_name element_name)
406 (* Check for space in scheme data *)
407 else if String.contains scheme_data ' ' then
408 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed."
409 url attr_name element_name)
410 else
411 None
412 end else
413 None
414
415(** Remove query and fragment from path. *)
416let remove_query_fragment path =
417 let path = try String.sub path 0 (String.index path '?') with Not_found -> path in
418 try String.sub path 0 (String.index path '#') with Not_found -> path
419
420(** Check for illegal characters in path segment. *)
421let check_path_segment url attr_name element_name =
422 (* Extract path: everything after authority (or after scheme: for non-authority URLs) *)
423 let raw_path =
424 try
425 let double_slash = Str.search_forward (Str.regexp "://") url 0 in
426 let after_auth_start = double_slash + 3 in
427 let rest = String.sub url after_auth_start (String.length url - after_auth_start) in
428 (* Find end of authority *)
429 let path_start =
430 try String.index rest '/'
431 with Not_found -> String.length rest
432 in
433 if path_start < String.length rest then
434 String.sub rest path_start (String.length rest - path_start)
435 else
436 ""
437 with Not_found ->
438 (* No double slash - check for single slash path *)
439 match extract_scheme url with
440 | Some _ ->
441 let colon_pos = String.index url ':' in
442 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in
443 after_colon
444 | None ->
445 (* Relative URL - the whole thing is the path *)
446 url
447 in
448 (* Remove query and fragment for path-specific checks *)
449 let path = remove_query_fragment raw_path in
450 (* Check for space in path (not allowed) *)
451 if String.contains path ' ' then
452 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: space is not allowed."
453 url attr_name element_name)
454 (* Check for pipe in path (not allowed except in file:// authority) *)
455 else if String.contains path '|' then
456 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c|\xe2\x80\x9d is not allowed."
457 url attr_name element_name)
458 (* Check for unescaped square brackets in path *)
459 else if String.contains path '[' then
460 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed."
461 url attr_name element_name)
462 else
463 None
464
465(** Check for illegal characters in relative URL. *)
466let check_relative_url url attr_name element_name =
467 (* If URL has no scheme, it's relative *)
468 match extract_scheme url with
469 | Some _ -> None
470 | None ->
471 (* Check for square brackets at start (not IPv6 - that requires scheme) *)
472 if String.length url > 0 && url.[0] = '[' then
473 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed."
474 url attr_name element_name)
475 else
476 None
477
478(** Check if character is a valid hex digit. *)
479let is_hex_digit c =
480 (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
481
482(** Check for bare percent sign not followed by hex digits. *)
483let check_percent_encoding url attr_name element_name =
484 let len = String.length url in
485 let rec find_bare_percent i =
486 if i >= len then None
487 else if url.[i] = '%' then begin
488 (* Check if followed by two hex digits *)
489 if i + 2 < len && is_hex_digit url.[i + 1] && is_hex_digit url.[i + 2] then
490 find_bare_percent (i + 3) (* Valid percent encoding, continue *)
491 else
492 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Percentage (\xe2\x80\x9c%%\xe2\x80\x9d) is not followed by two hexadecimal digits."
493 url attr_name element_name)
494 end else
495 find_bare_percent (i + 1)
496 in
497 find_bare_percent 0
498
499(** Check for illegal characters in query string. *)
500let check_query_string url attr_name element_name =
501 try
502 let query_start = String.index url '?' in
503 let fragment_start =
504 try Some (String.index_from url query_start '#')
505 with Not_found -> None
506 in
507 let query_end = match fragment_start with
508 | Some pos -> pos
509 | None -> String.length url
510 in
511 let query = String.sub url (query_start + 1) (query_end - query_start - 1) in
512 (* Check for unescaped space in query *)
513 if String.contains query ' ' then
514 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in query: space is not allowed."
515 url attr_name element_name)
516 else
517 None
518 with Not_found -> None (* No query string *)
519
520(** Check for illegal characters in fragment. *)
521let check_fragment url attr_name element_name =
522 try
523 let fragment_start = String.index url '#' in
524 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in
525 (* Check for backslash in fragment *)
526 if String.contains fragment '\\' then
527 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed."
528 url attr_name element_name)
529 (* Check for second hash in fragment *)
530 else if String.contains fragment '#' then
531 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed."
532 url attr_name element_name)
533 (* Check for space in fragment *)
534 else if String.contains fragment ' ' then
535 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: space is not allowed."
536 url attr_name element_name)
537 else
538 None
539 with Not_found -> None (* No fragment *)
540
541(** Characters not allowed in userinfo (user:password) part of URL. *)
542let invalid_userinfo_chars = [']'; '['; '^'; '|'; '`'; '<'; '>']
543
544(** Check for illegal characters in userinfo (user:password). *)
545let check_userinfo url attr_name element_name =
546 try
547 (* Look for :// then find the LAST @ before the next / or end *)
548 let double_slash = Str.search_forward (Str.regexp "://") url 0 + 3 in
549 let rest = String.sub url double_slash (String.length url - double_slash) in
550 (* Find first / or ? or # to limit authority section *)
551 let auth_end =
552 let find_char c = try Some (String.index rest c) with Not_found -> None in
553 match find_char '/', find_char '?', find_char '#' with
554 | Some a, Some b, Some c -> min a (min b c)
555 | Some a, Some b, None -> min a b
556 | Some a, None, Some c -> min a c
557 | None, Some b, Some c -> min b c
558 | Some a, None, None -> a
559 | None, Some b, None -> b
560 | None, None, Some c -> c
561 | None, None, None -> String.length rest
562 in
563 let authority = String.sub rest 0 auth_end in
564 (* Find LAST @ in authority to separate userinfo from host *)
565 let at_pos =
566 try Some (String.rindex authority '@')
567 with Not_found -> None
568 in
569 match at_pos with
570 | None -> None (* No userinfo *)
571 | Some at ->
572 let userinfo = String.sub authority 0 at in
573 (* Check for @ in userinfo (should be percent-encoded) *)
574 if String.contains userinfo '@' then
575 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: User or password contains an at symbol (\xe2\x80\x9c@\xe2\x80\x9d) not percent-encoded."
576 url attr_name element_name)
577 (* Check for space *)
578 else if String.contains userinfo ' ' then
579 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed."
580 url attr_name element_name)
581 else begin
582 (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *)
583 let find_non_ascii_char userinfo =
584 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String userinfo) in
585 let rec find () =
586 match Uutf.decode decoder with
587 | `End | `Await -> None
588 | `Malformed _ -> find ()
589 | `Uchar uchar ->
590 let code = Uchar.to_int uchar in
591 (* Check if character is not allowed in userinfo *)
592 (* Per URL Standard: only ASCII letters, digits, and certain symbols allowed *)
593 if code > 127 then begin
594 let buf = Buffer.create 8 in
595 Buffer.add_utf_8_uchar buf uchar;
596 Some (Buffer.contents buf)
597 end else find ()
598 in
599 find ()
600 in
601 match find_non_ascii_char userinfo with
602 | Some bad_char ->
603 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed."
604 url attr_name element_name bad_char)
605 | None ->
606 (* Check for other invalid chars *)
607 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in
608 match invalid with
609 | Some c ->
610 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed."
611 url attr_name element_name c)
612 | None -> None
613 end
614 with _ -> None
615
616(** Attributes where empty URL is an error.
617 Note: href, cite, action can be empty (refers to current document).
618 formaction and src must be non-empty though. *)
619let must_be_non_empty = ["formaction"; "src"; "poster"; "data"]
620
621(** Element/attribute combinations where empty URL is an error. *)
622let must_be_non_empty_combinations = [
623 ("link", "href"); (* link href must be non-empty *)
624 ("form", "action"); (* form action must be non-empty *)
625]
626
627(** Check URL for common errors. Returns error message or None. *)
628let validate_url url element_name attr_name =
629 let original_url = url in
630 let url = String.trim url in
631 (* Empty URL check for certain attributes *)
632 if url = "" then begin
633 let name_lower = String.lowercase_ascii element_name in
634 let attr_lower = String.lowercase_ascii attr_name in
635 if List.mem attr_lower must_be_non_empty ||
636 List.mem (name_lower, attr_lower) must_be_non_empty_combinations then
637 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Must be non-empty."
638 original_url attr_name element_name)
639 else
640 None
641 end
642 else begin
643 (* Check for leading/trailing whitespace *)
644 if original_url <> url && (String.length original_url > 0) then
645 let has_leading = String.length original_url > 0 && (original_url.[0] = ' ' || original_url.[0] = '\t') in
646 let has_trailing = String.length original_url > 0 &&
647 let last = original_url.[String.length original_url - 1] in
648 last = ' ' || last = '\t' in
649 if has_leading || has_trailing then
650 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace."
651 original_url attr_name element_name)
652 else None
653 (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *)
654 else begin
655 match check_scheme_data url attr_name element_name with
656 | Some err -> Some err
657 | None ->
658 (* Check for newlines/tabs in special scheme URLs *)
659 if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then
660 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found."
661 url attr_name element_name)
662 else begin
663 (* Check for relative URL issues first *)
664 match check_relative_url url attr_name element_name with
665 | Some err -> Some err
666 | None ->
667
668 (* Check percent encoding *)
669 match check_percent_encoding url attr_name element_name with
670 | Some err -> Some err
671 | None ->
672
673 (* Check query string *)
674 match check_query_string url attr_name element_name with
675 | Some err -> Some err
676 | None ->
677
678 (* Check fragment *)
679 match check_fragment url attr_name element_name with
680 | Some err -> Some err
681 | None ->
682
683 (* Check userinfo *)
684 match check_userinfo url attr_name element_name with
685 | Some err -> Some err
686 | None ->
687
688 (* Check special scheme requires double slash *)
689 match check_special_scheme_double_slash url attr_name element_name with
690 | Some err -> Some err
691 | None ->
692
693 (* Check data: URLs don't start with slash *)
694 match check_data_url_no_slash url attr_name element_name with
695 | Some err -> Some err
696 | None ->
697
698 (* Check for backslash AFTER special scheme check *)
699 if String.contains url '\\' then
700 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Backslash (\"\\\") used as path segment delimiter."
701 url attr_name element_name)
702 else
703
704 (* Check path segment for illegal characters *)
705 match check_path_segment url attr_name element_name with
706 | Some err -> Some err
707 | None ->
708
709 let scheme = extract_scheme url in
710 let (host_opt, port_opt) = extract_host_and_port url in
711 let scheme_str = match scheme with Some s -> s | None -> "" in
712
713 (* Validate port if present *)
714 match port_opt with
715 | Some port ->
716 (match validate_port port url attr_name element_name with
717 | Some err -> Some err
718 | None ->
719 (* Also validate host *)
720 match host_opt with
721 | Some host -> validate_host host url attr_name element_name scheme_str
722 | None -> None)
723 | None ->
724 (* Just validate host *)
725 match host_opt with
726 | Some host -> validate_host host url attr_name element_name scheme_str
727 | None -> None
728 end
729 end
730 end
731
732(** Checker state. *)
733type state = unit
734
735let create () = ()
736let reset _state = ()
737
738(** Get attribute value by name. *)
739let get_attr_value name attrs =
740 List.find_map (fun (k, v) ->
741 if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None
742 ) attrs
743
744let start_element _state ~element collector =
745 match element.Element.tag with
746 | Tag.Html _ ->
747 let name = Tag.tag_to_string element.tag in
748 let name_lower = String.lowercase_ascii name in
749 let attrs = element.raw_attrs in
750 (* Check URL attributes for elements that have them *)
751 (match List.assoc_opt name_lower url_attributes with
752 | None -> ()
753 | Some url_attrs ->
754 List.iter (fun attr_name ->
755 (* Try to find the attribute - case insensitive *)
756 let url_opt = get_attr_value attr_name attrs in
757 match url_opt with
758 | None -> ()
759 | Some url ->
760 (match check_data_uri_fragment url attr_name name with
761 | Some warn_msg ->
762 Message_collector.add_typed collector (`Generic warn_msg)
763 | None -> ());
764 match validate_url url name attr_name with
765 | None -> ()
766 | Some error_msg ->
767 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg)))
768 ) url_attrs);
769 (* Special handling for input[type=url] value attribute - must be absolute URL *)
770 if name_lower = "input" then begin
771 let type_attr = get_attr_value "type" attrs in
772 if type_attr = Some "url" then begin
773 match get_attr_value "value" attrs with
774 | None -> ()
775 | Some url ->
776 let url = String.trim url in
777 if url = "" then ()
778 else begin
779 (* First check if it's an absolute URL (has a scheme) *)
780 let scheme = extract_scheme url in
781 match scheme with
782 | None ->
783 let msg = Printf.sprintf "Bad value %s for attribute %s on element %s: Bad absolute URL: The string %s is not an absolute URL."
784 (Error_code.q url) (Error_code.q "value") (Error_code.q "input") (Error_code.q url) in
785 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message msg)))
786 | Some _ ->
787 (match check_data_uri_fragment ~is_absolute_url:true url "value" name with
788 | Some warn_msg ->
789 Message_collector.add_typed collector (`Generic warn_msg)
790 | None -> ());
791 match validate_url url name "value" with
792 | None -> ()
793 | Some error_msg ->
794 let error_msg = Str.global_replace (Str.regexp "Bad URL:") "Bad absolute URL:" error_msg in
795 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg)))
796 end
797 end
798 end;
799 let itemtype_opt = get_attr_value "itemtype" attrs in
800 (match itemtype_opt with
801 | Some url when String.trim url <> "" ->
802 (match check_data_uri_fragment ~is_absolute_url:true url "itemtype" name with
803 | Some warn_msg -> Message_collector.add_typed collector (`Generic warn_msg)
804 | None -> ())
805 | _ -> ());
806 let itemid_opt = get_attr_value "itemid" attrs in
807 (match itemid_opt with
808 | Some url when String.trim url <> "" ->
809 (match check_data_uri_fragment url "itemid" name with
810 | Some warn_msg -> Message_collector.add_typed collector (`Generic warn_msg)
811 | None -> ())
812 | _ -> ())
813 | _ -> () (* Non-HTML elements *)
814
815let end_element _state ~tag:_ _collector = ()
816let characters _state _text _collector = ()
817let end_document _state _collector = ()
818
819let checker =
820 (module struct
821 type nonrec state = state
822 let create = create
823 let reset = reset
824 let start_element = start_element
825 let end_element = end_element
826 let characters = characters
827 let end_document = end_document
828 end : Checker.S)