OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 84a2203f96dd85d03a6a6da73df91e62c08db43a 828 lines 38 kB view raw
1(** URL validation checker for href, src, action, and other URL attributes. *) 2 3(** Attributes that contain URLs and should be validated. 4 Note: srcset uses special microsyntax, not validated as URL here. 5 Note: input[value] is only checked for type="url", handled specially below. *) 6let url_attributes = [ 7 ("a", ["href"]); 8 ("area", ["href"]); 9 ("audio", ["src"]); 10 ("base", ["href"]); 11 ("blockquote", ["cite"]); 12 ("button", ["formaction"]); 13 ("del", ["cite"]); 14 ("embed", ["src"]); 15 ("form", ["action"]); 16 ("iframe", ["src"]); 17 ("img", ["src"]); 18 ("input", ["formaction"; "src"]); 19 ("ins", ["cite"]); 20 ("link", ["href"]); 21 ("object", ["data"]); 22 ("q", ["cite"]); 23 ("script", ["src"]); 24 ("source", ["src"]); 25 ("track", ["src"]); 26 ("video", ["src"; "poster"]); 27] 28 29(** Characters not allowed in URL host. *) 30let invalid_host_chars = ['^'; '`'; '{'; '}'; '<'; '>'] 31 32(** Check if a host looks like an IPv6 address (starts with [). *) 33let is_ipv6_host host = 34 String.length host > 0 && host.[0] = '[' 35 36(** Check if character is valid in IPv6 address. *) 37let is_valid_ipv6_char c = 38 (c >= '0' && c <= '9') || 39 (c >= 'a' && c <= 'f') || 40 (c >= 'A' && c <= 'F') || 41 c = ':' || c = '.' || c = '[' || c = ']' 42 43(** Validate IPv6 bracketed host. *) 44let validate_ipv6_host host url attr_name element_name = 45 (* Host should be in format [xxxx:...] *) 46 if String.length host < 3 then 47 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character." 48 url attr_name element_name) 49 else begin 50 (* Check if all characters are valid IPv6 chars *) 51 let invalid_char = String.exists (fun c -> not (is_valid_ipv6_char c)) host in 52 if invalid_char then 53 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character." 54 url attr_name element_name) 55 else 56 None 57 end 58 59(** Check if a file URL host is a valid Windows drive letter (like C|). *) 60let is_valid_windows_drive host = 61 String.length host = 2 && 62 ((host.[0] >= 'A' && host.[0] <= 'Z') || (host.[0] >= 'a' && host.[0] <= 'z')) && 63 host.[1] = '|' 64 65(** Check if pipe is allowed in this host context. *) 66let is_pipe_allowed_in_host url host = 67 let scheme = try String.lowercase_ascii (String.sub url 0 (String.index url ':')) with _ -> "" in 68 scheme = "file" && is_valid_windows_drive host 69 70(** Special schemes that require double slash (//). 71 Note: file: is special but doesn't always require //. 72 Note: ws and wss allow single/no slash forms per WHATWG URL Standard. *) 73let special_schemes_require_double_slash = ["http"; "https"; "ftp"] 74 75(** Special schemes (for other checks). *) 76let special_schemes = ["http"; "https"; "ftp"; "ws"; "wss"; "file"] 77 78(** Extract scheme from URL. *) 79let extract_scheme url = 80 (* A scheme must start with a letter, not [ or other special chars *) 81 if String.length url = 0 then None 82 else if not (url.[0] >= 'a' && url.[0] <= 'z' || url.[0] >= 'A' && url.[0] <= 'Z') then 83 None 84 else 85 try 86 let colon_pos = String.index url ':' in 87 (* Scheme can only contain letters, digits, +, -, . *) 88 let potential_scheme = String.sub url 0 colon_pos in 89 let is_valid_scheme = String.length potential_scheme > 0 && 90 String.for_all (fun c -> 91 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || 92 (c >= '0' && c <= '9') || c = '+' || c = '-' || c = '.' 93 ) potential_scheme in 94 if is_valid_scheme then 95 Some (String.lowercase_ascii potential_scheme) 96 else 97 None 98 with Not_found -> None 99 100(** Extract host and port from URL. Returns (host option, port_string option). *) 101let extract_host_and_port url = 102 try 103 let double_slash = 104 try Some (Str.search_forward (Str.regexp "://") url 0 + 3) 105 with Not_found -> None 106 in 107 match double_slash with 108 | None -> (None, None) 109 | Some start_pos -> 110 let rest = String.sub url start_pos (String.length url - start_pos) in 111 (* Find end of authority (/ ? # or end) *) 112 let auth_end = 113 let find_char c = try Some (String.index rest c) with Not_found -> None in 114 match find_char '/', find_char '?', find_char '#' with 115 | Some a, Some b, Some c -> min a (min b c) 116 | Some a, Some b, None -> min a b 117 | Some a, None, Some c -> min a c 118 | None, Some b, Some c -> min b c 119 | Some a, None, None -> a 120 | None, Some b, None -> b 121 | None, None, Some c -> c 122 | None, None, None -> String.length rest 123 in 124 let authority = String.sub rest 0 auth_end in 125 (* Remove userinfo if present *) 126 let host_port = 127 try 128 let at_pos = String.rindex authority '@' in 129 String.sub authority (at_pos + 1) (String.length authority - at_pos - 1) 130 with Not_found -> authority 131 in 132 (* Handle IPv6 addresses *) 133 if String.length host_port > 0 && host_port.[0] = '[' then begin 134 try 135 let bracket_end = String.index host_port ']' in 136 let host = String.sub host_port 0 (bracket_end + 1) in 137 let after_bracket = String.sub host_port (bracket_end + 1) (String.length host_port - bracket_end - 1) in 138 if String.length after_bracket > 0 && after_bracket.[0] = ':' then 139 (Some host, Some (String.sub after_bracket 1 (String.length after_bracket - 1))) 140 else 141 (Some host, None) 142 with Not_found -> (Some host_port, None) 143 end else begin 144 (* Regular host:port - use FIRST colon to separate host from port 145 (per WHATWG URL Standard for special schemes) *) 146 try 147 let colon_pos = String.index host_port ':' in 148 let host = String.sub host_port 0 colon_pos in 149 let port = String.sub host_port (colon_pos + 1) (String.length host_port - colon_pos - 1) in 150 (Some host, Some port) 151 with Not_found -> (Some host_port, None) 152 end 153 with _ -> (None, None) 154 155(** Check if character is a valid hex digit (for percent-decoding). *) 156let is_hex_digit_for_decode c = 157 (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') 158 159(** Convert a hex character to its numeric value. *) 160let hex_value c = 161 if c >= '0' && c <= '9' then Char.code c - Char.code '0' 162 else if c >= 'a' && c <= 'f' then Char.code c - Char.code 'a' + 10 163 else if c >= 'A' && c <= 'F' then Char.code c - Char.code 'A' + 10 164 else 0 165 166(** Percent-decode a string. Returns the decoded bytes. *) 167let percent_decode s = 168 let buf = Buffer.create (String.length s) in 169 let len = String.length s in 170 let i = ref 0 in 171 while !i < len do 172 if s.[!i] = '%' && !i + 2 < len && is_hex_digit_for_decode s.[!i + 1] && is_hex_digit_for_decode s.[!i + 2] then begin 173 let byte = hex_value s.[!i + 1] * 16 + hex_value s.[!i + 2] in 174 Buffer.add_char buf (Char.chr byte); 175 i := !i + 3 176 end else begin 177 Buffer.add_char buf s.[!i]; 178 incr i 179 end 180 done; 181 Buffer.contents buf 182 183(** Check if decoded bytes contain invalid Unicode noncharacters or surrogates. 184 These are forbidden in hostnames per WHATWG URL Standard. 185 - U+FDD0-U+FDEF: noncharacters 186 - U+FFFE, U+FFFF: noncharacters 187 - U+xFFFE, U+xFFFF for any plane (0x1FFFE, etc.) 188 - U+D800-U+DFFF: surrogate code points *) 189let contains_invalid_unicode bytes = 190 let len = String.length bytes in 191 let i = ref 0 in 192 while !i < len do 193 let c = Char.code bytes.[!i] in 194 if c < 128 then begin 195 (* ASCII - OK *) 196 incr i 197 end else if c >= 0xC0 && c < 0xE0 && !i + 1 < len then begin 198 (* 2-byte UTF-8 *) 199 let b1 = Char.code bytes.[!i + 1] in 200 (* let codepoint = ((c land 0x1F) lsl 6) lor (b1 land 0x3F) in *) 201 ignore b1; 202 i := !i + 2 203 end else if c >= 0xE0 && c < 0xF0 && !i + 2 < len then begin 204 (* 3-byte UTF-8 *) 205 let b1 = Char.code bytes.[!i + 1] in 206 let b2 = Char.code bytes.[!i + 2] in 207 let codepoint = ((c land 0x0F) lsl 12) lor ((b1 land 0x3F) lsl 6) lor (b2 land 0x3F) in 208 (* Check for surrogates (U+D800-U+DFFF) *) 209 if codepoint >= 0xD800 && codepoint <= 0xDFFF then 210 raise Exit; 211 (* Check for noncharacters in BMP *) 212 if codepoint >= 0xFDD0 && codepoint <= 0xFDEF then 213 raise Exit; 214 if codepoint = 0xFFFE || codepoint = 0xFFFF then 215 raise Exit; 216 i := !i + 3 217 end else if c >= 0xF0 && c < 0xF8 && !i + 3 < len then begin 218 (* 4-byte UTF-8 *) 219 let b1 = Char.code bytes.[!i + 1] in 220 let b2 = Char.code bytes.[!i + 2] in 221 let b3 = Char.code bytes.[!i + 3] in 222 let codepoint = ((c land 0x07) lsl 18) lor ((b1 land 0x3F) lsl 12) lor 223 ((b2 land 0x3F) lsl 6) lor (b3 land 0x3F) in 224 (* Check for noncharacters at end of each plane: U+1FFFE, U+1FFFF, U+2FFFE, etc. *) 225 if (codepoint land 0xFFFF) = 0xFFFE || (codepoint land 0xFFFF) = 0xFFFF then 226 raise Exit; 227 i := !i + 4 228 end else begin 229 (* Invalid UTF-8 or other - skip *) 230 incr i 231 end 232 done; 233 false 234 235(** Check if host contains invalid percent-encoded Unicode. *) 236let check_invalid_percent_encoded_unicode host url attr_name element_name = 237 try 238 let decoded = percent_decode host in 239 let _ = contains_invalid_unicode decoded in 240 None 241 with Exit -> 242 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.." 243 url attr_name element_name) 244 245(** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *) 246let contains_percent_char s = 247 (* Check for ASCII percent *) 248 String.contains s '%' || 249 (* Check for fullwidth percent (U+FF05 = 0xEF 0xBC 0x85 in UTF-8) *) 250 try 251 let _ = Str.search_forward (Str.regexp "\xef\xbc\x85") s 0 in 252 true 253 with Not_found -> false 254 255(** Check if decoded host contains forbidden characters. 256 Some URLs have percent-encoded fullwidth characters that decode to forbidden chars. *) 257let check_decoded_host_chars host url attr_name element_name = 258 let decoded = percent_decode host in 259 (* Check for % character in decoded host - this catches fullwidth percent signs etc. *) 260 if contains_percent_char decoded then 261 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%%\xe2\x80\x9d is not allowed." 262 url attr_name element_name) 263 else 264 None 265 266(** Validate port string. Returns error message or None. *) 267let validate_port port url attr_name element_name = 268 if port = "" then None 269 else begin 270 (* Check for invalid characters in port *) 271 let invalid_char = ref None in 272 String.iter (fun c -> 273 if !invalid_char = None && not (c >= '0' && c <= '9') then 274 invalid_char := Some c 275 ) port; 276 match !invalid_char with 277 | Some c -> 278 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in port: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 279 url attr_name element_name c) 280 | None -> 281 (* Check port range *) 282 try 283 let port_num = int_of_string port in 284 if port_num >= 65536 then 285 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Port number must be less than 65536." 286 url attr_name element_name) 287 else 288 None 289 with _ -> None 290 end 291 292(** Validate host string. Returns error message or None. *) 293let validate_host host url attr_name element_name scheme = 294 if is_ipv6_host host then 295 validate_ipv6_host host url attr_name element_name 296 else begin 297 (* Check for empty host *) 298 let requires_host = List.mem scheme special_schemes in 299 if host = "" && requires_host && scheme <> "file" then 300 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: empty host." 301 url attr_name element_name) 302 else 303 (* Check for invalid chars *) 304 let invalid_char = 305 List.find_opt (fun c -> String.contains host c) invalid_host_chars 306 in 307 match invalid_char with 308 | Some c -> 309 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 310 url attr_name element_name c) 311 | None -> 312 (* Check for | *) 313 if String.contains host '|' && not (is_pipe_allowed_in_host url host) then 314 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c|\xe2\x80\x9d is not allowed." 315 url attr_name element_name) 316 (* Check for backslash in host *) 317 else if String.contains host '\\' then 318 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed." 319 url attr_name element_name) 320 (* Check for space in host *) 321 else if String.contains host ' ' then 322 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: space is not allowed." 323 url attr_name element_name) 324 (* Check for invalid percent-encoded Unicode in host *) 325 else begin 326 match check_invalid_percent_encoded_unicode host url attr_name element_name with 327 | Some err -> Some err 328 | None -> 329 (* Check decoded host for forbidden chars like fullwidth percent *) 330 check_decoded_host_chars host url attr_name element_name 331 end 332 end 333 334(** Check if URL has special scheme requiring double slash. *) 335let check_special_scheme_double_slash url attr_name element_name = 336 match extract_scheme url with 337 | None -> None 338 | Some scheme -> 339 (* Only check for schemes that require //, not file: *) 340 if List.mem scheme special_schemes_require_double_slash then begin 341 (* Check if followed by :// *) 342 let colon_pos = String.index url ':' in 343 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 344 if String.length after_colon < 2 || after_colon.[0] <> '/' || after_colon.[1] <> '/' then 345 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a slash (\"/\")." 346 url attr_name element_name) 347 else 348 None 349 end else 350 None 351 352(** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). 353 The is_absolute_url parameter controls whether to use "Bad URL:" or "Bad absolute URL:" in the message. *) 354let check_data_uri_fragment ?(is_absolute_url=false) url attr_name element_name = 355 match extract_scheme url with 356 | None -> None 357 | Some scheme -> 358 if scheme = "data" && String.contains url '#' then 359 let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in 360 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397." 361 url attr_name element_name url_type) 362 else 363 None 364 365(** data: URLs cannot start with / (they have specific format: data:[mediatype][;base64],data) *) 366let data_scheme_no_slash = ["data"] 367 368(** Check for data: URL that incorrectly has a slash (data: URLs have specific format). *) 369let check_data_url_no_slash url attr_name element_name = 370 match extract_scheme url with 371 | None -> None 372 | Some scheme -> 373 if List.mem scheme data_scheme_no_slash then begin 374 let colon_pos = String.index url ':' in 375 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 376 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *) 377 if String.length after_colon > 0 && after_colon.[0] = '/' then 378 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead." 379 url attr_name element_name) 380 else 381 None 382 end else 383 None 384 385(** Check for illegal characters in scheme data (for non-special schemes). *) 386let check_scheme_data url attr_name element_name = 387 match extract_scheme url with 388 | None -> None 389 | Some scheme -> 390 if not (List.mem scheme special_schemes) then begin 391 (* Get scheme data (after the colon) *) 392 let colon_pos = String.index url ':' in 393 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 394 (* Check for tab in scheme data *) 395 if String.contains scheme_data '\t' then 396 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed." 397 url attr_name element_name) 398 (* Check for newline in scheme data *) 399 else if String.contains scheme_data '\n' then 400 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 401 url attr_name element_name) 402 (* Check for carriage return in scheme data *) 403 else if String.contains scheme_data '\r' then 404 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 405 url attr_name element_name) 406 (* Check for space in scheme data *) 407 else if String.contains scheme_data ' ' then 408 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed." 409 url attr_name element_name) 410 else 411 None 412 end else 413 None 414 415(** Remove query and fragment from path. *) 416let remove_query_fragment path = 417 let path = try String.sub path 0 (String.index path '?') with Not_found -> path in 418 try String.sub path 0 (String.index path '#') with Not_found -> path 419 420(** Check for illegal characters in path segment. *) 421let check_path_segment url attr_name element_name = 422 (* Extract path: everything after authority (or after scheme: for non-authority URLs) *) 423 let raw_path = 424 try 425 let double_slash = Str.search_forward (Str.regexp "://") url 0 in 426 let after_auth_start = double_slash + 3 in 427 let rest = String.sub url after_auth_start (String.length url - after_auth_start) in 428 (* Find end of authority *) 429 let path_start = 430 try String.index rest '/' 431 with Not_found -> String.length rest 432 in 433 if path_start < String.length rest then 434 String.sub rest path_start (String.length rest - path_start) 435 else 436 "" 437 with Not_found -> 438 (* No double slash - check for single slash path *) 439 match extract_scheme url with 440 | Some _ -> 441 let colon_pos = String.index url ':' in 442 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 443 after_colon 444 | None -> 445 (* Relative URL - the whole thing is the path *) 446 url 447 in 448 (* Remove query and fragment for path-specific checks *) 449 let path = remove_query_fragment raw_path in 450 (* Check for space in path (not allowed) *) 451 if String.contains path ' ' then 452 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: space is not allowed." 453 url attr_name element_name) 454 (* Check for pipe in path (not allowed except in file:// authority) *) 455 else if String.contains path '|' then 456 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c|\xe2\x80\x9d is not allowed." 457 url attr_name element_name) 458 (* Check for unescaped square brackets in path *) 459 else if String.contains path '[' then 460 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed." 461 url attr_name element_name) 462 else 463 None 464 465(** Check for illegal characters in relative URL. *) 466let check_relative_url url attr_name element_name = 467 (* If URL has no scheme, it's relative *) 468 match extract_scheme url with 469 | Some _ -> None 470 | None -> 471 (* Check for square brackets at start (not IPv6 - that requires scheme) *) 472 if String.length url > 0 && url.[0] = '[' then 473 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed." 474 url attr_name element_name) 475 else 476 None 477 478(** Check if character is a valid hex digit. *) 479let is_hex_digit c = 480 (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') 481 482(** Check for bare percent sign not followed by hex digits. *) 483let check_percent_encoding url attr_name element_name = 484 let len = String.length url in 485 let rec find_bare_percent i = 486 if i >= len then None 487 else if url.[i] = '%' then begin 488 (* Check if followed by two hex digits *) 489 if i + 2 < len && is_hex_digit url.[i + 1] && is_hex_digit url.[i + 2] then 490 find_bare_percent (i + 3) (* Valid percent encoding, continue *) 491 else 492 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Percentage (\xe2\x80\x9c%%\xe2\x80\x9d) is not followed by two hexadecimal digits." 493 url attr_name element_name) 494 end else 495 find_bare_percent (i + 1) 496 in 497 find_bare_percent 0 498 499(** Check for illegal characters in query string. *) 500let check_query_string url attr_name element_name = 501 try 502 let query_start = String.index url '?' in 503 let fragment_start = 504 try Some (String.index_from url query_start '#') 505 with Not_found -> None 506 in 507 let query_end = match fragment_start with 508 | Some pos -> pos 509 | None -> String.length url 510 in 511 let query = String.sub url (query_start + 1) (query_end - query_start - 1) in 512 (* Check for unescaped space in query *) 513 if String.contains query ' ' then 514 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in query: space is not allowed." 515 url attr_name element_name) 516 else 517 None 518 with Not_found -> None (* No query string *) 519 520(** Check for illegal characters in fragment. *) 521let check_fragment url attr_name element_name = 522 try 523 let fragment_start = String.index url '#' in 524 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in 525 (* Check for backslash in fragment *) 526 if String.contains fragment '\\' then 527 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed." 528 url attr_name element_name) 529 (* Check for second hash in fragment *) 530 else if String.contains fragment '#' then 531 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed." 532 url attr_name element_name) 533 (* Check for space in fragment *) 534 else if String.contains fragment ' ' then 535 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: space is not allowed." 536 url attr_name element_name) 537 else 538 None 539 with Not_found -> None (* No fragment *) 540 541(** Characters not allowed in userinfo (user:password) part of URL. *) 542let invalid_userinfo_chars = [']'; '['; '^'; '|'; '`'; '<'; '>'] 543 544(** Check for illegal characters in userinfo (user:password). *) 545let check_userinfo url attr_name element_name = 546 try 547 (* Look for :// then find the LAST @ before the next / or end *) 548 let double_slash = Str.search_forward (Str.regexp "://") url 0 + 3 in 549 let rest = String.sub url double_slash (String.length url - double_slash) in 550 (* Find first / or ? or # to limit authority section *) 551 let auth_end = 552 let find_char c = try Some (String.index rest c) with Not_found -> None in 553 match find_char '/', find_char '?', find_char '#' with 554 | Some a, Some b, Some c -> min a (min b c) 555 | Some a, Some b, None -> min a b 556 | Some a, None, Some c -> min a c 557 | None, Some b, Some c -> min b c 558 | Some a, None, None -> a 559 | None, Some b, None -> b 560 | None, None, Some c -> c 561 | None, None, None -> String.length rest 562 in 563 let authority = String.sub rest 0 auth_end in 564 (* Find LAST @ in authority to separate userinfo from host *) 565 let at_pos = 566 try Some (String.rindex authority '@') 567 with Not_found -> None 568 in 569 match at_pos with 570 | None -> None (* No userinfo *) 571 | Some at -> 572 let userinfo = String.sub authority 0 at in 573 (* Check for @ in userinfo (should be percent-encoded) *) 574 if String.contains userinfo '@' then 575 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: User or password contains an at symbol (\xe2\x80\x9c@\xe2\x80\x9d) not percent-encoded." 576 url attr_name element_name) 577 (* Check for space *) 578 else if String.contains userinfo ' ' then 579 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed." 580 url attr_name element_name) 581 else begin 582 (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *) 583 let find_non_ascii_char userinfo = 584 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String userinfo) in 585 let rec find () = 586 match Uutf.decode decoder with 587 | `End | `Await -> None 588 | `Malformed _ -> find () 589 | `Uchar uchar -> 590 let code = Uchar.to_int uchar in 591 (* Check if character is not allowed in userinfo *) 592 (* Per URL Standard: only ASCII letters, digits, and certain symbols allowed *) 593 if code > 127 then begin 594 let buf = Buffer.create 8 in 595 Buffer.add_utf_8_uchar buf uchar; 596 Some (Buffer.contents buf) 597 end else find () 598 in 599 find () 600 in 601 match find_non_ascii_char userinfo with 602 | Some bad_char -> 603 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed." 604 url attr_name element_name bad_char) 605 | None -> 606 (* Check for other invalid chars *) 607 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in 608 match invalid with 609 | Some c -> 610 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 611 url attr_name element_name c) 612 | None -> None 613 end 614 with _ -> None 615 616(** Attributes where empty URL is an error. 617 Note: href, cite, action can be empty (refers to current document). 618 formaction and src must be non-empty though. *) 619let must_be_non_empty = ["formaction"; "src"; "poster"; "data"] 620 621(** Element/attribute combinations where empty URL is an error. *) 622let must_be_non_empty_combinations = [ 623 ("link", "href"); (* link href must be non-empty *) 624 ("form", "action"); (* form action must be non-empty *) 625] 626 627(** Check URL for common errors. Returns error message or None. *) 628let validate_url url element_name attr_name = 629 let original_url = url in 630 let url = String.trim url in 631 (* Empty URL check for certain attributes *) 632 if url = "" then begin 633 let name_lower = String.lowercase_ascii element_name in 634 let attr_lower = String.lowercase_ascii attr_name in 635 if List.mem attr_lower must_be_non_empty || 636 List.mem (name_lower, attr_lower) must_be_non_empty_combinations then 637 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Must be non-empty." 638 original_url attr_name element_name) 639 else 640 None 641 end 642 else begin 643 (* Check for leading/trailing whitespace *) 644 if original_url <> url && (String.length original_url > 0) then 645 let has_leading = String.length original_url > 0 && (original_url.[0] = ' ' || original_url.[0] = '\t') in 646 let has_trailing = String.length original_url > 0 && 647 let last = original_url.[String.length original_url - 1] in 648 last = ' ' || last = '\t' in 649 if has_leading || has_trailing then 650 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace." 651 original_url attr_name element_name) 652 else None 653 (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *) 654 else begin 655 match check_scheme_data url attr_name element_name with 656 | Some err -> Some err 657 | None -> 658 (* Check for newlines/tabs in special scheme URLs *) 659 if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then 660 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found." 661 url attr_name element_name) 662 else begin 663 (* Check for relative URL issues first *) 664 match check_relative_url url attr_name element_name with 665 | Some err -> Some err 666 | None -> 667 668 (* Check percent encoding *) 669 match check_percent_encoding url attr_name element_name with 670 | Some err -> Some err 671 | None -> 672 673 (* Check query string *) 674 match check_query_string url attr_name element_name with 675 | Some err -> Some err 676 | None -> 677 678 (* Check fragment *) 679 match check_fragment url attr_name element_name with 680 | Some err -> Some err 681 | None -> 682 683 (* Check userinfo *) 684 match check_userinfo url attr_name element_name with 685 | Some err -> Some err 686 | None -> 687 688 (* Check special scheme requires double slash *) 689 match check_special_scheme_double_slash url attr_name element_name with 690 | Some err -> Some err 691 | None -> 692 693 (* Check data: URLs don't start with slash *) 694 match check_data_url_no_slash url attr_name element_name with 695 | Some err -> Some err 696 | None -> 697 698 (* Check for backslash AFTER special scheme check *) 699 if String.contains url '\\' then 700 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Backslash (\"\\\") used as path segment delimiter." 701 url attr_name element_name) 702 else 703 704 (* Check path segment for illegal characters *) 705 match check_path_segment url attr_name element_name with 706 | Some err -> Some err 707 | None -> 708 709 let scheme = extract_scheme url in 710 let (host_opt, port_opt) = extract_host_and_port url in 711 let scheme_str = match scheme with Some s -> s | None -> "" in 712 713 (* Validate port if present *) 714 match port_opt with 715 | Some port -> 716 (match validate_port port url attr_name element_name with 717 | Some err -> Some err 718 | None -> 719 (* Also validate host *) 720 match host_opt with 721 | Some host -> validate_host host url attr_name element_name scheme_str 722 | None -> None) 723 | None -> 724 (* Just validate host *) 725 match host_opt with 726 | Some host -> validate_host host url attr_name element_name scheme_str 727 | None -> None 728 end 729 end 730 end 731 732(** Checker state. *) 733type state = unit 734 735let create () = () 736let reset _state = () 737 738(** Get attribute value by name. *) 739let get_attr_value name attrs = 740 List.find_map (fun (k, v) -> 741 if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None 742 ) attrs 743 744let start_element _state ~element collector = 745 match element.Element.tag with 746 | Tag.Html _ -> 747 let name = Tag.tag_to_string element.tag in 748 let name_lower = String.lowercase_ascii name in 749 let attrs = element.raw_attrs in 750 (* Check URL attributes for elements that have them *) 751 (match List.assoc_opt name_lower url_attributes with 752 | None -> () 753 | Some url_attrs -> 754 List.iter (fun attr_name -> 755 (* Try to find the attribute - case insensitive *) 756 let url_opt = get_attr_value attr_name attrs in 757 match url_opt with 758 | None -> () 759 | Some url -> 760 (match check_data_uri_fragment url attr_name name with 761 | Some warn_msg -> 762 Message_collector.add_typed collector (`Generic warn_msg) 763 | None -> ()); 764 match validate_url url name attr_name with 765 | None -> () 766 | Some error_msg -> 767 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg))) 768 ) url_attrs); 769 (* Special handling for input[type=url] value attribute - must be absolute URL *) 770 if name_lower = "input" then begin 771 let type_attr = get_attr_value "type" attrs in 772 if type_attr = Some "url" then begin 773 match get_attr_value "value" attrs with 774 | None -> () 775 | Some url -> 776 let url = String.trim url in 777 if url = "" then () 778 else begin 779 (* First check if it's an absolute URL (has a scheme) *) 780 let scheme = extract_scheme url in 781 match scheme with 782 | None -> 783 let msg = Printf.sprintf "Bad value %s for attribute %s on element %s: Bad absolute URL: The string %s is not an absolute URL." 784 (Error_code.q url) (Error_code.q "value") (Error_code.q "input") (Error_code.q url) in 785 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message msg))) 786 | Some _ -> 787 (match check_data_uri_fragment ~is_absolute_url:true url "value" name with 788 | Some warn_msg -> 789 Message_collector.add_typed collector (`Generic warn_msg) 790 | None -> ()); 791 match validate_url url name "value" with 792 | None -> () 793 | Some error_msg -> 794 let error_msg = Str.global_replace (Str.regexp "Bad URL:") "Bad absolute URL:" error_msg in 795 Message_collector.add_typed collector (`Attr (`Bad_value_generic (`Message error_msg))) 796 end 797 end 798 end; 799 let itemtype_opt = get_attr_value "itemtype" attrs in 800 (match itemtype_opt with 801 | Some url when String.trim url <> "" -> 802 (match check_data_uri_fragment ~is_absolute_url:true url "itemtype" name with 803 | Some warn_msg -> Message_collector.add_typed collector (`Generic warn_msg) 804 | None -> ()) 805 | _ -> ()); 806 let itemid_opt = get_attr_value "itemid" attrs in 807 (match itemid_opt with 808 | Some url when String.trim url <> "" -> 809 (match check_data_uri_fragment url "itemid" name with 810 | Some warn_msg -> Message_collector.add_typed collector (`Generic warn_msg) 811 | None -> ()) 812 | _ -> ()) 813 | _ -> () (* Non-HTML elements *) 814 815let end_element _state ~tag:_ _collector = () 816let characters _state _text _collector = () 817let end_document _state _collector = () 818 819let checker = 820 (module struct 821 type nonrec state = state 822 let create = create 823 let reset = reset 824 let start_element = start_element 825 let end_element = end_element 826 let characters = characters 827 let end_document = end_document 828 end : Checker.S)