OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+3 -1
lib/htmlrw_check/datatype/dt_language.ml
··· 1 (** Helper functions for language tag validation *) 2 3 let is_lower_alpha c = c >= 'a' && c <= 'z' 4 let is_upper_alpha c = c >= 'A' && c <= 'Z' 5 let is_alpha c = is_lower_alpha c || is_upper_alpha c ··· 123 if is_valid_extlang first_lower second_lower then 124 Ok () 125 else 126 - Error (Printf.sprintf "Bad extlang subtag \xe2\x80\x9c%s\xe2\x80\x9d" second_lower) 127 else 128 Ok () (* Not an extlang pattern, continue *) 129 | [] -> Ok ())
··· 1 (** Helper functions for language tag validation *) 2 3 + let q = Error_code.q 4 + 5 let is_lower_alpha c = c >= 'a' && c <= 'z' 6 let is_upper_alpha c = c >= 'A' && c <= 'Z' 7 let is_alpha c = is_lower_alpha c || is_upper_alpha c ··· 125 if is_valid_extlang first_lower second_lower then 126 Ok () 127 else 128 + Error (Printf.sprintf "Bad extlang subtag %s" (q second_lower)) 129 else 130 Ok () (* Not an extlang pattern, continue *) 131 | [] -> Ok ())
+8 -6
lib/htmlrw_check/parse_error_bridge.ml
··· 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6 (** Generate human-readable message for a parse error code *) 7 let message_of_parse_error code = 8 let code_str = Html5rw.Parse_error_code.to_string code in ··· 57 let cp = int_of_string ("0x" ^ cp_str) in 58 Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp 59 else if s = "no-p-element-in-scope" then 60 - "No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen." 61 else if s = "end-tag-p-implied-but-open-elements" then 62 - "End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements." 63 else if s = "end-tag-br" then 64 - "End tag \xe2\x80\x9cbr\xe2\x80\x9d." 65 else if s = "expected-closing-tag-but-got-eof" then 66 "End of file seen and there were open elements." 67 else if String.starts_with ~prefix:"bad-start-tag-in-head-noscri" s then 68 let colon_pos = String.index s ':' in 69 let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 70 - Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element 71 else if String.starts_with ~prefix:"unexpected-end-tag:" s then 72 let element = String.sub s 19 (String.length s - 19) in 73 - Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element 74 else if String.starts_with ~prefix:"start-tag-in-table:" s then 75 let tag = String.sub s 19 (String.length s - 19) in 76 - Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag 77 else 78 Printf.sprintf "Parse error: %s" s 79 with _ -> Printf.sprintf "Parse error: %s" s)
··· 3 SPDX-License-Identifier: MIT 4 ---------------------------------------------------------------------------*) 5 6 + let q = Error_code.q 7 + 8 (** Generate human-readable message for a parse error code *) 9 let message_of_parse_error code = 10 let code_str = Html5rw.Parse_error_code.to_string code in ··· 59 let cp = int_of_string ("0x" ^ cp_str) in 60 Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp 61 else if s = "no-p-element-in-scope" then 62 + Printf.sprintf "No %s element in scope but a %s end tag seen." (q "p") (q "p") 63 else if s = "end-tag-p-implied-but-open-elements" then 64 + Printf.sprintf "End tag %s implied, but there were open elements." (q "p") 65 else if s = "end-tag-br" then 66 + Printf.sprintf "End tag %s." (q "br") 67 else if s = "expected-closing-tag-but-got-eof" then 68 "End of file seen and there were open elements." 69 else if String.starts_with ~prefix:"bad-start-tag-in-head-noscri" s then 70 let colon_pos = String.index s ':' in 71 let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 72 + Printf.sprintf "Bad start tag in %s in %s in %s." (q element) (q "noscript") (q "head") 73 else if String.starts_with ~prefix:"unexpected-end-tag:" s then 74 let element = String.sub s 19 (String.length s - 19) in 75 + Printf.sprintf "Stray end tag %s." (q element) 76 else if String.starts_with ~prefix:"start-tag-in-table:" s then 77 let tag = String.sub s 19 (String.length s - 19) in 78 + Printf.sprintf "Start tag %s seen in %s." (q tag) (q "table") 79 else 80 Printf.sprintf "Parse error: %s" s 81 with _ -> Printf.sprintf "Parse error: %s" s)
+4 -2
lib/htmlrw_check/semantic/obsolete_checker.ml
··· 1 (** Obsolete elements map: element name -> suggestion message *) 2 let obsolete_elements = 3 let tbl = Hashtbl.create 32 in ··· 131 "Use the HTTP OPTIONS feature instead."; 132 133 register "name" ["a"] 134 - "Consider putting an \xe2\x80\x9cid\xe2\x80\x9d attribute on the nearest container instead."; 135 136 register "name" ["embed"; "img"; "option"] 137 - "Use the \xe2\x80\x9cid\xe2\x80\x9d attribute instead."; 138 139 register "nohref" ["area"] 140 "Omitting the \"href\" attribute is sufficient.";
··· 1 + let q = Error_code.q 2 + 3 (** Obsolete elements map: element name -> suggestion message *) 4 let obsolete_elements = 5 let tbl = Hashtbl.create 32 in ··· 133 "Use the HTTP OPTIONS feature instead."; 134 135 register "name" ["a"] 136 + (Printf.sprintf "Consider putting an %s attribute on the nearest container instead." (q "id")); 137 138 register "name" ["embed"; "img"; "option"] 139 + (Printf.sprintf "Use the %s attribute instead." (q "id")); 140 141 register "nohref" ["area"] 142 "Omitting the \"href\" attribute is sufficient.";
+2 -4
lib/htmlrw_check/semantic/required_attr_checker.ml
··· 1 (** Required attribute checker implementation. *) 2 3 type state = { 4 mutable _in_figure : bool; 5 (** Track if we're inside a <figure> element (alt is more critical there) *) ··· 81 in 82 83 if not valid then 84 - let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 85 Message_collector.add_typed collector 86 (`Generic (Printf.sprintf "A %s element must have either a %s attribute, a %s attribute with a %s attribute, or an %s attribute with a %s attribute." 87 (q "meta") (q "charset") (q "name") ··· 122 let value_lower = String.lowercase_ascii value in 123 (* Valid values: empty string, auto, manual, hint *) 124 if value_lower <> "" && value_lower <> "auto" && value_lower <> "manual" && value_lower <> "hint" then 125 - let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 126 Message_collector.add_typed collector 127 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s." 128 (q value) (q "popover") (q element_name))))) ··· 141 let value = float_of_string value_str in 142 let min_val = float_of_string min_str in 143 if min_val > value then 144 - let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 145 Message_collector.add_typed collector 146 (`Generic (Printf.sprintf "The value of the %s attribute must be less than or equal to the value of the %s attribute." 147 (q "min") (q "value"))) ··· 161 | Some max_str -> (try float_of_string max_str with _ -> 1.0) 162 in 163 if value > max_val then 164 - let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 165 (* Check which message to use based on whether max is present *) 166 if Attr_utils.has_attr "max" attrs then 167 Message_collector.add_typed collector
··· 1 (** Required attribute checker implementation. *) 2 3 + let q = Error_code.q 4 + 5 type state = { 6 mutable _in_figure : bool; 7 (** Track if we're inside a <figure> element (alt is more critical there) *) ··· 83 in 84 85 if not valid then 86 Message_collector.add_typed collector 87 (`Generic (Printf.sprintf "A %s element must have either a %s attribute, a %s attribute with a %s attribute, or an %s attribute with a %s attribute." 88 (q "meta") (q "charset") (q "name") ··· 123 let value_lower = String.lowercase_ascii value in 124 (* Valid values: empty string, auto, manual, hint *) 125 if value_lower <> "" && value_lower <> "auto" && value_lower <> "manual" && value_lower <> "hint" then 126 Message_collector.add_typed collector 127 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s." 128 (q value) (q "popover") (q element_name))))) ··· 141 let value = float_of_string value_str in 142 let min_val = float_of_string min_str in 143 if min_val > value then 144 Message_collector.add_typed collector 145 (`Generic (Printf.sprintf "The value of the %s attribute must be less than or equal to the value of the %s attribute." 146 (q "min") (q "value"))) ··· 160 | Some max_str -> (try float_of_string max_str with _ -> 1.0) 161 in 162 if value > max_val then 163 (* Check which message to use based on whether max is present *) 164 if Attr_utils.has_attr "max" attrs then 165 Message_collector.add_typed collector
+1 -1
lib/htmlrw_check/specialized/aria_checker.ml
··· 586 if aria_checked <> None then 587 Message_collector.add_typed collector 588 (`Aria (`Must_not_use (`Attr "aria-checked", `Elem "input", 589 - `Condition "a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9ccheckbox\xe2\x80\x9d"))) 590 | _ -> () 591 end; 592
··· 586 if aria_checked <> None then 587 Message_collector.add_typed collector 588 (`Aria (`Must_not_use (`Attr "aria-checked", `Elem "input", 589 + `Condition (Printf.sprintf "a %s attribute whose value is %s" (q "type") (q "checkbox"))))) 590 | _ -> () 591 end; 592
+24 -22
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
··· 1 (** Attribute restrictions checker - validates that certain attributes 2 are not used on elements where they're not allowed. *) 3 4 (** List of (element, [disallowed attributes]) pairs for HTML elements. *) 5 let disallowed_attrs_html = [ 6 (* Elements that cannot have href attribute (RDFa misuses) *) ··· 174 if attr_value = "#" then 175 Message_collector.add_typed collector 176 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 177 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d." 178 - attr_value attr_name name)))) 179 end 180 ) attrs 181 end; ··· 190 | Error msg -> 191 Message_collector.add_typed collector 192 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 193 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s" 194 - attr_value attr_name name msg)))) 195 end 196 ) attrs 197 end; ··· 213 (* Determine specific error message *) 214 let error_msg = 215 if String.length attr_value = 0 then 216 - Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer." 217 - attr_name name 218 else if String.contains attr_value '%' then 219 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 220 - attr_value attr_name name 221 else if String.length attr_value > 0 && attr_value.[0] = '-' then 222 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead." 223 - attr_value attr_name name 224 else 225 (* Find first non-digit character *) 226 let bad_char = ··· 234 in 235 match bad_char with 236 | Some c -> 237 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 238 - attr_value attr_name name c 239 | None -> 240 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit." 241 - attr_value attr_name name 242 in 243 Message_collector.add_typed collector 244 (`Attr (`Bad_value_generic (`Message error_msg))) ··· 377 if count_codepoints key > 1 then 378 Message_collector.add_typed collector 379 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 380 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character." 381 - attr_value attr_name name)))) 382 ) keys; 383 (* Check for duplicate keys *) 384 let rec find_duplicates seen = function ··· 387 if List.mem k seen then 388 Message_collector.add_typed collector 389 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 390 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique." 391 - attr_value attr_name name)))) 392 else 393 find_duplicates (k :: seen) rest 394 in ··· 405 if has_command && has_aria_expanded then 406 Message_collector.add_typed collector 407 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 408 - `Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute"))); 409 410 if has_popovertarget && has_aria_expanded then 411 Message_collector.add_typed collector 412 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 413 - `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute"))) 414 end; 415 416 (* Note: data-* uppercase check requires XML parsing which preserves case. ··· 432 | Error msg -> 433 Message_collector.add_typed collector 434 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 435 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s" 436 - attr_value attr_name name msg)))) 437 end 438 end 439 ) attrs
··· 1 (** Attribute restrictions checker - validates that certain attributes 2 are not used on elements where they're not allowed. *) 3 4 + let q = Error_code.q 5 + 6 (** List of (element, [disallowed attributes]) pairs for HTML elements. *) 7 let disallowed_attrs_html = [ 8 (* Elements that cannot have href attribute (RDFa misuses) *) ··· 176 if attr_value = "#" then 177 Message_collector.add_typed collector 178 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 179 + "Bad value %s for attribute %s on element %s: Bad hash-name reference: A hash-name reference must have at least one character after %s." 180 + (q attr_value) (q attr_name) (q name) (q "#"))))) 181 end 182 ) attrs 183 end; ··· 192 | Error msg -> 193 Message_collector.add_typed collector 194 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 195 + "Bad value %s for attribute %s on element %s: Bad MIME type: %s" 196 + (q attr_value) (q attr_name) (q name) msg)))) 197 end 198 ) attrs 199 end; ··· 215 (* Determine specific error message *) 216 let error_msg = 217 if String.length attr_value = 0 then 218 + Printf.sprintf "Bad value %s for attribute %s on element %s: The empty string is not a valid non-negative integer." 219 + (q "") (q attr_name) (q name) 220 else if String.contains attr_value '%' then 221 + Printf.sprintf "Bad value %s for attribute %s on element %s: Bad non-negative integer: Expected a digit but saw %s instead." 222 + (q attr_value) (q attr_name) (q name) (q "%") 223 else if String.length attr_value > 0 && attr_value.[0] = '-' then 224 + Printf.sprintf "Bad value %s for attribute %s on element %s: Bad non-negative integer: Expected a digit but saw %s instead." 225 + (q attr_value) (q attr_name) (q name) (q "-") 226 else 227 (* Find first non-digit character *) 228 let bad_char = ··· 236 in 237 match bad_char with 238 | Some c -> 239 + Printf.sprintf "Bad value %s for attribute %s on element %s: Bad non-negative integer: Expected a digit but saw %s instead." 240 + (q attr_value) (q attr_name) (q name) (q (String.make 1 c)) 241 | None -> 242 + Printf.sprintf "Bad value %s for attribute %s on element %s: Bad non-negative integer: Expected a digit." 243 + (q attr_value) (q attr_name) (q name) 244 in 245 Message_collector.add_typed collector 246 (`Attr (`Bad_value_generic (`Message error_msg))) ··· 379 if count_codepoints key > 1 then 380 Message_collector.add_typed collector 381 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 382 + "Bad value %s for attribute %s on element %s: Bad key label list: Key label has multiple characters. Each key label must be a single character." 383 + (q attr_value) (q attr_name) (q name))))) 384 ) keys; 385 (* Check for duplicate keys *) 386 let rec find_duplicates seen = function ··· 389 if List.mem k seen then 390 Message_collector.add_typed collector 391 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 392 + "Bad value %s for attribute %s on element %s: Bad key label list: Duplicate key label. Each key label must be unique." 393 + (q attr_value) (q attr_name) (q name))))) 394 else 395 find_duplicates (k :: seen) rest 396 in ··· 407 if has_command && has_aria_expanded then 408 Message_collector.add_typed collector 409 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 410 + `Condition (Printf.sprintf "a %s attribute" (q "command"))))); 411 412 if has_popovertarget && has_aria_expanded then 413 Message_collector.add_typed collector 414 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 415 + `Condition (Printf.sprintf "a %s attribute" (q "popovertarget"))))) 416 end; 417 418 (* Note: data-* uppercase check requires XML parsing which preserves case. ··· 434 | Error msg -> 435 Message_collector.add_typed collector 436 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 437 + "Bad value %s for attribute %s on element %s: Bad media query: %s" 438 + (q attr_value) (q attr_name) (q name) msg)))) 439 end 440 end 441 ) attrs
+18 -16
lib/htmlrw_check/specialized/datetime_checker.ml
··· 1 (** Datetime attribute validation checker *) 2 3 (** Elements that have datetime attribute *) 4 let datetime_elements = ["del"; "ins"; "time"] 5 ··· 346 if value <> String.trim value then begin 347 let tz_msg = "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." in 348 let date_msg = "Bad date: The literal did not satisfy the date format." in 349 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 350 - value attr_name element_name tz_msg date_msg) 351 end 352 else 353 (* Try datetime with timezone first *) ··· 355 | DtOk -> Ok (* Valid datetime with timezone *) 356 | DtWarning w -> 357 (* Valid but with warning - format matches Nu validator *) 358 - Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format." 359 - value attr_name element_name w) 360 | DtError tz_error -> 361 (* Try just date - valid for all elements *) 362 match validate_date value with ··· 365 if has_suspicious_year value || has_old_year value then begin 366 let date_msg = "Bad date: Year may be mistyped." in 367 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 368 - Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 369 - value attr_name element_name date_msg tz_msg) 370 end else 371 Ok (* Valid date with normal year *) 372 | (false, date_error) -> ··· 394 | (true, _) -> Ok (* Valid duration P... *) 395 | (false, _) -> 396 (* Use simplified message for time element matching Nu validator format *) 397 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad time-datetime: The literal did not satisfy the time-datetime format." 398 - value attr_name element_name) 399 end 400 else begin 401 (* del/ins only allow date or datetime-with-timezone *) ··· 426 (* Datetime first for: generic tz, tz hours error, time minute/hour errors, year errors 427 Date first for: "Month cannot be less than" date error, tz minutes error, fraction error *) 428 if is_month_less_than_error then 429 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 430 - value attr_name element_name date_msg tz_msg) 431 else if is_tz_minutes_error || is_fraction_error then 432 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 433 - value attr_name element_name date_msg tz_msg) 434 else if is_tz_hours_error || is_time_minute_or_hour_error || is_generic_tz then 435 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 436 - value attr_name element_name tz_msg date_msg) 437 else 438 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 439 - value attr_name element_name tz_msg date_msg) 440 end 441 442 (** Checker state *)
··· 1 (** Datetime attribute validation checker *) 2 3 + let q = Error_code.q 4 + 5 (** Elements that have datetime attribute *) 6 let datetime_elements = ["del"; "ins"; "time"] 7 ··· 348 if value <> String.trim value then begin 349 let tz_msg = "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." in 350 let date_msg = "Bad date: The literal did not satisfy the date format." in 351 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 352 + (q value) (q attr_name) (q element_name) tz_msg date_msg) 353 end 354 else 355 (* Try datetime with timezone first *) ··· 357 | DtOk -> Ok (* Valid datetime with timezone *) 358 | DtWarning w -> 359 (* Valid but with warning - format matches Nu validator *) 360 + Warning (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format." 361 + (q value) (q attr_name) (q element_name) w) 362 | DtError tz_error -> 363 (* Try just date - valid for all elements *) 364 match validate_date value with ··· 367 if has_suspicious_year value || has_old_year value then begin 368 let date_msg = "Bad date: Year may be mistyped." in 369 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 370 + Warning (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 371 + (q value) (q attr_name) (q element_name) date_msg tz_msg) 372 end else 373 Ok (* Valid date with normal year *) 374 | (false, date_error) -> ··· 396 | (true, _) -> Ok (* Valid duration P... *) 397 | (false, _) -> 398 (* Use simplified message for time element matching Nu validator format *) 399 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad time-datetime: The literal did not satisfy the time-datetime format." 400 + (q value) (q attr_name) (q element_name)) 401 end 402 else begin 403 (* del/ins only allow date or datetime-with-timezone *) ··· 428 (* Datetime first for: generic tz, tz hours error, time minute/hour errors, year errors 429 Date first for: "Month cannot be less than" date error, tz minutes error, fraction error *) 430 if is_month_less_than_error then 431 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 432 + (q value) (q attr_name) (q element_name) date_msg tz_msg) 433 else if is_tz_minutes_error || is_fraction_error then 434 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 435 + (q value) (q attr_name) (q element_name) date_msg tz_msg) 436 else if is_tz_hours_error || is_time_minute_or_hour_error || is_generic_tz then 437 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 438 + (q value) (q attr_name) (q element_name) tz_msg date_msg) 439 else 440 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 441 + (q value) (q attr_name) (q element_name) tz_msg date_msg) 442 end 443 444 (** Checker state *)
+9 -6
lib/htmlrw_check/specialized/microdata_checker.ml
··· 2 3 Validates HTML5 microdata attributes. *) 4 5 (** Information about an itemscope. *) 6 type item_scope = { 7 element : string; ··· 74 let url_trimmed = String.trim url in 75 if String.length url_trimmed = 0 then 76 Some (Printf.sprintf 77 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad absolute URL: Must be non-empty." 78 - original_value attr_name element) 79 else 80 (* First check if it has a scheme (required for absolute URL) *) 81 match Url_checker.extract_scheme url_trimmed with 82 | None -> 83 Some (Printf.sprintf 84 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad absolute URL: The string \xe2\x80\x9c%s\xe2\x80\x9d is not an absolute URL." 85 - original_value attr_name element url) 86 | Some _ -> 87 (* Has a scheme - do comprehensive URL validation *) 88 match Url_checker.validate_url url element attr_name with ··· 94 (* Escape backslashes in replacement string for Str.global_replace *) 95 let escaped_original = Str.global_replace (Str.regexp "\\\\") "\\\\\\\\" original_value in 96 let error_msg = Str.global_replace 97 - (Str.regexp_string (Printf.sprintf "\xe2\x80\x9c%s\xe2\x80\x9d for attribute" url)) 98 - (Printf.sprintf "\xe2\x80\x9c%s\xe2\x80\x9d for attribute" escaped_original) 99 error_msg in 100 Some error_msg 101
··· 2 3 Validates HTML5 microdata attributes. *) 4 5 + (** Quote helper for consistent message formatting. *) 6 + let q = Error_code.q 7 + 8 (** Information about an itemscope. *) 9 type item_scope = { 10 element : string; ··· 77 let url_trimmed = String.trim url in 78 if String.length url_trimmed = 0 then 79 Some (Printf.sprintf 80 + "Bad value %s for attribute %s on element %s: Bad absolute URL: Must be non-empty." 81 + (q original_value) (q attr_name) (q element)) 82 else 83 (* First check if it has a scheme (required for absolute URL) *) 84 match Url_checker.extract_scheme url_trimmed with 85 | None -> 86 Some (Printf.sprintf 87 + "Bad value %s for attribute %s on element %s: Bad absolute URL: The string %s is not an absolute URL." 88 + (q original_value) (q attr_name) (q element) (q url)) 89 | Some _ -> 90 (* Has a scheme - do comprehensive URL validation *) 91 match Url_checker.validate_url url element attr_name with ··· 97 (* Escape backslashes in replacement string for Str.global_replace *) 98 let escaped_original = Str.global_replace (Str.regexp "\\\\") "\\\\\\\\" original_value in 99 let error_msg = Str.global_replace 100 + (Str.regexp_string (Printf.sprintf "%s for attribute" (q url))) 101 + (Printf.sprintf "%s for attribute" (q escaped_original)) 102 error_msg in 103 Some error_msg 104
+32 -30
lib/htmlrw_check/specialized/mime_type_checker.ml
··· 2 3 Validates MIME type values in type attributes. *) 4 5 (** Validate a MIME type value. Returns error message or None. *) 6 let validate_mime_type value element attr_name = 7 let len = String.length value in 8 if len = 0 then 9 Some (Printf.sprintf 10 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Empty value." 11 - value attr_name element) 12 else if value.[len - 1] = ' ' || value.[len - 1] = '\t' then begin 13 (* Check if this is a semicolon followed by only whitespace *) 14 let semicolon_pos = try Some (String.index value ';') with Not_found -> None in ··· 18 let params_trimmed = String.trim params in 19 if params_trimmed = "" then 20 Some (Printf.sprintf 21 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 22 - value attr_name element) 23 else 24 Some (Printf.sprintf 25 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Extraneous trailing whitespace." 26 - value attr_name element) 27 | None -> 28 Some (Printf.sprintf 29 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Extraneous trailing whitespace." 30 - value attr_name element) 31 end 32 else if len > 0 && (value.[0] = ' ' || value.[0] = '\t') then 33 Some (Printf.sprintf 34 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Expected a token character but saw \xe2\x80\x9c \xe2\x80\x9d instead." 35 - value attr_name element) 36 else 37 (* Parse type/subtype *) 38 let slash_pos = try Some (String.index value '/') with Not_found -> None in ··· 43 (match semicolon_pos with 44 | Some _ -> 45 Some (Printf.sprintf 46 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 47 - value attr_name element) 48 | None -> 49 Some (Printf.sprintf 50 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 51 - value attr_name element)) 52 | Some slash_pos -> 53 (* Check for empty subtype *) 54 let after_slash = String.sub value (slash_pos + 1) (len - slash_pos - 1) in ··· 60 let subtype_trimmed = String.trim subtype in 61 if subtype_trimmed = "" then 62 Some (Printf.sprintf 63 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 64 - value attr_name element) 65 else if String.length subtype > 0 && subtype.[String.length subtype - 1] = ' ' then 66 (* Space before semicolon - also check parameter format *) 67 let semicolon_pos = try Some (String.index value ';') with Not_found -> None in ··· 72 let params_trimmed = String.trim params in 73 if params_trimmed = "" then 74 Some (Printf.sprintf 75 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 76 - value attr_name element) 77 else 78 (* Check for param_name=value format *) 79 let eq_pos = try Some (String.index params '=') with Not_found -> None in 80 (match eq_pos with 81 | None -> 82 Some (Printf.sprintf 83 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 84 - value attr_name element) 85 | Some _ -> None) 86 | None -> None) 87 else ··· 94 let params_trimmed = String.trim params in 95 if params_trimmed = "" then 96 Some (Printf.sprintf 97 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 98 - value attr_name element) 99 else 100 (* Check for param_name=value format *) 101 let eq_pos = try Some (String.index params '=') with Not_found -> None in 102 (match eq_pos with 103 | None -> 104 Some (Printf.sprintf 105 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 106 - value attr_name element) 107 | Some eq_pos -> 108 let param_value = String.sub params (eq_pos + 1) (String.length params - eq_pos - 1) in 109 let param_value_trimmed = String.trim param_value in 110 if param_value_trimmed = "" then 111 Some (Printf.sprintf 112 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 113 - value attr_name element) 114 else if param_value_trimmed.[0] = '"' then 115 (* Quoted string - check for closing quote *) 116 let quote_end = try Some (String.index_from param_value_trimmed 1 '"') with ··· 127 in 128 if has_backslash_at_end then 129 Some (Printf.sprintf 130 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Unfinished quoted string." 131 - value attr_name element) 132 else 133 Some (Printf.sprintf 134 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Unfinished quoted string." 135 - value attr_name element)) 136 else 137 None)) 138
··· 2 3 Validates MIME type values in type attributes. *) 4 5 + let q = Error_code.q 6 + 7 (** Validate a MIME type value. Returns error message or None. *) 8 let validate_mime_type value element attr_name = 9 let len = String.length value in 10 if len = 0 then 11 Some (Printf.sprintf 12 + "Bad value %s for attribute %s on element %s: Bad MIME type: Empty value." 13 + (q value) (q attr_name) (q element)) 14 else if value.[len - 1] = ' ' || value.[len - 1] = '\t' then begin 15 (* Check if this is a semicolon followed by only whitespace *) 16 let semicolon_pos = try Some (String.index value ';') with Not_found -> None in ··· 20 let params_trimmed = String.trim params in 21 if params_trimmed = "" then 22 Some (Printf.sprintf 23 + "Bad value %s for attribute %s on element %s: Bad MIME type: Semicolon seen but there was no parameter following it." 24 + (q value) (q attr_name) (q element)) 25 else 26 Some (Printf.sprintf 27 + "Bad value %s for attribute %s on element %s: Bad MIME type: Extraneous trailing whitespace." 28 + (q value) (q attr_name) (q element)) 29 | None -> 30 Some (Printf.sprintf 31 + "Bad value %s for attribute %s on element %s: Bad MIME type: Extraneous trailing whitespace." 32 + (q value) (q attr_name) (q element)) 33 end 34 else if len > 0 && (value.[0] = ' ' || value.[0] = '\t') then 35 Some (Printf.sprintf 36 + "Bad value %s for attribute %s on element %s: Bad MIME type: Expected a token character but saw %s instead." 37 + (q value) (q attr_name) (q element) (q " ")) 38 else 39 (* Parse type/subtype *) 40 let slash_pos = try Some (String.index value '/') with Not_found -> None in ··· 45 (match semicolon_pos with 46 | Some _ -> 47 Some (Printf.sprintf 48 + "Bad value %s for attribute %s on element %s: Bad MIME type: Subtype missing." 49 + (q value) (q attr_name) (q element)) 50 | None -> 51 Some (Printf.sprintf 52 + "Bad value %s for attribute %s on element %s: Bad MIME type: Subtype missing." 53 + (q value) (q attr_name) (q element))) 54 | Some slash_pos -> 55 (* Check for empty subtype *) 56 let after_slash = String.sub value (slash_pos + 1) (len - slash_pos - 1) in ··· 62 let subtype_trimmed = String.trim subtype in 63 if subtype_trimmed = "" then 64 Some (Printf.sprintf 65 + "Bad value %s for attribute %s on element %s: Bad MIME type: Subtype missing." 66 + (q value) (q attr_name) (q element)) 67 else if String.length subtype > 0 && subtype.[String.length subtype - 1] = ' ' then 68 (* Space before semicolon - also check parameter format *) 69 let semicolon_pos = try Some (String.index value ';') with Not_found -> None in ··· 74 let params_trimmed = String.trim params in 75 if params_trimmed = "" then 76 Some (Printf.sprintf 77 + "Bad value %s for attribute %s on element %s: Bad MIME type: Semicolon seen but there was no parameter following it." 78 + (q value) (q attr_name) (q element)) 79 else 80 (* Check for param_name=value format *) 81 let eq_pos = try Some (String.index params '=') with Not_found -> None in 82 (match eq_pos with 83 | None -> 84 Some (Printf.sprintf 85 + "Bad value %s for attribute %s on element %s: Bad MIME type: Parameter value missing." 86 + (q value) (q attr_name) (q element)) 87 | Some _ -> None) 88 | None -> None) 89 else ··· 96 let params_trimmed = String.trim params in 97 if params_trimmed = "" then 98 Some (Printf.sprintf 99 + "Bad value %s for attribute %s on element %s: Bad MIME type: Semicolon seen but there was no parameter following it." 100 + (q value) (q attr_name) (q element)) 101 else 102 (* Check for param_name=value format *) 103 let eq_pos = try Some (String.index params '=') with Not_found -> None in 104 (match eq_pos with 105 | None -> 106 Some (Printf.sprintf 107 + "Bad value %s for attribute %s on element %s: Bad MIME type: Parameter value missing." 108 + (q value) (q attr_name) (q element)) 109 | Some eq_pos -> 110 let param_value = String.sub params (eq_pos + 1) (String.length params - eq_pos - 1) in 111 let param_value_trimmed = String.trim param_value in 112 if param_value_trimmed = "" then 113 Some (Printf.sprintf 114 + "Bad value %s for attribute %s on element %s: Bad MIME type: Parameter value missing." 115 + (q value) (q attr_name) (q element)) 116 else if param_value_trimmed.[0] = '"' then 117 (* Quoted string - check for closing quote *) 118 let quote_end = try Some (String.index_from param_value_trimmed 1 '"') with ··· 129 in 130 if has_backslash_at_end then 131 Some (Printf.sprintf 132 + "Bad value %s for attribute %s on element %s: Bad MIME type: Unfinished quoted string." 133 + (q value) (q attr_name) (q element)) 134 else 135 Some (Printf.sprintf 136 + "Bad value %s for attribute %s on element %s: Bad MIME type: Unfinished quoted string." 137 + (q value) (q attr_name) (q element))) 138 else 139 None)) 140
+10 -8
lib/htmlrw_check/specialized/svg_checker.ml
··· 2 3 Validates SVG elements and attributes according to SVG 1.1/2 specifications. *) 4 5 type font_state = { 6 mutable has_missing_glyph : bool; 7 } ··· 292 if value <> svg_ns_url then 293 Message_collector.add_typed collector 294 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 295 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for the attribute \xe2\x80\x9cxmlns\xe2\x80\x9d (only \xe2\x80\x9c%s\xe2\x80\x9d permitted here)." 296 - value svg_ns_url)))) 297 | "xmlns:xlink" -> 298 if value <> "http://www.w3.org/1999/xlink" then 299 Message_collector.add_typed collector 300 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 301 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for the attribute \xe2\x80\x9cxmlns:link\xe2\x80\x9d (only \xe2\x80\x9chttp://www.w3.org/1999/xlink\xe2\x80\x9d permitted here)." 302 - value)))) 303 | _ when String.starts_with ~prefix:"xmlns:" attr && attr <> "xmlns:xlink" -> 304 (* Other xmlns declarations are not allowed in HTML-embedded SVG *) 305 Message_collector.add_typed collector ··· 324 let context = String.sub d !context_start (ctx_end - !context_start) in 325 Message_collector.add_typed collector 326 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 327 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cd\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad SVG path data: Expected command but found \xe2\x80\x9c#\xe2\x80\x9d (context: \xe2\x80\x9c%s\xe2\x80\x9d)." 328 - d element context)))); 329 i := len (* Stop processing *) 330 | _ -> 331 incr i ··· 344 let context = String.sub d ctx_start (flag_end - ctx_start) in 345 Message_collector.add_typed collector 346 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 347 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cd\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad SVG path data: Expected \xe2\x80\x9c0\xe2\x80\x9d or \xe2\x80\x9c1\xe2\x80\x9d for large-arc-flag for \xe2\x80\x9ca\xe2\x80\x9d command but found \xe2\x80\x9c%s\xe2\x80\x9d instead (context: \xe2\x80\x9c%s\xe2\x80\x9d)." 348 - d element flag context)))) 349 end 350 with Not_found -> () 351
··· 2 3 Validates SVG elements and attributes according to SVG 1.1/2 specifications. *) 4 5 + let q = Error_code.q 6 + 7 type font_state = { 8 mutable has_missing_glyph : bool; 9 } ··· 294 if value <> svg_ns_url then 295 Message_collector.add_typed collector 296 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 297 + "Bad value %s for the attribute %s (only %s permitted here)." 298 + (q value) (q "xmlns") (q svg_ns_url))))) 299 | "xmlns:xlink" -> 300 if value <> "http://www.w3.org/1999/xlink" then 301 Message_collector.add_typed collector 302 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 303 + "Bad value %s for the attribute %s (only %s permitted here)." 304 + (q value) (q "xmlns:link") (q "http://www.w3.org/1999/xlink"))))) 305 | _ when String.starts_with ~prefix:"xmlns:" attr && attr <> "xmlns:xlink" -> 306 (* Other xmlns declarations are not allowed in HTML-embedded SVG *) 307 Message_collector.add_typed collector ··· 326 let context = String.sub d !context_start (ctx_end - !context_start) in 327 Message_collector.add_typed collector 328 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 329 + "Bad value %s for attribute %s on element %s: Bad SVG path data: Expected command but found %s (context: %s)." 330 + (q d) (q "d") (q element) (q "#") (q context))))); 331 i := len (* Stop processing *) 332 | _ -> 333 incr i ··· 346 let context = String.sub d ctx_start (flag_end - ctx_start) in 347 Message_collector.add_typed collector 348 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 349 + "Bad value %s for attribute %s on element %s: Bad SVG path data: Expected %s or %s for large-arc-flag for %s command but found %s instead (context: %s)." 350 + (q d) (q "d") (q element) (q "0") (q "1") (q "a") (q flag) (q context))))) 351 end 352 with Not_found -> () 353
+73 -70
lib/htmlrw_check/specialized/url_checker.ml
··· 1 (** URL validation checker for href, src, action, and other URL attributes. *) 2 3 (** Attributes that contain URLs and should be validated. 4 Note: srcset uses special microsyntax, not validated as URL here. 5 Note: input[value] is only checked for type="url", handled specially below. *) ··· 44 let validate_ipv6_host host url attr_name element_name = 45 (* Host should be in format [xxxx:...] *) 46 if String.length host < 3 then 47 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character." 48 - url attr_name element_name) 49 else begin 50 (* Check if all characters are valid IPv6 chars *) 51 let invalid_char = String.exists (fun c -> not (is_valid_ipv6_char c)) host in 52 if invalid_char then 53 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character." 54 - url attr_name element_name) 55 else 56 None 57 end ··· 239 let _ = contains_invalid_unicode decoded in 240 None 241 with Exit -> 242 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.." 243 - url attr_name element_name) 244 245 (** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *) 246 let contains_percent_char s = ··· 258 let decoded = percent_decode host in 259 (* Check for % character in decoded host - this catches fullwidth percent signs etc. *) 260 if contains_percent_char decoded then 261 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%%\xe2\x80\x9d is not allowed." 262 - url attr_name element_name) 263 else 264 None 265 ··· 275 ) port; 276 match !invalid_char with 277 | Some c -> 278 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in port: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 279 - url attr_name element_name c) 280 | None -> 281 (* Check port range *) 282 try 283 let port_num = int_of_string port in 284 if port_num >= 65536 then 285 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Port number must be less than 65536." 286 - url attr_name element_name) 287 else 288 None 289 with _ -> None ··· 297 (* Check for empty host *) 298 let requires_host = List.mem scheme special_schemes in 299 if host = "" && requires_host && scheme <> "file" then 300 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: empty host." 301 - url attr_name element_name) 302 else 303 (* Check for invalid chars *) 304 let invalid_char = ··· 306 in 307 match invalid_char with 308 | Some c -> 309 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 310 - url attr_name element_name c) 311 | None -> 312 (* Check for | *) 313 if String.contains host '|' && not (is_pipe_allowed_in_host url host) then 314 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c|\xe2\x80\x9d is not allowed." 315 - url attr_name element_name) 316 (* Check for backslash in host *) 317 else if String.contains host '\\' then 318 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed." 319 - url attr_name element_name) 320 (* Check for space in host *) 321 else if String.contains host ' ' then 322 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: space is not allowed." 323 - url attr_name element_name) 324 (* Check for invalid percent-encoded Unicode in host *) 325 else begin 326 match check_invalid_percent_encoded_unicode host url attr_name element_name with ··· 342 let colon_pos = String.index url ':' in 343 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 344 if String.length after_colon < 2 || after_colon.[0] <> '/' || after_colon.[1] <> '/' then 345 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a slash (\"/\")." 346 - url attr_name element_name) 347 else 348 None 349 end else ··· 357 | Some scheme -> 358 if scheme = "data" && String.contains url '#' then 359 let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in 360 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397." 361 - url attr_name element_name url_type) 362 else 363 None 364 ··· 375 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 376 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *) 377 if String.length after_colon > 0 && after_colon.[0] = '/' then 378 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead." 379 - url attr_name element_name) 380 else 381 None 382 end else ··· 393 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 394 (* Check for tab in scheme data *) 395 if String.contains scheme_data '\t' then 396 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed." 397 - url attr_name element_name) 398 (* Check for newline in scheme data *) 399 else if String.contains scheme_data '\n' then 400 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 401 - url attr_name element_name) 402 (* Check for carriage return in scheme data *) 403 else if String.contains scheme_data '\r' then 404 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 405 - url attr_name element_name) 406 (* Check for space in scheme data *) 407 else if String.contains scheme_data ' ' then 408 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed." 409 - url attr_name element_name) 410 else 411 None 412 end else ··· 449 let path = remove_query_fragment raw_path in 450 (* Check for space in path (not allowed) *) 451 if String.contains path ' ' then 452 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: space is not allowed." 453 - url attr_name element_name) 454 (* Check for pipe in path (not allowed except in file:// authority) *) 455 else if String.contains path '|' then 456 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c|\xe2\x80\x9d is not allowed." 457 - url attr_name element_name) 458 (* Check for unescaped square brackets in path *) 459 else if String.contains path '[' then 460 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed." 461 - url attr_name element_name) 462 else 463 None 464 ··· 470 | None -> 471 (* Check for square brackets at start (not IPv6 - that requires scheme) *) 472 if String.length url > 0 && url.[0] = '[' then 473 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed." 474 - url attr_name element_name) 475 else 476 None 477 ··· 489 if i + 2 < len && is_hex_digit url.[i + 1] && is_hex_digit url.[i + 2] then 490 find_bare_percent (i + 3) (* Valid percent encoding, continue *) 491 else 492 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Percentage (\xe2\x80\x9c%%\xe2\x80\x9d) is not followed by two hexadecimal digits." 493 - url attr_name element_name) 494 end else 495 find_bare_percent (i + 1) 496 in ··· 511 let query = String.sub url (query_start + 1) (query_end - query_start - 1) in 512 (* Check for unescaped space in query *) 513 if String.contains query ' ' then 514 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in query: space is not allowed." 515 - url attr_name element_name) 516 else 517 None 518 with Not_found -> None (* No query string *) ··· 524 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in 525 (* Check for backslash in fragment *) 526 if String.contains fragment '\\' then 527 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed." 528 - url attr_name element_name) 529 (* Check for second hash in fragment *) 530 else if String.contains fragment '#' then 531 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed." 532 - url attr_name element_name) 533 (* Check for space in fragment *) 534 else if String.contains fragment ' ' then 535 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: space is not allowed." 536 - url attr_name element_name) 537 else 538 None 539 with Not_found -> None (* No fragment *) ··· 572 let userinfo = String.sub authority 0 at in 573 (* Check for @ in userinfo (should be percent-encoded) *) 574 if String.contains userinfo '@' then 575 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: User or password contains an at symbol (\xe2\x80\x9c@\xe2\x80\x9d) not percent-encoded." 576 - url attr_name element_name) 577 (* Check for space *) 578 else if String.contains userinfo ' ' then 579 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed." 580 - url attr_name element_name) 581 else begin 582 (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *) 583 let find_non_ascii_char userinfo = ··· 600 in 601 match find_non_ascii_char userinfo with 602 | Some bad_char -> 603 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed." 604 - url attr_name element_name bad_char) 605 | None -> 606 (* Check for other invalid chars *) 607 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in 608 match invalid with 609 | Some c -> 610 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 611 - url attr_name element_name c) 612 | None -> None 613 end 614 with _ -> None ··· 634 let attr_lower = String.lowercase_ascii attr_name in 635 if List.mem attr_lower must_be_non_empty || 636 List.mem (name_lower, attr_lower) must_be_non_empty_combinations then 637 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Must be non-empty." 638 - original_url attr_name element_name) 639 else 640 None 641 end ··· 647 let last = original_url.[String.length original_url - 1] in 648 last = ' ' || last = '\t' in 649 if has_leading || has_trailing then 650 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace." 651 - original_url attr_name element_name) 652 else None 653 (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *) 654 else begin ··· 657 | None -> 658 (* Check for newlines/tabs in special scheme URLs *) 659 if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then 660 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found." 661 - url attr_name element_name) 662 else begin 663 (* Check for relative URL issues first *) 664 match check_relative_url url attr_name element_name with ··· 697 698 (* Check for backslash AFTER special scheme check *) 699 if String.contains url '\\' then 700 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Backslash (\"\\\") used as path segment delimiter." 701 - url attr_name element_name) 702 else 703 704 (* Check path segment for illegal characters *)
··· 1 (** URL validation checker for href, src, action, and other URL attributes. *) 2 3 + (** Quote helper for consistent message formatting. *) 4 + let q = Error_code.q 5 + 6 (** Attributes that contain URLs and should be validated. 7 Note: srcset uses special microsyntax, not validated as URL here. 8 Note: input[value] is only checked for type="url", handled specially below. *) ··· 47 let validate_ipv6_host host url attr_name element_name = 48 (* Host should be in format [xxxx:...] *) 49 if String.length host < 3 then 50 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character." 51 + (q url) (q attr_name) (q element_name)) 52 else begin 53 (* Check if all characters are valid IPv6 chars *) 54 let invalid_char = String.exists (fun c -> not (is_valid_ipv6_char c)) host in 55 if invalid_char then 56 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character." 57 + (q url) (q attr_name) (q element_name)) 58 else 59 None 60 end ··· 242 let _ = contains_invalid_unicode decoded in 243 None 244 with Exit -> 245 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: A label or domain name contains disallowed characters.." 246 + (q url) (q attr_name) (q element_name)) 247 248 (** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *) 249 let contains_percent_char s = ··· 261 let decoded = percent_decode host in 262 (* Check for % character in decoded host - this catches fullwidth percent signs etc. *) 263 if contains_percent_char decoded then 264 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: %s is not allowed." 265 + (q url) (q attr_name) (q element_name) (q "%")) 266 else 267 None 268 ··· 278 ) port; 279 match !invalid_char with 280 | Some c -> 281 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in port: %s is not allowed." 282 + (q url) (q attr_name) (q element_name) (q (String.make 1 c))) 283 | None -> 284 (* Check port range *) 285 try 286 let port_num = int_of_string port in 287 if port_num >= 65536 then 288 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Port number must be less than 65536." 289 + (q url) (q attr_name) (q element_name)) 290 else 291 None 292 with _ -> None ··· 300 (* Check for empty host *) 301 let requires_host = List.mem scheme special_schemes in 302 if host = "" && requires_host && scheme <> "file" then 303 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: empty host." 304 + (q url) (q attr_name) (q element_name)) 305 else 306 (* Check for invalid chars *) 307 let invalid_char = ··· 309 in 310 match invalid_char with 311 | Some c -> 312 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: %s is not allowed." 313 + (q url) (q attr_name) (q element_name) (q (String.make 1 c))) 314 | None -> 315 (* Check for | *) 316 if String.contains host '|' && not (is_pipe_allowed_in_host url host) then 317 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: %s is not allowed." 318 + (q url) (q attr_name) (q element_name) (q "|")) 319 (* Check for backslash in host *) 320 else if String.contains host '\\' then 321 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: %s is not allowed." 322 + (q url) (q attr_name) (q element_name) (q "\\")) 323 (* Check for space in host *) 324 else if String.contains host ' ' then 325 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: space is not allowed." 326 + (q url) (q attr_name) (q element_name)) 327 (* Check for invalid percent-encoded Unicode in host *) 328 else begin 329 match check_invalid_percent_encoded_unicode host url attr_name element_name with ··· 345 let colon_pos = String.index url ':' in 346 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 347 if String.length after_colon < 2 || after_colon.[0] <> '/' || after_colon.[1] <> '/' then 348 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Expected a slash (\"/\")." 349 + (q url) (q attr_name) (q element_name)) 350 else 351 None 352 end else ··· 360 | Some scheme -> 361 if scheme = "data" && String.contains url '#' then 362 let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in 363 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: %s Fragment is not allowed for data: URIs according to RFC 2397." 364 + (q url) (q attr_name) (q element_name) url_type) 365 else 366 None 367 ··· 378 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 379 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *) 380 if String.length after_colon > 0 && after_colon.[0] = '/' then 381 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Expected a token character or a semicolon but saw %s instead." 382 + (q url) (q attr_name) (q element_name) (q "/")) 383 else 384 None 385 end else ··· 396 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 397 (* Check for tab in scheme data *) 398 if String.contains scheme_data '\t' then 399 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in scheme data: tab is not allowed." 400 + (q url) (q attr_name) (q element_name)) 401 (* Check for newline in scheme data *) 402 else if String.contains scheme_data '\n' then 403 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in scheme data: line break is not allowed." 404 + (q url) (q attr_name) (q element_name)) 405 (* Check for carriage return in scheme data *) 406 else if String.contains scheme_data '\r' then 407 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in scheme data: line break is not allowed." 408 + (q url) (q attr_name) (q element_name)) 409 (* Check for space in scheme data *) 410 else if String.contains scheme_data ' ' then 411 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in scheme data: space is not allowed." 412 + (q url) (q attr_name) (q element_name)) 413 else 414 None 415 end else ··· 452 let path = remove_query_fragment raw_path in 453 (* Check for space in path (not allowed) *) 454 if String.contains path ' ' then 455 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in path segment: space is not allowed." 456 + (q url) (q attr_name) (q element_name)) 457 (* Check for pipe in path (not allowed except in file:// authority) *) 458 else if String.contains path '|' then 459 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in path segment: %s is not allowed." 460 + (q url) (q attr_name) (q element_name) (q "|")) 461 (* Check for unescaped square brackets in path *) 462 else if String.contains path '[' then 463 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in path segment: %s is not allowed." 464 + (q url) (q attr_name) (q element_name) (q "[")) 465 else 466 None 467 ··· 473 | None -> 474 (* Check for square brackets at start (not IPv6 - that requires scheme) *) 475 if String.length url > 0 && url.[0] = '[' then 476 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in path segment: %s is not allowed." 477 + (q url) (q attr_name) (q element_name) (q "[")) 478 else 479 None 480 ··· 492 if i + 2 < len && is_hex_digit url.[i + 1] && is_hex_digit url.[i + 2] then 493 find_bare_percent (i + 3) (* Valid percent encoding, continue *) 494 else 495 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Percentage (%s) is not followed by two hexadecimal digits." 496 + (q url) (q attr_name) (q element_name) (q "%")) 497 end else 498 find_bare_percent (i + 1) 499 in ··· 514 let query = String.sub url (query_start + 1) (query_end - query_start - 1) in 515 (* Check for unescaped space in query *) 516 if String.contains query ' ' then 517 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in query: space is not allowed." 518 + (q url) (q attr_name) (q element_name)) 519 else 520 None 521 with Not_found -> None (* No query string *) ··· 527 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in 528 (* Check for backslash in fragment *) 529 if String.contains fragment '\\' then 530 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in fragment: %s is not allowed." 531 + (q url) (q attr_name) (q element_name) (q "\\")) 532 (* Check for second hash in fragment *) 533 else if String.contains fragment '#' then 534 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in fragment: %s is not allowed." 535 + (q url) (q attr_name) (q element_name) (q "#")) 536 (* Check for space in fragment *) 537 else if String.contains fragment ' ' then 538 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in fragment: space is not allowed." 539 + (q url) (q attr_name) (q element_name)) 540 else 541 None 542 with Not_found -> None (* No fragment *) ··· 575 let userinfo = String.sub authority 0 at in 576 (* Check for @ in userinfo (should be percent-encoded) *) 577 if String.contains userinfo '@' then 578 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: User or password contains an at symbol (%s) not percent-encoded." 579 + (q url) (q attr_name) (q element_name) (q "@")) 580 (* Check for space *) 581 else if String.contains userinfo ' ' then 582 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in user or password: space is not allowed." 583 + (q url) (q attr_name) (q element_name)) 584 else begin 585 (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *) 586 let find_non_ascii_char userinfo = ··· 603 in 604 match find_non_ascii_char userinfo with 605 | Some bad_char -> 606 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in user or password: %s is not allowed." 607 + (q url) (q attr_name) (q element_name) (q bad_char)) 608 | None -> 609 (* Check for other invalid chars *) 610 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in 611 match invalid with 612 | Some c -> 613 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in user or password: %s is not allowed." 614 + (q url) (q attr_name) (q element_name) (q (String.make 1 c))) 615 | None -> None 616 end 617 with _ -> None ··· 637 let attr_lower = String.lowercase_ascii attr_name in 638 if List.mem attr_lower must_be_non_empty || 639 List.mem (name_lower, attr_lower) must_be_non_empty_combinations then 640 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Must be non-empty." 641 + (q original_url) (q attr_name) (q element_name)) 642 else 643 None 644 end ··· 650 let last = original_url.[String.length original_url - 1] in 651 last = ' ' || last = '\t' in 652 if has_leading || has_trailing then 653 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character: leading/trailing ASCII whitespace." 654 + (q original_url) (q attr_name) (q element_name)) 655 else None 656 (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *) 657 else begin ··· 660 | None -> 661 (* Check for newlines/tabs in special scheme URLs *) 662 if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then 663 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Tab, new line or carriage return found." 664 + (q url) (q attr_name) (q element_name)) 665 else begin 666 (* Check for relative URL issues first *) 667 match check_relative_url url attr_name element_name with ··· 700 701 (* Check for backslash AFTER special scheme check *) 702 if String.contains url '\\' then 703 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Backslash (\"\\\") used as path segment delimiter." 704 + (q url) (q attr_name) (q element_name)) 705 else 706 707 (* Check path segment for illegal characters *)