OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+3 -1
lib/htmlrw_check/datatype/dt_language.ml
··· 1 1 (** Helper functions for language tag validation *) 2 2 3 + let q = Error_code.q 4 + 3 5 let is_lower_alpha c = c >= 'a' && c <= 'z' 4 6 let is_upper_alpha c = c >= 'A' && c <= 'Z' 5 7 let is_alpha c = is_lower_alpha c || is_upper_alpha c ··· 123 125 if is_valid_extlang first_lower second_lower then 124 126 Ok () 125 127 else 126 - Error (Printf.sprintf "Bad extlang subtag \xe2\x80\x9c%s\xe2\x80\x9d" second_lower) 128 + Error (Printf.sprintf "Bad extlang subtag %s" (q second_lower)) 127 129 else 128 130 Ok () (* Not an extlang pattern, continue *) 129 131 | [] -> Ok ())
+8 -6
lib/htmlrw_check/parse_error_bridge.ml
··· 3 3 SPDX-License-Identifier: MIT 4 4 ---------------------------------------------------------------------------*) 5 5 6 + let q = Error_code.q 7 + 6 8 (** Generate human-readable message for a parse error code *) 7 9 let message_of_parse_error code = 8 10 let code_str = Html5rw.Parse_error_code.to_string code in ··· 57 59 let cp = int_of_string ("0x" ^ cp_str) in 58 60 Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp 59 61 else if s = "no-p-element-in-scope" then 60 - "No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen." 62 + Printf.sprintf "No %s element in scope but a %s end tag seen." (q "p") (q "p") 61 63 else if s = "end-tag-p-implied-but-open-elements" then 62 - "End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements." 64 + Printf.sprintf "End tag %s implied, but there were open elements." (q "p") 63 65 else if s = "end-tag-br" then 64 - "End tag \xe2\x80\x9cbr\xe2\x80\x9d." 66 + Printf.sprintf "End tag %s." (q "br") 65 67 else if s = "expected-closing-tag-but-got-eof" then 66 68 "End of file seen and there were open elements." 67 69 else if String.starts_with ~prefix:"bad-start-tag-in-head-noscri" s then 68 70 let colon_pos = String.index s ':' in 69 71 let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 70 - Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element 72 + Printf.sprintf "Bad start tag in %s in %s in %s." (q element) (q "noscript") (q "head") 71 73 else if String.starts_with ~prefix:"unexpected-end-tag:" s then 72 74 let element = String.sub s 19 (String.length s - 19) in 73 - Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element 75 + Printf.sprintf "Stray end tag %s." (q element) 74 76 else if String.starts_with ~prefix:"start-tag-in-table:" s then 75 77 let tag = String.sub s 19 (String.length s - 19) in 76 - Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag 78 + Printf.sprintf "Start tag %s seen in %s." (q tag) (q "table") 77 79 else 78 80 Printf.sprintf "Parse error: %s" s 79 81 with _ -> Printf.sprintf "Parse error: %s" s)
+4 -2
lib/htmlrw_check/semantic/obsolete_checker.ml
··· 1 + let q = Error_code.q 2 + 1 3 (** Obsolete elements map: element name -> suggestion message *) 2 4 let obsolete_elements = 3 5 let tbl = Hashtbl.create 32 in ··· 131 133 "Use the HTTP OPTIONS feature instead."; 132 134 133 135 register "name" ["a"] 134 - "Consider putting an \xe2\x80\x9cid\xe2\x80\x9d attribute on the nearest container instead."; 136 + (Printf.sprintf "Consider putting an %s attribute on the nearest container instead." (q "id")); 135 137 136 138 register "name" ["embed"; "img"; "option"] 137 - "Use the \xe2\x80\x9cid\xe2\x80\x9d attribute instead."; 139 + (Printf.sprintf "Use the %s attribute instead." (q "id")); 138 140 139 141 register "nohref" ["area"] 140 142 "Omitting the \"href\" attribute is sufficient.";
+2 -4
lib/htmlrw_check/semantic/required_attr_checker.ml
··· 1 1 (** Required attribute checker implementation. *) 2 2 3 + let q = Error_code.q 4 + 3 5 type state = { 4 6 mutable _in_figure : bool; 5 7 (** Track if we're inside a <figure> element (alt is more critical there) *) ··· 81 83 in 82 84 83 85 if not valid then 84 - let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 85 86 Message_collector.add_typed collector 86 87 (`Generic (Printf.sprintf "A %s element must have either a %s attribute, a %s attribute with a %s attribute, or an %s attribute with a %s attribute." 87 88 (q "meta") (q "charset") (q "name") ··· 122 123 let value_lower = String.lowercase_ascii value in 123 124 (* Valid values: empty string, auto, manual, hint *) 124 125 if value_lower <> "" && value_lower <> "auto" && value_lower <> "manual" && value_lower <> "hint" then 125 - let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 126 126 Message_collector.add_typed collector 127 127 (`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value %s for attribute %s on element %s." 128 128 (q value) (q "popover") (q element_name))))) ··· 141 141 let value = float_of_string value_str in 142 142 let min_val = float_of_string min_str in 143 143 if min_val > value then 144 - let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 145 144 Message_collector.add_typed collector 146 145 (`Generic (Printf.sprintf "The value of the %s attribute must be less than or equal to the value of the %s attribute." 147 146 (q "min") (q "value"))) ··· 161 160 | Some max_str -> (try float_of_string max_str with _ -> 1.0) 162 161 in 163 162 if value > max_val then 164 - let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 165 163 (* Check which message to use based on whether max is present *) 166 164 if Attr_utils.has_attr "max" attrs then 167 165 Message_collector.add_typed collector
+1 -1
lib/htmlrw_check/specialized/aria_checker.ml
··· 586 586 if aria_checked <> None then 587 587 Message_collector.add_typed collector 588 588 (`Aria (`Must_not_use (`Attr "aria-checked", `Elem "input", 589 - `Condition "a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9ccheckbox\xe2\x80\x9d"))) 589 + `Condition (Printf.sprintf "a %s attribute whose value is %s" (q "type") (q "checkbox"))))) 590 590 | _ -> () 591 591 end; 592 592
+24 -22
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
··· 1 1 (** Attribute restrictions checker - validates that certain attributes 2 2 are not used on elements where they're not allowed. *) 3 3 4 + let q = Error_code.q 5 + 4 6 (** List of (element, [disallowed attributes]) pairs for HTML elements. *) 5 7 let disallowed_attrs_html = [ 6 8 (* Elements that cannot have href attribute (RDFa misuses) *) ··· 174 176 if attr_value = "#" then 175 177 Message_collector.add_typed collector 176 178 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 177 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d." 178 - attr_value attr_name name)))) 179 + "Bad value %s for attribute %s on element %s: Bad hash-name reference: A hash-name reference must have at least one character after %s." 180 + (q attr_value) (q attr_name) (q name) (q "#"))))) 179 181 end 180 182 ) attrs 181 183 end; ··· 190 192 | Error msg -> 191 193 Message_collector.add_typed collector 192 194 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 193 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s" 194 - attr_value attr_name name msg)))) 195 + "Bad value %s for attribute %s on element %s: Bad MIME type: %s" 196 + (q attr_value) (q attr_name) (q name) msg)))) 195 197 end 196 198 ) attrs 197 199 end; ··· 213 215 (* Determine specific error message *) 214 216 let error_msg = 215 217 if String.length attr_value = 0 then 216 - Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer." 217 - attr_name name 218 + Printf.sprintf "Bad value %s for attribute %s on element %s: The empty string is not a valid non-negative integer." 219 + (q "") (q attr_name) (q name) 218 220 else if String.contains attr_value '%' then 219 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 220 - attr_value attr_name name 221 + Printf.sprintf "Bad value %s for attribute %s on element %s: Bad non-negative integer: Expected a digit but saw %s instead." 222 + (q attr_value) (q attr_name) (q name) (q "%") 221 223 else if String.length attr_value > 0 && attr_value.[0] = '-' then 222 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead." 223 - attr_value attr_name name 224 + Printf.sprintf "Bad value %s for attribute %s on element %s: Bad non-negative integer: Expected a digit but saw %s instead." 225 + (q attr_value) (q attr_name) (q name) (q "-") 224 226 else 225 227 (* Find first non-digit character *) 226 228 let bad_char = ··· 234 236 in 235 237 match bad_char with 236 238 | Some c -> 237 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 238 - attr_value attr_name name c 239 + Printf.sprintf "Bad value %s for attribute %s on element %s: Bad non-negative integer: Expected a digit but saw %s instead." 240 + (q attr_value) (q attr_name) (q name) (q (String.make 1 c)) 239 241 | None -> 240 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit." 241 - attr_value attr_name name 242 + Printf.sprintf "Bad value %s for attribute %s on element %s: Bad non-negative integer: Expected a digit." 243 + (q attr_value) (q attr_name) (q name) 242 244 in 243 245 Message_collector.add_typed collector 244 246 (`Attr (`Bad_value_generic (`Message error_msg))) ··· 377 379 if count_codepoints key > 1 then 378 380 Message_collector.add_typed collector 379 381 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 380 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character." 381 - attr_value attr_name name)))) 382 + "Bad value %s for attribute %s on element %s: Bad key label list: Key label has multiple characters. Each key label must be a single character." 383 + (q attr_value) (q attr_name) (q name))))) 382 384 ) keys; 383 385 (* Check for duplicate keys *) 384 386 let rec find_duplicates seen = function ··· 387 389 if List.mem k seen then 388 390 Message_collector.add_typed collector 389 391 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 390 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique." 391 - attr_value attr_name name)))) 392 + "Bad value %s for attribute %s on element %s: Bad key label list: Duplicate key label. Each key label must be unique." 393 + (q attr_value) (q attr_name) (q name))))) 392 394 else 393 395 find_duplicates (k :: seen) rest 394 396 in ··· 405 407 if has_command && has_aria_expanded then 406 408 Message_collector.add_typed collector 407 409 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 408 - `Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute"))); 410 + `Condition (Printf.sprintf "a %s attribute" (q "command"))))); 409 411 410 412 if has_popovertarget && has_aria_expanded then 411 413 Message_collector.add_typed collector 412 414 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 413 - `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute"))) 415 + `Condition (Printf.sprintf "a %s attribute" (q "popovertarget"))))) 414 416 end; 415 417 416 418 (* Note: data-* uppercase check requires XML parsing which preserves case. ··· 432 434 | Error msg -> 433 435 Message_collector.add_typed collector 434 436 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 435 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s" 436 - attr_value attr_name name msg)))) 437 + "Bad value %s for attribute %s on element %s: Bad media query: %s" 438 + (q attr_value) (q attr_name) (q name) msg)))) 437 439 end 438 440 end 439 441 ) attrs
+18 -16
lib/htmlrw_check/specialized/datetime_checker.ml
··· 1 1 (** Datetime attribute validation checker *) 2 2 3 + let q = Error_code.q 4 + 3 5 (** Elements that have datetime attribute *) 4 6 let datetime_elements = ["del"; "ins"; "time"] 5 7 ··· 346 348 if value <> String.trim value then begin 347 349 let tz_msg = "Bad datetime with timezone: The literal did not satisfy the datetime with timezone format." in 348 350 let date_msg = "Bad date: The literal did not satisfy the date format." in 349 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 350 - value attr_name element_name tz_msg date_msg) 351 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 352 + (q value) (q attr_name) (q element_name) tz_msg date_msg) 351 353 end 352 354 else 353 355 (* Try datetime with timezone first *) ··· 355 357 | DtOk -> Ok (* Valid datetime with timezone *) 356 358 | DtWarning w -> 357 359 (* Valid but with warning - format matches Nu validator *) 358 - Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format." 359 - value attr_name element_name w) 360 + Warning (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format." 361 + (q value) (q attr_name) (q element_name) w) 360 362 | DtError tz_error -> 361 363 (* Try just date - valid for all elements *) 362 364 match validate_date value with ··· 365 367 if has_suspicious_year value || has_old_year value then begin 366 368 let date_msg = "Bad date: Year may be mistyped." in 367 369 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 368 - Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 369 - value attr_name element_name date_msg tz_msg) 370 + Warning (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 371 + (q value) (q attr_name) (q element_name) date_msg tz_msg) 370 372 end else 371 373 Ok (* Valid date with normal year *) 372 374 | (false, date_error) -> ··· 394 396 | (true, _) -> Ok (* Valid duration P... *) 395 397 | (false, _) -> 396 398 (* Use simplified message for time element matching Nu validator format *) 397 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad time-datetime: The literal did not satisfy the time-datetime format." 398 - value attr_name element_name) 399 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad time-datetime: The literal did not satisfy the time-datetime format." 400 + (q value) (q attr_name) (q element_name)) 399 401 end 400 402 else begin 401 403 (* del/ins only allow date or datetime-with-timezone *) ··· 426 428 (* Datetime first for: generic tz, tz hours error, time minute/hour errors, year errors 427 429 Date first for: "Month cannot be less than" date error, tz minutes error, fraction error *) 428 430 if is_month_less_than_error then 429 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 430 - value attr_name element_name date_msg tz_msg) 431 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 432 + (q value) (q attr_name) (q element_name) date_msg tz_msg) 431 433 else if is_tz_minutes_error || is_fraction_error then 432 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 433 - value attr_name element_name date_msg tz_msg) 434 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 435 + (q value) (q attr_name) (q element_name) date_msg tz_msg) 434 436 else if is_tz_hours_error || is_time_minute_or_hour_error || is_generic_tz then 435 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 436 - value attr_name element_name tz_msg date_msg) 437 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 438 + (q value) (q attr_name) (q element_name) tz_msg date_msg) 437 439 else 438 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 439 - value attr_name element_name tz_msg date_msg) 440 + Error (Printf.sprintf "Bad value %s for attribute %s on element %s: %s %s" 441 + (q value) (q attr_name) (q element_name) tz_msg date_msg) 440 442 end 441 443 442 444 (** Checker state *)
+9 -6
lib/htmlrw_check/specialized/microdata_checker.ml
··· 2 2 3 3 Validates HTML5 microdata attributes. *) 4 4 5 + (** Quote helper for consistent message formatting. *) 6 + let q = Error_code.q 7 + 5 8 (** Information about an itemscope. *) 6 9 type item_scope = { 7 10 element : string; ··· 74 77 let url_trimmed = String.trim url in 75 78 if String.length url_trimmed = 0 then 76 79 Some (Printf.sprintf 77 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad absolute URL: Must be non-empty." 78 - original_value attr_name element) 80 + "Bad value %s for attribute %s on element %s: Bad absolute URL: Must be non-empty." 81 + (q original_value) (q attr_name) (q element)) 79 82 else 80 83 (* First check if it has a scheme (required for absolute URL) *) 81 84 match Url_checker.extract_scheme url_trimmed with 82 85 | None -> 83 86 Some (Printf.sprintf 84 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad absolute URL: The string \xe2\x80\x9c%s\xe2\x80\x9d is not an absolute URL." 85 - original_value attr_name element url) 87 + "Bad value %s for attribute %s on element %s: Bad absolute URL: The string %s is not an absolute URL." 88 + (q original_value) (q attr_name) (q element) (q url)) 86 89 | Some _ -> 87 90 (* Has a scheme - do comprehensive URL validation *) 88 91 match Url_checker.validate_url url element attr_name with ··· 94 97 (* Escape backslashes in replacement string for Str.global_replace *) 95 98 let escaped_original = Str.global_replace (Str.regexp "\\\\") "\\\\\\\\" original_value in 96 99 let error_msg = Str.global_replace 97 - (Str.regexp_string (Printf.sprintf "\xe2\x80\x9c%s\xe2\x80\x9d for attribute" url)) 98 - (Printf.sprintf "\xe2\x80\x9c%s\xe2\x80\x9d for attribute" escaped_original) 100 + (Str.regexp_string (Printf.sprintf "%s for attribute" (q url))) 101 + (Printf.sprintf "%s for attribute" (q escaped_original)) 99 102 error_msg in 100 103 Some error_msg 101 104
+32 -30
lib/htmlrw_check/specialized/mime_type_checker.ml
··· 2 2 3 3 Validates MIME type values in type attributes. *) 4 4 5 + let q = Error_code.q 6 + 5 7 (** Validate a MIME type value. Returns error message or None. *) 6 8 let validate_mime_type value element attr_name = 7 9 let len = String.length value in 8 10 if len = 0 then 9 11 Some (Printf.sprintf 10 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Empty value." 11 - value attr_name element) 12 + "Bad value %s for attribute %s on element %s: Bad MIME type: Empty value." 13 + (q value) (q attr_name) (q element)) 12 14 else if value.[len - 1] = ' ' || value.[len - 1] = '\t' then begin 13 15 (* Check if this is a semicolon followed by only whitespace *) 14 16 let semicolon_pos = try Some (String.index value ';') with Not_found -> None in ··· 18 20 let params_trimmed = String.trim params in 19 21 if params_trimmed = "" then 20 22 Some (Printf.sprintf 21 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 22 - value attr_name element) 23 + "Bad value %s for attribute %s on element %s: Bad MIME type: Semicolon seen but there was no parameter following it." 24 + (q value) (q attr_name) (q element)) 23 25 else 24 26 Some (Printf.sprintf 25 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Extraneous trailing whitespace." 26 - value attr_name element) 27 + "Bad value %s for attribute %s on element %s: Bad MIME type: Extraneous trailing whitespace." 28 + (q value) (q attr_name) (q element)) 27 29 | None -> 28 30 Some (Printf.sprintf 29 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Extraneous trailing whitespace." 30 - value attr_name element) 31 + "Bad value %s for attribute %s on element %s: Bad MIME type: Extraneous trailing whitespace." 32 + (q value) (q attr_name) (q element)) 31 33 end 32 34 else if len > 0 && (value.[0] = ' ' || value.[0] = '\t') then 33 35 Some (Printf.sprintf 34 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Expected a token character but saw \xe2\x80\x9c \xe2\x80\x9d instead." 35 - value attr_name element) 36 + "Bad value %s for attribute %s on element %s: Bad MIME type: Expected a token character but saw %s instead." 37 + (q value) (q attr_name) (q element) (q " ")) 36 38 else 37 39 (* Parse type/subtype *) 38 40 let slash_pos = try Some (String.index value '/') with Not_found -> None in ··· 43 45 (match semicolon_pos with 44 46 | Some _ -> 45 47 Some (Printf.sprintf 46 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 47 - value attr_name element) 48 + "Bad value %s for attribute %s on element %s: Bad MIME type: Subtype missing." 49 + (q value) (q attr_name) (q element)) 48 50 | None -> 49 51 Some (Printf.sprintf 50 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 51 - value attr_name element)) 52 + "Bad value %s for attribute %s on element %s: Bad MIME type: Subtype missing." 53 + (q value) (q attr_name) (q element))) 52 54 | Some slash_pos -> 53 55 (* Check for empty subtype *) 54 56 let after_slash = String.sub value (slash_pos + 1) (len - slash_pos - 1) in ··· 60 62 let subtype_trimmed = String.trim subtype in 61 63 if subtype_trimmed = "" then 62 64 Some (Printf.sprintf 63 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 64 - value attr_name element) 65 + "Bad value %s for attribute %s on element %s: Bad MIME type: Subtype missing." 66 + (q value) (q attr_name) (q element)) 65 67 else if String.length subtype > 0 && subtype.[String.length subtype - 1] = ' ' then 66 68 (* Space before semicolon - also check parameter format *) 67 69 let semicolon_pos = try Some (String.index value ';') with Not_found -> None in ··· 72 74 let params_trimmed = String.trim params in 73 75 if params_trimmed = "" then 74 76 Some (Printf.sprintf 75 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 76 - value attr_name element) 77 + "Bad value %s for attribute %s on element %s: Bad MIME type: Semicolon seen but there was no parameter following it." 78 + (q value) (q attr_name) (q element)) 77 79 else 78 80 (* Check for param_name=value format *) 79 81 let eq_pos = try Some (String.index params '=') with Not_found -> None in 80 82 (match eq_pos with 81 83 | None -> 82 84 Some (Printf.sprintf 83 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 84 - value attr_name element) 85 + "Bad value %s for attribute %s on element %s: Bad MIME type: Parameter value missing." 86 + (q value) (q attr_name) (q element)) 85 87 | Some _ -> None) 86 88 | None -> None) 87 89 else ··· 94 96 let params_trimmed = String.trim params in 95 97 if params_trimmed = "" then 96 98 Some (Printf.sprintf 97 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 98 - value attr_name element) 99 + "Bad value %s for attribute %s on element %s: Bad MIME type: Semicolon seen but there was no parameter following it." 100 + (q value) (q attr_name) (q element)) 99 101 else 100 102 (* Check for param_name=value format *) 101 103 let eq_pos = try Some (String.index params '=') with Not_found -> None in 102 104 (match eq_pos with 103 105 | None -> 104 106 Some (Printf.sprintf 105 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 106 - value attr_name element) 107 + "Bad value %s for attribute %s on element %s: Bad MIME type: Parameter value missing." 108 + (q value) (q attr_name) (q element)) 107 109 | Some eq_pos -> 108 110 let param_value = String.sub params (eq_pos + 1) (String.length params - eq_pos - 1) in 109 111 let param_value_trimmed = String.trim param_value in 110 112 if param_value_trimmed = "" then 111 113 Some (Printf.sprintf 112 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 113 - value attr_name element) 114 + "Bad value %s for attribute %s on element %s: Bad MIME type: Parameter value missing." 115 + (q value) (q attr_name) (q element)) 114 116 else if param_value_trimmed.[0] = '"' then 115 117 (* Quoted string - check for closing quote *) 116 118 let quote_end = try Some (String.index_from param_value_trimmed 1 '"') with ··· 127 129 in 128 130 if has_backslash_at_end then 129 131 Some (Printf.sprintf 130 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Unfinished quoted string." 131 - value attr_name element) 132 + "Bad value %s for attribute %s on element %s: Bad MIME type: Unfinished quoted string." 133 + (q value) (q attr_name) (q element)) 132 134 else 133 135 Some (Printf.sprintf 134 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Unfinished quoted string." 135 - value attr_name element)) 136 + "Bad value %s for attribute %s on element %s: Bad MIME type: Unfinished quoted string." 137 + (q value) (q attr_name) (q element))) 136 138 else 137 139 None)) 138 140
+10 -8
lib/htmlrw_check/specialized/svg_checker.ml
··· 2 2 3 3 Validates SVG elements and attributes according to SVG 1.1/2 specifications. *) 4 4 5 + let q = Error_code.q 6 + 5 7 type font_state = { 6 8 mutable has_missing_glyph : bool; 7 9 } ··· 292 294 if value <> svg_ns_url then 293 295 Message_collector.add_typed collector 294 296 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 295 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for the attribute \xe2\x80\x9cxmlns\xe2\x80\x9d (only \xe2\x80\x9c%s\xe2\x80\x9d permitted here)." 296 - value svg_ns_url)))) 297 + "Bad value %s for the attribute %s (only %s permitted here)." 298 + (q value) (q "xmlns") (q svg_ns_url))))) 297 299 | "xmlns:xlink" -> 298 300 if value <> "http://www.w3.org/1999/xlink" then 299 301 Message_collector.add_typed collector 300 302 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 301 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for the attribute \xe2\x80\x9cxmlns:link\xe2\x80\x9d (only \xe2\x80\x9chttp://www.w3.org/1999/xlink\xe2\x80\x9d permitted here)." 302 - value)))) 303 + "Bad value %s for the attribute %s (only %s permitted here)." 304 + (q value) (q "xmlns:link") (q "http://www.w3.org/1999/xlink"))))) 303 305 | _ when String.starts_with ~prefix:"xmlns:" attr && attr <> "xmlns:xlink" -> 304 306 (* Other xmlns declarations are not allowed in HTML-embedded SVG *) 305 307 Message_collector.add_typed collector ··· 324 326 let context = String.sub d !context_start (ctx_end - !context_start) in 325 327 Message_collector.add_typed collector 326 328 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 327 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cd\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad SVG path data: Expected command but found \xe2\x80\x9c#\xe2\x80\x9d (context: \xe2\x80\x9c%s\xe2\x80\x9d)." 328 - d element context)))); 329 + "Bad value %s for attribute %s on element %s: Bad SVG path data: Expected command but found %s (context: %s)." 330 + (q d) (q "d") (q element) (q "#") (q context))))); 329 331 i := len (* Stop processing *) 330 332 | _ -> 331 333 incr i ··· 344 346 let context = String.sub d ctx_start (flag_end - ctx_start) in 345 347 Message_collector.add_typed collector 346 348 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 347 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9cd\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad SVG path data: Expected \xe2\x80\x9c0\xe2\x80\x9d or \xe2\x80\x9c1\xe2\x80\x9d for large-arc-flag for \xe2\x80\x9ca\xe2\x80\x9d command but found \xe2\x80\x9c%s\xe2\x80\x9d instead (context: \xe2\x80\x9c%s\xe2\x80\x9d)." 348 - d element flag context)))) 349 + "Bad value %s for attribute %s on element %s: Bad SVG path data: Expected %s or %s for large-arc-flag for %s command but found %s instead (context: %s)." 350 + (q d) (q "d") (q element) (q "0") (q "1") (q "a") (q flag) (q context))))) 349 351 end 350 352 with Not_found -> () 351 353
+73 -70
lib/htmlrw_check/specialized/url_checker.ml
··· 1 1 (** URL validation checker for href, src, action, and other URL attributes. *) 2 2 3 + (** Quote helper for consistent message formatting. *) 4 + let q = Error_code.q 5 + 3 6 (** Attributes that contain URLs and should be validated. 4 7 Note: srcset uses special microsyntax, not validated as URL here. 5 8 Note: input[value] is only checked for type="url", handled specially below. *) ··· 44 47 let validate_ipv6_host host url attr_name element_name = 45 48 (* Host should be in format [xxxx:...] *) 46 49 if String.length host < 3 then 47 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character." 48 - url attr_name element_name) 50 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character." 51 + (q url) (q attr_name) (q element_name)) 49 52 else begin 50 53 (* Check if all characters are valid IPv6 chars *) 51 54 let invalid_char = String.exists (fun c -> not (is_valid_ipv6_char c)) host in 52 55 if invalid_char then 53 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character." 54 - url attr_name element_name) 56 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character." 57 + (q url) (q attr_name) (q element_name)) 55 58 else 56 59 None 57 60 end ··· 239 242 let _ = contains_invalid_unicode decoded in 240 243 None 241 244 with Exit -> 242 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.." 243 - url attr_name element_name) 245 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: A label or domain name contains disallowed characters.." 246 + (q url) (q attr_name) (q element_name)) 244 247 245 248 (** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *) 246 249 let contains_percent_char s = ··· 258 261 let decoded = percent_decode host in 259 262 (* Check for % character in decoded host - this catches fullwidth percent signs etc. *) 260 263 if contains_percent_char decoded then 261 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%%\xe2\x80\x9d is not allowed." 262 - url attr_name element_name) 264 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: %s is not allowed." 265 + (q url) (q attr_name) (q element_name) (q "%")) 263 266 else 264 267 None 265 268 ··· 275 278 ) port; 276 279 match !invalid_char with 277 280 | Some c -> 278 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in port: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 279 - url attr_name element_name c) 281 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in port: %s is not allowed." 282 + (q url) (q attr_name) (q element_name) (q (String.make 1 c))) 280 283 | None -> 281 284 (* Check port range *) 282 285 try 283 286 let port_num = int_of_string port in 284 287 if port_num >= 65536 then 285 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Port number must be less than 65536." 286 - url attr_name element_name) 288 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Port number must be less than 65536." 289 + (q url) (q attr_name) (q element_name)) 287 290 else 288 291 None 289 292 with _ -> None ··· 297 300 (* Check for empty host *) 298 301 let requires_host = List.mem scheme special_schemes in 299 302 if host = "" && requires_host && scheme <> "file" then 300 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: empty host." 301 - url attr_name element_name) 303 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: empty host." 304 + (q url) (q attr_name) (q element_name)) 302 305 else 303 306 (* Check for invalid chars *) 304 307 let invalid_char = ··· 306 309 in 307 310 match invalid_char with 308 311 | Some c -> 309 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 310 - url attr_name element_name c) 312 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: %s is not allowed." 313 + (q url) (q attr_name) (q element_name) (q (String.make 1 c))) 311 314 | None -> 312 315 (* Check for | *) 313 316 if String.contains host '|' && not (is_pipe_allowed_in_host url host) then 314 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c|\xe2\x80\x9d is not allowed." 315 - url attr_name element_name) 317 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: %s is not allowed." 318 + (q url) (q attr_name) (q element_name) (q "|")) 316 319 (* Check for backslash in host *) 317 320 else if String.contains host '\\' then 318 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed." 319 - url attr_name element_name) 321 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: %s is not allowed." 322 + (q url) (q attr_name) (q element_name) (q "\\")) 320 323 (* Check for space in host *) 321 324 else if String.contains host ' ' then 322 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: Illegal character in domain: space is not allowed." 323 - url attr_name element_name) 325 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Invalid host: Illegal character in domain: space is not allowed." 326 + (q url) (q attr_name) (q element_name)) 324 327 (* Check for invalid percent-encoded Unicode in host *) 325 328 else begin 326 329 match check_invalid_percent_encoded_unicode host url attr_name element_name with ··· 342 345 let colon_pos = String.index url ':' in 343 346 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 344 347 if String.length after_colon < 2 || after_colon.[0] <> '/' || after_colon.[1] <> '/' then 345 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a slash (\"/\")." 346 - url attr_name element_name) 348 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Expected a slash (\"/\")." 349 + (q url) (q attr_name) (q element_name)) 347 350 else 348 351 None 349 352 end else ··· 357 360 | Some scheme -> 358 361 if scheme = "data" && String.contains url '#' then 359 362 let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in 360 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397." 361 - url attr_name element_name url_type) 363 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: %s Fragment is not allowed for data: URIs according to RFC 2397." 364 + (q url) (q attr_name) (q element_name) url_type) 362 365 else 363 366 None 364 367 ··· 375 378 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 376 379 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *) 377 380 if String.length after_colon > 0 && after_colon.[0] = '/' then 378 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead." 379 - url attr_name element_name) 381 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Expected a token character or a semicolon but saw %s instead." 382 + (q url) (q attr_name) (q element_name) (q "/")) 380 383 else 381 384 None 382 385 end else ··· 393 396 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 394 397 (* Check for tab in scheme data *) 395 398 if String.contains scheme_data '\t' then 396 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed." 397 - url attr_name element_name) 399 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in scheme data: tab is not allowed." 400 + (q url) (q attr_name) (q element_name)) 398 401 (* Check for newline in scheme data *) 399 402 else if String.contains scheme_data '\n' then 400 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 401 - url attr_name element_name) 403 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in scheme data: line break is not allowed." 404 + (q url) (q attr_name) (q element_name)) 402 405 (* Check for carriage return in scheme data *) 403 406 else if String.contains scheme_data '\r' then 404 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 405 - url attr_name element_name) 407 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in scheme data: line break is not allowed." 408 + (q url) (q attr_name) (q element_name)) 406 409 (* Check for space in scheme data *) 407 410 else if String.contains scheme_data ' ' then 408 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed." 409 - url attr_name element_name) 411 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in scheme data: space is not allowed." 412 + (q url) (q attr_name) (q element_name)) 410 413 else 411 414 None 412 415 end else ··· 449 452 let path = remove_query_fragment raw_path in 450 453 (* Check for space in path (not allowed) *) 451 454 if String.contains path ' ' then 452 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: space is not allowed." 453 - url attr_name element_name) 455 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in path segment: space is not allowed." 456 + (q url) (q attr_name) (q element_name)) 454 457 (* Check for pipe in path (not allowed except in file:// authority) *) 455 458 else if String.contains path '|' then 456 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c|\xe2\x80\x9d is not allowed." 457 - url attr_name element_name) 459 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in path segment: %s is not allowed." 460 + (q url) (q attr_name) (q element_name) (q "|")) 458 461 (* Check for unescaped square brackets in path *) 459 462 else if String.contains path '[' then 460 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed." 461 - url attr_name element_name) 463 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in path segment: %s is not allowed." 464 + (q url) (q attr_name) (q element_name) (q "[")) 462 465 else 463 466 None 464 467 ··· 470 473 | None -> 471 474 (* Check for square brackets at start (not IPv6 - that requires scheme) *) 472 475 if String.length url > 0 && url.[0] = '[' then 473 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in path segment: \xe2\x80\x9c[\xe2\x80\x9d is not allowed." 474 - url attr_name element_name) 476 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in path segment: %s is not allowed." 477 + (q url) (q attr_name) (q element_name) (q "[")) 475 478 else 476 479 None 477 480 ··· 489 492 if i + 2 < len && is_hex_digit url.[i + 1] && is_hex_digit url.[i + 2] then 490 493 find_bare_percent (i + 3) (* Valid percent encoding, continue *) 491 494 else 492 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Percentage (\xe2\x80\x9c%%\xe2\x80\x9d) is not followed by two hexadecimal digits." 493 - url attr_name element_name) 495 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Percentage (%s) is not followed by two hexadecimal digits." 496 + (q url) (q attr_name) (q element_name) (q "%")) 494 497 end else 495 498 find_bare_percent (i + 1) 496 499 in ··· 511 514 let query = String.sub url (query_start + 1) (query_end - query_start - 1) in 512 515 (* Check for unescaped space in query *) 513 516 if String.contains query ' ' then 514 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in query: space is not allowed." 515 - url attr_name element_name) 517 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in query: space is not allowed." 518 + (q url) (q attr_name) (q element_name)) 516 519 else 517 520 None 518 521 with Not_found -> None (* No query string *) ··· 524 527 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in 525 528 (* Check for backslash in fragment *) 526 529 if String.contains fragment '\\' then 527 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed." 528 - url attr_name element_name) 530 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in fragment: %s is not allowed." 531 + (q url) (q attr_name) (q element_name) (q "\\")) 529 532 (* Check for second hash in fragment *) 530 533 else if String.contains fragment '#' then 531 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed." 532 - url attr_name element_name) 534 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in fragment: %s is not allowed." 535 + (q url) (q attr_name) (q element_name) (q "#")) 533 536 (* Check for space in fragment *) 534 537 else if String.contains fragment ' ' then 535 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: space is not allowed." 536 - url attr_name element_name) 538 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in fragment: space is not allowed." 539 + (q url) (q attr_name) (q element_name)) 537 540 else 538 541 None 539 542 with Not_found -> None (* No fragment *) ··· 572 575 let userinfo = String.sub authority 0 at in 573 576 (* Check for @ in userinfo (should be percent-encoded) *) 574 577 if String.contains userinfo '@' then 575 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: User or password contains an at symbol (\xe2\x80\x9c@\xe2\x80\x9d) not percent-encoded." 576 - url attr_name element_name) 578 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: User or password contains an at symbol (%s) not percent-encoded." 579 + (q url) (q attr_name) (q element_name) (q "@")) 577 580 (* Check for space *) 578 581 else if String.contains userinfo ' ' then 579 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed." 580 - url attr_name element_name) 582 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in user or password: space is not allowed." 583 + (q url) (q attr_name) (q element_name)) 581 584 else begin 582 585 (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *) 583 586 let find_non_ascii_char userinfo = ··· 600 603 in 601 604 match find_non_ascii_char userinfo with 602 605 | Some bad_char -> 603 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed." 604 - url attr_name element_name bad_char) 606 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in user or password: %s is not allowed." 607 + (q url) (q attr_name) (q element_name) (q bad_char)) 605 608 | None -> 606 609 (* Check for other invalid chars *) 607 610 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in 608 611 match invalid with 609 612 | Some c -> 610 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 611 - url attr_name element_name c) 613 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character in user or password: %s is not allowed." 614 + (q url) (q attr_name) (q element_name) (q (String.make 1 c))) 612 615 | None -> None 613 616 end 614 617 with _ -> None ··· 634 637 let attr_lower = String.lowercase_ascii attr_name in 635 638 if List.mem attr_lower must_be_non_empty || 636 639 List.mem (name_lower, attr_lower) must_be_non_empty_combinations then 637 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Must be non-empty." 638 - original_url attr_name element_name) 640 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Must be non-empty." 641 + (q original_url) (q attr_name) (q element_name)) 639 642 else 640 643 None 641 644 end ··· 647 650 let last = original_url.[String.length original_url - 1] in 648 651 last = ' ' || last = '\t' in 649 652 if has_leading || has_trailing then 650 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace." 651 - original_url attr_name element_name) 653 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Illegal character: leading/trailing ASCII whitespace." 654 + (q original_url) (q attr_name) (q element_name)) 652 655 else None 653 656 (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *) 654 657 else begin ··· 657 660 | None -> 658 661 (* Check for newlines/tabs in special scheme URLs *) 659 662 if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then 660 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found." 661 - url attr_name element_name) 663 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Tab, new line or carriage return found." 664 + (q url) (q attr_name) (q element_name)) 662 665 else begin 663 666 (* Check for relative URL issues first *) 664 667 match check_relative_url url attr_name element_name with ··· 697 700 698 701 (* Check for backslash AFTER special scheme check *) 699 702 if String.contains url '\\' then 700 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Backslash (\"\\\") used as path segment delimiter." 701 - url attr_name element_name) 703 + Some (Printf.sprintf "Bad value %s for attribute %s on element %s: Bad URL: Backslash (\"\\\") used as path segment delimiter." 704 + (q url) (q attr_name) (q element_name)) 702 705 else 703 706 704 707 (* Check path segment for illegal characters *)