OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+57 -16
lib/html5_checker/datatype/dt_autocomplete.ml
··· 147 is_contact_details := true 148 | _ -> ()); 149 150 (* Process remaining tokens *) 151 - let process_field_tokens = function 152 - | [] -> Error "A list of autofill details tokens must contain an autofill field name" 153 | [ "webauthn" ] -> 154 Error 155 "The token \"webauthn\" must not be the only token in a list of \ 156 - autofill detail tokens" 157 | [ field_name ] -> 158 if not (List.mem field_name all_field_names) then 159 Error 160 (Printf.sprintf 161 - "The string \"%s\" is not a valid autofill field name" 162 field_name) 163 else if !is_contact_details && not (List.mem field_name contact_field_names) 164 then 165 Error 166 (Printf.sprintf 167 "The autofill field name \"%s\" is not allowed in contact \ 168 - context" 169 field_name) 170 else Ok () 171 | [ field_name; "webauthn" ] -> 172 if not (List.mem field_name all_field_names) then 173 Error 174 (Printf.sprintf 175 - "The string \"%s\" is not a valid autofill field name" 176 field_name) 177 else if !is_contact_details && not (List.mem field_name contact_field_names) 178 then 179 Error 180 (Printf.sprintf 181 "The autofill field name \"%s\" is not allowed in contact \ 182 - context" 183 field_name) 184 else Ok () 185 | token :: _ when List.mem token contact_types -> 186 Error 187 (Printf.sprintf 188 - "The token \"%s\" must only appear before any autofill field names" 189 token) 190 | token :: _ when starts_with token "section-" -> 191 Error 192 "A \"section-*\" indicator must only appear as the first token in a \ 193 - list of autofill detail tokens" 194 | "shipping" :: _ | "billing" :: _ as toks -> 195 Error 196 (Printf.sprintf 197 "The token \"%s\" must only appear as either the first token in a \ 198 list of autofill detail tokens, or, if the first token is a \ 199 - \"section-*\" indicator, as the second token" 200 (List.hd toks)) 201 | _ :: "webauthn" :: _ :: _ -> 202 Error 203 "The token \"webauthn\" must only appear as the very last token in a \ 204 - list of autofill detail tokens" 205 - | _ :: _ :: _ -> 206 - Error 207 - "A list of autofill details tokens must not contain more than one \ 208 - autofill field name" 209 in 210 process_field_tokens !tokens 211 212 (** Validate autocomplete value *) 213 let validate_autocomplete s = 214 let trimmed = trim_whitespace s in 215 - if String.length trimmed = 0 then Error "Must not be empty" 216 else if trimmed = "on" || trimmed = "off" then Ok () 217 else 218 let tokens = split_on_whitespace trimmed in
··· 147 is_contact_details := true 148 | _ -> ()); 149 150 + (* Check if any token in the list is shipping/billing *) 151 + let find_shipping_billing tokens = 152 + List.find_opt (fun t -> t = "shipping" || t = "billing") tokens 153 + in 154 + 155 + (* Check if any token in the list is a contact type *) 156 + let find_contact_type tokens = 157 + List.find_opt (fun t -> List.mem t contact_types) tokens 158 + in 159 + 160 + (* Check if any token in the list is a section-* indicator *) 161 + let find_section tokens = 162 + List.find_opt (fun t -> starts_with t "section-") tokens 163 + in 164 + 165 (* Process remaining tokens *) 166 + let process_field_tokens tokens = 167 + match tokens with 168 + | [] -> Error "A list of autofill details tokens must contain an autofill field name." 169 | [ "webauthn" ] -> 170 Error 171 "The token \"webauthn\" must not be the only token in a list of \ 172 + autofill detail tokens." 173 | [ field_name ] -> 174 if not (List.mem field_name all_field_names) then 175 Error 176 (Printf.sprintf 177 + "The string \"%s\" is not a valid autofill field name." 178 field_name) 179 else if !is_contact_details && not (List.mem field_name contact_field_names) 180 then 181 Error 182 (Printf.sprintf 183 "The autofill field name \"%s\" is not allowed in contact \ 184 + context." 185 field_name) 186 else Ok () 187 | [ field_name; "webauthn" ] -> 188 if not (List.mem field_name all_field_names) then 189 Error 190 (Printf.sprintf 191 + "The string \"%s\" is not a valid autofill field name." 192 field_name) 193 else if !is_contact_details && not (List.mem field_name contact_field_names) 194 then 195 Error 196 (Printf.sprintf 197 "The autofill field name \"%s\" is not allowed in contact \ 198 + context." 199 field_name) 200 else Ok () 201 | token :: _ when List.mem token contact_types -> 202 Error 203 (Printf.sprintf 204 + "The token \"%s\" must only appear before any autofill field names." 205 token) 206 | token :: _ when starts_with token "section-" -> 207 Error 208 "A \"section-*\" indicator must only appear as the first token in a \ 209 + list of autofill detail tokens." 210 | "shipping" :: _ | "billing" :: _ as toks -> 211 Error 212 (Printf.sprintf 213 "The token \"%s\" must only appear as either the first token in a \ 214 list of autofill detail tokens, or, if the first token is a \ 215 + \"section-*\" indicator, as the second token." 216 (List.hd toks)) 217 | _ :: "webauthn" :: _ :: _ -> 218 Error 219 "The token \"webauthn\" must only appear as the very last token in a \ 220 + list of autofill detail tokens." 221 + | _ :: rest -> 222 + (* Check if any remaining token is a section-* indicator - position error takes precedence *) 223 + (match find_section rest with 224 + | Some _ -> 225 + Error 226 + "A \"section-*\" indicator must only appear as the first token in a \ 227 + list of autofill detail tokens." 228 + | None -> 229 + (* Check if any remaining token is a contact type - position error takes precedence *) 230 + match find_contact_type rest with 231 + | Some ct_token -> 232 + Error 233 + (Printf.sprintf 234 + "The token \"%s\" must only appear before any autofill field names." 235 + ct_token) 236 + | None -> 237 + (* Check if any remaining token is shipping/billing - position error takes precedence *) 238 + match find_shipping_billing rest with 239 + | Some sb_token -> 240 + Error 241 + (Printf.sprintf 242 + "The token \"%s\" must only appear as either the first token in a \ 243 + list of autofill detail tokens, or, if the first token is a \ 244 + \"section-*\" indicator, as the second token." 245 + sb_token) 246 + | None -> 247 + Error 248 + "A list of autofill details tokens must not contain more than one \ 249 + autofill field name.") 250 in 251 process_field_tokens !tokens 252 253 (** Validate autocomplete value *) 254 let validate_autocomplete s = 255 let trimmed = trim_whitespace s in 256 + if String.length trimmed = 0 then Error "Must not be empty." 257 else if trimmed = "on" || trimmed = "off" then Ok () 258 else 259 let tokens = split_on_whitespace trimmed in
+60 -4
lib/html5_checker/parse_error_bridge.ml
··· 11 Message.make_location ~line ~column ?system_id () 12 in 13 let code_str = Html5rw.Parse_error_code.to_string code in 14 - let message = match code with 15 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> 16 - "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag." 17 - | _ -> Printf.sprintf "Parse error: %s" code_str 18 in 19 Message.error 20 ~message 21 - ~code:code_str 22 ~location 23 () 24
··· 11 Message.make_location ~line ~column ?system_id () 12 in 13 let code_str = Html5rw.Parse_error_code.to_string code in 14 + let (message, final_code) = match code with 15 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> 16 + ("Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag.", code_str) 17 + | Html5rw.Parse_error_code.Tree_construction_error s -> 18 + (* Check for control-character/noncharacter/surrogate with codepoint info *) 19 + (try 20 + if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then 21 + let colon_pos = String.index s ':' in 22 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 23 + let cp = int_of_string ("0x" ^ cp_str) in 24 + (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 25 + else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then 26 + let colon_pos = String.index s ':' in 27 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 28 + let cp = int_of_string ("0x" ^ cp_str) in 29 + (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 30 + else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then 31 + let colon_pos = String.index s ':' in 32 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 33 + let cp = int_of_string ("0x" ^ cp_str) in 34 + (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 35 + (* Character reference errors *) 36 + else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then 37 + let cp_str = String.sub s 28 (String.length s - 28) in 38 + let cp = int_of_string ("0x" ^ cp_str) in 39 + if cp = 0x0D then 40 + ("A numeric character reference expanded to carriage return.", "control-character-reference") 41 + else 42 + (Printf.sprintf "Character reference expands to a control character (U+%04x)." cp, "control-character-reference") 43 + else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then 44 + let colon_pos = String.index s ':' in 45 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 46 + let cp = int_of_string ("0x" ^ cp_str) in 47 + (* U+FDD0-U+FDEF are "permanently unassigned" *) 48 + if cp >= 0xFDD0 && cp <= 0xFDEF then 49 + ("Character reference expands to a permanently unassigned code point.", "noncharacter-character-reference") 50 + (* Astral noncharacters (planes 1-16) *) 51 + else if cp >= 0x10000 then 52 + (Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp, "noncharacter-character-reference") 53 + else 54 + (Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp, "noncharacter-character-reference") 55 + else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then 56 + let colon_pos = String.index s ':' in 57 + let _ = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 58 + ("Character reference outside the permissible Unicode range.", "character-reference-outside-unicode-range") 59 + else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then 60 + let colon_pos = String.index s ':' in 61 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 62 + let cp = int_of_string ("0x" ^ cp_str) in 63 + (Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp, "surrogate-character-reference") 64 + else if s = "no-p-element-in-scope" then 65 + ("No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen.", "no-p-element-in-scope") 66 + else if s = "end-tag-p-implied-but-open-elements" then 67 + ("End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements.", "end-tag-p-implied") 68 + else if s = "end-tag-br" then 69 + ("End tag \xe2\x80\x9cbr\xe2\x80\x9d.", "end-tag-br") 70 + else 71 + (Printf.sprintf "Parse error: %s" s, s) 72 + with _ -> (Printf.sprintf "Parse error: %s" s, s)) 73 + | _ -> (Printf.sprintf "Parse error: %s" code_str, code_str) 74 in 75 Message.error 76 ~message 77 + ~code:final_code 78 ~location 79 () 80
+3 -1
lib/html5_checker/semantic/form_checker.ml
··· 32 match Dt_autocomplete.validate_autocomplete value with 33 | Ok () -> () 34 | Error msg -> 35 Message_collector.add_typed collector 36 (Error_code.Bad_attr_value { 37 element = element_name; 38 attr = "autocomplete"; 39 value; 40 - reason = msg 41 }) 42 end 43
··· 32 match Dt_autocomplete.validate_autocomplete value with 33 | Ok () -> () 34 | Error msg -> 35 + (* Nu validator prefixes autocomplete errors with "Bad autocomplete detail tokens (any): " for select/textarea, but not for input *) 36 + let reason = if element_name = "input" then msg else "Bad autocomplete detail tokens (any): " ^ msg in 37 Message_collector.add_typed collector 38 (Error_code.Bad_attr_value { 39 element = element_name; 40 attr = "autocomplete"; 41 value; 42 + reason 43 }) 44 end 45
+23 -8
lib/html5_checker/semantic/id_checker.ml
··· 193 so we pass None. In a full implementation, this would be passed 194 from the parser. *) 195 let location = None in 196 - process_attrs state ~element:name ~attrs ~location collector 197 198 let end_element _state ~name:_ ~namespace:_ _collector = 199 () ··· 204 let end_document state collector = 205 (* Check all ID references point to existing IDs *) 206 List.iter (fun ref -> 207 - if not (Hashtbl.mem state.ids ref.referenced_id) then 208 - (* Use generic for dangling references - format may vary *) 209 - Message_collector.add_typed collector 210 - (Error_code.Generic { 211 - message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document." 212 - (Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id) 213 - }) 214 ) state.references; 215 216 (* Check all usemap references point to existing map names *)
··· 193 so we pass None. In a full implementation, this would be passed 194 from the parser. *) 195 let location = None in 196 + process_attrs state ~element:name ~attrs ~location collector; 197 + 198 + (* Special check: map element must have matching id and name if both present *) 199 + if name = "map" then begin 200 + let id_opt = List.find_map (fun (n, v) -> if n = "id" then Some v else None) attrs in 201 + let name_opt = List.find_map (fun (n, v) -> if n = "name" then Some v else None) attrs in 202 + match id_opt, name_opt with 203 + | Some id_val, Some name_val when id_val <> name_val -> 204 + Message_collector.add_typed collector Error_code.Map_id_name_mismatch 205 + | _ -> () 206 + end 207 208 let end_element _state ~name:_ ~namespace:_ _collector = 209 () ··· 214 let end_document state collector = 215 (* Check all ID references point to existing IDs *) 216 List.iter (fun ref -> 217 + if not (Hashtbl.mem state.ids ref.referenced_id) then begin 218 + (* Use specific error for list attribute on input *) 219 + if ref.attribute = "list" && ref.referring_element = "input" then 220 + Message_collector.add_typed collector Error_code.List_attr_requires_datalist 221 + else 222 + (* Use generic for dangling references - format may vary *) 223 + Message_collector.add_typed collector 224 + (Error_code.Generic { 225 + message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document." 226 + (Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id) 227 + }) 228 + end 229 ) state.references; 230 231 (* Check all usemap references point to existing map names *)
+85
lib/html5_checker/semantic/lang_detecting_checker.ml
··· 141 | "zh-tw" -> "zh-hant" 142 | _ -> code 143 144 let start_element state ~name ~namespace ~attrs _collector = 145 let name_lower = String.lowercase_ascii name in 146 let ns = Option.value namespace ~default:"" in ··· 226 let original_declared = match state.html_lang with 227 | Some l -> l 228 | None -> "" 229 in 230 let detected_code = detected_lang in (* Keep full code like zh-tw *) 231 let detected_name = get_language_name detected_lang in
··· 141 | "zh-tw" -> "zh-hant" 142 | _ -> code 143 144 + (* Traditional Chinese-only characters (simplified versions don't exist) *) 145 + (* These are characters that were simplified in Simplified Chinese *) 146 + let traditional_chars = [| 147 + 0x570B; (* 國 -> 国 *) 148 + 0x5B78; (* 學 -> 学 *) 149 + 0x8AAA; (* 說 -> 说 *) 150 + 0x66F8; (* 書 -> 书 *) 151 + 0x8A9E; (* 語 -> 语 *) 152 + 0x6642; (* 時 -> 时 *) 153 + 0x6703; (* 會 -> 会 *) 154 + 0x7D93; (* 經 -> 经 *) 155 + 0x6A5F; (* 機 -> 机 *) 156 + 0x767C; (* 發 -> 发 *) 157 + 0x554F; (* 問 -> 问 *) 158 + 0x6578; (* 數 -> 数 *) 159 + 0x5BE6; (* 實 -> 实 *) 160 + 0x958B; (* 開 -> 开 *) 161 + 0x95DC; (* 關 -> 关 *) 162 + 0x9577; (* 長 -> 长 *) 163 + 0x9AD4; (* 體 -> 体 *) 164 + 0x9EDE; (* 點 -> 点 *) 165 + 0x96FB; (* 電 -> 电 *) 166 + 0x8CC7; (* 資 -> 资 *) 167 + 0x7FA9; (* 義 -> 义 *) 168 + 0x8B93; (* 讓 -> 让 *) 169 + 0x9054; (* 達 -> 达 *) 170 + 0x71DF; (* 營 -> 营 *) 171 + 0x8655; (* 處 -> 处 *) 172 + 0x6771; (* 東 -> 东 *) 173 + 0x8209; (* 舉 -> 举 *) 174 + 0x8A18; (* 記 -> 记 *) 175 + 0x5099; (* 備 -> 备 *) 176 + 0x5354; (* 協 -> 协 *) 177 + 0x8FA6; (* 辦 -> 办 *) 178 + 0x8457; (* 著 -> 着 *) 179 + 0x8F09; (* 載 -> 载 *) 180 + 0x52D9; (* 務 -> 务 *) 181 + 0x7121; (* 無 -> 无 *) 182 + 0x5F9E; (* 從 -> 从 *) 183 + 0x8B58; (* 識 -> 识 *) 184 + 0x8207; (* 與 -> 与 *) 185 + 0x78BA; (* 確 -> 确 *) 186 + 0x904E; (* 過 -> 过 *) 187 + 0x8A72; (* 該 -> 该 *) 188 + 0x9810; (* 預 -> 预 *) 189 + 0x7576; (* 當 -> 当 *) 190 + 0x5831; (* 報 -> 报 *) 191 + 0x9054; (* 達 -> 达 *) 192 + 0x91AB; (* 醫 -> 医 *) 193 + 0x5718; (* 團 -> 团 *) 194 + 0x8B70; (* 議 -> 议 *) 195 + 0x7D71; (* 統 -> 统 *) 196 + 0x898F; (* 規 -> 规 *) 197 + |] 198 + 199 + (* Check if text contains enough Traditional Chinese characters *) 200 + let is_traditional_chinese text = 201 + let count = ref 0 in 202 + let total = ref 0 in 203 + let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 204 + let rec process () = 205 + if !total >= 1000 then () (* Sample first 1000 chars *) 206 + else match Uutf.decode decoder with 207 + | `Await | `End -> () 208 + | `Malformed _ -> process () 209 + | `Uchar uchar -> 210 + let code = Uchar.to_int uchar in 211 + (* Count CJK characters *) 212 + if code >= 0x4E00 && code <= 0x9FFF then begin 213 + incr total; 214 + (* Check if it's a Traditional-only character *) 215 + if Array.exists (fun c -> c = code) traditional_chars then 216 + incr count 217 + end; 218 + process () 219 + in 220 + process (); 221 + (* If > 2% are Traditional-only characters, it's Traditional Chinese *) 222 + !total > 100 && (float_of_int !count /. float_of_int !total) > 0.02 223 + 224 let start_element state ~name ~namespace ~attrs _collector = 225 let name_lower = String.lowercase_ascii name in 226 let ns = Option.value namespace ~default:"" in ··· 306 let original_declared = match state.html_lang with 307 | Some l -> l 308 | None -> "" 309 + in 310 + (* Correct for Traditional vs Simplified Chinese misdetection *) 311 + let detected_lang = 312 + if detected_lang = "zh-cn" && is_traditional_chinese text then "zh-tw" 313 + else detected_lang 314 in 315 let detected_code = detected_lang in (* Keep full code like zh-tw *) 316 let detected_name = get_language_name detected_lang in
+23 -3
lib/html5_checker/specialized/aria_checker.ml
··· 368 mutable stack : stack_node list; 369 mutable has_active_tab : bool; (* Whether document has role=tab with aria-selected=true *) 370 mutable has_tabpanel : bool; (* Whether document has role=tabpanel elements *) 371 } 372 373 - let create () = { stack = []; has_active_tab = false; has_tabpanel = false } 374 375 let reset state = 376 state.stack <- []; 377 state.has_active_tab <- false; 378 - state.has_tabpanel <- false 379 380 (** Check if any ancestor has one of the required roles. *) 381 let has_required_ancestor_role state required_roles = ··· 451 if aria_selected = Some "true" then state.has_active_tab <- true 452 end; 453 if List.mem "tabpanel" explicit_roles then state.has_tabpanel <- true; 454 455 (* Check br/wbr role restrictions - only none/presentation allowed *) 456 if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin ··· 784 Message_collector.add_error collector 785 ~message:"Every active \xe2\x80\x9crole=tab\xe2\x80\x9d element must have a corresponding \xe2\x80\x9crole=tabpanel\xe2\x80\x9d element." 786 ~code:"tab-without-tabpanel" 787 - () 788 789 let checker = (module struct 790 type nonrec state = state
··· 368 mutable stack : stack_node list; 369 mutable has_active_tab : bool; (* Whether document has role=tab with aria-selected=true *) 370 mutable has_tabpanel : bool; (* Whether document has role=tabpanel elements *) 371 + mutable visible_main_count : int; (* Count of visible elements with role=main *) 372 } 373 374 + let create () = { stack = []; has_active_tab = false; has_tabpanel = false; visible_main_count = 0 } 375 376 let reset state = 377 state.stack <- []; 378 state.has_active_tab <- false; 379 + state.has_tabpanel <- false; 380 + state.visible_main_count <- 0 381 382 (** Check if any ancestor has one of the required roles. *) 383 let has_required_ancestor_role state required_roles = ··· 453 if aria_selected = Some "true" then state.has_active_tab <- true 454 end; 455 if List.mem "tabpanel" explicit_roles then state.has_tabpanel <- true; 456 + 457 + (* Track visible main elements (explicit role=main or implicit main role) *) 458 + let is_hidden = 459 + let aria_hidden = List.assoc_opt "aria-hidden" attrs in 460 + aria_hidden = Some "true" 461 + in 462 + if not is_hidden then begin 463 + (* Check explicit role *) 464 + if List.mem "main" explicit_roles then 465 + state.visible_main_count <- state.visible_main_count + 1 466 + (* Check implicit role from <main> element *) 467 + else if name_lower = "main" then 468 + state.visible_main_count <- state.visible_main_count + 1 469 + end; 470 471 (* Check br/wbr role restrictions - only none/presentation allowed *) 472 if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin ··· 800 Message_collector.add_error collector 801 ~message:"Every active \xe2\x80\x9crole=tab\xe2\x80\x9d element must have a corresponding \xe2\x80\x9crole=tabpanel\xe2\x80\x9d element." 802 ~code:"tab-without-tabpanel" 803 + (); 804 + 805 + (* Check for multiple visible main elements *) 806 + if state.visible_main_count > 1 then 807 + Message_collector.add_typed collector Error_code.Multiple_main_visible 808 809 let checker = (module struct 810 type nonrec state = state
+7 -7
lib/html5_checker/specialized/attr_restrictions_checker.ml
··· 250 Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer." 251 attr_name name 252 else if String.contains attr_value '%' then 253 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 254 attr_value attr_name name 255 else if String.length attr_value > 0 && attr_value.[0] = '-' then 256 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The value must be non-negative." 257 attr_value attr_name name 258 else 259 (* Find first non-digit character *) ··· 268 in 269 match bad_char with 270 | Some c -> 271 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 272 attr_value attr_name name c 273 | None -> 274 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit." 275 attr_value attr_name name 276 in 277 Message_collector.add_error collector ··· 455 List.iter (fun key -> 456 if count_codepoints key > 1 then 457 Message_collector.add_error collector 458 - ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The space-separated list of key labels contains a value \xe2\x80\x9c%s\xe2\x80\x9d that consists of more than a single code point." 459 - attr_value attr_name name key) 460 ~code:"bad-attribute-value" 461 ~element:name ~attribute:attr_name () 462 ) keys; ··· 466 | k :: rest -> 467 if List.mem k seen then 468 Message_collector.add_error collector 469 - ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Duplicate key label." 470 attr_value attr_name name) 471 ~code:"bad-attribute-value" 472 ~element:name ~attribute:attr_name ()
··· 250 Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer." 251 attr_name name 252 else if String.contains attr_value '%' then 253 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 254 attr_value attr_name name 255 else if String.length attr_value > 0 && attr_value.[0] = '-' then 256 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead." 257 attr_value attr_name name 258 else 259 (* Find first non-digit character *) ··· 268 in 269 match bad_char with 270 | Some c -> 271 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 272 attr_value attr_name name c 273 | None -> 274 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit." 275 attr_value attr_name name 276 in 277 Message_collector.add_error collector ··· 455 List.iter (fun key -> 456 if count_codepoints key > 1 then 457 Message_collector.add_error collector 458 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character." 459 + attr_value attr_name name) 460 ~code:"bad-attribute-value" 461 ~element:name ~attribute:attr_name () 462 ) keys; ··· 466 | k :: rest -> 467 if List.mem k seen then 468 Message_collector.add_error collector 469 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique." 470 attr_value attr_name name) 471 ~code:"bad-attribute-value" 472 ~element:name ~attribute:attr_name ()
+9 -13
lib/html5_checker/specialized/datetime_checker.ml
··· 241 minute <> 0 && minute <> 30 && minute <> 45 242 in 243 if unusual_range then 244 - TzWarning "unusual timezone offset" 245 else if unusual_minutes then 246 - TzWarning "unusual timezone offset minutes" 247 else 248 TzOk 249 end ··· 350 match validate_datetime_with_timezone value with 351 | DtOk -> Ok (* Valid datetime with timezone *) 352 | DtWarning w -> 353 - (* Valid but with warning *) 354 - Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s." 355 value attr_name element_name w) 356 | DtError tz_error -> 357 (* Try just date - valid for all elements *) ··· 359 | (true, _) -> 360 (* Date is valid, but check for suspicious year (5+ digits or old year) *) 361 if has_suspicious_year value || has_old_year value then begin 362 - let date_msg = "Year may be mistyped." in 363 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 364 - Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 365 value attr_name element_name date_msg tz_msg) 366 end else 367 Ok (* Valid date with normal year *) ··· 389 match validate_duration value with 390 | (true, _) -> Ok (* Valid duration P... *) 391 | (false, _) -> 392 - let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 393 - let date_msg = match date_error with 394 - | Some e -> Printf.sprintf "Bad date: %s." e 395 - | None -> "Bad date: The literal did not satisfy the date format." 396 - in 397 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 398 - value attr_name element_name tz_msg date_msg) 399 end 400 else begin 401 (* del/ins only allow date or datetime-with-timezone *)
··· 241 minute <> 0 && minute <> 30 && minute <> 45 242 in 243 if unusual_range then 244 + TzWarning "Hours in time zone designator should be from \"-12:00\" to \"+14:00\"" 245 else if unusual_minutes then 246 + TzWarning "Minutes in time zone designator should be either \"00\", \"30\", or \"45\"." 247 else 248 TzOk 249 end ··· 350 match validate_datetime_with_timezone value with 351 | DtOk -> Ok (* Valid datetime with timezone *) 352 | DtWarning w -> 353 + (* Valid but with warning - format matches Nu validator *) 354 + Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format." 355 value attr_name element_name w) 356 | DtError tz_error -> 357 (* Try just date - valid for all elements *) ··· 359 | (true, _) -> 360 (* Date is valid, but check for suspicious year (5+ digits or old year) *) 361 if has_suspicious_year value || has_old_year value then begin 362 + let date_msg = "Bad date: Year may be mistyped." in 363 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 364 + Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 365 value attr_name element_name date_msg tz_msg) 366 end else 367 Ok (* Valid date with normal year *) ··· 389 match validate_duration value with 390 | (true, _) -> Ok (* Valid duration P... *) 391 | (false, _) -> 392 + (* Use simplified message for time element matching Nu validator format *) 393 + Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad time-datetime: The literal did not satisfy the time-datetime format." 394 + value attr_name element_name) 395 end 396 else begin 397 (* del/ins only allow date or datetime-with-timezone *)
+6 -3
lib/html5_checker/specialized/importmap_checker.ml
··· 175 | SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *) 176 | InvalidScopeKey (* scope key is not a valid URL *) 177 | InvalidScopeValue of string (* scope value is not a valid URL *) 178 179 (** Check if a string looks like a valid URL-like specifier for importmaps *) 180 let is_valid_url_like s = ··· 255 | JNull -> () 256 | _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]")) 257 ) scope_imports 258 - | _ -> add_error (NotObject ("scopes[" ^ skey ^ "]")) 259 ) scope_members 260 | _ -> add_error (NotObject "scopes") 261 end ··· 290 Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop 291 | NotString _ -> 292 "A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values." 293 - | ForbiddenProperty prop -> 294 - Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d is not an allowed property." prop 295 | SlashKeyWithoutSlashValue prop -> 296 Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop 297 | InvalidScopeKey -> 298 "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings." 299 | InvalidScopeValue _ -> 300 "A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values." 301 302 let end_element state ~name ~namespace collector = 303 if namespace <> None then ()
··· 175 | SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *) 176 | InvalidScopeKey (* scope key is not a valid URL *) 177 | InvalidScopeValue of string (* scope value is not a valid URL *) 178 + | ScopeValueNotObject (* a value inside scopes is not a JSON object *) 179 180 (** Check if a string looks like a valid URL-like specifier for importmaps *) 181 let is_valid_url_like s = ··· 256 | JNull -> () 257 | _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]")) 258 ) scope_imports 259 + | _ -> add_error ScopeValueNotObject 260 ) scope_members 261 | _ -> add_error (NotObject "scopes") 262 end ··· 291 Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop 292 | NotString _ -> 293 "A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values." 294 + | ForbiddenProperty _ -> 295 + "A \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must contain a JSON object with no properties other than \xe2\x80\x9cimports\xe2\x80\x9d, \xe2\x80\x9cscopes\xe2\x80\x9d, and \xe2\x80\x9cintegrity\xe2\x80\x9d." 296 | SlashKeyWithoutSlashValue prop -> 297 Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop 298 | InvalidScopeKey -> 299 "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings." 300 | InvalidScopeValue _ -> 301 "A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values." 302 + | ScopeValueNotObject -> 303 + "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose values are also JSON objects." 304 305 let end_element state ~name ~namespace collector = 306 if namespace <> None then ()
+3 -3
lib/html5_checker/specialized/language_checker.ml
··· 57 | Some (deprecated, replacement) -> 58 Message_collector.add_warning collector 59 ~message:(Printf.sprintf 60 - "The language tag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead." 61 - deprecated replacement) 62 ~code:"deprecated-lang" 63 ?location 64 ~element 65 - ~attribute:"lang" 66 () 67 | None -> () 68
··· 57 | Some (deprecated, replacement) -> 58 Message_collector.add_warning collector 59 ~message:(Printf.sprintf 60 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad language tag: The language subtag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead." 61 + value attribute element deprecated replacement) 62 ~code:"deprecated-lang" 63 ?location 64 ~element 65 + ~attribute 66 () 67 | None -> () 68
+68 -27
lib/html5_checker/specialized/url_checker.ml
··· 239 let _ = contains_invalid_unicode decoded in 240 None 241 with Exit -> 242 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host." 243 url attr_name element_name) 244 245 (** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *) ··· 349 end else 350 None 351 352 - (** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). *) 353 - let check_data_uri_fragment url attr_name element_name = 354 match extract_scheme url with 355 | None -> None 356 | Some scheme -> 357 if scheme = "data" && String.contains url '#' then 358 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Fragment is not allowed for data: URIs according to RFC 2397." 359 - url attr_name element_name) 360 else 361 None 362 ··· 373 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 374 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *) 375 if String.length after_colon > 0 && after_colon.[0] = '/' then 376 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid %s: URL." 377 - url attr_name element_name scheme) 378 else 379 None 380 end else ··· 389 (* Get scheme data (after the colon) *) 390 let colon_pos = String.index url ':' in 391 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 392 (* Check for space in scheme data *) 393 - if String.contains scheme_data ' ' then 394 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed." 395 url attr_name element_name) 396 else ··· 508 try 509 let fragment_start = String.index url '#' in 510 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in 511 (* Check for second hash in fragment *) 512 - if String.contains fragment '#' then 513 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed." 514 url attr_name element_name) 515 (* Check for space in fragment *) ··· 560 else if String.contains userinfo ' ' then 561 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed." 562 url attr_name element_name) 563 - else 564 - (* Check for non-ASCII characters (like emoji) *) 565 - let has_non_ascii = String.exists (fun c -> Char.code c > 127) userinfo in 566 - if has_non_ascii then 567 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password." 568 - url attr_name element_name) 569 - else 570 (* Check for other invalid chars *) 571 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in 572 match invalid with ··· 574 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 575 url attr_name element_name c) 576 | None -> None 577 with _ -> None 578 579 (** Attributes where empty URL is an error. ··· 613 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace." 614 original_url attr_name element_name) 615 else None 616 - (* Check for newlines/tabs *) 617 - else if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then 618 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found." 619 - url attr_name element_name) 620 else begin 621 (* Check for relative URL issues first *) 622 match check_relative_url url attr_name element_name with 623 | Some err -> Some err ··· 659 url attr_name element_name) 660 else 661 662 - (* Check scheme data for non-special schemes *) 663 - match check_scheme_data url attr_name element_name with 664 - | Some err -> Some err 665 - | None -> 666 - 667 (* Check path segment for illegal characters *) 668 match check_path_segment url attr_name element_name with 669 | Some err -> Some err ··· 688 match host_opt with 689 | Some host -> validate_host host url attr_name element_name scheme_str 690 | None -> None 691 end 692 end 693 ··· 761 () 762 | Some _ -> 763 (* Check for data: URI with fragment - emit warning *) 764 - (match check_data_uri_fragment url "value" name with 765 | Some warn_msg -> 766 Message_collector.add_warning collector 767 ~message:warn_msg ··· 786 end 787 end; 788 (* Check microdata itemtype and itemid attributes for data: URI fragments *) 789 let itemtype_opt = get_attr_value "itemtype" attrs in 790 (match itemtype_opt with 791 | Some url when String.trim url <> "" -> 792 - (match check_data_uri_fragment url "itemtype" name with 793 | Some warn_msg -> 794 Message_collector.add_warning collector 795 ~message:warn_msg ··· 799 () 800 | None -> ()) 801 | _ -> ()); 802 let itemid_opt = get_attr_value "itemid" attrs in 803 (match itemid_opt with 804 | Some url when String.trim url <> "" ->
··· 239 let _ = contains_invalid_unicode decoded in 240 None 241 with Exit -> 242 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.." 243 url attr_name element_name) 244 245 (** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *) ··· 349 end else 350 None 351 352 + (** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). 353 + The is_absolute_url parameter controls whether to use "Bad URL:" or "Bad absolute URL:" in the message. *) 354 + let check_data_uri_fragment ?(is_absolute_url=false) url attr_name element_name = 355 match extract_scheme url with 356 | None -> None 357 | Some scheme -> 358 if scheme = "data" && String.contains url '#' then 359 + let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in 360 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397." 361 + url attr_name element_name url_type) 362 else 363 None 364 ··· 375 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 376 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *) 377 if String.length after_colon > 0 && after_colon.[0] = '/' then 378 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead." 379 + url attr_name element_name) 380 else 381 None 382 end else ··· 391 (* Get scheme data (after the colon) *) 392 let colon_pos = String.index url ':' in 393 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 394 + (* Check for tab in scheme data *) 395 + if String.contains scheme_data '\t' then 396 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed." 397 + url attr_name element_name) 398 + (* Check for newline in scheme data *) 399 + else if String.contains scheme_data '\n' then 400 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 401 + url attr_name element_name) 402 + (* Check for carriage return in scheme data *) 403 + else if String.contains scheme_data '\r' then 404 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 405 + url attr_name element_name) 406 (* Check for space in scheme data *) 407 + else if String.contains scheme_data ' ' then 408 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed." 409 url attr_name element_name) 410 else ··· 522 try 523 let fragment_start = String.index url '#' in 524 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in 525 + (* Check for backslash in fragment *) 526 + if String.contains fragment '\\' then 527 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed." 528 + url attr_name element_name) 529 (* Check for second hash in fragment *) 530 + else if String.contains fragment '#' then 531 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed." 532 url attr_name element_name) 533 (* Check for space in fragment *) ··· 578 else if String.contains userinfo ' ' then 579 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed." 580 url attr_name element_name) 581 + else begin 582 + (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *) 583 + let find_non_ascii_char userinfo = 584 + let decoder = Uutf.decoder ~encoding:`UTF_8 (`String userinfo) in 585 + let rec find () = 586 + match Uutf.decode decoder with 587 + | `End | `Await -> None 588 + | `Malformed _ -> find () 589 + | `Uchar uchar -> 590 + let code = Uchar.to_int uchar in 591 + (* Check if character is not allowed in userinfo *) 592 + (* Per URL Standard: only ASCII letters, digits, and certain symbols allowed *) 593 + if code > 127 then begin 594 + let buf = Buffer.create 8 in 595 + Buffer.add_utf_8_uchar buf uchar; 596 + Some (Buffer.contents buf) 597 + end else find () 598 + in 599 + find () 600 + in 601 + match find_non_ascii_char userinfo with 602 + | Some bad_char -> 603 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed." 604 + url attr_name element_name bad_char) 605 + | None -> 606 (* Check for other invalid chars *) 607 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in 608 match invalid with ··· 610 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 611 url attr_name element_name c) 612 | None -> None 613 + end 614 with _ -> None 615 616 (** Attributes where empty URL is an error. ··· 650 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace." 651 original_url attr_name element_name) 652 else None 653 + (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *) 654 else begin 655 + match check_scheme_data url attr_name element_name with 656 + | Some err -> Some err 657 + | None -> 658 + (* Check for newlines/tabs in special scheme URLs *) 659 + if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then 660 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found." 661 + url attr_name element_name) 662 + else begin 663 (* Check for relative URL issues first *) 664 match check_relative_url url attr_name element_name with 665 | Some err -> Some err ··· 701 url attr_name element_name) 702 else 703 704 (* Check path segment for illegal characters *) 705 match check_path_segment url attr_name element_name with 706 | Some err -> Some err ··· 725 match host_opt with 726 | Some host -> validate_host host url attr_name element_name scheme_str 727 | None -> None 728 + end 729 end 730 end 731 ··· 799 () 800 | Some _ -> 801 (* Check for data: URI with fragment - emit warning *) 802 + (* input[type=url] uses "Bad absolute URL:" format *) 803 + (match check_data_uri_fragment ~is_absolute_url:true url "value" name with 804 | Some warn_msg -> 805 Message_collector.add_warning collector 806 ~message:warn_msg ··· 825 end 826 end; 827 (* Check microdata itemtype and itemid attributes for data: URI fragments *) 828 + (* Microdata uses "Bad absolute URL:" format *) 829 let itemtype_opt = get_attr_value "itemtype" attrs in 830 (match itemtype_opt with 831 | Some url when String.trim url <> "" -> 832 + (match check_data_uri_fragment ~is_absolute_url:true url "itemtype" name with 833 | Some warn_msg -> 834 Message_collector.add_warning collector 835 ~message:warn_msg ··· 839 () 840 | None -> ()) 841 | _ -> ()); 842 + (* itemid uses "Bad URL:" format (not "Bad absolute URL:") *) 843 let itemid_opt = get_attr_value "itemid" attrs in 844 (match itemid_opt with 845 | Some url when String.trim url <> "" ->
+3 -3
lib/html5rw/parser/parser_tree_builder.ml
··· 664 let close_p_element t = 665 generate_implied_end_tags t ~except:"p" (); 666 (match current_node t with 667 - | Some n when n.Dom.name <> "p" -> parse_error t "expected-p" 668 | _ -> ()); 669 pop_until_tag t "p" 670 ··· 1215 end 1216 | Token.Tag { kind = Token.End; name = "p"; _ } -> 1217 if not (has_element_in_button_scope t "p") then begin 1218 - parse_error t "unexpected-end-tag"; 1219 ignore (insert_element t "p" ~push:true []) 1220 end; 1221 close_p_element t ··· 1321 t.frameset_ok <- false; 1322 t.mode <- Parser_insertion_mode.In_table 1323 | Token.Tag { kind = Token.End; name = "br"; _ } -> 1324 - parse_error t "unexpected-end-tag"; 1325 reconstruct_active_formatting t; 1326 ignore (insert_element t "br" ~push:true []); 1327 pop_current t;
··· 664 let close_p_element t = 665 generate_implied_end_tags t ~except:"p" (); 666 (match current_node t with 667 + | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements" 668 | _ -> ()); 669 pop_until_tag t "p" 670 ··· 1215 end 1216 | Token.Tag { kind = Token.End; name = "p"; _ } -> 1217 if not (has_element_in_button_scope t "p") then begin 1218 + parse_error t "no-p-element-in-scope"; 1219 ignore (insert_element t "p" ~push:true []) 1220 end; 1221 close_p_element t ··· 1321 t.frameset_ok <- false; 1322 t.mode <- Parser_insertion_mode.In_table 1323 | Token.Tag { kind = Token.End; name = "br"; _ } -> 1324 + parse_error t "end-tag-br"; 1325 reconstruct_active_formatting t; 1326 ignore (insert_element t "br" ~push:true []); 1327 pop_current t;
+5 -5
lib/html5rw/tokenizer/tokenizer_impl.ml
··· 264 code = 0x0B || 265 (code >= 0x0E && code <= 0x1F) || 266 code = 0x7F then 267 - error t "control-character-in-input-stream" 268 in 269 270 ··· 1937 error t "null-character-reference"; 1938 replacement_char 1939 end else if code > 0x10FFFF then begin 1940 - error t "character-reference-outside-unicode-range"; 1941 replacement_char 1942 end else if code >= 0xD800 && code <= 0xDFFF then begin 1943 - error t "surrogate-character-reference"; 1944 replacement_char 1945 end else if (code >= 0xFDD0 && code <= 0xFDEF) || 1946 List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF; ··· 1949 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF; 1950 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF; 1951 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin 1952 - error t "noncharacter-character-reference"; 1953 Entities.Numeric_ref.codepoint_to_utf8 code 1954 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B || 1955 (code >= 0x0D && code <= 0x1F) || 1956 (code >= 0x7F && code <= 0x9F) then begin 1957 - error t "control-character-reference"; 1958 (* Apply Windows-1252 replacement table for 0x80-0x9F *) 1959 match Entities.Numeric_ref.find_replacement code with 1960 | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
··· 264 code = 0x0B || 265 (code >= 0x0E && code <= 0x1F) || 266 code = 0x7F then 267 + error t (Printf.sprintf "control-character-in-input-stream:%04x" code) 268 in 269 270 ··· 1937 error t "null-character-reference"; 1938 replacement_char 1939 end else if code > 0x10FFFF then begin 1940 + error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code); 1941 replacement_char 1942 end else if code >= 0xD800 && code <= 0xDFFF then begin 1943 + error t (Printf.sprintf "surrogate-character-reference:%04x" code); 1944 replacement_char 1945 end else if (code >= 0xFDD0 && code <= 0xFDEF) || 1946 List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF; ··· 1949 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF; 1950 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF; 1951 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin 1952 + error t (Printf.sprintf "noncharacter-character-reference:%05x" code); 1953 Entities.Numeric_ref.codepoint_to_utf8 code 1954 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B || 1955 (code >= 0x0D && code <= 0x1F) || 1956 (code >= 0x7F && code <= 0x9F) then begin 1957 + error t (Printf.sprintf "control-character-reference:%04x" code); 1958 (* Apply Windows-1252 replacement table for 0x80-0x9F *) 1959 match Entities.Numeric_ref.find_replacement code with 1960 | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
+5 -5
lib/html5rw/tokenizer/tokenizer_stream.ml
··· 99 let check_utf8_codepoint t lead_byte = 100 let b0 = Char.code lead_byte in 101 if b0 < 0x80 then 102 - (* ASCII - no surrogates or noncharacters possible in this range except control chars *) 103 () 104 else if b0 >= 0xC2 && b0 <= 0xDF then begin 105 (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *) ··· 112 (* C1 controls: U+0080 to U+009F *) 113 if cp >= 0x80 && cp <= 0x9F then 114 (match t.error_callback with 115 - | Some cb -> cb "control-character-in-input-stream" 116 | None -> ()) 117 | Some c1 -> 118 push_back_char t c1 ··· 132 (* Check for surrogates and noncharacters *) 133 (match t.error_callback with 134 | Some cb -> 135 - if is_surrogate cp then cb "surrogate-in-input-stream" 136 - else if is_noncharacter cp then cb "noncharacter-in-input-stream" 137 | None -> ()) 138 | Some c2 -> 139 push_back_char t c2; ··· 162 (* Check for noncharacters (no surrogates in 4-byte range) *) 163 (match t.error_callback with 164 | Some cb -> 165 - if is_noncharacter cp then cb "noncharacter-in-input-stream" 166 | None -> ()) 167 | Some c3 -> 168 push_back_char t c3;
··· 99 let check_utf8_codepoint t lead_byte = 100 let b0 = Char.code lead_byte in 101 if b0 < 0x80 then 102 + (* ASCII - control characters are handled in tokenizer_impl.ml *) 103 () 104 else if b0 >= 0xC2 && b0 <= 0xDF then begin 105 (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *) ··· 112 (* C1 controls: U+0080 to U+009F *) 113 if cp >= 0x80 && cp <= 0x9F then 114 (match t.error_callback with 115 + | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp) 116 | None -> ()) 117 | Some c1 -> 118 push_back_char t c1 ··· 132 (* Check for surrogates and noncharacters *) 133 (match t.error_callback with 134 | Some cb -> 135 + if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp) 136 + else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp) 137 | None -> ()) 138 | Some c2 -> 139 push_back_char t c2; ··· 162 (* Check for noncharacters (no surrogates in 4-byte range) *) 163 (match t.error_callback with 164 | Some cb -> 165 + if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp) 166 | None -> ()) 167 | Some c3 -> 168 push_back_char t c3;