OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+57 -16
lib/html5_checker/datatype/dt_autocomplete.ml
··· 147 147 is_contact_details := true 148 148 | _ -> ()); 149 149 150 + (* Check if any token in the list is shipping/billing *) 151 + let find_shipping_billing tokens = 152 + List.find_opt (fun t -> t = "shipping" || t = "billing") tokens 153 + in 154 + 155 + (* Check if any token in the list is a contact type *) 156 + let find_contact_type tokens = 157 + List.find_opt (fun t -> List.mem t contact_types) tokens 158 + in 159 + 160 + (* Check if any token in the list is a section-* indicator *) 161 + let find_section tokens = 162 + List.find_opt (fun t -> starts_with t "section-") tokens 163 + in 164 + 150 165 (* Process remaining tokens *) 151 - let process_field_tokens = function 152 - | [] -> Error "A list of autofill details tokens must contain an autofill field name" 166 + let process_field_tokens tokens = 167 + match tokens with 168 + | [] -> Error "A list of autofill details tokens must contain an autofill field name." 153 169 | [ "webauthn" ] -> 154 170 Error 155 171 "The token \"webauthn\" must not be the only token in a list of \ 156 - autofill detail tokens" 172 + autofill detail tokens." 157 173 | [ field_name ] -> 158 174 if not (List.mem field_name all_field_names) then 159 175 Error 160 176 (Printf.sprintf 161 - "The string \"%s\" is not a valid autofill field name" 177 + "The string \"%s\" is not a valid autofill field name." 162 178 field_name) 163 179 else if !is_contact_details && not (List.mem field_name contact_field_names) 164 180 then 165 181 Error 166 182 (Printf.sprintf 167 183 "The autofill field name \"%s\" is not allowed in contact \ 168 - context" 184 + context." 169 185 field_name) 170 186 else Ok () 171 187 | [ field_name; "webauthn" ] -> 172 188 if not (List.mem field_name all_field_names) then 173 189 Error 174 190 (Printf.sprintf 175 - "The string \"%s\" is not a valid autofill field name" 191 + "The string \"%s\" is not a valid autofill field name." 176 192 field_name) 177 193 else if !is_contact_details && not (List.mem field_name contact_field_names) 178 194 then 179 195 Error 180 196 (Printf.sprintf 181 197 "The autofill field name \"%s\" is not allowed in contact \ 182 - context" 198 + context." 183 199 field_name) 184 200 else Ok () 185 201 | token :: _ when List.mem token contact_types -> 186 202 Error 187 203 (Printf.sprintf 188 - "The token \"%s\" must only appear before any autofill field names" 204 + "The token \"%s\" must only appear before any autofill field names." 189 205 token) 190 206 | token :: _ when starts_with token "section-" -> 191 207 Error 192 208 "A \"section-*\" indicator must only appear as the first token in a \ 193 - list of autofill detail tokens" 209 + list of autofill detail tokens." 194 210 | "shipping" :: _ | "billing" :: _ as toks -> 195 211 Error 196 212 (Printf.sprintf 197 213 "The token \"%s\" must only appear as either the first token in a \ 198 214 list of autofill detail tokens, or, if the first token is a \ 199 - \"section-*\" indicator, as the second token" 215 + \"section-*\" indicator, as the second token." 200 216 (List.hd toks)) 201 217 | _ :: "webauthn" :: _ :: _ -> 202 218 Error 203 219 "The token \"webauthn\" must only appear as the very last token in a \ 204 - list of autofill detail tokens" 205 - | _ :: _ :: _ -> 206 - Error 207 - "A list of autofill details tokens must not contain more than one \ 208 - autofill field name" 220 + list of autofill detail tokens." 221 + | _ :: rest -> 222 + (* Check if any remaining token is a section-* indicator - position error takes precedence *) 223 + (match find_section rest with 224 + | Some _ -> 225 + Error 226 + "A \"section-*\" indicator must only appear as the first token in a \ 227 + list of autofill detail tokens." 228 + | None -> 229 + (* Check if any remaining token is a contact type - position error takes precedence *) 230 + match find_contact_type rest with 231 + | Some ct_token -> 232 + Error 233 + (Printf.sprintf 234 + "The token \"%s\" must only appear before any autofill field names." 235 + ct_token) 236 + | None -> 237 + (* Check if any remaining token is shipping/billing - position error takes precedence *) 238 + match find_shipping_billing rest with 239 + | Some sb_token -> 240 + Error 241 + (Printf.sprintf 242 + "The token \"%s\" must only appear as either the first token in a \ 243 + list of autofill detail tokens, or, if the first token is a \ 244 + \"section-*\" indicator, as the second token." 245 + sb_token) 246 + | None -> 247 + Error 248 + "A list of autofill details tokens must not contain more than one \ 249 + autofill field name.") 209 250 in 210 251 process_field_tokens !tokens 211 252 212 253 (** Validate autocomplete value *) 213 254 let validate_autocomplete s = 214 255 let trimmed = trim_whitespace s in 215 - if String.length trimmed = 0 then Error "Must not be empty" 256 + if String.length trimmed = 0 then Error "Must not be empty." 216 257 else if trimmed = "on" || trimmed = "off" then Ok () 217 258 else 218 259 let tokens = split_on_whitespace trimmed in
+60 -4
lib/html5_checker/parse_error_bridge.ml
··· 11 11 Message.make_location ~line ~column ?system_id () 12 12 in 13 13 let code_str = Html5rw.Parse_error_code.to_string code in 14 - let message = match code with 14 + let (message, final_code) = match code with 15 15 | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> 16 - "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag." 17 - | _ -> Printf.sprintf "Parse error: %s" code_str 16 + ("Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag.", code_str) 17 + | Html5rw.Parse_error_code.Tree_construction_error s -> 18 + (* Check for control-character/noncharacter/surrogate with codepoint info *) 19 + (try 20 + if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then 21 + let colon_pos = String.index s ':' in 22 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 23 + let cp = int_of_string ("0x" ^ cp_str) in 24 + (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 25 + else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then 26 + let colon_pos = String.index s ':' in 27 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 28 + let cp = int_of_string ("0x" ^ cp_str) in 29 + (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 30 + else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then 31 + let colon_pos = String.index s ':' in 32 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 33 + let cp = int_of_string ("0x" ^ cp_str) in 34 + (Printf.sprintf "Forbidden code point U+%04x." cp, "forbidden-codepoint") 35 + (* Character reference errors *) 36 + else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then 37 + let cp_str = String.sub s 28 (String.length s - 28) in 38 + let cp = int_of_string ("0x" ^ cp_str) in 39 + if cp = 0x0D then 40 + ("A numeric character reference expanded to carriage return.", "control-character-reference") 41 + else 42 + (Printf.sprintf "Character reference expands to a control character (U+%04x)." cp, "control-character-reference") 43 + else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then 44 + let colon_pos = String.index s ':' in 45 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 46 + let cp = int_of_string ("0x" ^ cp_str) in 47 + (* U+FDD0-U+FDEF are "permanently unassigned" *) 48 + if cp >= 0xFDD0 && cp <= 0xFDEF then 49 + ("Character reference expands to a permanently unassigned code point.", "noncharacter-character-reference") 50 + (* Astral noncharacters (planes 1-16) *) 51 + else if cp >= 0x10000 then 52 + (Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp, "noncharacter-character-reference") 53 + else 54 + (Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp, "noncharacter-character-reference") 55 + else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then 56 + let colon_pos = String.index s ':' in 57 + let _ = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 58 + ("Character reference outside the permissible Unicode range.", "character-reference-outside-unicode-range") 59 + else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then 60 + let colon_pos = String.index s ':' in 61 + let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 62 + let cp = int_of_string ("0x" ^ cp_str) in 63 + (Printf.sprintf "Character reference expands to a surrogate (U+%04x)." cp, "surrogate-character-reference") 64 + else if s = "no-p-element-in-scope" then 65 + ("No \xe2\x80\x9cp\xe2\x80\x9d element in scope but a \xe2\x80\x9cp\xe2\x80\x9d end tag seen.", "no-p-element-in-scope") 66 + else if s = "end-tag-p-implied-but-open-elements" then 67 + ("End tag \xe2\x80\x9cp\xe2\x80\x9d implied, but there were open elements.", "end-tag-p-implied") 68 + else if s = "end-tag-br" then 69 + ("End tag \xe2\x80\x9cbr\xe2\x80\x9d.", "end-tag-br") 70 + else 71 + (Printf.sprintf "Parse error: %s" s, s) 72 + with _ -> (Printf.sprintf "Parse error: %s" s, s)) 73 + | _ -> (Printf.sprintf "Parse error: %s" code_str, code_str) 18 74 in 19 75 Message.error 20 76 ~message 21 - ~code:code_str 77 + ~code:final_code 22 78 ~location 23 79 () 24 80
+3 -1
lib/html5_checker/semantic/form_checker.ml
··· 32 32 match Dt_autocomplete.validate_autocomplete value with 33 33 | Ok () -> () 34 34 | Error msg -> 35 + (* Nu validator prefixes autocomplete errors with "Bad autocomplete detail tokens (any): " for select/textarea, but not for input *) 36 + let reason = if element_name = "input" then msg else "Bad autocomplete detail tokens (any): " ^ msg in 35 37 Message_collector.add_typed collector 36 38 (Error_code.Bad_attr_value { 37 39 element = element_name; 38 40 attr = "autocomplete"; 39 41 value; 40 - reason = msg 42 + reason 41 43 }) 42 44 end 43 45
+23 -8
lib/html5_checker/semantic/id_checker.ml
··· 193 193 so we pass None. In a full implementation, this would be passed 194 194 from the parser. *) 195 195 let location = None in 196 - process_attrs state ~element:name ~attrs ~location collector 196 + process_attrs state ~element:name ~attrs ~location collector; 197 + 198 + (* Special check: map element must have matching id and name if both present *) 199 + if name = "map" then begin 200 + let id_opt = List.find_map (fun (n, v) -> if n = "id" then Some v else None) attrs in 201 + let name_opt = List.find_map (fun (n, v) -> if n = "name" then Some v else None) attrs in 202 + match id_opt, name_opt with 203 + | Some id_val, Some name_val when id_val <> name_val -> 204 + Message_collector.add_typed collector Error_code.Map_id_name_mismatch 205 + | _ -> () 206 + end 197 207 198 208 let end_element _state ~name:_ ~namespace:_ _collector = 199 209 () ··· 204 214 let end_document state collector = 205 215 (* Check all ID references point to existing IDs *) 206 216 List.iter (fun ref -> 207 - if not (Hashtbl.mem state.ids ref.referenced_id) then 208 - (* Use generic for dangling references - format may vary *) 209 - Message_collector.add_typed collector 210 - (Error_code.Generic { 211 - message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document." 212 - (Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id) 213 - }) 217 + if not (Hashtbl.mem state.ids ref.referenced_id) then begin 218 + (* Use specific error for list attribute on input *) 219 + if ref.attribute = "list" && ref.referring_element = "input" then 220 + Message_collector.add_typed collector Error_code.List_attr_requires_datalist 221 + else 222 + (* Use generic for dangling references - format may vary *) 223 + Message_collector.add_typed collector 224 + (Error_code.Generic { 225 + message = Printf.sprintf "The %s attribute on the %s element refers to ID %s which does not exist in the document." 226 + (Error_code.q ref.attribute) (Error_code.q ref.referring_element) (Error_code.q ref.referenced_id) 227 + }) 228 + end 214 229 ) state.references; 215 230 216 231 (* Check all usemap references point to existing map names *)
+85
lib/html5_checker/semantic/lang_detecting_checker.ml
··· 141 141 | "zh-tw" -> "zh-hant" 142 142 | _ -> code 143 143 144 + (* Traditional Chinese-only characters (simplified versions don't exist) *) 145 + (* These are characters that were simplified in Simplified Chinese *) 146 + let traditional_chars = [| 147 + 0x570B; (* 國 -> 国 *) 148 + 0x5B78; (* 學 -> 学 *) 149 + 0x8AAA; (* 說 -> 说 *) 150 + 0x66F8; (* 書 -> 书 *) 151 + 0x8A9E; (* 語 -> 语 *) 152 + 0x6642; (* 時 -> 时 *) 153 + 0x6703; (* 會 -> 会 *) 154 + 0x7D93; (* 經 -> 经 *) 155 + 0x6A5F; (* 機 -> 机 *) 156 + 0x767C; (* 發 -> 发 *) 157 + 0x554F; (* 問 -> 问 *) 158 + 0x6578; (* 數 -> 数 *) 159 + 0x5BE6; (* 實 -> 实 *) 160 + 0x958B; (* 開 -> 开 *) 161 + 0x95DC; (* 關 -> 关 *) 162 + 0x9577; (* 長 -> 长 *) 163 + 0x9AD4; (* 體 -> 体 *) 164 + 0x9EDE; (* 點 -> 点 *) 165 + 0x96FB; (* 電 -> 电 *) 166 + 0x8CC7; (* 資 -> 资 *) 167 + 0x7FA9; (* 義 -> 义 *) 168 + 0x8B93; (* 讓 -> 让 *) 169 + 0x9054; (* 達 -> 达 *) 170 + 0x71DF; (* 營 -> 营 *) 171 + 0x8655; (* 處 -> 处 *) 172 + 0x6771; (* 東 -> 东 *) 173 + 0x8209; (* 舉 -> 举 *) 174 + 0x8A18; (* 記 -> 记 *) 175 + 0x5099; (* 備 -> 备 *) 176 + 0x5354; (* 協 -> 协 *) 177 + 0x8FA6; (* 辦 -> 办 *) 178 + 0x8457; (* 著 -> 着 *) 179 + 0x8F09; (* 載 -> 载 *) 180 + 0x52D9; (* 務 -> 务 *) 181 + 0x7121; (* 無 -> 无 *) 182 + 0x5F9E; (* 從 -> 从 *) 183 + 0x8B58; (* 識 -> 识 *) 184 + 0x8207; (* 與 -> 与 *) 185 + 0x78BA; (* 確 -> 确 *) 186 + 0x904E; (* 過 -> 过 *) 187 + 0x8A72; (* 該 -> 该 *) 188 + 0x9810; (* 預 -> 预 *) 189 + 0x7576; (* 當 -> 当 *) 190 + 0x5831; (* 報 -> 报 *) 191 + 0x9054; (* 達 -> 达 *) 192 + 0x91AB; (* 醫 -> 医 *) 193 + 0x5718; (* 團 -> 团 *) 194 + 0x8B70; (* 議 -> 议 *) 195 + 0x7D71; (* 統 -> 统 *) 196 + 0x898F; (* 規 -> 规 *) 197 + |] 198 + 199 + (* Check if text contains enough Traditional Chinese characters *) 200 + let is_traditional_chinese text = 201 + let count = ref 0 in 202 + let total = ref 0 in 203 + let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 204 + let rec process () = 205 + if !total >= 1000 then () (* Sample first 1000 chars *) 206 + else match Uutf.decode decoder with 207 + | `Await | `End -> () 208 + | `Malformed _ -> process () 209 + | `Uchar uchar -> 210 + let code = Uchar.to_int uchar in 211 + (* Count CJK characters *) 212 + if code >= 0x4E00 && code <= 0x9FFF then begin 213 + incr total; 214 + (* Check if it's a Traditional-only character *) 215 + if Array.exists (fun c -> c = code) traditional_chars then 216 + incr count 217 + end; 218 + process () 219 + in 220 + process (); 221 + (* If > 2% are Traditional-only characters, it's Traditional Chinese *) 222 + !total > 100 && (float_of_int !count /. float_of_int !total) > 0.02 223 + 144 224 let start_element state ~name ~namespace ~attrs _collector = 145 225 let name_lower = String.lowercase_ascii name in 146 226 let ns = Option.value namespace ~default:"" in ··· 226 306 let original_declared = match state.html_lang with 227 307 | Some l -> l 228 308 | None -> "" 309 + in 310 + (* Correct for Traditional vs Simplified Chinese misdetection *) 311 + let detected_lang = 312 + if detected_lang = "zh-cn" && is_traditional_chinese text then "zh-tw" 313 + else detected_lang 229 314 in 230 315 let detected_code = detected_lang in (* Keep full code like zh-tw *) 231 316 let detected_name = get_language_name detected_lang in
+23 -3
lib/html5_checker/specialized/aria_checker.ml
··· 368 368 mutable stack : stack_node list; 369 369 mutable has_active_tab : bool; (* Whether document has role=tab with aria-selected=true *) 370 370 mutable has_tabpanel : bool; (* Whether document has role=tabpanel elements *) 371 + mutable visible_main_count : int; (* Count of visible elements with role=main *) 371 372 } 372 373 373 - let create () = { stack = []; has_active_tab = false; has_tabpanel = false } 374 + let create () = { stack = []; has_active_tab = false; has_tabpanel = false; visible_main_count = 0 } 374 375 375 376 let reset state = 376 377 state.stack <- []; 377 378 state.has_active_tab <- false; 378 - state.has_tabpanel <- false 379 + state.has_tabpanel <- false; 380 + state.visible_main_count <- 0 379 381 380 382 (** Check if any ancestor has one of the required roles. *) 381 383 let has_required_ancestor_role state required_roles = ··· 451 453 if aria_selected = Some "true" then state.has_active_tab <- true 452 454 end; 453 455 if List.mem "tabpanel" explicit_roles then state.has_tabpanel <- true; 456 + 457 + (* Track visible main elements (explicit role=main or implicit main role) *) 458 + let is_hidden = 459 + let aria_hidden = List.assoc_opt "aria-hidden" attrs in 460 + aria_hidden = Some "true" 461 + in 462 + if not is_hidden then begin 463 + (* Check explicit role *) 464 + if List.mem "main" explicit_roles then 465 + state.visible_main_count <- state.visible_main_count + 1 466 + (* Check implicit role from <main> element *) 467 + else if name_lower = "main" then 468 + state.visible_main_count <- state.visible_main_count + 1 469 + end; 454 470 455 471 (* Check br/wbr role restrictions - only none/presentation allowed *) 456 472 if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin ··· 784 800 Message_collector.add_error collector 785 801 ~message:"Every active \xe2\x80\x9crole=tab\xe2\x80\x9d element must have a corresponding \xe2\x80\x9crole=tabpanel\xe2\x80\x9d element." 786 802 ~code:"tab-without-tabpanel" 787 - () 803 + (); 804 + 805 + (* Check for multiple visible main elements *) 806 + if state.visible_main_count > 1 then 807 + Message_collector.add_typed collector Error_code.Multiple_main_visible 788 808 789 809 let checker = (module struct 790 810 type nonrec state = state
+7 -7
lib/html5_checker/specialized/attr_restrictions_checker.ml
··· 250 250 Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer." 251 251 attr_name name 252 252 else if String.contains attr_value '%' then 253 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 253 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 254 254 attr_value attr_name name 255 255 else if String.length attr_value > 0 && attr_value.[0] = '-' then 256 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The value must be non-negative." 256 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead." 257 257 attr_value attr_name name 258 258 else 259 259 (* Find first non-digit character *) ··· 268 268 in 269 269 match bad_char with 270 270 | Some c -> 271 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 271 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 272 272 attr_value attr_name name c 273 273 | None -> 274 - Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Expected a digit." 274 + Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit." 275 275 attr_value attr_name name 276 276 in 277 277 Message_collector.add_error collector ··· 455 455 List.iter (fun key -> 456 456 if count_codepoints key > 1 then 457 457 Message_collector.add_error collector 458 - ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The space-separated list of key labels contains a value \xe2\x80\x9c%s\xe2\x80\x9d that consists of more than a single code point." 459 - attr_value attr_name name key) 458 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character." 459 + attr_value attr_name name) 460 460 ~code:"bad-attribute-value" 461 461 ~element:name ~attribute:attr_name () 462 462 ) keys; ··· 466 466 | k :: rest -> 467 467 if List.mem k seen then 468 468 Message_collector.add_error collector 469 - ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Duplicate key label." 469 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique." 470 470 attr_value attr_name name) 471 471 ~code:"bad-attribute-value" 472 472 ~element:name ~attribute:attr_name ()
+9 -13
lib/html5_checker/specialized/datetime_checker.ml
··· 241 241 minute <> 0 && minute <> 30 && minute <> 45 242 242 in 243 243 if unusual_range then 244 - TzWarning "unusual timezone offset" 244 + TzWarning "Hours in time zone designator should be from \"-12:00\" to \"+14:00\"" 245 245 else if unusual_minutes then 246 - TzWarning "unusual timezone offset minutes" 246 + TzWarning "Minutes in time zone designator should be either \"00\", \"30\", or \"45\"." 247 247 else 248 248 TzOk 249 249 end ··· 350 350 match validate_datetime_with_timezone value with 351 351 | DtOk -> Ok (* Valid datetime with timezone *) 352 352 | DtWarning w -> 353 - (* Valid but with warning *) 354 - Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s." 353 + (* Valid but with warning - format matches Nu validator *) 354 + Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad datetime with timezone: %s Bad date: The literal did not satisfy the date format." 355 355 value attr_name element_name w) 356 356 | DtError tz_error -> 357 357 (* Try just date - valid for all elements *) ··· 359 359 | (true, _) -> 360 360 (* Date is valid, but check for suspicious year (5+ digits or old year) *) 361 361 if has_suspicious_year value || has_old_year value then begin 362 - let date_msg = "Year may be mistyped." in 362 + let date_msg = "Bad date: Year may be mistyped." in 363 363 let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 364 - Warning (Printf.sprintf "Possibly mistyped value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 364 + Warning (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 365 365 value attr_name element_name date_msg tz_msg) 366 366 end else 367 367 Ok (* Valid date with normal year *) ··· 389 389 match validate_duration value with 390 390 | (true, _) -> Ok (* Valid duration P... *) 391 391 | (false, _) -> 392 - let tz_msg = Printf.sprintf "Bad datetime with timezone: %s." tz_error in 393 - let date_msg = match date_error with 394 - | Some e -> Printf.sprintf "Bad date: %s." e 395 - | None -> "Bad date: The literal did not satisfy the date format." 396 - in 397 - Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s %s" 398 - value attr_name element_name tz_msg date_msg) 392 + (* Use simplified message for time element matching Nu validator format *) 393 + Error (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad time-datetime: The literal did not satisfy the time-datetime format." 394 + value attr_name element_name) 399 395 end 400 396 else begin 401 397 (* del/ins only allow date or datetime-with-timezone *)
+6 -3
lib/html5_checker/specialized/importmap_checker.ml
··· 175 175 | SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *) 176 176 | InvalidScopeKey (* scope key is not a valid URL *) 177 177 | InvalidScopeValue of string (* scope value is not a valid URL *) 178 + | ScopeValueNotObject (* a value inside scopes is not a JSON object *) 178 179 179 180 (** Check if a string looks like a valid URL-like specifier for importmaps *) 180 181 let is_valid_url_like s = ··· 255 256 | JNull -> () 256 257 | _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]")) 257 258 ) scope_imports 258 - | _ -> add_error (NotObject ("scopes[" ^ skey ^ "]")) 259 + | _ -> add_error ScopeValueNotObject 259 260 ) scope_members 260 261 | _ -> add_error (NotObject "scopes") 261 262 end ··· 290 291 Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop 291 292 | NotString _ -> 292 293 "A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values." 293 - | ForbiddenProperty prop -> 294 - Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d is not an allowed property." prop 294 + | ForbiddenProperty _ -> 295 + "A \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must contain a JSON object with no properties other than \xe2\x80\x9cimports\xe2\x80\x9d, \xe2\x80\x9cscopes\xe2\x80\x9d, and \xe2\x80\x9cintegrity\xe2\x80\x9d." 295 296 | SlashKeyWithoutSlashValue prop -> 296 297 Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop 297 298 | InvalidScopeKey -> 298 299 "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings." 299 300 | InvalidScopeValue _ -> 300 301 "A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values." 302 + | ScopeValueNotObject -> 303 + "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose values are also JSON objects." 301 304 302 305 let end_element state ~name ~namespace collector = 303 306 if namespace <> None then ()
+3 -3
lib/html5_checker/specialized/language_checker.ml
··· 57 57 | Some (deprecated, replacement) -> 58 58 Message_collector.add_warning collector 59 59 ~message:(Printf.sprintf 60 - "The language tag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead." 61 - deprecated replacement) 60 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad language tag: The language subtag \xe2\x80\x9c%s\xe2\x80\x9d is deprecated. Use \xe2\x80\x9c%s\xe2\x80\x9d instead." 61 + value attribute element deprecated replacement) 62 62 ~code:"deprecated-lang" 63 63 ?location 64 64 ~element 65 - ~attribute:"lang" 65 + ~attribute 66 66 () 67 67 | None -> () 68 68
+68 -27
lib/html5_checker/specialized/url_checker.ml
··· 239 239 let _ = contains_invalid_unicode decoded in 240 240 None 241 241 with Exit -> 242 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host." 242 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid host: A label or domain name contains disallowed characters.." 243 243 url attr_name element_name) 244 244 245 245 (** Check if string contains a character (checking both ASCII and UTF-8 fullwidth variants). *) ··· 349 349 end else 350 350 None 351 351 352 - (** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). *) 353 - let check_data_uri_fragment url attr_name element_name = 352 + (** Check for data: URI with fragment - this is a warning (RFC 2397 forbids fragments). 353 + The is_absolute_url parameter controls whether to use "Bad URL:" or "Bad absolute URL:" in the message. *) 354 + let check_data_uri_fragment ?(is_absolute_url=false) url attr_name element_name = 354 355 match extract_scheme url with 355 356 | None -> None 356 357 | Some scheme -> 357 358 if scheme = "data" && String.contains url '#' then 358 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Fragment is not allowed for data: URIs according to RFC 2397." 359 - url attr_name element_name) 359 + let url_type = if is_absolute_url then "Bad absolute URL:" else "Bad URL:" in 360 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: %s Fragment is not allowed for data: URIs according to RFC 2397." 361 + url attr_name element_name url_type) 360 362 else 361 363 None 362 364 ··· 373 375 let after_colon = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 374 376 (* data: URLs should NOT start with / - format is data:[mediatype][;base64],data *) 375 377 if String.length after_colon > 0 && after_colon.[0] = '/' then 376 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Invalid %s: URL." 377 - url attr_name element_name scheme) 378 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Expected a token character or a semicolon but saw \xe2\x80\x9c/\xe2\x80\x9d instead." 379 + url attr_name element_name) 378 380 else 379 381 None 380 382 end else ··· 389 391 (* Get scheme data (after the colon) *) 390 392 let colon_pos = String.index url ':' in 391 393 let scheme_data = String.sub url (colon_pos + 1) (String.length url - colon_pos - 1) in 394 + (* Check for tab in scheme data *) 395 + if String.contains scheme_data '\t' then 396 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: tab is not allowed." 397 + url attr_name element_name) 398 + (* Check for newline in scheme data *) 399 + else if String.contains scheme_data '\n' then 400 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 401 + url attr_name element_name) 402 + (* Check for carriage return in scheme data *) 403 + else if String.contains scheme_data '\r' then 404 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: line break is not allowed." 405 + url attr_name element_name) 392 406 (* Check for space in scheme data *) 393 - if String.contains scheme_data ' ' then 407 + else if String.contains scheme_data ' ' then 394 408 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in scheme data: space is not allowed." 395 409 url attr_name element_name) 396 410 else ··· 508 522 try 509 523 let fragment_start = String.index url '#' in 510 524 let fragment = String.sub url (fragment_start + 1) (String.length url - fragment_start - 1) in 525 + (* Check for backslash in fragment *) 526 + if String.contains fragment '\\' then 527 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c\\\xe2\x80\x9d is not allowed." 528 + url attr_name element_name) 511 529 (* Check for second hash in fragment *) 512 - if String.contains fragment '#' then 530 + else if String.contains fragment '#' then 513 531 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in fragment: \xe2\x80\x9c#\xe2\x80\x9d is not allowed." 514 532 url attr_name element_name) 515 533 (* Check for space in fragment *) ··· 560 578 else if String.contains userinfo ' ' then 561 579 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: space is not allowed." 562 580 url attr_name element_name) 563 - else 564 - (* Check for non-ASCII characters (like emoji) *) 565 - let has_non_ascii = String.exists (fun c -> Char.code c > 127) userinfo in 566 - if has_non_ascii then 567 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password." 568 - url attr_name element_name) 569 - else 581 + else begin 582 + (* Check for non-ASCII characters (like emoji) using UTF-8 decoding *) 583 + let find_non_ascii_char userinfo = 584 + let decoder = Uutf.decoder ~encoding:`UTF_8 (`String userinfo) in 585 + let rec find () = 586 + match Uutf.decode decoder with 587 + | `End | `Await -> None 588 + | `Malformed _ -> find () 589 + | `Uchar uchar -> 590 + let code = Uchar.to_int uchar in 591 + (* Check if character is not allowed in userinfo *) 592 + (* Per URL Standard: only ASCII letters, digits, and certain symbols allowed *) 593 + if code > 127 then begin 594 + let buf = Buffer.create 8 in 595 + Buffer.add_utf_8_uchar buf uchar; 596 + Some (Buffer.contents buf) 597 + end else find () 598 + in 599 + find () 600 + in 601 + match find_non_ascii_char userinfo with 602 + | Some bad_char -> 603 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%s\xe2\x80\x9d is not allowed." 604 + url attr_name element_name bad_char) 605 + | None -> 570 606 (* Check for other invalid chars *) 571 607 let invalid = List.find_opt (fun c -> String.contains userinfo c) invalid_userinfo_chars in 572 608 match invalid with ··· 574 610 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character in user or password: \xe2\x80\x9c%c\xe2\x80\x9d is not allowed." 575 611 url attr_name element_name c) 576 612 | None -> None 613 + end 577 614 with _ -> None 578 615 579 616 (** Attributes where empty URL is an error. ··· 613 650 Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Illegal character: leading/trailing ASCII whitespace." 614 651 original_url attr_name element_name) 615 652 else None 616 - (* Check for newlines/tabs *) 617 - else if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then 618 - Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found." 619 - url attr_name element_name) 653 + (* Check scheme data for non-special schemes FIRST - handles tab/newline/CR in scheme data *) 620 654 else begin 655 + match check_scheme_data url attr_name element_name with 656 + | Some err -> Some err 657 + | None -> 658 + (* Check for newlines/tabs in special scheme URLs *) 659 + if String.contains url '\n' || String.contains url '\r' || String.contains url '\t' then 660 + Some (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad URL: Tab, new line or carriage return found." 661 + url attr_name element_name) 662 + else begin 621 663 (* Check for relative URL issues first *) 622 664 match check_relative_url url attr_name element_name with 623 665 | Some err -> Some err ··· 659 701 url attr_name element_name) 660 702 else 661 703 662 - (* Check scheme data for non-special schemes *) 663 - match check_scheme_data url attr_name element_name with 664 - | Some err -> Some err 665 - | None -> 666 - 667 704 (* Check path segment for illegal characters *) 668 705 match check_path_segment url attr_name element_name with 669 706 | Some err -> Some err ··· 688 725 match host_opt with 689 726 | Some host -> validate_host host url attr_name element_name scheme_str 690 727 | None -> None 728 + end 691 729 end 692 730 end 693 731 ··· 761 799 () 762 800 | Some _ -> 763 801 (* Check for data: URI with fragment - emit warning *) 764 - (match check_data_uri_fragment url "value" name with 802 + (* input[type=url] uses "Bad absolute URL:" format *) 803 + (match check_data_uri_fragment ~is_absolute_url:true url "value" name with 765 804 | Some warn_msg -> 766 805 Message_collector.add_warning collector 767 806 ~message:warn_msg ··· 786 825 end 787 826 end; 788 827 (* Check microdata itemtype and itemid attributes for data: URI fragments *) 828 + (* Microdata uses "Bad absolute URL:" format *) 789 829 let itemtype_opt = get_attr_value "itemtype" attrs in 790 830 (match itemtype_opt with 791 831 | Some url when String.trim url <> "" -> 792 - (match check_data_uri_fragment url "itemtype" name with 832 + (match check_data_uri_fragment ~is_absolute_url:true url "itemtype" name with 793 833 | Some warn_msg -> 794 834 Message_collector.add_warning collector 795 835 ~message:warn_msg ··· 799 839 () 800 840 | None -> ()) 801 841 | _ -> ()); 842 + (* itemid uses "Bad URL:" format (not "Bad absolute URL:") *) 802 843 let itemid_opt = get_attr_value "itemid" attrs in 803 844 (match itemid_opt with 804 845 | Some url when String.trim url <> "" ->
+3 -3
lib/html5rw/parser/parser_tree_builder.ml
··· 664 664 let close_p_element t = 665 665 generate_implied_end_tags t ~except:"p" (); 666 666 (match current_node t with 667 - | Some n when n.Dom.name <> "p" -> parse_error t "expected-p" 667 + | Some n when n.Dom.name <> "p" -> parse_error t "end-tag-p-implied-but-open-elements" 668 668 | _ -> ()); 669 669 pop_until_tag t "p" 670 670 ··· 1215 1215 end 1216 1216 | Token.Tag { kind = Token.End; name = "p"; _ } -> 1217 1217 if not (has_element_in_button_scope t "p") then begin 1218 - parse_error t "unexpected-end-tag"; 1218 + parse_error t "no-p-element-in-scope"; 1219 1219 ignore (insert_element t "p" ~push:true []) 1220 1220 end; 1221 1221 close_p_element t ··· 1321 1321 t.frameset_ok <- false; 1322 1322 t.mode <- Parser_insertion_mode.In_table 1323 1323 | Token.Tag { kind = Token.End; name = "br"; _ } -> 1324 - parse_error t "unexpected-end-tag"; 1324 + parse_error t "end-tag-br"; 1325 1325 reconstruct_active_formatting t; 1326 1326 ignore (insert_element t "br" ~push:true []); 1327 1327 pop_current t;
+5 -5
lib/html5rw/tokenizer/tokenizer_impl.ml
··· 264 264 code = 0x0B || 265 265 (code >= 0x0E && code <= 0x1F) || 266 266 code = 0x7F then 267 - error t "control-character-in-input-stream" 267 + error t (Printf.sprintf "control-character-in-input-stream:%04x" code) 268 268 in 269 269 270 270 ··· 1937 1937 error t "null-character-reference"; 1938 1938 replacement_char 1939 1939 end else if code > 0x10FFFF then begin 1940 - error t "character-reference-outside-unicode-range"; 1940 + error t (Printf.sprintf "character-reference-outside-unicode-range:%x" code); 1941 1941 replacement_char 1942 1942 end else if code >= 0xD800 && code <= 0xDFFF then begin 1943 - error t "surrogate-character-reference"; 1943 + error t (Printf.sprintf "surrogate-character-reference:%04x" code); 1944 1944 replacement_char 1945 1945 end else if (code >= 0xFDD0 && code <= 0xFDEF) || 1946 1946 List.mem code [0xFFFE; 0xFFFF; 0x1FFFE; 0x1FFFF; 0x2FFFE; 0x2FFFF; ··· 1949 1949 0x9FFFE; 0x9FFFF; 0xAFFFE; 0xAFFFF; 0xBFFFE; 0xBFFFF; 1950 1950 0xCFFFE; 0xCFFFF; 0xDFFFE; 0xDFFFF; 0xEFFFE; 0xEFFFF; 1951 1951 0xFFFFE; 0xFFFFF; 0x10FFFE; 0x10FFFF] then begin 1952 - error t "noncharacter-character-reference"; 1952 + error t (Printf.sprintf "noncharacter-character-reference:%05x" code); 1953 1953 Entities.Numeric_ref.codepoint_to_utf8 code 1954 1954 end else if (code >= 0x01 && code <= 0x08) || code = 0x0B || 1955 1955 (code >= 0x0D && code <= 0x1F) || 1956 1956 (code >= 0x7F && code <= 0x9F) then begin 1957 - error t "control-character-reference"; 1957 + error t (Printf.sprintf "control-character-reference:%04x" code); 1958 1958 (* Apply Windows-1252 replacement table for 0x80-0x9F *) 1959 1959 match Entities.Numeric_ref.find_replacement code with 1960 1960 | Some replacement -> Entities.Numeric_ref.codepoint_to_utf8 replacement
+5 -5
lib/html5rw/tokenizer/tokenizer_stream.ml
··· 99 99 let check_utf8_codepoint t lead_byte = 100 100 let b0 = Char.code lead_byte in 101 101 if b0 < 0x80 then 102 - (* ASCII - no surrogates or noncharacters possible in this range except control chars *) 102 + (* ASCII - control characters are handled in tokenizer_impl.ml *) 103 103 () 104 104 else if b0 >= 0xC2 && b0 <= 0xDF then begin 105 105 (* 2-byte sequence: 110xxxxx 10xxxxxx -> U+0080 to U+07FF *) ··· 112 112 (* C1 controls: U+0080 to U+009F *) 113 113 if cp >= 0x80 && cp <= 0x9F then 114 114 (match t.error_callback with 115 - | Some cb -> cb "control-character-in-input-stream" 115 + | Some cb -> cb (Printf.sprintf "control-character-in-input-stream:%04x" cp) 116 116 | None -> ()) 117 117 | Some c1 -> 118 118 push_back_char t c1 ··· 132 132 (* Check for surrogates and noncharacters *) 133 133 (match t.error_callback with 134 134 | Some cb -> 135 - if is_surrogate cp then cb "surrogate-in-input-stream" 136 - else if is_noncharacter cp then cb "noncharacter-in-input-stream" 135 + if is_surrogate cp then cb (Printf.sprintf "surrogate-in-input-stream:%04x" cp) 136 + else if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%04x" cp) 137 137 | None -> ()) 138 138 | Some c2 -> 139 139 push_back_char t c2; ··· 162 162 (* Check for noncharacters (no surrogates in 4-byte range) *) 163 163 (match t.error_callback with 164 164 | Some cb -> 165 - if is_noncharacter cp then cb "noncharacter-in-input-stream" 165 + if is_noncharacter cp then cb (Printf.sprintf "noncharacter-in-input-stream:%05x" cp) 166 166 | None -> ()) 167 167 | Some c3 -> 168 168 push_back_char t c3;