OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+19
lib/htmlrw_check/attr_utils.ml
··· 1 + (** Common attribute utilities used across checkers. *) 2 + 3 + type attrs = (string * string) list 4 + 5 + let has_attr name attrs = 6 + List.exists (fun (n, _) -> String.lowercase_ascii n = name) attrs 7 + 8 + let get_attr name attrs = 9 + List.find_map (fun (n, v) -> 10 + if String.lowercase_ascii n = name then Some v else None 11 + ) attrs 12 + 13 + let get_attr_or name ~default attrs = 14 + Option.value ~default (get_attr name attrs) 15 + 16 + let is_non_empty_attr name attrs = 17 + match get_attr name attrs with 18 + | Some v -> String.trim v <> "" 19 + | None -> false
+2 -8
lib/htmlrw_check/datatype/dt_color.ml
··· 213 213 if String.length s = 0 then Error "Color value must not be empty" 214 214 else if List.mem s named_colors then Ok () 215 215 else if String.length s > 0 && s.[0] = '#' then validate_hex_color s 216 - else if 217 - String.length s > 4 218 - && (String.sub s 0 4 = "rgb(" || String.sub s 0 5 = "rgba(") 219 - then 216 + else if String.starts_with ~prefix:"rgb(" s || String.starts_with ~prefix:"rgba(" s then 220 217 (* Basic validation for rgb/rgba - just check balanced parens *) 221 218 if s.[String.length s - 1] = ')' then Ok () 222 219 else Error "rgb/rgba function must end with ')'" 223 - else if 224 - String.length s > 4 225 - && (String.sub s 0 4 = "hsl(" || String.sub s 0 5 = "hsla(") 226 - then 220 + else if String.starts_with ~prefix:"hsl(" s || String.starts_with ~prefix:"hsla(" s then 227 221 (* Basic validation for hsl/hsla - just check balanced parens *) 228 222 if s.[String.length s - 1] = ')' then Ok () 229 223 else Error "hsl/hsla function must end with ')'"
+2 -2
lib/htmlrw_check/datatype/dt_media_query.ml
··· 330 330 331 331 (* Get base feature name for error messages (strip min-/max- prefix) *) 332 332 let base_feature = 333 - if String.length feature > 4 && String.sub feature 0 4 = "min-" then 333 + if String.starts_with ~prefix:"min-" feature then 334 334 String.sub feature 4 (String.length feature - 4) 335 - else if String.length feature > 4 && String.sub feature 0 4 = "max-" then 335 + else if String.starts_with ~prefix:"max-" feature then 336 336 String.sub feature 4 (String.length feature - 4) 337 337 else 338 338 feature
+77 -102
lib/htmlrw_check/message_format.ml
··· 1 + (** Get effective system_id, preferring location's system_id over the passed one *) 2 + let get_system_id ?system_id loc_system_id = 3 + loc_system_id 4 + |> Option.fold ~none:system_id ~some:Option.some 5 + |> Option.value ~default:"input" 6 + 1 7 let format_text ?system_id messages = 2 8 let buf = Buffer.create 1024 in 3 - List.iter 4 - (fun msg -> 5 - let loc_str = 6 - match msg.Message.location with 7 - | Some loc -> ( 8 - let sid = 9 - match loc.Message.system_id with 10 - | Some s -> s 11 - | None -> ( 12 - match system_id with Some s -> s | None -> "input") 13 - in 14 - let col_info = 15 - match (loc.end_line, loc.end_column) with 16 - | Some el, Some ec when el = loc.line && ec > loc.column -> 17 - Printf.sprintf "%d.%d-%d" loc.line loc.column ec 18 - | Some el, Some ec when el > loc.line -> 19 - Printf.sprintf "%d.%d-%d.%d" loc.line loc.column el ec 20 - | _ -> Printf.sprintf "%d.%d" loc.line loc.column 21 - in 22 - Printf.sprintf "%s:%s" sid col_info) 23 - | None -> ( 24 - match system_id with Some s -> s | None -> "input") 25 - in 26 - let severity_str = Message.severity_to_string msg.Message.severity in 27 - let code_str = " [" ^ Message.error_code_to_string msg.Message.error_code ^ "]" in 28 - let elem_str = 29 - match msg.Message.element with 30 - | Some e -> " (element: " ^ e ^ ")" 31 - | None -> "" 32 - in 33 - let attr_str = 34 - match msg.Message.attribute with 35 - | Some a -> " (attribute: " ^ a ^ ")" 36 - | None -> "" 37 - in 38 - Buffer.add_string buf 39 - (Printf.sprintf "%s: %s%s: %s%s%s\n" loc_str severity_str code_str 40 - msg.Message.message elem_str attr_str)) 41 - messages; 9 + List.iter (fun msg -> 10 + let loc_str = match msg.Message.location with 11 + | Some loc -> 12 + let sid = get_system_id ?system_id loc.Message.system_id in 13 + let col_info = match loc.end_line, loc.end_column with 14 + | Some el, Some ec when el = loc.line && ec > loc.column -> 15 + Printf.sprintf "%d.%d-%d" loc.line loc.column ec 16 + | Some el, Some ec when el > loc.line -> 17 + Printf.sprintf "%d.%d-%d.%d" loc.line loc.column el ec 18 + | _ -> 19 + Printf.sprintf "%d.%d" loc.line loc.column 20 + in 21 + Printf.sprintf "%s:%s" sid col_info 22 + | None -> 23 + Option.value system_id ~default:"input" 24 + in 25 + let elem_str = Option.fold ~none:"" ~some:(Printf.sprintf " (element: %s)") msg.Message.element in 26 + let attr_str = Option.fold ~none:"" ~some:(Printf.sprintf " (attribute: %s)") msg.Message.attribute in 27 + Buffer.add_string buf (Printf.sprintf "%s: %s [%s]: %s%s%s\n" 28 + loc_str 29 + (Message.severity_to_string msg.Message.severity) 30 + (Message.error_code_to_string msg.Message.error_code) 31 + msg.Message.message 32 + elem_str 33 + attr_str) 34 + ) messages; 42 35 Buffer.contents buf 43 36 44 37 let format_gnu ?system_id messages = 45 38 let buf = Buffer.create 1024 in 46 - List.iter 47 - (fun msg -> 48 - let loc_str = 49 - match msg.Message.location with 50 - | Some loc -> ( 51 - let sid = 52 - match loc.Message.system_id with 53 - | Some s -> s 54 - | None -> ( 55 - match system_id with Some s -> s | None -> "input") 56 - in 57 - Printf.sprintf "%s:%d:%d" sid loc.line loc.column) 58 - | None -> ( 59 - match system_id with Some s -> s ^ ":0:0" | None -> "input:0:0") 60 - in 61 - let severity_str = Message.severity_to_string msg.Message.severity in 62 - let code_str = " [" ^ Message.error_code_to_string msg.Message.error_code ^ "]" in 63 - Buffer.add_string buf 64 - (Printf.sprintf "%s: %s%s: %s\n" loc_str severity_str code_str 65 - msg.Message.message)) 66 - messages; 39 + List.iter (fun msg -> 40 + let loc_str = match msg.Message.location with 41 + | Some loc -> 42 + Printf.sprintf "%s:%d:%d" 43 + (get_system_id ?system_id loc.Message.system_id) 44 + loc.line loc.column 45 + | None -> 46 + Option.value system_id ~default:"input" ^ ":0:0" 47 + in 48 + Buffer.add_string buf (Printf.sprintf "%s: %s [%s]: %s\n" 49 + loc_str 50 + (Message.severity_to_string msg.Message.severity) 51 + (Message.error_code_to_string msg.Message.error_code) 52 + msg.Message.message) 53 + ) messages; 67 54 Buffer.contents buf 68 55 69 56 let message_to_json ?system_id msg = 70 57 let open Jsont in 71 - let severity = String (Message.severity_to_string msg.Message.severity, Meta.none) in 72 - let message_text = String (msg.Message.message, Meta.none) in 73 - let base = [ (("type", Meta.none), severity); (("message", Meta.none), message_text) ] in 74 - let with_code = 75 - (("subType", Meta.none), String (Message.error_code_to_string msg.Message.error_code, Meta.none)) :: base 76 - in 77 - let with_location = 78 - match msg.Message.location with 58 + let str s = String (s, Meta.none) in 59 + let num n = Number (float_of_int n, Meta.none) in 60 + let field name value = ((name, Meta.none), value) in 61 + 62 + let base = [ 63 + field "type" (str (Message.severity_to_string msg.Message.severity)); 64 + field "message" (str msg.Message.message); 65 + field "subType" (str (Message.error_code_to_string msg.Message.error_code)); 66 + ] in 67 + 68 + let with_location = match msg.Message.location with 79 69 | Some loc -> 80 - let line = Number (float_of_int loc.Message.line, Meta.none) in 81 - let column = Number (float_of_int loc.Message.column, Meta.none) in 82 - let loc_fields = 83 - [ (("firstLine", Meta.none), line); (("firstColumn", Meta.none), column) ] 84 - in 85 - let loc_fields = 86 - match loc.Message.end_line with 87 - | Some el -> 88 - (("lastLine", Meta.none), Number (float_of_int el, Meta.none)) :: loc_fields 89 - | None -> loc_fields 90 - in 91 - let loc_fields = 92 - match loc.Message.end_column with 93 - | Some ec -> 94 - (("lastColumn", Meta.none), Number (float_of_int ec, Meta.none)) 95 - :: loc_fields 96 - | None -> loc_fields 97 - in 98 - let url = 99 - match loc.Message.system_id with 100 - | Some s -> s 101 - | None -> ( 102 - match system_id with Some s -> s | None -> "input") 103 - in 104 - (("url", Meta.none), String (url, Meta.none)) :: loc_fields @ with_code 70 + let url = get_system_id ?system_id loc.Message.system_id in 71 + let loc_fields = [ 72 + field "url" (str url); 73 + field "firstLine" (num loc.line); 74 + field "firstColumn" (num loc.column); 75 + ] in 76 + let loc_fields = Option.fold ~none:loc_fields 77 + ~some:(fun el -> field "lastLine" (num el) :: loc_fields) 78 + loc.Message.end_line in 79 + let loc_fields = Option.fold ~none:loc_fields 80 + ~some:(fun ec -> field "lastColumn" (num ec) :: loc_fields) 81 + loc.Message.end_column in 82 + loc_fields @ base 105 83 | None -> 106 - let url = 107 - match system_id with Some s -> s | None -> "input" 108 - in 109 - (("url", Meta.none), String (url, Meta.none)) :: with_code 84 + field "url" (str (Option.value system_id ~default:"input")) :: base 110 85 in 111 - let with_extract = 112 - match msg.Message.extract with 113 - | Some e -> (("extract", Meta.none), String (e, Meta.none)) :: with_location 114 - | None -> with_location 115 - in 86 + 87 + let with_extract = Option.fold ~none:with_location 88 + ~some:(fun e -> field "extract" (str e) :: with_location) 89 + msg.Message.extract in 90 + 116 91 Object (with_extract, Meta.none) 117 92 118 93 let format_json ?system_id messages =
+10 -10
lib/htmlrw_check/parse_error_bridge.ml
··· 14 14 | Html5rw.Parse_error_code.Tree_construction_error s -> 15 15 (* Check for control-character/noncharacter/surrogate with codepoint info *) 16 16 (try 17 - if String.length s > 28 && String.sub s 0 28 = "control-character-in-input-s" then 17 + if String.starts_with ~prefix:"control-character-in-input-s" s then 18 18 let colon_pos = String.index s ':' in 19 19 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 20 20 let cp = int_of_string ("0x" ^ cp_str) in 21 21 Printf.sprintf "Forbidden code point U+%04x." cp 22 - else if String.length s > 25 && String.sub s 0 25 = "noncharacter-in-input-str" then 22 + else if String.starts_with ~prefix:"noncharacter-in-input-str" s then 23 23 let colon_pos = String.index s ':' in 24 24 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 25 25 let cp = int_of_string ("0x" ^ cp_str) in 26 26 Printf.sprintf "Forbidden code point U+%04x." cp 27 - else if String.length s > 22 && String.sub s 0 22 = "surrogate-in-input-str" then 27 + else if String.starts_with ~prefix:"surrogate-in-input-str" s then 28 28 let colon_pos = String.index s ':' in 29 29 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 30 30 let cp = int_of_string ("0x" ^ cp_str) in 31 31 Printf.sprintf "Forbidden code point U+%04x." cp 32 32 (* Character reference errors *) 33 - else if String.length s > 28 && String.sub s 0 28 = "control-character-reference:" then 33 + else if String.starts_with ~prefix:"control-character-reference:" s then 34 34 let cp_str = String.sub s 28 (String.length s - 28) in 35 35 let cp = int_of_string ("0x" ^ cp_str) in 36 36 if cp = 0x0D then 37 37 "A numeric character reference expanded to carriage return." 38 38 else 39 39 Printf.sprintf "Character reference expands to a control character (U+%04x)." cp 40 - else if String.length s > 31 && String.sub s 0 31 = "noncharacter-character-referenc" then 40 + else if String.starts_with ~prefix:"noncharacter-character-referenc" s then 41 41 let colon_pos = String.index s ':' in 42 42 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 43 43 let cp = int_of_string ("0x" ^ cp_str) in ··· 49 49 Printf.sprintf "Character reference expands to an astral non-character (U+%05x)." cp 50 50 else 51 51 Printf.sprintf "Character reference expands to a non-character (U+%04x)." cp 52 - else if String.length s > 36 && String.sub s 0 36 = "character-reference-outside-unicode-" then 52 + else if String.starts_with ~prefix:"character-reference-outside-unicode-" s then 53 53 "Character reference outside the permissible Unicode range." 54 - else if String.length s > 27 && String.sub s 0 27 = "surrogate-character-referen" then 54 + else if String.starts_with ~prefix:"surrogate-character-referen" s then 55 55 let colon_pos = String.index s ':' in 56 56 let cp_str = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 57 57 let cp = int_of_string ("0x" ^ cp_str) in ··· 64 64 "End tag \xe2\x80\x9cbr\xe2\x80\x9d." 65 65 else if s = "expected-closing-tag-but-got-eof" then 66 66 "End of file seen and there were open elements." 67 - else if String.length s > 28 && String.sub s 0 28 = "bad-start-tag-in-head-noscri" then 67 + else if String.starts_with ~prefix:"bad-start-tag-in-head-noscri" s then 68 68 let colon_pos = String.index s ':' in 69 69 let element = String.sub s (colon_pos + 1) (String.length s - colon_pos - 1) in 70 70 Printf.sprintf "Bad start tag in \xe2\x80\x9c%s\xe2\x80\x9d in \xe2\x80\x9cnoscript\xe2\x80\x9d in \xe2\x80\x9chead\xe2\x80\x9d." element 71 - else if String.length s > 19 && String.sub s 0 19 = "unexpected-end-tag:" then 71 + else if String.starts_with ~prefix:"unexpected-end-tag:" s then 72 72 let element = String.sub s 19 (String.length s - 19) in 73 73 Printf.sprintf "Stray end tag \xe2\x80\x9c%s\xe2\x80\x9d." element 74 - else if String.length s > 19 && String.sub s 0 19 = "start-tag-in-table:" then 74 + else if String.starts_with ~prefix:"start-tag-in-table:" s then 75 75 let tag = String.sub s 19 (String.length s - 19) in 76 76 Printf.sprintf "Start tag \xe2\x80\x9c%s\xe2\x80\x9d seen in \xe2\x80\x9ctable\xe2\x80\x9d." tag 77 77 else
+17 -39
lib/htmlrw_check/semantic/autofocus_checker.ml
··· 3 3 Validates that only one element with autofocus attribute exists within 4 4 each dialog or popover context. *) 5 5 6 - (** Context for tracking autofocus elements. *) 7 6 type context_type = Dialog | Popover 8 7 9 8 type context = { ··· 26 25 state.context_stack <- []; 27 26 state.current_depth <- 0 28 27 29 - (** Check if an attribute list contains a specific attribute. *) 30 - let has_attr name attrs = 31 - List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs 32 - 33 - (** Get an attribute value from the list. *) 34 - let get_attr name attrs = 35 - List.find_map (fun (attr_name, value) -> 36 - if String.lowercase_ascii attr_name = name then Some value else None 37 - ) attrs 38 - 39 - (** Check if element has popover attribute. *) 40 - let has_popover attrs = 41 - List.exists (fun (attr_name, _) -> 42 - String.lowercase_ascii attr_name = "popover" 43 - ) attrs 44 - 45 28 let start_element state ~name ~namespace ~attrs collector = 46 - let name_lower = String.lowercase_ascii name in 47 - 48 - (* Track depth *) 49 29 state.current_depth <- state.current_depth + 1; 50 30 51 - if namespace = None then begin 31 + match namespace with 32 + | Some _ -> () 33 + | None -> 34 + let name_lower = String.lowercase_ascii name in 35 + 52 36 (* Check if we're entering a dialog or popover context *) 53 - let enters_context = 54 - if name_lower = "dialog" then Some Dialog 55 - else if has_popover attrs then Some Popover 56 - else None 37 + let enters_context = match name_lower with 38 + | "dialog" -> Some Dialog 39 + | _ when Attr_utils.has_attr "popover" attrs -> Some Popover 40 + | _ -> None 57 41 in 58 42 59 - (match enters_context with 60 - | Some ctx_type -> 43 + Option.iter (fun ctx_type -> 61 44 let ctx = { context_type = ctx_type; autofocus_count = 0; depth = state.current_depth } in 62 45 state.context_stack <- ctx :: state.context_stack 63 - | None -> ()); 46 + ) enters_context; 64 47 65 48 (* Check for autofocus attribute *) 66 - if has_attr "autofocus" attrs then begin 67 - (* Increment count in innermost context if any *) 49 + if Attr_utils.has_attr "autofocus" attrs then 68 50 match state.context_stack with 69 51 | ctx :: _ -> 70 52 ctx.autofocus_count <- ctx.autofocus_count + 1; 71 53 if ctx.autofocus_count > 1 then 72 54 Message_collector.add_typed collector (`Misc `Multiple_autofocus) 73 55 | [] -> () 74 - end 75 - end 76 56 77 57 let end_element state ~name ~namespace _collector = 78 - let name_lower = String.lowercase_ascii name in 79 - 80 - if namespace = None then begin 81 - (* Pop context if we're leaving one *) 58 + (match namespace with 59 + | Some _ -> () 60 + | None -> 61 + let name_lower = String.lowercase_ascii name in 82 62 match state.context_stack with 83 63 | ctx :: rest when ctx.depth = state.current_depth -> 84 - (* Verify this is the right element *) 85 64 let matches = 86 65 (name_lower = "dialog" && ctx.context_type = Dialog) || 87 66 (ctx.context_type = Popover) 88 67 in 89 68 if matches then state.context_stack <- rest 90 - | _ -> () 91 - end; 69 + | _ -> ()); 92 70 93 71 state.current_depth <- state.current_depth - 1 94 72
+1 -8
lib/htmlrw_check/semantic/form_checker.ml
··· 10 10 11 11 let reset _state = () 12 12 13 - (** Get the value of an attribute if present. *) 14 - let get_attr name attrs = 15 - List.find_map 16 - (fun (attr_name, value) -> 17 - if String.equal attr_name name then Some value else None) 18 - attrs 19 - 20 13 (** Check if autocomplete value contains webauthn token *) 21 14 let contains_webauthn value = 22 15 let lower = String.lowercase_ascii value in ··· 42 35 (* Check autocomplete attribute on form elements *) 43 36 match name with 44 37 | "input" | "select" | "textarea" -> 45 - (match get_attr "autocomplete" attrs with 38 + (match Attr_utils.get_attr "autocomplete" attrs with 46 39 | Some autocomplete_value -> 47 40 check_autocomplete_value autocomplete_value name collector 48 41 | None -> ())
+3 -8
lib/htmlrw_check/semantic/lang_detecting_checker.ml
··· 60 60 let n = String.lowercase_ascii name in 61 61 n = "svg" || n = "math" 62 62 63 - let get_attr name attrs = 64 - List.find_map (fun (n, v) -> 65 - if String.lowercase_ascii n = name then Some v else None 66 - ) attrs 67 - 68 63 let get_lang_code lang = 69 64 (* Extract primary language subtag *) 70 65 match String.split_on_char '-' lang with ··· 226 221 let ns = Option.value namespace ~default:"" in 227 222 228 223 if name_lower = "html" then begin 229 - state.html_lang <- get_attr "lang" attrs; 230 - state.html_dir <- get_attr "dir" attrs; 224 + state.html_lang <- Attr_utils.get_attr "lang" attrs; 225 + state.html_dir <- Attr_utils.get_attr "dir" attrs; 231 226 (* TODO: get line/column from locator *) 232 227 state.html_locator <- Some (1, 1) 233 228 end ··· 244 239 state.skip_depth <- state.skip_depth + 1 245 240 else begin 246 241 (* Check for different lang attribute *) 247 - match get_attr "lang" attrs with 242 + match Attr_utils.get_attr "lang" attrs with 248 243 | Some lang when state.html_lang <> Some lang -> 249 244 state.skip_depth <- state.skip_depth + 1 250 245 | _ -> ()
+1 -7
lib/htmlrw_check/semantic/option_checker.ml
··· 22 22 state.option_stack <- []; 23 23 state.in_template <- 0 24 24 25 - (** Get attribute value if present. *) 26 - let get_attr name attrs = 27 - List.find_map (fun (attr_name, value) -> 28 - if String.lowercase_ascii attr_name = name then Some value else None 29 - ) attrs 30 - 31 25 let start_element state ~name ~namespace ~attrs collector = 32 26 let name_lower = String.lowercase_ascii name in 33 27 ··· 36 30 if name_lower = "template" then 37 31 state.in_template <- state.in_template + 1 38 32 else if state.in_template = 0 && name_lower = "option" then begin 39 - let label_opt = get_attr "label" attrs in 33 + let label_opt = Attr_utils.get_attr "label" attrs in 40 34 let has_label = label_opt <> None in 41 35 let label_empty = match label_opt with 42 36 | Some v -> String.trim v = ""
+29 -40
lib/htmlrw_check/semantic/required_attr_checker.ml
··· 13 13 state._in_figure <- false; 14 14 state.in_a_with_href <- false 15 15 16 - (** Check if an attribute list contains a specific attribute. *) 17 - let has_attr name attrs = 18 - List.exists (fun (attr_name, _) -> String.equal attr_name name) attrs 19 - 20 - (** Get the value of an attribute if present. *) 21 - let get_attr name attrs = 22 - List.find_map 23 - (fun (attr_name, value) -> 24 - if String.equal attr_name name then Some value else None) 25 - attrs 26 - 27 16 let check_img_element state attrs collector = 28 17 (* Check for required src OR srcset attribute *) 29 - if not (has_attr "src" attrs) && not (has_attr "srcset" attrs) then 18 + if not (Attr_utils.has_attr "src" attrs) && not (Attr_utils.has_attr "srcset" attrs) then 30 19 Message_collector.add_typed collector (`Img `Missing_src_or_srcset); 31 20 32 21 (* Check for alt attribute - always required *) 33 - if not (has_attr "alt" attrs) then 22 + if not (Attr_utils.has_attr "alt" attrs) then 34 23 Message_collector.add_typed collector (`Img `Missing_alt); 35 24 36 25 (* Check ismap requires 'a' ancestor with href *) 37 - if has_attr "ismap" attrs && not state.in_a_with_href then 26 + if Attr_utils.has_attr "ismap" attrs && not state.in_a_with_href then 38 27 Message_collector.add_typed collector (`Img `Ismap_needs_href) 39 28 40 29 let check_area_element attrs collector = 41 30 (* area with href requires alt *) 42 - if has_attr "href" attrs && not (has_attr "alt" attrs) then 31 + if Attr_utils.has_attr "href" attrs && not (Attr_utils.has_attr "alt" attrs) then 43 32 Message_collector.add_typed collector 44 33 (`Attr (`Missing (`Elem "area", `Attr "alt"))) 45 34 46 35 let check_input_element attrs collector = 47 - match get_attr "type" attrs with 36 + match Attr_utils.get_attr "type" attrs with 48 37 | Some "image" -> 49 38 (* input[type=image] requires alt *) 50 - if not (has_attr "alt" attrs) then 39 + if not (Attr_utils.has_attr "alt" attrs) then 51 40 Message_collector.add_typed collector 52 41 (`Attr (`Missing (`Elem "input", `Attr "alt"))) 53 42 | Some "hidden" -> 54 43 (* input[type=hidden] should not have required attribute *) 55 - if has_attr "required" attrs then 44 + if Attr_utils.has_attr "required" attrs then 56 45 Message_collector.add_typed collector 57 46 (`Attr (`Not_allowed_when (`Attr "required", `Elem "input", `Condition "the type attribute is hidden"))) 58 47 | Some "file" -> 59 48 (* input[type=file] should not have value attribute *) 60 - if has_attr "value" attrs then 49 + if Attr_utils.has_attr "value" attrs then 61 50 Message_collector.add_typed collector 62 51 (`Attr (`Not_allowed_when (`Attr "value", `Elem "input", `Condition "the type attribute is file"))) 63 52 | _ -> () 64 53 65 54 let check_script_element attrs _collector = 66 55 (* script requires src OR text content *) 67 - if not (has_attr "src" attrs) then 56 + if not (Attr_utils.has_attr "src" attrs) then 68 57 (* We can't check for text content here; that would need to be done 69 58 in end_element or with state tracking *) 70 59 () ··· 76 65 - http-equiv AND content 77 66 - property AND content (RDFa) 78 67 - itemprop AND content (microdata) *) 79 - let has_charset = has_attr "charset" attrs in 80 - let has_name = has_attr "name" attrs in 81 - let has_content = has_attr "content" attrs in 82 - let has_http_equiv = has_attr "http-equiv" attrs in 83 - let has_property = has_attr "property" attrs in 84 - let has_itemprop = has_attr "itemprop" attrs in 68 + let has_charset = Attr_utils.has_attr "charset" attrs in 69 + let has_name = Attr_utils.has_attr "name" attrs in 70 + let has_content = Attr_utils.has_attr "content" attrs in 71 + let has_http_equiv = Attr_utils.has_attr "http-equiv" attrs in 72 + let has_property = Attr_utils.has_attr "property" attrs in 73 + let has_itemprop = Attr_utils.has_attr "itemprop" attrs in 85 74 86 75 let valid = 87 76 has_charset ··· 100 89 101 90 let check_link_element attrs collector = 102 91 (* link[rel="stylesheet"] requires href *) 103 - match get_attr "rel" attrs with 92 + match Attr_utils.get_attr "rel" attrs with 104 93 | Some rel when String.equal rel "stylesheet" -> 105 - if not (has_attr "href" attrs) then 94 + if not (Attr_utils.has_attr "href" attrs) then 106 95 Message_collector.add_typed collector (`Link `Missing_href) 107 96 | _ -> () 108 97 109 98 let check_a_element attrs collector = 110 99 (* a[download] requires href *) 111 - if has_attr "download" attrs && not (has_attr "href" attrs) then 100 + if Attr_utils.has_attr "download" attrs && not (Attr_utils.has_attr "href" attrs) then 112 101 Message_collector.add_typed collector 113 102 (`Attr (`Missing (`Elem "a", `Attr "href"))) 114 103 115 104 let check_map_element attrs collector = 116 105 (* map requires name *) 117 - if not (has_attr "name" attrs) then 106 + if not (Attr_utils.has_attr "name" attrs) then 118 107 Message_collector.add_typed collector 119 108 (`Attr (`Missing (`Elem "map", `Attr "name"))) 120 109 121 110 let check_object_element attrs collector = 122 111 (* object requires data attribute (or type attribute alone is not sufficient) *) 123 - let has_data = has_attr "data" attrs in 124 - let has_type = has_attr "type" attrs in 112 + let has_data = Attr_utils.has_attr "data" attrs in 113 + let has_type = Attr_utils.has_attr "type" attrs in 125 114 if not has_data && has_type then 126 115 Message_collector.add_typed collector 127 116 (`Attr (`Missing (`Elem "object", `Attr "data"))) 128 117 129 118 let check_popover_element element_name attrs collector = 130 119 (* popover attribute must have valid value *) 131 - match get_attr "popover" attrs with 120 + match Attr_utils.get_attr "popover" attrs with 132 121 | Some value -> 133 122 let value_lower = String.lowercase_ascii value in 134 123 (* Valid values: empty string, auto, manual, hint *) ··· 141 130 142 131 let check_meter_element attrs collector = 143 132 (* meter requires value attribute *) 144 - if not (has_attr "value" attrs) then 133 + if not (Attr_utils.has_attr "value" attrs) then 145 134 Message_collector.add_typed collector 146 135 (`Attr (`Missing (`Elem "meter", `Attr "value"))) 147 136 else begin 148 137 (* Validate min <= value constraint *) 149 - match get_attr "value" attrs, get_attr "min" attrs with 138 + match Attr_utils.get_attr "value" attrs, Attr_utils.get_attr "min" attrs with 150 139 | Some value_str, Some min_str -> 151 140 (try 152 141 let value = float_of_string value_str in ··· 162 151 163 152 let check_progress_element attrs collector = 164 153 (* Validate progress value constraints *) 165 - match get_attr "value" attrs with 154 + match Attr_utils.get_attr "value" attrs with 166 155 | None -> () (* value is optional *) 167 156 | Some value_str -> 168 157 (try 169 158 let value = float_of_string value_str in 170 - let max_val = match get_attr "max" attrs with 159 + let max_val = match Attr_utils.get_attr "max" attrs with 171 160 | None -> 1.0 (* default max is 1 *) 172 161 | Some max_str -> (try float_of_string max_str with _ -> 1.0) 173 162 in 174 163 if value > max_val then 175 164 let q s = "\xe2\x80\x9c" ^ s ^ "\xe2\x80\x9d" in 176 165 (* Check which message to use based on whether max is present *) 177 - if has_attr "max" attrs then 166 + if Attr_utils.has_attr "max" attrs then 178 167 Message_collector.add_typed collector 179 168 (`Generic ( 180 169 (* Note: double space before "value" matches Nu validator quirk *) ··· 198 187 | "link" -> check_link_element attrs collector 199 188 | "a" -> 200 189 check_a_element attrs collector; 201 - if has_attr "href" attrs then state.in_a_with_href <- true 190 + if Attr_utils.has_attr "href" attrs then state.in_a_with_href <- true 202 191 | "map" -> check_map_element attrs collector 203 192 | "object" -> check_object_element attrs collector 204 193 | "meter" -> check_meter_element attrs collector ··· 206 195 | "figure" -> state._in_figure <- true 207 196 | _ -> 208 197 (* Check popover attribute on any element *) 209 - if has_attr "popover" attrs then check_popover_element name attrs collector 198 + if Attr_utils.has_attr "popover" attrs then check_popover_element name attrs collector 210 199 211 200 let end_element state ~name ~namespace:_ _collector = 212 201 match name with
+1 -1
lib/htmlrw_check/specialized/aria_checker.ml
··· 491 491 if name_lower = "br" || name_lower = "wbr" then begin 492 492 List.iter (fun (attr_name, _) -> 493 493 let attr_lower = String.lowercase_ascii attr_name in 494 - if String.length attr_lower > 5 && String.sub attr_lower 0 5 = "aria-" && 494 + if String.starts_with ~prefix:"aria-" attr_lower && 495 495 attr_lower <> "aria-hidden" then 496 496 Message_collector.add_typed collector 497 497 (`Attr (`Not_allowed (`Attr attr_name, `Elem name)))
+116 -97
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
··· 41 41 let create () = { is_xhtml = false } 42 42 let reset state = state.is_xhtml <- false 43 43 44 - (** Check if an attribute list contains a specific attribute. *) 45 - let has_attr name attrs = 46 - List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs 47 - 48 - (** Get an attribute value from the list. *) 49 - let get_attr name attrs = 50 - List.find_map (fun (attr_name, value) -> 51 - if String.lowercase_ascii attr_name = name then Some value else None 52 - ) attrs 53 - 54 44 (** Input types that allow the list attribute. *) 55 45 let input_types_allowing_list = [ 56 46 "color"; "date"; "datetime-local"; "email"; "month"; "number"; ··· 67 57 68 58 (* Detect XHTML mode from xmlns attribute on html element *) 69 59 if name_lower = "html" then begin 70 - let xmlns_value = get_attr "xmlns" attrs in 71 - match xmlns_value with 60 + match Attr_utils.get_attr "xmlns" attrs with 72 61 | Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true 73 62 | _ -> () 74 63 end; 75 64 76 65 (* Check HTML element attribute restrictions *) 77 - if namespace = None then begin 66 + (match namespace with 67 + | Some _ -> () 68 + | None -> 78 69 match List.assoc_opt name_lower disallowed_attrs_html with 79 70 | Some disallowed -> 80 71 List.iter (fun attr -> 81 - if has_attr attr attrs then 72 + if Attr_utils.has_attr attr attrs then 82 73 report_disallowed_attr name_lower attr collector 83 74 ) disallowed 84 - | None -> () 85 - end; 75 + | None -> ()); 86 76 87 77 (* Check for xml:base attribute - not allowed in HTML *) 88 - if namespace = None && name_lower = "html" then begin 89 - if has_attr "xml:base" attrs then 78 + (match namespace with 79 + | Some _ -> () 80 + | None when name_lower = "html" -> 81 + if Attr_utils.has_attr "xml:base" attrs then 90 82 report_disallowed_attr name_lower "xml:base" collector 91 - end; 83 + | None -> ()); 92 84 93 85 (* Check for xmlns:* prefixed attributes - not allowed in HTML *) 94 86 (* Standard xmlns declarations are allowed but custom prefixes are not *) 95 - if namespace = None then begin 87 + (match namespace with 88 + | Some _ -> () 89 + | None -> 96 90 List.iter (fun (attr_name, _) -> 97 91 let attr_lower = String.lowercase_ascii attr_name in 98 - if String.length attr_lower > 6 && String.sub attr_lower 0 6 = "xmlns:" then begin 92 + if String.starts_with ~prefix:"xmlns:" attr_lower then begin 99 93 let prefix = String.sub attr_lower 6 (String.length attr_lower - 6) in 100 94 (* Only xmlns:xlink (with correct value) and xmlns:xml are allowed *) 101 95 if prefix <> "xlink" && prefix <> "xml" then 102 96 Message_collector.add_typed collector 103 97 (`Attr (`Not_allowed_here (`Attr attr_name))) 104 98 end 105 - ) attrs 106 - end; 99 + ) attrs); 107 100 108 101 (* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *) 109 102 (* xml:id is never valid on SVG elements in HTML5 *) 110 103 if List.mem name_lower svg_no_xml_id then begin 111 - if has_attr "xml:id" attrs then 104 + if Attr_utils.has_attr "xml:id" attrs then 112 105 report_disallowed_attr name_lower "xml:id" collector 113 106 end; 114 107 115 108 (* SVG feConvolveMatrix requires order attribute *) 116 109 if name_lower = "feconvolvematrix" then begin 117 - if not (has_attr "order" attrs) then 110 + if not (Attr_utils.has_attr "order" attrs) then 118 111 Message_collector.add_typed collector 119 112 (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order"))) 120 113 end; 121 114 122 115 (* Validate style type attribute - must be "text/css" or omitted *) 123 - if namespace = None && name_lower = "style" then begin 116 + (match namespace with 117 + | Some _ -> () 118 + | None when name_lower = "style" -> 124 119 List.iter (fun (attr_name, attr_value) -> 125 120 let attr_lower = String.lowercase_ascii attr_name in 126 121 if attr_lower = "type" then begin ··· 129 124 Message_collector.add_typed collector (`Misc `Style_type_invalid) 130 125 end 131 126 ) attrs 132 - end; 127 + | None -> ()); 133 128 134 129 (* Validate object element requires data or type attribute *) 135 - if namespace = None && name_lower = "object" then begin 136 - let has_data = has_attr "data" attrs in 137 - let has_type = has_attr "type" attrs in 130 + (match namespace with 131 + | Some _ -> () 132 + | None when name_lower = "object" -> 133 + let has_data = Attr_utils.has_attr "data" attrs in 134 + let has_type = Attr_utils.has_attr "type" attrs in 138 135 if not has_data && not has_type then 139 136 Message_collector.add_typed collector 140 137 (`Attr (`Missing (`Elem "object", `Attr "data"))) 141 - end; 138 + | None -> ()); 142 139 143 140 (* Validate link imagesizes/imagesrcset attributes *) 144 - if namespace = None && name_lower = "link" then begin 145 - let has_imagesizes = has_attr "imagesizes" attrs in 146 - let has_imagesrcset = has_attr "imagesrcset" attrs in 147 - let rel_value = get_attr "rel" attrs in 148 - let as_value = get_attr "as" attrs in 141 + (match namespace with 142 + | Some _ -> () 143 + | None when name_lower = "link" -> 144 + let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in 145 + let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in 146 + let rel_value = Attr_utils.get_attr "rel" attrs in 147 + let as_value = Attr_utils.get_attr "as" attrs in 149 148 150 149 (* imagesizes requires imagesrcset *) 151 150 if has_imagesizes && not has_imagesrcset then ··· 175 174 if not rel_is_preload then 176 175 Message_collector.add_typed collector (`Link `As_requires_preload) 177 176 | None -> ()) 178 - end; 177 + | None -> ()); 179 178 180 179 (* Validate img usemap attribute - must be hash-name reference with content *) 181 - if namespace = None && name_lower = "img" then begin 180 + (match namespace with 181 + | Some _ -> () 182 + | None when name_lower = "img" -> 182 183 List.iter (fun (attr_name, attr_value) -> 183 184 let attr_lower = String.lowercase_ascii attr_name in 184 185 if attr_lower = "usemap" then begin ··· 189 190 attr_value attr_name name)))) 190 191 end 191 192 ) attrs 192 - end; 193 + | None -> ()); 193 194 194 195 (* Validate embed type attribute - must be valid MIME type *) 195 - if namespace = None && name_lower = "embed" then begin 196 + (match namespace with 197 + | Some _ -> () 198 + | None when name_lower = "embed" -> 196 199 List.iter (fun (attr_name, attr_value) -> 197 200 let attr_lower = String.lowercase_ascii attr_name in 198 201 if attr_lower = "type" then begin ··· 205 208 attr_value attr_name name msg)))) 206 209 end 207 210 ) attrs 208 - end; 211 + | None -> ()); 209 212 210 213 (* Validate width/height on embed and img - must be non-negative integers *) 211 - if namespace = None && (name_lower = "embed" || name_lower = "img" || 212 - name_lower = "video" || name_lower = "canvas" || 213 - name_lower = "iframe" || name_lower = "source") then begin 214 + let is_dimension_element = name_lower = "embed" || name_lower = "img" || 215 + name_lower = "video" || name_lower = "canvas" || 216 + name_lower = "iframe" || name_lower = "source" in 217 + (match namespace with 218 + | Some _ -> () 219 + | None when is_dimension_element -> 214 220 List.iter (fun (attr_name, attr_value) -> 215 221 let attr_lower = String.lowercase_ascii attr_name in 216 222 if attr_lower = "width" || attr_lower = "height" then begin ··· 255 261 end 256 262 end 257 263 ) attrs 258 - end; 264 + | None -> ()); 259 265 260 266 (* Validate area[shape=default] cannot have coords *) 261 - if namespace = None && name_lower = "area" then begin 262 - let shape_value = get_attr "shape" attrs in 263 - match shape_value with 267 + (match namespace with 268 + | Some _ -> () 269 + | None when name_lower = "area" -> 270 + (match Attr_utils.get_attr "shape" attrs with 264 271 | Some s when String.lowercase_ascii (String.trim s) = "default" -> 265 - if has_attr "coords" attrs then 272 + if Attr_utils.has_attr "coords" attrs then 266 273 Message_collector.add_typed collector 267 274 (`Attr (`Not_allowed (`Attr "coords", `Elem "area"))) 268 - | _ -> () 269 - end; 275 + | _ -> ()) 276 + | None -> ()); 270 277 271 278 (* Validate bdo element requires dir attribute, and dir cannot be "auto" *) 272 - if namespace = None && name_lower = "bdo" then begin 273 - let dir_value = get_attr "dir" attrs in 274 - match dir_value with 279 + (match namespace with 280 + | Some _ -> () 281 + | None when name_lower = "bdo" -> 282 + (match Attr_utils.get_attr "dir" attrs with 275 283 | None -> 276 284 Message_collector.add_typed collector (`Misc `Bdo_missing_dir) 277 285 | Some v when String.lowercase_ascii (String.trim v) = "auto" -> 278 286 Message_collector.add_typed collector (`Misc `Bdo_dir_auto) 279 - | _ -> () 280 - end; 287 + | _ -> ()) 288 + | None -> ()); 281 289 282 290 (* Validate input list attribute - only allowed for certain types *) 283 - if namespace = None && name_lower = "input" then begin 284 - if has_attr "list" attrs then begin 285 - let input_type = match get_attr "type" attrs with 286 - | Some t -> String.lowercase_ascii (String.trim t) 287 - | None -> "text" (* default type is text *) 288 - in 291 + (match namespace with 292 + | Some _ -> () 293 + | None when name_lower = "input" -> 294 + if Attr_utils.has_attr "list" attrs then begin 295 + let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs 296 + |> String.trim |> String.lowercase_ascii in 289 297 if not (List.mem input_type input_types_allowing_list) then 290 298 Message_collector.add_typed collector (`Input `List_not_allowed) 291 299 end 292 - end; 300 + | None -> ()); 293 301 294 302 (* Validate data-* attributes *) 295 - if namespace = None then begin 303 + (match namespace with 304 + | Some _ -> () 305 + | None -> 296 306 List.iter (fun (attr_name, _) -> 297 307 let attr_lower = String.lowercase_ascii attr_name in 298 308 (* Check if it starts with "data-" *) 299 - if String.length attr_lower >= 5 && String.sub attr_lower 0 5 = "data-" then begin 309 + if String.starts_with ~prefix:"data-" attr_lower then begin 300 310 let after_prefix = String.sub attr_lower 5 (String.length attr_lower - 5) in 301 311 (* Check if it's exactly "data-" with nothing after *) 302 312 if after_prefix = "" then ··· 306 316 Message_collector.add_typed collector 307 317 (`Attr (`Data_invalid_name (`Reason "must be XML 1.0 4th ed. plus Namespaces NCNames"))) 308 318 end 309 - ) attrs 310 - end; 319 + ) attrs); 311 320 312 321 (* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *) 313 - if namespace = None && not state.is_xhtml then begin 314 - let xmllang_value = get_attr "xml:lang" attrs in 315 - let lang_value = get_attr "lang" attrs in 316 - match xmllang_value with 322 + (match namespace with 323 + | Some _ -> () 324 + | None when not state.is_xhtml -> 325 + let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in 326 + let lang_value = Attr_utils.get_attr "lang" attrs in 327 + (match xmllang_value with 317 328 | Some xmllang -> 318 329 (match lang_value with 319 330 | None -> 320 - (* xml:lang without lang attribute *) 321 331 Message_collector.add_typed collector (`I18n `Xml_lang_without_lang) 322 332 | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang -> 323 - (* xml:lang and lang have different values - "lang present with same value" message *) 324 333 Message_collector.add_typed collector (`I18n `Xml_lang_without_lang) 325 334 | _ -> ()) 326 - | None -> () 327 - end; 335 + | None -> ()) 336 + | None -> ()); 328 337 329 338 (* Validate spellcheck attribute - must be "true" or "false" or empty *) 330 - if namespace = None then begin 339 + (match namespace with 340 + | Some _ -> () 341 + | None -> 331 342 List.iter (fun (attr_name, attr_value) -> 332 343 let attr_lower = String.lowercase_ascii attr_name in 333 344 if attr_lower = "spellcheck" then begin ··· 336 347 Message_collector.add_typed collector 337 348 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason ""))) 338 349 end 339 - ) attrs 340 - end; 350 + ) attrs); 341 351 342 352 (* Validate enterkeyhint attribute - must be one of specific values *) 343 - if namespace = None then begin 353 + (match namespace with 354 + | Some _ -> () 355 + | None -> 344 356 let valid_enterkeyhint = ["enter"; "done"; "go"; "next"; "previous"; "search"; "send"] in 345 357 List.iter (fun (attr_name, attr_value) -> 346 358 let attr_lower = String.lowercase_ascii attr_name in ··· 350 362 Message_collector.add_typed collector 351 363 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason ""))) 352 364 end 353 - ) attrs 354 - end; 365 + ) attrs); 355 366 356 367 (* Validate headingoffset attribute - must be a number between 0 and 8 *) 357 - if namespace = None then begin 368 + (match namespace with 369 + | Some _ -> () 370 + | None -> 358 371 List.iter (fun (attr_name, attr_value) -> 359 372 let attr_lower = String.lowercase_ascii attr_name in 360 373 if attr_lower = "headingoffset" then begin ··· 370 383 if not is_valid then 371 384 Message_collector.add_typed collector (`Misc `Headingoffset_invalid) 372 385 end 373 - ) attrs 374 - end; 386 + ) attrs); 375 387 376 388 (* Validate accesskey attribute - each key label must be a single code point *) 377 - if namespace = None then begin 389 + (match namespace with 390 + | Some _ -> () 391 + | None -> 378 392 List.iter (fun (attr_name, attr_value) -> 379 393 let attr_lower = String.lowercase_ascii attr_name in 380 394 if attr_lower = "accesskey" then begin ··· 419 433 in 420 434 find_duplicates [] keys 421 435 end 422 - ) attrs 423 - end; 436 + ) attrs); 424 437 425 438 (* Validate that command and popovertarget cannot have aria-expanded *) 426 - if namespace = None && name_lower = "button" then begin 427 - let has_command = has_attr "command" attrs in 428 - let has_popovertarget = has_attr "popovertarget" attrs in 429 - let has_aria_expanded = has_attr "aria-expanded" attrs in 439 + (match namespace with 440 + | Some _ -> () 441 + | None when name_lower = "button" -> 442 + let has_command = Attr_utils.has_attr "command" attrs in 443 + let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in 444 + let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in 430 445 431 446 if has_command && has_aria_expanded then 432 447 Message_collector.add_typed collector ··· 437 452 Message_collector.add_typed collector 438 453 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 439 454 `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute"))) 440 - end; 455 + | None -> ()); 441 456 442 457 (* Note: data-* uppercase check requires XML parsing which preserves case. 443 458 The HTML5 parser normalizes attribute names to lowercase, so this check ··· 446 461 ignore state.is_xhtml; 447 462 448 463 (* Validate media attribute on link, style, source elements *) 449 - if namespace = None && (name_lower = "link" || name_lower = "style" || name_lower = "source") then begin 464 + let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in 465 + (match namespace with 466 + | Some _ -> () 467 + | None when is_media_element -> 450 468 List.iter (fun (attr_name, attr_value) -> 451 469 let attr_lower = String.lowercase_ascii attr_name in 452 470 if attr_lower = "media" then begin ··· 462 480 end 463 481 end 464 482 ) attrs 465 - end; 483 + | None -> ()); 466 484 467 485 (* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *) 468 - if namespace = None then begin 486 + (match namespace with 487 + | Some _ -> () 488 + | None -> 469 489 List.iter (fun (attr_name, attr_value) -> 470 490 let attr_lower = String.lowercase_ascii attr_name in 471 491 if attr_lower = "prefix" then begin ··· 487 507 end 488 508 end 489 509 end 490 - ) attrs 491 - end 510 + ) attrs) 492 511 493 512 let end_element _state ~name:_ ~namespace:_ _collector = () 494 513 let characters _state _text _collector = ()
+5 -12
lib/htmlrw_check/specialized/base_checker.ml
··· 11 11 let reset state = 12 12 state.seen_link_or_script <- false 13 13 14 - (** Check if an attribute list contains a specific attribute. *) 15 - let has_attr name attrs = 16 - List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs 17 - 18 14 let start_element state ~name ~namespace ~attrs collector = 19 - if namespace <> None then () 20 - else begin 21 - let name_lower = String.lowercase_ascii name in 22 - match name_lower with 15 + match namespace with 16 + | Some _ -> () 17 + | None -> 18 + match String.lowercase_ascii name with 23 19 | "link" | "script" -> 24 20 state.seen_link_or_script <- true 25 21 | "base" -> 26 22 if state.seen_link_or_script then 27 23 Message_collector.add_typed collector (`Misc `Base_after_link_script); 28 24 (* base element must have href or target attribute *) 29 - let has_href = has_attr "href" attrs in 30 - let has_target = has_attr "target" attrs in 31 - if not has_href && not has_target then 25 + if not (Attr_utils.has_attr "href" attrs || Attr_utils.has_attr "target" attrs) then 32 26 Message_collector.add_typed collector (`Misc `Base_missing_href_or_target) 33 27 | _ -> () 34 - end 35 28 36 29 let end_element _state ~name:_ ~namespace:_ _collector = () 37 30 let characters _state _text _collector = ()
+1 -6
lib/htmlrw_check/specialized/dl_checker.ml
··· 57 57 | ctx :: _ -> Some ctx 58 58 | [] -> None 59 59 60 - let get_attr name attrs = 61 - List.find_map (fun (n, v) -> 62 - if String.lowercase_ascii n = name then Some v else None 63 - ) attrs 64 - 65 60 let start_element state ~name ~namespace ~attrs collector = 66 61 let name_lower = String.lowercase_ascii name in 67 62 ··· 115 110 Message_collector.add_typed collector 116 111 (`Element (`Not_allowed_as_child (`Child "div", `Parent "dl"))); 117 112 (* Check that role is only presentation or none *) 118 - (match get_attr "role" attrs with 113 + (match Attr_utils.get_attr "role" attrs with 119 114 | Some role_value -> 120 115 let role_lower = String.lowercase_ascii (String.trim role_value) in 121 116 if role_lower <> "presentation" && role_lower <> "none" then
+42 -76
lib/htmlrw_check/specialized/picture_checker.ml
··· 66 66 state.always_matching_is_media_all <- false; 67 67 state.always_matching_is_media_empty <- false 68 68 69 - (** Check if an attribute list contains a specific attribute. *) 70 - let has_attr name attrs = 71 - List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs 72 - 73 69 (** Report disallowed attribute error *) 74 70 let report_disallowed_attr element attr collector = 75 71 Message_collector.add_typed collector ··· 80 76 Message_collector.add_typed collector 81 77 (`Element (`Not_allowed_as_child (`Child child, `Parent parent))) 82 78 79 + let check_disallowed_attrs element disallowed_list attrs collector = 80 + List.iter (fun attr -> 81 + if Attr_utils.has_attr attr attrs then 82 + report_disallowed_attr element attr collector 83 + ) disallowed_list 84 + 83 85 let check_picture_attrs attrs collector = 84 - List.iter (fun disallowed -> 85 - if has_attr disallowed attrs then 86 - report_disallowed_attr "picture" disallowed collector 87 - ) disallowed_picture_attrs 86 + check_disallowed_attrs "picture" disallowed_picture_attrs attrs collector 88 87 89 88 let check_source_attrs_in_picture attrs collector = 90 - List.iter (fun disallowed -> 91 - if has_attr disallowed attrs then 92 - report_disallowed_attr "source" disallowed collector 93 - ) disallowed_source_attrs_in_picture; 94 - (* source in picture requires srcset *) 95 - if not (has_attr "srcset" attrs) then 96 - Message_collector.add_typed collector 97 - (`Srcset `Source_missing_srcset) 89 + check_disallowed_attrs "source" disallowed_source_attrs_in_picture attrs collector; 90 + if not (Attr_utils.has_attr "srcset" attrs) then 91 + Message_collector.add_typed collector (`Srcset `Source_missing_srcset) 98 92 99 93 let check_img_attrs attrs collector = 100 - List.iter (fun disallowed -> 101 - if has_attr disallowed attrs then 102 - report_disallowed_attr "img" disallowed collector 103 - ) disallowed_img_attrs 94 + check_disallowed_attrs "img" disallowed_img_attrs attrs collector 104 95 105 96 let start_element state ~name ~namespace ~attrs collector = 106 97 let name_lower = String.lowercase_ascii name in ··· 112 103 end; 113 104 114 105 (* Rest of checks only apply to HTML namespace elements *) 115 - if namespace = None then begin 116 - match name_lower with 106 + match namespace with 107 + | Some _ -> () 108 + | None -> 109 + (match name_lower with 117 110 | "picture" -> 118 111 (* Check if picture is in a disallowed parent context *) 119 112 (match state.parent_stack with ··· 124 117 check_picture_attrs attrs collector; 125 118 state.in_picture <- true; 126 119 state.has_img_in_picture <- false; 127 - state.picture_depth <- 0; (* Will be incremented to 1 at end of function *) 120 + state.picture_depth <- 0; 128 121 state.children_in_picture <- []; 129 122 state.last_was_img <- false; 130 123 state.has_source_after_img <- false; ··· 136 129 state.children_in_picture <- "source" :: state.children_in_picture; 137 130 if state.last_was_img then 138 131 state.has_source_after_img <- true; 139 - (* Check for always-matching source followed by another source *) 140 132 if state.has_always_matching_source then 141 133 state.source_after_always_matching <- true; 142 - (* A source is "always matching" if it has: 143 - - no media and no type attribute, OR 144 - - media attribute with empty/whitespace-only value, OR 145 - - media="all" (with optional whitespace) *) 146 - let media_value = List.find_map (fun (attr_name, v) -> 147 - if String.lowercase_ascii attr_name = "media" then Some v else None 148 - ) attrs in 149 - let has_type = has_attr "type" attrs in 134 + (* A source is "always matching" if it has no media/type, or media="" or media="all" *) 135 + let media_value = Attr_utils.get_attr "media" attrs in 136 + let has_type = Attr_utils.has_attr "type" attrs in 150 137 let is_media_all = match media_value with 151 138 | Some v -> String.lowercase_ascii (String.trim v) = "all" 152 - | None -> false 153 - in 139 + | None -> false in 154 140 let is_media_empty = match media_value with 155 141 | Some v -> String.trim v = "" 156 - | None -> false 157 - in 142 + | None -> false in 158 143 let is_always_matching = match media_value with 159 - | None -> not has_type (* no media, check if no type either *) 144 + | None -> not has_type 160 145 | Some v -> 161 146 let trimmed = String.trim v in 162 147 trimmed = "" || String.lowercase_ascii trimmed = "all" 163 148 in 164 149 if is_always_matching then begin 165 150 state.has_always_matching_source <- true; 166 - if is_media_all then 167 - state.always_matching_is_media_all <- true 168 - else if is_media_empty then 169 - state.always_matching_is_media_empty <- true 151 + (* Only set flags to true, never reset to false *) 152 + if is_media_all then state.always_matching_is_media_all <- true; 153 + if is_media_empty then state.always_matching_is_media_empty <- true 170 154 end 171 155 172 156 | "img" when state.in_picture && state.picture_depth = 1 -> ··· 174 158 state.has_img_in_picture <- true; 175 159 state.children_in_picture <- "img" :: state.children_in_picture; 176 160 state.last_was_img <- true; 177 - (* Check for multiple img elements *) 178 - let img_count = List.filter (fun c -> c = "img") state.children_in_picture |> List.length in 161 + let img_count = List.length (List.filter (( = ) "img") state.children_in_picture) in 179 162 if img_count > 1 then 180 163 report_disallowed_child "picture" "img" collector; 181 - (* Check if always-matching source is followed by img with srcset *) 182 - if state.has_always_matching_source && has_attr "srcset" attrs then begin 183 - if state.always_matching_is_media_all then 184 - Message_collector.add_typed collector (`Misc `Media_all) 185 - else if state.always_matching_is_media_empty then 186 - Message_collector.add_typed collector (`Misc `Media_empty) 187 - else 188 - Message_collector.add_typed collector (`Srcset `Source_needs_media_or_type) 189 - end 164 + if state.has_always_matching_source && Attr_utils.has_attr "srcset" attrs then 165 + Message_collector.add_typed collector 166 + (if state.always_matching_is_media_all then `Misc `Media_all 167 + else if state.always_matching_is_media_empty then `Misc `Media_empty 168 + else `Srcset `Source_needs_media_or_type) 190 169 191 170 | "script" when state.in_picture && state.picture_depth = 1 -> 192 171 state.children_in_picture <- "script" :: state.children_in_picture ··· 197 176 | "img" -> 198 177 check_img_attrs attrs collector 199 178 200 - | _ -> () 201 - end; 179 + | _ -> ()); 202 180 203 181 (* Track depth when inside picture *) 204 182 if state.in_picture then ··· 209 187 state.parent_stack <- name_lower :: state.parent_stack 210 188 211 189 let end_element state ~name ~namespace collector = 212 - if namespace <> None then () 213 - else begin 190 + match namespace with 191 + | Some _ -> () 192 + | None -> 214 193 let name_lower = String.lowercase_ascii name in 215 194 216 - (* Track depth *) 217 195 if state.in_picture then 218 196 state.picture_depth <- state.picture_depth - 1; 219 197 220 198 if name_lower = "picture" && state.picture_depth = 0 then begin 221 - (* Check if picture had img child *) 222 199 if not state.has_img_in_picture then 223 - Message_collector.add_typed collector 224 - (`Srcset `Picture_missing_img); 225 - (* Check for source after img *) 200 + Message_collector.add_typed collector (`Srcset `Picture_missing_img); 226 201 if state.has_source_after_img then 227 202 report_disallowed_child "picture" "source" collector; 228 - (* Check for source after always-matching source *) 229 - if state.source_after_always_matching then begin 230 - if state.always_matching_is_media_all then 231 - Message_collector.add_typed collector (`Misc `Media_all) 232 - else if state.always_matching_is_media_empty then 233 - Message_collector.add_typed collector (`Misc `Media_empty) 234 - else 235 - Message_collector.add_typed collector (`Srcset `Source_needs_media_or_type) 236 - end; 237 - 203 + if state.source_after_always_matching then 204 + Message_collector.add_typed collector 205 + (if state.always_matching_is_media_all then `Misc `Media_all 206 + else if state.always_matching_is_media_empty then `Misc `Media_empty 207 + else `Srcset `Source_needs_media_or_type); 238 208 state.in_picture <- false 239 209 end; 240 210 241 - (* Pop from parent stack *) 242 - state.parent_stack <- (match state.parent_stack with 243 - | _ :: rest -> rest 244 - | [] -> []) 245 - end 211 + state.parent_stack <- match state.parent_stack with _ :: rest -> rest | [] -> [] 246 212 247 213 let characters state text collector = 248 214 (* Text in picture element is not allowed *)
+4 -8
lib/htmlrw_check/specialized/source_checker.ml
··· 23 23 | ctx :: _ -> ctx 24 24 | [] -> Other 25 25 26 - (** Check if an attribute list contains a specific attribute. *) 27 - let has_attr name attrs = 28 - List.exists (fun (attr_name, _) -> String.lowercase_ascii attr_name = name) attrs 29 - 30 26 let start_element state ~name ~namespace ~attrs collector = 31 27 if namespace <> None then () 32 28 else begin ··· 42 38 let ctx = current_context state in 43 39 begin match ctx with 44 40 | Video | Audio -> 45 - if has_attr "srcset" attrs then 41 + if Attr_utils.has_attr "srcset" attrs then 46 42 Message_collector.add_typed collector 47 43 (`Attr (`Not_allowed (`Attr "srcset", `Elem "source"))); 48 - if has_attr "sizes" attrs then 44 + if Attr_utils.has_attr "sizes" attrs then 49 45 Message_collector.add_typed collector 50 46 (`Attr (`Not_allowed (`Attr "sizes", `Elem "source"))); 51 - if has_attr "width" attrs then 47 + if Attr_utils.has_attr "width" attrs then 52 48 Message_collector.add_typed collector 53 49 (`Attr (`Not_allowed (`Attr "width", `Elem "source"))); 54 - if has_attr "height" attrs then 50 + if Attr_utils.has_attr "height" attrs then 55 51 Message_collector.add_typed collector 56 52 (`Attr (`Not_allowed (`Attr "height", `Elem "source"))) 57 53 | Picture | Other -> ()
+3 -9
lib/htmlrw_check/specialized/srcset_sizes_checker.ml
··· 14 14 let create () = () 15 15 let reset _state = () 16 16 17 - (** Get attribute value *) 18 - let get_attr name attrs = 19 - List.find_map (fun (n, v) -> 20 - if String.lowercase_ascii n = name then Some v else None 21 - ) attrs 22 - 23 17 (** Split string on a character while respecting parentheses *) 24 18 let split_respecting_parens ~sep s = 25 19 let len = String.length s in ··· 971 965 972 966 (* SVG image elements should not have srcset *) 973 967 if namespace <> None && name_lower = "image" then begin 974 - if get_attr "srcset" attrs <> None then 968 + if Attr_utils.get_attr "srcset" attrs <> None then 975 969 Message_collector.add_typed collector 976 970 (`Attr (`Not_allowed (`Attr "srcset", `Elem "image"))) 977 971 end; ··· 980 974 else begin 981 975 (* Check sizes and srcset on img and source *) 982 976 if name_lower = "img" || name_lower = "source" then begin 983 - let sizes_value = get_attr "sizes" attrs in 984 - let srcset_value = get_attr "srcset" attrs in 977 + let sizes_value = Attr_utils.get_attr "sizes" attrs in 978 + let srcset_value = Attr_utils.get_attr "srcset" attrs in 985 979 let has_sizes = sizes_value <> None in 986 980 let has_srcset = srcset_value <> None in 987 981
+1 -2
lib/htmlrw_check/specialized/xhtml_content_checker.ml
··· 46 46 (* Check if data-* attribute has uppercase characters *) 47 47 let check_data_attr_case attrs collector = 48 48 List.iter (fun (attr_name, _) -> 49 - if String.length attr_name > 5 && 50 - String.sub attr_name 0 5 = "data-" then 49 + if String.starts_with ~prefix:"data-" attr_name then 51 50 let suffix = String.sub attr_name 5 (String.length attr_name - 5) in 52 51 if String.exists (fun c -> c >= 'A' && c <= 'Z') suffix then 53 52 Message_collector.add_typed collector (`Attr `Data_uppercase)