OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+3 -1
lib/html5_checker/checker_registry.ml
··· 37 37 Hashtbl.replace reg "option" Option_checker.checker; 38 38 Hashtbl.replace reg "language" Language_checker.checker; 39 39 Hashtbl.replace reg "microdata" Microdata_checker.checker; 40 - (* Hashtbl.replace reg "table" Table_checker.checker; *) 40 + Hashtbl.replace reg "importmap" Importmap_checker.checker; 41 + Hashtbl.replace reg "table" Table_checker.checker; 42 + Hashtbl.replace reg "mime-type" Mime_type_checker.checker; 41 43 (* Hashtbl.replace reg "heading" Heading_checker.checker; *) 42 44 (* Hashtbl.replace reg "content" Content_checker.checker; *) 43 45 reg
+336
lib/html5_checker/specialized/importmap_checker.ml
··· 1 + (** Importmap validation checker. 2 + 3 + Validates that <script type="importmap"> elements contain valid JSON 4 + and conform to importmap structural requirements. *) 5 + 6 + type state = { 7 + mutable in_importmap : bool; 8 + content : Buffer.t; 9 + } 10 + 11 + let create () = { 12 + in_importmap = false; 13 + content = Buffer.create 256; 14 + } 15 + 16 + let reset state = 17 + state.in_importmap <- false; 18 + Buffer.clear state.content 19 + 20 + (** Simple JSON value representation *) 21 + type json = 22 + | JNull 23 + | JBool of bool 24 + | JNumber of float 25 + | JString of string 26 + | JArray of json list 27 + | JObject of (string * json) list 28 + 29 + (** Simple JSON parser *) 30 + let parse_json s_orig = 31 + let s = String.trim s_orig in 32 + let len = String.length s in 33 + if len = 0 then Error "Empty JSON" 34 + else 35 + let pos = ref 0 in 36 + 37 + let skip_ws () = 38 + while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t' || s.[!pos] = '\n' || s.[!pos] = '\r') do 39 + incr pos 40 + done 41 + in 42 + 43 + let peek () = if !pos < len then Some s.[!pos] else None in 44 + let consume () = let c = s.[!pos] in incr pos; c in 45 + 46 + let rec parse_value () = 47 + skip_ws (); 48 + match peek () with 49 + | None -> Error "Unexpected end of input" 50 + | Some '{' -> parse_object () 51 + | Some '[' -> parse_array () 52 + | Some '"' -> parse_string () 53 + | Some 't' -> parse_true () 54 + | Some 'f' -> parse_false () 55 + | Some 'n' -> parse_null () 56 + | Some c when c = '-' || (c >= '0' && c <= '9') -> parse_number () 57 + | Some _ -> Error "Unexpected character" 58 + 59 + and parse_object () = 60 + ignore (consume ()); (* consume { *) 61 + skip_ws (); 62 + match peek () with 63 + | Some '}' -> ignore (consume ()); Ok (JObject []) 64 + | _ -> 65 + let rec parse_members acc = 66 + skip_ws (); 67 + match parse_string () with 68 + | Error e -> Error e 69 + | Ok (JString key) -> 70 + skip_ws (); 71 + (match peek () with 72 + | Some ':' -> 73 + ignore (consume ()); 74 + (match parse_value () with 75 + | Error e -> Error e 76 + | Ok value -> 77 + skip_ws (); 78 + let acc' = (key, value) :: acc in 79 + match peek () with 80 + | Some ',' -> ignore (consume ()); parse_members acc' 81 + | Some '}' -> ignore (consume ()); Ok (JObject (List.rev acc')) 82 + | _ -> Error "Expected ',' or '}'") 83 + | _ -> Error "Expected ':'") 84 + | Ok _ -> Error "Expected string key" 85 + in 86 + parse_members [] 87 + 88 + and parse_array () = 89 + ignore (consume ()); (* consume [ *) 90 + skip_ws (); 91 + match peek () with 92 + | Some ']' -> ignore (consume ()); Ok (JArray []) 93 + | _ -> 94 + let rec parse_elements acc = 95 + match parse_value () with 96 + | Error e -> Error e 97 + | Ok value -> 98 + skip_ws (); 99 + let acc' = value :: acc in 100 + match peek () with 101 + | Some ',' -> ignore (consume ()); parse_elements acc' 102 + | Some ']' -> ignore (consume ()); Ok (JArray (List.rev acc')) 103 + | _ -> Error "Expected ',' or ']'" 104 + in 105 + parse_elements [] 106 + 107 + and parse_string () = 108 + skip_ws (); 109 + match peek () with 110 + | Some '"' -> 111 + ignore (consume ()); 112 + let buf = Buffer.create 32 in 113 + let rec read () = 114 + match peek () with 115 + | None -> Error "Unterminated string" 116 + | Some '"' -> ignore (consume ()); Ok (JString (Buffer.contents buf)) 117 + | Some '\\' -> 118 + ignore (consume ()); 119 + (match peek () with 120 + | None -> Error "Unterminated escape" 121 + | Some c -> ignore (consume ()); Buffer.add_char buf c; read ()) 122 + | Some c -> ignore (consume ()); Buffer.add_char buf c; read () 123 + in 124 + read () 125 + | _ -> Error "Expected string" 126 + 127 + and parse_number () = 128 + let start = !pos in 129 + if peek () = Some '-' then incr pos; 130 + while !pos < len && s.[!pos] >= '0' && s.[!pos] <= '9' do incr pos done; 131 + if !pos < len && s.[!pos] = '.' then begin 132 + incr pos; 133 + while !pos < len && s.[!pos] >= '0' && s.[!pos] <= '9' do incr pos done 134 + end; 135 + if !pos < len && (s.[!pos] = 'e' || s.[!pos] = 'E') then begin 136 + incr pos; 137 + if !pos < len && (s.[!pos] = '+' || s.[!pos] = '-') then incr pos; 138 + while !pos < len && s.[!pos] >= '0' && s.[!pos] <= '9' do incr pos done 139 + end; 140 + let num_str = String.sub s start (!pos - start) in 141 + match float_of_string_opt num_str with 142 + | Some f -> Ok (JNumber f) 143 + | None -> Error "Invalid number" 144 + 145 + and parse_true () = 146 + if !pos + 4 <= len && String.sub s !pos 4 = "true" then 147 + (pos := !pos + 4; Ok (JBool true)) 148 + else Error "Expected 'true'" 149 + 150 + and parse_false () = 151 + if !pos + 5 <= len && String.sub s !pos 5 = "false" then 152 + (pos := !pos + 5; Ok (JBool false)) 153 + else Error "Expected 'false'" 154 + 155 + and parse_null () = 156 + if !pos + 4 <= len && String.sub s !pos 4 = "null" then 157 + (pos := !pos + 4; Ok JNull) 158 + else Error "Expected 'null'" 159 + in 160 + 161 + match parse_value () with 162 + | Error e -> Error e 163 + | Ok v -> 164 + skip_ws (); 165 + if !pos = len then Ok v 166 + else Error "Unexpected content after JSON" 167 + 168 + (** Validate importmap structure *) 169 + type importmap_error = 170 + | InvalidJSON of string 171 + | EmptyKey of string (* property name where empty key was found *) 172 + | NotObject of string (* property name that should be object but isn't *) 173 + | NotString of string (* property name that should be string but isn't *) 174 + | ForbiddenProperty of string 175 + | SlashKeyWithoutSlashValue of string (* property name where slash key doesn't have slash value *) 176 + | InvalidScopeKey (* scope key is not a valid URL *) 177 + | InvalidScopeValue of string (* scope value is not a valid URL *) 178 + 179 + (** Check if a string looks like a valid URL-like specifier for importmaps *) 180 + let is_valid_url_like s = 181 + if String.length s = 0 then false 182 + else 183 + (* Valid URL-like: starts with /, ./, ../, or has a scheme followed by :// or : *) 184 + let starts_with_slash = s.[0] = '/' in 185 + let starts_with_dot_slash = String.length s >= 2 && s.[0] = '.' && s.[1] = '/' in 186 + let starts_with_dot_dot_slash = String.length s >= 3 && s.[0] = '.' && s.[1] = '.' && s.[2] = '/' in 187 + let has_scheme = 188 + match String.index_opt s ':' with 189 + | None -> false 190 + | Some pos when pos > 0 -> 191 + (* Check that characters before : are valid scheme characters *) 192 + let scheme = String.sub s 0 pos in 193 + String.length scheme > 0 && 194 + String.for_all (fun c -> 195 + (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || 196 + (c >= '0' && c <= '9') || c = '+' || c = '-' || c = '.' 197 + ) scheme 198 + | _ -> false 199 + in 200 + starts_with_slash || starts_with_dot_slash || starts_with_dot_dot_slash || has_scheme 201 + 202 + let validate_importmap s = 203 + match parse_json s with 204 + | Error msg -> [InvalidJSON msg] 205 + | Ok json -> 206 + let errors = ref [] in 207 + let add_error e = errors := e :: !errors in 208 + 209 + (match json with 210 + | JObject members -> 211 + List.iter (fun (key, value) -> 212 + (* Check for forbidden top-level properties *) 213 + if key <> "imports" && key <> "scopes" && key <> "integrity" then 214 + add_error (ForbiddenProperty key); 215 + 216 + (* Check imports *) 217 + if key = "imports" then begin 218 + match value with 219 + | JObject import_members -> 220 + List.iter (fun (ikey, ivalue) -> 221 + if ikey = "" then add_error (EmptyKey "imports"); 222 + (* Check slash-ending consistency *) 223 + let key_ends_with_slash = String.length ikey > 0 && ikey.[String.length ikey - 1] = '/' in 224 + match ivalue with 225 + | JString v -> 226 + if key_ends_with_slash then begin 227 + let val_ends_with_slash = String.length v > 0 && v.[String.length v - 1] = '/' in 228 + if not val_ends_with_slash then 229 + add_error (SlashKeyWithoutSlashValue "imports") 230 + end 231 + | JNull -> () (* null is allowed *) 232 + | _ -> add_error (NotString ("imports[" ^ ikey ^ "]")) 233 + ) import_members 234 + | _ -> add_error (NotObject "imports") 235 + end; 236 + 237 + (* Check scopes *) 238 + if key = "scopes" then begin 239 + match value with 240 + | JObject scope_members -> 241 + List.iter (fun (skey, svalue) -> 242 + if skey = "" then add_error (EmptyKey "scopes"); 243 + (* Check that scope key is a valid URL *) 244 + if skey <> "" && not (is_valid_url_like skey) then 245 + add_error InvalidScopeKey; 246 + match svalue with 247 + | JObject scope_imports -> 248 + List.iter (fun (sikey, sivalue) -> 249 + if sikey = "" then add_error (EmptyKey ("scopes[" ^ skey ^ "]")); 250 + match sivalue with 251 + | JString v -> 252 + (* Check that scope value is a valid URL *) 253 + if not (is_valid_url_like v) then 254 + add_error (InvalidScopeValue sikey) 255 + | JNull -> () 256 + | _ -> add_error (NotString ("scopes[" ^ skey ^ "][" ^ sikey ^ "]")) 257 + ) scope_imports 258 + | _ -> add_error (NotObject ("scopes[" ^ skey ^ "]")) 259 + ) scope_members 260 + | _ -> add_error (NotObject "scopes") 261 + end 262 + ) members 263 + | _ -> add_error (NotObject "root")); 264 + 265 + List.rev !errors 266 + 267 + let start_element state ~name ~namespace ~attrs _collector = 268 + if namespace <> None then () 269 + else begin 270 + let name_lower = String.lowercase_ascii name in 271 + if name_lower = "script" then begin 272 + (* Check if type="importmap" *) 273 + let type_attr = List.find_opt (fun (n, _) -> 274 + String.lowercase_ascii n = "type" 275 + ) attrs in 276 + match type_attr with 277 + | Some (_, v) when String.lowercase_ascii v = "importmap" -> 278 + state.in_importmap <- true; 279 + Buffer.clear state.content 280 + | _ -> () 281 + end 282 + end 283 + 284 + let error_to_message = function 285 + | InvalidJSON _ -> 286 + "A script \xe2\x80\x9cscript\xe2\x80\x9d with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have valid JSON content." 287 + | EmptyKey prop -> 288 + Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain non-empty keys." prop 289 + | NotObject prop -> 290 + Printf.sprintf "The value of the \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object." prop 291 + | NotString _ -> 292 + "A specifier map defined in a \xe2\x80\x9cimports\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain string values." 293 + | ForbiddenProperty prop -> 294 + Printf.sprintf "The \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d is not an allowed property." prop 295 + | SlashKeyWithoutSlashValue prop -> 296 + Printf.sprintf "A specifier map defined in a \xe2\x80\x9c%s\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must have values that end with \xe2\x80\x9c/\xe2\x80\x9d when its corresponding key ends with \xe2\x80\x9c/\xe2\x80\x9d." prop 297 + | InvalidScopeKey -> 298 + "The value of the \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must be a JSON object whose keys are valid URL strings." 299 + | InvalidScopeValue _ -> 300 + "A specifier map defined in a \xe2\x80\x9cscopes\xe2\x80\x9d property within the content of a \xe2\x80\x9cscript\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9cimportmap\xe2\x80\x9d must only contain valid URL values." 301 + 302 + let end_element state ~name ~namespace collector = 303 + if namespace <> None then () 304 + else begin 305 + let name_lower = String.lowercase_ascii name in 306 + if name_lower = "script" && state.in_importmap then begin 307 + let content = Buffer.contents state.content in 308 + let errors = validate_importmap content in 309 + List.iter (fun err -> 310 + Message_collector.add_error collector 311 + ~message:(error_to_message err) 312 + ~code:"importmap-invalid" 313 + ~element:"script" 314 + ~attribute:"type" 315 + () 316 + ) errors; 317 + state.in_importmap <- false 318 + end 319 + end 320 + 321 + let characters state text _collector = 322 + if state.in_importmap then 323 + Buffer.add_string state.content text 324 + 325 + let end_document _state _collector = () 326 + 327 + let checker = 328 + (module struct 329 + type nonrec state = state 330 + let create = create 331 + let reset = reset 332 + let start_element = start_element 333 + let end_element = end_element 334 + let characters = characters 335 + let end_document = end_document 336 + end : Checker.S)
+5
lib/html5_checker/specialized/importmap_checker.mli
··· 1 + (** Importmap validation checker. 2 + 3 + Validates that <script type="importmap"> elements contain valid JSON. *) 4 + 5 + val checker : Checker.t
+60 -46
lib/html5_checker/specialized/microdata_checker.ml
··· 67 67 let is_url s = 68 68 String.contains s ':' 69 69 70 - (** Validate that a URL is a valid absolute URL for itemtype. 71 - itemtype must be an absolute URL per the HTML5 spec. 72 - http/https URLs require :// but other schemes like mailto:, data:, javascript: don't. *) 73 - let validate_itemtype_url url = 74 - let url = String.trim url in 75 - if String.length url = 0 then 76 - Error "itemtype must not be empty" 70 + (** Validate that a URL is a valid absolute URL for itemtype/itemid. 71 + Uses the comprehensive URL validation from Url_checker. *) 72 + let validate_microdata_url url element attr_name = 73 + let url_trimmed = String.trim url in 74 + if String.length url_trimmed = 0 then 75 + Some (Printf.sprintf 76 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad absolute URL: Must be non-empty." 77 + url attr_name element) 77 78 else 78 - match String.index_opt url ':' with 79 - | None -> Error "Expected a slash (\"/\")." 80 - | Some colon_pos -> 81 - if colon_pos = 0 then 82 - Error "Expected a slash (\"/\")." 83 - else 84 - let scheme = String.lowercase_ascii (String.sub url 0 colon_pos) in 85 - (* Schemes that require :// for itemtype validation 86 - Note: The Nu validator only enforces :// for http, https, and ftp *) 87 - let special_schemes = [ 88 - "http"; "https"; "ftp" 89 - ] in 90 - if List.mem scheme special_schemes then begin 91 - if colon_pos + 2 >= String.length url then 92 - Error "Expected a slash (\"/\")." 93 - else if url.[colon_pos + 1] <> '/' || url.[colon_pos + 2] <> '/' then 94 - Error "Expected a slash (\"/\")." 95 - else 96 - Ok () 97 - end else 98 - (* Other schemes (mailto:, data:, javascript:, etc.) are valid as-is *) 99 - Ok () 79 + (* First check if it has a scheme (required for absolute URL) *) 80 + match Url_checker.extract_scheme url_trimmed with 81 + | None -> 82 + Some (Printf.sprintf 83 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad absolute URL: The string \xe2\x80\x9c%s\xe2\x80\x9d is not an absolute URL." 84 + url attr_name element url) 85 + | Some _ -> 86 + (* Has a scheme - do comprehensive URL validation *) 87 + match Url_checker.validate_url url element attr_name with 88 + | None -> None 89 + | Some error_msg -> 90 + (* Replace "Bad URL:" with "Bad absolute URL:" for microdata *) 91 + let error_msg = Str.global_replace (Str.regexp "Bad URL:") "Bad absolute URL:" error_msg in 92 + Some error_msg 100 93 101 94 (** Check if itemprop value is valid. *) 102 95 let validate_itemprop_value value = ··· 125 118 let itemref_opt = get_attr attrs "itemref" in 126 119 let itemprop_opt = get_attr attrs "itemprop" in 127 120 128 - (* Check itemid requires itemscope and itemtype *) 121 + (* Check itemid requires itemscope and itemtype, and validate URL *) 129 122 begin match itemid_opt with 130 - | Some _itemid -> 123 + | Some itemid -> 131 124 if not has_itemscope then 132 125 Message_collector.add_error collector 133 126 ~message:"itemid attribute requires itemscope attribute" ··· 143 136 ?location 144 137 ~element 145 138 ~attribute:"itemid" 146 - () 139 + (); 140 + (* Validate itemid as URL (note: itemid can be relative, unlike itemtype) *) 141 + (match Url_checker.validate_url itemid element "itemid" with 142 + | None -> () 143 + | Some error_msg -> 144 + Message_collector.add_error collector 145 + ~message:error_msg 146 + ~code:"microdata-invalid-itemid" 147 + ?location 148 + ~element 149 + ~attribute:"itemid" 150 + ()) 147 151 | None -> () 148 152 end; 149 153 ··· 184 188 else begin 185 189 (* Validate each itemtype URL (can be space-separated) *) 186 190 let types = split_whitespace itemtype in 187 - List.iter (fun url -> 188 - match validate_itemtype_url url with 189 - | Ok () -> () 190 - | Error msg -> 191 - Message_collector.add_error collector 192 - ~message:(Printf.sprintf 193 - "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9citemtype\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad absolute URL: %s" 194 - url element msg) 195 - ~code:"microdata-invalid-itemtype" 196 - ?location 197 - ~element 198 - ~attribute:"itemtype" 199 - () 200 - ) types 191 + if types = [] then 192 + (* Empty itemtype is an error *) 193 + Message_collector.add_error collector 194 + ~message:(Printf.sprintf 195 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9citemtype\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d." 196 + itemtype element) 197 + ~code:"microdata-invalid-itemtype" 198 + ?location 199 + ~element 200 + ~attribute:"itemtype" 201 + () 202 + else 203 + List.iter (fun url -> 204 + match validate_microdata_url url element "itemtype" with 205 + | None -> () 206 + | Some error_msg -> 207 + Message_collector.add_error collector 208 + ~message:error_msg 209 + ~code:"microdata-invalid-itemtype" 210 + ?location 211 + ~element 212 + ~attribute:"itemtype" 213 + () 214 + ) types 201 215 end 202 216 | None -> () 203 217 end;
+189
lib/html5_checker/specialized/mime_type_checker.ml
··· 1 + (** MIME type validation checker. 2 + 3 + Validates MIME type values in type attributes. *) 4 + 5 + (** Validate a MIME type value. Returns error message or None. *) 6 + let validate_mime_type value element attr_name = 7 + let len = String.length value in 8 + if len = 0 then 9 + Some (Printf.sprintf 10 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Empty value." 11 + value attr_name element) 12 + else if value.[len - 1] = ' ' || value.[len - 1] = '\t' then 13 + Some (Printf.sprintf 14 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Extraneous trailing whitespace." 15 + value attr_name element) 16 + else if len > 0 && (value.[0] = ' ' || value.[0] = '\t') then 17 + Some (Printf.sprintf 18 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Expected a token character but saw \xe2\x80\x9c \xe2\x80\x9d instead." 19 + value attr_name element) 20 + else 21 + (* Parse type/subtype *) 22 + let slash_pos = try Some (String.index value '/') with Not_found -> None in 23 + match slash_pos with 24 + | None -> 25 + (* No slash found - check if it looks like a type without subtype *) 26 + let semicolon_pos = try Some (String.index value ';') with Not_found -> None in 27 + (match semicolon_pos with 28 + | Some _ -> 29 + Some (Printf.sprintf 30 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 31 + value attr_name element) 32 + | None -> 33 + Some (Printf.sprintf 34 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 35 + value attr_name element)) 36 + | Some slash_pos -> 37 + (* Check for empty subtype *) 38 + let after_slash = String.sub value (slash_pos + 1) (len - slash_pos - 1) in 39 + let subtype_end = 40 + try String.index after_slash ';' 41 + with Not_found -> String.length after_slash 42 + in 43 + let subtype = String.sub after_slash 0 subtype_end in 44 + let subtype_trimmed = String.trim subtype in 45 + if subtype_trimmed = "" then 46 + Some (Printf.sprintf 47 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Subtype missing." 48 + value attr_name element) 49 + else if String.length subtype > 0 && subtype.[String.length subtype - 1] = ' ' then 50 + (* Space before semicolon - also check parameter format *) 51 + let semicolon_pos = try Some (String.index value ';') with Not_found -> None in 52 + (match semicolon_pos with 53 + | Some semi_pos -> 54 + (* Check what comes after semicolon *) 55 + let params = String.sub value (semi_pos + 1) (len - semi_pos - 1) in 56 + let params_trimmed = String.trim params in 57 + if params_trimmed = "" then 58 + Some (Printf.sprintf 59 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 60 + value attr_name element) 61 + else 62 + (* Check for param_name=value format *) 63 + let eq_pos = try Some (String.index params '=') with Not_found -> None in 64 + (match eq_pos with 65 + | None -> 66 + Some (Printf.sprintf 67 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 68 + value attr_name element) 69 + | Some _ -> None) 70 + | None -> None) 71 + else 72 + (* Check parameters after semicolon *) 73 + let semicolon_pos = try Some (String.index value ';') with Not_found -> None in 74 + (match semicolon_pos with 75 + | None -> None (* No parameters - OK *) 76 + | Some semi_pos -> 77 + let params = String.sub value (semi_pos + 1) (len - semi_pos - 1) in 78 + let params_trimmed = String.trim params in 79 + if params_trimmed = "" then 80 + Some (Printf.sprintf 81 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Semicolon seen but there was no parameter following it." 82 + value attr_name element) 83 + else 84 + (* Check for param_name=value format *) 85 + let eq_pos = try Some (String.index params '=') with Not_found -> None in 86 + (match eq_pos with 87 + | None -> 88 + Some (Printf.sprintf 89 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 90 + value attr_name element) 91 + | Some eq_pos -> 92 + let param_value = String.sub params (eq_pos + 1) (String.length params - eq_pos - 1) in 93 + let param_value_trimmed = String.trim param_value in 94 + if param_value_trimmed = "" then 95 + Some (Printf.sprintf 96 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Parameter value missing." 97 + value attr_name element) 98 + else if param_value_trimmed.[0] = '"' then 99 + (* Quoted string - check for closing quote *) 100 + let quote_end = try Some (String.index_from param_value_trimmed 1 '"') with 101 + | Not_found -> None 102 + | Invalid_argument _ -> None 103 + in 104 + (match quote_end with 105 + | Some _ -> None (* Properly quoted *) 106 + | None -> 107 + (* Check for escaped quote at end *) 108 + let has_backslash_at_end = 109 + String.length param_value_trimmed >= 2 && 110 + param_value_trimmed.[String.length param_value_trimmed - 1] = '\\' 111 + in 112 + if has_backslash_at_end then 113 + Some (Printf.sprintf 114 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Unfinished quoted string." 115 + value attr_name element) 116 + else 117 + Some (Printf.sprintf 118 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: Unfinished quoted string." 119 + value attr_name element)) 120 + else 121 + None)) 122 + 123 + (** Elements and attributes that contain MIME types. *) 124 + let mime_type_attrs = [ 125 + ("link", ["type"]); 126 + ("style", ["type"]); 127 + ("script", ["type"]); 128 + ("source", ["type"]); 129 + ("embed", ["type"]); 130 + ("object", ["type"]); 131 + ] 132 + 133 + type state = unit 134 + 135 + let create () = () 136 + let reset _state = () 137 + 138 + let get_attr_value name attrs = 139 + List.find_map (fun (k, v) -> 140 + if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None 141 + ) attrs 142 + 143 + let start_element _state ~name ~namespace ~attrs collector = 144 + if namespace <> None then () 145 + else begin 146 + let name_lower = String.lowercase_ascii name in 147 + match List.assoc_opt name_lower mime_type_attrs with 148 + | None -> () 149 + | Some type_attrs -> 150 + List.iter (fun attr_name -> 151 + match get_attr_value attr_name attrs with 152 + | None -> () 153 + | Some value -> 154 + (* Don't validate empty type attributes or special script types *) 155 + if value = "" then () 156 + else if name_lower = "script" then 157 + (* script type can be module, importmap, etc. - skip validation for non-MIME types *) 158 + let value_lower = String.lowercase_ascii value in 159 + if value_lower = "module" || value_lower = "importmap" || 160 + not (String.contains value '/') then () 161 + else 162 + match validate_mime_type value name attr_name with 163 + | None -> () 164 + | Some err -> 165 + Message_collector.add_error collector 166 + ~message:err ~code:"bad-mime-type" ~element:name ~attribute:attr_name () 167 + else 168 + match validate_mime_type value name attr_name with 169 + | None -> () 170 + | Some err -> 171 + Message_collector.add_error collector 172 + ~message:err ~code:"bad-mime-type" ~element:name ~attribute:attr_name () 173 + ) type_attrs 174 + end 175 + 176 + let end_element _state ~name:_ ~namespace:_ _collector = () 177 + let characters _state _text _collector = () 178 + let end_document _state _collector = () 179 + 180 + let checker = 181 + (module struct 182 + type nonrec state = state 183 + let create = create 184 + let reset = reset 185 + let start_element = start_element 186 + let end_element = end_element 187 + let characters = characters 188 + let end_document = end_document 189 + end : Checker.S)
+5
lib/html5_checker/specialized/mime_type_checker.mli
··· 1 + (** MIME type validation checker. 2 + 3 + Validates MIME type values in type attributes. *) 4 + 5 + val checker : Checker.t
+42 -40
lib/html5_checker/specialized/table_checker.ml
··· 767 767 768 768 let reset state = state.tables := [] 769 769 770 + let is_html_namespace = function 771 + | None -> true (* HTML mode - no namespace specified *) 772 + | Some ns -> ns = html_ns (* XHTML mode - check namespace *) 773 + 770 774 let start_element state ~name ~namespace ~attrs collector = 771 - match namespace with 772 - | Some ns when ns = html_ns -> ( 773 - match name with 774 - | "table" -> 775 - (* Push a new table onto the stack *) 776 - state.tables := make_table () :: !(state.tables) 777 - | _ -> ( 778 - match !(state.tables) with 779 - | [] -> () 780 - | table :: _ -> ( 781 - match name with 782 - | "td" -> start_cell table false attrs collector 783 - | "th" -> start_cell table true attrs collector 784 - | "tr" -> start_row table collector 785 - | "tbody" | "thead" | "tfoot" -> start_row_group table name collector 786 - | "col" -> start_col table attrs collector 787 - | "colgroup" -> start_colgroup table attrs collector 788 - | _ -> ()))) 789 - | _ -> () 775 + if is_html_namespace namespace then ( 776 + let name_lower = String.lowercase_ascii name in 777 + match name_lower with 778 + | "table" -> 779 + (* Push a new table onto the stack *) 780 + state.tables := make_table () :: !(state.tables) 781 + | _ -> ( 782 + match !(state.tables) with 783 + | [] -> () 784 + | table :: _ -> ( 785 + match name_lower with 786 + | "td" -> start_cell table false attrs collector 787 + | "th" -> start_cell table true attrs collector 788 + | "tr" -> start_row table collector 789 + | "tbody" | "thead" | "tfoot" -> start_row_group table name collector 790 + | "col" -> start_col table attrs collector 791 + | "colgroup" -> start_colgroup table attrs collector 792 + | _ -> ()))) 790 793 791 794 let end_element state ~name ~namespace collector = 792 - match namespace with 793 - | Some ns when ns = html_ns -> ( 794 - match name with 795 - | "table" -> ( 796 - match !(state.tables) with 797 - | [] -> failwith "Bug: end table but no table on stack" 798 - | table :: rest -> 799 - end_table table collector; 800 - state.tables := rest) 801 - | _ -> ( 802 - match !(state.tables) with 803 - | [] -> () 804 - | table :: _ -> ( 805 - match name with 806 - | "td" | "th" -> end_cell table 807 - | "tr" -> end_row table collector 808 - | "tbody" | "thead" | "tfoot" -> end_row_group_handler table collector 809 - | "col" -> end_col table 810 - | "colgroup" -> end_colgroup table 811 - | _ -> ()))) 812 - | _ -> () 795 + if is_html_namespace namespace then ( 796 + let name_lower = String.lowercase_ascii name in 797 + match name_lower with 798 + | "table" -> ( 799 + match !(state.tables) with 800 + | [] -> () (* End tag without start - ignore *) 801 + | table :: rest -> 802 + end_table table collector; 803 + state.tables := rest) 804 + | _ -> ( 805 + match !(state.tables) with 806 + | [] -> () 807 + | table :: _ -> ( 808 + match name_lower with 809 + | "td" | "th" -> end_cell table 810 + | "tr" -> end_row table collector 811 + | "tbody" | "thead" | "tfoot" -> end_row_group_handler table collector 812 + | "col" -> end_col table 813 + | "colgroup" -> end_colgroup table 814 + | _ -> ()))) 813 815 814 816 let characters _state _text _collector = () 815 817
+1 -1
test/debug_check.ml
··· 1 1 let () = 2 - let test_file = "validator/tests/html/microdata/itemtype/scheme-https-no-slash-novalid.html" in 2 + let test_file = "validator/tests/html/mime-types/004-novalid.html" in 3 3 let ic = open_in test_file in 4 4 let html = really_input_string ic (in_channel_length ic) in 5 5 close_in ic;