OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 84a2203f96dd85d03a6a6da73df91e62c08db43a 481 lines 22 kB view raw
1(** Attribute restrictions checker - validates that certain attributes 2 are not used on elements where they're not allowed. *) 3 4(** List of (element, [disallowed attributes]) pairs for HTML elements. *) 5let disallowed_attrs_html = [ 6 (* Elements that cannot have href attribute (RDFa misuses) *) 7 ("img", ["href"]); 8 ("p", ["href"]); 9 ("div", ["href"]); 10 (* a cannot have src or media *) 11 ("a", ["src"; "media"]); 12 (* area cannot have media *) 13 ("area", ["media"]); 14 (* Various elements cannot have srcset *) 15 ("audio", ["srcset"]); 16 ("video", ["srcset"]); 17 ("object", ["srcset"]); 18 ("link", ["srcset"]); (* except when rel=preload and as=image *) 19 ("track", ["srcset"]); 20 ("input", ["srcset"]); (* except type=image, but we check more strictly *) 21 ("image", ["srcset"]); (* SVG image element *) 22] 23 24(** SVG elements that cannot have xml:id attribute. *) 25let svg_no_xml_id = [ 26 "rect"; "circle"; "ellipse"; "line"; "polyline"; "polygon"; "path"; 27 "text"; "tspan"; "textPath"; "image"; "use"; "symbol"; "defs"; "g"; 28 "svg"; "marker"; "pattern"; "clipPath"; "mask"; "linearGradient"; 29 "radialGradient"; "stop"; "filter"; "feBlend"; "feColorMatrix"; 30 "feComponentTransfer"; "feComposite"; "feConvolveMatrix"; "feDiffuseLighting"; 31 "feDisplacementMap"; "feDistantLight"; "feDropShadow"; "feFlood"; 32 "feFuncA"; "feFuncB"; "feFuncG"; "feFuncR"; "feGaussianBlur"; "feImage"; 33 "feMerge"; "feMergeNode"; "feMorphology"; "feOffset"; "fePointLight"; 34 "feSpecularLighting"; "feSpotLight"; "feTile"; "feTurbulence"; 35] 36 37type state = { 38 mutable is_xhtml : bool; (* Track if we're in XHTML mode based on xmlns *) 39} 40 41let create () = { is_xhtml = false } 42let reset state = state.is_xhtml <- false 43 44(** Input types that allow the list attribute. *) 45let input_types_allowing_list = [ 46 "color"; "date"; "datetime-local"; "email"; "month"; "number"; 47 "range"; "search"; "tel"; "text"; "time"; "url"; "week" 48] 49 50(** Report disallowed attribute error *) 51let report_disallowed_attr element attr collector = 52 Message_collector.add_typed collector 53 (`Attr (`Not_allowed (`Attr attr, `Elem element))) 54 55let start_element state ~element collector = 56 match element.Element.tag with 57 | Tag.Html _ -> 58 let name = Tag.tag_to_string element.tag in 59 let name_lower = String.lowercase_ascii name in 60 let attrs = element.raw_attrs in 61 62 (* Detect XHTML mode from xmlns attribute on html element *) 63 if name_lower = "html" then begin 64 match Attr_utils.get_attr "xmlns" attrs with 65 | Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true 66 | _ -> () 67 end; 68 69 (* Check HTML element attribute restrictions *) 70 (match List.assoc_opt name_lower disallowed_attrs_html with 71 | Some disallowed -> 72 List.iter (fun attr -> 73 if Attr_utils.has_attr attr attrs then 74 report_disallowed_attr name_lower attr collector 75 ) disallowed 76 | None -> ()); 77 78 (* Check for xml:base attribute - not allowed in HTML *) 79 if name_lower = "html" then begin 80 if Attr_utils.has_attr "xml:base" attrs then 81 report_disallowed_attr name_lower "xml:base" collector 82 end; 83 84 (* Check for xmlns:* prefixed attributes - not allowed in HTML *) 85 (* Standard xmlns declarations are allowed but custom prefixes are not *) 86 List.iter (fun (attr_name, _) -> 87 let attr_lower = String.lowercase_ascii attr_name in 88 if String.starts_with ~prefix:"xmlns:" attr_lower then begin 89 let prefix = String.sub attr_lower 6 (String.length attr_lower - 6) in 90 (* Only xmlns:xlink (with correct value) and xmlns:xml are allowed *) 91 if prefix <> "xlink" && prefix <> "xml" then 92 Message_collector.add_typed collector 93 (`Attr (`Not_allowed_here (`Attr attr_name))) 94 end 95 ) attrs; 96 97 (* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *) 98 (* xml:id is never valid on SVG elements in HTML5 *) 99 if List.mem name_lower svg_no_xml_id then begin 100 if Attr_utils.has_attr "xml:id" attrs then 101 report_disallowed_attr name_lower "xml:id" collector 102 end; 103 104 (* SVG feConvolveMatrix requires order attribute *) 105 if name_lower = "feconvolvematrix" then begin 106 if not (Attr_utils.has_attr "order" attrs) then 107 Message_collector.add_typed collector 108 (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order"))) 109 end; 110 111 (* Validate style type attribute - must be "text/css" or omitted *) 112 if name_lower = "style" then begin 113 List.iter (fun (attr_name, attr_value) -> 114 let attr_lower = String.lowercase_ascii attr_name in 115 if attr_lower = "type" then begin 116 let value_lower = String.lowercase_ascii (String.trim attr_value) in 117 if value_lower <> "text/css" then 118 Message_collector.add_typed collector (`Misc `Style_type_invalid) 119 end 120 ) attrs 121 end; 122 123 (* Validate object element requires data or type attribute *) 124 if name_lower = "object" then begin 125 let has_data = Attr_utils.has_attr "data" attrs in 126 let has_type = Attr_utils.has_attr "type" attrs in 127 if not has_data && not has_type then 128 Message_collector.add_typed collector 129 (`Attr (`Missing (`Elem "object", `Attr "data"))) 130 end; 131 132 (* Validate link imagesizes/imagesrcset attributes *) 133 if name_lower = "link" then begin 134 let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in 135 let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in 136 let rel_value = Attr_utils.get_attr "rel" attrs in 137 let as_value = Attr_utils.get_attr "as" attrs in 138 139 (* imagesizes requires imagesrcset *) 140 if has_imagesizes && not has_imagesrcset then 141 Message_collector.add_typed collector (`Srcset `Imagesizes_without_imagesrcset); 142 143 (* imagesrcset requires as="image" *) 144 if has_imagesrcset then begin 145 let as_is_image = match as_value with 146 | Some v -> String.lowercase_ascii (String.trim v) = "image" 147 | None -> false 148 in 149 if not as_is_image then 150 Message_collector.add_typed collector (`Link `Imagesrcset_requires_as_image) 151 end; 152 153 (* as attribute requires rel="preload" or rel="modulepreload" *) 154 (match as_value with 155 | Some _ -> 156 let rel_is_preload = match rel_value with 157 | Some v -> 158 let rel_lower = String.lowercase_ascii (String.trim v) in 159 String.length rel_lower > 0 && 160 (List.mem "preload" (String.split_on_char ' ' rel_lower) || 161 List.mem "modulepreload" (String.split_on_char ' ' rel_lower)) 162 | None -> false 163 in 164 if not rel_is_preload then 165 Message_collector.add_typed collector (`Link `As_requires_preload) 166 | None -> ()) 167 end; 168 169 (* Validate img usemap attribute - must be hash-name reference with content *) 170 if name_lower = "img" then begin 171 List.iter (fun (attr_name, attr_value) -> 172 let attr_lower = String.lowercase_ascii attr_name in 173 if attr_lower = "usemap" then begin 174 if attr_value = "#" then 175 Message_collector.add_typed collector 176 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 177 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d." 178 attr_value attr_name name)))) 179 end 180 ) attrs 181 end; 182 183 (* Validate embed type attribute - must be valid MIME type *) 184 if name_lower = "embed" then begin 185 List.iter (fun (attr_name, attr_value) -> 186 let attr_lower = String.lowercase_ascii attr_name in 187 if attr_lower = "type" then begin 188 match Dt_mime.validate_mime_type attr_value with 189 | Ok () -> () 190 | Error msg -> 191 Message_collector.add_typed collector 192 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 193 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s" 194 attr_value attr_name name msg)))) 195 end 196 ) attrs 197 end; 198 199 (* Validate width/height on embed and img - must be non-negative integers *) 200 let is_dimension_element = name_lower = "embed" || name_lower = "img" || 201 name_lower = "video" || name_lower = "canvas" || 202 name_lower = "iframe" || name_lower = "source" in 203 if is_dimension_element then begin 204 List.iter (fun (attr_name, attr_value) -> 205 let attr_lower = String.lowercase_ascii attr_name in 206 if attr_lower = "width" || attr_lower = "height" then begin 207 (* Check for non-negative integer only *) 208 let is_valid = 209 String.length attr_value > 0 && 210 String.for_all (fun c -> c >= '0' && c <= '9') attr_value 211 in 212 if not is_valid then begin 213 (* Determine specific error message *) 214 let error_msg = 215 if String.length attr_value = 0 then 216 Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer." 217 attr_name name 218 else if String.contains attr_value '%' then 219 Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead." 220 attr_value attr_name name 221 else if String.length attr_value > 0 && attr_value.[0] = '-' then 222 Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead." 223 attr_value attr_name name 224 else 225 (* Find first non-digit character *) 226 let bad_char = 227 try 228 let i = ref 0 in 229 while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do 230 incr i 231 done; 232 if !i < String.length attr_value then Some attr_value.[!i] else None 233 with _ -> None 234 in 235 match bad_char with 236 | Some c -> 237 Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead." 238 attr_value attr_name name c 239 | None -> 240 Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit." 241 attr_value attr_name name 242 in 243 Message_collector.add_typed collector 244 (`Attr (`Bad_value_generic (`Message error_msg))) 245 end 246 end 247 ) attrs 248 end; 249 250 (* Validate area[shape=default] cannot have coords *) 251 if name_lower = "area" then begin 252 match Attr_utils.get_attr "shape" attrs with 253 | Some s when String.lowercase_ascii (String.trim s) = "default" -> 254 if Attr_utils.has_attr "coords" attrs then 255 Message_collector.add_typed collector 256 (`Attr (`Not_allowed (`Attr "coords", `Elem "area"))) 257 | _ -> () 258 end; 259 260 (* Validate bdo element requires dir attribute, and dir cannot be "auto" *) 261 if name_lower = "bdo" then begin 262 match Attr_utils.get_attr "dir" attrs with 263 | None -> 264 Message_collector.add_typed collector (`Misc `Bdo_missing_dir) 265 | Some v when String.lowercase_ascii (String.trim v) = "auto" -> 266 Message_collector.add_typed collector (`Misc `Bdo_dir_auto) 267 | _ -> () 268 end; 269 270 (* Validate input list attribute - only allowed for certain types *) 271 if name_lower = "input" then begin 272 if Attr_utils.has_attr "list" attrs then begin 273 let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs 274 |> String.trim |> String.lowercase_ascii in 275 if not (List.mem input_type input_types_allowing_list) then 276 Message_collector.add_typed collector (`Input `List_not_allowed) 277 end 278 end; 279 280 (* Validate data-* attributes *) 281 List.iter (fun (attr_name, _) -> 282 let attr_lower = String.lowercase_ascii attr_name in 283 (* Check if it starts with "data-" *) 284 if String.starts_with ~prefix:"data-" attr_lower then begin 285 let after_prefix = String.sub attr_lower 5 (String.length attr_lower - 5) in 286 (* Check if it's exactly "data-" with nothing after *) 287 if after_prefix = "" then 288 report_disallowed_attr name_lower attr_name collector 289 (* Check if the name contains colon - not XML serializable *) 290 else if String.contains after_prefix ':' then 291 Message_collector.add_typed collector 292 (`Attr (`Data_invalid_name (`Reason "must be XML 1.0 4th ed. plus Namespaces NCNames"))) 293 end 294 ) attrs; 295 296 (* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *) 297 if not state.is_xhtml then begin 298 let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in 299 let lang_value = Attr_utils.get_attr "lang" attrs in 300 match xmllang_value with 301 | Some xmllang -> 302 (match lang_value with 303 | None -> 304 Message_collector.add_typed collector (`I18n `Xml_lang_without_lang) 305 | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang -> 306 Message_collector.add_typed collector (`I18n `Xml_lang_without_lang) 307 | _ -> ()) 308 | None -> () 309 end; 310 311 (* Validate spellcheck attribute - must be "true" or "false" or empty *) 312 List.iter (fun (attr_name, attr_value) -> 313 let attr_lower = String.lowercase_ascii attr_name in 314 if attr_lower = "spellcheck" then begin 315 let value_lower = String.lowercase_ascii (String.trim attr_value) in 316 if value_lower <> "" && value_lower <> "true" && value_lower <> "false" then 317 Message_collector.add_typed collector 318 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason ""))) 319 end 320 ) attrs; 321 322 (* Validate enterkeyhint attribute - must be one of specific values *) 323 let valid_enterkeyhint = ["enter"; "done"; "go"; "next"; "previous"; "search"; "send"] in 324 List.iter (fun (attr_name, attr_value) -> 325 let attr_lower = String.lowercase_ascii attr_name in 326 if attr_lower = "enterkeyhint" then begin 327 let value_lower = String.lowercase_ascii (String.trim attr_value) in 328 if not (List.mem value_lower valid_enterkeyhint) then 329 Message_collector.add_typed collector 330 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason ""))) 331 end 332 ) attrs; 333 334 (* Validate headingoffset attribute - must be a number between 0 and 8 *) 335 List.iter (fun (attr_name, attr_value) -> 336 let attr_lower = String.lowercase_ascii attr_name in 337 if attr_lower = "headingoffset" then begin 338 let trimmed = String.trim attr_value in 339 let is_valid = 340 String.length trimmed > 0 && 341 String.for_all (fun c -> c >= '0' && c <= '9') trimmed && 342 (try 343 let n = int_of_string trimmed in 344 n >= 0 && n <= 8 345 with _ -> false) 346 in 347 if not is_valid then 348 Message_collector.add_typed collector (`Misc `Headingoffset_invalid) 349 end 350 ) attrs; 351 352 (* Validate accesskey attribute - each key label must be a single code point *) 353 List.iter (fun (attr_name, attr_value) -> 354 let attr_lower = String.lowercase_ascii attr_name in 355 if attr_lower = "accesskey" then begin 356 (* Split by whitespace to get key labels *) 357 let keys = String.split_on_char ' ' attr_value |> 358 List.filter (fun s -> String.length (String.trim s) > 0) |> 359 List.map String.trim in 360 (* Count Unicode code points in each key *) 361 let count_codepoints s = 362 let len = String.length s in 363 let count = ref 0 in 364 let i = ref 0 in 365 while !i < len do 366 let c = Char.code s.[!i] in 367 if c < 0x80 then incr i 368 else if c < 0xE0 then i := !i + 2 369 else if c < 0xF0 then i := !i + 3 370 else i := !i + 4; 371 incr count 372 done; 373 !count 374 in 375 (* Check for multi-character keys *) 376 List.iter (fun key -> 377 if count_codepoints key > 1 then 378 Message_collector.add_typed collector 379 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 380 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character." 381 attr_value attr_name name)))) 382 ) keys; 383 (* Check for duplicate keys *) 384 let rec find_duplicates seen = function 385 | [] -> () 386 | k :: rest -> 387 if List.mem k seen then 388 Message_collector.add_typed collector 389 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 390 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique." 391 attr_value attr_name name)))) 392 else 393 find_duplicates (k :: seen) rest 394 in 395 find_duplicates [] keys 396 end 397 ) attrs; 398 399 (* Validate that command and popovertarget cannot have aria-expanded *) 400 if name_lower = "button" then begin 401 let has_command = Attr_utils.has_attr "command" attrs in 402 let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in 403 let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in 404 405 if has_command && has_aria_expanded then 406 Message_collector.add_typed collector 407 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 408 `Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute"))); 409 410 if has_popovertarget && has_aria_expanded then 411 Message_collector.add_typed collector 412 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name, 413 `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute"))) 414 end; 415 416 (* Note: data-* uppercase check requires XML parsing which preserves case. 417 The HTML5 parser normalizes attribute names to lowercase, so this check 418 is only effective when the document is parsed as XML. 419 Commenting out until we have XML parsing support. *) 420 ignore state.is_xhtml; 421 422 (* Validate media attribute on link, style, source elements *) 423 let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in 424 if is_media_element then begin 425 List.iter (fun (attr_name, attr_value) -> 426 let attr_lower = String.lowercase_ascii attr_name in 427 if attr_lower = "media" then begin 428 let trimmed = String.trim attr_value in 429 if trimmed <> "" then begin 430 match Dt_media_query.validate_media_query_strict trimmed with 431 | Ok () -> () 432 | Error msg -> 433 Message_collector.add_typed collector 434 (`Attr (`Bad_value_generic (`Message (Printf.sprintf 435 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s" 436 attr_value attr_name name msg)))) 437 end 438 end 439 ) attrs 440 end; 441 442 (* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *) 443 List.iter (fun (attr_name, attr_value) -> 444 let attr_lower = String.lowercase_ascii attr_name in 445 if attr_lower = "prefix" then begin 446 (* Parse prefix attribute value - format: "prefix1: iri1 prefix2: iri2 ..." *) 447 let trimmed = String.trim attr_value in 448 if trimmed <> "" then begin 449 (* Check for empty prefix (starts with : or has space:) *) 450 if String.length trimmed > 0 && trimmed.[0] = ':' then 451 Message_collector.add_typed collector 452 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason ""))) 453 else begin 454 (* Check for invalid prefix names - must start with letter or underscore *) 455 let is_ncname_start c = 456 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_' 457 in 458 if String.length trimmed > 0 && not (is_ncname_start trimmed.[0]) then 459 Message_collector.add_typed collector 460 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason ""))) 461 end 462 end 463 end 464 ) attrs 465 466 | _ -> () (* Skip non-HTML elements *) 467 468let end_element _state ~tag:_ _collector = () 469let characters _state _text _collector = () 470let end_document _state _collector = () 471 472let checker = 473 (module struct 474 type nonrec state = state 475 let create = create 476 let reset = reset 477 let start_element = start_element 478 let end_element = end_element 479 let characters = characters 480 let end_document = end_document 481 end : Checker.S)