OCaml HTML5 parser/serialiser based on Python's JustHTML

more

+1 -1
lib/html5_checker/checker_registry.ml
··· 36 36 Hashtbl.replace reg "autofocus" Autofocus_checker.checker; 37 37 Hashtbl.replace reg "option" Option_checker.checker; 38 38 Hashtbl.replace reg "language" Language_checker.checker; 39 + Hashtbl.replace reg "microdata" Microdata_checker.checker; 39 40 (* Hashtbl.replace reg "table" Table_checker.checker; *) 40 41 (* Hashtbl.replace reg "heading" Heading_checker.checker; *) 41 - (* Hashtbl.replace reg "microdata" Microdata_checker.checker; *) 42 42 (* Hashtbl.replace reg "content" Content_checker.checker; *) 43 43 reg 44 44
+331 -79
lib/html5_checker/datatype/dt_media_query.ml
··· 1 - (** Media query validation - simplified implementation *) 1 + (** Media query validation - strict implementation for HTML5 conformance *) 2 2 3 - (** Media types *) 4 - let media_types = 3 + (** Valid media types per Media Queries Level 4 spec *) 4 + let valid_media_types = 5 5 [ 6 6 "all"; 7 7 "screen"; 8 8 "print"; 9 9 "speech"; 10 + ] 11 + 12 + (** Deprecated media types that should trigger an error *) 13 + let deprecated_media_types = 14 + [ 10 15 "aural"; 11 16 "braille"; 12 17 "handheld"; ··· 16 21 "embossed"; 17 22 ] 18 23 19 - (** Media query keywords *) 20 - let media_keywords = [ "and"; "or"; "not"; "only" ] 24 + (** Deprecated media features that should trigger an error *) 25 + let deprecated_media_features = 26 + [ 27 + "device-width"; 28 + "device-height"; 29 + "device-aspect-ratio"; 30 + ] 31 + 32 + (** Valid media features *) 33 + let valid_media_features = 34 + [ 35 + (* Dimensions *) 36 + "width"; "min-width"; "max-width"; 37 + "height"; "min-height"; "max-height"; 38 + "aspect-ratio"; "min-aspect-ratio"; "max-aspect-ratio"; 39 + (* Display quality *) 40 + "resolution"; "min-resolution"; "max-resolution"; 41 + "scan"; "grid"; "update"; "overflow-block"; "overflow-inline"; 42 + (* Color *) 43 + "color"; "min-color"; "max-color"; 44 + "color-index"; "min-color-index"; "max-color-index"; 45 + "monochrome"; "min-monochrome"; "max-monochrome"; 46 + "color-gamut"; 47 + (* Interaction *) 48 + "pointer"; "any-pointer"; "hover"; "any-hover"; 49 + (* Scripting *) 50 + "scripting"; 51 + (* Light/dark *) 52 + "prefers-color-scheme"; "prefers-contrast"; "prefers-reduced-motion"; 53 + "prefers-reduced-transparency"; 54 + (* Display mode *) 55 + "display-mode"; 56 + (* Inverted colors *) 57 + "inverted-colors"; 58 + (* Forced colors *) 59 + "forced-colors"; 60 + (* Orientation *) 61 + "orientation"; 62 + ] 63 + 64 + (** Valid length units *) 65 + let valid_length_units = ["px"; "em"; "rem"; "vh"; "vw"; "vmin"; "vmax"; "cm"; "mm"; "in"; "pt"; "pc"; "ch"; "ex"] 66 + 67 + (** Valid resolution units *) 68 + let valid_resolution_units = ["dpi"; "dpcm"; "dppx"; "x"] 69 + 70 + (** Media query keywords (unused but kept for documentation) *) 71 + let _media_keywords = [ "and"; "not"; "only" ] 21 72 22 73 (** Check if character is whitespace *) 23 74 let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' ··· 32 83 let is_ident_char c = 33 84 is_ident_start c || (c >= '0' && c <= '9') 34 85 86 + (** Unicode case-fold for Turkish dotted-I etc *) 87 + let lowercase_unicode s = 88 + (* Handle special case: U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE -> i *) 89 + let buf = Buffer.create (String.length s) in 90 + let i = ref 0 in 91 + while !i < String.length s do 92 + let c = s.[!i] in 93 + if c = '\xc4' && !i + 1 < String.length s && s.[!i + 1] = '\xb0' then begin 94 + (* U+0130 -> 'i' + U+0307 (combining dot above), but for simplicity just 'i' followed by U+0307 *) 95 + Buffer.add_string buf "i\xcc\x87"; 96 + i := !i + 2 97 + end else begin 98 + Buffer.add_char buf (Char.lowercase_ascii c); 99 + incr i 100 + end 101 + done; 102 + Buffer.contents buf 103 + 35 104 (** Check balanced parentheses *) 36 105 let check_balanced_parens s = 37 106 let rec check depth i = 38 107 if i >= String.length s then 39 108 if depth = 0 then Ok () 40 - else Error "Unbalanced parentheses: unclosed '('" 109 + else Error "Parse Error." 41 110 else 42 111 let c = s.[i] in 43 112 match c with 44 113 | '(' -> check (depth + 1) (i + 1) 45 114 | ')' -> 46 - if depth = 0 then Error "Unbalanced parentheses: unexpected ')'" 115 + if depth = 0 then Error "Parse Error." 47 116 else check (depth - 1) (i + 1) 48 117 | _ -> check depth (i + 1) 49 118 in 50 119 check 0 0 51 120 52 - (** Extract words (identifiers and keywords) from media query *) 53 - let extract_words s = 54 - let words = ref [] in 55 - let buf = Buffer.create 16 in 56 - let in_parens = ref 0 in 121 + (** Strict media query validation *) 122 + let rec validate_media_query_strict s = 123 + let s = String.trim s in 124 + if String.length s = 0 then Error "Parse Error." 125 + else begin 126 + (* Check for empty commas *) 127 + if s = "," then Error "Parse Error." 128 + else if String.length s > 0 && s.[0] = ',' then Error "Parse Error." 129 + else if String.length s > 0 && s.[String.length s - 1] = ',' then Error "Parse Error." 130 + else if String.contains s ',' then begin 131 + (* Check for empty queries between commas *) 132 + let parts = String.split_on_char ',' s in 133 + if List.exists (fun p -> String.trim p = "") parts then Error "Parse Error." 134 + else begin 135 + (* Validate each media query in the list *) 136 + let rec validate_all = function 137 + | [] -> Ok () 138 + | part :: rest -> 139 + match validate_media_query_strict (String.trim part) with 140 + | Ok () -> validate_all rest 141 + | Error e -> Error e 142 + in 143 + validate_all parts 144 + end 145 + end else begin 146 + (* Single media query *) 147 + match check_balanced_parens s with 148 + | Error e -> Error e 149 + | Ok () -> 150 + (* Check for "and" or "and(" at end *) 151 + let trimmed = String.trim s in 152 + if String.length trimmed >= 3 then begin 153 + let suffix = String.sub trimmed (String.length trimmed - 3) 3 in 154 + if String.lowercase_ascii suffix = "and" then 155 + Error "Parse Error." 156 + else if String.length trimmed >= 4 then begin 157 + let suffix4 = String.sub trimmed (String.length trimmed - 4) 4 in 158 + if String.lowercase_ascii suffix4 = "and(" then 159 + Error "Parse Error." 160 + else 161 + validate_media_query_content trimmed 162 + end else 163 + validate_media_query_content trimmed 164 + end else 165 + validate_media_query_content trimmed 166 + end 167 + end 57 168 58 - for i = 0 to String.length s - 1 do 59 - let c = s.[i] in 60 - match c with 61 - | '(' -> 62 - if Buffer.length buf > 0 then ( 63 - words := Buffer.contents buf :: !words; 64 - Buffer.clear buf); 65 - incr in_parens 66 - | ')' -> 67 - if Buffer.length buf > 0 then ( 68 - words := Buffer.contents buf :: !words; 69 - Buffer.clear buf); 70 - decr in_parens 71 - | _ -> 72 - if !in_parens = 0 then 73 - if is_ident_char c then Buffer.add_char buf c 74 - else if is_whitespace c then 75 - if Buffer.length buf > 0 then ( 76 - words := Buffer.contents buf :: !words; 77 - Buffer.clear buf) 78 - else () 79 - else if Buffer.length buf > 0 then ( 80 - words := Buffer.contents buf :: !words; 81 - Buffer.clear buf) 82 - done; 169 + and validate_media_query_content s = 170 + (* Parse into tokens *) 171 + let len = String.length s in 172 + let i = ref 0 in 173 + let skip_ws () = while !i < len && is_whitespace s.[!i] do incr i done in 174 + 175 + let read_ident () = 176 + let start = !i in 177 + while !i < len && is_ident_char s.[!i] do incr i done; 178 + if !i > start then Some (String.sub s start (!i - start)) 179 + else None 180 + in 181 + 182 + let read_paren_content () = 183 + (* Read until matching ) *) 184 + let start = !i in 185 + let depth = ref 1 in 186 + incr i; (* skip opening ( *) 187 + while !i < len && !depth > 0 do 188 + if s.[!i] = '(' then incr depth 189 + else if s.[!i] = ')' then decr depth; 190 + incr i 191 + done; 192 + String.sub s (start + 1) (!i - start - 2) 193 + in 83 194 84 - if Buffer.length buf > 0 then words := Buffer.contents buf :: !words; 85 - List.rev !words 195 + (* Parse the query *) 196 + skip_ws (); 197 + if !i >= len then Error "Parse Error." 198 + else begin 199 + (* Check for only/not prefix *) 200 + let has_only = ref false in 201 + let has_not = ref false in 202 + (match read_ident () with 203 + | Some w -> 204 + let w_lower = String.lowercase_ascii w in 205 + if w_lower = "only" then (has_only := true; skip_ws ()) 206 + else if w_lower = "not" then (has_not := true; skip_ws ()) 207 + else i := !i - String.length w (* put back *) 208 + | None -> ()); 86 209 87 - (** Validate media query structure *) 210 + skip_ws (); 211 + if !i >= len then begin 212 + if !has_only || !has_not then Error "Parse Error." 213 + else Error "Parse Error." 214 + end else begin 215 + (* Check for media type or ( *) 216 + if s.[!i] = '(' then begin 217 + (* Media feature only *) 218 + let content = read_paren_content () in 219 + validate_media_feature content 220 + end else begin 221 + (* Expect media type *) 222 + match read_ident () with 223 + | None -> Error "Parse Error." 224 + | Some media_type -> 225 + let mt_lower = lowercase_unicode media_type in 226 + (* Check for deprecated media type *) 227 + if List.mem mt_lower deprecated_media_types then 228 + Error (Printf.sprintf "The media \"%s\" has been deprecated" mt_lower) 229 + (* Check if valid media type *) 230 + else if not (List.mem mt_lower valid_media_types) then 231 + Error (Printf.sprintf "unrecognized media \"%s\"." mt_lower) 232 + else begin 233 + skip_ws (); 234 + if !i >= len then Ok () 235 + else begin 236 + (* Check for "and" - must be followed by whitespace *) 237 + let and_start = !i in 238 + match read_ident () with 239 + | None -> Error "Parse Error." 240 + | Some kw -> 241 + let kw_lower = String.lowercase_ascii kw in 242 + if kw_lower <> "and" then Error "Parse Error." 243 + else begin 244 + (* Check that there was whitespace before 'and' *) 245 + if and_start > 0 && not (is_whitespace s.[and_start - 1]) then 246 + Error "Parse Error." 247 + (* Check that there is whitespace after 'and' *) 248 + else if !i < len && s.[!i] = '(' then 249 + Error "Parse Error." 250 + else begin 251 + skip_ws (); 252 + if !i >= len then Error "Parse Error." 253 + else if s.[!i] <> '(' then Error "Parse Error." 254 + else begin 255 + (* Validate remaining features *) 256 + let rec validate_features () = 257 + skip_ws (); 258 + if !i >= len then Ok () 259 + else if s.[!i] = '(' then begin 260 + let content = read_paren_content () in 261 + match validate_media_feature content with 262 + | Error e -> Error e 263 + | Ok () -> 264 + skip_ws (); 265 + if !i >= len then Ok () 266 + else begin 267 + match read_ident () with 268 + | None -> Error "Parse Error." 269 + | Some kw2 -> 270 + let kw2_lower = String.lowercase_ascii kw2 in 271 + if kw2_lower <> "and" then Error "Parse Error." 272 + else begin 273 + skip_ws (); 274 + if !i >= len then Error "Parse Error." 275 + else validate_features () 276 + end 277 + end 278 + end else Error "Parse Error." 279 + in 280 + validate_features () 281 + end 282 + end 283 + end 284 + end 285 + end 286 + end 287 + end 288 + end 289 + 290 + and validate_media_feature content = 291 + let content = String.trim content in 292 + if content = "" then Error "Parse Error." 293 + else begin 294 + (* Check for colon - feature: value *) 295 + match String.index_opt content ':' with 296 + | None -> 297 + (* Just feature name - boolean feature or range syntax *) 298 + let feature_lower = String.lowercase_ascii content in 299 + if List.mem feature_lower deprecated_media_features then 300 + Error (Printf.sprintf "Deprecated media feature \"%s\". For guidance, see the Deprecated Media Features section in the current Media Queries specification." feature_lower) 301 + else if List.mem feature_lower valid_media_features then 302 + Ok () 303 + else 304 + Ok () (* Allow unknown features for forward compatibility *) 305 + | Some colon_pos -> 306 + let feature = String.trim (String.sub content 0 colon_pos) in 307 + let value = String.trim (String.sub content (colon_pos + 1) (String.length content - colon_pos - 1)) in 308 + let feature_lower = String.lowercase_ascii feature in 309 + 310 + (* Check for deprecated features *) 311 + if List.mem feature_lower deprecated_media_features then 312 + Error (Printf.sprintf "Deprecated media feature \"%s\". For guidance, see the Deprecated Media Features section in the current Media Queries specification." feature_lower) 313 + (* Check for incomplete value *) 314 + else if value = "" then 315 + Error "Parse Error." 316 + (* Check for invalid value syntax *) 317 + else if String.length value > 0 && value.[String.length value - 1] = ';' then 318 + Error "Parse Error." 319 + else begin 320 + (* Validate value based on feature type *) 321 + validate_feature_value feature_lower value 322 + end 323 + end 324 + 325 + and validate_feature_value feature value = 326 + (* Width/height features require length values *) 327 + let length_features = ["width"; "min-width"; "max-width"; "height"; "min-height"; "max-height"] in 328 + let color_features = ["color"; "min-color"; "max-color"; "color-index"; "min-color-index"; "max-color-index"; 329 + "monochrome"; "min-monochrome"; "max-monochrome"] in 330 + 331 + if List.mem feature length_features then begin 332 + (* Must be a valid length: number followed by unit *) 333 + let value = String.trim value in 334 + let is_digit c = c >= '0' && c <= '9' in 335 + 336 + (* Parse number - includes sign, digits, and decimal point *) 337 + let i = ref 0 in 338 + let len = String.length value in 339 + while !i < len && (is_digit value.[!i] || value.[!i] = '.' || value.[!i] = '-' || value.[!i] = '+') do 340 + incr i 341 + done; 342 + let num_part = String.sub value 0 !i in 343 + let unit_part = String.sub value !i (len - !i) in 344 + 345 + (* Check if the number is zero (including 0.0, 0.00, etc.) *) 346 + let is_zero num = 347 + let rec check i = 348 + if i >= String.length num then true 349 + else match num.[i] with 350 + | '0' | '.' | '-' -> check (i + 1) 351 + | _ -> false 352 + in 353 + check 0 354 + in 355 + if num_part = "" then Error "Parse Error." 356 + else if is_zero num_part && unit_part = "" then Ok () (* 0 (or 0.0) can be unitless *) 357 + else if unit_part = "" then 358 + Error "only \"0\" can be a \"unit\". You must put a unit after your number" 359 + else begin 360 + let unit_lower = String.lowercase_ascii unit_part in 361 + if List.mem unit_lower valid_length_units then Ok () 362 + else if List.mem unit_lower valid_resolution_units then 363 + Error (Printf.sprintf "\"%s\" is not a \"%s\" value" value feature) 364 + else 365 + Error "Unknown dimension." 366 + end 367 + end else if List.mem feature color_features then begin 368 + (* Must be an integer *) 369 + let value = String.trim value in 370 + let is_digit c = c >= '0' && c <= '9' in 371 + if String.length value > 0 && String.for_all is_digit value then Ok () 372 + else 373 + Error (Printf.sprintf "\"%s\" is not a \"%s\" value" value feature) 374 + end else 375 + Ok () (* Allow other features with any value for now *) 376 + 377 + (** Legacy permissive validation *) 88 378 let validate_media_query s = 89 379 let s = String.trim s in 90 380 if String.length s = 0 then Error "Media query must not be empty" 91 381 else 92 - (* Check balanced parentheses *) 93 382 match check_balanced_parens s with 94 383 | Error _ as e -> e 95 - | Ok () -> 96 - (* Extract and validate words *) 97 - let words = extract_words s in 98 - let words_lower = List.map String.lowercase_ascii words in 99 - 100 - (* Basic validation: check for invalid keyword combinations *) 101 - let rec validate_words prev = function 102 - | [] -> Ok () 103 - | word :: rest -> ( 104 - let word_lower = String.lowercase_ascii word in 105 - match (prev, word_lower) with 106 - | None, "and" | None, "or" -> 107 - Error 108 - (Printf.sprintf 109 - "Media query cannot start with keyword '%s'" word) 110 - | Some "and", "and" | Some "or", "or" | Some "not", "not" -> 111 - Error 112 - (Printf.sprintf "Consecutive '%s' keywords are not allowed" 113 - word) 114 - | Some "only", "only" -> 115 - Error "Consecutive 'only' keywords are not allowed" 116 - | _, _ -> validate_words (Some word_lower) rest) 117 - in 118 - 119 - (* Check if query contains valid media types or features *) 120 - let has_media_type = 121 - List.exists 122 - (fun w -> List.mem (String.lowercase_ascii w) media_types) 123 - words 124 - in 125 - let has_features = String.contains s '(' in 126 - 127 - if not (has_media_type || has_features) then 128 - (* Only keywords, no actual media type or features *) 129 - if List.for_all (fun w -> List.mem w media_keywords) words_lower then 130 - Error "Media query contains only keywords without media type or features" 131 - else Ok () (* Assume other identifiers are valid *) 132 - else validate_words None words 384 + | Ok () -> Ok () 133 385 134 386 module Media_query = struct 135 387 let name = "media query"
+4
lib/html5_checker/datatype/dt_media_query.mli
··· 2 2 3 3 This module provides a validator for CSS media queries as used in HTML5. *) 4 4 5 + (** Strict media query validation for HTML5 conformance checking. 6 + Returns Ok () if valid, Error message if invalid. *) 7 + val validate_media_query_strict : string -> (unit, string) result 8 + 5 9 (** Media query validator. 6 10 7 11 Validates CSS media queries used in media attributes and CSS @media rules.
+10 -2
lib/html5_checker/parse_error_bridge.ml
··· 11 11 Message.make_location ~line ~column ?system_id () 12 12 in 13 13 let code_str = Html5rw.Parse_error_code.to_string code in 14 + let message = match code with 15 + | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> 16 + "Self-closing syntax (\"/>\") used on a non-void HTML element. Ignoring the slash and treating as a start tag." 17 + | _ -> Printf.sprintf "Parse error: %s" code_str 18 + in 14 19 Message.error 15 - ~message:(Printf.sprintf "Parse error: %s" code_str) 20 + ~message 16 21 ~code:code_str 17 22 ~location 18 23 () ··· 25 30 in 26 31 let filtered_errors = 27 32 if is_xhtml then 28 - (* XHTML doesn't require DOCTYPE - filter that error *) 33 + (* XHTML has different requirements than HTML: 34 + - No DOCTYPE required 35 + - Self-closing syntax is valid for all elements *) 29 36 List.filter (fun err -> 30 37 match Html5rw.error_code err with 31 38 | Html5rw.Parse_error_code.Tree_construction_error "expected-doctype-but-got-other" -> false 39 + | Html5rw.Parse_error_code.Non_void_html_element_start_tag_with_trailing_solidus -> false 32 40 | _ -> true 33 41 ) errors 34 42 else errors
+4 -14
lib/html5_checker/semantic/id_checker.ml
··· 6 6 - ID values conform to HTML5 requirements *) 7 7 8 8 (** Location information for ID occurrences. *) 9 - type id_location = { 10 - element : string; 11 - location : Message.location option; 12 - } 9 + type id_location = unit (* simplified since we only need to track existence *) 13 10 14 11 (** Information about an ID reference. *) 15 12 type id_reference = { ··· 120 117 () 121 118 (* Check for duplicate ID *) 122 119 else if Hashtbl.mem state.ids id then 123 - let first_occurrence = Hashtbl.find state.ids id in 124 - let first_loc_str = match first_occurrence.location with 125 - | None -> "" 126 - | Some loc -> Printf.sprintf " at line %d, column %d" loc.line loc.column 127 - in 128 120 Message_collector.add_error collector 129 - ~message:(Printf.sprintf 130 - "Duplicate ID '%s': first used on <%s>%s, now on <%s>" 131 - id first_occurrence.element first_loc_str element) 121 + ~message:(Printf.sprintf "Duplicate ID \xe2\x80\x9c%s\xe2\x80\x9d." id) 132 122 ~code:"duplicate-id" 133 123 ?location 134 124 ~element ··· 136 126 () 137 127 else 138 128 (* Store the ID *) 139 - Hashtbl.add state.ids id { element; location } 129 + Hashtbl.add state.ids id () 140 130 141 131 (** Record a single ID reference. *) 142 132 let add_reference state ~referring_element ~attribute ~referenced_id ~location = ··· 181 171 | "name" when element = "map" -> 182 172 (* Track map name attributes for usemap resolution *) 183 173 if String.length value > 0 then 184 - Hashtbl.add state.map_names value { element; location } 174 + Hashtbl.add state.map_names value () 185 175 186 176 | attr when List.mem attr single_id_ref_attrs -> 187 177 add_reference state ~referring_element:element
+128 -3
lib/html5_checker/specialized/aria_checker.ml
··· 358 358 359 359 (** Stack node representing an element in the ancestor chain. *) 360 360 type stack_node = { 361 + element_name : string; 361 362 explicit_roles : string list; 362 363 implicit_role : string option; 363 364 } ··· 365 366 (** Checker state. *) 366 367 type state = { 367 368 mutable stack : stack_node list; 369 + mutable has_active_tab : bool; (* Whether document has role=tab with aria-selected=true *) 370 + mutable has_tabpanel : bool; (* Whether document has role=tabpanel elements *) 368 371 } 369 372 370 - let create () = { stack = [] } 373 + let create () = { stack = []; has_active_tab = false; has_tabpanel = false } 371 374 372 - let reset state = state.stack <- [] 375 + let reset state = 376 + state.stack <- []; 377 + state.has_active_tab <- false; 378 + state.has_tabpanel <- false 373 379 374 380 (** Check if any ancestor has one of the required roles. *) 375 381 let has_required_ancestor_role state required_roles = ··· 385 391 | None -> false 386 392 ) state.stack 387 393 394 + (** Get the first ancestor role from a list of target roles. *) 395 + let get_ancestor_role state target_roles = 396 + let rec find_in_stack = function 397 + | [] -> None 398 + | ancestor :: rest -> 399 + let found_explicit = List.find_opt (fun role -> List.mem role target_roles) ancestor.explicit_roles in 400 + match found_explicit with 401 + | Some r -> Some r 402 + | None -> 403 + match ancestor.implicit_role with 404 + | Some r when List.mem r target_roles -> Some r 405 + | _ -> find_in_stack rest 406 + in 407 + find_in_stack state.stack 408 + 409 + (** Get the immediate parent element name. *) 410 + let get_parent_element state = 411 + match state.stack with 412 + | parent :: _ -> Some parent.element_name 413 + | [] -> None 414 + 388 415 (** Render a list of roles as a human-readable string. *) 389 416 let render_role_set roles = 390 417 match roles with ··· 418 445 (* Get implicit role for this element *) 419 446 let implicit_role = get_implicit_role name_lower attrs in 420 447 448 + (* Track active tabs and tabpanel roles for end_document validation *) 449 + if List.mem "tab" explicit_roles then begin 450 + let aria_selected = List.assoc_opt "aria-selected" attrs in 451 + if aria_selected = Some "true" then state.has_active_tab <- true 452 + end; 453 + if List.mem "tabpanel" explicit_roles then state.has_tabpanel <- true; 454 + 421 455 (* Check br/wbr role restrictions - only none/presentation allowed *) 422 456 if (name_lower = "br" || name_lower = "wbr") && explicit_roles <> [] then begin 423 457 let first_role = List.hd explicit_roles in ··· 499 533 | _ -> () 500 534 end; 501 535 536 + (* Check for input[type=checkbox][role=button] requires aria-pressed *) 537 + if name_lower = "input" then begin 538 + let input_type = match List.assoc_opt "type" attrs with 539 + | Some t -> String.lowercase_ascii t 540 + | None -> "text" 541 + in 542 + if input_type = "checkbox" && List.mem "button" explicit_roles then begin 543 + let has_aria_pressed = List.assoc_opt "aria-pressed" attrs <> None in 544 + if not has_aria_pressed then 545 + Message_collector.add_error collector 546 + ~message:"An \xe2\x80\x9cinput\xe2\x80\x9d element with a \xe2\x80\x9ctype\xe2\x80\x9d attribute whose value is \xe2\x80\x9ccheckbox\xe2\x80\x9d and with a \xe2\x80\x9crole\xe2\x80\x9d attribute whose value is \xe2\x80\x9cbutton\xe2\x80\x9d must have an \xe2\x80\x9caria-pressed\xe2\x80\x9d attribute." 547 + ~code:"checkbox-button-needs-aria-pressed" 548 + ~element:name 549 + ~attribute:"role" 550 + () 551 + end 552 + end; 553 + 554 + (* Check li role restrictions in menu/menubar/tablist contexts *) 555 + if name_lower = "li" && explicit_roles <> [] then begin 556 + let first_role = List.hd explicit_roles in 557 + (* none/presentation are always allowed as they remove from accessibility tree *) 558 + if first_role <> "none" && first_role <> "presentation" then begin 559 + (* Check if in menu or menubar context *) 560 + (match get_ancestor_role state ["menu"; "menubar"] with 561 + | Some _ -> 562 + let valid_roles = ["group"; "menuitem"; "menuitemcheckbox"; "menuitemradio"; "separator"] in 563 + if not (List.mem first_role valid_roles) then 564 + Message_collector.add_error collector 565 + ~message:"An \xe2\x80\x9cli\xe2\x80\x9d element that is a descendant of a \xe2\x80\x9crole=menu\xe2\x80\x9d element or \xe2\x80\x9crole=menubar\xe2\x80\x9d element must not have any \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9cgroup\xe2\x80\x9d, \xe2\x80\x9cmenuitem\xe2\x80\x9d, \xe2\x80\x9cmenuitemcheckbox\xe2\x80\x9d, \xe2\x80\x9cmenuitemradio\xe2\x80\x9d, or \xe2\x80\x9cseparator\xe2\x80\x9d." 566 + ~code:"invalid-li-role-in-menu" 567 + ~element:name 568 + ~attribute:"role" 569 + () 570 + | None -> 571 + (* Check if in tablist context *) 572 + match get_ancestor_role state ["tablist"] with 573 + | Some _ -> 574 + if first_role <> "tab" then 575 + Message_collector.add_error collector 576 + ~message:"An \xe2\x80\x9cli\xe2\x80\x9d element that is a descendant of a \xe2\x80\x9crole=tablist\xe2\x80\x9d element must not have any \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9ctab\xe2\x80\x9d." 577 + ~code:"invalid-li-role-in-tablist" 578 + ~element:name 579 + ~attribute:"role" 580 + () 581 + | None -> ()) 582 + end 583 + end; 584 + 502 585 (* Check for aria-hidden="true" on body element *) 503 586 if name_lower = "body" then begin 504 587 let aria_hidden = List.assoc_opt "aria-hidden" attrs in ··· 640 723 | None -> () 641 724 ) attrs; 642 725 726 + (* Check summary restrictions in details context *) 727 + if name_lower = "summary" then begin 728 + let parent = get_parent_element state in 729 + let is_in_details = parent = Some "details" in 730 + if is_in_details then begin 731 + (* summary that is the first child of details *) 732 + (* Cannot have role=paragraph (or other non-button roles) *) 733 + if explicit_roles <> [] then begin 734 + let first_role = List.hd explicit_roles in 735 + if first_role <> "button" && first_role <> "none" && first_role <> "presentation" then 736 + Message_collector.add_error collector 737 + ~message:"The \xe2\x80\x9crole\xe2\x80\x9d attribute must not be used on any \xe2\x80\x9csummary\xe2\x80\x9d element that is a summary for its parent \xe2\x80\x9cdetails\xe2\x80\x9d element." 738 + ~code:"invalid-role-on-summary" 739 + ~element:name 740 + ~attribute:"role" 741 + () 742 + end; 743 + (* If has aria-expanded or aria-pressed, must have role *) 744 + let has_aria_expanded = List.assoc_opt "aria-expanded" attrs <> None in 745 + let has_aria_pressed = List.assoc_opt "aria-pressed" attrs <> None in 746 + if (has_aria_expanded || has_aria_pressed) && explicit_roles = [] then begin 747 + if has_aria_pressed then 748 + Message_collector.add_error collector 749 + ~message:"Element \xe2\x80\x9csummary\xe2\x80\x9d is missing required attribute \xe2\x80\x9crole\xe2\x80\x9d." 750 + ~code:"missing-role-on-summary" 751 + ~element:name () 752 + else 753 + Message_collector.add_error collector 754 + ~message:"Element \xe2\x80\x9csummary\xe2\x80\x9d is missing one or more of the following attributes: [aria-checked, aria-level, role]." 755 + ~code:"missing-role-on-summary" 756 + ~element:name () 757 + end 758 + end 759 + end; 760 + 643 761 (* Push current element onto stack *) 644 762 let node = { 763 + element_name = name_lower; 645 764 explicit_roles; 646 765 implicit_role; 647 766 } in ··· 659 778 660 779 let characters _state _text _collector = () 661 780 662 - let end_document _state _collector = () 781 + let end_document state collector = 782 + (* Check that active tabs have corresponding tabpanels *) 783 + if state.has_active_tab && not state.has_tabpanel then 784 + Message_collector.add_error collector 785 + ~message:"Every active \xe2\x80\x9crole=tab\xe2\x80\x9d element must have a corresponding \xe2\x80\x9crole=tabpanel\xe2\x80\x9d element." 786 + ~code:"tab-without-tabpanel" 787 + () 663 788 664 789 let checker = (module struct 665 790 type nonrec state = state
+53 -1
lib/html5_checker/specialized/attr_restrictions_checker.ml
··· 501 501 The HTML5 parser normalizes attribute names to lowercase, so this check 502 502 is only effective when the document is parsed as XML. 503 503 Commenting out until we have XML parsing support. *) 504 - ignore state.is_xhtml 504 + ignore state.is_xhtml; 505 + 506 + (* Validate media attribute on link, style, source elements *) 507 + if namespace = None && (name_lower = "link" || name_lower = "style" || name_lower = "source") then begin 508 + List.iter (fun (attr_name, attr_value) -> 509 + let attr_lower = String.lowercase_ascii attr_name in 510 + if attr_lower = "media" then begin 511 + let trimmed = String.trim attr_value in 512 + if trimmed <> "" then begin 513 + match Dt_media_query.validate_media_query_strict trimmed with 514 + | Ok () -> () 515 + | Error msg -> 516 + Message_collector.add_error collector 517 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s" 518 + attr_value attr_name name msg) 519 + ~code:"bad-attribute-value" 520 + ~element:name ~attribute:attr_name () 521 + end 522 + end 523 + ) attrs 524 + end; 525 + 526 + (* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *) 527 + if namespace = None then begin 528 + List.iter (fun (attr_name, attr_value) -> 529 + let attr_lower = String.lowercase_ascii attr_name in 530 + if attr_lower = "prefix" then begin 531 + (* Parse prefix attribute value - format: "prefix1: iri1 prefix2: iri2 ..." *) 532 + let trimmed = String.trim attr_value in 533 + if trimmed <> "" then begin 534 + (* Check for empty prefix (starts with : or has space:) *) 535 + if String.length trimmed > 0 && trimmed.[0] = ':' then 536 + Message_collector.add_error collector 537 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d." 538 + attr_value attr_name name) 539 + ~code:"bad-attribute-value" 540 + ~element:name ~attribute:attr_name () 541 + else begin 542 + (* Check for invalid prefix names - must start with letter or underscore *) 543 + let is_ncname_start c = 544 + (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_' 545 + in 546 + if String.length trimmed > 0 && not (is_ncname_start trimmed.[0]) then 547 + Message_collector.add_error collector 548 + ~message:(Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d." 549 + attr_value attr_name name) 550 + ~code:"bad-attribute-value" 551 + ~element:name ~attribute:attr_name () 552 + end 553 + end 554 + end 555 + ) attrs 556 + end 505 557 506 558 let end_element _state ~name:_ ~namespace:_ _collector = () 507 559 let characters _state _text _collector = ()
+17 -1
lib/html5_checker/specialized/dl_checker.ml
··· 56 56 | ctx :: _ -> Some ctx 57 57 | [] -> None 58 58 59 - let start_element state ~name ~namespace ~attrs:_ collector = 59 + let get_attr name attrs = 60 + List.find_map (fun (n, v) -> 61 + if String.lowercase_ascii n = name then Some v else None 62 + ) attrs 63 + 64 + let start_element state ~name ~namespace ~attrs collector = 60 65 let name_lower = String.lowercase_ascii name in 61 66 62 67 (* Track parent stack for all HTML elements first *) ··· 100 105 ~message:"Element \xe2\x80\x9cdiv\xe2\x80\x9d not allowed as child of element \xe2\x80\x9cdl\xe2\x80\x9d in this context. (Suppressing further errors from this subtree.)" 101 106 ~code:"disallowed-child" 102 107 ~element:"div" (); 108 + (* Check that role is only presentation or none *) 109 + (match get_attr "role" attrs with 110 + | Some role_value -> 111 + let role_lower = String.lowercase_ascii (String.trim role_value) in 112 + if role_lower <> "presentation" && role_lower <> "none" then 113 + Message_collector.add_error collector 114 + ~message:"A \xe2\x80\x9cdiv\xe2\x80\x9d child of a \xe2\x80\x9cdl\xe2\x80\x9d element must not have any \xe2\x80\x9crole\xe2\x80\x9d value other than \xe2\x80\x9cpresentation\xe2\x80\x9d or \xe2\x80\x9cnone\xe2\x80\x9d." 115 + ~code:"invalid-role-on-div-in-dl" 116 + ~element:"div" 117 + ~attribute:"role" () 118 + | None -> ()); 103 119 let div_ctx = { has_dt = false; has_dd = false; group_count = 0; in_dd_part = false } in 104 120 state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack 105 121 | Some _ when state.div_in_dl_stack <> [] ->
+71 -10
lib/html5_checker/specialized/label_checker.ml
··· 12 12 if String.lowercase_ascii n = name_lower then Some v else None 13 13 ) attrs 14 14 15 + type label_for_info = { 16 + for_target : string; 17 + has_role : bool; 18 + has_aria_label : bool; 19 + } 20 + 15 21 type state = { 16 22 mutable in_label : bool; 17 23 mutable label_depth : int; 18 24 mutable labelable_count : int; 19 25 mutable label_for_value : string option; (* Value of for attribute on current label *) 26 + mutable label_has_role : bool; (* Whether current label has role attribute *) 27 + mutable label_has_aria_label : bool; (* Whether current label has aria-label attribute *) 28 + mutable labels_for : label_for_info list; (* Labels with for= attribute *) 29 + mutable labelable_ids : string list; (* IDs of labelable elements *) 20 30 } 21 31 22 32 let create () = { ··· 24 34 label_depth = 0; 25 35 labelable_count = 0; 26 36 label_for_value = None; 37 + label_has_role = false; 38 + label_has_aria_label = false; 39 + labels_for = []; 40 + labelable_ids = []; 27 41 } 28 42 29 43 let reset state = 30 44 state.in_label <- false; 31 45 state.label_depth <- 0; 32 46 state.labelable_count <- 0; 33 - state.label_for_value <- None 47 + state.label_for_value <- None; 48 + state.label_has_role <- false; 49 + state.label_has_aria_label <- false; 50 + state.labels_for <- []; 51 + state.labelable_ids <- [] 34 52 35 53 let start_element state ~name ~namespace ~attrs collector = 36 54 if namespace <> None then () ··· 39 57 40 58 if name_lower = "label" then begin 41 59 state.in_label <- true; 42 - state.label_depth <- 0; 60 + state.label_depth <- 1; (* Start at 1 for the label element itself *) 43 61 state.labelable_count <- 0; 44 - state.label_for_value <- get_attr attrs "for" 62 + let for_value = get_attr attrs "for" in 63 + let has_role = get_attr attrs "role" <> None in 64 + let has_aria_label = get_attr attrs "aria-label" <> None in 65 + state.label_for_value <- for_value; 66 + state.label_has_role <- has_role; 67 + state.label_has_aria_label <- has_aria_label; 68 + (* Track this label if it has for= and role/aria-label *) 69 + (match for_value with 70 + | Some target when has_role || has_aria_label -> 71 + state.labels_for <- { for_target = target; has_role; has_aria_label } :: state.labels_for 72 + | _ -> ()) 45 73 end; 74 + (* Track labelable element IDs *) 75 + (if List.mem name_lower labelable_elements then 76 + match get_attr attrs "id" with 77 + | Some id -> state.labelable_ids <- id :: state.labelable_ids 78 + | None -> ()); 46 79 47 - if state.in_label then begin 80 + if state.in_label && name_lower <> "label" then begin 48 81 state.label_depth <- state.label_depth + 1; 49 82 50 83 (* Check for labelable elements inside label *) ··· 57 90 ~element:"label" (); 58 91 59 92 (* Check if label has for attribute and descendant has mismatched id *) 60 - match state.label_for_value with 93 + (match state.label_for_value with 61 94 | Some for_value -> 62 95 let descendant_id = get_attr attrs "id" in 63 96 (match descendant_id with ··· 78 111 ()) 79 112 | None -> 80 113 (* No for attribute on label - no constraint on descendant id *) 81 - () 114 + ()) 82 115 end 83 116 end 84 117 end 85 118 86 - let end_element state ~name ~namespace _collector = 119 + let end_element state ~name ~namespace collector = 87 120 if namespace <> None then () 88 121 else begin 89 122 let name_lower = String.lowercase_ascii name in ··· 91 124 if state.in_label then begin 92 125 state.label_depth <- state.label_depth - 1; 93 126 94 - if name_lower = "label" && state.label_depth < 0 then begin 127 + if name_lower = "label" && state.label_depth = 0 then begin 128 + (* Check for role attribute on label that's ancestor of labelable element *) 129 + if state.label_has_role && state.labelable_count > 0 then 130 + Message_collector.add_error collector 131 + ~message:"The \xe2\x80\x9crole\xe2\x80\x9d attribute must not be used on any \xe2\x80\x9clabel\xe2\x80\x9d element that is an ancestor of a labelable element." 132 + ~code:"role-on-label-ancestor" 133 + ~element:"label" 134 + ~attribute:"role" (); 135 + 95 136 state.in_label <- false; 96 137 state.labelable_count <- 0; 97 - state.label_for_value <- None 138 + state.label_for_value <- None; 139 + state.label_has_role <- false; 140 + state.label_has_aria_label <- false 98 141 end 99 142 end 100 143 end 101 144 102 145 let characters _state _text _collector = () 103 146 104 - let end_document _state _collector = () 147 + let end_document state collector = 148 + (* Check labels with for= that target labelable elements *) 149 + List.iter (fun label_info -> 150 + if List.mem label_info.for_target state.labelable_ids then begin 151 + (* This label is associated with a labelable element *) 152 + if label_info.has_role then 153 + Message_collector.add_error collector 154 + ~message:"The \xe2\x80\x9crole\xe2\x80\x9d attribute must not be used on any \xe2\x80\x9clabel\xe2\x80\x9d element that is associated with a labelable element." 155 + ~code:"role-on-label-for" 156 + ~element:"label" 157 + ~attribute:"role" (); 158 + if label_info.has_aria_label then 159 + Message_collector.add_error collector 160 + ~message:"The \xe2\x80\x9caria-label\xe2\x80\x9d attribute must not be used on any \xe2\x80\x9clabel\xe2\x80\x9d element that is associated with a labelable element." 161 + ~code:"aria-label-on-label-for" 162 + ~element:"label" 163 + ~attribute:"aria-label" () 164 + end 165 + ) state.labels_for 105 166 106 167 let checker = 107 168 (module struct
+51 -2
lib/html5_checker/specialized/microdata_checker.ml
··· 67 67 let is_url s = 68 68 String.contains s ':' 69 69 70 + (** Validate that a URL is a valid absolute URL for itemtype. 71 + itemtype must be an absolute URL per the HTML5 spec. 72 + http/https URLs require :// but other schemes like mailto:, data:, javascript: don't. *) 73 + let validate_itemtype_url url = 74 + let url = String.trim url in 75 + if String.length url = 0 then 76 + Error "itemtype must not be empty" 77 + else 78 + match String.index_opt url ':' with 79 + | None -> Error "Expected a slash (\"/\")." 80 + | Some colon_pos -> 81 + if colon_pos = 0 then 82 + Error "Expected a slash (\"/\")." 83 + else 84 + let scheme = String.lowercase_ascii (String.sub url 0 colon_pos) in 85 + (* Schemes that require :// for itemtype validation 86 + Note: The Nu validator only enforces :// for http, https, and ftp *) 87 + let special_schemes = [ 88 + "http"; "https"; "ftp" 89 + ] in 90 + if List.mem scheme special_schemes then begin 91 + if colon_pos + 2 >= String.length url then 92 + Error "Expected a slash (\"/\")." 93 + else if url.[colon_pos + 1] <> '/' || url.[colon_pos + 2] <> '/' then 94 + Error "Expected a slash (\"/\")." 95 + else 96 + Ok () 97 + end else 98 + (* Other schemes (mailto:, data:, javascript:, etc.) are valid as-is *) 99 + Ok () 100 + 70 101 (** Check if itemprop value is valid. *) 71 102 let validate_itemprop_value value = 72 103 if String.length value = 0 then ··· 139 170 | None -> () 140 171 end; 141 172 142 - (* Check itemtype requires itemscope *) 173 + (* Check itemtype requires itemscope and is valid URL *) 143 174 begin match itemtype_opt with 144 - | Some _itemtype -> 175 + | Some itemtype -> 145 176 if not has_itemscope then 146 177 Message_collector.add_error collector 147 178 ~message:"itemtype attribute requires itemscope attribute" ··· 150 181 ~element 151 182 ~attribute:"itemtype" 152 183 () 184 + else begin 185 + (* Validate each itemtype URL (can be space-separated) *) 186 + let types = split_whitespace itemtype in 187 + List.iter (fun url -> 188 + match validate_itemtype_url url with 189 + | Ok () -> () 190 + | Error msg -> 191 + Message_collector.add_error collector 192 + ~message:(Printf.sprintf 193 + "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9citemtype\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad absolute URL: %s" 194 + url element msg) 195 + ~code:"microdata-invalid-itemtype" 196 + ?location 197 + ~element 198 + ~attribute:"itemtype" 199 + () 200 + ) types 201 + end 153 202 | None -> () 154 203 end; 155 204
+5 -2
lib/html5rw/parser/parser_tree_builder.ml
··· 1428 1428 | Token.Tag { kind = Token.Start; name; _ } 1429 1429 when List.mem name ["caption"; "colgroup"; "head"; "tbody"; "td"; "tfoot"; "th"; "thead"; "tr"] -> 1430 1430 parse_error t "unexpected-start-tag" 1431 - | Token.Tag { kind = Token.Start; name; attrs; _ } -> 1431 + | Token.Tag { kind = Token.Start; name; attrs; self_closing } -> 1432 1432 (* Any other start tag *) 1433 1433 reconstruct_active_formatting t; 1434 - ignore (insert_element t name ~push:true attrs) 1434 + ignore (insert_element t name ~push:true attrs); 1435 + (* Check for self-closing on non-void HTML element *) 1436 + if self_closing && not (List.mem name Parser_constants.void_elements) then 1437 + parse_error t "non-void-html-element-start-tag-with-trailing-solidus" 1435 1438 | Token.Tag { kind = Token.End; name; _ } -> 1436 1439 (* Any other end tag *) 1437 1440 let rec check = function
+17
test/debug_check.ml
··· 1 + let () = 2 + let test_file = "validator/tests/html/microdata/itemtype/scheme-https-no-slash-novalid.html" in 3 + let ic = open_in test_file in 4 + let html = really_input_string ic (in_channel_length ic) in 5 + close_in ic; 6 + let reader = Bytesrw.Bytes.Reader.of_string html in 7 + let result = Html5_checker.check ~collect_parse_errors:true ~system_id:test_file reader in 8 + let errors = Html5_checker.errors result in 9 + let warnings = Html5_checker.warnings result in 10 + print_endline "=== Errors ==="; 11 + List.iter (fun e -> print_endline e.Html5_checker.Message.message) errors; 12 + print_endline "=== Warnings ==="; 13 + List.iter (fun e -> print_endline e.Html5_checker.Message.message) warnings; 14 + if List.length errors > 0 then 15 + print_endline "PASS (has errors)" 16 + else 17 + print_endline "FAIL (no errors)"