commit 779224385286e0247336e92e206fbe591bf96039 · anil.recoil.org/ocaml-html5rw

+1

dune-project

··· 25 25 (uutf (>= 1.0.0)) 26 26 (uuuu (>= 0.3.0)) 27 27 (uunf (>= 15.0.0)) 28 + (uucp (>= 15.0.0)) 28 29 (xmlm (>= 1.4.0)) 29 30 langdetect 30 31 (odoc :with-doc)

+1

html5rw.opam

··· 16 16 "uutf" {>= "1.0.0"} 17 17 "uuuu" {>= "0.3.0"} 18 18 "uunf" {>= "15.0.0"} 19 + "uucp" {>= "15.0.0"} 19 20 "xmlm" {>= "1.4.0"} 20 21 "langdetect" 21 22 "odoc" {with-doc}

+1 -1

lib/check/checker_registry.ml

··· 22 22 Hashtbl.replace reg "source" Source_checker.checker; 23 23 Hashtbl.replace reg "label" Label_checker.checker; 24 24 Hashtbl.replace reg "ruby" Ruby_checker.checker; 25 - Hashtbl.replace reg "h1" H1_checker.checker; 25 + Hashtbl.replace reg "heading" Heading_checker.checker; 26 26 Hashtbl.replace reg "srcset-sizes" Srcset_sizes_checker.checker; 27 27 Hashtbl.replace reg "autofocus" Autofocus_checker.checker; 28 28 Hashtbl.replace reg "option" Option_checker.checker;

+26

lib/check/datatype/datatype.ml

··· 12 12 13 13 (* Helper utilities *) 14 14 15 + (** Character predicates *) 16 + 15 17 let is_whitespace = function 16 18 | ' ' | '\t' | '\n' | '\r' | '\012' (* FF *) -> true 17 19 | _ -> false 18 20 19 21 let is_ascii_digit = function '0' .. '9' -> true | _ -> false 20 22 23 + let is_lower_alpha = function 'a' .. 'z' -> true | _ -> false 24 + 25 + let is_upper_alpha = function 'A' .. 'Z' -> true | _ -> false 26 + 27 + let is_alpha c = is_lower_alpha c || is_upper_alpha c 28 + 29 + let is_alphanumeric c = is_alpha c || is_ascii_digit c 30 + 31 + let is_hex_digit = function 32 + | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true 33 + | _ -> false 34 + 35 + (** Case conversion *) 36 + 21 37 let to_ascii_lowercase c = 22 38 match c with 'A' .. 'Z' -> Char.chr (Char.code c + 32) | _ -> c 23 39 24 40 let string_to_ascii_lowercase s = 25 41 String.map to_ascii_lowercase s 42 + 43 + (** String predicates *) 44 + 45 + let is_non_empty s = String.trim s <> "" 46 + 47 + let is_all_digits s = String.length s > 0 && String.for_all is_ascii_digit s 48 + 49 + let is_all_alpha s = String.length s > 0 && String.for_all is_alpha s 50 + 51 + let is_all_alphanumeric s = String.length s > 0 && String.for_all is_alphanumeric s 26 52 27 53 let trim_html_spaces s = 28 54 let len = String.length s in

+36 -1

lib/check/datatype/datatype.mli

··· 27 27 (** Check if a value is valid *) 28 28 val is_valid : t -> string -> bool 29 29 30 - (** Helper utilities for implementing datatype validators. *) 30 + (** {1 Helper utilities for implementing datatype validators} *) 31 + 32 + (** {2 Character predicates} *) 31 33 32 34 (** Check if a character is HTML5 whitespace (space, tab, LF, FF, or CR). *) 33 35 val is_whitespace : char -> bool ··· 35 37 (** Check if a character is an ASCII digit (0-9). *) 36 38 val is_ascii_digit : char -> bool 37 39 40 + (** Check if a character is a lowercase ASCII letter (a-z). *) 41 + val is_lower_alpha : char -> bool 42 + 43 + (** Check if a character is an uppercase ASCII letter (A-Z). *) 44 + val is_upper_alpha : char -> bool 45 + 46 + (** Check if a character is an ASCII letter (a-z or A-Z). *) 47 + val is_alpha : char -> bool 48 + 49 + (** Check if a character is an ASCII letter or digit. *) 50 + val is_alphanumeric : char -> bool 51 + 52 + (** Check if a character is a hexadecimal digit (0-9, a-f, A-F). *) 53 + val is_hex_digit : char -> bool 54 + 55 + (** {2 Case conversion} *) 56 + 38 57 (** Convert an ASCII character to lowercase. *) 39 58 val to_ascii_lowercase : char -> char 40 59 41 60 (** Convert an ASCII string to lowercase. *) 42 61 val string_to_ascii_lowercase : string -> string 62 + 63 + (** {2 String predicates} *) 64 + 65 + (** Check if a string has non-whitespace content after trimming. *) 66 + val is_non_empty : string -> bool 67 + 68 + (** Check if all characters in a non-empty string are ASCII digits. *) 69 + val is_all_digits : string -> bool 70 + 71 + (** Check if all characters in a non-empty string are ASCII letters. *) 72 + val is_all_alpha : string -> bool 73 + 74 + (** Check if all characters in a non-empty string are ASCII letters or digits. *) 75 + val is_all_alphanumeric : string -> bool 76 + 77 + (** {2 String manipulation} *) 43 78 44 79 (** Trim HTML5 whitespace from both ends of a string. *) 45 80 val trim_html_spaces : string -> string

+6 -8

lib/check/datatype/dt_autocomplete.ml

··· 4 4 let is_whitespace = Datatype.is_whitespace 5 5 let to_ascii_lowercase = Datatype.to_ascii_lowercase 6 6 7 + (* Use Astring for string operations *) 8 + let is_prefix = Astring.String.is_prefix 9 + 7 10 (** Trim whitespace from string and collapse internal whitespace *) 8 11 let trim_whitespace s = 9 12 let s = String.trim s in ··· 104 107 (** Split string on whitespace - uses shared utility *) 105 108 let split_on_whitespace = Datatype.split_on_whitespace 106 109 107 - (** Check if string starts with prefix *) 108 - let starts_with s prefix = 109 - String.length s >= String.length prefix 110 - && String.sub s 0 (String.length prefix) = prefix 111 - 112 110 (** Validate detail tokens *) 113 111 let check_tokens tokens = 114 112 let tokens = ref tokens in ··· 116 114 117 115 (* Check for section-* *) 118 116 (match !tokens with 119 - | token :: rest when starts_with token "section-" -> 117 + | token :: rest when is_prefix ~affix:"section-" token -> 120 118 tokens := rest 121 119 | _ -> ()); 122 120 ··· 145 143 146 144 (* Check if any token in the list is a section-* indicator *) 147 145 let find_section tokens = 148 - List.find_opt (fun t -> starts_with t "section-") tokens 146 + List.find_opt (fun t -> is_prefix ~affix:"section-" t) tokens 149 147 in 150 148 151 149 (* Check if webauthn appears anywhere except as the very last token *) ··· 207 205 (Printf.sprintf 208 206 "The token \"%s\" must only appear before any autofill field names." 209 207 token) 210 - | token :: _ when starts_with token "section-" -> 208 + | token :: _ when is_prefix ~affix:"section-" token -> 211 209 Error 212 210 "A \"section-*\" indicator must only appear as the first token in a \ 213 211 list of autofill detail tokens."

+3 -6

lib/check/datatype/dt_charset.ml

··· 1 1 (** Helper functions for charset validation *) 2 2 3 3 let is_valid_charset_char c = 4 - (c >= '0' && c <= '9') || 5 - (c >= 'a' && c <= 'z') || 6 - (c >= 'A' && c <= 'Z') || 4 + Datatype.is_alphanumeric c || 7 5 c = '-' || c = '!' || c = '#' || c = '$' || c = '%' || c = '&' || 8 6 c = '\'' || c = '+' || c = '_' || c = '`' || c = '{' || c = '}' || 9 7 c = '~' || c = '^' 10 8 11 - let to_lower s = String.lowercase_ascii s 9 + let to_lower = Datatype.string_to_ascii_lowercase 12 10 13 11 (** Common encoding labels recognized by WHATWG Encoding Standard. 14 12 This is a subset of the full list. *) ··· 74 72 module Meta_charset = struct 75 73 let name = "legacy character encoding declaration" 76 74 77 - let is_whitespace c = 78 - c = ' ' || c = '\t' || c = '\n' || c = '\012' || c = '\r' 75 + let is_whitespace = Datatype.is_whitespace 79 76 80 77 let validate s = 81 78 let lower = to_lower s in

+1 -2

lib/check/datatype/dt_color.ml

··· 154 154 ] 155 155 156 156 (** Check if character is hex digit *) 157 - let is_hex_digit c = 158 - (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') 157 + let is_hex_digit = Datatype.is_hex_digit 159 158 160 159 (** Validate hex color (#RGB or #RRGGBB) *) 161 160 let validate_hex_color s =

+1 -4

lib/check/datatype/dt_datetime.ml

··· 1 1 (** Helper functions for datetime validation *) 2 2 3 - let is_digit c = c >= '0' && c <= '9' 4 - 5 - let is_all_digits s = 6 - String.for_all is_digit s 3 + let is_all_digits = Datatype.is_all_digits 7 4 8 5 let parse_int s = 9 6 try Some (int_of_string s)

+1 -3

lib/check/datatype/dt_email.ml

··· 2 2 3 3 (** Helper to check if a character is valid in email local/domain parts *) 4 4 let is_email_char c = 5 - (c >= 'a' && c <= 'z') 6 - || (c >= 'A' && c <= 'Z') 7 - || (c >= '0' && c <= '9') 5 + Datatype.is_alphanumeric c 8 6 || c = '.' || c = '-' || c = '_' || c = '+' || c = '=' 9 7 10 8 (** Validate a single email address using simplified rules *)

+4 -17

lib/check/datatype/dt_language.ml

··· 2 2 3 3 let q = Error_code.q 4 4 5 - let is_lower_alpha c = c >= 'a' && c <= 'z' 6 - let is_upper_alpha c = c >= 'A' && c <= 'Z' 7 - let is_alpha c = is_lower_alpha c || is_upper_alpha c 8 - let is_digit c = c >= '0' && c <= '9' 9 - let is_alphanumeric c = is_alpha c || is_digit c 10 - 11 - let is_all_alpha s = 12 - String.for_all is_alpha s 13 - 14 - let _is_all_digits s = 15 - String.for_all is_digit s 16 - 17 - let is_all_alphanumeric s = 18 - String.for_all is_alphanumeric s 19 - 20 - let to_lower s = 21 - String.lowercase_ascii s 5 + (* Use shared character predicates from Datatype *) 6 + let is_all_alpha = Datatype.is_all_alpha 7 + let is_all_alphanumeric = Datatype.is_all_alphanumeric 8 + let to_lower = Datatype.string_to_ascii_lowercase 22 9 23 10 (** Valid extlang subtags per IANA language-subtag-registry. 24 11 Extlangs are 3-letter subtags that follow the primary language.

+20 -26

lib/check/datatype/dt_media_query.ml

··· 70 70 (** Media query keywords (unused but kept for documentation) *) 71 71 let _media_keywords = [ "and"; "not"; "only" ] 72 72 73 - (** Check if character is whitespace *) 74 - let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' 73 + let is_whitespace = Datatype.is_whitespace 75 74 76 75 (** Check if character can start an identifier *) 77 76 let is_ident_start c = 78 - (c >= 'a' && c <= 'z') 79 - || (c >= 'A' && c <= 'Z') 80 - || c = '_' || c = '-' || Char.code c >= 128 77 + Datatype.is_alpha c || c = '_' || c = '-' || Char.code c >= 128 81 78 82 79 (** Check if character can be in an identifier *) 83 80 let is_ident_char c = 84 - is_ident_start c || (c >= '0' && c <= '9') 81 + is_ident_start c || Datatype.is_ascii_digit c 85 82 86 - (** Unicode case-fold for Turkish dotted-I etc *) 87 - let lowercase_unicode s = 88 - (* Handle special case: U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE -> i *) 83 + (** Unicode case folding for case-insensitive comparison. 84 + 85 + Uses the Uucp library for proper Unicode case folding, which handles 86 + special cases like Turkish dotted-I (U+0130 -> 'i' + U+0307) correctly. *) 87 + let case_fold s = 89 88 let buf = Buffer.create (String.length s) in 90 - let i = ref 0 in 91 - while !i < String.length s do 92 - let c = s.[!i] in 93 - if c = '\xc4' && !i + 1 < String.length s && s.[!i + 1] = '\xb0' then begin 94 - (* U+0130 -> 'i' + U+0307 (combining dot above), but for simplicity just 'i' followed by U+0307 *) 95 - Buffer.add_string buf "i\xcc\x87"; 96 - i := !i + 2 97 - end else begin 98 - Buffer.add_char buf (Char.lowercase_ascii c); 99 - incr i 100 - end 101 - done; 89 + let add_uchar u = Uutf.Buffer.add_utf_8 buf u in 90 + let fold_char () _pos = function 91 + | `Malformed _ -> () (* Skip malformed sequences *) 92 + | `Uchar u -> 93 + match Uucp.Case.Fold.fold u with 94 + | `Self -> add_uchar u 95 + | `Uchars us -> List.iter add_uchar us 96 + in 97 + Uutf.String.fold_utf_8 fold_char () s; 102 98 Buffer.contents buf 103 99 104 100 (** Check balanced parentheses *) ··· 222 218 match read_ident () with 223 219 | None -> Error "Parse Error." 224 220 | Some media_type -> 225 - let mt_lower = lowercase_unicode media_type in 221 + let mt_lower = case_fold media_type in 226 222 (* Check for deprecated media type *) 227 223 if List.mem mt_lower deprecated_media_types then 228 224 Error (Printf.sprintf "The media \"%s\" has been deprecated" mt_lower) ··· 341 337 if List.mem feature length_features then begin 342 338 (* Must be a valid length: number followed by unit *) 343 339 let value = String.trim value in 344 - let is_digit c = c >= '0' && c <= '9' in 345 340 346 341 (* Parse number - includes sign, digits, and decimal point *) 347 342 let i = ref 0 in 348 343 let len = String.length value in 349 - while !i < len && (is_digit value.[!i] || value.[!i] = '.' || value.[!i] = '-' || value.[!i] = '+') do 344 + while !i < len && (Datatype.is_ascii_digit value.[!i] || value.[!i] = '.' || value.[!i] = '-' || value.[!i] = '+') do 350 345 incr i 351 346 done; 352 347 let num_part = String.sub value 0 !i in ··· 377 372 end else if List.mem feature color_features then begin 378 373 (* Must be an integer *) 379 374 let value = String.trim value in 380 - let is_digit c = c >= '0' && c <= '9' in 381 - if String.length value > 0 && String.for_all is_digit value then Ok () 375 + if Datatype.is_all_digits value then Ok () 382 376 else 383 377 Error (Printf.sprintf "\"%s\" is not a \"%s\" value" value base_feature) 384 378 end else

+1 -2

lib/check/datatype/dt_mime.ml

··· 1 1 (** MIME type validation based on RFC 2045 and HTML5 spec *) 2 2 3 - (** Check if character is whitespace *) 4 - let is_whitespace c = c = ' ' || c = '\t' || c = '\n' || c = '\r' 3 + let is_whitespace = Datatype.is_whitespace 5 4 6 5 (** Check if character is a token character (RFC 2045) *) 7 6 let is_token_char c =

+1 -1

lib/check/dune

··· 3 3 (library 4 4 (name htmlrw_check) 5 5 (public_name html5rw.check) 6 - (libraries html5rw jsont jsont.bytesrw astring str uunf uutf xmlm langdetect)) 6 + (libraries html5rw jsont jsont.bytesrw astring str uunf uucp uutf xmlm langdetect))

-37

lib/check/specialized/h1_checker.ml

··· 1 - (** H1 element counter - warns about multiple h1 elements in a document. *) 2 - 3 - type state = { 4 - mutable h1_count : int; 5 - mutable svg_depth : int; (* Track depth inside SVG *) 6 - } 7 - 8 - let create () = { 9 - h1_count = 0; 10 - svg_depth = 0; 11 - } 12 - 13 - let reset state = 14 - state.h1_count <- 0; 15 - state.svg_depth <- 0 16 - 17 - let start_element state ~element collector = 18 - (* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *) 19 - match element.Element.tag with 20 - | Tag.Svg _ -> 21 - state.svg_depth <- state.svg_depth + 1 22 - | Tag.Html `H1 when state.svg_depth = 0 -> 23 - state.h1_count <- state.h1_count + 1; 24 - if state.h1_count > 1 then 25 - Message_collector.add_typed collector (`Misc `Multiple_h1) 26 - | Tag.Html _ when state.svg_depth = 0 -> 27 - () (* Other HTML elements outside SVG *) 28 - | _ -> 29 - () (* Non-HTML or inside SVG *) 30 - 31 - let end_element state ~tag _collector = 32 - match tag with 33 - | Tag.Svg _ when state.svg_depth > 0 -> 34 - state.svg_depth <- state.svg_depth - 1 35 - | _ -> () 36 - 37 - let checker = Checker.make ~create ~reset ~start_element ~end_element ()

-16

lib/check/specialized/h1_checker.mli

··· 1 - (** H1 element counter checker. 2 - 3 - This checker validates that documents don't have multiple h1 elements, 4 - which can confuse document structure and accessibility tools. 5 - 6 - {2 Validation Rules} 7 - 8 - - Documents should have at most one [<h1>] element 9 - - [<h1>] elements inside SVG content (foreignObject, desc) are not counted 10 - 11 - {2 Error Messages} 12 - 13 - - [Multiple_h1]: Document contains more than one h1 element *) 14 - 15 - val checker : Checker.t 16 - (** The H1 checker instance. *)

+19 -107

lib/check/specialized/heading_checker.ml

··· 1 1 (** Heading structure validation checker. 2 2 3 3 This checker validates that: 4 - - Heading levels don't skip (e.g., h1 to h3) 5 - - Documents have at least one heading 6 - - Multiple h1 usage is noted 7 - - Headings are not empty *) 4 + - Multiple h1 usage is reported as an error 5 + 6 + Note: Additional accessibility checks (first heading should be h1, skipped 7 + levels, empty headings) are intentionally not included as errors since they 8 + are recommendations rather than HTML5 spec requirements. *) 8 9 9 10 (** Checker state tracking heading structure. *) 10 11 type state = { 11 - mutable current_level : int option; 12 12 mutable h1_count : int; 13 - mutable has_any_heading : bool; 14 - mutable first_heading_checked : bool; 15 - mutable in_heading : Tag.html_tag option; 16 - mutable heading_has_text : bool; 17 13 mutable svg_depth : int; (* Track depth inside SVG - headings in SVG don't count *) 18 14 } 19 15 20 - let create () = 21 - { 22 - current_level = None; 23 - h1_count = 0; 24 - has_any_heading = false; 25 - first_heading_checked = false; 26 - in_heading = None; 27 - heading_has_text = false; 28 - svg_depth = 0; 29 - } 16 + let create () = { 17 + h1_count = 0; 18 + svg_depth = 0; 19 + } 30 20 31 21 let reset state = 32 - state.current_level <- None; 33 22 state.h1_count <- 0; 34 - state.has_any_heading <- false; 35 - state.first_heading_checked <- false; 36 - state.in_heading <- None; 37 - state.heading_has_text <- false; 38 23 state.svg_depth <- 0 39 24 40 - (** Check if text is effectively empty (only whitespace). *) 41 - let is_empty_text text = 42 - let rec check i = 43 - if i >= String.length text then 44 - true 45 - else 46 - match text.[i] with 47 - | ' ' | '\t' | '\n' | '\r' -> check (i + 1) 48 - | _ -> false 49 - in 50 - check 0 51 - 52 25 let start_element state ~element collector = 53 26 match element.Element.tag with 54 27 | Tag.Svg _ -> 55 - (* Track SVG depth - headings inside SVG (foreignObject, desc) don't count *) 28 + (* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *) 56 29 state.svg_depth <- state.svg_depth + 1 57 - | Tag.Html (#Tag.heading_tag as h) when state.svg_depth = 0 -> 58 - let level = match Tag.heading_level h with Some l -> l | None -> 0 in 59 - let name = Tag.html_tag_to_string h in 60 - state.has_any_heading <- true; 61 - 62 - (* Check if this is the first heading *) 63 - if not state.first_heading_checked then begin 64 - state.first_heading_checked <- true; 65 - if level <> 1 then 66 - Message_collector.add_typed collector 67 - (`Generic (Printf.sprintf 68 - "First heading in document is <%s>, should typically be <h1>" name)) 69 - end; 70 - 71 - (* Track h1 count *) 72 - if level = 1 then begin 73 - state.h1_count <- state.h1_count + 1; 74 - if state.h1_count > 1 then 75 - Message_collector.add_typed collector (`Misc `Multiple_h1) 76 - end; 77 - 78 - (* Check for skipped levels *) 79 - begin match state.current_level with 80 - | None -> 81 - state.current_level <- Some level 82 - | Some prev_level -> 83 - let diff = level - prev_level in 84 - if diff > 1 then 85 - Message_collector.add_typed collector 86 - (`Generic (Printf.sprintf 87 - "Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users" 88 - name prev_level (diff - 1) (if diff > 2 then "s" else ""))); 89 - state.current_level <- Some level 90 - end; 91 - 92 - (* Track that we're in a heading to check for empty content *) 93 - state.in_heading <- Some h; 94 - state.heading_has_text <- false 30 + | Tag.Html `H1 when state.svg_depth = 0 -> 31 + state.h1_count <- state.h1_count + 1; 32 + if state.h1_count > 1 then 33 + Message_collector.add_typed collector (`Misc `Multiple_h1) 95 34 | _ -> () 96 35 97 - let end_element state ~tag collector = 98 - (* Track SVG depth *) 99 - (match tag with 100 - | Tag.Svg _ when state.svg_depth > 0 -> 101 - state.svg_depth <- state.svg_depth - 1 102 - | _ -> ()); 103 - (* Check for empty headings *) 104 - match state.in_heading, tag with 105 - | Some h, Tag.Html h2 when h = h2 -> 106 - if not state.heading_has_text then 107 - Message_collector.add_typed collector 108 - (`Generic (Printf.sprintf 109 - "Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers" 110 - (Tag.html_tag_to_string h))); 111 - state.in_heading <- None; 112 - state.heading_has_text <- false 36 + let end_element state ~tag _collector = 37 + match tag with 38 + | Tag.Svg _ when state.svg_depth > 0 -> 39 + state.svg_depth <- state.svg_depth - 1 113 40 | _ -> () 114 41 115 - let characters state text _collector = 116 - (* If we're inside a heading, check if this text is non-whitespace *) 117 - match state.in_heading with 118 - | Some _ -> 119 - if not (is_empty_text text) then 120 - state.heading_has_text <- true 121 - | None -> 122 - () 123 - 124 - let end_document state collector = 125 - if not state.has_any_heading then 126 - Message_collector.add_typed collector 127 - (`Generic "Document contains no heading elements (h1-h6). Headings provide important document structure for accessibility") 128 - 129 - let checker = Checker.make ~create ~reset ~start_element ~end_element 130 - ~characters ~end_document () 42 + let checker = Checker.make ~create ~reset ~start_element ~end_element ()