OCaml HTML5 parser/serialiser based on Python's JustHTML
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at 01d38f40bc913dd9ee9950cf56b338d7e79a5ebc 242 lines 7.5 kB view raw
1(** ID uniqueness and reference checker. 2 3 This checker validates that: 4 - ID attributes are unique within the document 5 - ID references point to existing IDs 6 - ID values conform to HTML5 requirements *) 7 8(** Location information for ID occurrences. *) 9type id_location = unit (* simplified since we only need to track existence *) 10 11(** Information about an ID reference. *) 12type id_reference = { 13 referring_element : string; 14 attribute : string; 15 referenced_id : string; 16 location : Message.location option; 17} 18 19(** Checker state tracking IDs, map names, and references. *) 20type state = { 21 ids : (string, id_location) Hashtbl.t; 22 map_names : (string, id_location) Hashtbl.t; 23 mutable references : id_reference list; 24 mutable usemap_references : id_reference list; 25} 26 27let create () = 28 { 29 ids = Hashtbl.create 64; 30 map_names = Hashtbl.create 16; 31 references = []; 32 usemap_references = []; 33 } 34 35let reset state = 36 Hashtbl.clear state.ids; 37 Hashtbl.clear state.map_names; 38 state.references <- []; 39 state.usemap_references <- [] 40 41(** Check if a string contains whitespace. *) 42let contains_whitespace s = 43 String.contains s ' ' || String.contains s '\t' || 44 String.contains s '\n' || String.contains s '\r' 45 46(** Extract ID from a usemap value (removes leading #). *) 47let extract_usemap_id value = 48 if String.length value > 0 && value.[0] = '#' then 49 Some (String.sub value 1 (String.length value - 1)) 50 else 51 None 52 53(** Split whitespace-separated ID references. *) 54let split_ids value = 55 let rec split acc start i = 56 if i >= String.length value then 57 if i > start then 58 (String.sub value start (i - start)) :: acc 59 else 60 acc 61 else 62 match value.[i] with 63 | ' ' | '\t' | '\n' | '\r' -> 64 let acc' = 65 if i > start then 66 (String.sub value start (i - start)) :: acc 67 else 68 acc 69 in 70 split acc' (i + 1) (i + 1) 71 | _ -> 72 split acc start (i + 1) 73 in 74 List.rev (split [] 0 0) 75 76(** Attributes that reference a single ID. *) 77let single_id_ref_attrs = [ 78 "for"; (* label *) 79 "form"; (* form-associated elements *) 80 "list"; (* input *) 81 "aria-activedescendant"; 82 "popovertarget"; (* button - references popover element *) 83 "commandfor"; (* button - references element to control *) 84 "anchor"; (* popover - references anchor element *) 85] 86 87(** Attributes that reference multiple IDs (space-separated). *) 88let multi_id_ref_attrs = [ 89 "headers"; (* td, th *) 90 "aria-labelledby"; 91 "aria-describedby"; 92 "aria-controls"; 93 "aria-flowto"; 94 "aria-owns"; 95 "itemref"; 96] 97 98(** Check and store an ID attribute. *) 99let check_id state ~element ~id ~location collector = 100 (* Check for empty ID *) 101 if String.length id = 0 then 102 Message_collector.add_error collector 103 ~message:"ID attribute must not be empty" 104 ~code:"empty-id" 105 ?location 106 ~element 107 ~attribute:"id" 108 () 109 (* Check for whitespace in ID *) 110 else if contains_whitespace id then 111 Message_collector.add_error collector 112 ~message:(Printf.sprintf "ID attribute value '%s' must not contain whitespace" id) 113 ~code:"id-whitespace" 114 ?location 115 ~element 116 ~attribute:"id" 117 () 118 (* Check for duplicate ID *) 119 else if Hashtbl.mem state.ids id then 120 Message_collector.add_error collector 121 ~message:(Printf.sprintf "Duplicate ID \xe2\x80\x9c%s\xe2\x80\x9d." id) 122 ~code:"duplicate-id" 123 ?location 124 ~element 125 ~attribute:"id" 126 () 127 else 128 (* Store the ID *) 129 Hashtbl.add state.ids id () 130 131(** Record a single ID reference. *) 132let add_reference state ~referring_element ~attribute ~referenced_id ~location = 133 if String.length referenced_id > 0 then 134 state.references <- { 135 referring_element; 136 attribute; 137 referenced_id; 138 location; 139 } :: state.references 140 141(** Process attributes to check IDs and collect references. *) 142let process_attrs state ~element ~attrs ~location collector = 143 List.iter (fun (name, value) -> 144 match name with 145 | "id" -> 146 check_id state ~element ~id:value ~location collector 147 148 | "usemap" -> 149 (* usemap references a map name (not ID), stored separately *) 150 begin match extract_usemap_id value with 151 | Some map_name -> 152 if String.length map_name > 0 then 153 state.usemap_references <- { 154 referring_element = element; 155 attribute = name; 156 referenced_id = map_name; 157 location; 158 } :: state.usemap_references 159 | None -> 160 if String.length value > 0 then 161 Message_collector.add_error collector 162 ~message:(Printf.sprintf 163 "usemap attribute value '%s' must start with '#'" value) 164 ~code:"invalid-usemap" 165 ?location 166 ~element 167 ~attribute:name 168 () 169 end 170 171 | "name" when element = "map" -> 172 (* Track map name attributes for usemap resolution *) 173 if String.length value > 0 then 174 Hashtbl.add state.map_names value () 175 176 | attr when List.mem attr single_id_ref_attrs -> 177 add_reference state ~referring_element:element 178 ~attribute:attr ~referenced_id:value ~location 179 180 | attr when List.mem attr multi_id_ref_attrs -> 181 (* Split space-separated IDs and add each as a reference *) 182 let ids = split_ids value in 183 List.iter (fun id -> 184 add_reference state ~referring_element:element 185 ~attribute:attr ~referenced_id:id ~location 186 ) ids 187 188 | _ -> () 189 ) attrs 190 191let start_element state ~name ~namespace:_ ~attrs collector = 192 (* For now, we don't have location information from the DOM walker, 193 so we pass None. In a full implementation, this would be passed 194 from the parser. *) 195 let location = None in 196 process_attrs state ~element:name ~attrs ~location collector 197 198let end_element _state ~name:_ ~namespace:_ _collector = 199 () 200 201let characters _state _text _collector = 202 () 203 204let end_document state collector = 205 (* Check all ID references point to existing IDs *) 206 List.iter (fun ref -> 207 if not (Hashtbl.mem state.ids ref.referenced_id) then 208 Message_collector.add_error collector 209 ~message:(Printf.sprintf 210 "The '%s' attribute on <%s> refers to ID '%s' which does not exist" 211 ref.attribute ref.referring_element ref.referenced_id) 212 ~code:"dangling-id-reference" 213 ?location:ref.location 214 ~element:ref.referring_element 215 ~attribute:ref.attribute 216 () 217 ) state.references; 218 219 (* Check all usemap references point to existing map names *) 220 List.iter (fun ref -> 221 if not (Hashtbl.mem state.map_names ref.referenced_id) then 222 Message_collector.add_error collector 223 ~message:(Printf.sprintf 224 "The '%s' attribute on <%s> refers to map name '%s' which does not exist" 225 ref.attribute ref.referring_element ref.referenced_id) 226 ~code:"dangling-usemap-reference" 227 ?location:ref.location 228 ~element:ref.referring_element 229 ~attribute:ref.attribute 230 () 231 ) state.usemap_references 232 233let checker = (module struct 234 type nonrec state = state 235 236 let create = create 237 let reset = reset 238 let start_element = start_element 239 let end_element = end_element 240 let characters = characters 241 let end_document = end_document 242end : Checker.S)