OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** ID uniqueness and reference checker.
2
3 This checker validates that:
4 - ID attributes are unique within the document
5 - ID references point to existing IDs
6 - ID values conform to HTML5 requirements *)
7
8(** Location information for ID occurrences. *)
9type id_location = unit (* simplified since we only need to track existence *)
10
11(** Information about an ID reference. *)
12type id_reference = {
13 referring_element : string;
14 attribute : string;
15 referenced_id : string;
16 location : Message.location option;
17}
18
19(** Checker state tracking IDs, map names, and references. *)
20type state = {
21 ids : (string, id_location) Hashtbl.t;
22 map_names : (string, id_location) Hashtbl.t;
23 mutable references : id_reference list;
24 mutable usemap_references : id_reference list;
25}
26
27let create () =
28 {
29 ids = Hashtbl.create 64;
30 map_names = Hashtbl.create 16;
31 references = [];
32 usemap_references = [];
33 }
34
35let reset state =
36 Hashtbl.clear state.ids;
37 Hashtbl.clear state.map_names;
38 state.references <- [];
39 state.usemap_references <- []
40
41(** Check if a string contains whitespace. *)
42let contains_whitespace s =
43 String.contains s ' ' || String.contains s '\t' ||
44 String.contains s '\n' || String.contains s '\r'
45
46(** Extract ID from a usemap value (removes leading #). *)
47let extract_usemap_id value =
48 if String.length value > 0 && value.[0] = '#' then
49 Some (String.sub value 1 (String.length value - 1))
50 else
51 None
52
53(** Split whitespace-separated ID references. *)
54let split_ids value =
55 let rec split acc start i =
56 if i >= String.length value then
57 if i > start then
58 (String.sub value start (i - start)) :: acc
59 else
60 acc
61 else
62 match value.[i] with
63 | ' ' | '\t' | '\n' | '\r' ->
64 let acc' =
65 if i > start then
66 (String.sub value start (i - start)) :: acc
67 else
68 acc
69 in
70 split acc' (i + 1) (i + 1)
71 | _ ->
72 split acc start (i + 1)
73 in
74 List.rev (split [] 0 0)
75
76(** Attributes that reference a single ID. *)
77let single_id_ref_attrs = [
78 "for"; (* label *)
79 "form"; (* form-associated elements *)
80 "list"; (* input *)
81 "aria-activedescendant";
82 "popovertarget"; (* button - references popover element *)
83 "commandfor"; (* button - references element to control *)
84 "anchor"; (* popover - references anchor element *)
85]
86
87(** Attributes that reference multiple IDs (space-separated). *)
88let multi_id_ref_attrs = [
89 "headers"; (* td, th *)
90 "aria-labelledby";
91 "aria-describedby";
92 "aria-controls";
93 "aria-flowto";
94 "aria-owns";
95 "itemref";
96]
97
98(** Check and store an ID attribute. *)
99let check_id state ~element ~id ~location collector =
100 (* Check for empty ID *)
101 if String.length id = 0 then
102 Message_collector.add_error collector
103 ~message:"ID attribute must not be empty"
104 ~code:"empty-id"
105 ?location
106 ~element
107 ~attribute:"id"
108 ()
109 (* Check for whitespace in ID *)
110 else if contains_whitespace id then
111 Message_collector.add_error collector
112 ~message:(Printf.sprintf "ID attribute value '%s' must not contain whitespace" id)
113 ~code:"id-whitespace"
114 ?location
115 ~element
116 ~attribute:"id"
117 ()
118 (* Check for duplicate ID *)
119 else if Hashtbl.mem state.ids id then
120 Message_collector.add_error collector
121 ~message:(Printf.sprintf "Duplicate ID \xe2\x80\x9c%s\xe2\x80\x9d." id)
122 ~code:"duplicate-id"
123 ?location
124 ~element
125 ~attribute:"id"
126 ()
127 else
128 (* Store the ID *)
129 Hashtbl.add state.ids id ()
130
131(** Record a single ID reference. *)
132let add_reference state ~referring_element ~attribute ~referenced_id ~location =
133 if String.length referenced_id > 0 then
134 state.references <- {
135 referring_element;
136 attribute;
137 referenced_id;
138 location;
139 } :: state.references
140
141(** Process attributes to check IDs and collect references. *)
142let process_attrs state ~element ~attrs ~location collector =
143 List.iter (fun (name, value) ->
144 match name with
145 | "id" ->
146 check_id state ~element ~id:value ~location collector
147
148 | "usemap" ->
149 (* usemap references a map name (not ID), stored separately *)
150 begin match extract_usemap_id value with
151 | Some map_name ->
152 if String.length map_name > 0 then
153 state.usemap_references <- {
154 referring_element = element;
155 attribute = name;
156 referenced_id = map_name;
157 location;
158 } :: state.usemap_references
159 | None ->
160 if String.length value > 0 then
161 Message_collector.add_error collector
162 ~message:(Printf.sprintf
163 "usemap attribute value '%s' must start with '#'" value)
164 ~code:"invalid-usemap"
165 ?location
166 ~element
167 ~attribute:name
168 ()
169 end
170
171 | "name" when element = "map" ->
172 (* Track map name attributes for usemap resolution *)
173 if String.length value > 0 then
174 Hashtbl.add state.map_names value ()
175
176 | attr when List.mem attr single_id_ref_attrs ->
177 add_reference state ~referring_element:element
178 ~attribute:attr ~referenced_id:value ~location
179
180 | attr when List.mem attr multi_id_ref_attrs ->
181 (* Split space-separated IDs and add each as a reference *)
182 let ids = split_ids value in
183 List.iter (fun id ->
184 add_reference state ~referring_element:element
185 ~attribute:attr ~referenced_id:id ~location
186 ) ids
187
188 | _ -> ()
189 ) attrs
190
191let start_element state ~name ~namespace:_ ~attrs collector =
192 (* For now, we don't have location information from the DOM walker,
193 so we pass None. In a full implementation, this would be passed
194 from the parser. *)
195 let location = None in
196 process_attrs state ~element:name ~attrs ~location collector
197
198let end_element _state ~name:_ ~namespace:_ _collector =
199 ()
200
201let characters _state _text _collector =
202 ()
203
204let end_document state collector =
205 (* Check all ID references point to existing IDs *)
206 List.iter (fun ref ->
207 if not (Hashtbl.mem state.ids ref.referenced_id) then
208 Message_collector.add_error collector
209 ~message:(Printf.sprintf
210 "The '%s' attribute on <%s> refers to ID '%s' which does not exist"
211 ref.attribute ref.referring_element ref.referenced_id)
212 ~code:"dangling-id-reference"
213 ?location:ref.location
214 ~element:ref.referring_element
215 ~attribute:ref.attribute
216 ()
217 ) state.references;
218
219 (* Check all usemap references point to existing map names *)
220 List.iter (fun ref ->
221 if not (Hashtbl.mem state.map_names ref.referenced_id) then
222 Message_collector.add_error collector
223 ~message:(Printf.sprintf
224 "The '%s' attribute on <%s> refers to map name '%s' which does not exist"
225 ref.attribute ref.referring_element ref.referenced_id)
226 ~code:"dangling-usemap-reference"
227 ?location:ref.location
228 ~element:ref.referring_element
229 ~attribute:ref.attribute
230 ()
231 ) state.usemap_references
232
233let checker = (module struct
234 type nonrec state = state
235
236 let create = create
237 let reset = reset
238 let start_element = start_element
239 let end_element = end_element
240 let characters = characters
241 let end_document = end_document
242end : Checker.S)