OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Attribute restrictions checker - validates that certain attributes
2 are not used on elements where they're not allowed. *)
3
4(** List of (element, [disallowed attributes]) pairs for HTML elements. *)
5let disallowed_attrs_html = [
6 (* Elements that cannot have href attribute (RDFa misuses) *)
7 ("img", ["href"]);
8 ("p", ["href"]);
9 ("div", ["href"]);
10 (* a cannot have src or media *)
11 ("a", ["src"; "media"]);
12 (* area cannot have media *)
13 ("area", ["media"]);
14 (* Various elements cannot have srcset *)
15 ("audio", ["srcset"]);
16 ("video", ["srcset"]);
17 ("object", ["srcset"]);
18 ("link", ["srcset"]); (* except when rel=preload and as=image *)
19 ("track", ["srcset"]);
20 ("input", ["srcset"]); (* except type=image, but we check more strictly *)
21 ("image", ["srcset"]); (* SVG image element *)
22]
23
24(** SVG elements that cannot have xml:id attribute. *)
25let svg_no_xml_id = [
26 "rect"; "circle"; "ellipse"; "line"; "polyline"; "polygon"; "path";
27 "text"; "tspan"; "textPath"; "image"; "use"; "symbol"; "defs"; "g";
28 "svg"; "marker"; "pattern"; "clipPath"; "mask"; "linearGradient";
29 "radialGradient"; "stop"; "filter"; "feBlend"; "feColorMatrix";
30 "feComponentTransfer"; "feComposite"; "feConvolveMatrix"; "feDiffuseLighting";
31 "feDisplacementMap"; "feDistantLight"; "feDropShadow"; "feFlood";
32 "feFuncA"; "feFuncB"; "feFuncG"; "feFuncR"; "feGaussianBlur"; "feImage";
33 "feMerge"; "feMergeNode"; "feMorphology"; "feOffset"; "fePointLight";
34 "feSpecularLighting"; "feSpotLight"; "feTile"; "feTurbulence";
35]
36
37type state = {
38 mutable is_xhtml : bool; (* Track if we're in XHTML mode based on xmlns *)
39}
40
41let create () = { is_xhtml = false }
42let reset state = state.is_xhtml <- false
43
44(** Input types that allow the list attribute. *)
45let input_types_allowing_list = [
46 "color"; "date"; "datetime-local"; "email"; "month"; "number";
47 "range"; "search"; "tel"; "text"; "time"; "url"; "week"
48]
49
50(** Report disallowed attribute error *)
51let report_disallowed_attr element attr collector =
52 Message_collector.add_typed collector
53 (`Attr (`Not_allowed (`Attr attr, `Elem element)))
54
55let start_element state ~element collector =
56 match element.Element.tag with
57 | Tag.Html _ ->
58 let name = Tag.tag_to_string element.tag in
59 let name_lower = String.lowercase_ascii name in
60 let attrs = element.raw_attrs in
61
62 (* Detect XHTML mode from xmlns attribute on html element *)
63 if name_lower = "html" then begin
64 match Attr_utils.get_attr "xmlns" attrs with
65 | Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true
66 | _ -> ()
67 end;
68
69 (* Check HTML element attribute restrictions *)
70 (match List.assoc_opt name_lower disallowed_attrs_html with
71 | Some disallowed ->
72 List.iter (fun attr ->
73 if Attr_utils.has_attr attr attrs then
74 report_disallowed_attr name_lower attr collector
75 ) disallowed
76 | None -> ());
77
78 (* Check for xml:base attribute - not allowed in HTML *)
79 if name_lower = "html" then begin
80 if Attr_utils.has_attr "xml:base" attrs then
81 report_disallowed_attr name_lower "xml:base" collector
82 end;
83
84 (* Check for xmlns:* prefixed attributes - not allowed in HTML *)
85 (* Standard xmlns declarations are allowed but custom prefixes are not *)
86 List.iter (fun (attr_name, _) ->
87 let attr_lower = String.lowercase_ascii attr_name in
88 if String.starts_with ~prefix:"xmlns:" attr_lower then begin
89 let prefix = String.sub attr_lower 6 (String.length attr_lower - 6) in
90 (* Only xmlns:xlink (with correct value) and xmlns:xml are allowed *)
91 if prefix <> "xlink" && prefix <> "xml" then
92 Message_collector.add_typed collector
93 (`Attr (`Not_allowed_here (`Attr attr_name)))
94 end
95 ) attrs;
96
97 (* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *)
98 (* xml:id is never valid on SVG elements in HTML5 *)
99 if List.mem name_lower svg_no_xml_id then begin
100 if Attr_utils.has_attr "xml:id" attrs then
101 report_disallowed_attr name_lower "xml:id" collector
102 end;
103
104 (* SVG feConvolveMatrix requires order attribute *)
105 if name_lower = "feconvolvematrix" then begin
106 if not (Attr_utils.has_attr "order" attrs) then
107 Message_collector.add_typed collector
108 (`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order")))
109 end;
110
111 (* Validate style type attribute - must be "text/css" or omitted *)
112 if name_lower = "style" then begin
113 List.iter (fun (attr_name, attr_value) ->
114 let attr_lower = String.lowercase_ascii attr_name in
115 if attr_lower = "type" then begin
116 let value_lower = String.lowercase_ascii (String.trim attr_value) in
117 if value_lower <> "text/css" then
118 Message_collector.add_typed collector (`Misc `Style_type_invalid)
119 end
120 ) attrs
121 end;
122
123 (* Validate object element requires data or type attribute *)
124 if name_lower = "object" then begin
125 let has_data = Attr_utils.has_attr "data" attrs in
126 let has_type = Attr_utils.has_attr "type" attrs in
127 if not has_data && not has_type then
128 Message_collector.add_typed collector
129 (`Attr (`Missing (`Elem "object", `Attr "data")))
130 end;
131
132 (* Validate link imagesizes/imagesrcset attributes *)
133 if name_lower = "link" then begin
134 let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
135 let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
136 let rel_value = Attr_utils.get_attr "rel" attrs in
137 let as_value = Attr_utils.get_attr "as" attrs in
138
139 (* imagesizes requires imagesrcset *)
140 if has_imagesizes && not has_imagesrcset then
141 Message_collector.add_typed collector (`Srcset `Imagesizes_without_imagesrcset);
142
143 (* imagesrcset requires as="image" *)
144 if has_imagesrcset then begin
145 let as_is_image = match as_value with
146 | Some v -> String.lowercase_ascii (String.trim v) = "image"
147 | None -> false
148 in
149 if not as_is_image then
150 Message_collector.add_typed collector (`Link `Imagesrcset_requires_as_image)
151 end;
152
153 (* as attribute requires rel="preload" or rel="modulepreload" *)
154 (match as_value with
155 | Some _ ->
156 let rel_is_preload = match rel_value with
157 | Some v ->
158 let rel_lower = String.lowercase_ascii (String.trim v) in
159 String.length rel_lower > 0 &&
160 (List.mem "preload" (String.split_on_char ' ' rel_lower) ||
161 List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
162 | None -> false
163 in
164 if not rel_is_preload then
165 Message_collector.add_typed collector (`Link `As_requires_preload)
166 | None -> ())
167 end;
168
169 (* Validate img usemap attribute - must be hash-name reference with content *)
170 if name_lower = "img" then begin
171 List.iter (fun (attr_name, attr_value) ->
172 let attr_lower = String.lowercase_ascii attr_name in
173 if attr_lower = "usemap" then begin
174 if attr_value = "#" then
175 Message_collector.add_typed collector
176 (`Attr (`Bad_value_generic (`Message (Printf.sprintf
177 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d."
178 attr_value attr_name name))))
179 end
180 ) attrs
181 end;
182
183 (* Validate embed type attribute - must be valid MIME type *)
184 if name_lower = "embed" then begin
185 List.iter (fun (attr_name, attr_value) ->
186 let attr_lower = String.lowercase_ascii attr_name in
187 if attr_lower = "type" then begin
188 match Dt_mime.validate_mime_type attr_value with
189 | Ok () -> ()
190 | Error msg ->
191 Message_collector.add_typed collector
192 (`Attr (`Bad_value_generic (`Message (Printf.sprintf
193 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s"
194 attr_value attr_name name msg))))
195 end
196 ) attrs
197 end;
198
199 (* Validate width/height on embed and img - must be non-negative integers *)
200 let is_dimension_element = name_lower = "embed" || name_lower = "img" ||
201 name_lower = "video" || name_lower = "canvas" ||
202 name_lower = "iframe" || name_lower = "source" in
203 if is_dimension_element then begin
204 List.iter (fun (attr_name, attr_value) ->
205 let attr_lower = String.lowercase_ascii attr_name in
206 if attr_lower = "width" || attr_lower = "height" then begin
207 (* Check for non-negative integer only *)
208 let is_valid =
209 String.length attr_value > 0 &&
210 String.for_all (fun c -> c >= '0' && c <= '9') attr_value
211 in
212 if not is_valid then begin
213 (* Determine specific error message *)
214 let error_msg =
215 if String.length attr_value = 0 then
216 Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
217 attr_name name
218 else if String.contains attr_value '%' then
219 Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
220 attr_value attr_name name
221 else if String.length attr_value > 0 && attr_value.[0] = '-' then
222 Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead."
223 attr_value attr_name name
224 else
225 (* Find first non-digit character *)
226 let bad_char =
227 try
228 let i = ref 0 in
229 while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do
230 incr i
231 done;
232 if !i < String.length attr_value then Some attr_value.[!i] else None
233 with _ -> None
234 in
235 match bad_char with
236 | Some c ->
237 Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
238 attr_value attr_name name c
239 | None ->
240 Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit."
241 attr_value attr_name name
242 in
243 Message_collector.add_typed collector
244 (`Attr (`Bad_value_generic (`Message error_msg)))
245 end
246 end
247 ) attrs
248 end;
249
250 (* Validate area[shape=default] cannot have coords *)
251 if name_lower = "area" then begin
252 match Attr_utils.get_attr "shape" attrs with
253 | Some s when String.lowercase_ascii (String.trim s) = "default" ->
254 if Attr_utils.has_attr "coords" attrs then
255 Message_collector.add_typed collector
256 (`Attr (`Not_allowed (`Attr "coords", `Elem "area")))
257 | _ -> ()
258 end;
259
260 (* Validate bdo element requires dir attribute, and dir cannot be "auto" *)
261 if name_lower = "bdo" then begin
262 match Attr_utils.get_attr "dir" attrs with
263 | None ->
264 Message_collector.add_typed collector (`Misc `Bdo_missing_dir)
265 | Some v when String.lowercase_ascii (String.trim v) = "auto" ->
266 Message_collector.add_typed collector (`Misc `Bdo_dir_auto)
267 | _ -> ()
268 end;
269
270 (* Validate input list attribute - only allowed for certain types *)
271 if name_lower = "input" then begin
272 if Attr_utils.has_attr "list" attrs then begin
273 let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs
274 |> String.trim |> String.lowercase_ascii in
275 if not (List.mem input_type input_types_allowing_list) then
276 Message_collector.add_typed collector (`Input `List_not_allowed)
277 end
278 end;
279
280 (* Validate data-* attributes *)
281 List.iter (fun (attr_name, _) ->
282 let attr_lower = String.lowercase_ascii attr_name in
283 (* Check if it starts with "data-" *)
284 if String.starts_with ~prefix:"data-" attr_lower then begin
285 let after_prefix = String.sub attr_lower 5 (String.length attr_lower - 5) in
286 (* Check if it's exactly "data-" with nothing after *)
287 if after_prefix = "" then
288 report_disallowed_attr name_lower attr_name collector
289 (* Check if the name contains colon - not XML serializable *)
290 else if String.contains after_prefix ':' then
291 Message_collector.add_typed collector
292 (`Attr (`Data_invalid_name (`Reason "must be XML 1.0 4th ed. plus Namespaces NCNames")))
293 end
294 ) attrs;
295
296 (* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *)
297 if not state.is_xhtml then begin
298 let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in
299 let lang_value = Attr_utils.get_attr "lang" attrs in
300 match xmllang_value with
301 | Some xmllang ->
302 (match lang_value with
303 | None ->
304 Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
305 | Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang ->
306 Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
307 | _ -> ())
308 | None -> ()
309 end;
310
311 (* Validate spellcheck attribute - must be "true" or "false" or empty *)
312 List.iter (fun (attr_name, attr_value) ->
313 let attr_lower = String.lowercase_ascii attr_name in
314 if attr_lower = "spellcheck" then begin
315 let value_lower = String.lowercase_ascii (String.trim attr_value) in
316 if value_lower <> "" && value_lower <> "true" && value_lower <> "false" then
317 Message_collector.add_typed collector
318 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason "")))
319 end
320 ) attrs;
321
322 (* Validate enterkeyhint attribute - must be one of specific values *)
323 let valid_enterkeyhint = ["enter"; "done"; "go"; "next"; "previous"; "search"; "send"] in
324 List.iter (fun (attr_name, attr_value) ->
325 let attr_lower = String.lowercase_ascii attr_name in
326 if attr_lower = "enterkeyhint" then begin
327 let value_lower = String.lowercase_ascii (String.trim attr_value) in
328 if not (List.mem value_lower valid_enterkeyhint) then
329 Message_collector.add_typed collector
330 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason "")))
331 end
332 ) attrs;
333
334 (* Validate headingoffset attribute - must be a number between 0 and 8 *)
335 List.iter (fun (attr_name, attr_value) ->
336 let attr_lower = String.lowercase_ascii attr_name in
337 if attr_lower = "headingoffset" then begin
338 let trimmed = String.trim attr_value in
339 let is_valid =
340 String.length trimmed > 0 &&
341 String.for_all (fun c -> c >= '0' && c <= '9') trimmed &&
342 (try
343 let n = int_of_string trimmed in
344 n >= 0 && n <= 8
345 with _ -> false)
346 in
347 if not is_valid then
348 Message_collector.add_typed collector (`Misc `Headingoffset_invalid)
349 end
350 ) attrs;
351
352 (* Validate accesskey attribute - each key label must be a single code point *)
353 List.iter (fun (attr_name, attr_value) ->
354 let attr_lower = String.lowercase_ascii attr_name in
355 if attr_lower = "accesskey" then begin
356 (* Split by whitespace to get key labels *)
357 let keys = String.split_on_char ' ' attr_value |>
358 List.filter (fun s -> String.length (String.trim s) > 0) |>
359 List.map String.trim in
360 (* Count Unicode code points in each key *)
361 let count_codepoints s =
362 let len = String.length s in
363 let count = ref 0 in
364 let i = ref 0 in
365 while !i < len do
366 let c = Char.code s.[!i] in
367 if c < 0x80 then incr i
368 else if c < 0xE0 then i := !i + 2
369 else if c < 0xF0 then i := !i + 3
370 else i := !i + 4;
371 incr count
372 done;
373 !count
374 in
375 (* Check for multi-character keys *)
376 List.iter (fun key ->
377 if count_codepoints key > 1 then
378 Message_collector.add_typed collector
379 (`Attr (`Bad_value_generic (`Message (Printf.sprintf
380 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Key label has multiple characters. Each key label must be a single character."
381 attr_value attr_name name))))
382 ) keys;
383 (* Check for duplicate keys *)
384 let rec find_duplicates seen = function
385 | [] -> ()
386 | k :: rest ->
387 if List.mem k seen then
388 Message_collector.add_typed collector
389 (`Attr (`Bad_value_generic (`Message (Printf.sprintf
390 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad key label list: Duplicate key label. Each key label must be unique."
391 attr_value attr_name name))))
392 else
393 find_duplicates (k :: seen) rest
394 in
395 find_duplicates [] keys
396 end
397 ) attrs;
398
399 (* Validate that command and popovertarget cannot have aria-expanded *)
400 if name_lower = "button" then begin
401 let has_command = Attr_utils.has_attr "command" attrs in
402 let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in
403 let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in
404
405 if has_command && has_aria_expanded then
406 Message_collector.add_typed collector
407 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
408 `Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute")));
409
410 if has_popovertarget && has_aria_expanded then
411 Message_collector.add_typed collector
412 (`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
413 `Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute")))
414 end;
415
416 (* Note: data-* uppercase check requires XML parsing which preserves case.
417 The HTML5 parser normalizes attribute names to lowercase, so this check
418 is only effective when the document is parsed as XML.
419 Commenting out until we have XML parsing support. *)
420 ignore state.is_xhtml;
421
422 (* Validate media attribute on link, style, source elements *)
423 let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in
424 if is_media_element then begin
425 List.iter (fun (attr_name, attr_value) ->
426 let attr_lower = String.lowercase_ascii attr_name in
427 if attr_lower = "media" then begin
428 let trimmed = String.trim attr_value in
429 if trimmed <> "" then begin
430 match Dt_media_query.validate_media_query_strict trimmed with
431 | Ok () -> ()
432 | Error msg ->
433 Message_collector.add_typed collector
434 (`Attr (`Bad_value_generic (`Message (Printf.sprintf
435 "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s"
436 attr_value attr_name name msg))))
437 end
438 end
439 ) attrs
440 end;
441
442 (* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *)
443 List.iter (fun (attr_name, attr_value) ->
444 let attr_lower = String.lowercase_ascii attr_name in
445 if attr_lower = "prefix" then begin
446 (* Parse prefix attribute value - format: "prefix1: iri1 prefix2: iri2 ..." *)
447 let trimmed = String.trim attr_value in
448 if trimmed <> "" then begin
449 (* Check for empty prefix (starts with : or has space:) *)
450 if String.length trimmed > 0 && trimmed.[0] = ':' then
451 Message_collector.add_typed collector
452 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason "")))
453 else begin
454 (* Check for invalid prefix names - must start with letter or underscore *)
455 let is_ncname_start c =
456 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c = '_'
457 in
458 if String.length trimmed > 0 && not (is_ncname_start trimmed.[0]) then
459 Message_collector.add_typed collector
460 (`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason "")))
461 end
462 end
463 end
464 ) attrs
465
466 | _ -> () (* Skip non-HTML elements *)
467
468let end_element _state ~tag:_ _collector = ()
469let characters _state _text _collector = ()
470let end_document _state _collector = ()
471
472let checker =
473 (module struct
474 type nonrec state = state
475 let create = create
476 let reset = reset
477 let start_element = start_element
478 let end_element = end_element
479 let characters = characters
480 let end_document = end_document
481 end : Checker.S)