+55
lib/htmlrw_check/attr_utils.mli
+55
lib/htmlrw_check/attr_utils.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Common attribute utilities used across checkers.
7
+
8
+
This module provides simple helper functions for working with raw
9
+
attribute lists (name-value pairs). These utilities are used by
10
+
checkers that need to inspect attributes without full typed parsing.
11
+
12
+
For typed attribute access, see the {!Attr} module.
13
+
*)
14
+
15
+
(** {1 Types} *)
16
+
17
+
type attrs = (string * string) list
18
+
(** Raw attribute list as name-value pairs. *)
19
+
20
+
(** {1 Attribute Lookup} *)
21
+
22
+
val has_attr : string -> attrs -> bool
23
+
(** [has_attr name attrs] checks if an attribute exists.
24
+
25
+
The comparison is case-insensitive.
26
+
27
+
@param name The attribute name to look for (lowercase)
28
+
@param attrs The attribute list
29
+
@return [true] if the attribute is present *)
30
+
31
+
val get_attr : string -> attrs -> string option
32
+
(** [get_attr name attrs] gets an attribute value.
33
+
34
+
The comparison is case-insensitive.
35
+
36
+
@param name The attribute name to look for (lowercase)
37
+
@param attrs The attribute list
38
+
@return [Some value] if found, [None] otherwise *)
39
+
40
+
val get_attr_or : string -> default:string -> attrs -> string
41
+
(** [get_attr_or name ~default attrs] gets an attribute value with a default.
42
+
43
+
@param name The attribute name to look for (lowercase)
44
+
@param default The default value if not found
45
+
@param attrs The attribute list
46
+
@return The attribute value or the default *)
47
+
48
+
val is_non_empty_attr : string -> attrs -> bool
49
+
(** [is_non_empty_attr name attrs] checks if an attribute exists with a non-empty value.
50
+
51
+
The value is considered non-empty if it contains non-whitespace characters.
52
+
53
+
@param name The attribute name to look for (lowercase)
54
+
@param attrs The attribute list
55
+
@return [true] if the attribute exists and has a non-empty value *)
+531
lib/htmlrw_check/element/attr.mli
+531
lib/htmlrw_check/element/attr.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Typed HTML5 attribute representations using polymorphic variants.
7
+
8
+
This module provides typed representations for HTML attributes with
9
+
proper value types for enumerated attributes. Parsing raw attribute
10
+
name-value pairs produces typed variants that can be pattern-matched
11
+
with exhaustiveness checking.
12
+
13
+
{2 Design Philosophy}
14
+
15
+
HTML5 attributes have specific value constraints that this module
16
+
encodes in the type system:
17
+
18
+
- Boolean attributes: Present means true (e.g., [disabled], [checked])
19
+
- Enumerated attributes: Fixed set of valid values (e.g., [dir], [method])
20
+
- Numeric attributes: Integer or float values (e.g., [tabindex], [colspan])
21
+
- URL attributes: String values representing URLs (e.g., [href], [src])
22
+
- Free-form attributes: Any string value (e.g., [class], [title])
23
+
24
+
{2 Parsing Strategy}
25
+
26
+
Attributes are parsed with validation:
27
+
- Known attributes are parsed into typed variants
28
+
- Invalid values for enumerated attributes fall back to [Unknown_attr]
29
+
- Unknown attribute names are captured as [Unknown_attr]
30
+
- Special handling for [data-*] and [aria-*] prefixed attributes
31
+
32
+
@see <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>
33
+
HTML Standard: Global attributes
34
+
*)
35
+
36
+
(** {1 Attribute Value Types}
37
+
38
+
These types represent the valid values for enumerated HTML attributes. *)
39
+
40
+
(** Direction attribute values for [dir]. *)
41
+
type dir_value = [ `Ltr | `Rtl | `Auto ]
42
+
43
+
(** Hidden attribute values. *)
44
+
type hidden_value = [ `Hidden | `Until_found ]
45
+
46
+
(** Popover attribute values. *)
47
+
type popover_value = [ `Auto | `Manual | `Hint ]
48
+
49
+
(** Link target attribute values. *)
50
+
type target_value = [ `Self | `Blank | `Parent | `Top | `Named of string ]
51
+
52
+
(** Image/resource loading behavior. *)
53
+
type loading_value = [ `Eager | `Lazy ]
54
+
55
+
(** Image decoding hint. *)
56
+
type decoding_value = [ `Sync | `Async | `Auto ]
57
+
58
+
(** Fetch priority hint. *)
59
+
type fetchpriority_value = [ `High | `Low | `Auto ]
60
+
61
+
(** CORS settings. *)
62
+
type crossorigin_value = [ `Anonymous | `Use_credentials ]
63
+
64
+
(** Media preload hint. *)
65
+
type preload_value = [ `None | `Metadata | `Auto ]
66
+
67
+
(** Form method values. *)
68
+
type method_value = [ `Get | `Post | `Dialog ]
69
+
70
+
(** Form encoding type values. *)
71
+
type enctype_value = [ `Urlencoded | `Multipart | `Plain ]
72
+
73
+
(** Textarea wrap mode. *)
74
+
type wrap_value = [ `Soft | `Hard ]
75
+
76
+
(** Table cell scope. *)
77
+
type scope_value = [ `Row | `Col | `Rowgroup | `Colgroup ]
78
+
79
+
(** Input element type values. *)
80
+
type input_type = [
81
+
| `Hidden | `Text | `Search | `Tel | `Url | `Email | `Password
82
+
| `Date | `Month | `Week | `Time | `Datetime_local | `Number
83
+
| `Range | `Color | `Checkbox | `Radio | `File | `Submit
84
+
| `Image | `Reset | `Button
85
+
]
86
+
87
+
(** Button element type values. *)
88
+
type button_type = [ `Submit | `Reset | `Button ]
89
+
90
+
(** Referrer policy values. *)
91
+
type referrerpolicy_value = [
92
+
| `No_referrer | `No_referrer_when_downgrade | `Origin
93
+
| `Origin_when_cross_origin | `Same_origin | `Strict_origin
94
+
| `Strict_origin_when_cross_origin | `Unsafe_url
95
+
]
96
+
97
+
(** Iframe sandbox flags. *)
98
+
type sandbox_flag = [
99
+
| `Allow_downloads | `Allow_forms | `Allow_modals | `Allow_orientation_lock
100
+
| `Allow_pointer_lock | `Allow_popups | `Allow_popups_to_escape_sandbox
101
+
| `Allow_presentation | `Allow_same_origin | `Allow_scripts
102
+
| `Allow_top_navigation | `Allow_top_navigation_by_user_activation
103
+
| `Allow_top_navigation_to_custom_protocols
104
+
]
105
+
106
+
(** Enter key hint values for virtual keyboards. *)
107
+
type enterkeyhint_value = [
108
+
| `Enter | `Done | `Go | `Next | `Previous | `Search | `Send
109
+
]
110
+
111
+
(** Input mode hint for virtual keyboards. *)
112
+
type inputmode_value = [
113
+
| `None | `Text | `Decimal | `Numeric | `Tel | `Search | `Email | `Url
114
+
]
115
+
116
+
(** Content editable values. *)
117
+
type contenteditable_value = [ `True | `False | `Plaintext_only ]
118
+
119
+
(** Autocapitalize values. *)
120
+
type autocapitalize_value = [
121
+
| `Off | `None | `On | `Sentences | `Words | `Characters
122
+
]
123
+
124
+
(** Image map shape values. *)
125
+
type shape_value = [ `Rect | `Circle | `Poly | `Default ]
126
+
127
+
(** Input capture values for file inputs. *)
128
+
type capture_value = [ `User | `Environment ]
129
+
130
+
(** Ordered list type values. *)
131
+
type list_type_value = [
132
+
| `Decimal | `Lower_alpha | `Upper_alpha | `Lower_roman | `Upper_roman
133
+
]
134
+
135
+
(** Track element kind values. *)
136
+
type kind_value = [
137
+
| `Subtitles | `Captions | `Descriptions | `Chapters | `Metadata
138
+
]
139
+
140
+
(** {1 Typed Attribute Variant} *)
141
+
142
+
(** Typed attribute representation.
143
+
144
+
This type covers all HTML5 attributes with appropriate value types.
145
+
Attributes are organized into logical groups. *)
146
+
type t = [
147
+
(* Global attributes *)
148
+
| `Id of string
149
+
| `Class of string
150
+
| `Style of string
151
+
| `Title of string
152
+
| `Lang of string
153
+
| `Dir of dir_value
154
+
| `Hidden of hidden_value option
155
+
| `Tabindex of int
156
+
| `Accesskey of string
157
+
| `Autocapitalize of autocapitalize_value
158
+
| `Autofocus
159
+
| `Contenteditable of contenteditable_value option
160
+
| `Draggable of bool
161
+
| `Enterkeyhint of enterkeyhint_value
162
+
| `Inert
163
+
| `Inputmode of inputmode_value
164
+
| `Is of string
165
+
| `Nonce of string
166
+
| `Popover of popover_value option
167
+
| `Slot of string
168
+
| `Spellcheck of bool option
169
+
| `Translate of bool
170
+
| `Exportparts of string
171
+
| `Part of string
172
+
173
+
(* Microdata *)
174
+
| `Itemscope
175
+
| `Itemtype of string
176
+
| `Itemprop of string
177
+
| `Itemid of string
178
+
| `Itemref of string
179
+
180
+
(* ARIA *)
181
+
| `Role of string
182
+
| `Aria of string * string
183
+
184
+
(* Event handlers *)
185
+
| `Event of string * string
186
+
187
+
(* Link/navigation attributes *)
188
+
| `Href of string
189
+
| `Target of target_value
190
+
| `Rel of string
191
+
| `Download of string option
192
+
| `Hreflang of string
193
+
| `Ping of string
194
+
| `Referrerpolicy of referrerpolicy_value
195
+
| `Type_link of string
196
+
197
+
(* Media/resource attributes *)
198
+
| `Src of string
199
+
| `Srcset of string
200
+
| `Sizes of string
201
+
| `Alt of string
202
+
| `Width of string
203
+
| `Height of string
204
+
| `Loading of loading_value
205
+
| `Decoding of decoding_value
206
+
| `Fetchpriority of fetchpriority_value
207
+
| `Crossorigin of crossorigin_value option
208
+
| `Ismap
209
+
| `Usemap of string
210
+
| `Media of string
211
+
212
+
(* Audio/Video specific *)
213
+
| `Controls
214
+
| `Autoplay
215
+
| `Loop
216
+
| `Muted
217
+
| `Preload of preload_value
218
+
| `Poster of string
219
+
| `Playsinline
220
+
221
+
(* Image map *)
222
+
| `Coords of string
223
+
| `Shape of shape_value
224
+
225
+
(* iframe *)
226
+
| `Sandbox of sandbox_flag list option
227
+
| `Allow of string
228
+
| `Allowfullscreen
229
+
| `Srcdoc of string
230
+
| `Csp of string
231
+
232
+
(* Form attributes *)
233
+
| `Action of string
234
+
| `Method of method_value
235
+
| `Enctype of enctype_value
236
+
| `Novalidate
237
+
| `Accept_charset of string
238
+
| `Autocomplete of string
239
+
| `Name of string
240
+
| `Form of string
241
+
242
+
(* Form control attributes *)
243
+
| `Value of string
244
+
| `Type_input of input_type
245
+
| `Type_button of button_type
246
+
| `Disabled
247
+
| `Readonly
248
+
| `Required
249
+
| `Checked
250
+
| `Selected
251
+
| `Multiple
252
+
| `Placeholder of string
253
+
| `Min of string
254
+
| `Max of string
255
+
| `Step of string
256
+
| `Minlength of int
257
+
| `Maxlength of int
258
+
| `Pattern of string
259
+
| `Size of int
260
+
| `Cols of int
261
+
| `Rows of int
262
+
| `Wrap of wrap_value
263
+
| `Accept of string
264
+
| `Capture of capture_value
265
+
| `Dirname of string
266
+
| `For of string
267
+
| `List of string
268
+
269
+
(* Form submission attributes *)
270
+
| `Formaction of string
271
+
| `Formmethod of method_value
272
+
| `Formenctype of enctype_value
273
+
| `Formnovalidate
274
+
| `Formtarget of target_value
275
+
276
+
(* Table attributes *)
277
+
| `Colspan of int
278
+
| `Rowspan of int
279
+
| `Headers of string
280
+
| `Scope of scope_value
281
+
| `Span of int
282
+
283
+
(* Details/Dialog *)
284
+
| `Open
285
+
286
+
(* Script *)
287
+
| `Async
288
+
| `Defer
289
+
| `Integrity of string
290
+
| `Nomodule
291
+
| `Blocking of string
292
+
| `Type_script of string
293
+
294
+
(* Meta *)
295
+
| `Charset of string
296
+
| `Content of string
297
+
| `Http_equiv of string
298
+
299
+
(* Link element *)
300
+
| `As of string
301
+
| `Imagesizes of string
302
+
| `Imagesrcset of string
303
+
304
+
(* Object/Embed *)
305
+
| `Data_object of string
306
+
307
+
(* Output *)
308
+
| `For_output of string
309
+
310
+
(* Meter/Progress *)
311
+
| `Low of float
312
+
| `High of float
313
+
| `Optimum of float
314
+
315
+
(* Time *)
316
+
| `Datetime of string
317
+
318
+
(* Ol *)
319
+
| `Start of int
320
+
| `Reversed
321
+
| `Type_list of list_type_value
322
+
323
+
(* Track *)
324
+
| `Kind of kind_value
325
+
| `Srclang of string
326
+
| `Default
327
+
328
+
(* Td/Th *)
329
+
| `Abbr of string
330
+
331
+
(* Data attributes *)
332
+
| `Data_attr of string * string
333
+
334
+
(* RDFa *)
335
+
| `Property of string
336
+
| `Typeof of string
337
+
| `Resource of string
338
+
| `Prefix of string
339
+
| `Vocab of string
340
+
| `About of string
341
+
| `Datatype of string
342
+
| `Inlist
343
+
| `Rev of string
344
+
345
+
(* Escape hatch *)
346
+
| `Unknown_attr of string * string
347
+
]
348
+
349
+
(** {1 Parsing Functions} *)
350
+
351
+
val parse_dir : string -> dir_value option
352
+
(** [parse_dir value] parses a direction attribute value. *)
353
+
354
+
val parse_target : string -> target_value
355
+
(** [parse_target value] parses a target attribute value.

    Unlike the other enumerated-value parsers in this section, the result is
    a plain [target_value] rather than an [option]: [target_value] has a
    [`Named of string] case in addition to the reserved keywords
    [`Self], [`Blank], [`Parent] and [`Top].

    NOTE(review): presumably any value that is not a reserved keyword is
    returned as [`Named value]; confirm against the implementation. *)
356
+
357
+
val parse_loading : string -> loading_value option
358
+
(** [parse_loading value] parses a loading attribute value. *)
359
+
360
+
val parse_decoding : string -> decoding_value option
361
+
(** [parse_decoding value] parses a decoding attribute value. *)
362
+
363
+
val parse_fetchpriority : string -> fetchpriority_value option
364
+
(** [parse_fetchpriority value] parses a fetchpriority attribute value. *)
365
+
366
+
val parse_crossorigin : string -> crossorigin_value option
367
+
(** [parse_crossorigin value] parses a crossorigin attribute value. *)
368
+
369
+
val parse_preload : string -> preload_value option
370
+
(** [parse_preload value] parses a preload attribute value. *)
371
+
372
+
val parse_method : string -> method_value option
373
+
(** [parse_method value] parses a form method attribute value. *)
374
+
375
+
val parse_enctype : string -> enctype_value option
376
+
(** [parse_enctype value] parses a form enctype attribute value. *)
377
+
378
+
val parse_wrap : string -> wrap_value option
379
+
(** [parse_wrap value] parses a textarea wrap attribute value. *)
380
+
381
+
val parse_scope : string -> scope_value option
382
+
(** [parse_scope value] parses a table scope attribute value. *)
383
+
384
+
val parse_input_type : string -> input_type option
385
+
(** [parse_input_type value] parses an input type attribute value. *)
386
+
387
+
val parse_button_type : string -> button_type option
388
+
(** [parse_button_type value] parses a button type attribute value. *)
389
+
390
+
val parse_shape : string -> shape_value option
391
+
(** [parse_shape value] parses an area shape attribute value. *)
392
+
393
+
val parse_capture : string -> capture_value option
394
+
(** [parse_capture value] parses an input capture attribute value. *)
395
+
396
+
val parse_list_type : string -> list_type_value option
397
+
(** [parse_list_type value] parses an ordered list type attribute value. *)
398
+
399
+
val parse_kind : string -> kind_value option
400
+
(** [parse_kind value] parses a track kind attribute value. *)
401
+
402
+
val parse_referrerpolicy : string -> referrerpolicy_value option
403
+
(** [parse_referrerpolicy value] parses a referrer policy attribute value. *)
404
+
405
+
val parse_sandbox_flag : string -> sandbox_flag option
406
+
(** [parse_sandbox_flag value] parses a single sandbox flag token. *)
407
+
408
+
val parse_sandbox : string -> sandbox_flag list option
409
+
(** [parse_sandbox value] parses a space-separated sandbox attribute value. *)
410
+
411
+
val parse_enterkeyhint : string -> enterkeyhint_value option
412
+
(** [parse_enterkeyhint value] parses an enterkeyhint attribute value. *)
413
+
414
+
val parse_inputmode : string -> inputmode_value option
415
+
(** [parse_inputmode value] parses an inputmode attribute value. *)
416
+
417
+
val parse_contenteditable : string -> contenteditable_value option
418
+
(** [parse_contenteditable value] parses a contenteditable attribute value. *)
419
+
420
+
val parse_autocapitalize : string -> autocapitalize_value option
421
+
(** [parse_autocapitalize value] parses an autocapitalize attribute value. *)
422
+
423
+
val parse_hidden : string -> hidden_value option
424
+
(** [parse_hidden value] parses a hidden attribute value. *)
425
+
426
+
val parse_popover : string -> popover_value option
427
+
(** [parse_popover value] parses a popover attribute value. *)
428
+
429
+
val parse_int : string -> int option
430
+
(** [parse_int value] attempts to parse an integer from a string. *)
431
+
432
+
val parse_float : string -> float option
433
+
(** [parse_float value] attempts to parse a float from a string. *)
434
+
435
+
val parse_bool : string -> bool option
436
+
(** [parse_bool value] parses a boolean attribute value.

    This concerns attributes whose variant carries a [bool] payload in {!t}
    (e.g. [`Draggable], [`Translate], [`Spellcheck]), as opposed to
    presence-only attributes such as [`Disabled], which carry no value.

    NOTE(review): the accepted spellings are not visible from this
    interface; presumably ["true"] and ["false"], with [None] for anything
    else. Confirm against the implementation. *)
437
+
438
+
val parse_attr : string -> string -> t
439
+
(** [parse_attr name value] parses a single attribute name-value pair.
440
+
441
+
@param name The attribute name
442
+
@param value The attribute value
443
+
@return A typed attribute variant
444
+
445
+
{b Example:}
446
+
{[
447
+
parse_attr "class" "container" (* `Class "container" *)
448
+
parse_attr "disabled" "" (* `Disabled *)
449
+
parse_attr "data-id" "123" (* `Data_attr ("id", "123") *)
450
+
]} *)
451
+
452
+
val parse_attrs : (string * string) list -> t list
453
+
(** [parse_attrs attrs] parses multiple attributes.
454
+
455
+
@param attrs List of (name, value) pairs
456
+
@return List of typed attributes *)
457
+
458
+
(** {1 Accessor Functions} *)
459
+
460
+
val get_id : t list -> string option
461
+
(** [get_id attrs] extracts the id attribute value if present. *)
462
+
463
+
val get_class : t list -> string option
464
+
(** [get_class attrs] extracts the class attribute value if present. *)
465
+
466
+
val get_href : t list -> string option
467
+
(** [get_href attrs] extracts the href attribute value if present. *)
468
+
469
+
val get_src : t list -> string option
470
+
(** [get_src attrs] extracts the src attribute value if present. *)
471
+
472
+
val get_alt : t list -> string option
473
+
(** [get_alt attrs] extracts the alt attribute value if present. *)
474
+
475
+
val get_name : t list -> string option
476
+
(** [get_name attrs] extracts the name attribute value if present. *)
477
+
478
+
val get_value : t list -> string option
479
+
(** [get_value attrs] extracts the value attribute value if present. *)
480
+
481
+
val get_role : t list -> string option
482
+
(** [get_role attrs] extracts the role attribute value if present. *)
483
+
484
+
val get_aria : string -> t list -> string option
485
+
(** [get_aria name attrs] extracts a specific aria-* attribute value.
486
+
487
+
@param name The aria attribute name without the "aria-" prefix *)
488
+
489
+
val get_data : string -> t list -> string option
490
+
(** [get_data name attrs] extracts a specific data-* attribute value.
491
+
492
+
@param name The data attribute name without the "data-" prefix *)
493
+
494
+
val has_disabled : t list -> bool
495
+
(** [has_disabled attrs] checks if the disabled attribute is present. *)
496
+
497
+
val has_required : t list -> bool
498
+
(** [has_required attrs] checks if the required attribute is present. *)
499
+
500
+
val has_readonly : t list -> bool
501
+
(** [has_readonly attrs] checks if the readonly attribute is present. *)
502
+
503
+
val has_checked : t list -> bool
504
+
(** [has_checked attrs] checks if the checked attribute is present. *)
505
+
506
+
val has_autofocus : t list -> bool
507
+
(** [has_autofocus attrs] checks if the autofocus attribute is present. *)
508
+
509
+
val has_hidden : t list -> bool
510
+
(** [has_hidden attrs] checks if the hidden attribute is present. *)
511
+
512
+
val has_inert : t list -> bool
513
+
(** [has_inert attrs] checks if the inert attribute is present. *)
514
+
515
+
val has_open : t list -> bool
516
+
(** [has_open attrs] checks if the open attribute is present. *)
517
+
518
+
val get_all_aria : t list -> (string * string) list
519
+
(** [get_all_aria attrs] extracts all aria-* attributes. *)
520
+
521
+
val get_all_data : t list -> (string * string) list
522
+
(** [get_all_data attrs] extracts all data-* attributes. *)
523
+
524
+
val find : (t -> 'a option) -> t list -> 'a option
525
+
(** [find f attrs] applies [f] to the attributes and returns the first
    [Some _] result.

    Note that [f] is not a boolean predicate: per the signature it returns
    an ['a option], so it both selects an attribute and extracts a value
    from it in one step.

    NOTE(review): presumably the result is [None] when [f] returns [None]
    for every attribute; confirm against the implementation. *)
526
+
527
+
val exists : (t -> bool) -> t list -> bool
528
+
(** [exists f attrs] checks if any attribute matches predicate [f]. *)
529
+
530
+
val filter : (t -> bool) -> t list -> t list
531
+
(** [filter f attrs] filters attributes matching predicate [f]. *)
+348
lib/htmlrw_check/element/element.mli
+348
lib/htmlrw_check/element/element.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Typed HTML5 element representation.
7
+
8
+
This module combines tags and attributes into a complete typed element
9
+
representation. Elements are created from raw input (tag name, namespace,
10
+
attributes) and provide typed accessors for validation and manipulation.
11
+
12
+
{2 Design Philosophy}
13
+
14
+
An element in this module represents a complete typed view of an HTML
15
+
element, including:
16
+
17
+
- The element's tag (typed via {!Tag.element_tag})
18
+
- Typed attributes (via {!Attr.t} list)
19
+
- Raw attributes (for fallback access)
20
+
21
+
This dual representation allows checkers to use typed pattern matching
22
+
for common cases while falling back to raw strings when needed.
23
+
24
+
{2 Usage Example}
25
+
26
+
{[
27
+
let elem = Element.create
28
+
~name:"input"
29
+
~namespace:None
30
+
~attrs:[("type", "email"); ("required", ""); ("class", "form-input")]
31
+
in
32
+
match elem.tag with
33
+
| Tag.Html `Input ->
34
+
if Element.has_required elem then
35
+
(* Validate required input *)
36
+
()
37
+
| _ -> ()
38
+
]}
39
+
40
+
See {!Tag} for element tag types.
41
+
See {!Attr} for attribute types.
42
+
*)
43
+
44
+
(** {1 Element Type} *)
45
+
46
+
(** A typed HTML element.
47
+
48
+
- [tag]: the element's tag classification
49
+
- [attrs]: typed attributes parsed from raw input
50
+
- [raw_attrs]: original attribute name-value pairs, kept for fallback *)
51
+
type t = {
52
+
tag : Tag.element_tag;
53
+
attrs : Attr.t list;
54
+
raw_attrs : (string * string) list;
55
+
}
56
+
57
+
(** {1 Construction} *)
58
+
59
+
val create : name:string -> namespace:string option -> attrs:(string * string) list -> t
60
+
(** [create ~name ~namespace ~attrs] creates a typed element.
61
+
62
+
@param name The element's tag name
63
+
@param namespace Optional namespace URI (for SVG/MathML)
64
+
@param attrs Raw attribute name-value pairs
65
+
@return A typed element
66
+
67
+
{b Example:}
68
+
{[
69
+
let div = Element.create ~name:"div" ~namespace:None
70
+
~attrs:[("class", "container"); ("id", "main")]
71
+
]} *)
72
+
73
+
(** {1 Tag Accessors} *)
74
+
75
+
val tag : t -> Tag.element_tag
76
+
(** [tag elem] returns the element's tag. *)
77
+
78
+
val tag_name : t -> string
79
+
(** [tag_name elem] returns the element's tag name as a string. *)
80
+
81
+
val is_html_tag : Tag.html_tag -> t -> bool
82
+
(** [is_html_tag expected elem] checks if the element is a specific HTML tag.
83
+
84
+
@param expected The expected HTML tag variant
85
+
@param elem The element to check
86
+
@return [true] if the element matches *)
87
+
88
+
val as_html_tag : t -> Tag.html_tag option
89
+
(** [as_html_tag elem] extracts the HTML tag if this is an HTML element.
90
+
91
+
@return [Some tag] for HTML elements, [None] for SVG/MathML/Custom/Unknown *)
92
+
93
+
(** {1 Attribute Accessors} *)
94
+
95
+
val attrs : t -> Attr.t list
96
+
(** [attrs elem] returns the typed attributes. *)
97
+
98
+
val raw_attrs : t -> (string * string) list
99
+
(** [raw_attrs elem] returns the original raw attributes. *)
100
+
101
+
val get_id : t -> string option
102
+
(** [get_id elem] extracts the id attribute value. *)
103
+
104
+
val get_class : t -> string option
105
+
(** [get_class elem] extracts the class attribute value. *)
106
+
107
+
val get_href : t -> string option
108
+
(** [get_href elem] extracts the href attribute value. *)
109
+
110
+
val get_src : t -> string option
111
+
(** [get_src elem] extracts the src attribute value. *)
112
+
113
+
val get_alt : t -> string option
114
+
(** [get_alt elem] extracts the alt attribute value. *)
115
+
116
+
val get_name : t -> string option
117
+
(** [get_name elem] extracts the name attribute value. *)
118
+
119
+
val get_value : t -> string option
120
+
(** [get_value elem] extracts the value attribute value. *)
121
+
122
+
val get_role : t -> string option
123
+
(** [get_role elem] extracts the role attribute value. *)
124
+
125
+
val get_aria : string -> t -> string option
126
+
(** [get_aria name elem] extracts a specific aria-* attribute value.
127
+
128
+
@param name The aria attribute name without the "aria-" prefix *)
129
+
130
+
val get_data : string -> t -> string option
131
+
(** [get_data name elem] extracts a specific data-* attribute value.
132
+
133
+
@param name The data attribute name without the "data-" prefix *)
134
+
135
+
val has_disabled : t -> bool
136
+
(** [has_disabled elem] checks if the disabled attribute is present. *)
137
+
138
+
val has_required : t -> bool
139
+
(** [has_required elem] checks if the required attribute is present. *)
140
+
141
+
val has_readonly : t -> bool
142
+
(** [has_readonly elem] checks if the readonly attribute is present. *)
143
+
144
+
val has_checked : t -> bool
145
+
(** [has_checked elem] checks if the checked attribute is present. *)
146
+
147
+
val has_autofocus : t -> bool
148
+
(** [has_autofocus elem] checks if the autofocus attribute is present. *)
149
+
150
+
val has_hidden : t -> bool
151
+
(** [has_hidden elem] checks if the hidden attribute is present. *)
152
+
153
+
val has_inert : t -> bool
154
+
(** [has_inert elem] checks if the inert attribute is present. *)
155
+
156
+
val has_open : t -> bool
157
+
(** [has_open elem] checks if the open attribute is present. *)
158
+
159
+
val get_all_aria : t -> (string * string) list
160
+
(** [get_all_aria elem] extracts all aria-* attributes. *)
161
+
162
+
val get_all_data : t -> (string * string) list
163
+
(** [get_all_data elem] extracts all data-* attributes. *)
164
+
165
+
(** {1 Raw Attribute Fallback} *)
166
+
167
+
val get_raw_attr : string -> t -> string option
168
+
(** [get_raw_attr name elem] gets a raw attribute value by name.
169
+
170
+
This is useful when the typed representation doesn't capture a specific
171
+
attribute or when you need the exact original value.
172
+
173
+
@param name The attribute name (case-insensitive)
174
+
@param elem The element
175
+
@return [Some value] if the attribute exists *)
176
+
177
+
val has_raw_attr : string -> t -> bool
178
+
(** [has_raw_attr name elem] checks if a raw attribute exists.
179
+
180
+
@param name The attribute name (case-insensitive)
181
+
@param elem The element
182
+
@return [true] if the attribute is present *)
183
+
184
+
(** {1 Category Checks}
185
+
186
+
These predicates check element categories based on the HTML5 content model. *)
187
+
188
+
val is_void : t -> bool
189
+
(** [is_void elem] checks if this is a void element (cannot have children).
190
+
191
+
@return [true] for br, hr, img, input, etc. *)
192
+
193
+
val is_heading : t -> bool
194
+
(** [is_heading elem] checks if this is a heading element.
195
+
196
+
@return [true] for h1-h6 *)
197
+
198
+
val heading_level : t -> int option
199
+
(** [heading_level elem] gets the heading level (1-6) if applicable.
200
+
201
+
@return [Some level] for h1-h6, [None] otherwise *)
202
+
203
+
val is_sectioning : t -> bool
204
+
(** [is_sectioning elem] checks if this is sectioning content.
205
+
206
+
@return [true] for article, aside, nav, section *)
207
+
208
+
val is_sectioning_root : t -> bool
209
+
(** [is_sectioning_root elem] checks if this is a sectioning root.
210
+
211
+
@return [true] for blockquote, body, details, dialog, fieldset, figure, td *)
212
+
213
+
val is_embedded : t -> bool
214
+
(** [is_embedded elem] checks if this is embedded content.
215
+
216
+
@return [true] for audio, canvas, embed, iframe, img, object, picture, video *)
217
+
218
+
val is_interactive : t -> bool
219
+
(** [is_interactive elem] checks if this is interactive content.
220
+
221
+
@return [true] for focusable/activatable elements *)
222
+
223
+
val is_form_associated : t -> bool
224
+
(** [is_form_associated elem] checks if this is form-associated.
225
+
226
+
@return [true] for elements that can belong to a form *)
227
+
228
+
val is_labelable : t -> bool
229
+
(** [is_labelable elem] checks if this can be associated with a label.
230
+
231
+
@return [true] for button, input, meter, output, progress, select, textarea *)
232
+
233
+
val is_submittable : t -> bool
234
+
(** [is_submittable elem] checks if this is a submittable form element.
235
+
236
+
@return [true] for button, input, select, textarea *)
237
+
238
+
val is_table_element : t -> bool
239
+
(** [is_table_element elem] checks if this is a table-related element.
240
+
241
+
@return [true] for table, tr, td, th, etc. *)
242
+
243
+
val is_media : t -> bool
244
+
(** [is_media elem] checks if this is a media element.
245
+
246
+
@return [true] for audio, video *)
247
+
248
+
val is_list_container : t -> bool
249
+
(** [is_list_container elem] checks if this is a list container.
250
+
251
+
@return [true] for ul, ol, menu, dl *)
252
+
253
+
val is_transparent : t -> bool
254
+
(** [is_transparent elem] checks if this has a transparent content model.
255
+
256
+
@return [true] for a, abbr, audio, canvas, del, ins, map, noscript, etc. *)
257
+
258
+
val is_phrasing : t -> bool
259
+
(** [is_phrasing elem] checks if this is phrasing content.
260
+
261
+
@return [true] for inline-level elements *)
262
+
263
+
val is_flow : t -> bool
264
+
(** [is_flow elem] checks if this is flow content.
265
+
266
+
@return [true] for most body-level elements *)
267
+
268
+
val is_obsolete : t -> bool
269
+
(** [is_obsolete elem] checks if this is an obsolete element.
270
+
271
+
@return [true] for applet, font, marquee, etc. *)
272
+
273
+
val is_svg : t -> bool
274
+
(** [is_svg elem] checks if this is an SVG element.
275
+
276
+
@return [true] if the element is in the SVG namespace *)
277
+
278
+
val is_mathml : t -> bool
279
+
(** [is_mathml elem] checks if this is a MathML element.
280
+
281
+
@return [true] if the element is in the MathML namespace *)
282
+
283
+
val is_custom : t -> bool
284
+
(** [is_custom elem] checks if this is a custom element.
285
+
286
+
@return [true] if the element name contains a hyphen *)
287
+
288
+
val is_unknown : t -> bool
289
+
(** [is_unknown elem] checks if this is an unknown element.
290
+
291
+
@return [true] if the element is not recognized *)
292
+
293
+
(** {1 Input Type Utilities} *)
294
+
295
+
val get_input_type : t -> Attr.input_type option
296
+
(** [get_input_type elem] gets the input type for input elements.
297
+
298
+
@return [Some type] for input elements with a type, [None] otherwise *)
299
+
300
+
val get_button_type : t -> Attr.button_type option
301
+
(** [get_button_type elem] gets the button type for button elements.
302
+
303
+
@return [Some type] for button elements with a type, [None] otherwise *)
304
+
305
+
val is_input_type : Attr.input_type -> t -> bool
306
+
(** [is_input_type expected elem] checks if an input has a specific type.
307
+
308
+
@param expected The expected input type
309
+
@param elem The element to check
310
+
@return [true] if this is an input with the specified type *)
311
+
312
+
(** {1 Pattern Matching Helpers} *)
313
+
314
+
val match_html : t -> (Tag.html_tag -> 'a) -> 'a option
315
+
(** [match_html elem f] applies [f] to the HTML tag if present.
316
+
317
+
@param elem The element
318
+
@param f Function to apply to the HTML tag
319
+
@return [Some (f tag)] for HTML elements, [None] otherwise *)
320
+
321
+
val when_html_tag : Tag.html_tag -> t -> (unit -> 'a) -> 'a option
322
+
(** [when_html_tag expected elem f] applies [f] if the element matches.
323
+
324
+
@param expected The expected HTML tag
325
+
@param elem The element to check
326
+
@param f Function to call if the element matches
327
+
@return [Some (f ())] if matched, [None] otherwise *)
328
+
329
+
(** {1 Internal} *)
330
+
331
+
val parse_type_attr : Tag.html_tag -> string -> Attr.t
332
+
(** [parse_type_attr tag value] parses a type attribute for an element.
333
+
334
+
Different elements have different valid type values. This function
335
+
handles context-dependent parsing.
336
+
337
+
@param tag The element's HTML tag
338
+
@param value The type attribute value
339
+
@return The parsed attribute variant *)
340
+
341
+
val parse_attrs_for_tag : Tag.element_tag -> (string * string) list -> Attr.t list
342
+
(** [parse_attrs_for_tag tag raw_attrs] parses attributes with element context.
343
+
344
+
The type attribute is parsed differently depending on the element tag.
345
+
346
+
@param tag The element's tag
347
+
@param raw_attrs Raw attribute name-value pairs
348
+
@return List of typed attributes *)
+439
lib/htmlrw_check/element/tag.mli
+439
lib/htmlrw_check/element/tag.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Typed HTML5 tag representations using polymorphic variants.
7
+
8
+
This module provides compile-time type safety for HTML elements while
9
+
maintaining escape hatches for unknown/custom elements. Tags are
10
+
represented using polymorphic variants, enabling pattern matching with
11
+
exhaustiveness checking while avoiding the overhead of explicit
12
+
constructors.
13
+
14
+
{2 Design Philosophy}
15
+
16
+
HTML5 defines over 100 standard elements with specific categories and
17
+
content models. This module:
18
+
19
+
- Provides typed representations for all standard elements
20
+
- Supports SVG and MathML namespaced elements
21
+
- Recognizes custom elements (containing hyphens)
22
+
- Falls back to [Unknown] for unrecognized elements
23
+
24
+
{2 Element Categories}
25
+
26
+
HTML5 categorizes elements into content categories that define where
27
+
elements can appear and what they can contain. This module provides
28
+
predicates for common categories:
29
+
30
+
- {!is_void} - Elements that cannot have children
31
+
- {!is_heading} - Heading elements (h1-h6)
32
+
- {!is_sectioning} - Elements that create document sections
33
+
- {!is_phrasing} - Inline/phrasing content elements
34
+
- {!is_flow} - Block/flow content elements
35
+
36
+
@see <https://html.spec.whatwg.org/multipage/dom.html#content-models>
37
+
HTML Standard: Content models
38
+
*)
39
+
40
+
(** {1 HTML Tag Types} *)
41
+
42
+
(** All standard HTML5 elements plus deprecated elements needed by the validator.
43
+
44
+
This type covers:
45
+
- Document metadata elements (html, head, title, etc.)
46
+
- Sectioning elements (article, section, nav, etc.)
47
+
- Heading elements (h1-h6)
48
+
- Grouping content (div, p, ul, ol, etc.)
49
+
- Text-level semantics (a, em, strong, span, etc.)
50
+
- Embedded content (img, video, audio, etc.)
51
+
- Table elements (table, tr, td, th, etc.)
52
+
- Form elements (form, input, button, etc.)
53
+
- Interactive elements (details, dialog, summary)
54
+
- Scripting elements (script, noscript, template)
55
+
- Deprecated/obsolete elements (font, center, marquee, etc.) *)
56
+
type html_tag = [
57
+
(* Document metadata *)
58
+
| `Html | `Head | `Title | `Base | `Link | `Meta | `Style
59
+
60
+
(* Sectioning root *)
61
+
| `Body
62
+
63
+
(* Content sectioning *)
64
+
| `Address | `Article | `Aside | `Footer | `Header | `Hgroup
65
+
| `Main | `Nav | `Search | `Section
66
+
67
+
(* Heading content *)
68
+
| `H1 | `H2 | `H3 | `H4 | `H5 | `H6
69
+
70
+
(* Grouping content *)
71
+
| `Blockquote | `Dd | `Div | `Dl | `Dt | `Figcaption | `Figure
72
+
| `Hr | `Li | `Menu | `Ol | `P | `Pre | `Ul
73
+
74
+
(* Text-level semantics *)
75
+
| `A | `Abbr | `B | `Bdi | `Bdo | `Br | `Cite | `Code | `Data
76
+
| `Dfn | `Em | `I | `Kbd | `Mark | `Q | `Rp | `Rt | `Ruby
77
+
| `S | `Samp | `Small | `Span | `Strong | `Sub | `Sup | `Time
78
+
| `U | `Var | `Wbr
79
+
80
+
(* Edits *)
81
+
| `Del | `Ins
82
+
83
+
(* Embedded content *)
84
+
| `Area | `Audio | `Canvas | `Embed | `Iframe | `Img | `Map | `Object
85
+
| `Picture | `Source | `Track | `Video
86
+
87
+
(* Tabular data *)
88
+
| `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
89
+
| `Th | `Thead | `Tr
90
+
91
+
(* Forms *)
92
+
| `Button | `Datalist | `Fieldset | `Form | `Input | `Label
93
+
| `Legend | `Meter | `Optgroup | `Option | `Output | `Progress
94
+
| `Select | `Textarea
95
+
96
+
(* Interactive elements *)
97
+
| `Details | `Dialog | `Summary
98
+
99
+
(* Scripting *)
100
+
| `Noscript | `Script | `Slot | `Template
101
+
102
+
(* Web Components / Misc *)
103
+
| `Portal | `Param
104
+
105
+
(* Deprecated/obsolete elements *)
106
+
| `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset
107
+
| `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid
108
+
| `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp
109
+
| `Basefont | `Big | `Blink | `Center | `Font | `Marquee
110
+
| `Multicol | `Nobr | `Spacer | `Tt | `Image
111
+
]
112
+
113
+
(** {1 Category Types}
114
+
115
+
Type aliases for element subsets, enabling functions that only accept
116
+
specific categories with compile-time checking. *)
117
+
118
+
(** Void elements - cannot have children (e.g., br, hr, img, input).
    The last row adds param and the void members of the deprecated/obsolete
    group of {!html_tag} (basefont, frame, isindex, keygen). *)
119
+
type void_tag = [
120
+
| `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input
121
+
| `Link | `Meta | `Source | `Track | `Wbr
122
+
| `Basefont | `Frame | `Isindex | `Keygen | `Param
123
+
]
124
+
125
+
(** Heading elements (h1-h6). *)
126
+
type heading_tag = [ `H1 | `H2 | `H3 | `H4 | `H5 | `H6 ]
127
+
128
+
(** Sectioning content elements that establish document sections. *)
129
+
type sectioning_tag = [ `Article | `Aside | `Nav | `Section ]
130
+
131
+
(** Sectioning roots that establish their own outline context. *)
132
+
type sectioning_root_tag = [
133
+
| `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td
134
+
]
135
+
136
+
(** Embedded content elements. *)
137
+
type embedded_tag = [
138
+
| `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video
139
+
]
140
+
141
+
(** Interactive content elements (focusable/activatable). *)
142
+
type interactive_tag = [
143
+
| `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img
144
+
| `Input | `Label | `Select | `Textarea | `Video
145
+
]
146
+
147
+
(* NOTE(review): the HTML Standard's form-associated category is button,
   fieldset, input, object, output, select, textarea and img; this list
   adds label, meter and progress and omits img - confirm that the
   difference is intentional. *)
(** Form-associated elements that can belong to a form. *)
148
+
type form_associated_tag = [
149
+
| `Button | `Fieldset | `Input | `Label | `Object | `Output
150
+
| `Select | `Textarea | `Meter | `Progress
151
+
]
152
+
153
+
(** Labelable elements that can be associated with a label. *)
154
+
type labelable_tag = [
155
+
| `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea
156
+
]
157
+
158
+
(** Submittable form elements. *)
159
+
type submittable_tag = [
160
+
| `Button | `Input | `Select | `Textarea
161
+
]
162
+
163
+
(** Resettable form elements. *)
164
+
type resettable_tag = [
165
+
| `Input | `Output | `Select | `Textarea
166
+
]
167
+
168
+
(** Table-related elements. *)
169
+
type table_tag = [
170
+
| `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
171
+
| `Th | `Thead | `Tr
172
+
]
173
+
174
+
(** Media elements (audio and video). *)
175
+
type media_tag = [ `Audio | `Video ]
176
+
177
+
(** List container elements. *)
178
+
type list_container_tag = [ `Ul | `Ol | `Menu | `Dl ]
179
+
180
+
(** List item elements. *)
181
+
type list_item_tag = [ `Li | `Dd | `Dt ]
182
+
183
+
(** Script-supporting elements. *)
184
+
type script_supporting_tag = [ `Script | `Template ]
185
+
186
+
(** Metadata content elements. *)
187
+
type metadata_tag = [ `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title ]
188
+
189
+
(** {1 Top-Level Element Type} *)
190
+
191
+
(** Top-level element classification.
192
+
193
+
Elements are classified by namespace and recognition status:
194
+
- [Html tag] - A known HTML element, standard or obsolete (see {!html_tag})
195
+
- [Svg name] - An SVG element (preserves original case)
196
+
- [MathML name] - A MathML element (preserves original case)
197
+
- [Custom name] - A custom element (contains hyphen)
198
+
- [Unknown name] - An unrecognized element *)
199
+
type element_tag =
200
+
| Html of html_tag
201
+
| Svg of string
202
+
| MathML of string
203
+
| Custom of string
204
+
| Unknown of string
205
+
206
+
(** {1 Namespace Constants} *)
207
+
208
+
val svg_namespace : string
209
+
(** The SVG namespace URI: ["http://www.w3.org/2000/svg"]. *)
210
+
211
+
val mathml_namespace : string
212
+
(** The MathML namespace URI: ["http://www.w3.org/1998/Math/MathML"]. *)
213
+
214
+
(** {1 Conversion Functions} *)
215
+
216
+
val html_tag_of_string_opt : string -> html_tag option
217
+
(** [html_tag_of_string_opt name] converts a lowercase tag name to an [html_tag].
218
+
219
+
@param name The lowercase tag name (e.g., ["div"], ["span"])
220
+
@return [Some tag] if recognized, [None] otherwise
221
+
222
+
{b Example:}
223
+
{[
224
+
html_tag_of_string_opt "div" (* Some `Div *)
225
+
html_tag_of_string_opt "xyz" (* None *)
226
+
]} *)
227
+
228
+
val is_custom_element_name : string -> bool
229
+
(** [is_custom_element_name name] checks if a name is a valid custom element name.
230
+
231
+
A valid custom element name must contain a hyphen and not be reserved
232
+
(e.g., it must not start with ["xml"] or be ["annotation-xml"]).
233
+
234
+
@param name The element name to check
235
+
@return [true] if the name is a valid custom element name *)
236
+
237
+
val is_svg_namespace : string -> bool
238
+
(** [is_svg_namespace ns] checks if a namespace string represents SVG.
239
+
240
+
Accepts both the short form ["svg"] and the full URI. *)
241
+
242
+
val is_mathml_namespace : string -> bool
243
+
(** [is_mathml_namespace ns] checks if a namespace string represents MathML.
244
+
245
+
Accepts both the short form ["mathml"] and the full URI. *)
246
+
247
+
val tag_of_string : ?namespace:string -> string -> element_tag
248
+
(** [tag_of_string ?namespace name] converts a tag name to an [element_tag].
249
+
250
+
@param namespace Optional namespace URI or short form
251
+
@param name The element name
252
+
@return The classified element tag
253
+
254
+
{b Example:}
255
+
{[
256
+
tag_of_string "div" (* Html `Div *)
257
+
tag_of_string ~namespace:"svg" "circle" (* Svg "circle" *)
258
+
tag_of_string "my-component" (* Custom "my-component" *)
259
+
tag_of_string "xyz" (* Unknown "xyz" *)
260
+
]} *)
261
+
262
+
val html_tag_to_string : html_tag -> string
263
+
(** [html_tag_to_string tag] converts an [html_tag] to its lowercase string name.
264
+
265
+
@param tag The HTML tag variant
266
+
@return The lowercase tag name (e.g., ["div"], ["span"]) *)
267
+
268
+
val tag_to_string : element_tag -> string
269
+
(** [tag_to_string tag] converts any [element_tag] to its string name.
270
+
271
+
@param tag The element tag
272
+
@return The tag name (lowercase for HTML, original case for SVG/MathML) *)
273
+
274
+
(** {1 Category Predicates} *)
275
+
276
+
val is_void : html_tag -> bool
277
+
(** [is_void tag] checks if an element is a void element (cannot have children).
278
+
279
+
@param tag The HTML tag to check
280
+
@return [true] if the element is void (br, hr, img, input, etc.) *)
281
+
282
+
val is_heading : html_tag -> bool
283
+
(** [is_heading tag] checks if an element is a heading element.
284
+
285
+
@param tag The HTML tag to check
286
+
@return [true] if the element is h1-h6 *)
287
+
288
+
val heading_level : html_tag -> int option
289
+
(** [heading_level tag] gets the heading level (1-6) if applicable.
290
+
291
+
@param tag The HTML tag to check
292
+
@return [Some level] for h1-h6, [None] for other elements *)
293
+
294
+
val is_sectioning : html_tag -> bool
295
+
(** [is_sectioning tag] checks if an element is sectioning content.
296
+
297
+
@param tag The HTML tag to check
298
+
@return [true] if the element is article, aside, nav, or section *)
299
+
300
+
val is_sectioning_root : html_tag -> bool
301
+
(** [is_sectioning_root tag] checks if an element is a sectioning root.
302
+
303
+
Sectioning roots establish their own outline context.
304
+
305
+
@param tag The HTML tag to check
306
+
@return [true] if the element is blockquote, body, details, dialog,
307
+
fieldset, figure, or td *)
308
+
309
+
val is_embedded : html_tag -> bool
310
+
(** [is_embedded tag] checks if an element is embedded content.
311
+
312
+
@param tag The HTML tag to check
313
+
@return [true] if the element is audio, canvas, embed, iframe, img,
314
+
object, picture, or video *)
315
+
316
+
val is_interactive : html_tag -> bool
317
+
(** [is_interactive tag] checks if an element is interactive content.
318
+
319
+
@param tag The HTML tag to check
320
+
@return [true] if the element is focusable or activatable *)
321
+
322
+
val is_form_associated : html_tag -> bool
323
+
(** [is_form_associated tag] checks if an element is form-associated.
324
+
325
+
@param tag The HTML tag to check
326
+
@return [true] if the element can belong to a form *)
327
+
328
+
val is_labelable : html_tag -> bool
329
+
(** [is_labelable tag] checks if an element can be associated with a label.
330
+
331
+
@param tag The HTML tag to check
332
+
@return [true] if the element is labelable *)
333
+
334
+
val is_submittable : html_tag -> bool
335
+
(** [is_submittable tag] checks if an element is a submittable form element.
336
+
337
+
@param tag The HTML tag to check
338
+
@return [true] if the element is button, input, select, or textarea *)
339
+
340
+
val is_resettable : html_tag -> bool
341
+
(** [is_resettable tag] checks if an element is a resettable form element.
342
+
343
+
@param tag The HTML tag to check
344
+
@return [true] if the element is input, output, select, or textarea *)
345
+
346
+
val is_transparent : html_tag -> bool
347
+
(** [is_transparent tag] checks if an element has a transparent content model.
348
+
349
+
Transparent elements inherit their content model from their parent.
350
+
351
+
@param tag The HTML tag to check
352
+
@return [true] if the element is transparent (a, abbr, audio, canvas, etc.) *)
353
+
354
+
val is_script_supporting : html_tag -> bool
355
+
(** [is_script_supporting tag] checks if an element is script-supporting.
356
+
357
+
@param tag The HTML tag to check
358
+
@return [true] if the element is script or template *)
359
+
360
+
val is_table_element : html_tag -> bool
361
+
(** [is_table_element tag] checks if an element is a table-related element.
362
+
363
+
@param tag The HTML tag to check
364
+
@return [true] if the element is table, tr, td, th, etc. *)
365
+
366
+
val is_media : html_tag -> bool
367
+
(** [is_media tag] checks if an element is a media element.
368
+
369
+
@param tag The HTML tag to check
370
+
@return [true] if the element is audio or video *)
371
+
372
+
val is_list_container : html_tag -> bool
373
+
(** [is_list_container tag] checks if an element is a list container.
374
+
375
+
@param tag The HTML tag to check
376
+
@return [true] if the element is ul, ol, menu, or dl *)
377
+
378
+
val is_list_item : html_tag -> bool
379
+
(** [is_list_item tag] checks if an element is a list item.
380
+
381
+
@param tag The HTML tag to check
382
+
@return [true] if the element is li, dd, or dt *)
383
+
384
+
val is_metadata : html_tag -> bool
385
+
(** [is_metadata tag] checks if an element is metadata content.
386
+
387
+
@param tag The HTML tag to check
388
+
@return [true] if the element is base, link, meta, etc. *)
389
+
390
+
val is_obsolete : html_tag -> bool
391
+
(** [is_obsolete tag] checks if an element is deprecated/obsolete.
392
+
393
+
@param tag The HTML tag to check
394
+
@return [true] if the element is applet, font, marquee, etc. *)
395
+
396
+
val is_raw_text : html_tag -> bool
397
+
(** [is_raw_text tag] checks if an element is a raw text element.
398
+
399
+
Raw text elements contain unparsed text content.
400
+
401
+
@param tag The HTML tag to check
402
+
@return [true] if the element is script or style *)
403
+
404
+
val is_escapable_raw_text : html_tag -> bool
405
+
(** [is_escapable_raw_text tag] checks if an element is escapable raw text.
406
+
407
+
@param tag The HTML tag to check
408
+
@return [true] if the element is textarea or title *)
409
+
410
+
val is_phrasing : html_tag -> bool
411
+
(** [is_phrasing tag] checks if an element is phrasing content.
412
+
413
+
Phrasing content is inline-level content that forms paragraphs.
414
+
415
+
@param tag The HTML tag to check
416
+
@return [true] if the element is phrasing content *)
417
+
418
+
val is_flow : html_tag -> bool
419
+
(** [is_flow tag] checks if an element is flow content.
420
+
421
+
Flow content is most elements that can appear in the body.
422
+
423
+
@param tag The HTML tag to check
424
+
@return [true] if the element is flow content *)
425
+
426
+
(** {1 Pattern Matching Helpers} *)
427
+
428
+
val as_html_tag : element_tag -> html_tag option
429
+
(** [as_html_tag tag] extracts the HTML tag if present.
430
+
431
+
@param tag The element tag
432
+
@return [Some html_tag] if [tag] is [Html html_tag], [None] otherwise *)
433
+
434
+
val is_html_tag : html_tag -> element_tag -> bool
435
+
(** [is_html_tag expected tag] checks if [tag] matches the expected HTML tag.
436
+
437
+
@param expected The expected HTML tag variant
438
+
@param tag The element tag to check
439
+
@return [true] if [tag] is [Html expected] *)
+5
lib/htmlrw_check/error_code.ml
+5
lib/htmlrw_check/error_code.ml
···
119
119
| `For_id_mismatch
120
120
| `Role_on_ancestor
121
121
| `Role_on_for
122
+
| `Aria_label_on_ancestor
122
123
| `Aria_label_on_for
123
124
]
124
125
···
309
310
| `Label `For_id_mismatch -> "label-for-mismatch"
310
311
| `Label `Role_on_ancestor -> "role-on-label"
311
312
| `Label `Role_on_for -> "role-on-label"
313
+
| `Label `Aria_label_on_ancestor -> "aria-label-on-label"
312
314
| `Label `Aria_label_on_for -> "aria-label-on-label"
313
315
314
316
(* Input errors *)
···
624
626
| `Label `Role_on_for ->
625
627
Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element."
626
628
(q "role") (q "label")
629
+
| `Label `Aria_label_on_ancestor ->
630
+
Printf.sprintf "The %s attribute must not be used on any %s element that is an ancestor of a labelable element."
631
+
(q "aria-label") (q "label")
627
632
| `Label `Aria_label_on_for ->
628
633
Printf.sprintf "The %s attribute must not be used on any %s element that is associated with a labelable element."
629
634
(q "aria-label") (q "label")
+5
lib/htmlrw_check/error_code.mli
+5
lib/htmlrw_check/error_code.mli
···
527
527
Adding [role] to a label that wraps a form control
528
528
breaks the implicit label association. *)
529
529
530
+
| `Aria_label_on_ancestor
531
+
(** [<label>] with [aria-label] is an ancestor of a labelable element.
532
+
[aria-label] on a label that wraps a form control creates
533
+
conflicting accessible names. *)
534
+
530
535
| `Role_on_for
531
536
(** [<label>] with role uses [for] association.
532
537
Labels with explicit [for] association must not have [role]. *)
+31
lib/htmlrw_check/semantic/autofocus_checker.mli
+31
lib/htmlrw_check/semantic/autofocus_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Autofocus attribute validation checker.
7
+
8
+
This checker validates that at most one element with the [autofocus] attribute
9
+
exists within each dialog or popover context. HTML5 specifies that there
10
+
should be at most one autofocused element per autofocus scope.
11
+
12
+
{2 Validation Rules}
13
+
14
+
- Within each dialog element, only one descendant may have [autofocus]
15
+
- Within each popover element, only one descendant may have [autofocus]
16
+
- Nested dialogs and popovers create separate scopes
17
+
18
+
{2 Error Messages}
19
+
20
+
Reports [Multiple_autofocus] when more than one autofocus attribute is
21
+
found within the same scope.
22
+
23
+
@see <https://html.spec.whatwg.org/multipage/interaction.html#the-autofocus-attribute>
24
+
HTML Standard: The autofocus attribute
25
+
*)
26
+
27
+
val checker : Checker.t
28
+
(** The autofocus checker instance.
29
+
30
+
This checker can be registered with the checker registry and will be
31
+
invoked during DOM traversal to validate autofocus attribute usage. *)
+2
-19
lib/htmlrw_check/semantic/lang_detecting_checker.ml
+2
-19
lib/htmlrw_check/semantic/lang_detecting_checker.ml
···
6
6
type state = {
7
7
mutable html_lang : string option;
8
8
mutable html_dir : string option;
9
-
mutable html_locator : (int * int) option; (* line, column *)
10
9
mutable in_body : bool;
11
10
mutable skip_depth : int; (* depth in elements to skip *)
12
11
mutable foreign_depth : int; (* depth in SVG/MathML content to skip *)
13
-
mutable text_buffer : Buffer.t;
12
+
text_buffer : Buffer.t; (* buffer contents are mutated, not the field itself *)
14
13
mutable char_count : int;
15
14
}
16
15
···
30
29
let create () = {
31
30
html_lang = None;
32
31
html_dir = None;
33
-
html_locator = None;
34
32
in_body = false;
35
33
skip_depth = 0;
36
34
foreign_depth = 0;
···
41
39
let reset state =
42
40
state.html_lang <- None;
43
41
state.html_dir <- None;
44
-
state.html_locator <- None;
45
42
state.in_body <- false;
46
43
state.skip_depth <- 0;
47
44
state.foreign_depth <- 0;
48
45
Buffer.clear state.text_buffer;
49
46
state.char_count <- 0
50
-
51
-
(* Namespaces to skip for language detection *)
52
-
let svg_namespace = "http://www.w3.org/2000/svg"
53
-
let mathml_namespace = "http://www.w3.org/1998/Math/MathML"
54
-
55
-
let is_foreign_namespace ns =
56
-
ns = svg_namespace || ns = mathml_namespace
57
-
58
-
(* Element names that start foreign content (for when namespace isn't set) *)
59
-
let is_foreign_element name =
60
-
let n = String.lowercase_ascii name in
61
-
n = "svg" || n = "math"
62
47
63
48
let get_lang_code lang =
64
49
(* Extract primary language subtag *)
···
221
206
match element.tag with
222
207
| Tag.Html `Html ->
223
208
state.html_lang <- Attr_utils.get_attr "lang" attrs;
224
-
state.html_dir <- Attr_utils.get_attr "dir" attrs;
225
-
(* TODO: get line/column from locator *)
226
-
state.html_locator <- Some (1, 1)
209
+
state.html_dir <- Attr_utils.get_attr "dir" attrs
227
210
| Tag.Html `Body ->
228
211
state.in_body <- true
229
212
| Tag.Svg _ | Tag.MathML _ ->
+41
lib/htmlrw_check/semantic/lang_detecting_checker.mli
+41
lib/htmlrw_check/semantic/lang_detecting_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Language detection and validation checker.
7
+
8
+
This checker validates that the document's [lang] attribute matches the
9
+
detected language of the content, and that the [dir] attribute is correct
10
+
for right-to-left (RTL) languages.
11
+
12
+
{2 Detection Algorithm}
13
+
14
+
The checker:
15
+
1. Collects text content from the document body (up to 30720 characters)
16
+
2. Skips text from certain elements (scripts, navigation, form controls)
17
+
3. Skips foreign namespace content (SVG, MathML)
18
+
4. Uses statistical language detection with >90% confidence threshold
19
+
5. Handles Traditional vs Simplified Chinese detection
20
+
21
+
{2 Validation Rules}
22
+
23
+
- Documents should have a [lang] attribute on the [<html>] element
24
+
- The declared language should match the detected content language
25
+
- RTL languages (Arabic, Hebrew, Persian, Urdu, etc.) should have [dir="rtl"]
26
+
27
+
{2 Error Messages}
28
+
29
+
- [Wrong_lang]: The declared language doesn't match detected content
30
+
- [Missing_dir_rtl]: An RTL language is detected but no [dir] attribute is present
31
+
- [Wrong_dir]: The [dir] attribute doesn't match the detected RTL language
32
+
33
+
@see <https://html.spec.whatwg.org/multipage/dom.html#the-lang-and-xml:lang-attributes>
34
+
HTML Standard: The lang attribute
35
+
*)
36
+
37
+
val checker : Checker.t
38
+
(** The language detection checker instance.
39
+
40
+
This checker collects text during DOM traversal and performs language
41
+
detection at document end. *)
+2
-1
lib/htmlrw_check/semantic/option_checker.ml
+2
-1
lib/htmlrw_check/semantic/option_checker.ml
···
49
49
(match state.option_stack with
50
50
| ctx :: rest ->
51
51
state.option_stack <- rest;
52
-
if not ctx.has_text && not ctx.has_label then
52
+
(* Empty label attribute doesn't count as a valid label *)
53
+
if not ctx.has_text && (not ctx.has_label || ctx.label_empty) then
53
54
Message_collector.add_typed collector (`Misc `Option_empty_without_label)
54
55
| [] -> ())
55
56
| _ -> ()
+32
lib/htmlrw_check/semantic/option_checker.mli
+32
lib/htmlrw_check/semantic/option_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Option element validation checker.
7
+
8
+
This checker validates that [<option>] elements have proper content or
9
+
a [label] attribute. Empty options without labels can be confusing for
10
+
users, especially those using assistive technologies.
11
+
12
+
{2 Validation Rules}
13
+
14
+
- An [<option>] element must have either:
15
+
- Non-whitespace text content, OR
16
+
- A non-empty [label] attribute
17
+
- Empty [label] attribute values are reported as errors
18
+
- Options inside [<template>] elements are not checked
19
+
20
+
{2 Error Messages}
21
+
22
+
- [Option_empty_without_label]: Option has no text and no non-empty [label] attribute
23
+
- [Bad_value] for label: The label attribute value is empty
24
+
25
+
@see <https://html.spec.whatwg.org/multipage/form-elements.html#the-option-element>
26
+
HTML Standard: The option element
27
+
*)
28
+
29
+
val checker : Checker.t
30
+
(** The option element checker instance.
31
+
32
+
This checker validates option elements during DOM traversal. *)
+31
lib/htmlrw_check/specialized/attr_restrictions_checker.mli
+31
lib/htmlrw_check/specialized/attr_restrictions_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Attribute restrictions checker.
7
+
8
+
This checker validates that certain attributes are not used on elements
9
+
where they are not allowed. It catches common misuses such as:
10
+
11
+
- RDFa-style [href] on elements like [<img>], [<p>], [<div>]
12
+
- [src] or [media] on [<a>] elements
13
+
- [srcset] on [<audio>], [<video>] and [<object>] elements
14
+
15
+
{2 Validation Rules}
16
+
17
+
The checker maintains a list of (element, disallowed_attributes) pairs
18
+
for both HTML and SVG elements. When an element is encountered with
19
+
a disallowed attribute, an error is reported.
20
+
21
+
{2 Error Messages}
22
+
23
+
Reports [Not_allowed] when an attribute is used on an element where
24
+
it is not permitted.
25
+
26
+
@see <https://html.spec.whatwg.org/multipage/dom.html#element-definitions>
27
+
HTML Standard: Element definitions
28
+
*)
29
+
30
+
val checker : Checker.t
31
+
(** The attribute restrictions checker instance. *)
+28
lib/htmlrw_check/specialized/base_checker.mli
+28
lib/htmlrw_check/specialized/base_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Base element ordering checker.
7
+
8
+
This checker validates that the [<base>] element appears before any
9
+
elements that may use URLs resolved against the base URL. Specifically,
10
+
[<base>] should appear before [<link>] and [<script>] elements.
11
+
12
+
{2 Validation Rules}
13
+
14
+
- [<base>] must appear before any [<link>] elements
15
+
- [<base>] must appear before any [<script>] elements
16
+
- The order is significant for URL resolution in the document
17
+
18
+
{2 Error Messages}
19
+
20
+
Reports [Base_after_link_script] when a [<base>] element is found
21
+
after [<link>] or [<script>] elements.
22
+
23
+
@see <https://html.spec.whatwg.org/multipage/semantics.html#the-base-element>
24
+
HTML Standard: The base element
25
+
*)
26
+
27
+
val checker : Checker.t
28
+
(** The base element ordering checker instance. *)
-3
lib/htmlrw_check/specialized/datetime_checker.ml
-3
lib/htmlrw_check/specialized/datetime_checker.ml
+43
lib/htmlrw_check/specialized/datetime_checker.mli
+43
lib/htmlrw_check/specialized/datetime_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Datetime attribute validation checker.
7
+
8
+
This checker validates the [datetime] attribute on [<del>], [<ins>],
9
+
and [<time>] elements. The datetime value must conform to a valid
10
+
date, time, or datetime format as specified by HTML5.
11
+
12
+
{2 Supported Formats}
13
+
14
+
The checker validates these datetime formats:
15
+
- Date: [YYYY-MM-DD] (e.g., "2025-12-19")
16
+
- Month: [YYYY-MM] (e.g., "2025-12")
17
+
- Year: [YYYY] (e.g., "2025")
18
+
- Week: [YYYY-Www] (e.g., "2025-W51")
19
+
- Time: [HH:MM] or [HH:MM:SS] (e.g., "14:30:00")
20
+
- Datetime: Date followed by time with separator (e.g., "2025-12-19T14:30")
21
+
- Timezone offsets: [+HH:MM] or [-HH:MM] or [Z]
22
+
- Duration: [P] prefix followed by duration components
23
+
24
+
{2 Validation Rules}
25
+
26
+
- Month values must be 01-12
27
+
- Day values must be valid for the given month
28
+
- Leap years are correctly handled for February 29th
29
+
- Hour values must be 00-23
30
+
- Minute and second values must be 00-59
31
+
- Week numbers must be 01-53
32
+
33
+
{2 Error Messages}
34
+
35
+
Reports [Bad_value] when the datetime attribute contains an invalid
36
+
format or out-of-range values.
37
+
38
+
@see <https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#dates-and-times>
39
+
HTML Standard: Dates and times
40
+
*)
41
+
42
+
val checker : Checker.t
43
+
(** The datetime attribute checker instance. *)
+37
lib/htmlrw_check/specialized/dl_checker.mli
+37
lib/htmlrw_check/specialized/dl_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** DL element content model validation checker.
7
+
8
+
This checker validates that [<dl>] (description list) elements follow
9
+
the HTML5 content model requirements. Description lists must contain
10
+
[<dt>] (term) and [<dd>] (description) elements in the correct order.
11
+
12
+
{2 Content Model}
13
+
14
+
A [<dl>] element may contain:
15
+
- Zero or more groups, each consisting of one or more [<dt>] elements followed by one or more [<dd>] elements
16
+
- [<div>] elements wrapping [<dt>]/[<dd>] groups (for styling)
17
+
- [<template>] and [<script>] elements (script-supporting)
18
+
19
+
{2 Validation Rules}
20
+
21
+
- [<dd>] should not appear before any [<dt>] (terms should come first)
22
+
- [<dl>] should not be empty (should contain at least one term/description)
23
+
- When using [<div>] wrappers, mixing wrapped and unwrapped content
24
+
is discouraged
25
+
- Each [<div>] in a [<dl>] should contain at least one [<dt>]/[<dd>] group
26
+
27
+
{2 Error Messages}
28
+
29
+
- [Dl_empty]: The [<dl>] element has no content
30
+
- [Dd_before_dt]: A [<dd>] appears before any [<dt>] element
31
+
32
+
@see <https://html.spec.whatwg.org/multipage/grouping-content.html#the-dl-element>
33
+
HTML Standard: The dl element
34
+
*)
35
+
36
+
val checker : Checker.t
37
+
(** The description list content model checker instance. *)
+35
lib/htmlrw_check/specialized/h1_checker.mli
+35
lib/htmlrw_check/specialized/h1_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** H1 element counter and validator.
7
+
8
+
This checker warns about multiple [<h1>] elements in a document.
9
+
While HTML5 technically allows multiple [<h1>] elements when using
10
+
the document outline algorithm, this algorithm was never implemented
11
+
by browsers and has been removed from the specification.
12
+
13
+
{2 Best Practice}
14
+
15
+
Documents should have exactly one [<h1>] element that represents the
16
+
main heading of the page. Multiple [<h1>] elements can confuse users
17
+
and assistive technologies about the document's structure.
18
+
19
+
{2 Special Cases}
20
+
21
+
- [<h1>] elements inside [<svg>] content (e.g., in [<foreignObject>])
22
+
are not counted, as they may represent different content contexts
23
+
- The checker reports a warning after the second [<h1>] is encountered
24
+
25
+
{2 Error Messages}
26
+
27
+
Reports [Multiple_h1] when more than one [<h1>] element is found
28
+
in the document.
29
+
30
+
@see <https://html.spec.whatwg.org/multipage/sections.html#the-h1,-h2,-h3,-h4,-h5,-and-h6-elements>
31
+
HTML Standard: The h1-h6 elements
32
+
*)
33
+
34
+
val checker : Checker.t
35
+
(** The h1 element counter/validator instance. *)
+2
lib/htmlrw_check/specialized/label_checker.ml
+2
lib/htmlrw_check/specialized/label_checker.ml
···
110
110
| Tag.Html `Label when state.label_depth = 0 ->
111
111
if state.label_has_role && state.labelable_count > 0 then
112
112
Message_collector.add_typed collector (`Label `Role_on_ancestor);
113
+
if state.label_has_aria_label && state.labelable_count > 0 then
114
+
Message_collector.add_typed collector (`Label `Aria_label_on_ancestor);
113
115
state.in_label <- false;
114
116
state.labelable_count <- 0;
115
117
state.label_for_value <- None;
+41
lib/htmlrw_check/specialized/label_checker.mli
+41
lib/htmlrw_check/specialized/label_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Label element content model validation checker.
7
+
8
+
This checker validates that [<label>] elements follow the HTML5
9
+
content model requirements. Labels associate text with form controls
10
+
and must be used correctly for accessibility.
11
+
12
+
{2 Validation Rules}
13
+
14
+
- A [<label>] element may contain at most one labelable element
15
+
(button, input, meter, output, progress, select, textarea)
16
+
- When using the [for] attribute, it should reference an existing
17
+
element ID in the document
18
+
- Nested labelable elements are not counted (only direct descendants)
19
+
20
+
{2 Labelable Elements}
21
+
22
+
The following elements can be labeled:
23
+
- [<button>]
24
+
- [<input>] (except type="hidden")
25
+
- [<meter>]
26
+
- [<output>]
27
+
- [<progress>]
28
+
- [<select>]
29
+
- [<textarea>]
30
+
31
+
{2 Error Messages}
32
+
33
+
- Multiple labelable elements inside a single [<label>]
34
+
- [for] attribute references a non-existent ID
35
+
36
+
@see <https://html.spec.whatwg.org/multipage/forms.html#the-label-element>
37
+
HTML Standard: The label element
38
+
*)
39
+
40
+
val checker : Checker.t
41
+
(** The label element content model checker instance. *)
+42
lib/htmlrw_check/specialized/picture_checker.mli
+42
lib/htmlrw_check/specialized/picture_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Picture element content model and attribute validation checker.
7
+
8
+
This checker validates that [<picture>] elements follow the HTML5
9
+
content model requirements and that attributes are used correctly.
10
+
11
+
{2 Content Model}
12
+
13
+
A [<picture>] element may contain:
14
+
- Zero or more [<source>] elements (must come before [<img>])
15
+
- Exactly one [<img>] element (required)
16
+
- [<script>] and [<template>] elements (script-supporting)
17
+
18
+
{2 Attribute Restrictions}
19
+
20
+
The [<picture>] element should not have image-related attributes
21
+
directly on it (these belong on the [<img>] child):
22
+
- [src], [srcset], [sizes], [alt], [width], [height]
23
+
- [crossorigin], [loading], [decoding]
24
+
- Legacy attributes like [align], [border], [hspace], etc.
25
+
26
+
{2 Source Restrictions in Picture}
27
+
28
+
When [<source>] is a child of [<picture>]:
29
+
- It must have [srcset] attribute (required)
30
+
- It should not have [src] attribute
31
+
32
+
{2 Error Messages}
33
+
34
+
- Disallowed attributes on [<picture>] or [<source>] in picture context
35
+
- Invalid parent elements for [<picture>]
36
+
37
+
@see <https://html.spec.whatwg.org/multipage/embedded-content.html#the-picture-element>
38
+
HTML Standard: The picture element
39
+
*)
40
+
41
+
val checker : Checker.t
42
+
(** The picture element checker instance. *)
+36
lib/htmlrw_check/specialized/ruby_checker.mli
+36
lib/htmlrw_check/specialized/ruby_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Ruby element content model validation checker.
7
+
8
+
This checker validates that [<ruby>] elements follow the HTML5
9
+
content model requirements. Ruby annotations are used for East Asian
10
+
typography to show pronunciation or meaning of characters.
11
+
12
+
{2 Content Model}
13
+
14
+
A [<ruby>] element must contain:
15
+
- Phrasing content (the base text)
16
+
- One or more [<rt>] elements (the ruby text/annotation)
17
+
- Optional [<rp>] elements (fallback parentheses)
18
+
19
+
{2 Validation Rules}
20
+
21
+
- [<ruby>] must contain at least one [<rt>] element
22
+
- There should be phrasing content before the first [<rt>]
23
+
- [<rp>] elements should surround [<rt>] for fallback rendering
24
+
- Nested [<ruby>] elements are handled correctly
25
+
26
+
{2 Error Messages}
27
+
28
+
- Ruby element without any [<rt>] child
29
+
- Missing base text before ruby annotation
30
+
31
+
@see <https://html.spec.whatwg.org/multipage/text-level-semantics.html#the-ruby-element>
32
+
HTML Standard: The ruby element
33
+
*)
34
+
35
+
val checker : Checker.t
36
+
(** The ruby element content model checker instance. *)
+34
lib/htmlrw_check/specialized/source_checker.mli
+34
lib/htmlrw_check/specialized/source_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Source element context validation checker.
7
+
8
+
This checker validates that [<source>] element attributes are appropriate
9
+
for the parent context. The allowed attributes differ based on whether
10
+
the source is inside [<picture>], [<video>], or [<audio>].
11
+
12
+
{2 Context-Dependent Rules}
13
+
14
+
In [<picture>] context:
15
+
- [srcset] is required
16
+
- [src] is not allowed
17
+
- [media] and [type] are allowed
18
+
19
+
In [<video>] or [<audio>] context:
20
+
- [src] is required
21
+
- [srcset] and [sizes] are not allowed
22
+
- [type] is allowed for MIME type hints
23
+
24
+
{2 Error Messages}
25
+
26
+
- Missing required attributes for the context
27
+
- Attributes not allowed in the current context
28
+
29
+
@see <https://html.spec.whatwg.org/multipage/embedded-content.html#the-source-element>
30
+
HTML Standard: The source element
31
+
*)
32
+
33
+
val checker : Checker.t
34
+
(** The source element context checker instance. *)
-15
lib/htmlrw_check/specialized/srcset_sizes_checker.ml
-15
lib/htmlrw_check/specialized/srcset_sizes_checker.ml
···
61
61
let split_on_space_respecting_parens s =
62
62
split_respecting_parens ~sep:' ' s |> List.filter (fun s -> s <> "")
63
63
64
-
(** Check if string contains only whitespace *)
65
-
let is_whitespace_only s =
66
-
String.for_all (fun c -> c = ' ' || c = '\t' || c = '\n' || c = '\r') s
67
-
68
64
(** Invalid units that are not CSS lengths but might be confused for them *)
69
65
let invalid_size_units = [
70
66
"deg"; "grad"; "rad"; "turn"; (* angle units *)
···
154
150
NoCommentError
155
151
end
156
152
end
157
-
158
-
(** For backward compatibility *)
159
-
let has_invalid_css_comment s =
160
-
match check_css_comment_position s with
161
-
| NoCommentError -> false
162
-
| _ -> true
163
153
164
154
(** Check if scientific notation has invalid exponent (like 1e+1.5 - decimal in exponent) *)
165
155
let has_invalid_scientific_notation s =
···
280
270
end
281
271
end
282
272
end
283
-
284
-
let has_valid_size_unit size_value =
285
-
match check_size_value size_value with
286
-
| Valid -> true
287
-
| InvalidUnit (_, _) | NegativeValue | CssCommentAfterSign (_, _) | CssCommentBeforeUnit (_, _) | BadScientificNotation | BadCssNumber (_, _) -> false
288
273
289
274
(** Check if a sizes entry has a media condition (starts with '(') *)
290
275
let has_media_condition entry =
+50
lib/htmlrw_check/specialized/srcset_sizes_checker.mli
+50
lib/htmlrw_check/specialized/srcset_sizes_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Srcset and sizes attribute validation checker.
7
+
8
+
This checker validates the [srcset] and [sizes] attributes on [<img>]
9
+
and [<source>] elements. These attributes use a specialized microsyntax
10
+
for responsive images.
11
+
12
+
{2 Srcset Syntax}
13
+
14
+
The [srcset] attribute contains a comma-separated list of image
15
+
candidates, each with:
16
+
- A URL
17
+
- An optional width descriptor ([Nw], e.g., "800w")
18
+
- Or an optional pixel density descriptor ([Nx], e.g., "2x")
19
+
20
+
Width and pixel density descriptors cannot be mixed in the same srcset.
21
+
22
+
{2 Sizes Syntax}
23
+
24
+
The [sizes] attribute contains a comma-separated list of:
25
+
- Media conditions (optional)
26
+
- Source sizes (CSS lengths)
27
+
28
+
The last entry should not have a media condition (it's the default).
29
+
30
+
{2 Validation Rules}
31
+
32
+
- URLs in srcset must be valid
33
+
- Width descriptors must be positive integers
34
+
- Pixel density descriptors must be positive numbers
35
+
- Sizes must use valid CSS length units
36
+
- Duplicate descriptors are flagged
37
+
38
+
{2 Error Messages}
39
+
40
+
- Invalid srcset syntax
41
+
- Invalid sizes syntax
42
+
- Missing sizes when srcset uses width descriptors
43
+
- Invalid CSS length units
44
+
45
+
@see <https://html.spec.whatwg.org/multipage/images.html#srcset-attributes>
46
+
HTML Standard: Srcset attributes
47
+
*)
48
+
49
+
val checker : Checker.t
50
+
(** The srcset/sizes attribute checker instance. *)
+1
-4
lib/htmlrw_check/specialized/title_checker.ml
+1
-4
lib/htmlrw_check/specialized/title_checker.ml
···
6
6
mutable in_title : bool;
7
7
mutable title_has_content : bool;
8
8
mutable title_depth : int;
9
-
mutable is_iframe_srcdoc : bool;
10
9
}
11
10
12
11
let create () = {
···
15
14
in_title = false;
16
15
title_has_content = false;
17
16
title_depth = 0;
18
-
is_iframe_srcdoc = false;
19
17
}
20
18
21
19
let reset state =
···
23
21
state.has_title <- false;
24
22
state.in_title <- false;
25
23
state.title_has_content <- false;
26
-
state.title_depth <- 0;
27
-
state.is_iframe_srcdoc <- false
24
+
state.title_depth <- 0
28
25
29
26
let start_element state ~element _collector =
30
27
(match element.Element.tag with
+28
lib/htmlrw_check/specialized/title_checker.mli
+28
lib/htmlrw_check/specialized/title_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Title element validation checker.
7
+
8
+
This checker validates that documents have a proper [<title>] element
9
+
with meaningful content. The title is important for accessibility,
10
+
SEO, and browser tab identification.
11
+
12
+
{2 Validation Rules}
13
+
14
+
- Documents should have exactly one [<title>] element in the [<head>]
15
+
- The [<title>] element should contain non-whitespace text
16
+
- Empty titles are flagged as errors
17
+
18
+
{2 Error Messages}
19
+
20
+
- [Empty_title]: The title element is empty or contains only whitespace
21
+
- [Missing_title]: No title element found in the document head
22
+
23
+
@see <https://html.spec.whatwg.org/multipage/semantics.html#the-title-element>
24
+
HTML Standard: The title element
25
+
*)
26
+
27
+
val checker : Checker.t
28
+
(** The title element checker instance. *)
+4
-55
lib/htmlrw_check/specialized/unknown_element_checker.ml
+4
-55
lib/htmlrw_check/specialized/unknown_element_checker.ml
···
1
1
(** Unknown HTML element checker.
2
2
3
3
Detects elements that are not in the HTML5 specification and produces
4
-
appropriate error messages. Custom elements (with hyphens) are allowed. *)
5
-
6
-
(** Set of all known HTML5 element names. *)
7
-
let known_elements =
8
-
let elements = [
9
-
(* Document metadata *)
10
-
"html"; "head"; "title"; "base"; "link"; "meta"; "style";
11
-
12
-
(* Sections *)
13
-
"body"; "article"; "section"; "nav"; "aside"; "h1"; "h2"; "h3"; "h4"; "h5"; "h6";
14
-
"hgroup"; "header"; "footer"; "address"; "main";
15
-
16
-
(* Grouping content *)
17
-
"p"; "hr"; "pre"; "blockquote"; "ol"; "ul"; "menu"; "li"; "dl"; "dt"; "dd";
18
-
"figure"; "figcaption"; "div";
4
+
appropriate error messages. Custom elements (with hyphens) are allowed.
19
5
20
-
(* Text-level semantics *)
21
-
"a"; "em"; "strong"; "small"; "s"; "cite"; "q"; "dfn"; "abbr"; "ruby"; "rt"; "rp";
22
-
"data"; "time"; "code"; "var"; "samp"; "kbd"; "sub"; "sup"; "i"; "b"; "u"; "mark";
23
-
"bdi"; "bdo"; "span"; "br"; "wbr"; "search";
24
-
25
-
(* Edits *)
26
-
"ins"; "del";
27
-
28
-
(* Embedded content *)
29
-
"picture"; "source"; "img"; "iframe"; "embed"; "object"; "video"; "audio";
30
-
"track"; "map"; "area"; "math"; "svg";
31
-
32
-
(* Tables *)
33
-
"table"; "caption"; "colgroup"; "col"; "tbody"; "thead"; "tfoot"; "tr"; "td"; "th";
34
-
35
-
(* Forms *)
36
-
"form"; "label"; "input"; "button"; "select"; "datalist"; "optgroup"; "option";
37
-
"textarea"; "output"; "progress"; "meter"; "fieldset"; "legend";
38
-
39
-
(* Interactive *)
40
-
"details"; "summary"; "dialog";
41
-
42
-
(* Scripting *)
43
-
"script"; "noscript"; "template"; "slot"; "canvas";
44
-
45
-
(* Deprecated but still recognized *)
46
-
"param";
47
-
] in
48
-
let tbl = Hashtbl.create (List.length elements) in
49
-
List.iter (fun el -> Hashtbl.add tbl el ()) elements;
50
-
tbl
51
-
52
-
(** Check if an element name is a custom element (contains hyphen). *)
53
-
let is_custom_element name =
54
-
String.contains name '-'
55
-
56
-
(** Check if an element name is known. *)
57
-
let is_known_element name =
58
-
let name_lower = String.lowercase_ascii name in
59
-
Hashtbl.mem known_elements name_lower || is_custom_element name_lower
6
+
Note: Unknown element detection is performed by the parser, which marks
7
+
unrecognized elements as [Tag.Unknown]. This checker produces appropriate
8
+
error messages for those elements. *)
60
9
61
10
type state = {
62
11
mutable stack : string list; (* Parent element stack *)
+40
lib/htmlrw_check/specialized/unknown_element_checker.mli
+40
lib/htmlrw_check/specialized/unknown_element_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Unknown HTML element checker.
7
+
8
+
This checker detects elements that are not in the HTML5 specification
9
+
and produces appropriate error messages. Custom elements (names
10
+
containing hyphens) are allowed per the Web Components specification.
11
+
12
+
{2 Recognized Elements}
13
+
14
+
Standard HTML5 elements are recognized by the parser (anything else is marked [Tag.Unknown] and reported by this checker), including:
15
+
- Document metadata (html, head, title, etc.)
16
+
- Sections (body, article, section, nav, etc.)
17
+
- Grouping content (p, div, ul, ol, etc.)
18
+
- Text-level semantics (a, em, strong, span, etc.)
19
+
- Embedded content (img, video, audio, iframe, etc.)
20
+
- Tabular data (table, tr, td, th, etc.)
21
+
- Forms (form, input, button, select, etc.)
22
+
- Interactive elements (details, dialog, summary)
23
+
- Scripting (script, noscript, template)
24
+
25
+
{2 Custom Elements}
26
+
27
+
Element names containing a hyphen are treated as custom elements
28
+
and are allowed without warning (e.g., [<my-component>], [<app-header>]).
29
+
30
+
{2 Error Messages}
31
+
32
+
Reports [Unknown_element] for unrecognized element names that are
33
+
not valid custom elements.
34
+
35
+
@see <https://html.spec.whatwg.org/multipage/custom-elements.html>
36
+
HTML Standard: Custom elements
37
+
*)
38
+
39
+
val checker : Checker.t
40
+
(** The unknown element checker instance. *)
+68
lib/htmlrw_check/specialized/url_checker.mli
+68
lib/htmlrw_check/specialized/url_checker.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** URL validation checker.
7
+
8
+
This checker validates URL attributes ([href], [src], [action], etc.)
9
+
on HTML elements. It checks for common URL issues and security concerns.
10
+
11
+
{2 Validated Attributes}
12
+
13
+
The checker validates URLs in these attributes:
14
+
- [href] on [<a>], [<area>], [<base>], [<link>]
15
+
- [src] on [<audio>], [<embed>], [<iframe>], [<img>], [<input>],
16
+
[<script>], [<source>], [<track>], [<video>]
17
+
- [action] on [<form>], and [formaction] on [<button>]
18
+
- [cite] on [<blockquote>], [<del>], [<ins>], [<q>]
19
+
- [data] on [<object>]
20
+
- [poster] on [<video>]
21
+
- [value] on [<input type="url">]
22
+
23
+
{2 Validation Rules}
24
+
25
+
- URLs should be well-formed (parseable)
26
+
- Relative URLs are allowed
27
+
- Fragment-only URLs ([#anchor]) are valid
28
+
- Data URLs are validated for proper structure
29
+
- javascript: URLs may trigger warnings
30
+
- Empty URLs are flagged on elements that require them
31
+
32
+
{2 Error Messages}
33
+
34
+
- [Bad_url]: Malformed URL that cannot be parsed
35
+
- [Empty_url]: Required URL attribute is empty
36
+
- Various URL-specific validation errors
37
+
38
+
@see <https://url.spec.whatwg.org/>
39
+
URL Standard
40
+
*)
41
+
42
+
(** {1 URL Parsing Utilities} *)
43
+
44
+
val extract_scheme : string -> string option
45
+
(** [extract_scheme url] extracts the scheme (protocol) from a URL.
46
+
47
+
@param url The URL to parse
48
+
@return [Some scheme] if a valid scheme is found (e.g., "http", "https"),
49
+
[None] if no scheme is present or the URL is relative *)
50
+
51
+
val validate_url : string -> string -> string -> string option
52
+
(** [validate_url url element_name attr_name] validates a URL.
53
+
54
+
Performs comprehensive validation including:
55
+
- Checking for empty URLs on elements that require them
56
+
- Validating scheme, host, port, path, query, and fragment
57
+
- Checking for illegal characters and encoding issues
58
+
- Validating special schemes (http, https, etc.)
59
+
60
+
@param url The URL to validate
61
+
@param element_name The element containing the URL attribute
62
+
@param attr_name The attribute name
63
+
@return [Some error_message] if the URL is invalid, [None] if valid *)
64
+
65
+
(** {1 Checker} *)
66
+
67
+
val checker : Checker.t
68
+
(** The URL validation checker instance. *)
+56
lib/htmlrw_check/xhtml_parser.mli
+56
lib/htmlrw_check/xhtml_parser.mli
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** XHTML parser using xmlm for proper XML parsing.
7
+
8
+
This module provides XML parsing for XHTML files. While the HTML5 parser
9
+
handles most content, XHTML requires proper XML parsing to correctly handle:
10
+
11
+
- Self-closing tags on non-void elements (e.g., [<div/>])
12
+
- XML namespaces for SVG and MathML
13
+
- Strict XML well-formedness requirements
14
+
15
+
{2 Usage}
16
+
17
+
{[
18
+
if Xhtml_parser.is_xhtml_file (Some "page.xhtml") then
19
+
match Xhtml_parser.parse_xhtml content with
20
+
| Ok root -> (* Process the XHTML document root node *)
21
+
| Error msg -> (* Handle parse error *)
22
+
]}
23
+
*)
24
+
25
+
(** {1 Types} *)
26
+
27
+
type xhtml_doc = {
28
+
root : Html5rw.Dom.node;
29
+
(** The document root node. *)
30
+
errors : Html5rw.Error.t list;
31
+
(** Parse errors (empty for valid XML). *)
32
+
}
33
+
(** An XHTML document representation. *)
34
+
35
+
(** {1 Parsing} *)
36
+
37
+
val parse_xhtml : string -> (Html5rw.Dom.node, string) result
38
+
(** [parse_xhtml content] parses XHTML content using xmlm.
39
+
40
+
@param content The XHTML content as a string
41
+
@return [Ok root] with the document root on success,
42
+
[Error message] with parse error details on failure *)
43
+
44
+
val is_xhtml_file : string option -> bool
45
+
(** [is_xhtml_file system_id] checks if a system_id indicates an XHTML file.
46
+
47
+
@param system_id The optional file path or identifier
48
+
@return [true] if the path ends with ".xhtml" *)
49
+
50
+
(** {1 Document Access} *)
51
+
52
+
val xhtml_root : xhtml_doc -> Html5rw.Dom.node
53
+
(** [xhtml_root doc] returns the document root node. *)
54
+
55
+
val xhtml_errors : xhtml_doc -> Html5rw.Error.t list
56
+
(** [xhtml_errors doc] returns the parse errors (always empty for XHTML). *)