+7
-6
lib/htmlrw_check/checker.ml
+7
-6
lib/htmlrw_check/checker.ml
···
8
8
9
9
val start_element :
10
10
state ->
11
-
name:string ->
12
-
namespace:string option ->
13
-
attrs:(string * string) list ->
11
+
element:Element.t ->
14
12
Message_collector.t ->
15
13
unit
16
14
17
15
val end_element :
18
-
state -> name:string -> namespace:string option -> Message_collector.t -> unit
16
+
state ->
17
+
tag:Tag.element_tag ->
18
+
Message_collector.t ->
19
+
unit
19
20
20
21
val characters : state -> string -> Message_collector.t -> unit
21
22
val end_document : state -> Message_collector.t -> unit
···
30
31
let create () = ()
31
32
let reset () = ()
32
33
33
-
let start_element () ~name:_ ~namespace:_ ~attrs:_ _ = ()
34
-
let end_element () ~name:_ ~namespace:_ _ = ()
34
+
let start_element () ~element:_ _ = ()
35
+
let end_element () ~tag:_ _ = ()
35
36
let characters () _ _ = ()
36
37
let end_document () _ = ()
37
38
end
+9
-12
lib/htmlrw_check/checker.mli
+9
-12
lib/htmlrw_check/checker.mli
···
87
87
88
88
val start_element :
89
89
state ->
90
-
name:string ->
91
-
namespace:string option ->
92
-
attrs:(string * string) list ->
90
+
element:Element.t ->
93
91
Message_collector.t ->
94
92
unit
95
-
(** [start_element state ~name ~namespace ~attrs collector] is called when
93
+
(** [start_element state ~element collector] is called when
96
94
entering an element during DOM traversal.
97
95
98
96
@param state The checker state
99
-
@param name The element tag name (e.g., "div", "p", "span")
100
-
@param namespace The element namespace ([None] for HTML, [Some "svg"]
101
-
for SVG, [Some "mathml"] for MathML)
102
-
@param attrs The element's attributes as [(name, value)] pairs
97
+
@param element The typed element (includes tag, typed attrs, and raw attrs)
103
98
@param collector The message collector for emitting validation messages
104
99
105
100
This is where checkers can validate:
···
109
104
- Whether the element opens a new validation context *)
110
105
111
106
val end_element :
112
-
state -> name:string -> namespace:string option -> Message_collector.t -> unit
113
-
(** [end_element state ~name ~namespace collector] is called when exiting
107
+
state ->
108
+
tag:Tag.element_tag ->
109
+
Message_collector.t ->
110
+
unit
111
+
(** [end_element state ~tag collector] is called when exiting
114
112
an element during DOM traversal.
115
113
116
114
@param state The checker state
117
-
@param name The element tag name
118
-
@param namespace The element namespace
115
+
@param tag The element tag
119
116
@param collector The message collector for emitting validation messages
120
117
121
118
This is where checkers can:
+46
-16
lib/htmlrw_check/content_model/content_checker.ml
+46
-16
lib/htmlrw_check/content_model/content_checker.ml
···
2
2
name : string;
3
3
spec : Element_spec.t;
4
4
children_count : int;
5
+
is_foreign : bool; (* SVG or MathML element *)
5
6
}
6
7
7
8
type state = {
···
92
93
Message_collector.add_typed collector
93
94
(`Element (`Not_allowed_as_child (`Child child_name, `Parent parent.name)))
94
95
95
-
let start_element state ~name ~namespace:_ ~attrs:_ collector =
96
-
(* Look up element specification *)
97
-
let spec_opt = Element_registry.get state.registry name in
96
+
let start_element state ~element collector =
97
+
let name = Tag.tag_to_string element.Element.tag in
98
+
99
+
(* Check if we're inside a foreign (SVG/MathML) context *)
100
+
let in_foreign_context = match state.ancestor_stack with
101
+
| ctx :: _ -> ctx.is_foreign
102
+
| [] -> false
103
+
in
104
+
105
+
(* Determine if this element is foreign content *)
106
+
let is_foreign = match element.Element.tag with
107
+
| Tag.Svg _ | Tag.MathML _ -> true
108
+
| _ -> in_foreign_context (* Inherit from parent if inside foreign content *)
109
+
in
110
+
111
+
(* If entering foreign content from HTML, SVG/MathML are valid embedded content *)
112
+
(* If already in foreign content, skip HTML content model checks *)
113
+
if is_foreign && not in_foreign_context then begin
114
+
(* Entering SVG/MathML from HTML - just track it, it's valid embedded content *)
115
+
let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in
116
+
let context = { name; spec; children_count = 0; is_foreign = true } in
117
+
state.ancestor_stack <- context :: state.ancestor_stack
118
+
end else if is_foreign then begin
119
+
(* Inside SVG/MathML - just track nesting, don't validate against HTML *)
120
+
let spec = Element_spec.make ~name ~content_model:(Content_model.Categories [Content_category.Flow]) () in
121
+
let context = { name; spec; children_count = 0; is_foreign = true } in
122
+
state.ancestor_stack <- context :: state.ancestor_stack
123
+
end else begin
124
+
(* HTML element - do normal validation *)
125
+
let spec_opt = Element_registry.get state.registry name in
98
126
99
-
match spec_opt with
100
-
| None ->
101
-
(* Unknown element - first check if it's allowed in current context *)
102
-
validate_child_element state name collector
103
-
| Some spec ->
104
-
(* Check prohibited ancestors *)
105
-
check_prohibited_ancestors state name spec collector;
127
+
match spec_opt with
128
+
| None ->
129
+
(* Unknown element - first check if it's allowed in current context *)
130
+
validate_child_element state name collector
131
+
| Some spec ->
132
+
(* Check prohibited ancestors *)
133
+
check_prohibited_ancestors state name spec collector;
106
134
107
-
(* Validate this element is allowed as child of parent *)
108
-
validate_child_element state name collector;
135
+
(* Validate this element is allowed as child of parent *)
136
+
validate_child_element state name collector;
109
137
110
-
(* Push element context onto stack *)
111
-
let context = { name; spec; children_count = 0 } in
112
-
state.ancestor_stack <- context :: state.ancestor_stack
138
+
(* Push element context onto stack *)
139
+
let context = { name; spec; children_count = 0; is_foreign = false } in
140
+
state.ancestor_stack <- context :: state.ancestor_stack
141
+
end
113
142
114
-
let end_element state ~name ~namespace:_ collector =
143
+
let end_element state ~tag collector =
144
+
let name = Tag.tag_to_string tag in
115
145
match state.ancestor_stack with
116
146
| [] ->
117
147
(* Unmatched closing tag *)
+14
-22
lib/htmlrw_check/dom_walker.ml
+14
-22
lib/htmlrw_check/dom_walker.ml
···
15
15
16
16
(** Package a checker with its state for traversal. *)
17
17
type checker_state = {
18
-
start_element :
19
-
name:string ->
20
-
namespace:string option ->
21
-
attrs:(string * string) list ->
22
-
Message_collector.t ->
23
-
unit;
24
-
end_element :
25
-
name:string -> namespace:string option -> Message_collector.t -> unit;
18
+
start_element : element:Element.t -> Message_collector.t -> unit;
19
+
end_element : tag:Tag.element_tag -> Message_collector.t -> unit;
26
20
characters : string -> Message_collector.t -> unit;
27
21
end_document : Message_collector.t -> unit;
28
22
}
···
31
25
let make_checker_state (module C : Checker.S) =
32
26
let state = C.create () in
33
27
{
34
-
start_element = (fun ~name ~namespace ~attrs collector ->
35
-
C.start_element state ~name ~namespace ~attrs collector);
36
-
end_element = (fun ~name ~namespace collector ->
37
-
C.end_element state ~name ~namespace collector);
28
+
start_element = (fun ~element collector ->
29
+
C.start_element state ~element collector);
30
+
end_element = (fun ~tag collector ->
31
+
C.end_element state ~tag collector);
38
32
characters = (fun text collector ->
39
33
C.characters state text collector);
40
34
end_document = (fun collector ->
···
60
54
(* Doctype node: skip (no validation events for doctype) *)
61
55
()
62
56
| _ ->
63
-
(* Element node: emit start, traverse children, emit end *)
64
-
cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector;
57
+
(* Element node: create typed element, emit start, traverse children, emit end *)
58
+
let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in
59
+
cs.start_element ~element collector;
65
60
List.iter (walk_node_single cs collector) node.children;
66
-
cs.end_element ~name:node.name ~namespace:node.namespace collector
61
+
cs.end_element ~tag:element.tag collector
67
62
68
63
let walk checker collector node =
69
64
let cs = make_checker_state checker in
···
89
84
(* Doctype node: skip *)
90
85
()
91
86
| _ ->
92
-
(* Element node: emit start to all checkers, traverse children, emit end to all *)
93
-
List.iter (fun cs ->
94
-
cs.start_element ~name:node.name ~namespace:node.namespace ~attrs:node.attrs collector
95
-
) css;
87
+
(* Element node: create typed element, emit start to all checkers, traverse children, emit end to all *)
88
+
let element = Element.create ~name:node.name ~namespace:node.namespace ~attrs:node.attrs in
89
+
List.iter (fun cs -> cs.start_element ~element collector) css;
96
90
List.iter (walk_node_all css collector) node.children;
97
-
List.iter (fun cs ->
98
-
cs.end_element ~name:node.name ~namespace:node.namespace collector
99
-
) css
91
+
List.iter (fun cs -> cs.end_element ~tag:element.tag collector) css
100
92
101
93
let walk_all checkers collector node =
102
94
(* Create checker state packages *)
+873
lib/htmlrw_check/element/attr.ml
+873
lib/htmlrw_check/element/attr.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Typed HTML5 attribute representations using polymorphic variants.
7
+
8
+
This module provides typed representations for HTML attributes with
9
+
proper value types for enumerated attributes. *)
10
+
11
+
(** {1 Attribute Value Types} *)
12
+
13
+
(** Direction attribute values *)
14
+
type dir_value = [ `Ltr | `Rtl | `Auto ]
15
+
16
+
(** Hidden attribute values *)
17
+
type hidden_value = [ `Hidden | `Until_found ]
18
+
19
+
(** Popover attribute values *)
20
+
type popover_value = [ `Auto | `Manual | `Hint ]
21
+
22
+
(** Link target values *)
23
+
type target_value = [ `Self | `Blank | `Parent | `Top | `Named of string ]
24
+
25
+
(** Loading behavior values *)
26
+
type loading_value = [ `Eager | `Lazy ]
27
+
28
+
(** Decoding hint values *)
29
+
type decoding_value = [ `Sync | `Async | `Auto ]
30
+
31
+
(** Fetch priority values *)
32
+
type fetchpriority_value = [ `High | `Low | `Auto ]
33
+
34
+
(** CORS settings values *)
35
+
type crossorigin_value = [ `Anonymous | `Use_credentials ]
36
+
37
+
(** Preload hint values *)
38
+
type preload_value = [ `None | `Metadata | `Auto ]
39
+
40
+
(** Form method values *)
41
+
type method_value = [ `Get | `Post | `Dialog ]
42
+
43
+
(** Form enctype values *)
44
+
type enctype_value = [ `Urlencoded | `Multipart | `Plain ]
45
+
46
+
(** Textarea wrap values *)
47
+
type wrap_value = [ `Soft | `Hard ]
48
+
49
+
(** Table scope values *)
50
+
type scope_value = [ `Row | `Col | `Rowgroup | `Colgroup ]
51
+
52
+
(** Input type values *)
53
+
type input_type = [
54
+
| `Hidden | `Text | `Search | `Tel | `Url | `Email | `Password
55
+
| `Date | `Month | `Week | `Time | `Datetime_local | `Number
56
+
| `Range | `Color | `Checkbox | `Radio | `File | `Submit
57
+
| `Image | `Reset | `Button
58
+
]
59
+
60
+
(** Button type values *)
61
+
type button_type = [ `Submit | `Reset | `Button ]
62
+
63
+
(** Referrer policy values *)
64
+
type referrerpolicy_value = [
65
+
| `No_referrer | `No_referrer_when_downgrade | `Origin
66
+
| `Origin_when_cross_origin | `Same_origin | `Strict_origin
67
+
| `Strict_origin_when_cross_origin | `Unsafe_url
68
+
]
69
+
70
+
(** Sandbox flag values *)
71
+
type sandbox_flag = [
72
+
| `Allow_downloads | `Allow_forms | `Allow_modals | `Allow_orientation_lock
73
+
| `Allow_pointer_lock | `Allow_popups | `Allow_popups_to_escape_sandbox
74
+
| `Allow_presentation | `Allow_same_origin | `Allow_scripts
75
+
| `Allow_top_navigation | `Allow_top_navigation_by_user_activation
76
+
| `Allow_top_navigation_to_custom_protocols
77
+
]
78
+
79
+
(** Enter key hint values *)
80
+
type enterkeyhint_value = [
81
+
| `Enter | `Done | `Go | `Next | `Previous | `Search | `Send
82
+
]
83
+
84
+
(** Input mode values *)
85
+
type inputmode_value = [
86
+
| `None | `Text | `Decimal | `Numeric | `Tel | `Search | `Email | `Url
87
+
]
88
+
89
+
(** Content editable values *)
90
+
type contenteditable_value = [ `True | `False | `Plaintext_only ]
91
+
92
+
(** Autocapitalize values *)
93
+
type autocapitalize_value = [
94
+
| `Off | `None | `On | `Sentences | `Words | `Characters
95
+
]
96
+
97
+
(** Image shape values *)
98
+
type shape_value = [ `Rect | `Circle | `Poly | `Default ]
99
+
100
+
(** Capture values *)
101
+
type capture_value = [ `User | `Environment ]
102
+
103
+
(** List type values *)
104
+
type list_type_value = [
105
+
| `Decimal | `Lower_alpha | `Upper_alpha | `Lower_roman | `Upper_roman
106
+
]
107
+
108
+
(** Track kind values *)
109
+
type kind_value = [
110
+
| `Subtitles | `Captions | `Descriptions | `Chapters | `Metadata
111
+
]
112
+
113
+
(** {1 Typed Attribute Variant} *)
114
+
115
+
(** Typed attribute representation *)
116
+
type t = [
117
+
(* Global attributes *)
118
+
| `Id of string
119
+
| `Class of string
120
+
| `Style of string
121
+
| `Title of string
122
+
| `Lang of string
123
+
| `Dir of dir_value
124
+
| `Hidden of hidden_value option (* None = just "hidden" *)
125
+
| `Tabindex of int
126
+
| `Accesskey of string
127
+
| `Autocapitalize of autocapitalize_value
128
+
| `Autofocus
129
+
| `Contenteditable of contenteditable_value option
130
+
| `Draggable of bool
131
+
| `Enterkeyhint of enterkeyhint_value
132
+
| `Inert
133
+
| `Inputmode of inputmode_value
134
+
| `Is of string
135
+
| `Nonce of string
136
+
| `Popover of popover_value option
137
+
| `Slot of string
138
+
| `Spellcheck of bool option
139
+
| `Translate of bool
140
+
| `Exportparts of string
141
+
| `Part of string
142
+
143
+
(* Microdata *)
144
+
| `Itemscope
145
+
| `Itemtype of string
146
+
| `Itemprop of string
147
+
| `Itemid of string
148
+
| `Itemref of string
149
+
150
+
(* ARIA *)
151
+
| `Role of string
152
+
| `Aria of string * string (* aria-* -> (name, value) *)
153
+
154
+
(* Event handlers *)
155
+
| `Event of string * string (* onclick -> ("click", handler) *)
156
+
157
+
(* Link/navigation attributes *)
158
+
| `Href of string
159
+
| `Target of target_value
160
+
| `Rel of string
161
+
| `Download of string option
162
+
| `Hreflang of string
163
+
| `Ping of string
164
+
| `Referrerpolicy of referrerpolicy_value
165
+
| `Type_link of string
166
+
167
+
(* Media/resource attributes *)
168
+
| `Src of string
169
+
| `Srcset of string
170
+
| `Sizes of string
171
+
| `Alt of string
172
+
| `Width of string
173
+
| `Height of string
174
+
| `Loading of loading_value
175
+
| `Decoding of decoding_value
176
+
| `Fetchpriority of fetchpriority_value
177
+
| `Crossorigin of crossorigin_value option
178
+
| `Ismap
179
+
| `Usemap of string
180
+
| `Media of string
181
+
182
+
(* Audio/Video specific *)
183
+
| `Controls
184
+
| `Autoplay
185
+
| `Loop
186
+
| `Muted
187
+
| `Preload of preload_value
188
+
| `Poster of string
189
+
| `Playsinline
190
+
191
+
(* Image map *)
192
+
| `Coords of string
193
+
| `Shape of shape_value
194
+
195
+
(* iframe *)
196
+
| `Sandbox of sandbox_flag list option
197
+
| `Allow of string
198
+
| `Allowfullscreen
199
+
| `Srcdoc of string
200
+
| `Csp of string
201
+
202
+
(* Form attributes *)
203
+
| `Action of string
204
+
| `Method of method_value
205
+
| `Enctype of enctype_value
206
+
| `Novalidate
207
+
| `Accept_charset of string
208
+
| `Autocomplete of string
209
+
| `Name of string
210
+
| `Form of string
211
+
212
+
(* Form control attributes *)
213
+
| `Value of string
214
+
| `Type_input of input_type
215
+
| `Type_button of button_type
216
+
| `Disabled
217
+
| `Readonly
218
+
| `Required
219
+
| `Checked
220
+
| `Selected
221
+
| `Multiple
222
+
| `Placeholder of string
223
+
| `Min of string
224
+
| `Max of string
225
+
| `Step of string
226
+
| `Minlength of int
227
+
| `Maxlength of int
228
+
| `Pattern of string
229
+
| `Size of int
230
+
| `Cols of int
231
+
| `Rows of int
232
+
| `Wrap of wrap_value
233
+
| `Accept of string
234
+
| `Capture of capture_value
235
+
| `Dirname of string
236
+
| `For of string
237
+
| `List of string
238
+
239
+
(* Form submission attributes *)
240
+
| `Formaction of string
241
+
| `Formmethod of method_value
242
+
| `Formenctype of enctype_value
243
+
| `Formnovalidate
244
+
| `Formtarget of target_value
245
+
246
+
(* Table attributes *)
247
+
| `Colspan of int
248
+
| `Rowspan of int
249
+
| `Headers of string
250
+
| `Scope of scope_value
251
+
| `Span of int
252
+
253
+
(* Details/Dialog *)
254
+
| `Open
255
+
256
+
(* Script *)
257
+
| `Async
258
+
| `Defer
259
+
| `Integrity of string
260
+
| `Nomodule
261
+
| `Blocking of string
262
+
| `Type_script of string
263
+
264
+
(* Meta *)
265
+
| `Charset of string
266
+
| `Content of string
267
+
| `Http_equiv of string
268
+
269
+
(* Link element *)
270
+
| `As of string
271
+
| `Imagesizes of string
272
+
| `Imagesrcset of string
273
+
274
+
(* Object/Embed *)
275
+
| `Data_object of string
276
+
277
+
(* Output *)
278
+
| `For_output of string
279
+
280
+
(* Meter/Progress *)
281
+
| `Low of float
282
+
| `High of float
283
+
| `Optimum of float
284
+
285
+
(* Time *)
286
+
| `Datetime of string
287
+
288
+
(* Ol *)
289
+
| `Start of int
290
+
| `Reversed
291
+
| `Type_list of list_type_value
292
+
293
+
(* Track *)
294
+
| `Kind of kind_value
295
+
| `Srclang of string
296
+
| `Default
297
+
298
+
(* Td/Th *)
299
+
| `Abbr of string
300
+
301
+
(* Data attributes *)
302
+
| `Data_attr of string * string
303
+
304
+
(* RDFa *)
305
+
| `Property of string
306
+
| `Typeof of string
307
+
| `Resource of string
308
+
| `Prefix of string
309
+
| `Vocab of string
310
+
| `About of string
311
+
| `Datatype of string
312
+
| `Inlist
313
+
| `Rev of string
314
+
315
+
(* Escape hatch *)
316
+
| `Unknown_attr of string * string
317
+
]
318
+
319
+
(** {1 Parsing Functions} *)
320
+
321
+
(** Parse dir value *)
322
+
let parse_dir = function
323
+
| "ltr" -> Some `Ltr
324
+
| "rtl" -> Some `Rtl
325
+
| "auto" -> Some `Auto
326
+
| _ -> None
327
+
328
+
(** Parse target value *)
329
+
let parse_target = function
330
+
| "_self" -> `Self
331
+
| "_blank" -> `Blank
332
+
| "_parent" -> `Parent
333
+
| "_top" -> `Top
334
+
| s -> `Named s
335
+
336
+
(** Parse loading value *)
337
+
let parse_loading = function
338
+
| "eager" -> Some `Eager
339
+
| "lazy" -> Some `Lazy
340
+
| _ -> None
341
+
342
+
(** Parse decoding value *)
343
+
let parse_decoding = function
344
+
| "sync" -> Some `Sync
345
+
| "async" -> Some `Async
346
+
| "auto" -> Some `Auto
347
+
| _ -> None
348
+
349
+
(** Parse fetchpriority value *)
350
+
let parse_fetchpriority = function
351
+
| "high" -> Some `High
352
+
| "low" -> Some `Low
353
+
| "auto" -> Some `Auto
354
+
| _ -> None
355
+
356
+
(** Parse crossorigin value *)
357
+
let parse_crossorigin = function
358
+
| "anonymous" | "" -> Some `Anonymous
359
+
| "use-credentials" -> Some `Use_credentials
360
+
| _ -> None
361
+
362
+
(** Parse preload value *)
363
+
let parse_preload = function
364
+
| "none" -> Some `None
365
+
| "metadata" -> Some `Metadata
366
+
| "auto" | "" -> Some `Auto
367
+
| _ -> None
368
+
369
+
(** Parse method value *)
370
+
let parse_method = function
371
+
| "get" -> Some `Get
372
+
| "post" -> Some `Post
373
+
| "dialog" -> Some `Dialog
374
+
| _ -> None
375
+
376
+
(** Parse enctype value *)
377
+
let parse_enctype = function
378
+
| "application/x-www-form-urlencoded" -> Some `Urlencoded
379
+
| "multipart/form-data" -> Some `Multipart
380
+
| "text/plain" -> Some `Plain
381
+
| _ -> None
382
+
383
+
(** Parse wrap value *)
384
+
let parse_wrap = function
385
+
| "soft" -> Some `Soft
386
+
| "hard" -> Some `Hard
387
+
| _ -> None
388
+
389
+
(** Parse scope value *)
390
+
let parse_scope = function
391
+
| "row" -> Some `Row
392
+
| "col" -> Some `Col
393
+
| "rowgroup" -> Some `Rowgroup
394
+
| "colgroup" -> Some `Colgroup
395
+
| _ -> None
396
+
397
+
(** Parse input type value *)
398
+
let parse_input_type = function
399
+
| "hidden" -> Some `Hidden
400
+
| "text" -> Some `Text
401
+
| "search" -> Some `Search
402
+
| "tel" -> Some `Tel
403
+
| "url" -> Some `Url
404
+
| "email" -> Some `Email
405
+
| "password" -> Some `Password
406
+
| "date" -> Some `Date
407
+
| "month" -> Some `Month
408
+
| "week" -> Some `Week
409
+
| "time" -> Some `Time
410
+
| "datetime-local" -> Some `Datetime_local
411
+
| "number" -> Some `Number
412
+
| "range" -> Some `Range
413
+
| "color" -> Some `Color
414
+
| "checkbox" -> Some `Checkbox
415
+
| "radio" -> Some `Radio
416
+
| "file" -> Some `File
417
+
| "submit" -> Some `Submit
418
+
| "image" -> Some `Image
419
+
| "reset" -> Some `Reset
420
+
| "button" -> Some `Button
421
+
| _ -> None
422
+
423
+
(** Parse button type value *)
424
+
let parse_button_type = function
425
+
| "submit" -> Some `Submit
426
+
| "reset" -> Some `Reset
427
+
| "button" -> Some `Button
428
+
| _ -> None
429
+
430
+
(** Parse shape value *)
431
+
let parse_shape = function
432
+
| "rect" -> Some `Rect
433
+
| "circle" -> Some `Circle
434
+
| "poly" -> Some `Poly
435
+
| "default" -> Some `Default
436
+
| _ -> None
437
+
438
+
(** Parse capture value *)
439
+
let parse_capture = function
440
+
| "user" -> Some `User
441
+
| "environment" -> Some `Environment
442
+
| _ -> None
443
+
444
+
(** Parse list type value *)
445
+
let parse_list_type = function
446
+
| "1" -> Some `Decimal
447
+
| "a" -> Some `Lower_alpha
448
+
| "A" -> Some `Upper_alpha
449
+
| "i" -> Some `Lower_roman
450
+
| "I" -> Some `Upper_roman
451
+
| _ -> None
452
+
453
+
(** Parse kind value *)
454
+
let parse_kind = function
455
+
| "subtitles" -> Some `Subtitles
456
+
| "captions" -> Some `Captions
457
+
| "descriptions" -> Some `Descriptions
458
+
| "chapters" -> Some `Chapters
459
+
| "metadata" -> Some `Metadata
460
+
| _ -> None
461
+
462
+
(** Parse referrerpolicy value *)
463
+
let parse_referrerpolicy = function
464
+
| "no-referrer" -> Some `No_referrer
465
+
| "no-referrer-when-downgrade" -> Some `No_referrer_when_downgrade
466
+
| "origin" -> Some `Origin
467
+
| "origin-when-cross-origin" -> Some `Origin_when_cross_origin
468
+
| "same-origin" -> Some `Same_origin
469
+
| "strict-origin" -> Some `Strict_origin
470
+
| "strict-origin-when-cross-origin" -> Some `Strict_origin_when_cross_origin
471
+
| "unsafe-url" -> Some `Unsafe_url
472
+
| _ -> None
473
+
474
+
(** Parse sandbox flag *)
475
+
let parse_sandbox_flag = function
476
+
| "allow-downloads" -> Some `Allow_downloads
477
+
| "allow-forms" -> Some `Allow_forms
478
+
| "allow-modals" -> Some `Allow_modals
479
+
| "allow-orientation-lock" -> Some `Allow_orientation_lock
480
+
| "allow-pointer-lock" -> Some `Allow_pointer_lock
481
+
| "allow-popups" -> Some `Allow_popups
482
+
| "allow-popups-to-escape-sandbox" -> Some `Allow_popups_to_escape_sandbox
483
+
| "allow-presentation" -> Some `Allow_presentation
484
+
| "allow-same-origin" -> Some `Allow_same_origin
485
+
| "allow-scripts" -> Some `Allow_scripts
486
+
| "allow-top-navigation" -> Some `Allow_top_navigation
487
+
| "allow-top-navigation-by-user-activation" -> Some `Allow_top_navigation_by_user_activation
488
+
| "allow-top-navigation-to-custom-protocols" -> Some `Allow_top_navigation_to_custom_protocols
489
+
| _ -> None
490
+
491
+
(** Parse sandbox value (space-separated flags) *)
492
+
let parse_sandbox value =
493
+
if String.trim value = "" then
494
+
Some []
495
+
else
496
+
let flags = String.split_on_char ' ' value |> List.filter (fun s -> s <> "") in
497
+
let parsed = List.filter_map parse_sandbox_flag flags in
498
+
if List.length parsed = List.length flags then
499
+
Some parsed
500
+
else
501
+
None
502
+
503
+
(** Parse enterkeyhint value *)
504
+
let parse_enterkeyhint = function
505
+
| "enter" -> Some `Enter
506
+
| "done" -> Some `Done
507
+
| "go" -> Some `Go
508
+
| "next" -> Some `Next
509
+
| "previous" -> Some `Previous
510
+
| "search" -> Some `Search
511
+
| "send" -> Some `Send
512
+
| _ -> None
513
+
514
+
(** Parse inputmode value *)
515
+
let parse_inputmode = function
516
+
| "none" -> Some `None
517
+
| "text" -> Some `Text
518
+
| "decimal" -> Some `Decimal
519
+
| "numeric" -> Some `Numeric
520
+
| "tel" -> Some `Tel
521
+
| "search" -> Some `Search
522
+
| "email" -> Some `Email
523
+
| "url" -> Some `Url
524
+
| _ -> None
525
+
526
+
(** Parse contenteditable value *)
527
+
let parse_contenteditable = function
528
+
| "true" | "" -> Some `True
529
+
| "false" -> Some `False
530
+
| "plaintext-only" -> Some `Plaintext_only
531
+
| _ -> None
532
+
533
+
(** Parse autocapitalize value *)
534
+
let parse_autocapitalize = function
535
+
| "off" -> Some `Off
536
+
| "none" -> Some `None
537
+
| "on" -> Some `On
538
+
| "sentences" -> Some `Sentences
539
+
| "words" -> Some `Words
540
+
| "characters" -> Some `Characters
541
+
| _ -> None
542
+
543
+
(** Parse hidden value *)
544
+
let parse_hidden = function
545
+
| "" | "hidden" -> Some `Hidden
546
+
| "until-found" -> Some `Until_found
547
+
| _ -> None
548
+
549
+
(** Parse popover value *)
550
+
let parse_popover = function
551
+
| "" | "auto" -> Some `Auto
552
+
| "manual" -> Some `Manual
553
+
| "hint" -> Some `Hint
554
+
| _ -> None
555
+
556
+
(** Try to parse an integer *)
557
+
let parse_int s =
558
+
try Some (int_of_string (String.trim s))
559
+
with Failure _ -> None
560
+
561
+
(** Try to parse a float *)
562
+
let parse_float s =
563
+
try Some (float_of_string (String.trim s))
564
+
with Failure _ -> None
565
+
566
+
(** Parse a boolean string *)
567
+
let parse_bool = function
568
+
| "true" | "" -> Some true
569
+
| "false" -> Some false
570
+
| _ -> None
571
+
572
+
(** Parse a single attribute name-value pair to typed attribute *)
573
+
let parse_attr name value : t =
574
+
let name_lower = String.lowercase_ascii name in
575
+
let value_lower = String.lowercase_ascii value in
576
+
match name_lower with
577
+
(* Global attributes *)
578
+
| "id" -> `Id value
579
+
| "class" -> `Class value
580
+
| "style" -> `Style value
581
+
| "title" -> `Title value
582
+
| "lang" -> `Lang value
583
+
| "dir" -> (match parse_dir value_lower with Some d -> `Dir d | None -> `Unknown_attr (name, value))
584
+
| "hidden" -> `Hidden (parse_hidden value_lower)
585
+
| "tabindex" -> (match parse_int value with Some i -> `Tabindex i | None -> `Unknown_attr (name, value))
586
+
| "accesskey" -> `Accesskey value
587
+
| "autocapitalize" -> (match parse_autocapitalize value_lower with Some a -> `Autocapitalize a | None -> `Unknown_attr (name, value))
588
+
| "autofocus" -> `Autofocus
589
+
| "contenteditable" -> `Contenteditable (parse_contenteditable value_lower)
590
+
| "draggable" -> (match parse_bool value_lower with Some b -> `Draggable b | None -> `Unknown_attr (name, value))
591
+
| "enterkeyhint" -> (match parse_enterkeyhint value_lower with Some e -> `Enterkeyhint e | None -> `Unknown_attr (name, value))
592
+
| "inert" -> `Inert
593
+
| "inputmode" -> (match parse_inputmode value_lower with Some i -> `Inputmode i | None -> `Unknown_attr (name, value))
594
+
| "is" -> `Is value
595
+
| "nonce" -> `Nonce value
596
+
| "popover" -> `Popover (parse_popover value_lower)
597
+
| "slot" -> `Slot value
598
+
| "spellcheck" -> `Spellcheck (parse_bool value_lower)
599
+
| "translate" -> (match value_lower with "yes" | "" -> `Translate true | "no" -> `Translate false | _ -> `Unknown_attr (name, value))
600
+
| "exportparts" -> `Exportparts value
601
+
| "part" -> `Part value
602
+
603
+
(* Microdata *)
604
+
| "itemscope" -> `Itemscope
605
+
| "itemtype" -> `Itemtype value
606
+
| "itemprop" -> `Itemprop value
607
+
| "itemid" -> `Itemid value
608
+
| "itemref" -> `Itemref value
609
+
610
+
(* ARIA - role and aria-* *)
611
+
| "role" -> `Role value
612
+
| _ when String.starts_with ~prefix:"aria-" name_lower ->
613
+
let aria_name = String.sub name_lower 5 (String.length name_lower - 5) in
614
+
`Aria (aria_name, value)
615
+
616
+
(* Event handlers - on* *)
617
+
| _ when String.starts_with ~prefix:"on" name_lower && String.length name_lower > 2 ->
618
+
let event_name = String.sub name_lower 2 (String.length name_lower - 2) in
619
+
`Event (event_name, value)
620
+
621
+
(* Link/navigation attributes *)
622
+
| "href" -> `Href value
623
+
| "target" -> `Target (parse_target value)
624
+
| "rel" -> `Rel value
625
+
| "download" -> `Download (if value = "" then None else Some value)
626
+
| "hreflang" -> `Hreflang value
627
+
| "ping" -> `Ping value
628
+
| "referrerpolicy" -> (match parse_referrerpolicy value_lower with Some r -> `Referrerpolicy r | None -> `Unknown_attr (name, value))
629
+
630
+
(* Media/resource attributes *)
631
+
| "src" -> `Src value
632
+
| "srcset" -> `Srcset value
633
+
| "sizes" -> `Sizes value
634
+
| "alt" -> `Alt value
635
+
| "width" -> `Width value
636
+
| "height" -> `Height value
637
+
| "loading" -> (match parse_loading value_lower with Some l -> `Loading l | None -> `Unknown_attr (name, value))
638
+
| "decoding" -> (match parse_decoding value_lower with Some d -> `Decoding d | None -> `Unknown_attr (name, value))
639
+
| "fetchpriority" -> (match parse_fetchpriority value_lower with Some f -> `Fetchpriority f | None -> `Unknown_attr (name, value))
640
+
| "crossorigin" -> `Crossorigin (parse_crossorigin value_lower)
641
+
| "ismap" -> `Ismap
642
+
| "usemap" -> `Usemap value
643
+
| "media" -> `Media value
644
+
645
+
(* Audio/Video specific *)
646
+
| "controls" -> `Controls
647
+
| "autoplay" -> `Autoplay
648
+
| "loop" -> `Loop
649
+
| "muted" -> `Muted
650
+
| "preload" -> (match parse_preload value_lower with Some p -> `Preload p | None -> `Unknown_attr (name, value))
651
+
| "poster" -> `Poster value
652
+
| "playsinline" -> `Playsinline
653
+
654
+
(* Image map *)
655
+
| "coords" -> `Coords value
656
+
| "shape" -> (match parse_shape value_lower with Some s -> `Shape s | None -> `Unknown_attr (name, value))
657
+
658
+
(* iframe *)
659
+
| "sandbox" -> `Sandbox (parse_sandbox value_lower)
660
+
| "allow" -> `Allow value
661
+
| "allowfullscreen" -> `Allowfullscreen
662
+
| "srcdoc" -> `Srcdoc value
663
+
| "csp" -> `Csp value
664
+
665
+
(* Form attributes *)
666
+
| "action" -> `Action value
667
+
| "method" -> (match parse_method value_lower with Some m -> `Method m | None -> `Unknown_attr (name, value))
668
+
| "enctype" -> (match parse_enctype value_lower with Some e -> `Enctype e | None -> `Unknown_attr (name, value))
669
+
| "novalidate" -> `Novalidate
670
+
| "accept-charset" -> `Accept_charset value
671
+
| "autocomplete" -> `Autocomplete value
672
+
| "name" -> `Name value
673
+
| "form" -> `Form value
674
+
675
+
(* Form control attributes *)
676
+
| "value" -> `Value value
677
+
| "type" -> `Unknown_attr (name, value) (* type is context-dependent, handle in element parsing *)
678
+
| "disabled" -> `Disabled
679
+
| "readonly" -> `Readonly
680
+
| "required" -> `Required
681
+
| "checked" -> `Checked
682
+
| "selected" -> `Selected
683
+
| "multiple" -> `Multiple
684
+
| "placeholder" -> `Placeholder value
685
+
| "min" -> `Min value
686
+
| "max" -> `Max value
687
+
| "step" -> `Step value
688
+
| "minlength" -> (match parse_int value with Some i -> `Minlength i | None -> `Unknown_attr (name, value))
689
+
| "maxlength" -> (match parse_int value with Some i -> `Maxlength i | None -> `Unknown_attr (name, value))
690
+
| "pattern" -> `Pattern value
691
+
| "size" -> (match parse_int value with Some i -> `Size i | None -> `Unknown_attr (name, value))
692
+
| "cols" -> (match parse_int value with Some i -> `Cols i | None -> `Unknown_attr (name, value))
693
+
| "rows" -> (match parse_int value with Some i -> `Rows i | None -> `Unknown_attr (name, value))
694
+
| "wrap" -> (match parse_wrap value_lower with Some w -> `Wrap w | None -> `Unknown_attr (name, value))
695
+
| "accept" -> `Accept value
696
+
| "capture" -> (match parse_capture value_lower with Some c -> `Capture c | None -> `Unknown_attr (name, value))
697
+
| "dirname" -> `Dirname value
698
+
| "for" -> `For value
699
+
| "list" -> `List value
700
+
701
+
(* Form submission attributes *)
702
+
| "formaction" -> `Formaction value
703
+
| "formmethod" -> (match parse_method value_lower with Some m -> `Formmethod m | None -> `Unknown_attr (name, value))
704
+
| "formenctype" -> (match parse_enctype value_lower with Some e -> `Formenctype e | None -> `Unknown_attr (name, value))
705
+
| "formnovalidate" -> `Formnovalidate
706
+
| "formtarget" -> `Formtarget (parse_target value)
707
+
708
+
(* Table attributes *)
709
+
| "colspan" -> (match parse_int value with Some i -> `Colspan i | None -> `Unknown_attr (name, value))
710
+
| "rowspan" -> (match parse_int value with Some i -> `Rowspan i | None -> `Unknown_attr (name, value))
711
+
| "headers" -> `Headers value
712
+
| "scope" -> (match parse_scope value_lower with Some s -> `Scope s | None -> `Unknown_attr (name, value))
713
+
| "span" -> (match parse_int value with Some i -> `Span i | None -> `Unknown_attr (name, value))
714
+
715
+
(* Details/Dialog *)
716
+
| "open" -> `Open
717
+
718
+
(* Script *)
719
+
| "async" -> `Async
720
+
| "defer" -> `Defer
721
+
| "integrity" -> `Integrity value
722
+
| "nomodule" -> `Nomodule
723
+
| "blocking" -> `Blocking value
724
+
725
+
(* Meta *)
726
+
| "charset" -> `Charset value
727
+
| "content" -> `Content value
728
+
| "http-equiv" -> `Http_equiv value
729
+
730
+
(* Link element *)
731
+
| "as" -> `As value
732
+
| "imagesizes" -> `Imagesizes value
733
+
| "imagesrcset" -> `Imagesrcset value
734
+
735
+
(* Object *)
736
+
| "data" -> `Data_object value
737
+
738
+
(* Meter/Progress *)
739
+
| "low" -> (match parse_float value with Some f -> `Low f | None -> `Unknown_attr (name, value))
740
+
| "high" -> (match parse_float value with Some f -> `High f | None -> `Unknown_attr (name, value))
741
+
| "optimum" -> (match parse_float value with Some f -> `Optimum f | None -> `Unknown_attr (name, value))
742
+
743
+
(* Time *)
744
+
| "datetime" -> `Datetime value
745
+
746
+
(* Ol *)
747
+
| "start" -> (match parse_int value with Some i -> `Start i | None -> `Unknown_attr (name, value))
748
+
| "reversed" -> `Reversed
749
+
750
+
(* Track *)
751
+
| "kind" -> (match parse_kind value_lower with Some k -> `Kind k | None -> `Unknown_attr (name, value))
752
+
| "srclang" -> `Srclang value
753
+
| "default" -> `Default
754
+
755
+
(* Td/Th *)
756
+
| "abbr" -> `Abbr value
757
+
758
+
(* RDFa *)
759
+
| "property" -> `Property value
760
+
| "typeof" -> `Typeof value
761
+
| "resource" -> `Resource value
762
+
| "prefix" -> `Prefix value
763
+
| "vocab" -> `Vocab value
764
+
| "about" -> `About value
765
+
| "datatype" -> `Datatype value
766
+
| "inlist" -> `Inlist
767
+
| "rev" -> `Rev value
768
+
769
+
(* Data attributes *)
770
+
| _ when String.starts_with ~prefix:"data-" name_lower ->
771
+
let data_name = String.sub name_lower 5 (String.length name_lower - 5) in
772
+
`Data_attr (data_name, value)
773
+
774
+
(* Escape hatch *)
775
+
| _ -> `Unknown_attr (name, value)
776
+
777
+
(** Parse multiple attributes *)
778
+
let parse_attrs (attrs : (string * string) list) : t list =
779
+
List.map (fun (n, v) -> parse_attr n v) attrs
780
+
781
+
(** {1 Accessor Functions} *)
782
+
783
+
(** Get id attribute *)
784
+
let get_id attrs =
785
+
List.find_map (function `Id s -> Some s | _ -> None) attrs
786
+
787
+
(** Get class attribute *)
788
+
let get_class attrs =
789
+
List.find_map (function `Class s -> Some s | _ -> None) attrs
790
+
791
+
(** Get href attribute *)
792
+
let get_href attrs =
793
+
List.find_map (function `Href s -> Some s | _ -> None) attrs
794
+
795
+
(** Get src attribute *)
796
+
let get_src attrs =
797
+
List.find_map (function `Src s -> Some s | _ -> None) attrs
798
+
799
+
(** Get alt attribute *)
800
+
let get_alt attrs =
801
+
List.find_map (function `Alt s -> Some s | _ -> None) attrs
802
+
803
+
(** Get name attribute *)
804
+
let get_name attrs =
805
+
List.find_map (function `Name s -> Some s | _ -> None) attrs
806
+
807
+
(** Get value attribute *)
808
+
let get_value attrs =
809
+
List.find_map (function `Value s -> Some s | _ -> None) attrs
810
+
811
+
(** Get role attribute *)
812
+
let get_role attrs =
813
+
List.find_map (function `Role s -> Some s | _ -> None) attrs
814
+
815
+
(** Get a specific aria-* attribute *)
816
+
let get_aria name attrs =
817
+
List.find_map (function `Aria (n, v) when n = name -> Some v | _ -> None) attrs
818
+
819
+
(** Get a specific data-* attribute *)
820
+
let get_data name attrs =
821
+
List.find_map (function `Data_attr (n, v) when n = name -> Some v | _ -> None) attrs
822
+
823
+
(** Check if disabled is present *)
824
+
let has_disabled attrs =
825
+
List.exists (function `Disabled -> true | _ -> false) attrs
826
+
827
+
(** Check if required is present *)
828
+
let has_required attrs =
829
+
List.exists (function `Required -> true | _ -> false) attrs
830
+
831
+
(** Check if readonly is present *)
832
+
let has_readonly attrs =
833
+
List.exists (function `Readonly -> true | _ -> false) attrs
834
+
835
+
(** Check if checked is present *)
836
+
let has_checked attrs =
837
+
List.exists (function `Checked -> true | _ -> false) attrs
838
+
839
+
(** Check if autofocus is present *)
840
+
let has_autofocus attrs =
841
+
List.exists (function `Autofocus -> true | _ -> false) attrs
842
+
843
+
(** Check if hidden is present *)
844
+
let has_hidden attrs =
845
+
List.exists (function `Hidden _ -> true | _ -> false) attrs
846
+
847
+
(** Check if inert is present *)
848
+
let has_inert attrs =
849
+
List.exists (function `Inert -> true | _ -> false) attrs
850
+
851
+
(** Check if open is present *)
852
+
let has_open attrs =
853
+
List.exists (function `Open -> true | _ -> false) attrs
854
+
855
+
(** Get all aria-* attributes *)
856
+
let get_all_aria attrs =
857
+
List.filter_map (function `Aria (n, v) -> Some (n, v) | _ -> None) attrs
858
+
859
+
(** Get all data-* attributes *)
860
+
let get_all_data attrs =
861
+
List.filter_map (function `Data_attr (n, v) -> Some (n, v) | _ -> None) attrs
862
+
863
+
(** Find an attribute matching a predicate *)
864
+
let find f attrs =
865
+
List.find_map f attrs
866
+
867
+
(** Check if any attribute matches *)
868
+
let exists f attrs =
869
+
List.exists f attrs
870
+
871
+
(** Filter attributes *)
872
+
let filter f attrs =
873
+
List.filter f attrs
+289
lib/htmlrw_check/element/element.ml
+289
lib/htmlrw_check/element/element.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Typed HTML5 element representation.
7
+
8
+
This module combines tags and attributes into a complete typed element
9
+
representation with conversion functions. *)
10
+
11
+
(** {1 Element Type} *)
12
+
13
+
(** A typed HTML element *)
14
+
type t = {
15
+
tag : Tag.element_tag;
16
+
attrs : Attr.t list;
17
+
raw_attrs : (string * string) list; (** Original for fallback *)
18
+
}
19
+
20
+
(** {1 Parsing Functions} *)
21
+
22
+
(** Parse element-specific type attribute based on tag *)
23
+
let parse_type_attr (tag : Tag.html_tag) value : Attr.t =
24
+
let value_lower = String.lowercase_ascii value in
25
+
match tag with
26
+
| `Input ->
27
+
(match Attr.parse_input_type value_lower with
28
+
| Some t -> `Type_input t
29
+
| None -> `Unknown_attr ("type", value))
30
+
| `Button ->
31
+
(match Attr.parse_button_type value_lower with
32
+
| Some t -> `Type_button t
33
+
| None -> `Unknown_attr ("type", value))
34
+
| `Script -> `Type_script value
35
+
| `Link -> `Type_link value
36
+
| `Ol ->
37
+
(match Attr.parse_list_type value_lower with
38
+
| Some t -> `Type_list t
39
+
| None -> `Unknown_attr ("type", value))
40
+
| _ -> `Unknown_attr ("type", value)
41
+
42
+
(** Parse attributes with element context for type attribute *)
43
+
let parse_attrs_for_tag (tag : Tag.element_tag) (raw_attrs : (string * string) list) : Attr.t list =
44
+
List.map (fun (name, value) ->
45
+
let name_lower = String.lowercase_ascii name in
46
+
if name_lower = "type" then
47
+
match tag with
48
+
| Tag.Html html_tag -> parse_type_attr html_tag value
49
+
| _ -> `Unknown_attr (name, value)
50
+
else
51
+
Attr.parse_attr name value
52
+
) raw_attrs
53
+
54
+
(** Create an element from raw input *)
55
+
let create ~name ~namespace ~attrs:raw_attrs =
56
+
let tag = Tag.tag_of_string ?namespace name in
57
+
let attrs = parse_attrs_for_tag tag raw_attrs in
58
+
{ tag; attrs; raw_attrs }
59
+
60
+
(** {1 Accessor Functions} *)
61
+
62
+
(** Get the tag *)
63
+
let tag elem = elem.tag
64
+
65
+
(** Get typed attributes *)
66
+
let attrs elem = elem.attrs
67
+
68
+
(** Get raw attributes *)
69
+
let raw_attrs elem = elem.raw_attrs
70
+
71
+
(** Get the tag name as string *)
72
+
let tag_name elem = Tag.tag_to_string elem.tag
73
+
74
+
(** Check if element is a specific HTML tag *)
75
+
let is_html_tag expected elem =
76
+
Tag.is_html_tag expected elem.tag
77
+
78
+
(** Get the HTML tag if this is an HTML element *)
79
+
let as_html_tag elem =
80
+
Tag.as_html_tag elem.tag
81
+
82
+
(** {1 Attribute Accessors (delegated to Attr module)} *)
83
+
84
+
let get_id elem = Attr.get_id elem.attrs
85
+
let get_class elem = Attr.get_class elem.attrs
86
+
let get_href elem = Attr.get_href elem.attrs
87
+
let get_src elem = Attr.get_src elem.attrs
88
+
let get_alt elem = Attr.get_alt elem.attrs
89
+
let get_name elem = Attr.get_name elem.attrs
90
+
let get_value elem = Attr.get_value elem.attrs
91
+
let get_role elem = Attr.get_role elem.attrs
92
+
let get_aria name elem = Attr.get_aria name elem.attrs
93
+
let get_data name elem = Attr.get_data name elem.attrs
94
+
95
+
let has_disabled elem = Attr.has_disabled elem.attrs
96
+
let has_required elem = Attr.has_required elem.attrs
97
+
let has_readonly elem = Attr.has_readonly elem.attrs
98
+
let has_checked elem = Attr.has_checked elem.attrs
99
+
let has_autofocus elem = Attr.has_autofocus elem.attrs
100
+
let has_hidden elem = Attr.has_hidden elem.attrs
101
+
let has_inert elem = Attr.has_inert elem.attrs
102
+
let has_open elem = Attr.has_open elem.attrs
103
+
104
+
let get_all_aria elem = Attr.get_all_aria elem.attrs
105
+
let get_all_data elem = Attr.get_all_data elem.attrs
106
+
107
+
(** {1 Category Checks} *)
108
+
109
+
(** Check if this is a void element *)
110
+
let is_void elem =
111
+
match elem.tag with
112
+
| Tag.Html t -> Tag.is_void t
113
+
| _ -> false
114
+
115
+
(** Check if this is a heading element *)
116
+
let is_heading elem =
117
+
match elem.tag with
118
+
| Tag.Html t -> Tag.is_heading t
119
+
| _ -> false
120
+
121
+
(** Get heading level (1-6) or None *)
122
+
let heading_level elem =
123
+
match elem.tag with
124
+
| Tag.Html t -> Tag.heading_level t
125
+
| _ -> None
126
+
127
+
(** Check if this is sectioning content *)
128
+
let is_sectioning elem =
129
+
match elem.tag with
130
+
| Tag.Html t -> Tag.is_sectioning t
131
+
| _ -> false
132
+
133
+
(** Check if this is a sectioning root *)
134
+
let is_sectioning_root elem =
135
+
match elem.tag with
136
+
| Tag.Html t -> Tag.is_sectioning_root t
137
+
| _ -> false
138
+
139
+
(** Check if this is embedded content *)
140
+
let is_embedded elem =
141
+
match elem.tag with
142
+
| Tag.Html t -> Tag.is_embedded t
143
+
| _ -> false
144
+
145
+
(** Check if this is interactive content *)
146
+
let is_interactive elem =
147
+
match elem.tag with
148
+
| Tag.Html t -> Tag.is_interactive t
149
+
| _ -> false
150
+
151
+
(** Check if this is form-associated *)
152
+
let is_form_associated elem =
153
+
match elem.tag with
154
+
| Tag.Html t -> Tag.is_form_associated t
155
+
| _ -> false
156
+
157
+
(** Check if this is labelable *)
158
+
let is_labelable elem =
159
+
match elem.tag with
160
+
| Tag.Html t -> Tag.is_labelable t
161
+
| _ -> false
162
+
163
+
(** Check if this is submittable *)
164
+
let is_submittable elem =
165
+
match elem.tag with
166
+
| Tag.Html t -> Tag.is_submittable t
167
+
| _ -> false
168
+
169
+
(** Check if this is a table element *)
170
+
let is_table_element elem =
171
+
match elem.tag with
172
+
| Tag.Html t -> Tag.is_table_element t
173
+
| _ -> false
174
+
175
+
(** Check if this is a media element *)
176
+
let is_media elem =
177
+
match elem.tag with
178
+
| Tag.Html t -> Tag.is_media t
179
+
| _ -> false
180
+
181
+
(** Check if this is a list container *)
182
+
let is_list_container elem =
183
+
match elem.tag with
184
+
| Tag.Html t -> Tag.is_list_container t
185
+
| _ -> false
186
+
187
+
(** Check if this has transparent content model *)
188
+
let is_transparent elem =
189
+
match elem.tag with
190
+
| Tag.Html t -> Tag.is_transparent t
191
+
| _ -> false
192
+
193
+
(** Check if this is phrasing content *)
194
+
let is_phrasing elem =
195
+
match elem.tag with
196
+
| Tag.Html t -> Tag.is_phrasing t
197
+
| _ -> false
198
+
199
+
(** Check if this is flow content *)
200
+
let is_flow elem =
201
+
match elem.tag with
202
+
| Tag.Html t -> Tag.is_flow t
203
+
| _ -> true (* Custom elements are flow content *)
204
+
205
+
(** Check if this is a deprecated element *)
206
+
let is_obsolete elem =
207
+
match elem.tag with
208
+
| Tag.Html t -> Tag.is_obsolete t
209
+
| _ -> false
210
+
211
+
(** Check if this is an SVG element *)
212
+
let is_svg elem =
213
+
match elem.tag with
214
+
| Tag.Svg _ -> true
215
+
| _ -> false
216
+
217
+
(** Check if this is a MathML element *)
218
+
let is_mathml elem =
219
+
match elem.tag with
220
+
| Tag.MathML _ -> true
221
+
| _ -> false
222
+
223
+
(** Check if this is a custom element *)
224
+
let is_custom elem =
225
+
match elem.tag with
226
+
| Tag.Custom _ -> true
227
+
| _ -> false
228
+
229
+
(** Check if this is an unknown element *)
230
+
let is_unknown elem =
231
+
match elem.tag with
232
+
| Tag.Unknown _ -> true
233
+
| _ -> false
234
+
235
+
(** {1 Input Type Utilities} *)
236
+
237
+
(** Get input type for input elements *)
238
+
let get_input_type elem =
239
+
match elem.tag with
240
+
| Tag.Html `Input ->
241
+
List.find_map (function
242
+
| `Type_input t -> Some t
243
+
| _ -> None
244
+
) elem.attrs
245
+
| _ -> None
246
+
247
+
(** Get button type for button elements *)
248
+
let get_button_type elem =
249
+
match elem.tag with
250
+
| Tag.Html `Button ->
251
+
List.find_map (function
252
+
| `Type_button t -> Some t
253
+
| _ -> None
254
+
) elem.attrs
255
+
| _ -> None
256
+
257
+
(** Check if input is of a specific type *)
258
+
let is_input_type expected elem =
259
+
match get_input_type elem with
260
+
| Some t -> t = expected
261
+
| None -> false
262
+
263
+
(** {1 Raw Attribute Fallback} *)
264
+
265
+
(** Get raw attribute value (from original attrs) *)
266
+
let get_raw_attr name elem =
267
+
List.find_map (fun (n, v) ->
268
+
if String.lowercase_ascii n = String.lowercase_ascii name then Some v else None
269
+
) elem.raw_attrs
270
+
271
+
(** Check if raw attribute exists *)
272
+
let has_raw_attr name elem =
273
+
List.exists (fun (n, _) ->
274
+
String.lowercase_ascii n = String.lowercase_ascii name
275
+
) elem.raw_attrs
276
+
277
+
(** {1 Pattern Matching Helpers} *)
278
+
279
+
(** Match on HTML tag or return None *)
280
+
let match_html elem f =
281
+
match elem.tag with
282
+
| Tag.Html tag -> Some (f tag)
283
+
| _ -> None
284
+
285
+
(** Match on specific HTML tag *)
286
+
let when_html_tag expected elem f =
287
+
match elem.tag with
288
+
| Tag.Html tag when tag = expected -> Some (f ())
289
+
| _ -> None
+523
lib/htmlrw_check/element/tag.ml
+523
lib/htmlrw_check/element/tag.ml
···
1
+
(*---------------------------------------------------------------------------
2
+
Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3
+
SPDX-License-Identifier: MIT
4
+
---------------------------------------------------------------------------*)
5
+
6
+
(** Typed HTML5 tag representations using polymorphic variants.
7
+
8
+
This module provides compile-time type safety for HTML elements while
9
+
maintaining escape hatches for unknown/custom elements. *)
10
+
11
+
(** {1 HTML Tag Types} *)
12
+
13
+
(** All standard HTML5 elements plus deprecated elements needed by the validator *)
14
+
type html_tag = [
15
+
(* Document metadata *)
16
+
| `Html | `Head | `Title | `Base | `Link | `Meta | `Style
17
+
18
+
(* Sectioning root *)
19
+
| `Body
20
+
21
+
(* Content sectioning *)
22
+
| `Address | `Article | `Aside | `Footer | `Header | `Hgroup
23
+
| `Main | `Nav | `Search | `Section
24
+
25
+
(* Heading content *)
26
+
| `H1 | `H2 | `H3 | `H4 | `H5 | `H6
27
+
28
+
(* Grouping content *)
29
+
| `Blockquote | `Dd | `Div | `Dl | `Dt | `Figcaption | `Figure
30
+
| `Hr | `Li | `Menu | `Ol | `P | `Pre | `Ul
31
+
32
+
(* Text-level semantics *)
33
+
| `A | `Abbr | `B | `Bdi | `Bdo | `Br | `Cite | `Code | `Data
34
+
| `Dfn | `Em | `I | `Kbd | `Mark | `Q | `Rp | `Rt | `Ruby
35
+
| `S | `Samp | `Small | `Span | `Strong | `Sub | `Sup | `Time
36
+
| `U | `Var | `Wbr
37
+
38
+
(* Edits *)
39
+
| `Del | `Ins
40
+
41
+
(* Embedded content *)
42
+
| `Area | `Audio | `Canvas | `Embed | `Iframe | `Img | `Map | `Object
43
+
| `Picture | `Source | `Track | `Video
44
+
45
+
(* Tabular data *)
46
+
| `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
47
+
| `Th | `Thead | `Tr
48
+
49
+
(* Forms *)
50
+
| `Button | `Datalist | `Fieldset | `Form | `Input | `Label
51
+
| `Legend | `Meter | `Optgroup | `Option | `Output | `Progress
52
+
| `Select | `Textarea
53
+
54
+
(* Interactive elements *)
55
+
| `Details | `Dialog | `Summary
56
+
57
+
(* Scripting *)
58
+
| `Noscript | `Script | `Slot | `Template
59
+
60
+
(* Web Components / Misc *)
61
+
| `Portal | `Param
62
+
63
+
(* Deprecated/obsolete elements (needed by validator) *)
64
+
| `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset
65
+
| `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid
66
+
| `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp
67
+
| `Basefont | `Big | `Blink | `Center | `Font | `Marquee
68
+
| `Multicol | `Nobr | `Spacer | `Tt | `Image
69
+
]
70
+
71
+
(** {1 Category Types}
72
+
73
+
Categories as type aliases for subsets, enabling functions that only accept
74
+
specific categories with compile-time checking. *)
75
+
76
+
(** Void elements - cannot have children *)
77
+
type void_tag = [
78
+
| `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input
79
+
| `Link | `Meta | `Source | `Track | `Wbr
80
+
(* Deprecated void elements *)
81
+
| `Basefont | `Frame | `Isindex | `Keygen | `Param
82
+
]
83
+
84
+
(** Heading elements *)
85
+
type heading_tag = [ `H1 | `H2 | `H3 | `H4 | `H5 | `H6 ]
86
+
87
+
(** Sectioning content *)
88
+
type sectioning_tag = [ `Article | `Aside | `Nav | `Section ]
89
+
90
+
(** Sectioning roots (establish their own outline) *)
91
+
type sectioning_root_tag = [
92
+
| `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td
93
+
]
94
+
95
+
(** Embedded content *)
96
+
type embedded_tag = [
97
+
| `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video
98
+
]
99
+
100
+
(** Interactive content (focusable/activatable) *)
101
+
type interactive_tag = [
102
+
| `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img
103
+
| `Input | `Label | `Select | `Textarea | `Video
104
+
]
105
+
106
+
(** Form-associated elements *)
107
+
type form_associated_tag = [
108
+
| `Button | `Fieldset | `Input | `Label | `Object | `Output
109
+
| `Select | `Textarea | `Meter | `Progress
110
+
]
111
+
112
+
(** Labelable elements *)
113
+
type labelable_tag = [
114
+
| `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea
115
+
]
116
+
117
+
(** Submittable elements *)
118
+
type submittable_tag = [
119
+
| `Button | `Input | `Select | `Textarea
120
+
]
121
+
122
+
(** Resettable elements *)
123
+
type resettable_tag = [
124
+
| `Input | `Output | `Select | `Textarea
125
+
]
126
+
127
+
(** Table elements *)
128
+
type table_tag = [
129
+
| `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
130
+
| `Th | `Thead | `Tr
131
+
]
132
+
133
+
(** Media elements *)
134
+
type media_tag = [ `Audio | `Video ]
135
+
136
+
(** List container elements *)
137
+
type list_container_tag = [ `Ul | `Ol | `Menu | `Dl ]
138
+
139
+
(** List item elements *)
140
+
type list_item_tag = [ `Li | `Dd | `Dt ]
141
+
142
+
(** Script-supporting elements *)
143
+
type script_supporting_tag = [ `Script | `Template ]
144
+
145
+
(** Metadata content *)
146
+
type metadata_tag = [ `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title ]
147
+
148
+
(** {1 Top-Level Element Type} *)
149
+
150
+
(** Top-level element classification *)
151
+
type element_tag =
152
+
| Html of html_tag (** Known HTML5 element *)
153
+
| Svg of string (** SVG element by local name *)
154
+
| MathML of string (** MathML element by local name *)
155
+
| Custom of string (** Custom element like <my-widget> *)
156
+
| Unknown of string (** Truly unknown element *)
157
+
158
+
(** {1 Conversion Functions} *)
159
+
160
+
(** Convert a lowercase tag name string to html_tag option *)
161
+
let html_tag_of_string_opt name =
162
+
match name with
163
+
(* Document metadata *)
164
+
| "html" -> Some `Html | "head" -> Some `Head | "title" -> Some `Title
165
+
| "base" -> Some `Base | "link" -> Some `Link | "meta" -> Some `Meta
166
+
| "style" -> Some `Style
167
+
(* Sectioning root *)
168
+
| "body" -> Some `Body
169
+
(* Content sectioning *)
170
+
| "address" -> Some `Address | "article" -> Some `Article | "aside" -> Some `Aside
171
+
| "footer" -> Some `Footer | "header" -> Some `Header | "hgroup" -> Some `Hgroup
172
+
| "main" -> Some `Main | "nav" -> Some `Nav | "search" -> Some `Search
173
+
| "section" -> Some `Section
174
+
(* Headings *)
175
+
| "h1" -> Some `H1 | "h2" -> Some `H2 | "h3" -> Some `H3
176
+
| "h4" -> Some `H4 | "h5" -> Some `H5 | "h6" -> Some `H6
177
+
(* Grouping content *)
178
+
| "blockquote" -> Some `Blockquote | "dd" -> Some `Dd | "div" -> Some `Div
179
+
| "dl" -> Some `Dl | "dt" -> Some `Dt | "figcaption" -> Some `Figcaption
180
+
| "figure" -> Some `Figure | "hr" -> Some `Hr | "li" -> Some `Li
181
+
| "menu" -> Some `Menu | "ol" -> Some `Ol | "p" -> Some `P
182
+
| "pre" -> Some `Pre | "ul" -> Some `Ul
183
+
(* Text-level semantics *)
184
+
| "a" -> Some `A | "abbr" -> Some `Abbr | "b" -> Some `B
185
+
| "bdi" -> Some `Bdi | "bdo" -> Some `Bdo | "br" -> Some `Br
186
+
| "cite" -> Some `Cite | "code" -> Some `Code | "data" -> Some `Data
187
+
| "dfn" -> Some `Dfn | "em" -> Some `Em | "i" -> Some `I
188
+
| "kbd" -> Some `Kbd | "mark" -> Some `Mark | "q" -> Some `Q
189
+
| "rp" -> Some `Rp | "rt" -> Some `Rt | "ruby" -> Some `Ruby
190
+
| "s" -> Some `S | "samp" -> Some `Samp | "small" -> Some `Small
191
+
| "span" -> Some `Span | "strong" -> Some `Strong | "sub" -> Some `Sub
192
+
| "sup" -> Some `Sup | "time" -> Some `Time | "u" -> Some `U
193
+
| "var" -> Some `Var | "wbr" -> Some `Wbr
194
+
(* Edits *)
195
+
| "del" -> Some `Del | "ins" -> Some `Ins
196
+
(* Embedded content *)
197
+
| "area" -> Some `Area | "audio" -> Some `Audio | "canvas" -> Some `Canvas
198
+
| "embed" -> Some `Embed | "iframe" -> Some `Iframe | "img" -> Some `Img
199
+
| "map" -> Some `Map | "object" -> Some `Object | "picture" -> Some `Picture
200
+
| "source" -> Some `Source | "track" -> Some `Track | "video" -> Some `Video
201
+
(* Tabular data *)
202
+
| "caption" -> Some `Caption | "col" -> Some `Col | "colgroup" -> Some `Colgroup
203
+
| "table" -> Some `Table | "tbody" -> Some `Tbody | "td" -> Some `Td
204
+
| "tfoot" -> Some `Tfoot | "th" -> Some `Th | "thead" -> Some `Thead
205
+
| "tr" -> Some `Tr
206
+
(* Forms *)
207
+
| "button" -> Some `Button | "datalist" -> Some `Datalist
208
+
| "fieldset" -> Some `Fieldset | "form" -> Some `Form | "input" -> Some `Input
209
+
| "label" -> Some `Label | "legend" -> Some `Legend | "meter" -> Some `Meter
210
+
| "optgroup" -> Some `Optgroup | "option" -> Some `Option
211
+
| "output" -> Some `Output | "progress" -> Some `Progress
212
+
| "select" -> Some `Select | "textarea" -> Some `Textarea
213
+
(* Interactive *)
214
+
| "details" -> Some `Details | "dialog" -> Some `Dialog | "summary" -> Some `Summary
215
+
(* Scripting *)
216
+
| "noscript" -> Some `Noscript | "script" -> Some `Script
217
+
| "slot" -> Some `Slot | "template" -> Some `Template
218
+
(* Web Components / Misc *)
219
+
| "portal" -> Some `Portal | "param" -> Some `Param
220
+
(* Deprecated/obsolete elements *)
221
+
| "applet" -> Some `Applet | "acronym" -> Some `Acronym | "bgsound" -> Some `Bgsound
222
+
| "dir" -> Some `Dir | "frame" -> Some `Frame | "frameset" -> Some `Frameset
223
+
| "noframes" -> Some `Noframes | "isindex" -> Some `Isindex | "keygen" -> Some `Keygen
224
+
| "listing" -> Some `Listing | "menuitem" -> Some `Menuitem | "nextid" -> Some `Nextid
225
+
| "noembed" -> Some `Noembed | "plaintext" -> Some `Plaintext
226
+
| "rb" -> Some `Rb | "rtc" -> Some `Rtc | "strike" -> Some `Strike | "xmp" -> Some `Xmp
227
+
| "basefont" -> Some `Basefont | "big" -> Some `Big | "blink" -> Some `Blink
228
+
| "center" -> Some `Center | "font" -> Some `Font | "marquee" -> Some `Marquee
229
+
| "multicol" -> Some `Multicol | "nobr" -> Some `Nobr | "spacer" -> Some `Spacer
230
+
| "tt" -> Some `Tt | "image" -> Some `Image
231
+
| _ -> None
232
+
233
+
(** Check if a name is a valid custom element name (contains hyphen, not reserved) *)
234
+
let is_custom_element_name name =
235
+
String.contains name '-' &&
236
+
not (String.starts_with ~prefix:"xml" (String.lowercase_ascii name)) &&
237
+
not (String.equal (String.lowercase_ascii name) "annotation-xml")
238
+
239
+
(** SVG namespace URI *)
240
+
let svg_namespace = "http://www.w3.org/2000/svg"
241
+
242
+
(** MathML namespace URI *)
243
+
let mathml_namespace = "http://www.w3.org/1998/Math/MathML"
244
+
245
+
(** Check if namespace is SVG (accepts both short and full URI) *)
246
+
let is_svg_namespace = function
247
+
| "svg" | "http://www.w3.org/2000/svg" -> true
248
+
| _ -> false
249
+
250
+
(** Check if namespace is MathML (accepts both short and full URI) *)
251
+
let is_mathml_namespace = function
252
+
| "mathml" | "http://www.w3.org/1998/Math/MathML" -> true
253
+
| _ -> false
254
+
255
+
(** Convert tag name and optional namespace to element_tag *)
256
+
let tag_of_string ?namespace name =
257
+
let name_lower = String.lowercase_ascii name in
258
+
match namespace with
259
+
| Some ns when is_svg_namespace ns -> Svg name_lower
260
+
| Some ns when is_mathml_namespace ns -> MathML name_lower
261
+
| Some _ -> Unknown name_lower (* Unknown namespace *)
262
+
| None ->
263
+
match html_tag_of_string_opt name_lower with
264
+
| Some tag -> Html tag
265
+
| None ->
266
+
if is_custom_element_name name_lower then
267
+
Custom name_lower
268
+
else
269
+
Unknown name_lower
270
+
271
+
(** Convert html_tag to string *)
272
+
let html_tag_to_string (tag : html_tag) : string =
273
+
match tag with
274
+
(* Document metadata *)
275
+
| `Html -> "html" | `Head -> "head" | `Title -> "title"
276
+
| `Base -> "base" | `Link -> "link" | `Meta -> "meta" | `Style -> "style"
277
+
(* Sectioning root *)
278
+
| `Body -> "body"
279
+
(* Content sectioning *)
280
+
| `Address -> "address" | `Article -> "article" | `Aside -> "aside"
281
+
| `Footer -> "footer" | `Header -> "header" | `Hgroup -> "hgroup"
282
+
| `Main -> "main" | `Nav -> "nav" | `Search -> "search" | `Section -> "section"
283
+
(* Headings *)
284
+
| `H1 -> "h1" | `H2 -> "h2" | `H3 -> "h3"
285
+
| `H4 -> "h4" | `H5 -> "h5" | `H6 -> "h6"
286
+
(* Grouping content *)
287
+
| `Blockquote -> "blockquote" | `Dd -> "dd" | `Div -> "div"
288
+
| `Dl -> "dl" | `Dt -> "dt" | `Figcaption -> "figcaption"
289
+
| `Figure -> "figure" | `Hr -> "hr" | `Li -> "li"
290
+
| `Menu -> "menu" | `Ol -> "ol" | `P -> "p" | `Pre -> "pre" | `Ul -> "ul"
291
+
(* Text-level semantics *)
292
+
| `A -> "a" | `Abbr -> "abbr" | `B -> "b"
293
+
| `Bdi -> "bdi" | `Bdo -> "bdo" | `Br -> "br"
294
+
| `Cite -> "cite" | `Code -> "code" | `Data -> "data"
295
+
| `Dfn -> "dfn" | `Em -> "em" | `I -> "i"
296
+
| `Kbd -> "kbd" | `Mark -> "mark" | `Q -> "q"
297
+
| `Rp -> "rp" | `Rt -> "rt" | `Ruby -> "ruby"
298
+
| `S -> "s" | `Samp -> "samp" | `Small -> "small"
299
+
| `Span -> "span" | `Strong -> "strong" | `Sub -> "sub"
300
+
| `Sup -> "sup" | `Time -> "time" | `U -> "u"
301
+
| `Var -> "var" | `Wbr -> "wbr"
302
+
(* Edits *)
303
+
| `Del -> "del" | `Ins -> "ins"
304
+
(* Embedded content *)
305
+
| `Area -> "area" | `Audio -> "audio" | `Canvas -> "canvas"
306
+
| `Embed -> "embed" | `Iframe -> "iframe" | `Img -> "img"
307
+
| `Map -> "map" | `Object -> "object" | `Picture -> "picture"
308
+
| `Source -> "source" | `Track -> "track" | `Video -> "video"
309
+
(* Tabular data *)
310
+
| `Caption -> "caption" | `Col -> "col" | `Colgroup -> "colgroup"
311
+
| `Table -> "table" | `Tbody -> "tbody" | `Td -> "td"
312
+
| `Tfoot -> "tfoot" | `Th -> "th" | `Thead -> "thead" | `Tr -> "tr"
313
+
(* Forms *)
314
+
| `Button -> "button" | `Datalist -> "datalist"
315
+
| `Fieldset -> "fieldset" | `Form -> "form" | `Input -> "input"
316
+
| `Label -> "label" | `Legend -> "legend" | `Meter -> "meter"
317
+
| `Optgroup -> "optgroup" | `Option -> "option"
318
+
| `Output -> "output" | `Progress -> "progress"
319
+
| `Select -> "select" | `Textarea -> "textarea"
320
+
(* Interactive *)
321
+
| `Details -> "details" | `Dialog -> "dialog" | `Summary -> "summary"
322
+
(* Scripting *)
323
+
| `Noscript -> "noscript" | `Script -> "script"
324
+
| `Slot -> "slot" | `Template -> "template"
325
+
(* Web Components / Misc *)
326
+
| `Portal -> "portal" | `Param -> "param"
327
+
(* Deprecated/obsolete elements *)
328
+
| `Applet -> "applet" | `Acronym -> "acronym" | `Bgsound -> "bgsound"
329
+
| `Dir -> "dir" | `Frame -> "frame" | `Frameset -> "frameset"
330
+
| `Noframes -> "noframes" | `Isindex -> "isindex" | `Keygen -> "keygen"
331
+
| `Listing -> "listing" | `Menuitem -> "menuitem" | `Nextid -> "nextid"
332
+
| `Noembed -> "noembed" | `Plaintext -> "plaintext"
333
+
| `Rb -> "rb" | `Rtc -> "rtc" | `Strike -> "strike" | `Xmp -> "xmp"
334
+
| `Basefont -> "basefont" | `Big -> "big" | `Blink -> "blink"
335
+
| `Center -> "center" | `Font -> "font" | `Marquee -> "marquee"
336
+
| `Multicol -> "multicol" | `Nobr -> "nobr" | `Spacer -> "spacer"
337
+
| `Tt -> "tt" | `Image -> "image"
338
+
339
+
(** Convert element_tag to string *)
340
+
let tag_to_string = function
341
+
| Html tag -> html_tag_to_string tag
342
+
| Svg name -> name
343
+
| MathML name -> name
344
+
| Custom name -> name
345
+
| Unknown name -> name
346
+
347
+
(** {1 Category Predicates} *)
348
+
349
+
(** Check if element is a void element *)
350
+
let is_void (tag : html_tag) : bool =
351
+
match tag with
352
+
| `Area | `Base | `Br | `Col | `Embed | `Hr | `Img | `Input
353
+
| `Link | `Meta | `Source | `Track | `Wbr
354
+
| `Basefont | `Frame | `Isindex | `Keygen | `Param -> true
355
+
| _ -> false
356
+
357
+
(** Check if element is a heading *)
358
+
let is_heading (tag : html_tag) : bool =
359
+
match tag with
360
+
| `H1 | `H2 | `H3 | `H4 | `H5 | `H6 -> true
361
+
| _ -> false
362
+
363
+
(** Get heading level (1-6) or None *)
364
+
let heading_level (tag : html_tag) : int option =
365
+
match tag with
366
+
| `H1 -> Some 1 | `H2 -> Some 2 | `H3 -> Some 3
367
+
| `H4 -> Some 4 | `H5 -> Some 5 | `H6 -> Some 6
368
+
| _ -> None
369
+
370
+
(** Check if element is sectioning content *)
371
+
let is_sectioning (tag : html_tag) : bool =
372
+
match tag with
373
+
| `Article | `Aside | `Nav | `Section -> true
374
+
| _ -> false
375
+
376
+
(** Check if element is a sectioning root *)
377
+
let is_sectioning_root (tag : html_tag) : bool =
378
+
match tag with
379
+
| `Blockquote | `Body | `Details | `Dialog | `Fieldset | `Figure | `Td -> true
380
+
| _ -> false
381
+
382
+
(** Check if element is embedded content *)
383
+
let is_embedded (tag : html_tag) : bool =
384
+
match tag with
385
+
| `Audio | `Canvas | `Embed | `Iframe | `Img | `Object | `Picture | `Video -> true
386
+
| _ -> false
387
+
388
+
(** Check if element is interactive content *)
389
+
let is_interactive (tag : html_tag) : bool =
390
+
match tag with
391
+
| `A | `Audio | `Button | `Details | `Embed | `Iframe | `Img
392
+
| `Input | `Label | `Select | `Textarea | `Video -> true
393
+
| _ -> false
394
+
395
+
(** Check if element is form-associated *)
396
+
let is_form_associated (tag : html_tag) : bool =
397
+
match tag with
398
+
| `Button | `Fieldset | `Input | `Label | `Object | `Output
399
+
| `Select | `Textarea | `Meter | `Progress -> true
400
+
| _ -> false
401
+
402
+
(** Check if element is labelable *)
403
+
let is_labelable (tag : html_tag) : bool =
404
+
match tag with
405
+
| `Button | `Input | `Meter | `Output | `Progress | `Select | `Textarea -> true
406
+
| _ -> false
407
+
408
+
(** Check if element is submittable *)
409
+
let is_submittable (tag : html_tag) : bool =
410
+
match tag with
411
+
| `Button | `Input | `Select | `Textarea -> true
412
+
| _ -> false
413
+
414
+
(** Check if element is resettable *)
415
+
let is_resettable (tag : html_tag) : bool =
416
+
match tag with
417
+
| `Input | `Output | `Select | `Textarea -> true
418
+
| _ -> false
419
+
420
+
(** Check if element has transparent content model *)
421
+
let is_transparent (tag : html_tag) : bool =
422
+
match tag with
423
+
| `A | `Abbr | `Audio | `Canvas | `Del | `Ins | `Map | `Noscript
424
+
| `Object | `Slot | `Video -> true
425
+
| _ -> false
426
+
427
+
(** Check if element is script-supporting *)
428
+
let is_script_supporting (tag : html_tag) : bool =
429
+
match tag with
430
+
| `Script | `Template -> true
431
+
| _ -> false
432
+
433
+
(** Check if element is a table element *)
434
+
let is_table_element (tag : html_tag) : bool =
435
+
match tag with
436
+
| `Caption | `Col | `Colgroup | `Table | `Tbody | `Td | `Tfoot
437
+
| `Th | `Thead | `Tr -> true
438
+
| _ -> false
439
+
440
+
(** Check if element is a media element *)
441
+
let is_media (tag : html_tag) : bool =
442
+
match tag with
443
+
| `Audio | `Video -> true
444
+
| _ -> false
445
+
446
+
(** Check if element is a list container *)
447
+
let is_list_container (tag : html_tag) : bool =
448
+
match tag with
449
+
| `Ul | `Ol | `Menu | `Dl -> true
450
+
| _ -> false
451
+
452
+
(** Check if element is a list item *)
453
+
let is_list_item (tag : html_tag) : bool =
454
+
match tag with
455
+
| `Li | `Dd | `Dt -> true
456
+
| _ -> false
457
+
458
+
(** Check if element is metadata content *)
459
+
let is_metadata (tag : html_tag) : bool =
460
+
match tag with
461
+
| `Base | `Link | `Meta | `Noscript | `Script | `Style | `Template | `Title -> true
462
+
| _ -> false
463
+
464
+
(** Check if element is a deprecated/obsolete element *)
465
+
let is_obsolete (tag : html_tag) : bool =
466
+
match tag with
467
+
| `Applet | `Acronym | `Bgsound | `Dir | `Frame | `Frameset
468
+
| `Noframes | `Isindex | `Keygen | `Listing | `Menuitem | `Nextid
469
+
| `Noembed | `Plaintext | `Rb | `Rtc | `Strike | `Xmp
470
+
| `Basefont | `Big | `Blink | `Center | `Font | `Marquee
471
+
| `Multicol | `Nobr | `Spacer | `Tt | `Image -> true
472
+
| _ -> false
473
+
474
+
(** Check if element is a raw text element (script, style) *)
475
+
let is_raw_text (tag : html_tag) : bool =
476
+
match tag with
477
+
| `Script | `Style -> true
478
+
| _ -> false
479
+
480
+
(** Check if element is an escapable raw text element (textarea, title) *)
481
+
let is_escapable_raw_text (tag : html_tag) : bool =
482
+
match tag with
483
+
| `Textarea | `Title -> true
484
+
| _ -> false
485
+
486
+
(** Check if element is a phrasing content element *)
487
+
let is_phrasing (tag : html_tag) : bool =
488
+
match tag with
489
+
| `A | `Abbr | `Audio | `B | `Bdi | `Bdo | `Br | `Button | `Canvas
490
+
| `Cite | `Code | `Data | `Datalist | `Del | `Dfn | `Em | `Embed
491
+
| `I | `Iframe | `Img | `Input | `Ins | `Kbd | `Label | `Map | `Mark
492
+
| `Meter | `Noscript | `Object | `Output | `Picture | `Progress | `Q
493
+
| `Ruby | `S | `Samp | `Script | `Select | `Slot | `Small | `Span
494
+
| `Strong | `Sub | `Sup | `Template | `Textarea | `Time | `U | `Var
495
+
| `Video | `Wbr
496
+
(* Deprecated phrasing *)
497
+
| `Acronym | `Big | `Blink | `Font | `Marquee | `Nobr | `Spacer | `Tt -> true
498
+
| _ -> false
499
+
500
+
(** Check if element is flow content *)
501
+
let is_flow (tag : html_tag) : bool =
502
+
match tag with
503
+
(* Most elements are flow content *)
504
+
| `Html | `Head | `Title | `Base | `Link | `Meta | `Style -> false
505
+
| `Body -> false
506
+
| `Caption | `Col | `Colgroup | `Tbody | `Td | `Tfoot | `Th | `Thead | `Tr -> false
507
+
| `Dd | `Dt | `Li -> false
508
+
| `Optgroup | `Option -> false
509
+
| `Param | `Source | `Track -> false
510
+
| `Area -> false (* Only when descendant of map *)
511
+
| `Rp | `Rt | `Rb | `Rtc -> false
512
+
| `Legend | `Figcaption | `Summary -> false
513
+
| _ -> true
514
+
515
+
(** Pattern for matching HTML tags in element_tag *)
516
+
let as_html_tag = function
517
+
| Html tag -> Some tag
518
+
| _ -> None
519
+
520
+
(** Pattern for matching specific HTML tag *)
521
+
let is_html_tag expected = function
522
+
| Html tag -> tag = expected
523
+
| _ -> false
+36
-31
lib/htmlrw_check/semantic/autofocus_checker.ml
+36
-31
lib/htmlrw_check/semantic/autofocus_checker.ml
···
25
25
state.context_stack <- [];
26
26
state.current_depth <- 0
27
27
28
-
let start_element state ~name ~namespace ~attrs collector =
28
+
let start_element state ~element collector =
29
29
state.current_depth <- state.current_depth + 1;
30
30
31
-
match namespace with
32
-
| Some _ -> ()
33
-
| None ->
34
-
let name_lower = String.lowercase_ascii name in
35
-
36
-
(* Check if we're entering a dialog or popover context *)
37
-
let enters_context = match name_lower with
38
-
| "dialog" -> Some Dialog
39
-
| _ when Attr_utils.has_attr "popover" attrs -> Some Popover
40
-
| _ -> None
41
-
in
42
-
43
-
Option.iter (fun ctx_type ->
44
-
let ctx = { context_type = ctx_type; autofocus_count = 0; depth = state.current_depth } in
31
+
match element.Element.tag with
32
+
| Tag.Html `Dialog ->
33
+
let ctx = { context_type = Dialog; autofocus_count = 0; depth = state.current_depth } in
34
+
state.context_stack <- ctx :: state.context_stack;
35
+
(* Check for autofocus on dialog itself *)
36
+
if Attr.has_autofocus element.attrs then
37
+
begin match state.context_stack with
38
+
| ctx :: _ ->
39
+
ctx.autofocus_count <- ctx.autofocus_count + 1;
40
+
if ctx.autofocus_count > 1 then
41
+
Message_collector.add_typed collector (`Misc `Multiple_autofocus)
42
+
| [] -> ()
43
+
end
44
+
| Tag.Html _ ->
45
+
(* Check if element has popover attribute *)
46
+
let has_popover = Attr_utils.has_attr "popover" element.raw_attrs in
47
+
if has_popover then begin
48
+
let ctx = { context_type = Popover; autofocus_count = 0; depth = state.current_depth } in
45
49
state.context_stack <- ctx :: state.context_stack
46
-
) enters_context;
47
-
50
+
end;
48
51
(* Check for autofocus attribute *)
49
-
if Attr_utils.has_attr "autofocus" attrs then
52
+
if Attr.has_autofocus element.attrs then begin
50
53
match state.context_stack with
51
54
| ctx :: _ ->
52
55
ctx.autofocus_count <- ctx.autofocus_count + 1;
53
56
if ctx.autofocus_count > 1 then
54
57
Message_collector.add_typed collector (`Misc `Multiple_autofocus)
55
58
| [] -> ()
59
+
end
60
+
| _ -> ()
56
61
57
-
let end_element state ~name ~namespace _collector =
58
-
(match namespace with
59
-
| Some _ -> ()
60
-
| None ->
61
-
let name_lower = String.lowercase_ascii name in
62
-
match state.context_stack with
63
-
| ctx :: rest when ctx.depth = state.current_depth ->
64
-
let matches =
65
-
(name_lower = "dialog" && ctx.context_type = Dialog) ||
66
-
(ctx.context_type = Popover)
67
-
in
68
-
if matches then state.context_stack <- rest
69
-
| _ -> ());
62
+
let end_element state ~tag _collector =
63
+
(match tag with
64
+
| Tag.Html `Dialog ->
65
+
(match state.context_stack with
66
+
| ctx :: rest when ctx.depth = state.current_depth && ctx.context_type = Dialog ->
67
+
state.context_stack <- rest
68
+
| _ -> ())
69
+
| Tag.Html _ ->
70
+
(match state.context_stack with
71
+
| ctx :: rest when ctx.depth = state.current_depth && ctx.context_type = Popover ->
72
+
state.context_stack <- rest
73
+
| _ -> ())
74
+
| _ -> ());
70
75
71
76
state.current_depth <- state.current_depth - 1
72
77
+6
-5
lib/htmlrw_check/semantic/form_checker.ml
+6
-5
lib/htmlrw_check/semantic/form_checker.ml
···
31
31
(`Attr (`Bad_value (`Elem element_name, `Attr "autocomplete", `Value value, `Reason reason)))
32
32
end
33
33
34
-
let start_element _state ~name ~namespace:_ ~attrs collector =
34
+
let start_element _state ~element collector =
35
35
(* Check autocomplete attribute on form elements *)
36
-
match name with
37
-
| "input" | "select" | "textarea" ->
38
-
(match Attr_utils.get_attr "autocomplete" attrs with
36
+
match element.Element.tag with
37
+
| Tag.Html (`Input | `Select | `Textarea as tag) ->
38
+
let name = Tag.html_tag_to_string tag in
39
+
(match Attr_utils.get_attr "autocomplete" element.raw_attrs with
39
40
| Some autocomplete_value ->
40
41
check_autocomplete_value autocomplete_value name collector
41
42
| None -> ())
42
43
| _ -> ()
43
44
44
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
45
+
let end_element _state ~tag:_ _collector = ()
45
46
46
47
let characters _state _text _collector = ()
47
48
+12
-12
lib/htmlrw_check/semantic/id_checker.ml
+12
-12
lib/htmlrw_check/semantic/id_checker.ml
···
176
176
| _ -> ()
177
177
) attrs
178
178
179
-
let start_element state ~name ~namespace:_ ~attrs collector =
180
-
(* For now, we don't have location information from the DOM walker,
181
-
so we pass None. In a full implementation, this would be passed
182
-
from the parser. *)
179
+
let start_element state ~element collector =
180
+
let name = Tag.tag_to_string element.Element.tag in
181
+
let attrs = element.raw_attrs in
183
182
let location = None in
184
183
process_attrs state ~element:name ~attrs ~location collector;
185
184
186
185
(* Special check: map element must have matching id and name if both present *)
187
-
if name = "map" then begin
188
-
let id_opt = List.find_map (fun (n, v) -> if n = "id" then Some v else None) attrs in
189
-
let name_opt = List.find_map (fun (n, v) -> if n = "name" then Some v else None) attrs in
190
-
match id_opt, name_opt with
191
-
| Some id_val, Some name_val when id_val <> name_val ->
186
+
(match element.tag with
187
+
| Tag.Html `Map ->
188
+
let id_opt = Attr.get_id element.attrs in
189
+
let name_opt = Attr.get_name element.attrs in
190
+
(match id_opt, name_opt with
191
+
| Some id_val, Some name_val when id_val <> name_val ->
192
192
Message_collector.add_typed collector (`Misc `Map_id_name_mismatch)
193
-
| _ -> ()
194
-
end
193
+
| _ -> ())
194
+
| _ -> ())
195
195
196
-
let end_element _state ~name:_ ~namespace:_ _collector =
196
+
let end_element _state ~tag:_ _collector =
197
197
()
198
198
199
199
let characters _state _text _collector =
+20
-17
lib/htmlrw_check/semantic/lang_detecting_checker.ml
+20
-17
lib/htmlrw_check/semantic/lang_detecting_checker.ml
···
216
216
(* If > 2% are Traditional-only characters, it's Traditional Chinese *)
217
217
!total > 100 && (float_of_int !count /. float_of_int !total) > 0.02
218
218
219
-
let start_element state ~name ~namespace ~attrs _collector =
220
-
let name_lower = String.lowercase_ascii name in
221
-
let ns = Option.value namespace ~default:"" in
222
-
223
-
if name_lower = "html" then begin
219
+
let start_element state ~element _collector =
220
+
let attrs = element.Element.raw_attrs in
221
+
match element.tag with
222
+
| Tag.Html `Html ->
224
223
state.html_lang <- Attr_utils.get_attr "lang" attrs;
225
224
state.html_dir <- Attr_utils.get_attr "dir" attrs;
226
225
(* TODO: get line/column from locator *)
227
226
state.html_locator <- Some (1, 1)
228
-
end
229
-
else if name_lower = "body" then
227
+
| Tag.Html `Body ->
230
228
state.in_body <- true
231
-
else if state.in_body then begin
232
-
(* Track foreign namespace depth (SVG/MathML) *)
233
-
if is_foreign_namespace ns || is_foreign_element name then
229
+
| Tag.Svg _ | Tag.MathML _ ->
230
+
if state.in_body then
234
231
state.foreign_depth <- state.foreign_depth + 1
235
-
else if state.foreign_depth > 0 then
232
+
| Tag.Html tag when state.in_body ->
233
+
let name_lower = Tag.html_tag_to_string tag in
234
+
if state.foreign_depth > 0 then
236
235
state.foreign_depth <- state.foreign_depth + 1
237
236
(* Check if we should skip this element's text *)
238
237
else if List.mem name_lower skip_elements then
···
244
243
state.skip_depth <- state.skip_depth + 1
245
244
| _ -> ()
246
245
end
247
-
end
246
+
| _ -> ()
248
247
249
-
let end_element state ~name ~namespace:_ _collector =
250
-
let name_lower = String.lowercase_ascii name in
251
-
if name_lower = "body" then
248
+
let end_element state ~tag _collector =
249
+
match tag with
250
+
| Tag.Html `Body ->
252
251
state.in_body <- false
253
-
else if state.in_body then begin
252
+
| Tag.Svg _ | Tag.MathML _ when state.in_body ->
253
+
if state.foreign_depth > 0 then
254
+
state.foreign_depth <- state.foreign_depth - 1
255
+
| Tag.Html tag when state.in_body ->
256
+
let name_lower = Tag.html_tag_to_string tag in
254
257
(* Track foreign namespace depth *)
255
258
if state.foreign_depth > 0 then
256
259
state.foreign_depth <- state.foreign_depth - 1
···
261
264
(* TODO: properly track nested elements with different lang *)
262
265
state.skip_depth <- max 0 (state.skip_depth - 1)
263
266
end
264
-
end
267
+
| _ -> ()
265
268
266
269
let characters state text _collector =
267
270
if state.in_body && state.skip_depth = 0 && state.foreign_depth = 0 && state.char_count < max_chars then begin
+10
-8
lib/htmlrw_check/semantic/nesting_checker.ml
+10
-8
lib/htmlrw_check/semantic/nesting_checker.ml
···
300
300
end
301
301
| _ -> ()
302
302
303
-
let start_element state ~name ~namespace ~attrs collector =
303
+
let start_element state ~element collector =
304
304
(* Only check HTML elements, not SVG or MathML *)
305
-
match namespace with
306
-
| Some _ -> ()
307
-
| None ->
305
+
match element.Element.tag with
306
+
| Tag.Html _ ->
307
+
let name = Tag.tag_to_string element.tag in
308
+
let attrs = element.raw_attrs in
308
309
(* Check for nesting violations *)
309
310
check_nesting state name attrs collector;
310
311
check_required_ancestors state name collector;
···
334
335
let node = { ancestor_mask = state.ancestor_mask; name; is_transparent } in
335
336
state.stack <- node :: state.stack;
336
337
state.ancestor_mask <- new_mask
338
+
| _ -> () (* SVG, MathML, Custom, Unknown *)
337
339
338
-
let end_element state ~name:_ ~namespace _collector =
340
+
let end_element state ~tag _collector =
339
341
(* Only track HTML elements *)
340
-
match namespace with
341
-
| Some _ -> ()
342
-
| None ->
342
+
match tag with
343
+
| Tag.Html _ ->
343
344
(* Pop from stack and restore ancestor mask *)
344
345
begin match state.stack with
345
346
| [] -> () (* Should not happen in well-formed documents *)
···
347
348
state.stack <- rest;
348
349
state.ancestor_mask <- node.ancestor_mask
349
350
end
351
+
| _ -> ()
350
352
351
353
let characters _state _text _collector =
352
354
() (* No text-specific nesting checks *)
+11
-13
lib/htmlrw_check/semantic/obsolete_checker.ml
+11
-13
lib/htmlrw_check/semantic/obsolete_checker.ml
···
250
250
251
251
let reset state = state.in_head <- false
252
252
253
-
let start_element state ~name ~namespace ~attrs collector =
254
-
(* Only check HTML elements (no namespace or explicit HTML namespace) *)
255
-
let is_html = match namespace with
256
-
| None -> true
257
-
| Some ns -> String.equal (String.lowercase_ascii ns) "html"
258
-
in
259
-
260
-
if not is_html then ()
261
-
else begin
253
+
let start_element state ~element collector =
254
+
(* Only check HTML elements *)
255
+
match element.Element.tag with
256
+
| Tag.Html _ ->
257
+
let name = Tag.tag_to_string element.tag in
262
258
let name_lower = String.lowercase_ascii name in
259
+
let attrs = element.raw_attrs in
263
260
264
261
(* Track head context *)
265
262
if name_lower = "head" then state.in_head <- true;
···
309
306
(`Element (`Obsolete_global_attr (`Attr attr_name, `Suggestion suggestion))))
310
307
end
311
308
) attrs
312
-
end
309
+
| _ -> () (* Non-HTML elements don't have obsolete checks *)
313
310
314
-
let end_element state ~name ~namespace:_ _collector =
315
-
let name_lower = String.lowercase_ascii name in
316
-
if name_lower = "head" then state.in_head <- false
311
+
let end_element state ~tag _collector =
312
+
match tag with
313
+
| Tag.Html `Head -> state.in_head <- false
314
+
| _ -> ()
317
315
318
316
let characters _state _text _collector = ()
319
317
+30
-41
lib/htmlrw_check/semantic/option_checker.ml
+30
-41
lib/htmlrw_check/semantic/option_checker.ml
···
22
22
state.option_stack <- [];
23
23
state.in_template <- 0
24
24
25
-
let start_element state ~name ~namespace ~attrs collector =
26
-
let name_lower = String.lowercase_ascii name in
25
+
let start_element state ~element collector =
26
+
match element.Element.tag with
27
+
| Tag.Html `Template ->
28
+
state.in_template <- state.in_template + 1
29
+
| Tag.Html `Option when state.in_template = 0 ->
30
+
let label_opt = Attr_utils.get_attr "label" element.raw_attrs in
31
+
let has_label = label_opt <> None in
32
+
let label_empty = match label_opt with
33
+
| Some v -> String.trim v = ""
34
+
| None -> false
35
+
in
36
+
(* Report error for empty label attribute value *)
37
+
if label_empty then
38
+
Message_collector.add_typed collector
39
+
(`Attr (`Bad_value (`Elem "option", `Attr "label", `Value "", `Reason "Bad non-empty string: Must not be empty.")));
40
+
let ctx = { has_text = false; has_label; label_empty } in
41
+
state.option_stack <- ctx :: state.option_stack
42
+
| _ -> ()
27
43
28
-
if namespace <> None then ()
29
-
else begin
30
-
if name_lower = "template" then
31
-
state.in_template <- state.in_template + 1
32
-
else if state.in_template = 0 && name_lower = "option" then begin
33
-
let label_opt = Attr_utils.get_attr "label" attrs in
34
-
let has_label = label_opt <> None in
35
-
let label_empty = match label_opt with
36
-
| Some v -> String.trim v = ""
37
-
| None -> false
38
-
in
39
-
(* Report error for empty label attribute value *)
40
-
if label_empty then
41
-
Message_collector.add_typed collector
42
-
(`Attr (`Bad_value (`Elem "option", `Attr "label", `Value "", `Reason "Bad non-empty string: Must not be empty.")));
43
-
let ctx = { has_text = false; has_label; label_empty } in
44
-
state.option_stack <- ctx :: state.option_stack
45
-
end
46
-
end
47
-
48
-
let end_element state ~name ~namespace collector =
49
-
let name_lower = String.lowercase_ascii name in
50
-
51
-
if namespace <> None then ()
52
-
else begin
53
-
if name_lower = "template" then
54
-
state.in_template <- max 0 (state.in_template - 1)
55
-
else if state.in_template = 0 && name_lower = "option" then begin
56
-
match state.option_stack with
57
-
| ctx :: rest ->
58
-
state.option_stack <- rest;
59
-
(* Validate: option must have text content or non-empty label *)
60
-
(* Note: empty label error is already reported at start_element,
61
-
so only report empty option without label when there's no label attribute at all *)
62
-
if not ctx.has_text && not ctx.has_label then
63
-
Message_collector.add_typed collector (`Misc `Option_empty_without_label)
64
-
| [] -> ()
65
-
end
66
-
end
44
+
let end_element state ~tag collector =
45
+
match tag with
46
+
| Tag.Html `Template ->
47
+
state.in_template <- max 0 (state.in_template - 1)
48
+
| Tag.Html `Option when state.in_template = 0 ->
49
+
(match state.option_stack with
50
+
| ctx :: rest ->
51
+
state.option_stack <- rest;
52
+
if not ctx.has_text && not ctx.has_label then
53
+
Message_collector.add_typed collector (`Misc `Option_empty_without_label)
54
+
| [] -> ())
55
+
| _ -> ()
67
56
68
57
let characters state text _collector =
69
58
if state.in_template = 0 then begin
+23
-20
lib/htmlrw_check/semantic/required_attr_checker.ml
+23
-20
lib/htmlrw_check/semantic/required_attr_checker.ml
···
177
177
(q "value") (q "max")))
178
178
with _ -> ())
179
179
180
-
let start_element state ~name ~namespace:_ ~attrs collector =
181
-
match name with
182
-
| "img" -> check_img_element state attrs collector
183
-
| "area" -> check_area_element attrs collector
184
-
| "input" -> check_input_element attrs collector
185
-
| "script" -> check_script_element attrs collector
186
-
| "meta" -> check_meta_element attrs collector
187
-
| "link" -> check_link_element attrs collector
188
-
| "a" ->
180
+
let start_element state ~element collector =
181
+
let attrs = element.Element.raw_attrs in
182
+
match element.tag with
183
+
| Tag.Html `Img -> check_img_element state attrs collector
184
+
| Tag.Html `Area -> check_area_element attrs collector
185
+
| Tag.Html `Input -> check_input_element attrs collector
186
+
| Tag.Html `Script -> check_script_element attrs collector
187
+
| Tag.Html `Meta -> check_meta_element attrs collector
188
+
| Tag.Html `Link -> check_link_element attrs collector
189
+
| Tag.Html `A ->
189
190
check_a_element attrs collector;
190
191
if Attr_utils.has_attr "href" attrs then state.in_a_with_href <- true
191
-
| "map" -> check_map_element attrs collector
192
-
| "object" -> check_object_element attrs collector
193
-
| "meter" -> check_meter_element attrs collector
194
-
| "progress" -> check_progress_element attrs collector
195
-
| "figure" -> state._in_figure <- true
196
-
| _ ->
197
-
(* Check popover attribute on any element *)
192
+
| Tag.Html `Map -> check_map_element attrs collector
193
+
| Tag.Html `Object -> check_object_element attrs collector
194
+
| Tag.Html `Meter -> check_meter_element attrs collector
195
+
| Tag.Html `Progress -> check_progress_element attrs collector
196
+
| Tag.Html `Figure -> state._in_figure <- true
197
+
| Tag.Html _ ->
198
+
(* Check popover attribute on any HTML element *)
199
+
let name = Tag.tag_to_string element.tag in
198
200
if Attr_utils.has_attr "popover" attrs then check_popover_element name attrs collector
201
+
| _ -> () (* Non-HTML elements *)
199
202
200
-
let end_element state ~name ~namespace:_ _collector =
201
-
match name with
202
-
| "figure" -> state._in_figure <- false
203
-
| "a" -> state.in_a_with_href <- false
203
+
let end_element state ~tag _collector =
204
+
match tag with
205
+
| Tag.Html `Figure -> state._in_figure <- false
206
+
| Tag.Html `A -> state.in_a_with_href <- false
204
207
| _ -> ()
205
208
206
209
let characters _state _text _collector = ()
+13
-11
lib/htmlrw_check/specialized/aria_checker.ml
+13
-11
lib/htmlrw_check/specialized/aria_checker.ml
···
427
427
let quoted = List.map (fun r -> "\"" ^ r ^ "\"") roles in
428
428
String.concat " or " quoted
429
429
430
-
let start_element state ~name ~namespace ~attrs collector =
430
+
let start_element state ~element collector =
431
431
(* Only process HTML elements *)
432
-
match namespace with
433
-
| Some _ -> () (* Skip non-HTML elements *)
434
-
| None ->
432
+
match element.Element.tag with
433
+
| Tag.Html _ ->
434
+
let name = Tag.tag_to_string element.tag in
435
435
let name_lower = String.lowercase_ascii name in
436
+
let attrs = element.raw_attrs in
436
437
let role_attr = List.assoc_opt "role" attrs in
437
438
let aria_label = List.assoc_opt "aria-label" attrs in
438
439
let aria_labelledby = List.assoc_opt "aria-labelledby" attrs in
···
723
724
implicit_role;
724
725
} in
725
726
state.stack <- node :: state.stack
727
+
| _ -> () (* Skip non-HTML elements *)
726
728
727
-
let end_element state ~name:_ ~namespace _collector =
729
+
let end_element state ~tag _collector =
728
730
(* Only process HTML elements *)
729
-
match namespace with
730
-
| Some _ -> () (* Skip non-HTML elements *)
731
-
| None ->
731
+
match tag with
732
+
| Tag.Html _ ->
732
733
(* Pop from stack *)
733
-
match state.stack with
734
-
| _ :: rest -> state.stack <- rest
735
-
| [] -> () (* Stack underflow - shouldn't happen in well-formed docs *)
734
+
(match state.stack with
735
+
| _ :: rest -> state.stack <- rest
736
+
| [] -> ()) (* Stack underflow - shouldn't happen in well-formed docs *)
737
+
| _ -> ()
736
738
737
739
let characters _state _text _collector = ()
738
740
+255
-299
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
+255
-299
lib/htmlrw_check/specialized/attr_restrictions_checker.ml
···
52
52
Message_collector.add_typed collector
53
53
(`Attr (`Not_allowed (`Attr attr, `Elem element)))
54
54
55
-
let start_element state ~name ~namespace ~attrs collector =
56
-
let name_lower = String.lowercase_ascii name in
55
+
let start_element state ~element collector =
56
+
match element.Element.tag with
57
+
| Tag.Html _ ->
58
+
let name = Tag.tag_to_string element.tag in
59
+
let name_lower = String.lowercase_ascii name in
60
+
let attrs = element.raw_attrs in
57
61
58
-
(* Detect XHTML mode from xmlns attribute on html element *)
59
-
if name_lower = "html" then begin
60
-
match Attr_utils.get_attr "xmlns" attrs with
61
-
| Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true
62
-
| _ -> ()
63
-
end;
62
+
(* Detect XHTML mode from xmlns attribute on html element *)
63
+
if name_lower = "html" then begin
64
+
match Attr_utils.get_attr "xmlns" attrs with
65
+
| Some "http://www.w3.org/1999/xhtml" -> state.is_xhtml <- true
66
+
| _ -> ()
67
+
end;
64
68
65
-
(* Check HTML element attribute restrictions *)
66
-
(match namespace with
67
-
| Some _ -> ()
68
-
| None ->
69
-
match List.assoc_opt name_lower disallowed_attrs_html with
69
+
(* Check HTML element attribute restrictions *)
70
+
(match List.assoc_opt name_lower disallowed_attrs_html with
70
71
| Some disallowed ->
71
72
List.iter (fun attr ->
72
73
if Attr_utils.has_attr attr attrs then
···
74
75
) disallowed
75
76
| None -> ());
76
77
77
-
(* Check for xml:base attribute - not allowed in HTML *)
78
-
(match namespace with
79
-
| Some _ -> ()
80
-
| None when name_lower = "html" ->
81
-
if Attr_utils.has_attr "xml:base" attrs then
82
-
report_disallowed_attr name_lower "xml:base" collector
83
-
| None -> ());
78
+
(* Check for xml:base attribute - not allowed in HTML *)
79
+
if name_lower = "html" then begin
80
+
if Attr_utils.has_attr "xml:base" attrs then
81
+
report_disallowed_attr name_lower "xml:base" collector
82
+
end;
84
83
85
-
(* Check for xmlns:* prefixed attributes - not allowed in HTML *)
86
-
(* Standard xmlns declarations are allowed but custom prefixes are not *)
87
-
(match namespace with
88
-
| Some _ -> ()
89
-
| None ->
84
+
(* Check for xmlns:* prefixed attributes - not allowed in HTML *)
85
+
(* Standard xmlns declarations are allowed but custom prefixes are not *)
90
86
List.iter (fun (attr_name, _) ->
91
87
let attr_lower = String.lowercase_ascii attr_name in
92
88
if String.starts_with ~prefix:"xmlns:" attr_lower then begin
···
96
92
Message_collector.add_typed collector
97
93
(`Attr (`Not_allowed_here (`Attr attr_name)))
98
94
end
99
-
) attrs);
95
+
) attrs;
96
+
97
+
(* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *)
98
+
(* xml:id is never valid on SVG elements in HTML5 *)
99
+
if List.mem name_lower svg_no_xml_id then begin
100
+
if Attr_utils.has_attr "xml:id" attrs then
101
+
report_disallowed_attr name_lower "xml:id" collector
102
+
end;
100
103
101
-
(* Check SVG element restrictions - works in both HTML-embedded and XHTML SVG *)
102
-
(* xml:id is never valid on SVG elements in HTML5 *)
103
-
if List.mem name_lower svg_no_xml_id then begin
104
-
if Attr_utils.has_attr "xml:id" attrs then
105
-
report_disallowed_attr name_lower "xml:id" collector
106
-
end;
104
+
(* SVG feConvolveMatrix requires order attribute *)
105
+
if name_lower = "feconvolvematrix" then begin
106
+
if not (Attr_utils.has_attr "order" attrs) then
107
+
Message_collector.add_typed collector
108
+
(`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order")))
109
+
end;
107
110
108
-
(* SVG feConvolveMatrix requires order attribute *)
109
-
if name_lower = "feconvolvematrix" then begin
110
-
if not (Attr_utils.has_attr "order" attrs) then
111
-
Message_collector.add_typed collector
112
-
(`Svg (`Missing_attr (`Elem "feConvolveMatrix", `Attr "order")))
113
-
end;
111
+
(* Validate style type attribute - must be "text/css" or omitted *)
112
+
if name_lower = "style" then begin
113
+
List.iter (fun (attr_name, attr_value) ->
114
+
let attr_lower = String.lowercase_ascii attr_name in
115
+
if attr_lower = "type" then begin
116
+
let value_lower = String.lowercase_ascii (String.trim attr_value) in
117
+
if value_lower <> "text/css" then
118
+
Message_collector.add_typed collector (`Misc `Style_type_invalid)
119
+
end
120
+
) attrs
121
+
end;
114
122
115
-
(* Validate style type attribute - must be "text/css" or omitted *)
116
-
(match namespace with
117
-
| Some _ -> ()
118
-
| None when name_lower = "style" ->
119
-
List.iter (fun (attr_name, attr_value) ->
120
-
let attr_lower = String.lowercase_ascii attr_name in
121
-
if attr_lower = "type" then begin
122
-
let value_lower = String.lowercase_ascii (String.trim attr_value) in
123
-
if value_lower <> "text/css" then
124
-
Message_collector.add_typed collector (`Misc `Style_type_invalid)
125
-
end
126
-
) attrs
127
-
| None -> ());
123
+
(* Validate object element requires data or type attribute *)
124
+
if name_lower = "object" then begin
125
+
let has_data = Attr_utils.has_attr "data" attrs in
126
+
let has_type = Attr_utils.has_attr "type" attrs in
127
+
if not has_data && not has_type then
128
+
Message_collector.add_typed collector
129
+
(`Attr (`Missing (`Elem "object", `Attr "data")))
130
+
end;
128
131
129
-
(* Validate object element requires data or type attribute *)
130
-
(match namespace with
131
-
| Some _ -> ()
132
-
| None when name_lower = "object" ->
133
-
let has_data = Attr_utils.has_attr "data" attrs in
134
-
let has_type = Attr_utils.has_attr "type" attrs in
135
-
if not has_data && not has_type then
136
-
Message_collector.add_typed collector
137
-
(`Attr (`Missing (`Elem "object", `Attr "data")))
138
-
| None -> ());
132
+
(* Validate link imagesizes/imagesrcset attributes *)
133
+
if name_lower = "link" then begin
134
+
let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
135
+
let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
136
+
let rel_value = Attr_utils.get_attr "rel" attrs in
137
+
let as_value = Attr_utils.get_attr "as" attrs in
139
138
140
-
(* Validate link imagesizes/imagesrcset attributes *)
141
-
(match namespace with
142
-
| Some _ -> ()
143
-
| None when name_lower = "link" ->
144
-
let has_imagesizes = Attr_utils.has_attr "imagesizes" attrs in
145
-
let has_imagesrcset = Attr_utils.has_attr "imagesrcset" attrs in
146
-
let rel_value = Attr_utils.get_attr "rel" attrs in
147
-
let as_value = Attr_utils.get_attr "as" attrs in
139
+
(* imagesizes requires imagesrcset *)
140
+
if has_imagesizes && not has_imagesrcset then
141
+
Message_collector.add_typed collector (`Srcset `Imagesizes_without_imagesrcset);
148
142
149
-
(* imagesizes requires imagesrcset *)
150
-
if has_imagesizes && not has_imagesrcset then
151
-
Message_collector.add_typed collector (`Srcset `Imagesizes_without_imagesrcset);
143
+
(* imagesrcset requires as="image" *)
144
+
if has_imagesrcset then begin
145
+
let as_is_image = match as_value with
146
+
| Some v -> String.lowercase_ascii (String.trim v) = "image"
147
+
| None -> false
148
+
in
149
+
if not as_is_image then
150
+
Message_collector.add_typed collector (`Link `Imagesrcset_requires_as_image)
151
+
end;
152
152
153
-
(* imagesrcset requires as="image" *)
154
-
if has_imagesrcset then begin
155
-
let as_is_image = match as_value with
156
-
| Some v -> String.lowercase_ascii (String.trim v) = "image"
157
-
| None -> false
158
-
in
159
-
if not as_is_image then
160
-
Message_collector.add_typed collector (`Link `Imagesrcset_requires_as_image)
153
+
(* as attribute requires rel="preload" or rel="modulepreload" *)
154
+
(match as_value with
155
+
| Some _ ->
156
+
let rel_is_preload = match rel_value with
157
+
| Some v ->
158
+
let rel_lower = String.lowercase_ascii (String.trim v) in
159
+
String.length rel_lower > 0 &&
160
+
(List.mem "preload" (String.split_on_char ' ' rel_lower) ||
161
+
List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
162
+
| None -> false
163
+
in
164
+
if not rel_is_preload then
165
+
Message_collector.add_typed collector (`Link `As_requires_preload)
166
+
| None -> ())
161
167
end;
162
168
163
-
(* as attribute requires rel="preload" or rel="modulepreload" *)
164
-
(match as_value with
165
-
| Some _ ->
166
-
let rel_is_preload = match rel_value with
167
-
| Some v ->
168
-
let rel_lower = String.lowercase_ascii (String.trim v) in
169
-
String.length rel_lower > 0 &&
170
-
(List.mem "preload" (String.split_on_char ' ' rel_lower) ||
171
-
List.mem "modulepreload" (String.split_on_char ' ' rel_lower))
172
-
| None -> false
173
-
in
174
-
if not rel_is_preload then
175
-
Message_collector.add_typed collector (`Link `As_requires_preload)
176
-
| None -> ())
177
-
| None -> ());
178
-
179
-
(* Validate img usemap attribute - must be hash-name reference with content *)
180
-
(match namespace with
181
-
| Some _ -> ()
182
-
| None when name_lower = "img" ->
183
-
List.iter (fun (attr_name, attr_value) ->
184
-
let attr_lower = String.lowercase_ascii attr_name in
185
-
if attr_lower = "usemap" then begin
186
-
if attr_value = "#" then
187
-
Message_collector.add_typed collector
188
-
(`Attr (`Bad_value_generic (`Message (Printf.sprintf
189
-
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d."
190
-
attr_value attr_name name))))
191
-
end
192
-
) attrs
193
-
| None -> ());
169
+
(* Validate img usemap attribute - must be hash-name reference with content *)
170
+
if name_lower = "img" then begin
171
+
List.iter (fun (attr_name, attr_value) ->
172
+
let attr_lower = String.lowercase_ascii attr_name in
173
+
if attr_lower = "usemap" then begin
174
+
if attr_value = "#" then
175
+
Message_collector.add_typed collector
176
+
(`Attr (`Bad_value_generic (`Message (Printf.sprintf
177
+
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad hash-name reference: A hash-name reference must have at least one character after \xe2\x80\x9c#\xe2\x80\x9d."
178
+
attr_value attr_name name))))
179
+
end
180
+
) attrs
181
+
end;
194
182
195
-
(* Validate embed type attribute - must be valid MIME type *)
196
-
(match namespace with
197
-
| Some _ -> ()
198
-
| None when name_lower = "embed" ->
199
-
List.iter (fun (attr_name, attr_value) ->
200
-
let attr_lower = String.lowercase_ascii attr_name in
201
-
if attr_lower = "type" then begin
202
-
match Dt_mime.validate_mime_type attr_value with
203
-
| Ok () -> ()
204
-
| Error msg ->
205
-
Message_collector.add_typed collector
206
-
(`Attr (`Bad_value_generic (`Message (Printf.sprintf
207
-
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s"
208
-
attr_value attr_name name msg))))
209
-
end
210
-
) attrs
211
-
| None -> ());
183
+
(* Validate embed type attribute - must be valid MIME type *)
184
+
if name_lower = "embed" then begin
185
+
List.iter (fun (attr_name, attr_value) ->
186
+
let attr_lower = String.lowercase_ascii attr_name in
187
+
if attr_lower = "type" then begin
188
+
match Dt_mime.validate_mime_type attr_value with
189
+
| Ok () -> ()
190
+
| Error msg ->
191
+
Message_collector.add_typed collector
192
+
(`Attr (`Bad_value_generic (`Message (Printf.sprintf
193
+
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad MIME type: %s"
194
+
attr_value attr_name name msg))))
195
+
end
196
+
) attrs
197
+
end;
212
198
213
-
(* Validate width/height on embed and img - must be non-negative integers *)
214
-
let is_dimension_element = name_lower = "embed" || name_lower = "img" ||
215
-
name_lower = "video" || name_lower = "canvas" ||
216
-
name_lower = "iframe" || name_lower = "source" in
217
-
(match namespace with
218
-
| Some _ -> ()
219
-
| None when is_dimension_element ->
220
-
List.iter (fun (attr_name, attr_value) ->
221
-
let attr_lower = String.lowercase_ascii attr_name in
222
-
if attr_lower = "width" || attr_lower = "height" then begin
223
-
(* Check for non-negative integer only *)
224
-
let is_valid =
225
-
String.length attr_value > 0 &&
226
-
String.for_all (fun c -> c >= '0' && c <= '9') attr_value
227
-
in
228
-
if not is_valid then begin
229
-
(* Determine specific error message *)
230
-
let error_msg =
231
-
if String.length attr_value = 0 then
232
-
Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
233
-
attr_name name
234
-
else if String.contains attr_value '%' then
235
-
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
236
-
attr_value attr_name name
237
-
else if String.length attr_value > 0 && attr_value.[0] = '-' then
238
-
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead."
239
-
attr_value attr_name name
240
-
else
241
-
(* Find first non-digit character *)
242
-
let bad_char =
243
-
try
244
-
let i = ref 0 in
245
-
while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do
246
-
incr i
247
-
done;
248
-
if !i < String.length attr_value then Some attr_value.[!i] else None
249
-
with _ -> None
250
-
in
251
-
match bad_char with
252
-
| Some c ->
253
-
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
254
-
attr_value attr_name name c
255
-
| None ->
256
-
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit."
199
+
(* Validate width/height on embed and img - must be non-negative integers *)
200
+
let is_dimension_element = name_lower = "embed" || name_lower = "img" ||
201
+
name_lower = "video" || name_lower = "canvas" ||
202
+
name_lower = "iframe" || name_lower = "source" in
203
+
if is_dimension_element then begin
204
+
List.iter (fun (attr_name, attr_value) ->
205
+
let attr_lower = String.lowercase_ascii attr_name in
206
+
if attr_lower = "width" || attr_lower = "height" then begin
207
+
(* Check for non-negative integer only *)
208
+
let is_valid =
209
+
String.length attr_value > 0 &&
210
+
String.for_all (fun c -> c >= '0' && c <= '9') attr_value
211
+
in
212
+
if not is_valid then begin
213
+
(* Determine specific error message *)
214
+
let error_msg =
215
+
if String.length attr_value = 0 then
216
+
Printf.sprintf "Bad value \xe2\x80\x9c\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: The empty string is not a valid non-negative integer."
217
+
attr_name name
218
+
else if String.contains attr_value '%' then
219
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%%\xe2\x80\x9d instead."
257
220
attr_value attr_name name
258
-
in
259
-
Message_collector.add_typed collector
260
-
(`Attr (`Bad_value_generic (`Message error_msg)))
221
+
else if String.length attr_value > 0 && attr_value.[0] = '-' then
222
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c-\xe2\x80\x9d instead."
223
+
attr_value attr_name name
224
+
else
225
+
(* Find first non-digit character *)
226
+
let bad_char =
227
+
try
228
+
let i = ref 0 in
229
+
while !i < String.length attr_value && attr_value.[!i] >= '0' && attr_value.[!i] <= '9' do
230
+
incr i
231
+
done;
232
+
if !i < String.length attr_value then Some attr_value.[!i] else None
233
+
with _ -> None
234
+
in
235
+
match bad_char with
236
+
| Some c ->
237
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit but saw \xe2\x80\x9c%c\xe2\x80\x9d instead."
238
+
attr_value attr_name name c
239
+
| None ->
240
+
Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad non-negative integer: Expected a digit."
241
+
attr_value attr_name name
242
+
in
243
+
Message_collector.add_typed collector
244
+
(`Attr (`Bad_value_generic (`Message error_msg)))
245
+
end
261
246
end
262
-
end
263
-
) attrs
264
-
| None -> ());
247
+
) attrs
248
+
end;
265
249
266
-
(* Validate area[shape=default] cannot have coords *)
267
-
(match namespace with
268
-
| Some _ -> ()
269
-
| None when name_lower = "area" ->
270
-
(match Attr_utils.get_attr "shape" attrs with
271
-
| Some s when String.lowercase_ascii (String.trim s) = "default" ->
272
-
if Attr_utils.has_attr "coords" attrs then
273
-
Message_collector.add_typed collector
274
-
(`Attr (`Not_allowed (`Attr "coords", `Elem "area")))
275
-
| _ -> ())
276
-
| None -> ());
250
+
(* Validate area[shape=default] cannot have coords *)
251
+
if name_lower = "area" then begin
252
+
match Attr_utils.get_attr "shape" attrs with
253
+
| Some s when String.lowercase_ascii (String.trim s) = "default" ->
254
+
if Attr_utils.has_attr "coords" attrs then
255
+
Message_collector.add_typed collector
256
+
(`Attr (`Not_allowed (`Attr "coords", `Elem "area")))
257
+
| _ -> ()
258
+
end;
277
259
278
-
(* Validate bdo element requires dir attribute, and dir cannot be "auto" *)
279
-
(match namespace with
280
-
| Some _ -> ()
281
-
| None when name_lower = "bdo" ->
282
-
(match Attr_utils.get_attr "dir" attrs with
283
-
| None ->
284
-
Message_collector.add_typed collector (`Misc `Bdo_missing_dir)
285
-
| Some v when String.lowercase_ascii (String.trim v) = "auto" ->
286
-
Message_collector.add_typed collector (`Misc `Bdo_dir_auto)
287
-
| _ -> ())
288
-
| None -> ());
260
+
(* Validate bdo element requires dir attribute, and dir cannot be "auto" *)
261
+
if name_lower = "bdo" then begin
262
+
match Attr_utils.get_attr "dir" attrs with
263
+
| None ->
264
+
Message_collector.add_typed collector (`Misc `Bdo_missing_dir)
265
+
| Some v when String.lowercase_ascii (String.trim v) = "auto" ->
266
+
Message_collector.add_typed collector (`Misc `Bdo_dir_auto)
267
+
| _ -> ()
268
+
end;
289
269
290
-
(* Validate input list attribute - only allowed for certain types *)
291
-
(match namespace with
292
-
| Some _ -> ()
293
-
| None when name_lower = "input" ->
294
-
if Attr_utils.has_attr "list" attrs then begin
295
-
let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs
296
-
|> String.trim |> String.lowercase_ascii in
297
-
if not (List.mem input_type input_types_allowing_list) then
298
-
Message_collector.add_typed collector (`Input `List_not_allowed)
299
-
end
300
-
| None -> ());
270
+
(* Validate input list attribute - only allowed for certain types *)
271
+
if name_lower = "input" then begin
272
+
if Attr_utils.has_attr "list" attrs then begin
273
+
let input_type = Attr_utils.get_attr_or "type" ~default:"text" attrs
274
+
|> String.trim |> String.lowercase_ascii in
275
+
if not (List.mem input_type input_types_allowing_list) then
276
+
Message_collector.add_typed collector (`Input `List_not_allowed)
277
+
end
278
+
end;
301
279
302
-
(* Validate data-* attributes *)
303
-
(match namespace with
304
-
| Some _ -> ()
305
-
| None ->
280
+
(* Validate data-* attributes *)
306
281
List.iter (fun (attr_name, _) ->
307
282
let attr_lower = String.lowercase_ascii attr_name in
308
283
(* Check if it starts with "data-" *)
···
316
291
Message_collector.add_typed collector
317
292
(`Attr (`Data_invalid_name (`Reason "must be XML 1.0 4th ed. plus Namespaces NCNames")))
318
293
end
319
-
) attrs);
294
+
) attrs;
320
295
321
-
(* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *)
322
-
(match namespace with
323
-
| Some _ -> ()
324
-
| None when not state.is_xhtml ->
325
-
let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in
326
-
let lang_value = Attr_utils.get_attr "lang" attrs in
327
-
(match xmllang_value with
328
-
| Some xmllang ->
329
-
(match lang_value with
330
-
| None ->
331
-
Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
332
-
| Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang ->
333
-
Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
334
-
| _ -> ())
335
-
| None -> ())
336
-
| None -> ());
296
+
(* Validate xml:lang must have matching lang attribute - only in HTML mode, not XHTML *)
297
+
if not state.is_xhtml then begin
298
+
let xmllang_value = Attr_utils.get_attr "xml:lang" attrs in
299
+
let lang_value = Attr_utils.get_attr "lang" attrs in
300
+
match xmllang_value with
301
+
| Some xmllang ->
302
+
(match lang_value with
303
+
| None ->
304
+
Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
305
+
| Some lang when String.lowercase_ascii lang <> String.lowercase_ascii xmllang ->
306
+
Message_collector.add_typed collector (`I18n `Xml_lang_without_lang)
307
+
| _ -> ())
308
+
| None -> ()
309
+
end;
337
310
338
-
(* Validate spellcheck attribute - must be "true" or "false" or empty *)
339
-
(match namespace with
340
-
| Some _ -> ()
341
-
| None ->
311
+
(* Validate spellcheck attribute - must be "true" or "false" or empty *)
342
312
List.iter (fun (attr_name, attr_value) ->
343
313
let attr_lower = String.lowercase_ascii attr_name in
344
314
if attr_lower = "spellcheck" then begin
···
347
317
Message_collector.add_typed collector
348
318
(`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason "")))
349
319
end
350
-
) attrs);
320
+
) attrs;
351
321
352
-
(* Validate enterkeyhint attribute - must be one of specific values *)
353
-
(match namespace with
354
-
| Some _ -> ()
355
-
| None ->
322
+
(* Validate enterkeyhint attribute - must be one of specific values *)
356
323
let valid_enterkeyhint = ["enter"; "done"; "go"; "next"; "previous"; "search"; "send"] in
357
324
List.iter (fun (attr_name, attr_value) ->
358
325
let attr_lower = String.lowercase_ascii attr_name in
···
362
329
Message_collector.add_typed collector
363
330
(`Attr (`Bad_value (`Elem name, `Attr attr_name, `Value attr_value, `Reason "")))
364
331
end
365
-
) attrs);
332
+
) attrs;
366
333
367
-
(* Validate headingoffset attribute - must be a number between 0 and 8 *)
368
-
(match namespace with
369
-
| Some _ -> ()
370
-
| None ->
334
+
(* Validate headingoffset attribute - must be a number between 0 and 8 *)
371
335
List.iter (fun (attr_name, attr_value) ->
372
336
let attr_lower = String.lowercase_ascii attr_name in
373
337
if attr_lower = "headingoffset" then begin
···
383
347
if not is_valid then
384
348
Message_collector.add_typed collector (`Misc `Headingoffset_invalid)
385
349
end
386
-
) attrs);
350
+
) attrs;
387
351
388
-
(* Validate accesskey attribute - each key label must be a single code point *)
389
-
(match namespace with
390
-
| Some _ -> ()
391
-
| None ->
352
+
(* Validate accesskey attribute - each key label must be a single code point *)
392
353
List.iter (fun (attr_name, attr_value) ->
393
354
let attr_lower = String.lowercase_ascii attr_name in
394
355
if attr_lower = "accesskey" then begin
···
433
394
in
434
395
find_duplicates [] keys
435
396
end
436
-
) attrs);
397
+
) attrs;
437
398
438
-
(* Validate that command and popovertarget cannot have aria-expanded *)
439
-
(match namespace with
440
-
| Some _ -> ()
441
-
| None when name_lower = "button" ->
442
-
let has_command = Attr_utils.has_attr "command" attrs in
443
-
let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in
444
-
let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in
399
+
(* Validate that command and popovertarget cannot have aria-expanded *)
400
+
if name_lower = "button" then begin
401
+
let has_command = Attr_utils.has_attr "command" attrs in
402
+
let has_popovertarget = Attr_utils.has_attr "popovertarget" attrs in
403
+
let has_aria_expanded = Attr_utils.has_attr "aria-expanded" attrs in
445
404
446
-
if has_command && has_aria_expanded then
447
-
Message_collector.add_typed collector
448
-
(`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
449
-
`Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute")));
405
+
if has_command && has_aria_expanded then
406
+
Message_collector.add_typed collector
407
+
(`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
408
+
`Condition "a \xe2\x80\x9ccommand\xe2\x80\x9d attribute")));
450
409
451
-
if has_popovertarget && has_aria_expanded then
452
-
Message_collector.add_typed collector
453
-
(`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
454
-
`Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute")))
455
-
| None -> ());
410
+
if has_popovertarget && has_aria_expanded then
411
+
Message_collector.add_typed collector
412
+
(`Attr (`Not_allowed_when (`Attr "aria-expanded", `Elem name,
413
+
`Condition "a \xe2\x80\x9cpopovertarget\xe2\x80\x9d attribute")))
414
+
end;
456
415
457
-
(* Note: data-* uppercase check requires XML parsing which preserves case.
458
-
The HTML5 parser normalizes attribute names to lowercase, so this check
459
-
is only effective when the document is parsed as XML.
460
-
Commenting out until we have XML parsing support. *)
461
-
ignore state.is_xhtml;
416
+
(* Note: data-* uppercase check requires XML parsing which preserves case.
417
+
The HTML5 parser normalizes attribute names to lowercase, so this check
418
+
is only effective when the document is parsed as XML.
419
+
Commenting out until we have XML parsing support. *)
420
+
ignore state.is_xhtml;
462
421
463
-
(* Validate media attribute on link, style, source elements *)
464
-
let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in
465
-
(match namespace with
466
-
| Some _ -> ()
467
-
| None when is_media_element ->
468
-
List.iter (fun (attr_name, attr_value) ->
469
-
let attr_lower = String.lowercase_ascii attr_name in
470
-
if attr_lower = "media" then begin
471
-
let trimmed = String.trim attr_value in
472
-
if trimmed <> "" then begin
473
-
match Dt_media_query.validate_media_query_strict trimmed with
474
-
| Ok () -> ()
475
-
| Error msg ->
476
-
Message_collector.add_typed collector
477
-
(`Attr (`Bad_value_generic (`Message (Printf.sprintf
478
-
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s"
479
-
attr_value attr_name name msg))))
422
+
(* Validate media attribute on link, style, source elements *)
423
+
let is_media_element = name_lower = "link" || name_lower = "style" || name_lower = "source" in
424
+
if is_media_element then begin
425
+
List.iter (fun (attr_name, attr_value) ->
426
+
let attr_lower = String.lowercase_ascii attr_name in
427
+
if attr_lower = "media" then begin
428
+
let trimmed = String.trim attr_value in
429
+
if trimmed <> "" then begin
430
+
match Dt_media_query.validate_media_query_strict trimmed with
431
+
| Ok () -> ()
432
+
| Error msg ->
433
+
Message_collector.add_typed collector
434
+
(`Attr (`Bad_value_generic (`Message (Printf.sprintf
435
+
"Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9c%s\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Bad media query: %s"
436
+
attr_value attr_name name msg))))
437
+
end
480
438
end
481
-
end
482
-
) attrs
483
-
| None -> ());
439
+
) attrs
440
+
end;
484
441
485
-
(* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *)
486
-
(match namespace with
487
-
| Some _ -> ()
488
-
| None ->
442
+
(* Validate RDFa prefix attribute - space-separated list of prefix:iri pairs *)
489
443
List.iter (fun (attr_name, attr_value) ->
490
444
let attr_lower = String.lowercase_ascii attr_name in
491
445
if attr_lower = "prefix" then begin
···
507
461
end
508
462
end
509
463
end
510
-
) attrs)
464
+
) attrs
511
465
512
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
466
+
| _ -> () (* Skip non-HTML elements *)
467
+
468
+
let end_element _state ~tag:_ _collector = ()
513
469
let characters _state _text _collector = ()
514
470
let end_document _state _collector = ()
515
471
+14
-15
lib/htmlrw_check/specialized/base_checker.ml
+14
-15
lib/htmlrw_check/specialized/base_checker.ml
···
11
11
let reset state =
12
12
state.seen_link_or_script <- false
13
13
14
-
let start_element state ~name ~namespace ~attrs collector =
15
-
match namespace with
16
-
| Some _ -> ()
17
-
| None ->
18
-
match String.lowercase_ascii name with
19
-
| "link" | "script" ->
20
-
state.seen_link_or_script <- true
21
-
| "base" ->
22
-
if state.seen_link_or_script then
23
-
Message_collector.add_typed collector (`Misc `Base_after_link_script);
24
-
(* base element must have href or target attribute *)
25
-
if not (Attr_utils.has_attr "href" attrs || Attr_utils.has_attr "target" attrs) then
26
-
Message_collector.add_typed collector (`Misc `Base_missing_href_or_target)
27
-
| _ -> ()
14
+
let start_element state ~element collector =
15
+
match element.Element.tag with
16
+
| Tag.Html (`Link | `Script) ->
17
+
state.seen_link_or_script <- true
18
+
| Tag.Html `Base ->
19
+
if state.seen_link_or_script then
20
+
Message_collector.add_typed collector (`Misc `Base_after_link_script);
21
+
(* base element must have href or target attribute *)
22
+
let has_href = Attr.get_href element.attrs |> Option.is_some in
23
+
let has_target = Attr.exists (function `Target _ -> true | _ -> false) element.attrs in
24
+
if not (has_href || has_target) then
25
+
Message_collector.add_typed collector (`Misc `Base_missing_href_or_target)
26
+
| _ -> ()
28
27
29
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
28
+
let end_element _state ~tag:_ _collector = ()
30
29
let characters _state _text _collector = ()
31
30
let end_document _state _collector = ()
32
31
+8
-8
lib/htmlrw_check/specialized/datetime_checker.ml
+8
-8
lib/htmlrw_check/specialized/datetime_checker.ml
···
445
445
let create () = ()
446
446
let reset _state = ()
447
447
448
-
let start_element _state ~name ~namespace ~attrs collector =
449
-
if namespace <> None then ()
450
-
else begin
451
-
let name_lower = String.lowercase_ascii name in
452
-
if List.mem name_lower datetime_elements then begin
448
+
let start_element _state ~element collector =
449
+
match element.Element.tag with
450
+
| Tag.Html tag ->
451
+
let name = Tag.html_tag_to_string tag in
452
+
if List.mem name datetime_elements then begin
453
453
(* Check for datetime attribute *)
454
454
let datetime_attr = List.find_map (fun (k, v) ->
455
455
if String.lowercase_ascii k = "datetime" then Some v else None
456
-
) attrs in
456
+
) element.raw_attrs in
457
457
match datetime_attr with
458
458
| None -> ()
459
459
| Some value ->
···
468
468
Message_collector.add_typed collector
469
469
(`Generic warn_msg)
470
470
end
471
-
end
471
+
| _ -> () (* Non-HTML elements don't have datetime attributes *)
472
472
473
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
473
+
let end_element _state ~tag:_ _collector = ()
474
474
let characters _state _text _collector = ()
475
475
let end_document _state _collector = ()
476
476
+152
-176
lib/htmlrw_check/specialized/dl_checker.ml
+152
-176
lib/htmlrw_check/specialized/dl_checker.ml
···
57
57
| ctx :: _ -> Some ctx
58
58
| [] -> None
59
59
60
-
let start_element state ~name ~namespace ~attrs collector =
61
-
let name_lower = String.lowercase_ascii name in
60
+
let start_element state ~element collector =
61
+
let name_lower = Tag.tag_to_string element.Element.tag in
62
62
63
63
(* Track parent stack for all HTML elements first *)
64
-
if namespace = None then
65
-
state.parent_stack <- name_lower :: state.parent_stack;
64
+
(match element.tag with
65
+
| Tag.Html _ -> state.parent_stack <- name_lower :: state.parent_stack
66
+
| _ -> ());
66
67
67
-
if namespace <> None then ()
68
-
else begin
69
-
match name_lower with
70
-
| "template" ->
71
-
state.in_template <- state.in_template + 1;
72
-
(* Track if template is direct child of dl *)
73
-
begin match current_dl state with
74
-
| Some dl_ctx when state.div_in_dl_stack = [] ->
75
-
dl_ctx.has_template <- true
76
-
| _ -> ()
77
-
end
68
+
match element.tag with
69
+
| Tag.Html `Template ->
70
+
state.in_template <- state.in_template + 1;
71
+
(* Track if template is direct child of dl *)
72
+
(match current_dl state with
73
+
| Some dl_ctx when state.div_in_dl_stack = [] ->
74
+
dl_ctx.has_template <- true
75
+
| _ -> ())
78
76
79
-
| "dl" when state.in_template = 0 ->
80
-
(* Check for nested dl - error if direct child of dl OR inside div-in-dl *)
81
-
begin match current_div state with
82
-
| Some _ ->
83
-
Message_collector.add_typed collector
84
-
(`Element (`Not_allowed_as_child (`Child "dl", `Parent "div")))
85
-
| None ->
86
-
match current_dl state with
87
-
| Some _ when state.in_dt_dd = 0 ->
88
-
Message_collector.add_typed collector
89
-
(`Element (`Not_allowed_as_child (`Child "dl", `Parent "dl")))
90
-
| _ -> ()
91
-
end;
92
-
let ctx = {
93
-
has_dt = false;
94
-
has_dd = false;
95
-
last_was_dt = false;
96
-
contains_div = false;
97
-
contains_dt_dd = false;
98
-
dd_before_dt_error_reported = false;
99
-
has_template = false;
100
-
} in
101
-
state.dl_stack <- ctx :: state.dl_stack
77
+
| Tag.Html `Dl when state.in_template = 0 ->
78
+
(* Check for nested dl - error if direct child of dl OR inside div-in-dl *)
79
+
(match current_div state with
80
+
| Some _ ->
81
+
Message_collector.add_typed collector
82
+
(`Element (`Not_allowed_as_child (`Child "dl", `Parent "div")))
83
+
| None ->
84
+
match current_dl state with
85
+
| Some _ when state.in_dt_dd = 0 ->
86
+
Message_collector.add_typed collector
87
+
(`Element (`Not_allowed_as_child (`Child "dl", `Parent "dl")))
88
+
| _ -> ());
89
+
let ctx = {
90
+
has_dt = false;
91
+
has_dd = false;
92
+
last_was_dt = false;
93
+
contains_div = false;
94
+
contains_dt_dd = false;
95
+
dd_before_dt_error_reported = false;
96
+
has_template = false;
97
+
} in
98
+
state.dl_stack <- ctx :: state.dl_stack
102
99
103
-
| "div" when state.in_template = 0 ->
104
-
begin match current_dl state with
105
-
| Some dl_ctx when state.div_in_dl_stack = [] ->
106
-
(* Direct div child of dl *)
107
-
dl_ctx.contains_div <- true;
108
-
(* Check for mixed content - if we already have dt/dd, div is not allowed *)
109
-
if dl_ctx.contains_dt_dd then
110
-
Message_collector.add_typed collector
111
-
(`Element (`Not_allowed_as_child (`Child "div", `Parent "dl")));
112
-
(* Check that role is only presentation or none *)
113
-
(match Attr_utils.get_attr "role" attrs with
100
+
| Tag.Html `Div when state.in_template = 0 ->
101
+
(match current_dl state with
102
+
| Some dl_ctx when state.div_in_dl_stack = [] ->
103
+
dl_ctx.contains_div <- true;
104
+
if dl_ctx.contains_dt_dd then
105
+
Message_collector.add_typed collector
106
+
(`Element (`Not_allowed_as_child (`Child "div", `Parent "dl")));
107
+
(match Attr.get_role element.attrs with
114
108
| Some role_value ->
115
109
let role_lower = String.lowercase_ascii (String.trim role_value) in
116
110
if role_lower <> "presentation" && role_lower <> "none" then
117
111
Message_collector.add_typed collector (`Li_role `Div_in_dl_bad_role)
118
112
| None -> ());
119
-
let div_ctx = { has_dt = false; has_dd = false; group_count = 0; in_dd_part = false } in
120
-
state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack
121
-
| Some _ when state.div_in_dl_stack <> [] ->
122
-
Message_collector.add_typed collector
123
-
(`Element (`Not_allowed_as_child (`Child "div", `Parent "div")))
124
-
| _ -> ()
125
-
end
126
-
127
-
| "dt" when state.in_template = 0 ->
128
-
state.in_dt_dd <- state.in_dt_dd + 1;
129
-
begin match current_div state with
130
-
| Some div_ctx ->
131
-
(* If we've already seen dd, this dt starts a new group - which is not allowed *)
132
-
if div_ctx.in_dd_part then begin
133
-
Message_collector.add_typed collector
134
-
(`Element (`Not_allowed_as_child (`Child "dt", `Parent "div")));
135
-
div_ctx.group_count <- div_ctx.group_count + 1;
136
-
div_ctx.in_dd_part <- false
137
-
end;
138
-
div_ctx.has_dt <- true
139
-
| None ->
140
-
match current_dl state with
141
-
| Some dl_ctx ->
142
-
dl_ctx.has_dt <- true;
143
-
dl_ctx.last_was_dt <- true;
144
-
dl_ctx.contains_dt_dd <- true;
145
-
(* Check for mixed content - if we already have div, dt is not allowed *)
146
-
if dl_ctx.contains_div then
147
-
Message_collector.add_typed collector
148
-
(`Element (`Not_allowed_as_child (`Child "dt", `Parent "dl")))
149
-
| None ->
150
-
(* dt outside dl context - error *)
151
-
let parent = match current_parent state with
152
-
| Some p -> p
153
-
| None -> "document"
154
-
in
155
-
Message_collector.add_typed collector
156
-
(`Element (`Not_allowed_as_child (`Child "dt", `Parent parent)))
157
-
end
113
+
let div_ctx = { has_dt = false; has_dd = false; group_count = 0; in_dd_part = false } in
114
+
state.div_in_dl_stack <- div_ctx :: state.div_in_dl_stack
115
+
| Some _ when state.div_in_dl_stack <> [] ->
116
+
Message_collector.add_typed collector
117
+
(`Element (`Not_allowed_as_child (`Child "div", `Parent "div")))
118
+
| _ -> ())
158
119
159
-
| "dd" when state.in_template = 0 ->
160
-
state.in_dt_dd <- state.in_dt_dd + 1;
161
-
begin match current_div state with
162
-
| Some div_ctx ->
163
-
div_ctx.has_dd <- true;
164
-
(* First dd after dt(s) completes the first group *)
165
-
if not div_ctx.in_dd_part then begin
166
-
div_ctx.in_dd_part <- true;
167
-
div_ctx.group_count <- div_ctx.group_count + 1
168
-
end
169
-
| None ->
170
-
match current_dl state with
171
-
| Some dl_ctx ->
172
-
(* Check if dd appears before any dt - only report once per dl *)
173
-
if not dl_ctx.has_dt && not dl_ctx.dd_before_dt_error_reported then begin
174
-
dl_ctx.dd_before_dt_error_reported <- true;
175
-
Message_collector.add_typed collector
176
-
(`Element (`Missing_child_generic (`Parent "dl")))
177
-
end;
178
-
dl_ctx.has_dd <- true;
179
-
dl_ctx.last_was_dt <- false;
180
-
dl_ctx.contains_dt_dd <- true;
181
-
(* Check for mixed content *)
182
-
if dl_ctx.contains_div then
183
-
Message_collector.add_typed collector
184
-
(`Element (`Not_allowed_as_child (`Child "dd", `Parent "dl")))
185
-
| None ->
186
-
(* dd outside dl context - error *)
187
-
let parent = match current_parent state with
188
-
| Some p -> p
189
-
| None -> "document"
190
-
in
191
-
Message_collector.add_typed collector
192
-
(`Element (`Not_allowed_as_child (`Child "dd", `Parent parent)))
193
-
end
120
+
| Tag.Html `Dt when state.in_template = 0 ->
121
+
state.in_dt_dd <- state.in_dt_dd + 1;
122
+
(match current_div state with
123
+
| Some div_ctx ->
124
+
if div_ctx.in_dd_part then begin
125
+
Message_collector.add_typed collector
126
+
(`Element (`Not_allowed_as_child (`Child "dt", `Parent "div")));
127
+
div_ctx.group_count <- div_ctx.group_count + 1;
128
+
div_ctx.in_dd_part <- false
129
+
end;
130
+
div_ctx.has_dt <- true
131
+
| None ->
132
+
match current_dl state with
133
+
| Some dl_ctx ->
134
+
dl_ctx.has_dt <- true;
135
+
dl_ctx.last_was_dt <- true;
136
+
dl_ctx.contains_dt_dd <- true;
137
+
if dl_ctx.contains_div then
138
+
Message_collector.add_typed collector
139
+
(`Element (`Not_allowed_as_child (`Child "dt", `Parent "dl")))
140
+
| None ->
141
+
let parent = match current_parent state with
142
+
| Some p -> p
143
+
| None -> "document"
144
+
in
145
+
Message_collector.add_typed collector
146
+
(`Element (`Not_allowed_as_child (`Child "dt", `Parent parent))))
194
147
195
-
| _ -> ()
196
-
end
148
+
| Tag.Html `Dd when state.in_template = 0 ->
149
+
state.in_dt_dd <- state.in_dt_dd + 1;
150
+
(match current_div state with
151
+
| Some div_ctx ->
152
+
div_ctx.has_dd <- true;
153
+
if not div_ctx.in_dd_part then begin
154
+
div_ctx.in_dd_part <- true;
155
+
div_ctx.group_count <- div_ctx.group_count + 1
156
+
end
157
+
| None ->
158
+
match current_dl state with
159
+
| Some dl_ctx ->
160
+
if not dl_ctx.has_dt && not dl_ctx.dd_before_dt_error_reported then begin
161
+
dl_ctx.dd_before_dt_error_reported <- true;
162
+
Message_collector.add_typed collector
163
+
(`Element (`Missing_child_generic (`Parent "dl")))
164
+
end;
165
+
dl_ctx.has_dd <- true;
166
+
dl_ctx.last_was_dt <- false;
167
+
dl_ctx.contains_dt_dd <- true;
168
+
if dl_ctx.contains_div then
169
+
Message_collector.add_typed collector
170
+
(`Element (`Not_allowed_as_child (`Child "dd", `Parent "dl")))
171
+
| None ->
172
+
let parent = match current_parent state with
173
+
| Some p -> p
174
+
| None -> "document"
175
+
in
176
+
Message_collector.add_typed collector
177
+
(`Element (`Not_allowed_as_child (`Child "dd", `Parent parent))))
197
178
198
-
let end_element state ~name ~namespace collector =
199
-
if namespace <> None then ()
200
-
else begin
201
-
let name_lower = String.lowercase_ascii name in
179
+
| _ -> ()
202
180
181
+
let end_element state ~tag collector =
182
+
match tag with
183
+
| Tag.Html _ ->
203
184
(* Pop from parent stack *)
204
185
(match state.parent_stack with
205
-
| _ :: rest -> state.parent_stack <- rest
206
-
| [] -> ());
186
+
| _ :: rest -> state.parent_stack <- rest
187
+
| [] -> ());
207
188
208
-
match name_lower with
209
-
| "template" ->
210
-
state.in_template <- max 0 (state.in_template - 1)
189
+
(match tag with
190
+
| Tag.Html `Template ->
191
+
state.in_template <- max 0 (state.in_template - 1)
211
192
212
-
| "dt" | "dd" when state.in_template = 0 ->
213
-
state.in_dt_dd <- max 0 (state.in_dt_dd - 1)
193
+
| Tag.Html (`Dt | `Dd) when state.in_template = 0 ->
194
+
state.in_dt_dd <- max 0 (state.in_dt_dd - 1)
214
195
215
-
| "dl" when state.in_template = 0 ->
216
-
begin match state.dl_stack with
217
-
| ctx :: rest ->
218
-
state.dl_stack <- rest;
219
-
(* Check dl content model at end *)
220
-
if ctx.contains_dt_dd then begin
221
-
(* Direct dt/dd content - must have both *)
222
-
if not ctx.has_dt && not ctx.dd_before_dt_error_reported then
223
-
Message_collector.add_typed collector
224
-
(`Element (`Missing_child_generic (`Parent "dl")))
225
-
else if not ctx.has_dd then begin
226
-
if ctx.has_template then
196
+
| Tag.Html `Dl when state.in_template = 0 ->
197
+
(match state.dl_stack with
198
+
| ctx :: rest ->
199
+
state.dl_stack <- rest;
200
+
if ctx.contains_dt_dd then begin
201
+
if not ctx.has_dt && not ctx.dd_before_dt_error_reported then
227
202
Message_collector.add_typed collector
228
-
(`Element (`Missing_child_one_of (`Parent "dl", `Children ["dd"])))
229
-
else
203
+
(`Element (`Missing_child_generic (`Parent "dl")))
204
+
else if not ctx.has_dd then begin
205
+
if ctx.has_template then
206
+
Message_collector.add_typed collector
207
+
(`Element (`Missing_child_one_of (`Parent "dl", `Children ["dd"])))
208
+
else
209
+
Message_collector.add_typed collector
210
+
(`Element (`Missing_child (`Parent "dl", `Child "dd")))
211
+
end
212
+
else if ctx.last_was_dt then
230
213
Message_collector.add_typed collector
231
214
(`Element (`Missing_child (`Parent "dl", `Child "dd")))
232
-
end
233
-
else if ctx.last_was_dt then
234
-
Message_collector.add_typed collector
235
-
(`Element (`Missing_child (`Parent "dl", `Child "dd")))
236
-
end else if not ctx.contains_div && not ctx.has_dt && not ctx.has_dd then
237
-
()
238
-
| [] -> ()
239
-
end
215
+
end else if not ctx.contains_div && not ctx.has_dt && not ctx.has_dd then
216
+
()
217
+
| [] -> ())
240
218
241
-
| "div" when state.in_template = 0 ->
242
-
begin match state.div_in_dl_stack with
243
-
| div_ctx :: rest ->
244
-
state.div_in_dl_stack <- rest;
245
-
(* Check div in dl must have both dt and dd *)
246
-
if not div_ctx.has_dt && not div_ctx.has_dd then
247
-
Message_collector.add_typed collector
248
-
(`Element (`Missing_child (`Parent "div", `Child "dd")))
249
-
else if not div_ctx.has_dt then
250
-
Message_collector.add_typed collector
251
-
(`Element (`Missing_child (`Parent "div", `Child "dt")))
252
-
else if not div_ctx.has_dd then
253
-
Message_collector.add_typed collector
254
-
(`Element (`Missing_child (`Parent "div", `Child "dd")))
255
-
| [] -> ()
256
-
end
219
+
| Tag.Html `Div when state.in_template = 0 ->
220
+
(match state.div_in_dl_stack with
221
+
| div_ctx :: rest ->
222
+
state.div_in_dl_stack <- rest;
223
+
if not div_ctx.has_dt && not div_ctx.has_dd then
224
+
Message_collector.add_typed collector
225
+
(`Element (`Missing_child (`Parent "div", `Child "dd")))
226
+
else if not div_ctx.has_dt then
227
+
Message_collector.add_typed collector
228
+
(`Element (`Missing_child (`Parent "div", `Child "dt")))
229
+
else if not div_ctx.has_dd then
230
+
Message_collector.add_typed collector
231
+
(`Element (`Missing_child (`Parent "div", `Child "dd")))
232
+
| [] -> ())
257
233
258
-
| _ -> ()
259
-
end
234
+
| _ -> ())
235
+
| _ -> ()
260
236
261
237
let characters state text collector =
262
238
if state.in_template > 0 then ()
+12
-11
lib/htmlrw_check/specialized/h1_checker.ml
+12
-11
lib/htmlrw_check/specialized/h1_checker.ml
···
14
14
state.h1_count <- 0;
15
15
state.svg_depth <- 0
16
16
17
-
let start_element state ~name ~namespace ~attrs collector =
18
-
ignore attrs;
19
-
let name_lower = String.lowercase_ascii name in
17
+
let start_element state ~element collector =
20
18
(* Track SVG depth - h1 inside SVG (foreignObject, desc) shouldn't count *)
21
-
if name_lower = "svg" then
19
+
match element.Element.tag with
20
+
| Tag.Svg _ ->
22
21
state.svg_depth <- state.svg_depth + 1
23
-
else if namespace <> None || state.svg_depth > 0 then
24
-
() (* Skip non-HTML namespace or inside SVG *)
25
-
else if name_lower = "h1" then begin
22
+
| Tag.Html `H1 when state.svg_depth = 0 ->
26
23
state.h1_count <- state.h1_count + 1;
27
24
if state.h1_count > 1 then
28
25
Message_collector.add_typed collector (`Misc `Multiple_h1)
29
-
end
26
+
| Tag.Html _ when state.svg_depth = 0 ->
27
+
() (* Other HTML elements outside SVG *)
28
+
| _ ->
29
+
() (* Non-HTML or inside SVG *)
30
30
31
-
let end_element state ~name ~namespace:_ _collector =
32
-
let name_lower = String.lowercase_ascii name in
33
-
if name_lower = "svg" && state.svg_depth > 0 then
31
+
let end_element state ~tag _collector =
32
+
match tag with
33
+
| Tag.Svg _ when state.svg_depth > 0 ->
34
34
state.svg_depth <- state.svg_depth - 1
35
+
| _ -> ()
35
36
36
37
let characters _state _text _collector = ()
37
38
let end_document _state _collector = ()
+48
-59
lib/htmlrw_check/specialized/heading_checker.ml
+48
-59
lib/htmlrw_check/specialized/heading_checker.ml
···
12
12
mutable h1_count : int;
13
13
mutable has_any_heading : bool;
14
14
mutable first_heading_checked : bool;
15
-
mutable in_heading : string option;
15
+
mutable in_heading : Tag.html_tag option;
16
16
mutable heading_has_text : bool;
17
17
}
18
18
···
34
34
state.in_heading <- None;
35
35
state.heading_has_text <- false
36
36
37
-
(** Extract heading level from tag name (e.g., "h1" -> 1). *)
38
-
let heading_level name =
39
-
match String.lowercase_ascii name with
40
-
| "h1" -> Some 1
41
-
| "h2" -> Some 2
42
-
| "h3" -> Some 3
43
-
| "h4" -> Some 4
44
-
| "h5" -> Some 5
45
-
| "h6" -> Some 6
46
-
| _ -> None
47
-
48
37
(** Check if text is effectively empty (only whitespace). *)
49
38
let is_empty_text text =
50
39
let rec check i =
···
57
46
in
58
47
check 0
59
48
60
-
let start_element state ~name ~namespace:_ ~attrs:_ collector =
61
-
match heading_level name with
62
-
| Some level ->
63
-
state.has_any_heading <- true;
49
+
let start_element state ~element collector =
50
+
match element.Element.tag with
51
+
| Tag.Html (#Tag.heading_tag as h) ->
52
+
let level = match Tag.heading_level h with Some l -> l | None -> 0 in
53
+
let name = Tag.html_tag_to_string h in
54
+
state.has_any_heading <- true;
64
55
65
-
(* Check if this is the first heading *)
66
-
if not state.first_heading_checked then begin
67
-
state.first_heading_checked <- true;
68
-
if level <> 1 then
56
+
(* Check if this is the first heading *)
57
+
if not state.first_heading_checked then begin
58
+
state.first_heading_checked <- true;
59
+
if level <> 1 then
60
+
Message_collector.add_typed collector
61
+
(`Generic (Printf.sprintf
62
+
"First heading in document is <%s>, should typically be <h1>" name))
63
+
end;
64
+
65
+
(* Track h1 count *)
66
+
if level = 1 then begin
67
+
state.h1_count <- state.h1_count + 1;
68
+
if state.h1_count > 1 then
69
+
Message_collector.add_typed collector (`Misc `Multiple_h1)
70
+
end;
71
+
72
+
(* Check for skipped levels *)
73
+
begin match state.current_level with
74
+
| None ->
75
+
state.current_level <- Some level
76
+
| Some prev_level ->
77
+
let diff = level - prev_level in
78
+
if diff > 1 then
69
79
Message_collector.add_typed collector
70
80
(`Generic (Printf.sprintf
71
-
"First heading in document is <%s>, should typically be <h1>" name))
72
-
end;
81
+
"Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users"
82
+
name prev_level (diff - 1) (if diff > 2 then "s" else "")));
83
+
state.current_level <- Some level
84
+
end;
73
85
74
-
(* Track h1 count *)
75
-
if level = 1 then begin
76
-
state.h1_count <- state.h1_count + 1;
77
-
if state.h1_count > 1 then
78
-
Message_collector.add_typed collector (`Misc `Multiple_h1)
79
-
end;
86
+
(* Track that we're in a heading to check for empty content *)
87
+
state.in_heading <- Some h;
88
+
state.heading_has_text <- false
89
+
| _ -> ()
80
90
81
-
(* Check for skipped levels *)
82
-
begin match state.current_level with
83
-
| None ->
84
-
state.current_level <- Some level
85
-
| Some prev_level ->
86
-
let diff = level - prev_level in
87
-
if diff > 1 then
88
-
Message_collector.add_typed collector
89
-
(`Generic (Printf.sprintf
90
-
"Heading level skipped: <%s> follows <h%d>, skipping %d level%s. This can confuse screen reader users"
91
-
name prev_level (diff - 1) (if diff > 2 then "s" else "")));
92
-
state.current_level <- Some level
93
-
end;
94
-
95
-
(* Track that we're in a heading to check for empty content *)
96
-
state.in_heading <- Some name;
97
-
state.heading_has_text <- false
98
-
99
-
| None ->
100
-
(* Not a heading element *)
101
-
()
102
-
103
-
let end_element state ~name ~namespace:_ collector =
104
-
match state.in_heading with
105
-
| Some heading when heading = name ->
106
-
if not state.heading_has_text then
107
-
Message_collector.add_typed collector
108
-
(`Generic (Printf.sprintf
109
-
"Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers" name));
110
-
state.in_heading <- None;
111
-
state.heading_has_text <- false
91
+
let end_element state ~tag collector =
92
+
match state.in_heading, tag with
93
+
| Some h, Tag.Html h2 when h = h2 ->
94
+
if not state.heading_has_text then
95
+
Message_collector.add_typed collector
96
+
(`Generic (Printf.sprintf
97
+
"Heading <%s> is empty or contains only whitespace. Empty headings are problematic for screen readers"
98
+
(Tag.html_tag_to_string h)));
99
+
state.in_heading <- None;
100
+
state.heading_has_text <- false
112
101
| _ -> ()
113
102
114
103
let characters state text _collector =
+23
-29
lib/htmlrw_check/specialized/importmap_checker.ml
+23
-29
lib/htmlrw_check/specialized/importmap_checker.ml
···
265
265
266
266
List.rev !errors
267
267
268
-
let start_element state ~name ~namespace ~attrs _collector =
269
-
if namespace <> None then ()
270
-
else begin
271
-
let name_lower = String.lowercase_ascii name in
272
-
if name_lower = "script" then begin
273
-
(* Check if type="importmap" *)
274
-
let type_attr = List.find_opt (fun (n, _) ->
275
-
String.lowercase_ascii n = "type"
276
-
) attrs in
277
-
match type_attr with
278
-
| Some (_, v) when String.lowercase_ascii v = "importmap" ->
279
-
state.in_importmap <- true;
280
-
Buffer.clear state.content
281
-
| _ -> ()
282
-
end
283
-
end
268
+
let start_element state ~element _collector =
269
+
match element.Element.tag with
270
+
| Tag.Html `Script ->
271
+
(* Check if type="importmap" *)
272
+
let type_attr = List.find_opt (fun (n, _) ->
273
+
String.lowercase_ascii n = "type"
274
+
) element.raw_attrs in
275
+
(match type_attr with
276
+
| Some (_, v) when String.lowercase_ascii v = "importmap" ->
277
+
state.in_importmap <- true;
278
+
Buffer.clear state.content
279
+
| _ -> ())
280
+
| _ -> () (* Only script elements can be importmaps *)
284
281
285
282
let error_to_typed = function
286
283
| InvalidJSON _ -> `Importmap `Invalid_json
···
295
292
| InvalidScopeValue _ -> `Importmap `Scopes_value_invalid_url
296
293
| ScopeValueNotObject -> `Importmap `Scopes_values_not_object
297
294
298
-
let end_element state ~name ~namespace collector =
299
-
if namespace <> None then ()
300
-
else begin
301
-
let name_lower = String.lowercase_ascii name in
302
-
if name_lower = "script" && state.in_importmap then begin
303
-
let content = Buffer.contents state.content in
304
-
let errors = validate_importmap content in
305
-
List.iter (fun err ->
306
-
Message_collector.add_typed collector (error_to_typed err)
307
-
) errors;
308
-
state.in_importmap <- false
309
-
end
310
-
end
295
+
let end_element state ~tag collector =
296
+
match tag with
297
+
| Tag.Html `Script when state.in_importmap ->
298
+
let content = Buffer.contents state.content in
299
+
let errors = validate_importmap content in
300
+
List.iter (fun err ->
301
+
Message_collector.add_typed collector (error_to_typed err)
302
+
) errors;
303
+
state.in_importmap <- false
304
+
| _ -> ()
311
305
312
306
let characters state text _collector =
313
307
if state.in_importmap then
+37
-40
lib/htmlrw_check/specialized/label_checker.ml
+37
-40
lib/htmlrw_check/specialized/label_checker.ml
···
50
50
state.labels_for <- [];
51
51
state.labelable_ids <- []
52
52
53
-
let start_element state ~name ~namespace ~attrs collector =
54
-
if namespace <> None then ()
55
-
else begin
56
-
let name_lower = String.lowercase_ascii name in
53
+
let start_element state ~element collector =
54
+
match element.Element.tag with
55
+
| Tag.Html `Label ->
56
+
state.in_label <- true;
57
+
state.label_depth <- 1; (* Start at 1 for the label element itself *)
58
+
state.labelable_count <- 0;
59
+
let for_value = get_attr element.raw_attrs "for" in
60
+
let has_role = get_attr element.raw_attrs "role" <> None in
61
+
let has_aria_label = get_attr element.raw_attrs "aria-label" <> None in
62
+
state.label_for_value <- for_value;
63
+
state.label_has_role <- has_role;
64
+
state.label_has_aria_label <- has_aria_label;
65
+
(* Track this label if it has for= and role/aria-label *)
66
+
(match for_value with
67
+
| Some target when has_role || has_aria_label ->
68
+
state.labels_for <- { for_target = target; has_role; has_aria_label } :: state.labels_for
69
+
| _ -> ())
70
+
71
+
| Tag.Html tag ->
72
+
let name_lower = String.lowercase_ascii (Tag.tag_to_string (Tag.Html tag)) in
57
73
58
-
if name_lower = "label" then begin
59
-
state.in_label <- true;
60
-
state.label_depth <- 1; (* Start at 1 for the label element itself *)
61
-
state.labelable_count <- 0;
62
-
let for_value = get_attr attrs "for" in
63
-
let has_role = get_attr attrs "role" <> None in
64
-
let has_aria_label = get_attr attrs "aria-label" <> None in
65
-
state.label_for_value <- for_value;
66
-
state.label_has_role <- has_role;
67
-
state.label_has_aria_label <- has_aria_label;
68
-
(* Track this label if it has for= and role/aria-label *)
69
-
(match for_value with
70
-
| Some target when has_role || has_aria_label ->
71
-
state.labels_for <- { for_target = target; has_role; has_aria_label } :: state.labels_for
72
-
| _ -> ())
73
-
end;
74
74
(* Track labelable element IDs *)
75
75
(if List.mem name_lower labelable_elements then
76
-
match get_attr attrs "id" with
76
+
match get_attr element.raw_attrs "id" with
77
77
| Some id -> state.labelable_ids <- id :: state.labelable_ids
78
78
| None -> ());
79
79
80
-
if state.in_label && name_lower <> "label" then begin
80
+
if state.in_label then begin
81
81
state.label_depth <- state.label_depth + 1;
82
82
83
83
(* Check for labelable elements inside label *)
···
89
89
(* Check if label has for attribute and descendant has mismatched id *)
90
90
(match state.label_for_value with
91
91
| Some for_value ->
92
-
let descendant_id = get_attr attrs "id" in
92
+
let descendant_id = get_attr element.raw_attrs "id" in
93
93
(match descendant_id with
94
94
| None ->
95
95
Message_collector.add_typed collector (`Label `For_id_mismatch)
···
99
99
| None -> ())
100
100
end
101
101
end
102
-
end
103
102
104
-
let end_element state ~name ~namespace collector =
105
-
if namespace <> None then ()
106
-
else begin
107
-
let name_lower = String.lowercase_ascii name in
103
+
| _ -> () (* Non-HTML elements (SVG, MathML, etc.) *)
108
104
109
-
if state.in_label then begin
110
-
state.label_depth <- state.label_depth - 1;
105
+
let end_element state ~tag collector =
106
+
if state.in_label then begin
107
+
state.label_depth <- state.label_depth - 1;
111
108
112
-
if name_lower = "label" && state.label_depth = 0 then begin
113
-
if state.label_has_role && state.labelable_count > 0 then
114
-
Message_collector.add_typed collector (`Label `Role_on_ancestor);
115
-
state.in_label <- false;
116
-
state.labelable_count <- 0;
117
-
state.label_for_value <- None;
118
-
state.label_has_role <- false;
119
-
state.label_has_aria_label <- false
120
-
end
121
-
end
109
+
match tag with
110
+
| Tag.Html `Label when state.label_depth = 0 ->
111
+
if state.label_has_role && state.labelable_count > 0 then
112
+
Message_collector.add_typed collector (`Label `Role_on_ancestor);
113
+
state.in_label <- false;
114
+
state.labelable_count <- 0;
115
+
state.label_for_value <- None;
116
+
state.label_has_role <- false;
117
+
state.label_has_aria_label <- false
118
+
| _ -> ()
122
119
end
123
120
124
121
let characters _state _text _collector = ()
+4
-3
lib/htmlrw_check/specialized/language_checker.ml
+4
-3
lib/htmlrw_check/specialized/language_checker.ml
···
89
89
| _ -> ()
90
90
end
91
91
92
-
let start_element _state ~name ~namespace ~attrs collector =
92
+
let start_element _state ~element collector =
93
93
let location = None in
94
-
process_language_attrs ~element:name ~namespace ~attrs ~location collector
94
+
let name = Tag.tag_to_string element.Element.tag in
95
+
process_language_attrs ~element:name ~namespace:None ~attrs:element.raw_attrs ~location collector
95
96
96
-
let end_element _state ~name:_ ~namespace:_ _collector =
97
+
let end_element _state ~tag:_ _collector =
97
98
()
98
99
99
100
let characters _state _text _collector =
+5
-2
lib/htmlrw_check/specialized/microdata_checker.ml
+5
-2
lib/htmlrw_check/specialized/microdata_checker.ml
···
270
270
let all_nodes = Hashtbl.to_seq_keys graph |> List.of_seq in
271
271
check_all_nodes [] all_nodes
272
272
273
-
let start_element state ~name ~namespace:_ ~attrs collector =
273
+
let start_element state ~element collector =
274
+
let name = Tag.tag_to_string element.Element.tag in
275
+
let attrs = element.raw_attrs in
274
276
let location = None in
275
277
track_id state attrs;
276
278
process_microdata_attrs state ~element:name ~attrs ~location collector
277
279
278
-
let end_element state ~name ~namespace:_ _collector =
280
+
let end_element state ~tag _collector =
281
+
let name = Tag.tag_to_string tag in
279
282
(* Pop itemscope from stack if this element had one *)
280
283
match state.scope_stack with
281
284
| scope :: rest when scope.element = name ->
+9
-8
lib/htmlrw_check/specialized/mime_type_checker.ml
+9
-8
lib/htmlrw_check/specialized/mime_type_checker.ml
···
156
156
if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None
157
157
) attrs
158
158
159
-
let start_element _state ~name ~namespace ~attrs collector =
160
-
if namespace <> None then ()
161
-
else begin
159
+
let start_element _state ~element collector =
160
+
match element.Element.tag with
161
+
| Tag.Html tag ->
162
+
let name = Tag.html_tag_to_string tag in
162
163
let name_lower = String.lowercase_ascii name in
163
-
match List.assoc_opt name_lower mime_type_attrs with
164
+
(match List.assoc_opt name_lower mime_type_attrs with
164
165
| None -> ()
165
166
| Some type_attrs ->
166
167
List.iter (fun attr_name ->
167
-
match get_attr_value attr_name attrs with
168
+
match get_attr_value attr_name element.raw_attrs with
168
169
| None -> ()
169
170
| Some value ->
170
171
(* Don't validate empty type attributes or special script types *)
···
186
187
| Some err ->
187
188
Message_collector.add_typed collector
188
189
(`Attr (`Bad_value_generic (`Message err)))
189
-
) type_attrs
190
-
end
190
+
) type_attrs)
191
+
| _ -> () (* Non-HTML elements don't have MIME type checks *)
191
192
192
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
193
+
let end_element _state ~tag:_ _collector = ()
193
194
let characters _state _text _collector = ()
194
195
let end_document _state _collector = ()
195
196
+2
-2
lib/htmlrw_check/specialized/normalization_checker.ml
+2
-2
lib/htmlrw_check/specialized/normalization_checker.ml
···
40
40
if end_pos = len then s
41
41
else String.sub s 0 end_pos
42
42
43
-
let start_element _state ~name:_ ~namespace:_ ~attrs:_ _collector = ()
43
+
let start_element _state ~element:_ _collector = ()
44
44
45
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
45
+
let end_element _state ~tag:_ _collector = ()
46
46
47
47
let characters _state text collector =
48
48
(* Skip empty text or whitespace-only text *)
+90
-91
lib/htmlrw_check/specialized/picture_checker.ml
+90
-91
lib/htmlrw_check/specialized/picture_checker.ml
···
93
93
let check_img_attrs attrs collector =
94
94
check_disallowed_attrs "img" disallowed_img_attrs attrs collector
95
95
96
-
let start_element state ~name ~namespace ~attrs collector =
97
-
let name_lower = String.lowercase_ascii name in
96
+
let start_element state ~element collector =
97
+
let name_lower = Tag.tag_to_string element.Element.tag in
98
+
let attrs = element.raw_attrs in
98
99
99
100
(* Check for disallowed children of picture first - even foreign content *)
100
101
if state.in_picture && state.picture_depth = 1 then begin
···
103
104
end;
104
105
105
106
(* Rest of checks only apply to HTML namespace elements *)
106
-
match namespace with
107
-
| Some _ -> ()
108
-
| None ->
109
-
(match name_lower with
110
-
| "picture" ->
111
-
(* Check if picture is in a disallowed parent context *)
112
-
(match state.parent_stack with
113
-
| parent :: _ when List.mem parent disallowed_picture_parents ->
114
-
Message_collector.add_typed collector
115
-
(`Element (`Not_allowed_as_child (`Child "picture", `Parent parent)))
116
-
| _ -> ());
117
-
check_picture_attrs attrs collector;
118
-
state.in_picture <- true;
119
-
state.has_img_in_picture <- false;
120
-
state.picture_depth <- 0;
121
-
state.children_in_picture <- [];
122
-
state.last_was_img <- false;
123
-
state.has_source_after_img <- false;
124
-
state.has_always_matching_source <- false;
125
-
state.source_after_always_matching <- false
107
+
(match element.tag with
108
+
| Tag.Html `Picture ->
109
+
(* Check if picture is in a disallowed parent context *)
110
+
(match state.parent_stack with
111
+
| parent :: _ when List.mem parent disallowed_picture_parents ->
112
+
Message_collector.add_typed collector
113
+
(`Element (`Not_allowed_as_child (`Child "picture", `Parent parent)))
114
+
| _ -> ());
115
+
check_picture_attrs attrs collector;
116
+
state.in_picture <- true;
117
+
state.has_img_in_picture <- false;
118
+
state.picture_depth <- 0;
119
+
state.children_in_picture <- [];
120
+
state.last_was_img <- false;
121
+
state.has_source_after_img <- false;
122
+
state.has_always_matching_source <- false;
123
+
state.source_after_always_matching <- false
126
124
127
-
| "source" when state.in_picture && state.picture_depth = 1 ->
128
-
check_source_attrs_in_picture attrs collector;
129
-
state.children_in_picture <- "source" :: state.children_in_picture;
130
-
if state.last_was_img then
131
-
state.has_source_after_img <- true;
132
-
if state.has_always_matching_source then
133
-
state.source_after_always_matching <- true;
134
-
(* A source is "always matching" if it has no media/type, or media="" or media="all" *)
135
-
let media_value = Attr_utils.get_attr "media" attrs in
136
-
let has_type = Attr_utils.has_attr "type" attrs in
137
-
let is_media_all = match media_value with
138
-
| Some v -> String.lowercase_ascii (String.trim v) = "all"
139
-
| None -> false in
140
-
let is_media_empty = match media_value with
141
-
| Some v -> String.trim v = ""
142
-
| None -> false in
143
-
let is_always_matching = match media_value with
144
-
| None -> not has_type
145
-
| Some v ->
146
-
let trimmed = String.trim v in
147
-
trimmed = "" || String.lowercase_ascii trimmed = "all"
148
-
in
149
-
if is_always_matching then begin
150
-
state.has_always_matching_source <- true;
151
-
(* Only set flags to true, never reset to false *)
152
-
if is_media_all then state.always_matching_is_media_all <- true;
153
-
if is_media_empty then state.always_matching_is_media_empty <- true
154
-
end
125
+
| Tag.Html `Source when state.in_picture && state.picture_depth = 1 ->
126
+
check_source_attrs_in_picture attrs collector;
127
+
state.children_in_picture <- "source" :: state.children_in_picture;
128
+
if state.last_was_img then
129
+
state.has_source_after_img <- true;
130
+
if state.has_always_matching_source then
131
+
state.source_after_always_matching <- true;
132
+
(* A source is "always matching" if it has no media/type, or media="" or media="all" *)
133
+
let media_value = Attr_utils.get_attr "media" attrs in
134
+
let has_type = Attr_utils.has_attr "type" attrs in
135
+
let is_media_all = match media_value with
136
+
| Some v -> String.lowercase_ascii (String.trim v) = "all"
137
+
| None -> false in
138
+
let is_media_empty = match media_value with
139
+
| Some v -> String.trim v = ""
140
+
| None -> false in
141
+
let is_always_matching = match media_value with
142
+
| None -> not has_type
143
+
| Some v ->
144
+
let trimmed = String.trim v in
145
+
trimmed = "" || String.lowercase_ascii trimmed = "all"
146
+
in
147
+
if is_always_matching then begin
148
+
state.has_always_matching_source <- true;
149
+
if is_media_all then state.always_matching_is_media_all <- true;
150
+
if is_media_empty then state.always_matching_is_media_empty <- true
151
+
end
155
152
156
-
| "img" when state.in_picture && state.picture_depth = 1 ->
157
-
check_img_attrs attrs collector;
158
-
state.has_img_in_picture <- true;
159
-
state.children_in_picture <- "img" :: state.children_in_picture;
160
-
state.last_was_img <- true;
161
-
let img_count = List.length (List.filter (( = ) "img") state.children_in_picture) in
162
-
if img_count > 1 then
163
-
report_disallowed_child "picture" "img" collector;
164
-
if state.has_always_matching_source && Attr_utils.has_attr "srcset" attrs then
165
-
Message_collector.add_typed collector
166
-
(if state.always_matching_is_media_all then `Misc `Media_all
167
-
else if state.always_matching_is_media_empty then `Misc `Media_empty
168
-
else `Srcset `Source_needs_media_or_type)
153
+
| Tag.Html `Img when state.in_picture && state.picture_depth = 1 ->
154
+
check_img_attrs attrs collector;
155
+
state.has_img_in_picture <- true;
156
+
state.children_in_picture <- "img" :: state.children_in_picture;
157
+
state.last_was_img <- true;
158
+
let img_count = List.length (List.filter (( = ) "img") state.children_in_picture) in
159
+
if img_count > 1 then
160
+
report_disallowed_child "picture" "img" collector;
161
+
if state.has_always_matching_source && Attr_utils.has_attr "srcset" attrs then
162
+
Message_collector.add_typed collector
163
+
(if state.always_matching_is_media_all then `Misc `Media_all
164
+
else if state.always_matching_is_media_empty then `Misc `Media_empty
165
+
else `Srcset `Source_needs_media_or_type)
169
166
170
-
| "script" when state.in_picture && state.picture_depth = 1 ->
171
-
state.children_in_picture <- "script" :: state.children_in_picture
167
+
| Tag.Html `Script when state.in_picture && state.picture_depth = 1 ->
168
+
state.children_in_picture <- "script" :: state.children_in_picture
172
169
173
-
| "template" when state.in_picture && state.picture_depth = 1 ->
174
-
state.children_in_picture <- "template" :: state.children_in_picture
170
+
| Tag.Html `Template when state.in_picture && state.picture_depth = 1 ->
171
+
state.children_in_picture <- "template" :: state.children_in_picture
175
172
176
-
| "img" ->
177
-
check_img_attrs attrs collector
173
+
| Tag.Html `Img ->
174
+
check_img_attrs attrs collector
178
175
179
-
| _ -> ());
176
+
| _ -> ());
180
177
181
178
(* Track depth when inside picture *)
182
179
if state.in_picture then
183
180
state.picture_depth <- state.picture_depth + 1;
184
181
185
182
(* Push to parent stack (only HTML namespace elements) *)
186
-
if namespace = None then
187
-
state.parent_stack <- name_lower :: state.parent_stack
183
+
(match element.tag with
184
+
| Tag.Html _ -> state.parent_stack <- name_lower :: state.parent_stack
185
+
| _ -> ())
188
186
189
-
let end_element state ~name ~namespace collector =
190
-
match namespace with
191
-
| Some _ -> ()
192
-
| None ->
193
-
let name_lower = String.lowercase_ascii name in
194
-
187
+
let end_element state ~tag collector =
188
+
match tag with
189
+
| Tag.Html _ ->
190
+
let name_lower = Tag.tag_to_string tag in
195
191
if state.in_picture then
196
192
state.picture_depth <- state.picture_depth - 1;
197
193
198
-
if name_lower = "picture" && state.picture_depth = 0 then begin
199
-
if not state.has_img_in_picture then
200
-
Message_collector.add_typed collector (`Srcset `Picture_missing_img);
201
-
if state.has_source_after_img then
202
-
report_disallowed_child "picture" "source" collector;
203
-
if state.source_after_always_matching then
204
-
Message_collector.add_typed collector
205
-
(if state.always_matching_is_media_all then `Misc `Media_all
206
-
else if state.always_matching_is_media_empty then `Misc `Media_empty
207
-
else `Srcset `Source_needs_media_or_type);
208
-
state.in_picture <- false
209
-
end;
194
+
(match tag with
195
+
| Tag.Html `Picture when state.picture_depth = 0 ->
196
+
if not state.has_img_in_picture then
197
+
Message_collector.add_typed collector (`Srcset `Picture_missing_img);
198
+
if state.has_source_after_img then
199
+
report_disallowed_child "picture" "source" collector;
200
+
if state.source_after_always_matching then
201
+
Message_collector.add_typed collector
202
+
(if state.always_matching_is_media_all then `Misc `Media_all
203
+
else if state.always_matching_is_media_empty then `Misc `Media_empty
204
+
else `Srcset `Source_needs_media_or_type);
205
+
state.in_picture <- false
206
+
| _ -> ());
210
207
211
-
state.parent_stack <- match state.parent_stack with _ :: rest -> rest | [] -> []
208
+
ignore name_lower;
209
+
state.parent_stack <- (match state.parent_stack with _ :: rest -> rest | [] -> [])
210
+
| _ -> ()
212
211
213
212
let characters state text collector =
214
213
(* Text in picture element is not allowed *)
+63
-71
lib/htmlrw_check/specialized/ruby_checker.ml
+63
-71
lib/htmlrw_check/specialized/ruby_checker.ml
···
26
26
state.in_template <- 0
27
27
28
28
(** Check if element is phrasing content that can appear before rt *)
29
-
let is_phrasing_content name =
30
-
let name_lower = String.lowercase_ascii name in
31
-
(* rt and rp are special - they don't count as "content before rt" *)
32
-
name_lower <> "rt" && name_lower <> "rp"
29
+
let is_phrasing_content tag =
30
+
match tag with
31
+
| Tag.Html `Rt | Tag.Html `Rp -> false
32
+
| _ -> true
33
33
34
-
let start_element state ~name ~namespace ~attrs _collector =
35
-
ignore attrs;
36
-
if namespace <> None then ()
37
-
else begin
38
-
let name_lower = String.lowercase_ascii name in
34
+
let start_element state ~element _collector =
35
+
match element.Element.tag with
36
+
| Tag.Html `Template ->
37
+
state.in_template <- state.in_template + 1
39
38
40
-
if name_lower = "template" then
41
-
state.in_template <- state.in_template + 1;
39
+
| Tag.Html `Ruby when state.in_template = 0 ->
40
+
(* Push new ruby context *)
41
+
let info = {
42
+
has_rt = false;
43
+
has_content_before_rt = false;
44
+
saw_rt = false;
45
+
depth = 1; (* Set depth to 1 for the ruby element itself *)
46
+
} in
47
+
state.ruby_stack <- info :: state.ruby_stack
42
48
43
-
if state.in_template > 0 then ()
44
-
else begin
45
-
if name_lower = "ruby" then begin
46
-
(* Push new ruby context *)
47
-
let info = {
48
-
has_rt = false;
49
-
has_content_before_rt = false;
50
-
saw_rt = false;
51
-
depth = 0;
52
-
} in
53
-
state.ruby_stack <- info :: state.ruby_stack
49
+
| tag when state.in_template = 0 ->
50
+
(match state.ruby_stack with
51
+
| info :: _ ->
52
+
(* Inside a ruby element *)
53
+
if info.depth = 1 then begin
54
+
(* Direct children of ruby *)
55
+
match tag with
56
+
| Tag.Html `Rt ->
57
+
info.has_rt <- true;
58
+
info.saw_rt <- true
59
+
| _ when is_phrasing_content tag ->
60
+
if not info.saw_rt then
61
+
info.has_content_before_rt <- true
62
+
| _ -> ()
54
63
end;
64
+
info.depth <- info.depth + 1
65
+
| [] -> ())
55
66
56
-
match state.ruby_stack with
57
-
| info :: _ ->
58
-
(* Inside a ruby element *)
59
-
if name_lower = "ruby" then begin
60
-
(* This is the opening of ruby, set depth to 1 *)
61
-
info.depth <- 1
62
-
end else begin
63
-
if info.depth = 1 then begin
64
-
(* Direct children of ruby *)
65
-
if name_lower = "rt" then begin
66
-
info.has_rt <- true;
67
-
info.saw_rt <- true
68
-
end else if is_phrasing_content name_lower then begin
69
-
if not info.saw_rt then
70
-
info.has_content_before_rt <- true
71
-
end
72
-
end;
73
-
info.depth <- info.depth + 1
74
-
end
75
-
| [] -> ()
76
-
end
77
-
end
67
+
| _ -> () (* In template or non-HTML element *)
68
+
69
+
let end_element state ~tag collector =
70
+
match tag with
71
+
| Tag.Html `Template when state.in_template > 0 ->
72
+
state.in_template <- state.in_template - 1
78
73
79
-
let end_element state ~name ~namespace collector =
80
-
if namespace <> None then ()
81
-
else begin
82
-
let name_lower = String.lowercase_ascii name in
74
+
| Tag.Html `Ruby when state.in_template = 0 ->
75
+
(match state.ruby_stack with
76
+
| info :: rest ->
77
+
info.depth <- info.depth - 1;
78
+
(* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *)
79
+
if info.depth <= 0 then begin
80
+
(* Closing ruby element - validate *)
81
+
if not info.has_rt then
82
+
(* Empty ruby or ruby without any rt - needs rp or rt *)
83
+
Message_collector.add_typed collector
84
+
(`Element (`Missing_child_one_of (`Parent "ruby", `Children ["rp"; "rt"])))
85
+
else if not info.has_content_before_rt then
86
+
(* Has rt but missing content before it - needs content *)
87
+
Message_collector.add_typed collector
88
+
(`Element (`Missing_child (`Parent "ruby", `Child "rt")));
89
+
state.ruby_stack <- rest
90
+
end
91
+
| [] -> ())
83
92
84
-
if name_lower = "template" && state.in_template > 0 then
85
-
state.in_template <- state.in_template - 1;
93
+
| _ when state.in_template = 0 ->
94
+
(match state.ruby_stack with
95
+
| info :: _ ->
96
+
info.depth <- info.depth - 1
97
+
| [] -> ())
86
98
87
-
if state.in_template > 0 then ()
88
-
else begin
89
-
match state.ruby_stack with
90
-
| info :: rest ->
91
-
info.depth <- info.depth - 1;
92
-
(* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *)
93
-
if name_lower = "ruby" && info.depth <= 0 then begin
94
-
(* Closing ruby element - validate *)
95
-
if not info.has_rt then
96
-
(* Empty ruby or ruby without any rt - needs rp or rt *)
97
-
Message_collector.add_typed collector
98
-
(`Element (`Missing_child_one_of (`Parent "ruby", `Children ["rp"; "rt"])))
99
-
else if not info.has_content_before_rt then
100
-
(* Has rt but missing content before it - needs content *)
101
-
Message_collector.add_typed collector
102
-
(`Element (`Missing_child (`Parent "ruby", `Child "rt")));
103
-
state.ruby_stack <- rest
104
-
end
105
-
| [] -> ()
106
-
end
107
-
end
99
+
| _ -> () (* In template or non-HTML element *)
108
100
109
101
let characters state text _collector =
110
102
(* Text content counts as phrasing content before rt *)
+33
-44
lib/htmlrw_check/specialized/source_checker.ml
+33
-44
lib/htmlrw_check/specialized/source_checker.ml
···
23
23
| ctx :: _ -> ctx
24
24
| [] -> Other
25
25
26
-
let start_element state ~name ~namespace ~attrs collector =
27
-
if namespace <> None then ()
28
-
else begin
29
-
let name_lower = String.lowercase_ascii name in
30
-
match name_lower with
31
-
| "picture" ->
32
-
state.context_stack <- Picture :: state.context_stack
33
-
| "video" ->
34
-
state.context_stack <- Video :: state.context_stack
35
-
| "audio" ->
36
-
state.context_stack <- Audio :: state.context_stack
37
-
| "source" ->
38
-
let ctx = current_context state in
39
-
begin match ctx with
40
-
| Video | Audio ->
41
-
if Attr_utils.has_attr "srcset" attrs then
42
-
Message_collector.add_typed collector
43
-
(`Attr (`Not_allowed (`Attr "srcset", `Elem "source")));
44
-
if Attr_utils.has_attr "sizes" attrs then
45
-
Message_collector.add_typed collector
46
-
(`Attr (`Not_allowed (`Attr "sizes", `Elem "source")));
47
-
if Attr_utils.has_attr "width" attrs then
48
-
Message_collector.add_typed collector
49
-
(`Attr (`Not_allowed (`Attr "width", `Elem "source")));
50
-
if Attr_utils.has_attr "height" attrs then
51
-
Message_collector.add_typed collector
52
-
(`Attr (`Not_allowed (`Attr "height", `Elem "source")))
53
-
| Picture | Other -> ()
54
-
end
55
-
| _ ->
56
-
(* Any other element maintains current context *)
57
-
()
58
-
end
26
+
let start_element state ~element collector =
27
+
match element.Element.tag with
28
+
| Tag.Html `Picture ->
29
+
state.context_stack <- Picture :: state.context_stack
30
+
| Tag.Html `Video ->
31
+
state.context_stack <- Video :: state.context_stack
32
+
| Tag.Html `Audio ->
33
+
state.context_stack <- Audio :: state.context_stack
34
+
| Tag.Html `Source ->
35
+
let ctx = current_context state in
36
+
(match ctx with
37
+
| Video | Audio ->
38
+
if Attr_utils.has_attr "srcset" element.raw_attrs then
39
+
Message_collector.add_typed collector
40
+
(`Attr (`Not_allowed (`Attr "srcset", `Elem "source")));
41
+
if Attr_utils.has_attr "sizes" element.raw_attrs then
42
+
Message_collector.add_typed collector
43
+
(`Attr (`Not_allowed (`Attr "sizes", `Elem "source")));
44
+
if Attr_utils.has_attr "width" element.raw_attrs then
45
+
Message_collector.add_typed collector
46
+
(`Attr (`Not_allowed (`Attr "width", `Elem "source")));
47
+
if Attr_utils.has_attr "height" element.raw_attrs then
48
+
Message_collector.add_typed collector
49
+
(`Attr (`Not_allowed (`Attr "height", `Elem "source")))
50
+
| Picture | Other -> ())
51
+
| _ -> ()
59
52
60
-
let end_element state ~name ~namespace _collector =
61
-
if namespace <> None then ()
62
-
else begin
63
-
let name_lower = String.lowercase_ascii name in
64
-
match name_lower with
65
-
| "picture" | "video" | "audio" ->
66
-
(match state.context_stack with
67
-
| _ :: rest -> state.context_stack <- rest
68
-
| [] -> ())
69
-
| _ -> ()
70
-
end
53
+
let end_element state ~tag _collector =
54
+
match tag with
55
+
| Tag.Html (`Picture | `Video | `Audio) ->
56
+
(match state.context_stack with
57
+
| _ :: rest -> state.context_stack <- rest
58
+
| [] -> ())
59
+
| _ -> ()
71
60
72
61
let characters _state _text _collector = ()
73
62
+26
-31
lib/htmlrw_check/specialized/srcset_sizes_checker.ml
+26
-31
lib/htmlrw_check/specialized/srcset_sizes_checker.ml
···
960
960
Message_collector.add_typed collector
961
961
(`Attr (`Bad_value_generic (`Message (Printf.sprintf "Bad value \xe2\x80\x9c%s\xe2\x80\x9d for attribute \xe2\x80\x9csrcset\xe2\x80\x9d on element \xe2\x80\x9c%s\xe2\x80\x9d: Mixing width and density descriptors is not allowed." value element_name))))
962
962
963
-
let start_element _state ~name ~namespace ~attrs collector =
964
-
let name_lower = String.lowercase_ascii name in
965
-
966
-
(* SVG image elements should not have srcset *)
967
-
if namespace <> None && name_lower = "image" then begin
968
-
if Attr_utils.get_attr "srcset" attrs <> None then
963
+
let start_element _state ~element collector =
964
+
match element.Element.tag with
965
+
| Tag.Svg "image" ->
966
+
(* SVG image elements should not have srcset *)
967
+
if Attr_utils.get_attr "srcset" element.Element.raw_attrs <> None then
969
968
Message_collector.add_typed collector
970
969
(`Attr (`Not_allowed (`Attr "srcset", `Elem "image")))
971
-
end;
970
+
| Tag.Html (`Img | `Source as tag) ->
971
+
let name_lower = Tag.html_tag_to_string tag in
972
+
let attrs = element.raw_attrs in
973
+
let sizes_value = Attr_utils.get_attr "sizes" attrs in
974
+
let srcset_value = Attr_utils.get_attr "srcset" attrs in
975
+
let has_sizes = sizes_value <> None in
976
+
let has_srcset = srcset_value <> None in
972
977
973
-
if namespace <> None then ()
974
-
else begin
975
-
(* Check sizes and srcset on img and source *)
976
-
if name_lower = "img" || name_lower = "source" then begin
977
-
let sizes_value = Attr_utils.get_attr "sizes" attrs in
978
-
let srcset_value = Attr_utils.get_attr "srcset" attrs in
979
-
let has_sizes = sizes_value <> None in
980
-
let has_srcset = srcset_value <> None in
981
-
982
-
(* Validate sizes if present *)
983
-
(match sizes_value with
984
-
| Some v -> ignore (validate_sizes v name_lower collector)
985
-
| None -> ());
978
+
(* Validate sizes if present *)
979
+
(match sizes_value with
980
+
| Some v -> ignore (validate_sizes v name_lower collector)
981
+
| None -> ());
986
982
987
-
(* Validate srcset if present *)
988
-
(match srcset_value with
989
-
| Some v -> validate_srcset v name_lower has_sizes collector
990
-
| None -> ());
983
+
(* Validate srcset if present *)
984
+
(match srcset_value with
985
+
| Some v -> validate_srcset v name_lower has_sizes collector
986
+
| None -> ());
991
987
992
-
(* Error: sizes without srcset on img *)
993
-
if name_lower = "img" && has_sizes && not has_srcset then
994
-
Message_collector.add_typed collector
995
-
(`Srcset `Sizes_without_srcset)
996
-
end
997
-
end
988
+
(* Error: sizes without srcset on img *)
989
+
if name_lower = "img" && has_sizes && not has_srcset then
990
+
Message_collector.add_typed collector
991
+
(`Srcset `Sizes_without_srcset)
992
+
| _ -> () (* Other elements *)
998
993
999
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
994
+
let end_element _state ~tag:_ _collector = ()
1000
995
let characters _state _text _collector = ()
1001
996
let end_document _state _collector = ()
1002
997
+8
-5
lib/htmlrw_check/specialized/svg_checker.ml
+8
-5
lib/htmlrw_check/specialized/svg_checker.ml
···
30
30
state.fecomponenttransfer_stack <- []
31
31
32
32
(* SVG namespace - the DOM stores this as "svg" shorthand *)
33
-
let svg_ns = "svg"
33
+
let _svg_ns = "svg"
34
34
35
35
(* Full SVG namespace URL for validation *)
36
36
let svg_ns_url = "http://www.w3.org/2000/svg"
···
348
348
end
349
349
with Not_found -> ()
350
350
351
-
let start_element state ~name ~namespace ~attrs collector =
352
-
let is_svg_element = namespace = Some svg_ns in
351
+
let start_element state ~element collector =
352
+
let is_svg_element = match element.Element.tag with Tag.Svg _ -> true | _ -> false in
353
+
let name = Tag.tag_to_string element.tag in
354
+
let attrs = element.raw_attrs in
353
355
354
356
(* Track if we're in SVG context *)
355
357
if name = "svg" && is_svg_element then
···
448
450
| None -> ())
449
451
end
450
452
451
-
let end_element state ~name ~namespace collector =
452
-
let is_svg_element = namespace = Some svg_ns in
453
+
let end_element state ~tag collector =
454
+
let is_svg_element = match tag with Tag.Svg _ -> true | _ -> false in
455
+
let name = Tag.tag_to_string tag in
453
456
454
457
if is_svg_element || state.in_svg then begin
455
458
let name_lower = String.lowercase_ascii name in
+42
-41
lib/htmlrw_check/specialized/table_checker.ml
+42
-41
lib/htmlrw_check/specialized/table_checker.ml
···
688
688
689
689
let reset state = state.tables := []
690
690
691
-
let is_html_namespace = function
691
+
let _is_html_namespace = function
692
692
| None -> true (* HTML mode - no namespace specified *)
693
693
| Some ns -> ns = html_ns (* XHTML mode - check namespace *)
694
694
695
-
let start_element state ~name ~namespace ~attrs collector =
696
-
if is_html_namespace namespace then (
697
-
let name_lower = String.lowercase_ascii name in
698
-
match name_lower with
699
-
| "table" ->
700
-
(* Push a new table onto the stack *)
701
-
state.tables := make_table () :: !(state.tables)
702
-
| _ -> (
703
-
match !(state.tables) with
704
-
| [] -> ()
705
-
| table :: _ -> (
706
-
match name_lower with
707
-
| "td" -> start_cell table false attrs collector
708
-
| "th" -> start_cell table true attrs collector
709
-
| "tr" -> start_row table collector
710
-
| "tbody" | "thead" | "tfoot" -> start_row_group table name collector
711
-
| "col" -> start_col table attrs collector
712
-
| "colgroup" -> start_colgroup table attrs collector
713
-
| _ -> ())))
695
+
let start_element state ~element collector =
696
+
let attrs = element.Element.raw_attrs in
697
+
match element.tag with
698
+
| Tag.Html `Table ->
699
+
(* Push a new table onto the stack *)
700
+
state.tables := make_table () :: !(state.tables)
701
+
| Tag.Html tag -> (
702
+
match !(state.tables) with
703
+
| [] -> ()
704
+
| table :: _ -> (
705
+
match tag with
706
+
| `Td -> start_cell table false attrs collector
707
+
| `Th -> start_cell table true attrs collector
708
+
| `Tr -> start_row table collector
709
+
| `Tbody | `Thead | `Tfoot ->
710
+
let name = Tag.html_tag_to_string tag in
711
+
start_row_group table name collector
712
+
| `Col -> start_col table attrs collector
713
+
| `Colgroup -> start_colgroup table attrs collector
714
+
| _ -> ()))
715
+
| _ -> () (* Non-HTML elements *)
714
716
715
-
let end_element state ~name ~namespace collector =
716
-
if is_html_namespace namespace then (
717
-
let name_lower = String.lowercase_ascii name in
718
-
match name_lower with
719
-
| "table" -> (
720
-
match !(state.tables) with
721
-
| [] -> () (* End tag without start - ignore *)
722
-
| table :: rest ->
723
-
end_table table collector;
724
-
state.tables := rest)
725
-
| _ -> (
726
-
match !(state.tables) with
727
-
| [] -> ()
728
-
| table :: _ -> (
729
-
match name_lower with
730
-
| "td" | "th" -> end_cell table
731
-
| "tr" -> end_row table collector
732
-
| "tbody" | "thead" | "tfoot" -> end_row_group_handler table collector
733
-
| "col" -> end_col table
734
-
| "colgroup" -> end_colgroup table
735
-
| _ -> ())))
717
+
let end_element state ~tag collector =
718
+
match tag with
719
+
| Tag.Html `Table -> (
720
+
match !(state.tables) with
721
+
| [] -> () (* End tag without start - ignore *)
722
+
| table :: rest ->
723
+
end_table table collector;
724
+
state.tables := rest)
725
+
| Tag.Html html_tag -> (
726
+
match !(state.tables) with
727
+
| [] -> ()
728
+
| table :: _ -> (
729
+
match html_tag with
730
+
| `Td | `Th -> end_cell table
731
+
| `Tr -> end_row table collector
732
+
| `Tbody | `Thead | `Tfoot -> end_row_group_handler table collector
733
+
| `Col -> end_col table
734
+
| `Colgroup -> end_colgroup table
735
+
| _ -> ()))
736
+
| _ -> () (* Non-HTML elements *)
736
737
737
738
let characters _state _text _collector = ()
738
739
+26
-43
lib/htmlrw_check/specialized/title_checker.ml
+26
-43
lib/htmlrw_check/specialized/title_checker.ml
···
26
26
state.title_depth <- 0;
27
27
state.is_iframe_srcdoc <- false
28
28
29
-
let start_element state ~name ~namespace ~attrs collector =
30
-
ignore (collector, attrs);
31
-
if namespace <> None then ()
32
-
else begin
33
-
let name_lower = String.lowercase_ascii name in
34
-
match name_lower with
35
-
| "html" ->
36
-
(* Check if this is an iframe srcdoc - title is not required *)
37
-
(* We detect this by checking for srcdoc context - not directly checkable from HTML,
38
-
but we can assume normal HTML document for now *)
39
-
()
40
-
| "head" ->
41
-
state.in_head <- true
42
-
| "title" when state.in_head ->
43
-
state.has_title <- true;
44
-
state.in_title <- true;
45
-
state.title_has_content <- false;
46
-
state.title_depth <- 0
47
-
| _ -> ()
48
-
end;
29
+
let start_element state ~element _collector =
30
+
(match element.Element.tag with
31
+
| Tag.Html `Html -> ()
32
+
| Tag.Html `Head ->
33
+
state.in_head <- true
34
+
| Tag.Html `Title when state.in_head ->
35
+
state.has_title <- true;
36
+
state.in_title <- true;
37
+
state.title_has_content <- false;
38
+
state.title_depth <- 0
39
+
| _ -> ());
49
40
if state.in_title then
50
41
state.title_depth <- state.title_depth + 1
51
42
52
-
let end_element state ~name ~namespace collector =
53
-
if namespace <> None then ()
54
-
else begin
55
-
let name_lower = String.lowercase_ascii name in
56
-
57
-
if state.in_title then
58
-
state.title_depth <- state.title_depth - 1;
59
-
60
-
match name_lower with
61
-
| "title" when state.in_title && state.title_depth = 0 ->
62
-
(* Check if title was empty *)
63
-
if not state.title_has_content then
64
-
Message_collector.add_typed collector
65
-
(`Element (`Must_not_be_empty (`Elem "title")));
66
-
state.in_title <- false
67
-
| "head" ->
68
-
(* Check if head had a title element *)
69
-
if state.in_head && not state.has_title then
70
-
Message_collector.add_typed collector
71
-
(`Element (`Missing_child (`Parent "head", `Child "title")));
72
-
state.in_head <- false
73
-
| _ -> ()
74
-
end
43
+
let end_element state ~tag collector =
44
+
if state.in_title then
45
+
state.title_depth <- state.title_depth - 1;
46
+
match tag with
47
+
| Tag.Html `Title when state.in_title && state.title_depth = 0 ->
48
+
if not state.title_has_content then
49
+
Message_collector.add_typed collector
50
+
(`Element (`Must_not_be_empty (`Elem "title")));
51
+
state.in_title <- false
52
+
| Tag.Html `Head ->
53
+
if state.in_head && not state.has_title then
54
+
Message_collector.add_typed collector
55
+
(`Element (`Missing_child (`Parent "head", `Child "title")));
56
+
state.in_head <- false
57
+
| _ -> ()
75
58
76
59
let characters state text _collector =
77
60
if state.in_title then begin
+24
-26
lib/htmlrw_check/specialized/unknown_element_checker.ml
+24
-26
lib/htmlrw_check/specialized/unknown_element_checker.ml
···
67
67
let reset state =
68
68
state.stack <- []
69
69
70
-
let start_element state ~name ~namespace ~attrs:_ collector =
71
-
(* Only check HTML namespace elements *)
72
-
match namespace with
73
-
| Some _ -> () (* Skip SVG, MathML, etc. *)
74
-
| None ->
75
-
let name_lower = String.lowercase_ascii name in
70
+
let start_element state ~element collector =
71
+
match element.Element.tag with
72
+
| Tag.Unknown name ->
73
+
(* Get the parent element name *)
74
+
let parent = match state.stack with
75
+
| p :: _ -> p
76
+
| [] -> "document"
77
+
in
78
+
(* Produce error: unknown element not allowed as child *)
79
+
Message_collector.add_typed collector
80
+
(`Element (`Not_allowed_as_child (`Child name, `Parent parent)));
81
+
(* Push to stack for tracking *)
82
+
state.stack <- name :: state.stack
76
83
77
-
(* Check if element is unknown *)
78
-
if not (is_known_element name_lower) then begin
79
-
(* Get the parent element name *)
80
-
let parent = match state.stack with
81
-
| p :: _ -> p
82
-
| [] -> "document"
83
-
in
84
-
(* Produce error: unknown element not allowed as child *)
85
-
Message_collector.add_typed collector
86
-
(`Element (`Not_allowed_as_child (`Child name, `Parent parent)))
87
-
end;
84
+
| Tag.Html tag ->
85
+
let name_lower = String.lowercase_ascii (Tag.tag_to_string (Tag.Html tag)) in
86
+
state.stack <- name_lower :: state.stack
88
87
89
-
(* Always push to stack for tracking *)
90
-
state.stack <- name_lower :: state.stack
88
+
| _ -> () (* SVG, MathML, Custom elements are allowed *)
91
89
92
-
let end_element state ~name:_ ~namespace _ =
93
-
match namespace with
94
-
| Some _ -> ()
95
-
| None ->
96
-
match state.stack with
97
-
| _ :: rest -> state.stack <- rest
98
-
| [] -> () (* Stack underflow - shouldn't happen *)
90
+
let end_element state ~tag _ =
91
+
match tag with
92
+
| Tag.Html _ | Tag.Unknown _ ->
93
+
(match state.stack with
94
+
| _ :: rest -> state.stack <- rest
95
+
| [] -> ()) (* Stack underflow - shouldn't happen *)
96
+
| _ -> () (* SVG, MathML, Custom elements *)
99
97
100
98
let characters _state _text _collector = ()
101
99
+7
-5
lib/htmlrw_check/specialized/url_checker.ml
+7
-5
lib/htmlrw_check/specialized/url_checker.ml
···
741
741
if String.lowercase_ascii k = String.lowercase_ascii name then Some v else None
742
742
) attrs
743
743
744
-
let start_element _state ~name ~namespace ~attrs collector =
745
-
if namespace <> None then ()
746
-
else begin
744
+
let start_element _state ~element collector =
745
+
match element.Element.tag with
746
+
| Tag.Html _ ->
747
+
let name = Tag.tag_to_string element.tag in
747
748
let name_lower = String.lowercase_ascii name in
749
+
let attrs = element.raw_attrs in
748
750
(* Check URL attributes for elements that have them *)
749
751
(match List.assoc_opt name_lower url_attributes with
750
752
| None -> ()
···
808
810
| Some warn_msg -> Message_collector.add_typed collector (`Generic warn_msg)
809
811
| None -> ())
810
812
| _ -> ())
811
-
end
813
+
| _ -> () (* Non-HTML elements *)
812
814
813
-
let end_element _state ~name:_ ~namespace:_ _collector = ()
815
+
let end_element _state ~tag:_ _collector = ()
814
816
let characters _state _text _collector = ()
815
817
let end_document _state _collector = ()
816
818
+5
-4
lib/htmlrw_check/specialized/xhtml_content_checker.ml
+5
-4
lib/htmlrw_check/specialized/xhtml_content_checker.ml
···
52
52
Message_collector.add_typed collector (`Attr `Data_uppercase)
53
53
) attrs
54
54
55
-
let start_element state ~name ~namespace ~attrs collector =
56
-
ignore namespace;
55
+
let start_element state ~element collector =
56
+
let name = Tag.tag_to_string element.Element.tag in
57
57
let name_lower = String.lowercase_ascii name in
58
+
let attrs = element.raw_attrs in
58
59
59
60
(* Check data-* attributes for uppercase *)
60
61
check_data_attr_case attrs collector;
···
97
98
(* Push onto stack *)
98
99
state.element_stack <- name :: state.element_stack
99
100
100
-
let end_element state ~name ~namespace:_ _collector =
101
-
let name_lower = String.lowercase_ascii name in
101
+
let end_element state ~tag _collector =
102
+
let name_lower = String.lowercase_ascii (Tag.tag_to_string tag) in
102
103
(* Pop figure state if leaving a figure *)
103
104
if name_lower = "figure" then begin
104
105
match state.figure_stack with