OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Ruby element content model validation checker.
2
3 Validates that:
4 - Ruby contains at least one rt element
5 - Ruby contains phrasing content before rt elements *)
6
7type ruby_info = {
8 mutable has_rt : bool;
9 mutable has_content_before_rt : bool;
10 mutable saw_rt : bool; (* Whether we've seen rt yet *)
11 mutable depth : int; (* Track nesting level *)
12}
13
14type state = {
15 mutable ruby_stack : ruby_info list; (* Stack for nested ruby elements *)
16 mutable in_template : int;
17}
18
19let create () = {
20 ruby_stack = [];
21 in_template = 0;
22}
23
24let reset state =
25 state.ruby_stack <- [];
26 state.in_template <- 0
27
28(** Check if element is phrasing content that can appear before rt *)
29let is_phrasing_content tag =
30 match tag with
31 | Tag.Html `Rt | Tag.Html `Rp -> false
32 | _ -> true
33
34let start_element state ~element _collector =
35 match element.Element.tag with
36 | Tag.Html `Template ->
37 state.in_template <- state.in_template + 1
38
39 | Tag.Html `Ruby when state.in_template = 0 ->
40 (* Push new ruby context *)
41 let info = {
42 has_rt = false;
43 has_content_before_rt = false;
44 saw_rt = false;
45 depth = 1; (* Set depth to 1 for the ruby element itself *)
46 } in
47 state.ruby_stack <- info :: state.ruby_stack
48
49 | tag when state.in_template = 0 ->
50 (match state.ruby_stack with
51 | info :: _ ->
52 (* Inside a ruby element *)
53 if info.depth = 1 then begin
54 (* Direct children of ruby *)
55 match tag with
56 | Tag.Html `Rt ->
57 info.has_rt <- true;
58 info.saw_rt <- true
59 | _ when is_phrasing_content tag ->
60 if not info.saw_rt then
61 info.has_content_before_rt <- true
62 | _ -> ()
63 end;
64 info.depth <- info.depth + 1
65 | [] -> ())
66
67 | _ -> () (* In template or non-HTML element *)
68
69let end_element state ~tag collector =
70 match tag with
71 | Tag.Html `Template when state.in_template > 0 ->
72 state.in_template <- state.in_template - 1
73
74 | Tag.Html `Ruby when state.in_template = 0 ->
75 (match state.ruby_stack with
76 | info :: rest ->
77 info.depth <- info.depth - 1;
78 (* Check if this is the closing ruby tag (depth becomes 0 when ruby closes) *)
79 if info.depth <= 0 then begin
80 (* Closing ruby element - validate *)
81 if not info.has_rt then
82 (* Empty ruby or ruby without any rt - needs rp or rt *)
83 Message_collector.add_typed collector
84 (`Element (`Missing_child_one_of (`Parent "ruby", `Children ["rp"; "rt"])))
85 else if not info.has_content_before_rt then
86 (* Has rt but missing content before it - needs content *)
87 Message_collector.add_typed collector
88 (`Element (`Missing_child (`Parent "ruby", `Child "rt")));
89 state.ruby_stack <- rest
90 end
91 | [] -> ())
92
93 | _ when state.in_template = 0 ->
94 (match state.ruby_stack with
95 | info :: _ ->
96 info.depth <- info.depth - 1
97 | [] -> ())
98
99 | _ -> () (* In template or non-HTML element *)
100
101let characters state text _collector =
102 (* Text content counts as phrasing content before rt *)
103 if state.in_template > 0 then ()
104 else begin
105 match state.ruby_stack with
106 | info :: _ ->
107 if info.depth = 1 then begin
108 (* Direct text child of ruby *)
109 let has_non_whitespace =
110 String.exists (fun c ->
111 c <> ' ' && c <> '\t' && c <> '\n' && c <> '\r'
112 ) text
113 in
114 if has_non_whitespace && not info.saw_rt then
115 info.has_content_before_rt <- true
116 end
117 | [] -> ()
118 end
119
120let checker = Checker.make ~create ~reset ~start_element ~end_element
121 ~characters ()