OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Entity table generator for html5rw.
2 Reads WHATWG entities.json and generates OCaml code. *)
3
4(* Helper functions for jsont *)
5let json_object = function
6 | Jsont.Object (obj, _) -> obj
7 | _ -> failwith "Expected JSON object"
8
9let json_array = function
10 | Jsont.Array (arr, _) -> arr
11 | _ -> failwith "Expected JSON array"
12
13let json_number = function
14 | Jsont.Number (n, _) -> int_of_float n
15 | _ -> failwith "Expected JSON number"
16
17let json_mem name obj =
18 match List.find_opt (fun ((n, _), _) -> n = name) obj with
19 | Some (_, v) -> Some v
20 | None -> None
21
22let () =
23 let json_file = Sys.argv.(1) in
24 let out_file = Sys.argv.(2) in
25
26 let ic = open_in json_file in
27 let n = in_channel_length ic in
28 let s = really_input_string ic n in
29 close_in ic;
30
31 let json = match Jsont_bytesrw.decode_string Jsont.json s with
32 | Ok j -> j
33 | Error e -> failwith (Printf.sprintf "JSON parse error: %s" e)
34 in
35
36 let oc = open_out out_file in
37
38 (* Header *)
39 output_string oc "(* Auto-generated from entities.json - do not edit *)\n\n";
40
41 (* We need two tables:
42 1. Full entity table (name without & -> codepoints)
43 2. Legacy entities set (entities that can be used without semicolon) *)
44
45 let entities = ref [] in
46 let legacy = ref [] in
47
48 let entries = json_object json in
49 List.iter (fun ((name, _), value) ->
50 (* name is like "&" or "&" *)
51 let name_without_amp =
52 if String.length name > 0 && name.[0] = '&' then
53 String.sub name 1 (String.length name - 1)
54 else name
55 in
56 let has_semicolon =
57 String.length name_without_amp > 0 &&
58 name_without_amp.[String.length name_without_amp - 1] = ';'
59 in
60 let key =
61 if has_semicolon then
62 String.sub name_without_amp 0 (String.length name_without_amp - 1)
63 else
64 name_without_amp
65 in
66 let fields = json_object value in
67 let codepoints =
68 match json_mem "codepoints" fields with
69 | Some arr -> List.map json_number (json_array arr)
70 | None -> []
71 in
72 if codepoints <> [] then begin
73 entities := (key, codepoints, has_semicolon) :: !entities;
74 (* Legacy entities are those that appear without semicolon in the JSON *)
75 if not has_semicolon then
76 legacy := key :: !legacy
77 end
78 ) entries;
79
80 (* Remove duplicates - prefer semicolon version *)
81 let seen = Hashtbl.create 2500 in
82 let unique_entities =
83 List.filter (fun (key, _, has_semi) ->
84 if Hashtbl.mem seen key then begin
85 (* If we already have this key without semicolon, and this one has semicolon, replace *)
86 if has_semi then begin
87 Hashtbl.replace seen key true;
88 true
89 end else false
90 end else begin
91 Hashtbl.add seen key has_semi;
92 true
93 end
94 ) (List.rev !entities)
95 in
96
97 (* Sort for binary search *)
98 let sorted = List.sort (fun (a, _, _) (b, _, _) -> String.compare a b) unique_entities in
99
100 (* Generate codepoints to string function *)
101 output_string oc "let codepoints_to_string cps =\n";
102 output_string oc " let buf = Buffer.create 8 in\n";
103 output_string oc " List.iter (fun cp ->\n";
104 output_string oc " if cp <= 0x7F then\n";
105 output_string oc " Buffer.add_char buf (Char.chr cp)\n";
106 output_string oc " else if cp <= 0x7FF then begin\n";
107 output_string oc " Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));\n";
108 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
109 output_string oc " end else if cp <= 0xFFFF then begin\n";
110 output_string oc " Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));\n";
111 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n";
112 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
113 output_string oc " end else begin\n";
114 output_string oc " Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));\n";
115 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));\n";
116 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n";
117 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
118 output_string oc " end\n";
119 output_string oc " ) cps;\n";
120 output_string oc " Buffer.contents buf\n\n";
121
122 (* Generate the entity array for binary search *)
123 output_string oc "let entities = [|\n";
124 List.iter (fun (name, codepoints, _) ->
125 let cps_str = String.concat "; " (List.map string_of_int codepoints) in
126 Printf.fprintf oc " (%S, [%s]);\n" name cps_str
127 ) sorted;
128 output_string oc "|]\n\n";
129
130 (* Binary search lookup *)
131 output_string oc "let lookup name =\n";
132 output_string oc " let rec search lo hi =\n";
133 output_string oc " if lo > hi then None\n";
134 output_string oc " else begin\n";
135 output_string oc " let mid = (lo + hi) / 2 in\n";
136 output_string oc " let (key, cps) = entities.(mid) in\n";
137 output_string oc " let cmp = String.compare name key in\n";
138 output_string oc " if cmp = 0 then Some (codepoints_to_string cps)\n";
139 output_string oc " else if cmp < 0 then search lo (mid - 1)\n";
140 output_string oc " else search (mid + 1) hi\n";
141 output_string oc " end\n";
142 output_string oc " in\n";
143 output_string oc " search 0 (Array.length entities - 1)\n\n";
144
145 (* Generate legacy entities set *)
146 let legacy_sorted = List.sort_uniq String.compare !legacy in
147 output_string oc "let legacy_entities = [|\n";
148 List.iter (fun name ->
149 Printf.fprintf oc " %S;\n" name
150 ) legacy_sorted;
151 output_string oc "|]\n\n";
152
153 output_string oc "let is_legacy name =\n";
154 output_string oc " let rec search lo hi =\n";
155 output_string oc " if lo > hi then false\n";
156 output_string oc " else begin\n";
157 output_string oc " let mid = (lo + hi) / 2 in\n";
158 output_string oc " let cmp = String.compare name legacy_entities.(mid) in\n";
159 output_string oc " if cmp = 0 then true\n";
160 output_string oc " else if cmp < 0 then search lo (mid - 1)\n";
161 output_string oc " else search (mid + 1) hi\n";
162 output_string oc " end\n";
163 output_string oc " in\n";
164 output_string oc " search 0 (Array.length legacy_entities - 1)\n";
165
166 close_out oc;
167 Printf.printf "Generated %s with %d entities (%d legacy)\n"
168 out_file (List.length sorted) (List.length legacy_sorted)