(* Entity table generator for html5rw.
Reads WHATWG entities.json and generates OCaml code. *)
(* Helper functions for jsont *)
let json_object = function
| Jsont.Object (obj, _) -> obj
| _ -> failwith "Expected JSON object"
let json_array = function
| Jsont.Array (arr, _) -> arr
| _ -> failwith "Expected JSON array"
let json_number = function
| Jsont.Number (n, _) -> int_of_float n
| _ -> failwith "Expected JSON number"
let json_mem name obj =
match List.find_opt (fun ((n, _), _) -> n = name) obj with
| Some (_, v) -> Some v
| None -> None
let () =
let json_file = Sys.argv.(1) in
let out_file = Sys.argv.(2) in
let ic = open_in json_file in
let n = in_channel_length ic in
let s = really_input_string ic n in
close_in ic;
let json = match Jsont_bytesrw.decode_string Jsont.json s with
| Ok j -> j
| Error e -> failwith (Printf.sprintf "JSON parse error: %s" e)
in
let oc = open_out out_file in
(* Header *)
output_string oc "(* Auto-generated from entities.json - do not edit *)\n\n";
(* We need two tables:
1. Full entity table (name without & -> codepoints)
2. Legacy entities set (entities that can be used without semicolon) *)
let entities = ref [] in
let legacy = ref [] in
let entries = json_object json in
List.iter (fun ((name, _), value) ->
(* name is like "&" or "&" *)
let name_without_amp =
if String.length name > 0 && name.[0] = '&' then
String.sub name 1 (String.length name - 1)
else name
in
let has_semicolon =
String.length name_without_amp > 0 &&
name_without_amp.[String.length name_without_amp - 1] = ';'
in
let key =
if has_semicolon then
String.sub name_without_amp 0 (String.length name_without_amp - 1)
else
name_without_amp
in
let fields = json_object value in
let codepoints =
match json_mem "codepoints" fields with
| Some arr -> List.map json_number (json_array arr)
| None -> []
in
if codepoints <> [] then begin
entities := (key, codepoints, has_semicolon) :: !entities;
(* Legacy entities are those that appear without semicolon in the JSON *)
if not has_semicolon then
legacy := key :: !legacy
end
) entries;
(* Remove duplicates - prefer semicolon version *)
let seen = Hashtbl.create 2500 in
let unique_entities =
List.filter (fun (key, _, has_semi) ->
if Hashtbl.mem seen key then begin
(* If we already have this key without semicolon, and this one has semicolon, replace *)
if has_semi then begin
Hashtbl.replace seen key true;
true
end else false
end else begin
Hashtbl.add seen key has_semi;
true
end
) (List.rev !entities)
in
(* Sort for binary search *)
let sorted = List.sort (fun (a, _, _) (b, _, _) -> String.compare a b) unique_entities in
(* Generate codepoints to string function *)
output_string oc "let codepoints_to_string cps =\n";
output_string oc " let buf = Buffer.create 8 in\n";
output_string oc " List.iter (fun cp ->\n";
output_string oc " if cp <= 0x7F then\n";
output_string oc " Buffer.add_char buf (Char.chr cp)\n";
output_string oc " else if cp <= 0x7FF then begin\n";
output_string oc " Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));\n";
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
output_string oc " end else if cp <= 0xFFFF then begin\n";
output_string oc " Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));\n";
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n";
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
output_string oc " end else begin\n";
output_string oc " Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));\n";
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));\n";
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n";
output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n";
output_string oc " end\n";
output_string oc " ) cps;\n";
output_string oc " Buffer.contents buf\n\n";
(* Generate the entity array for binary search *)
output_string oc "let entities = [|\n";
List.iter (fun (name, codepoints, _) ->
let cps_str = String.concat "; " (List.map string_of_int codepoints) in
Printf.fprintf oc " (%S, [%s]);\n" name cps_str
) sorted;
output_string oc "|]\n\n";
(* Binary search lookup *)
output_string oc "let lookup name =\n";
output_string oc " let rec search lo hi =\n";
output_string oc " if lo > hi then None\n";
output_string oc " else begin\n";
output_string oc " let mid = (lo + hi) / 2 in\n";
output_string oc " let (key, cps) = entities.(mid) in\n";
output_string oc " let cmp = String.compare name key in\n";
output_string oc " if cmp = 0 then Some (codepoints_to_string cps)\n";
output_string oc " else if cmp < 0 then search lo (mid - 1)\n";
output_string oc " else search (mid + 1) hi\n";
output_string oc " end\n";
output_string oc " in\n";
output_string oc " search 0 (Array.length entities - 1)\n\n";
(* Generate legacy entities set *)
let legacy_sorted = List.sort_uniq String.compare !legacy in
output_string oc "let legacy_entities = [|\n";
List.iter (fun name ->
Printf.fprintf oc " %S;\n" name
) legacy_sorted;
output_string oc "|]\n\n";
output_string oc "let is_legacy name =\n";
output_string oc " let rec search lo hi =\n";
output_string oc " if lo > hi then false\n";
output_string oc " else begin\n";
output_string oc " let mid = (lo + hi) / 2 in\n";
output_string oc " let cmp = String.compare name legacy_entities.(mid) in\n";
output_string oc " if cmp = 0 then true\n";
output_string oc " else if cmp < 0 then search lo (mid - 1)\n";
output_string oc " else search (mid + 1) hi\n";
output_string oc " end\n";
output_string oc " in\n";
output_string oc " search 0 (Array.length legacy_entities - 1)\n";
close_out oc;
Printf.printf "Generated %s with %d entities (%d legacy)\n"
out_file (List.length sorted) (List.length legacy_sorted)