(* Entity table generator for html5rw. Reads WHATWG entities.json and generates OCaml code. *) (* Helper functions for jsont *) let json_object = function | Jsont.Object (obj, _) -> obj | _ -> failwith "Expected JSON object" let json_array = function | Jsont.Array (arr, _) -> arr | _ -> failwith "Expected JSON array" let json_number = function | Jsont.Number (n, _) -> int_of_float n | _ -> failwith "Expected JSON number" let json_mem name obj = match List.find_opt (fun ((n, _), _) -> n = name) obj with | Some (_, v) -> Some v | None -> None let () = let json_file = Sys.argv.(1) in let out_file = Sys.argv.(2) in let ic = open_in json_file in let n = in_channel_length ic in let s = really_input_string ic n in close_in ic; let json = match Jsont_bytesrw.decode_string Jsont.json s with | Ok j -> j | Error e -> failwith (Printf.sprintf "JSON parse error: %s" e) in let oc = open_out out_file in (* Header *) output_string oc "(* Auto-generated from entities.json - do not edit *)\n\n"; (* We need two tables: 1. Full entity table (name without & -> codepoints) 2. Legacy entities set (entities that can be used without semicolon) *) let entities = ref [] in let legacy = ref [] in let entries = json_object json in List.iter (fun ((name, _), value) -> (* name is like "&" or "&" *) let name_without_amp = if String.length name > 0 && name.[0] = '&' then String.sub name 1 (String.length name - 1) else name in let has_semicolon = String.length name_without_amp > 0 && name_without_amp.[String.length name_without_amp - 1] = ';' in let key = if has_semicolon then String.sub name_without_amp 0 (String.length name_without_amp - 1) else name_without_amp in let fields = json_object value in let codepoints = match json_mem "codepoints" fields with | Some arr -> List.map json_number (json_array arr) | None -> [] in if codepoints <> [] then begin entities := (key, codepoints, has_semicolon) :: !entities; (* Legacy entities are those that appear without semicolon in the JSON *) if not has_semicolon then legacy := key :: !legacy end ) entries; (* Remove duplicates - prefer semicolon version *) let seen = Hashtbl.create 2500 in let unique_entities = List.filter (fun (key, _, has_semi) -> if Hashtbl.mem seen key then begin (* If we already have this key without semicolon, and this one has semicolon, replace *) if has_semi then begin Hashtbl.replace seen key true; true end else false end else begin Hashtbl.add seen key has_semi; true end ) (List.rev !entities) in (* Sort for binary search *) let sorted = List.sort (fun (a, _, _) (b, _, _) -> String.compare a b) unique_entities in (* Generate codepoints to string function *) output_string oc "let codepoints_to_string cps =\n"; output_string oc " let buf = Buffer.create 8 in\n"; output_string oc " List.iter (fun cp ->\n"; output_string oc " if cp <= 0x7F then\n"; output_string oc " Buffer.add_char buf (Char.chr cp)\n"; output_string oc " else if cp <= 0x7FF then begin\n"; output_string oc " Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));\n"; output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n"; output_string oc " end else if cp <= 0xFFFF then begin\n"; output_string oc " Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));\n"; output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n"; output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n"; output_string oc " end else begin\n"; output_string oc " Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));\n"; output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));\n"; output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n"; output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n"; output_string oc " end\n"; output_string oc " ) cps;\n"; output_string oc " Buffer.contents buf\n\n"; (* Generate the entity array for binary search *) output_string oc "let entities = [|\n"; List.iter (fun (name, codepoints, _) -> let cps_str = String.concat "; " (List.map string_of_int codepoints) in Printf.fprintf oc " (%S, [%s]);\n" name cps_str ) sorted; output_string oc "|]\n\n"; (* Binary search lookup *) output_string oc "let lookup name =\n"; output_string oc " let rec search lo hi =\n"; output_string oc " if lo > hi then None\n"; output_string oc " else begin\n"; output_string oc " let mid = (lo + hi) / 2 in\n"; output_string oc " let (key, cps) = entities.(mid) in\n"; output_string oc " let cmp = String.compare name key in\n"; output_string oc " if cmp = 0 then Some (codepoints_to_string cps)\n"; output_string oc " else if cmp < 0 then search lo (mid - 1)\n"; output_string oc " else search (mid + 1) hi\n"; output_string oc " end\n"; output_string oc " in\n"; output_string oc " search 0 (Array.length entities - 1)\n\n"; (* Generate legacy entities set *) let legacy_sorted = List.sort_uniq String.compare !legacy in output_string oc "let legacy_entities = [|\n"; List.iter (fun name -> Printf.fprintf oc " %S;\n" name ) legacy_sorted; output_string oc "|]\n\n"; output_string oc "let is_legacy name =\n"; output_string oc " let rec search lo hi =\n"; output_string oc " if lo > hi then false\n"; output_string oc " else begin\n"; output_string oc " let mid = (lo + hi) / 2 in\n"; output_string oc " let cmp = String.compare name legacy_entities.(mid) in\n"; output_string oc " if cmp = 0 then true\n"; output_string oc " else if cmp < 0 then search lo (mid - 1)\n"; output_string oc " else search (mid + 1) hi\n"; output_string oc " end\n"; output_string oc " in\n"; output_string oc " search 0 (Array.length legacy_entities - 1)\n"; close_out oc; Printf.printf "Generated %s with %d entities (%d legacy)\n" out_file (List.length sorted) (List.length legacy_sorted)