OCaml HTML5 parser/serialiser based on Python's JustHTML
at main 6.4 kB view raw
1(* Entity table generator for html5rw. 2 Reads WHATWG entities.json and generates OCaml code. *) 3 4(* Helper functions for jsont *) 5let json_object = function 6 | Jsont.Object (obj, _) -> obj 7 | _ -> failwith "Expected JSON object" 8 9let json_array = function 10 | Jsont.Array (arr, _) -> arr 11 | _ -> failwith "Expected JSON array" 12 13let json_number = function 14 | Jsont.Number (n, _) -> int_of_float n 15 | _ -> failwith "Expected JSON number" 16 17let json_mem name obj = 18 match List.find_opt (fun ((n, _), _) -> n = name) obj with 19 | Some (_, v) -> Some v 20 | None -> None 21 22let () = 23 let json_file = Sys.argv.(1) in 24 let out_file = Sys.argv.(2) in 25 26 let ic = open_in json_file in 27 let n = in_channel_length ic in 28 let s = really_input_string ic n in 29 close_in ic; 30 31 let json = match Jsont_bytesrw.decode_string Jsont.json s with 32 | Ok j -> j 33 | Error e -> failwith (Printf.sprintf "JSON parse error: %s" e) 34 in 35 36 let oc = open_out out_file in 37 38 (* Header *) 39 output_string oc "(* Auto-generated from entities.json - do not edit *)\n\n"; 40 41 (* We need two tables: 42 1. Full entity table (name without & -> codepoints) 43 2. Legacy entities set (entities that can be used without semicolon) *) 44 45 let entities = ref [] in 46 let legacy = ref [] in 47 48 let entries = json_object json in 49 List.iter (fun ((name, _), value) -> 50 (* name is like "&amp;" or "&amp" *) 51 let name_without_amp = 52 if String.length name > 0 && name.[0] = '&' then 53 String.sub name 1 (String.length name - 1) 54 else name 55 in 56 let has_semicolon = 57 String.length name_without_amp > 0 && 58 name_without_amp.[String.length name_without_amp - 1] = ';' 59 in 60 let key = 61 if has_semicolon then 62 String.sub name_without_amp 0 (String.length name_without_amp - 1) 63 else 64 name_without_amp 65 in 66 let fields = json_object value in 67 let codepoints = 68 match json_mem "codepoints" fields with 69 | Some arr -> List.map json_number (json_array arr) 70 | None -> [] 71 in 72 if codepoints <> [] then begin 73 entities := (key, codepoints, has_semicolon) :: !entities; 74 (* Legacy entities are those that appear without semicolon in the JSON *) 75 if not has_semicolon then 76 legacy := key :: !legacy 77 end 78 ) entries; 79 80 (* Remove duplicates - prefer semicolon version *) 81 let seen = Hashtbl.create 2500 in 82 let unique_entities = 83 List.filter (fun (key, _, has_semi) -> 84 if Hashtbl.mem seen key then begin 85 (* If we already have this key without semicolon, and this one has semicolon, replace *) 86 if has_semi then begin 87 Hashtbl.replace seen key true; 88 true 89 end else false 90 end else begin 91 Hashtbl.add seen key has_semi; 92 true 93 end 94 ) (List.rev !entities) 95 in 96 97 (* Sort for binary search *) 98 let sorted = List.sort (fun (a, _, _) (b, _, _) -> String.compare a b) unique_entities in 99 100 (* Generate codepoints to string function *) 101 output_string oc "let codepoints_to_string cps =\n"; 102 output_string oc " let buf = Buffer.create 8 in\n"; 103 output_string oc " List.iter (fun cp ->\n"; 104 output_string oc " if cp <= 0x7F then\n"; 105 output_string oc " Buffer.add_char buf (Char.chr cp)\n"; 106 output_string oc " else if cp <= 0x7FF then begin\n"; 107 output_string oc " Buffer.add_char buf (Char.chr (0xC0 lor (cp lsr 6)));\n"; 108 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n"; 109 output_string oc " end else if cp <= 0xFFFF then begin\n"; 110 output_string oc " Buffer.add_char buf (Char.chr (0xE0 lor (cp lsr 12)));\n"; 111 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n"; 112 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n"; 113 output_string oc " end else begin\n"; 114 output_string oc " Buffer.add_char buf (Char.chr (0xF0 lor (cp lsr 18)));\n"; 115 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 12) land 0x3F)));\n"; 116 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor ((cp lsr 6) land 0x3F)));\n"; 117 output_string oc " Buffer.add_char buf (Char.chr (0x80 lor (cp land 0x3F)))\n"; 118 output_string oc " end\n"; 119 output_string oc " ) cps;\n"; 120 output_string oc " Buffer.contents buf\n\n"; 121 122 (* Generate the entity array for binary search *) 123 output_string oc "let entities = [|\n"; 124 List.iter (fun (name, codepoints, _) -> 125 let cps_str = String.concat "; " (List.map string_of_int codepoints) in 126 Printf.fprintf oc " (%S, [%s]);\n" name cps_str 127 ) sorted; 128 output_string oc "|]\n\n"; 129 130 (* Binary search lookup *) 131 output_string oc "let lookup name =\n"; 132 output_string oc " let rec search lo hi =\n"; 133 output_string oc " if lo > hi then None\n"; 134 output_string oc " else begin\n"; 135 output_string oc " let mid = (lo + hi) / 2 in\n"; 136 output_string oc " let (key, cps) = entities.(mid) in\n"; 137 output_string oc " let cmp = String.compare name key in\n"; 138 output_string oc " if cmp = 0 then Some (codepoints_to_string cps)\n"; 139 output_string oc " else if cmp < 0 then search lo (mid - 1)\n"; 140 output_string oc " else search (mid + 1) hi\n"; 141 output_string oc " end\n"; 142 output_string oc " in\n"; 143 output_string oc " search 0 (Array.length entities - 1)\n\n"; 144 145 (* Generate legacy entities set *) 146 let legacy_sorted = List.sort_uniq String.compare !legacy in 147 output_string oc "let legacy_entities = [|\n"; 148 List.iter (fun name -> 149 Printf.fprintf oc " %S;\n" name 150 ) legacy_sorted; 151 output_string oc "|]\n\n"; 152 153 output_string oc "let is_legacy name =\n"; 154 output_string oc " let rec search lo hi =\n"; 155 output_string oc " if lo > hi then false\n"; 156 output_string oc " else begin\n"; 157 output_string oc " let mid = (lo + hi) / 2 in\n"; 158 output_string oc " let cmp = String.compare name legacy_entities.(mid) in\n"; 159 output_string oc " if cmp = 0 then true\n"; 160 output_string oc " else if cmp < 0 then search lo (mid - 1)\n"; 161 output_string oc " else search (mid + 1) hi\n"; 162 output_string oc " end\n"; 163 output_string oc " in\n"; 164 output_string oc " search 0 (Array.length legacy_entities - 1)\n"; 165 166 close_out oc; 167 Printf.printf "Generated %s with %d entities (%d legacy)\n" 168 out_file (List.length sorted) (List.length legacy_sorted)