OCaml HTML5 parser/serialiser based on Python's JustHTML
(* Profile generator - converts JSON language profiles to OCaml module *)

(** [read_file path] returns the entire contents of the file at [path].

    The channel is opened in binary mode: [in_channel_length] reports the
    on-disk size and does not account for end-of-line translation, so a
    text-mode read of exactly that many characters can hit end of file.
    The channel is closed whether or not the read succeeds.

    @raise Sys_error if the file cannot be opened or read. *)
let read_file path =
  let ic = open_in_bin path in
  match really_input_string ic (in_channel_length ic) with
  | s ->
    close_in ic;
    s
  | exception e ->
    close_in_noerr ic;
    raise e

(** [write_file path contents] creates (or truncates) [path] and writes
    [contents] to it, closing the channel on both the success and the
    failure path.

    @raise Sys_error if the file cannot be opened or written. *)
let write_file path contents =
  let oc = open_out path in
  match output_string oc contents with
  | () -> close_out oc
  | exception e ->
    close_out_noerr oc;
    raise e

(** Simple JSON parser for profile format [{"freq": {...}}].

    This is not a general JSON parser. It takes the text between the second
    ['{'] of [content] and the last ['}'], and collects every
    ["key": <digits>] pair found in it. Quoted strings that are not followed
    by a run of decimal digits contribute nothing.

    Keys are returned exactly as written between the quotes: backslash
    escapes are stepped over so that an escaped quote does not end the key,
    but they are not decoded.

    The result lists the pairs in reverse order of appearance.

    @raise Failure if a brace is missing, if a key string is not terminated,
    or if a number does not fit in an [int]. *)
let parse_freq_json content =
  let after_open_brace s missing =
    match String.index s '{' with
    | pos -> pos + 1
    | exception Not_found -> failwith missing
  in
  (* Step inside the outer object ... *)
  let outer_start = after_open_brace content "No opening brace" in
  let content =
    String.sub content outer_start (String.length content - outer_start)
  in
  (* ... then inside the first nested object. *)
  let inner_start = after_open_brace content "No freq object" in
  let inner_end =
    match String.rindex content '}' with
    | pos -> pos
    | exception Not_found -> failwith "No closing brace"
  in
  (* A '}' that precedes the inner '{' does not close it; without this check
     String.sub would be handed a negative length. *)
  if inner_end < inner_start then failwith "No closing brace";
  let inner = String.sub content inner_start (inner_end - inner_start) in
  let len = String.length inner in
  (* Index of the first character at or after [i] that fails [pred]. *)
  let rec skip_while pred i =
    if i < len && pred inner.[i] then skip_while pred (i + 1) else i
  in
  let is_separator c =
    c = ' ' || c = '\n' || c = '\r' || c = '\t' || c = ','
  in
  let is_colon_or_space c = c = ':' || c = ' ' in
  let is_digit c = c >= '0' && c <= '9' in
  (* Index of the quote that closes a key starting at [i]. A backslash skips
     the character after it; running off the end is reported explicitly
     rather than surfacing as an out-of-bounds String.sub. *)
  let rec closing_quote i =
    if i >= len then failwith "Unterminated key string"
    else
      match inner.[i] with
      | '"' -> i
      | '\\' -> closing_quote (i + 2)
      | _ -> closing_quote (i + 1)
  in
  let rec collect acc i =
    let i = skip_while is_separator i in
    if i >= len then acc
    else if inner.[i] <> '"' then
      (* Anything that is not the start of a key is skipped one character
         at a time. *)
      collect acc (i + 1)
    else begin
      let key_start = i + 1 in
      let key_stop = closing_quote key_start in
      let key = String.sub inner key_start (key_stop - key_start) in
      let num_start = skip_while is_colon_or_space (key_stop + 1) in
      let num_stop = skip_while is_digit num_start in
      let acc =
        if num_stop > num_start then begin
          let digits = String.sub inner num_start (num_stop - num_start) in
          (key, int_of_string digits) :: acc
        end
        else acc
      in
      collect acc num_stop
    end
  in
  collect [] 0

(** Escape string for OCaml, preserving UTF-8 characters.

    Double quotes, backslashes, newlines, carriage returns and tabs get their
    usual backslash escapes; any other byte below 32 is written as [\xNN].
    Every other byte, including those of multi-byte UTF-8 sequences, is
    copied through unchanged. *)
let escape_ocaml_string s =
  let buf = Buffer.create (String.length s * 2) in
  let add_escaped = function
    | '"' -> Buffer.add_string buf "\\\""
    | '\\' -> Buffer.add_string buf "\\\\"
    | '\n' -> Buffer.add_string buf "\\n"
    | '\r' -> Buffer.add_string buf "\\r"
    | '\t' -> Buffer.add_string buf "\\t"
    | '\x00' .. '\x1f' as c -> Printf.bprintf buf "\\x%02x" (Char.code c)
    | c -> Buffer.add_char buf c
  in
  String.iter add_escaped s;
  Buffer.contents buf

(** [generate_profile_module lang_code pairs] returns the source text of an
    OCaml module that defines [lang] (the given code) and [freq] (an
    association list of n-gram / count pairs).

    [pairs] is emitted in reverse, which restores file order when it comes
    straight from {!parse_freq_json}. *)
let generate_profile_module lang_code pairs =
  let buf = Buffer.create 65536 in
  Buffer.add_string buf
    "(* Auto-generated language profile - do not edit *)\n\n";
  Printf.bprintf buf "let lang = %S\n\n" lang_code;
  Buffer.add_string buf "let freq = [\n";
  List.iter
    (fun (ngram, count) ->
      (* Use custom escaping (not %S) to preserve UTF-8 *)
      Printf.bprintf buf " (\"%s\", %d);\n" (escape_ocaml_string ngram) count)
    (List.rev pairs);
  Buffer.add_string buf "]\n";
  Buffer.contents buf

(* Entry point: <profiles_dir> <output_dir>.

   Every non-directory entry of <profiles_dir> is parsed as a profile and
   written to <output_dir>/profile_<code>.ml, where <code> is the entry name
   with '-' replaced by '_'. An index module, <output_dir>/profiles.ml, then
   lists all generated profiles. The first profile that fails to process
   aborts the run with exit status 1. *)
let () =
  if Array.length Sys.argv < 3 then begin
    Printf.eprintf "Usage: %s <profiles_dir> <output_dir>\n" Sys.argv.(0);
    exit 1
  end;

  let profiles_dir = Sys.argv.(1) in
  let output_dir = Sys.argv.(2) in

  (* Process each profile, accumulating (original name, module suffix). *)
  let process_entry acc entry =
    let path = Filename.concat profiles_dir entry in
    if Sys.is_directory path then acc
    else begin
      try
        let pairs = parse_freq_json (read_file path) in
        let lang_code =
          (* Normalize lang code: zh-cn -> zh_cn *)
          String.map (fun c -> if c = '-' then '_' else c) entry
        in
        let out_path =
          Filename.concat output_dir
            (Printf.sprintf "profile_%s.ml" lang_code)
        in
        write_file out_path (generate_profile_module entry pairs);
        (entry, lang_code) :: acc
      with e ->
        Printf.eprintf "Error processing %s: %s\n%!" entry
          (Printexc.to_string e);
        exit 1
    end
  in
  let lang_codes =
    Array.fold_left process_entry [] (Sys.readdir profiles_dir)
  in

  (* Sort language codes for deterministic output *)
  let sorted_codes =
    List.sort (fun (a, _) (b, _) -> String.compare a b) lang_codes
  in

  (* Generate profiles index module *)
  let index = Buffer.create 4096 in
  Buffer.add_string index
    "(* Auto-generated profiles index - do not edit *)\n\n";
  Buffer.add_string index "let all_profiles = [\n";
  List.iter
    (fun (orig_code, ml_code) ->
      Printf.bprintf index " (%S, Profile_%s.freq);\n" orig_code ml_code)
    sorted_codes;
  Buffer.add_string index "]\n";
  write_file (Filename.concat output_dir "profiles.ml") (Buffer.contents index)