OCaml HTML5 parser/serialiser based on Python's JustHTML
1(* Profile generator - converts JSON language profiles to OCaml module *)
2
(** [read_file path] returns the entire contents of the file at [path].

    The file is opened in binary mode so that the length reported by
    [in_channel_length] always matches the number of bytes that
    [really_input_string] can actually read (text mode may translate line
    endings on some platforms).  The channel is closed even when reading
    fails.

    @raise Sys_error if the file cannot be opened or read.
    @raise End_of_file if fewer bytes than the reported length are available. *)
let read_file path =
  let ic = open_in_bin path in
  Fun.protect
    ~finally:(fun () -> close_in_noerr ic)
    (fun () -> really_input_string ic (in_channel_length ic))
9
(* Simple JSON parser for profile format {"freq": {...}} *)

(* Numeric value of a hexadecimal digit, or [None] for any other character. *)
let json_hex_digit c =
  match c with
  | '0' .. '9' -> Some (Char.code c - Char.code '0')
  | 'a' .. 'f' -> Some (Char.code c - Char.code 'a' + 10)
  | 'A' .. 'F' -> Some (Char.code c - Char.code 'A' + 10)
  | _ -> None

(* Value of the four hex digits of [s] starting at [pos], or [None] when
   fewer than four characters remain or one of them is not a hex digit. *)
let json_hex4 s pos =
  if pos + 4 > String.length s then None
  else
    let rec go k acc =
      if k = 4 then Some acc
      else
        match json_hex_digit s.[pos + k] with
        | Some d -> go (k + 1) ((acc lsl 4) lor d)
        | None -> None
    in
    go 0 0

(* Decode the JSON escape sequences found in [raw], the text between the
   quotes of a JSON string, producing UTF-8.  Backslash-u escapes (including
   surrogate pairs) become the UTF-8 encoding of the code point; a lone
   surrogate becomes U+FFFD.  Unknown or truncated escapes are kept verbatim
   so that no input text is silently lost. *)
let unescape_json_string raw =
  if not (String.contains raw '\\') then raw
  else begin
    let n = String.length raw in
    let buf = Buffer.create n in
    let add_code_point cp =
      Buffer.add_utf_8_uchar buf
        (if Uchar.is_valid cp then Uchar.of_int cp else Uchar.rep)
    in
    let rec go i =
      if i >= n then ()
      else if raw.[i] <> '\\' || i + 1 >= n then begin
        Buffer.add_char buf raw.[i];
        go (i + 1)
      end
      else
        match raw.[i + 1] with
        | 'n' -> Buffer.add_char buf '\n'; go (i + 2)
        | 'r' -> Buffer.add_char buf '\r'; go (i + 2)
        | 't' -> Buffer.add_char buf '\t'; go (i + 2)
        | 'b' -> Buffer.add_char buf '\b'; go (i + 2)
        | 'f' -> Buffer.add_char buf '\012'; go (i + 2)
        | ('"' | '\\' | '/') as c -> Buffer.add_char buf c; go (i + 2)
        | 'u' ->
          (match json_hex4 raw (i + 2) with
           | None ->
             (* Malformed escape: keep the two characters as they are. *)
             Buffer.add_string buf "\\u";
             go (i + 2)
           | Some hi when hi >= 0xD800 && hi <= 0xDBFF ->
             (* High surrogate: combine with a directly following low one. *)
             let low =
               if i + 7 < n && raw.[i + 6] = '\\' && raw.[i + 7] = 'u'
               then json_hex4 raw (i + 8)
               else None
             in
             (match low with
              | Some lo when lo >= 0xDC00 && lo <= 0xDFFF ->
                add_code_point
                  (0x10000 + ((hi - 0xD800) lsl 10) + (lo - 0xDC00));
                go (i + 12)
              | _ ->
                add_code_point hi;
                go (i + 6))
           | Some cp ->
             add_code_point cp;
             go (i + 6))
        | c ->
          Buffer.add_char buf '\\';
          Buffer.add_char buf c;
          go (i + 2)
    in
    go 0;
    Buffer.contents buf
  end

(* [parse_freq_json content] extracts the (n-gram, count) pairs of a profile.

   The text scanned is everything between the second opening brace of
   [content] and its last closing brace.  Inside that span every quoted
   string that is followed by a colon (optionally surrounded by whitespace)
   and a run of decimal digits yields one pair; anything else is skipped.
   Keys have their JSON escapes decoded to UTF-8.

   The result lists the pairs in REVERSE order of appearance.

   Raises [Failure] when the braces cannot be located, when a quoted string
   is not terminated, or when a count does not fit in an [int]. *)
let parse_freq_json content =
  (* Find the freq object *)
  let freq_start =
    try String.index content '{' + 1
    with Not_found -> failwith "No opening brace"
  in
  let content =
    String.sub content freq_start (String.length content - freq_start)
  in
  (* Skip to inner object *)
  let inner_start =
    try String.index content '{' + 1
    with Not_found -> failwith "No freq object"
  in
  let inner_end =
    try String.rindex content '}'
    with Not_found -> failwith "No closing brace"
  in
  (* The last closing brace may precede the inner opening brace. *)
  if inner_end < inner_start then failwith "No closing brace";
  let inner = String.sub content inner_start (inner_end - inner_start) in
  let len = String.length inner in

  let is_json_space c = c = ' ' || c = '\n' || c = '\r' || c = '\t' in
  let is_separator c = is_json_space c || c = ',' in
  let is_digit c = c >= '0' && c <= '9' in
  (* First index at or after [i] whose character fails [p], or [len]. *)
  let rec skip_while p i =
    if i < len && p inner.[i] then skip_while p (i + 1) else i
  in
  (* Index of the quote closing the string whose contents start at [i].  A
     backslash hides the character after it, so an escaped quote does not
     end the string. *)
  let rec closing_quote i =
    if i >= len then failwith "Unterminated string in freq object"
    else
      match inner.[i] with
      | '"' -> i
      | '\\' -> closing_quote (i + 2)
      | _ -> closing_quote (i + 1)
  in
  (* Parse key:value pairs, accumulating them most-recent-first. *)
  let rec scan i acc =
    let i = skip_while is_separator i in
    if i >= len then acc
    else if inner.[i] <> '"' then scan (i + 1) acc
    else begin
      let key_start = i + 1 in
      let key_end = closing_quote key_start in
      let key =
        unescape_json_string
          (String.sub inner key_start (key_end - key_start))
      in
      let num_start =
        skip_while (fun c -> c = ':' || is_json_space c) (key_end + 1)
      in
      let num_end = skip_while is_digit num_start in
      if num_end > num_start then
        let num =
          int_of_string (String.sub inner num_start (num_end - num_start))
        in
        scan num_end ((key, num) :: acc)
      else
        (* Not followed by a number (for example a string value): no pair. *)
        scan num_end acc
    end
  in
  scan 0 []
66
(** [escape_ocaml_string s] rewrites [s] so that it can be placed between
    double quotes in OCaml source.

    Double quotes and backslashes are backslash-escaped, newline, carriage
    return and tab use their mnemonic escapes, and every other byte below 32
    becomes a two-digit lowercase hexadecimal escape.  All remaining bytes,
    including those of multi-byte UTF-8 sequences, are copied unchanged. *)
let escape_ocaml_string s =
  (* Replacement text for a byte that needs escaping, [None] otherwise. *)
  let replacement ch =
    if ch = '"' then Some "\\\""
    else if ch = '\\' then Some "\\\\"
    else if ch = '\n' then Some "\\n"
    else if ch = '\r' then Some "\\r"
    else if ch = '\t' then Some "\\t"
    else if Char.code ch < 32 then
      Some (Printf.sprintf "\\x%02x" (Char.code ch))
    else None
  in
  let out = Buffer.create (String.length s * 2) in
  for idx = 0 to String.length s - 1 do
    match replacement s.[idx] with
    | Some text -> Buffer.add_string out text
    | None -> Buffer.add_char out s.[idx]
  done;
  Buffer.contents out
83
(** [generate_profile_module lang_code pairs] returns the source text of an
    OCaml module for one language profile.

    The text consists of a header comment, a [lang] binding holding
    [lang_code], and a [freq] list with one [(ngram, count)] entry per
    element of [pairs].  Entries are written in the reverse of the order in
    which they occur in [pairs]; n-grams go through [escape_ocaml_string] so
    that UTF-8 bytes are kept as they are. *)
let generate_profile_module lang_code pairs =
  let out = Buffer.create 65536 in
  (* Use custom escaping (rather than %S) to preserve UTF-8 *)
  let emit_entry (ngram, count) =
    Printf.bprintf out " (\"%s\", %d);\n" (escape_ocaml_string ngram) count
  in
  Buffer.add_string out "(* Auto-generated language profile - do not edit *)\n\n";
  Printf.bprintf out "let lang = %S\n\n" lang_code;
  Buffer.add_string out "let freq = [\n";
  pairs |> List.rev |> List.iter emit_entry;
  Buffer.add_string out "]\n";
  Buffer.contents out
95
(* Entry point.  Usage: <program> <profiles_dir> <output_dir>

   Every non-directory entry of <profiles_dir> is read as a JSON profile and
   turned into <output_dir>/profile_<code>.ml, where <code> is the entry name
   with dashes replaced by underscores.  A failure on one entry is reported
   on stderr and does not stop the others.  Finally <output_dir>/profiles.ml
   is written, listing every profile that was generated successfully.

   Entries are processed in sorted order: Sys.readdir gives no ordering
   guarantee, and the generated index should not depend on the file system. *)
let () =
  if Array.length Sys.argv < 3 then begin
    Printf.eprintf "Usage: %s <profiles_dir> <output_dir>\n" Sys.argv.(0);
    exit 1
  end;

  let profiles_dir = Sys.argv.(1) in
  let output_dir = Sys.argv.(2) in

  (* Create output directory; one that already exists is fine. *)
  (try Unix.mkdir output_dir 0o755 with Unix.Unix_error (Unix.EEXIST, _, _) -> ());

  (* Write [contents] to [path]; the channel is released even if writing
     fails.  The explicit close_out surfaces flush errors, the one in
     [finally] only runs as a safety net. *)
  let write_file path contents =
    let oc = open_out path in
    Fun.protect
      ~finally:(fun () -> close_out_noerr oc)
      (fun () ->
        output_string oc contents;
        close_out oc)
  in

  (* Process each profile, in a deterministic order *)
  let entries = Sys.readdir profiles_dir in
  Array.sort String.compare entries;
  let lang_codes = ref [] in

  Array.iter (fun entry ->
    let path = Filename.concat profiles_dir entry in
    (* The directory test sits inside the handler too: it can raise, for
       instance on a dangling symlink, and that must not abort the run. *)
    try
      if not (Sys.is_directory path) then begin
        Printf.printf "Processing %s...\n%!" entry;
        let pairs = parse_freq_json (read_file path) in
        let lang_code =
          (* Normalize lang code: zh-cn -> zh_cn *)
          String.map (fun c -> if c = '-' then '_' else c) entry
        in
        let out_path =
          Filename.concat output_dir (Printf.sprintf "profile_%s.ml" lang_code)
        in
        write_file out_path (generate_profile_module entry pairs);
        lang_codes := (entry, lang_code) :: !lang_codes;
        Printf.printf " Generated %s (%d n-grams)\n%!" out_path (List.length pairs)
      end
    with e ->
      Printf.eprintf " Error processing %s: %s\n%!" entry (Printexc.to_string e)
  ) entries;

  (* Generate profiles index module *)
  let generated = List.rev !lang_codes in
  let index = Buffer.create 1024 in
  Buffer.add_string index "(* Auto-generated profiles index - do not edit *)\n\n";
  Buffer.add_string index "let all_profiles = [\n";
  List.iter (fun (orig_code, ml_code) ->
    Printf.bprintf index " (%S, Profile_%s.freq);\n" orig_code ml_code
  ) generated;
  Buffer.add_string index "]\n";
  write_file (Filename.concat output_dir "profiles.ml") (Buffer.contents index);

  Printf.printf "Generated %d profiles\n" (List.length generated)
147 Printf.printf "Generated %d profiles\n" (List.length !lang_codes)