OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Language detection library based on n-gram frequency analysis.
2
3 This is an OCaml port of the Cybozu langdetect algorithm. *)
4
5module StringMap = Map.Make(String)
6
7(** Language detection result *)
8type result = {
9 lang: string;
10 prob: float;
11}
12
13(** Detection parameters *)
14type config = {
15 alpha: float; (** Smoothing parameter (default 0.5) *)
16 n_trial: int; (** Number of random trials (default 7) *)
17 max_text_length: int; (** Maximum text length to process *)
18 conv_threshold: float; (** Convergence threshold *)
19 prob_threshold: float; (** Minimum probability to report *)
20}
21
22let default_config = {
23 alpha = 0.5;
24 n_trial = 7;
25 max_text_length = 10000;
26 conv_threshold = 0.99999;
27 prob_threshold = 0.1;
28}
29
30(** N-gram extraction parameters *)
31let n_gram_max = 3
32let base_freq = 10000
33let iteration_limit = 1000
34let alpha_width = 0.05
35
36(** Detector state *)
37type t = {
38 config: config;
39 (* Map from n-gram -> array of probabilities per language *)
40 word_lang_prob: float array StringMap.t;
41 (* List of language codes *)
42 lang_list: string array;
43 (* Random seed for reproducibility *)
44 mutable seed: int option;
45}
46
47(** Normalize a Unicode code point for n-gram extraction *)
48let normalize_uchar uchar =
49 let code = Uchar.to_int uchar in
50 (* Basic Latin: keep only letters *)
51 if code < 128 then begin
52 let c = Char.chr code in
53 if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') then
54 Some (String.make 1 c)
55 else
56 None (* Treat as space/separator *)
57 end
58 else begin
59 (* Keep non-ASCII characters as-is *)
60 let buf = Buffer.create 4 in
61 Buffer.add_utf_8_uchar buf uchar;
62 Some (Buffer.contents buf)
63 end
64
65(** Extract n-grams from UTF-8 text.
66 N-grams are sequences of 1-3 Unicode characters. *)
67let extract_ngrams ?(max_len=10000) text word_lang_prob =
68 let ngrams = ref [] in
69 (* Buffer stores up to 3 most recent character strings *)
70 let char_buffer = Array.make n_gram_max "" in
71 let char_count = ref 0 in
72 let processed = ref 0 in
73
74 (* Process each UTF-8 character *)
75 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
76 let rec process () =
77 if !processed >= max_len then ()
78 else match Uutf.decode decoder with
79 | `Await -> () (* String source never awaits *)
80 | `End -> ()
81 | `Malformed _ -> process () (* Skip malformed sequences *)
82 | `Uchar uchar ->
83 incr processed;
84 match normalize_uchar uchar with
85 | None ->
86 (* Separator - reset buffer *)
87 char_buffer.(0) <- "";
88 char_buffer.(1) <- "";
89 char_buffer.(2) <- "";
90 char_count := 0;
91 process ()
92 | Some char_str ->
93 (* Shift buffer left and add new char *)
94 char_buffer.(0) <- char_buffer.(1);
95 char_buffer.(1) <- char_buffer.(2);
96 char_buffer.(2) <- char_str;
97 incr char_count;
98
99 (* Extract 1, 2, 3 grams based on how many chars we have *)
100 let available = min !char_count n_gram_max in
101 for n = 1 to available do
102 let ngram =
103 let start_idx = n_gram_max - n in
104 let parts = ref [] in
105 for i = start_idx to n_gram_max - 1 do
106 parts := char_buffer.(i) :: !parts
107 done;
108 String.concat "" (List.rev !parts)
109 in
110 if StringMap.mem ngram word_lang_prob then
111 ngrams := ngram :: !ngrams
112 done;
113 process ()
114 in
115 process ();
116 Array.of_list (List.rev !ngrams)
117
118(** Initialize uniform probability distribution *)
119let init_prob n_langs =
120 let prob = Array.make n_langs (1.0 /. float_of_int n_langs) in
121 prob
122
123(** Update language probabilities with an n-gram *)
124let update_lang_prob prob ngram word_lang_prob alpha =
125 match StringMap.find_opt ngram word_lang_prob with
126 | None -> false
127 | Some lang_prob_map ->
128 let weight = alpha /. float_of_int base_freq in
129 for i = 0 to Array.length prob - 1 do
130 prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i))
131 done;
132 true
133
134(** Normalize probabilities and return max *)
135let normalize_prob prob =
136 let sum = Array.fold_left (+.) 0.0 prob in
137 if sum <= 0.0 then 0.0
138 else begin
139 let max_p = ref 0.0 in
140 for i = 0 to Array.length prob - 1 do
141 prob.(i) <- prob.(i) /. sum;
142 if prob.(i) > !max_p then max_p := prob.(i)
143 done;
144 !max_p
145 end
146
147(** Simple pseudo-random number generator *)
148let random_state = ref 12345
149
150let set_seed seed =
151 random_state := seed
152
153let next_random () =
154 random_state := (!random_state * 1103515245 + 12345) land 0x7FFFFFFF;
155 !random_state
156
157let random_int bound =
158 (next_random ()) mod bound
159
160let random_gaussian () =
161 (* Box-Muller transform approximation *)
162 let u1 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
163 let u2 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
164 let u1 = max 0.0001 u1 in (* Avoid log(0) *)
165 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)
166
167(** Run detection on extracted n-grams *)
168let detect_block t ngrams =
169 let n_langs = Array.length t.lang_list in
170 if n_langs = 0 || Array.length ngrams = 0 then [||]
171 else begin
172 let lang_prob = Array.make n_langs 0.0 in
173
174 (* Set seed if specified *)
175 (match t.seed with
176 | Some s -> set_seed s
177 | None -> set_seed (int_of_float (Unix.gettimeofday () *. 1000.0)));
178
179 for _ = 0 to t.config.n_trial - 1 do
180 let prob = init_prob n_langs in
181 let alpha = t.config.alpha +. random_gaussian () *. alpha_width in
182
183 let converged = ref false in
184 let i = ref 0 in
185 while not !converged && !i < iteration_limit do
186 let r = random_int (Array.length ngrams) in
187 let _ = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
188 if !i mod 5 = 0 then begin
189 let max_p = normalize_prob prob in
190 if max_p > t.config.conv_threshold then converged := true
191 end;
192 incr i
193 done;
194
195 (* Accumulate probabilities *)
196 for j = 0 to n_langs - 1 do
197 lang_prob.(j) <- lang_prob.(j) +. prob.(j) /. float_of_int t.config.n_trial
198 done
199 done;
200
201 lang_prob
202 end
203
204(** Create detector from profiles *)
205let create ?(config=default_config) profiles =
206 let lang_list = Array.of_list (List.map fst profiles) in
207 let n_langs = Array.length lang_list in
208
209 (* Build word -> lang prob map *)
210 (* First, collect all unique n-grams and their frequencies per language *)
211 let all_ngrams = Hashtbl.create 65536 in
212 let lang_totals = Array.make n_langs 0 in
213
214 List.iteri (fun lang_idx (_, freq_list) ->
215 List.iter (fun (ngram, count) ->
216 let current =
217 match Hashtbl.find_opt all_ngrams ngram with
218 | Some arr -> arr
219 | None ->
220 let arr = Array.make n_langs 0 in
221 Hashtbl.add all_ngrams ngram arr;
222 arr
223 in
224 current.(lang_idx) <- count;
225 lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count
226 ) freq_list
227 ) profiles;
228
229 (* Convert to probability map *)
230 let word_lang_prob =
231 Hashtbl.fold (fun ngram counts acc ->
232 (* Compute probability for each language *)
233 let probs = Array.make n_langs 0.0 in
234 for i = 0 to n_langs - 1 do
235 if lang_totals.(i) > 0 then
236 probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
237 done;
238 StringMap.add ngram probs acc
239 ) all_ngrams StringMap.empty
240 in
241
242 { config; word_lang_prob; lang_list; seed = None }
243
244(** Set random seed for reproducibility *)
245let set_random_seed t seed =
246 t.seed <- Some seed
247
248(** Detect language of text *)
249let detect t text =
250 let ngrams = extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob in
251 if Array.length ngrams = 0 then []
252 else begin
253 let probs = detect_block t ngrams in
254 (* Sort by probability descending *)
255 let results = ref [] in
256 for i = 0 to Array.length probs - 1 do
257 if probs.(i) > t.config.prob_threshold then
258 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
259 done;
260 List.sort (fun a b -> compare b.prob a.prob) !results
261 end
262
263(** Get best language or None *)
264let detect_best t text =
265 match detect t text with
266 | [] -> None
267 | best :: _ -> Some best.lang
268
269(** Get best language with probability *)
270let detect_with_prob t text =
271 match detect t text with
272 | [] -> None
273 | best :: _ -> Some (best.lang, best.prob)
274
275(** Create a detector with all built-in profiles *)
276let create_default ?config () =
277 create ?config Profiles.all_profiles