lib/langdetect/langdetect.ml at 6387bcc507f92984dc0154360b8bc442d5332615 · anil.recoil.org/ocaml-html5rw

OCaml HTML5 parser/serialiser based on Python's JustHTML
ocaml-html5rw / lib / langdetect / langdetect.ml
at 6387bcc507f92984dc0154360b8bc442d5332615 8.6 kB view raw
  1(** Language detection library based on n-gram frequency analysis.
  2
  3    This is an OCaml port of the Cybozu langdetect algorithm. *)
  4
  5module StringMap = Map.Make(String)
  6
  7(** Language detection result *)
  8type result = {
  9  lang: string;
 10  prob: float;
 11}
 12
 13(** Detection parameters *)
 14type config = {
 15  alpha: float;          (** Smoothing parameter (default 0.5) *)
 16  n_trial: int;          (** Number of random trials (default 7) *)
 17  max_text_length: int;  (** Maximum text length to process *)
 18  conv_threshold: float; (** Convergence threshold *)
 19  prob_threshold: float; (** Minimum probability to report *)
 20}
 21
 22let default_config = {
 23  alpha = 0.5;
 24  n_trial = 7;
 25  max_text_length = 10000;
 26  conv_threshold = 0.99999;
 27  prob_threshold = 0.1;
 28}
 29
 30(** N-gram extraction parameters *)
 31let n_gram_max = 3
 32let base_freq = 10000
 33let iteration_limit = 1000
 34let alpha_width = 0.05
 35
 36(** Detector state *)
 37type t = {
 38  config: config;
 39  (* Map from n-gram -> array of probabilities per language *)
 40  word_lang_prob: float array StringMap.t;
 41  (* List of language codes *)
 42  lang_list: string array;
 43  (* Random seed for reproducibility *)
 44  mutable seed: int option;
 45}
 46
 47(** Normalize a Unicode code point for n-gram extraction *)
 48let normalize_uchar uchar =
 49  let code = Uchar.to_int uchar in
 50  (* Basic Latin: keep only letters *)
 51  if code < 128 then begin
 52    let c = Char.chr code in
 53    if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') then
 54      Some (String.make 1 c)
 55    else
 56      None (* Treat as space/separator *)
 57  end
 58  else begin
 59    (* Keep non-ASCII characters as-is *)
 60    let buf = Buffer.create 4 in
 61    Buffer.add_utf_8_uchar buf uchar;
 62    Some (Buffer.contents buf)
 63  end
 64
 65(** Extract n-grams from UTF-8 text.
 66    N-grams are sequences of 1-3 Unicode characters. *)
 67let extract_ngrams ?(max_len=10000) text word_lang_prob =
 68  let ngrams = ref [] in
 69  (* Buffer stores up to 3 most recent character strings *)
 70  let char_buffer = Array.make n_gram_max "" in
 71  let char_count = ref 0 in
 72  let processed = ref 0 in
 73
 74  (* Process each UTF-8 character *)
 75  let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
 76  let rec process () =
 77    if !processed >= max_len then ()
 78    else match Uutf.decode decoder with
 79    | `Await -> ()  (* String source never awaits *)
 80    | `End -> ()
 81    | `Malformed _ -> process ()  (* Skip malformed sequences *)
 82    | `Uchar uchar ->
 83        incr processed;
 84        match normalize_uchar uchar with
 85        | None ->
 86            (* Separator - reset buffer *)
 87            char_buffer.(0) <- "";
 88            char_buffer.(1) <- "";
 89            char_buffer.(2) <- "";
 90            char_count := 0;
 91            process ()
 92        | Some char_str ->
 93            (* Shift buffer left and add new char *)
 94            char_buffer.(0) <- char_buffer.(1);
 95            char_buffer.(1) <- char_buffer.(2);
 96            char_buffer.(2) <- char_str;
 97            incr char_count;
 98
 99            (* Extract 1, 2, 3 grams based on how many chars we have *)
100            let available = min !char_count n_gram_max in
101            for n = 1 to available do
102              let ngram =
103                let start_idx = n_gram_max - n in
104                let parts = ref [] in
105                for i = start_idx to n_gram_max - 1 do
106                  parts := char_buffer.(i) :: !parts
107                done;
108                String.concat "" (List.rev !parts)
109              in
110              if StringMap.mem ngram word_lang_prob then
111                ngrams := ngram :: !ngrams
112            done;
113            process ()
114  in
115  process ();
116  Array.of_list (List.rev !ngrams)
117
118(** Initialize uniform probability distribution *)
119let init_prob n_langs =
120  let prob = Array.make n_langs (1.0 /. float_of_int n_langs) in
121  prob
122
123(** Update language probabilities with an n-gram *)
124let update_lang_prob prob ngram word_lang_prob alpha =
125  match StringMap.find_opt ngram word_lang_prob with
126  | None -> false
127  | Some lang_prob_map ->
128      let weight = alpha /. float_of_int base_freq in
129      for i = 0 to Array.length prob - 1 do
130        prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i))
131      done;
132      true
133
134(** Normalize probabilities and return max *)
135let normalize_prob prob =
136  let sum = Array.fold_left (+.) 0.0 prob in
137  if sum <= 0.0 then 0.0
138  else begin
139    let max_p = ref 0.0 in
140    for i = 0 to Array.length prob - 1 do
141      prob.(i) <- prob.(i) /. sum;
142      if prob.(i) > !max_p then max_p := prob.(i)
143    done;
144    !max_p
145  end
146
147(** Simple pseudo-random number generator *)
148let random_state = ref 12345
149
150let set_seed seed =
151  random_state := seed
152
153let next_random () =
154  random_state := (!random_state * 1103515245 + 12345) land 0x7FFFFFFF;
155  !random_state
156
157let random_int bound =
158  (next_random ()) mod bound
159
160let random_gaussian () =
161  (* Box-Muller transform approximation *)
162  let u1 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
163  let u2 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in
164  let u1 = max 0.0001 u1 in (* Avoid log(0) *)
165  sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)
166
167(** Run detection on extracted n-grams *)
168let detect_block t ngrams =
169  let n_langs = Array.length t.lang_list in
170  if n_langs = 0 || Array.length ngrams = 0 then [||]
171  else begin
172    let lang_prob = Array.make n_langs 0.0 in
173
174    (* Set seed if specified *)
175    (match t.seed with
176    | Some s -> set_seed s
177    | None -> set_seed (int_of_float (Unix.gettimeofday () *. 1000.0)));
178
179    for _ = 0 to t.config.n_trial - 1 do
180      let prob = init_prob n_langs in
181      let alpha = t.config.alpha +. random_gaussian () *. alpha_width in
182
183      let converged = ref false in
184      let i = ref 0 in
185      while not !converged && !i < iteration_limit do
186        let r = random_int (Array.length ngrams) in
187        let _ = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
188        if !i mod 5 = 0 then begin
189          let max_p = normalize_prob prob in
190          if max_p > t.config.conv_threshold then converged := true
191        end;
192        incr i
193      done;
194
195      (* Accumulate probabilities *)
196      for j = 0 to n_langs - 1 do
197        lang_prob.(j) <- lang_prob.(j) +. prob.(j) /. float_of_int t.config.n_trial
198      done
199    done;
200
201    lang_prob
202  end
203
204(** Create detector from profiles *)
205let create ?(config=default_config) profiles =
206  let lang_list = Array.of_list (List.map fst profiles) in
207  let n_langs = Array.length lang_list in
208
209  (* Build word -> lang prob map *)
210  (* First, collect all unique n-grams and their frequencies per language *)
211  let all_ngrams = Hashtbl.create 65536 in
212  let lang_totals = Array.make n_langs 0 in
213
214  List.iteri (fun lang_idx (_, freq_list) ->
215    List.iter (fun (ngram, count) ->
216      let current =
217        match Hashtbl.find_opt all_ngrams ngram with
218        | Some arr -> arr
219        | None ->
220            let arr = Array.make n_langs 0 in
221            Hashtbl.add all_ngrams ngram arr;
222            arr
223      in
224      current.(lang_idx) <- count;
225      lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count
226    ) freq_list
227  ) profiles;
228
229  (* Convert to probability map *)
230  let word_lang_prob =
231    Hashtbl.fold (fun ngram counts acc ->
232      (* Compute probability for each language *)
233      let probs = Array.make n_langs 0.0 in
234      for i = 0 to n_langs - 1 do
235        if lang_totals.(i) > 0 then
236          probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
237      done;
238      StringMap.add ngram probs acc
239    ) all_ngrams StringMap.empty
240  in
241
242  { config; word_lang_prob; lang_list; seed = None }
243
244(** Set random seed for reproducibility *)
245let set_random_seed t seed =
246  t.seed <- Some seed
247
248(** Detect language of text *)
249let detect t text =
250  let ngrams = extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob in
251  if Array.length ngrams = 0 then []
252  else begin
253    let probs = detect_block t ngrams in
254    (* Sort by probability descending *)
255    let results = ref [] in
256    for i = 0 to Array.length probs - 1 do
257      if probs.(i) > t.config.prob_threshold then
258        results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
259    done;
260    List.sort (fun a b -> compare b.prob a.prob) !results
261  end
262
263(** Get best language or None *)
264let detect_best t text =
265  match detect t text with
266  | [] -> None
267  | best :: _ -> Some best.lang
268
269(** Get best language with probability *)
270let detect_with_prob t text =
271  match detect t text with
272  | [] -> None
273  | best :: _ -> Some (best.lang, best.prob)
274
275(** Create a detector with all built-in profiles *)
276let create_default ?config () =
277  create ?config Profiles.all_profiles