OCaml HTML5 parser/serialiser based on Python's JustHTML
1(** Language detection library based on n-gram frequency analysis. 2 3 This is an OCaml port of the Cybozu langdetect algorithm. *) 4 5module StringMap = Map.Make(String) 6 7(** Language detection result *) 8type result = { 9 lang: string; 10 prob: float; 11} 12 13(** Detection parameters *) 14type config = { 15 alpha: float; (** Smoothing parameter (default 0.5) *) 16 n_trial: int; (** Number of random trials (default 7) *) 17 max_text_length: int; (** Maximum text length to process *) 18 conv_threshold: float; (** Convergence threshold *) 19 prob_threshold: float; (** Minimum probability to report *) 20} 21 22let default_config = { 23 alpha = 0.5; 24 n_trial = 7; 25 max_text_length = 10000; 26 conv_threshold = 0.99999; 27 prob_threshold = 0.1; 28} 29 30(** N-gram extraction parameters *) 31let n_gram_max = 3 32let base_freq = 10000 33let iteration_limit = 1000 34let alpha_width = 0.05 35 36(** Detector state *) 37type t = { 38 config: config; 39 (* Map from n-gram -> array of probabilities per language *) 40 word_lang_prob: float array StringMap.t; 41 (* List of language codes *) 42 lang_list: string array; 43 (* Random seed for reproducibility *) 44 mutable seed: int option; 45} 46 47(** Normalize a Unicode code point for n-gram extraction *) 48let normalize_uchar uchar = 49 let code = Uchar.to_int uchar in 50 (* Basic Latin: keep only letters *) 51 if code < 128 then begin 52 let c = Char.chr code in 53 if (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') then 54 Some (String.make 1 c) 55 else 56 None (* Treat as space/separator *) 57 end 58 else begin 59 (* Keep non-ASCII characters as-is *) 60 let buf = Buffer.create 4 in 61 Buffer.add_utf_8_uchar buf uchar; 62 Some (Buffer.contents buf) 63 end 64 65(** Extract n-grams from UTF-8 text. 66 N-grams are sequences of 1-3 Unicode characters. *) 67let extract_ngrams ?(max_len=10000) text word_lang_prob = 68 let ngrams = ref [] in 69 (* Buffer stores up to 3 most recent character strings *) 70 let char_buffer = Array.make n_gram_max "" in 71 let char_count = ref 0 in 72 let processed = ref 0 in 73 74 (* Process each UTF-8 character *) 75 let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in 76 let rec process () = 77 if !processed >= max_len then () 78 else match Uutf.decode decoder with 79 | `Await -> () (* String source never awaits *) 80 | `End -> () 81 | `Malformed _ -> process () (* Skip malformed sequences *) 82 | `Uchar uchar -> 83 incr processed; 84 match normalize_uchar uchar with 85 | None -> 86 (* Separator - reset buffer *) 87 char_buffer.(0) <- ""; 88 char_buffer.(1) <- ""; 89 char_buffer.(2) <- ""; 90 char_count := 0; 91 process () 92 | Some char_str -> 93 (* Shift buffer left and add new char *) 94 char_buffer.(0) <- char_buffer.(1); 95 char_buffer.(1) <- char_buffer.(2); 96 char_buffer.(2) <- char_str; 97 incr char_count; 98 99 (* Extract 1, 2, 3 grams based on how many chars we have *) 100 let available = min !char_count n_gram_max in 101 for n = 1 to available do 102 let ngram = 103 let start_idx = n_gram_max - n in 104 let parts = ref [] in 105 for i = start_idx to n_gram_max - 1 do 106 parts := char_buffer.(i) :: !parts 107 done; 108 String.concat "" (List.rev !parts) 109 in 110 if StringMap.mem ngram word_lang_prob then 111 ngrams := ngram :: !ngrams 112 done; 113 process () 114 in 115 process (); 116 Array.of_list (List.rev !ngrams) 117 118(** Initialize uniform probability distribution *) 119let init_prob n_langs = 120 let prob = Array.make n_langs (1.0 /. float_of_int n_langs) in 121 prob 122 123(** Update language probabilities with an n-gram *) 124let update_lang_prob prob ngram word_lang_prob alpha = 125 match StringMap.find_opt ngram word_lang_prob with 126 | None -> false 127 | Some lang_prob_map -> 128 let weight = alpha /. float_of_int base_freq in 129 for i = 0 to Array.length prob - 1 do 130 prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i)) 131 done; 132 true 133 134(** Normalize probabilities and return max *) 135let normalize_prob prob = 136 let sum = Array.fold_left (+.) 0.0 prob in 137 if sum <= 0.0 then 0.0 138 else begin 139 let max_p = ref 0.0 in 140 for i = 0 to Array.length prob - 1 do 141 prob.(i) <- prob.(i) /. sum; 142 if prob.(i) > !max_p then max_p := prob.(i) 143 done; 144 !max_p 145 end 146 147(** Simple pseudo-random number generator *) 148let random_state = ref 12345 149 150let set_seed seed = 151 random_state := seed 152 153let next_random () = 154 random_state := (!random_state * 1103515245 + 12345) land 0x7FFFFFFF; 155 !random_state 156 157let random_int bound = 158 (next_random ()) mod bound 159 160let random_gaussian () = 161 (* Box-Muller transform approximation *) 162 let u1 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in 163 let u2 = (float_of_int (next_random ())) /. float_of_int 0x7FFFFFFF in 164 let u1 = max 0.0001 u1 in (* Avoid log(0) *) 165 sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2) 166 167(** Run detection on extracted n-grams *) 168let detect_block t ngrams = 169 let n_langs = Array.length t.lang_list in 170 if n_langs = 0 || Array.length ngrams = 0 then [||] 171 else begin 172 let lang_prob = Array.make n_langs 0.0 in 173 174 (* Set seed if specified *) 175 (match t.seed with 176 | Some s -> set_seed s 177 | None -> set_seed (int_of_float (Unix.gettimeofday () *. 1000.0))); 178 179 for _ = 0 to t.config.n_trial - 1 do 180 let prob = init_prob n_langs in 181 let alpha = t.config.alpha +. random_gaussian () *. alpha_width in 182 183 let converged = ref false in 184 let i = ref 0 in 185 while not !converged && !i < iteration_limit do 186 let r = random_int (Array.length ngrams) in 187 let _ = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in 188 if !i mod 5 = 0 then begin 189 let max_p = normalize_prob prob in 190 if max_p > t.config.conv_threshold then converged := true 191 end; 192 incr i 193 done; 194 195 (* Accumulate probabilities *) 196 for j = 0 to n_langs - 1 do 197 lang_prob.(j) <- lang_prob.(j) +. prob.(j) /. float_of_int t.config.n_trial 198 done 199 done; 200 201 lang_prob 202 end 203 204(** Create detector from profiles *) 205let create ?(config=default_config) profiles = 206 let lang_list = Array.of_list (List.map fst profiles) in 207 let n_langs = Array.length lang_list in 208 209 (* Build word -> lang prob map *) 210 (* First, collect all unique n-grams and their frequencies per language *) 211 let all_ngrams = Hashtbl.create 65536 in 212 let lang_totals = Array.make n_langs 0 in 213 214 List.iteri (fun lang_idx (_, freq_list) -> 215 List.iter (fun (ngram, count) -> 216 let current = 217 match Hashtbl.find_opt all_ngrams ngram with 218 | Some arr -> arr 219 | None -> 220 let arr = Array.make n_langs 0 in 221 Hashtbl.add all_ngrams ngram arr; 222 arr 223 in 224 current.(lang_idx) <- count; 225 lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count 226 ) freq_list 227 ) profiles; 228 229 (* Convert to probability map *) 230 let word_lang_prob = 231 Hashtbl.fold (fun ngram counts acc -> 232 (* Compute probability for each language *) 233 let probs = Array.make n_langs 0.0 in 234 for i = 0 to n_langs - 1 do 235 if lang_totals.(i) > 0 then 236 probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i) 237 done; 238 StringMap.add ngram probs acc 239 ) all_ngrams StringMap.empty 240 in 241 242 { config; word_lang_prob; lang_list; seed = None } 243 244(** Set random seed for reproducibility *) 245let set_random_seed t seed = 246 t.seed <- Some seed 247 248(** Detect language of text *) 249let detect t text = 250 let ngrams = extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob in 251 if Array.length ngrams = 0 then [] 252 else begin 253 let probs = detect_block t ngrams in 254 (* Sort by probability descending *) 255 let results = ref [] in 256 for i = 0 to Array.length probs - 1 do 257 if probs.(i) > t.config.prob_threshold then 258 results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results 259 done; 260 List.sort (fun a b -> compare b.prob a.prob) !results 261 end 262 263(** Get best language or None *) 264let detect_best t text = 265 match detect t text with 266 | [] -> None 267 | best :: _ -> Some best.lang 268 269(** Get best language with probability *) 270let detect_with_prob t text = 271 match detect t text with 272 | [] -> None 273 | best :: _ -> Some (best.lang, best.prob) 274 275(** Create a detector with all built-in profiles *) 276let create_default ?config () = 277 create ?config Profiles.all_profiles