OCaml HTML5 parser/serialiser based on Python's JustHTML
(** Language detection library based on n-gram frequency analysis. *)
2
(** Language detection result: one candidate language together with the
    probability assigned to it. Values of this type are returned by
    [detect]. *)
type result = {
  lang : string;
      (** Language identifier. Presumably the [lang_code] of the profile
          that matched (see [create]) — confirm against the
          implementation. *)
  prob : float;  (** Probability assigned to [lang]. *)
}
8
(** Detection parameters. A value of this type can be passed to [create]
    and [create_default] through their optional [?config] argument. *)
type config = {
  alpha : float;  (** Smoothing parameter (default 0.5) *)
  n_trial : int;  (** Number of random trials (default 7) *)
  max_text_length : int;
      (** Maximum text length to process.
          NOTE(review): the unit (bytes vs. characters) is not stated in
          this interface — confirm against the implementation. *)
  conv_threshold : float;  (** Convergence threshold *)
  prob_threshold : float;
      (** Minimum probability to report; [detect] only returns languages
          above this value. *)
}
17
(** Default configuration. The field comments on [config] give 0.5 for
    [alpha] and 7 for [n_trial] as defaults; the remaining values are
    defined in the implementation and are not visible in this interface. *)
val default_config : config
20
(** Detector state. The type is abstract: values are obtained from
    [create] or [create_default] and consumed by [set_random_seed],
    [detect], [detect_best] and [detect_with_prob]. *)
type t
23
(** [create ?config profiles] creates a detector from language profiles.

    Each profile is a pair [(lang_code, frequency_list)], where
    [frequency_list] is a list of [(ngram, count)] pairs.

    @param config detection parameters. Presumably [default_config] is
      used when omitted — confirm against the implementation. *)
val create : ?config:config -> (string * (string * int) list) list -> t
28
(** [set_random_seed t seed] sets the random seed of detector [t] so that
    results are reproducible. *)
val set_random_seed : t -> int -> unit
31
(** [detect t text] detects the language of [text].

    @return the list of possible languages with their probabilities,
      sorted by probability in descending order. Only languages above
      [prob_threshold] are included. *)
val detect : t -> string -> result list
36
(** [detect_best t text] detects the best matching language for [text].

    @return [Some lang] for the best match, or [None] if no language
      could be detected. *)
val detect_best : t -> string -> string option
40
(** [detect_with_prob t text] detects the best matching language for
    [text] together with its probability.

    @return [Some (lang, prob)] for the best match, or [None] if no
      language could be detected. *)
val detect_with_prob : t -> string -> (string * float) option
44
(** [create_default ?config ()] creates a detector with all built-in
    language profiles. This is a convenience function that calls [create]
    with all supported profiles.

    @param config detection parameters, as for [create]. *)
val create_default : ?config:config -> unit -> t