OCaml HTML5 parser/serialiser based on Python's JustHTML
(** Language detection built on n-gram frequency analysis. *)

(** A single entry of a detection result: a language together with its
    probability. *)
type result = {
  lang : string;  (** Language this entry refers to. *)
  prob : float;  (** Probability reported for [lang]. *)
}

(** Parameters controlling detection. *)
type config = {
  alpha : float;  (** Smoothing parameter (default 0.5). *)
  n_trial : int;  (** Number of random trials (default 7). *)
  max_text_length : int;  (** Maximum text length to process. *)
  conv_threshold : float;  (** Convergence threshold. *)
  prob_threshold : float;  (** Minimum probability to report. *)
}

(** The default configuration. *)
val default_config : config

(** State of a detector. The representation is abstract. *)
type t

(** [create ?config profiles] builds a detector from language profiles.

    Every element of [profiles] is a pair [(lang_code, frequency_list)],
    where [frequency_list] is a list of [(ngram, count)] pairs. *)
val create : ?config:config -> (string * (string * int) list) list -> t

(** [set_random_seed t seed] sets the random seed, so that results are
    reproducible. *)
val set_random_seed : t -> int -> unit

(** [detect t text] detects the language of [text].

    @return
      the possible languages with their probabilities, sorted by
      probability in descending order. Only languages above
      [prob_threshold] are included. *)
val detect : t -> string -> result list

(** [detect_best t text] detects the best matching language of [text].

    @return [None] if no language could be detected. *)
val detect_best : t -> string -> string option

(** [detect_with_prob t text] detects the best matching language of [text]
    together with its probability.

    @return [None] if no language could be detected. *)
val detect_with_prob : t -> string -> (string * float) option

(** [create_default ?config ()] creates a detector with all built-in
    language profiles.

    This is a convenience function that calls {!create} with all supported
    profiles. *)
val create_default : ?config:config -> unit -> t