···1-Copyright (c) 2007-2016 Mozilla Foundation
2-Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
34-Permission is hereby granted, free of charge, to any person obtaining a
5-copy of this software and associated documentation files (the "Software"),
6-to deal in the Software without restriction, including without limitation
7-the rights to use, copy, modify, merge, publish, distribute, sublicense,
8-and/or sell copies of the Software, and to permit persons to whom the
9-Software is furnished to do so, subject to the following conditions:
001011-The above copyright notice and this permission notice shall be included in
12-all copies or substantial portions of the Software.
1314THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20-DEALINGS IN THE SOFTWARE.
···1+MIT License
023+Copyright (c) 2024 Anil Madhavapeddy
4+5+Permission is hereby granted, free of charge, to any person obtaining a copy
6+of this software and associated documentation files (the "Software"), to deal
7+in the Software without restriction, including without limitation the rights
8+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+copies of the Software, and to permit persons to whom the Software is
10+furnished to do so, subject to the following conditions:
1112+The above copyright notice and this permission notice shall be included in all
13+copies or substantial portions of the Software.
1415THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+SOFTWARE.
+134-43
README.md
···1-# langdetect
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000023-Language detection library for OCaml using n-gram frequency analysis.
0000000045-This is an OCaml port of the [Cybozu
6-langdetect](https://github.com/shuyo/language-detection) algorithm. It detects
7-the natural language of text using n-gram frequency profiles. It was ported
8-from <https://github.com/validator/validator>.
910-## Features
01112-- Detects 49 languages including English, Chinese, Japanese, Arabic, and many European languages
13-- Fast probabilistic detection using n-gram frequency analysis
14-- Configurable detection parameters (smoothing, convergence thresholds)
15-- Reproducible results with optional random seed control
16-- Pure OCaml implementation with minimal dependencies
1718-## Installation
1920-```bash
21-opam install langdetect
022```
2324-## Usage
02526-```ocaml
27-(* Create a detector with all built-in profiles *)
28-let detector = Langdetect.create_default ()
2930-(* Detect the best matching language *)
31-let () =
32- match Langdetect.detect_best detector "Hello, world!" with
33- | Some lang -> Printf.printf "Detected: %s\n" lang
34- | None -> print_endline "Could not detect language"
3536-(* Get all possible languages with probabilities *)
37-let () =
38- let results = Langdetect.detect detector "Bonjour le monde" in
39- List.iter (fun r ->
40- Printf.printf "%s: %.2f\n" r.Langdetect.lang r.Langdetect.prob
41- ) results
4243-(* Use custom configuration *)
44-let config = { Langdetect.default_config with prob_threshold = 0.3 }
45-let detector = Langdetect.create_default ~config ()
000046```
4748-## Supported Languages
4950-Arabic, Bengali, Bulgarian, Catalan, Croatian, Czech, Danish, Dutch, English,
51-Estonian, Farsi, Finnish, French, German, Greek, Gujarati, Hebrew, Hindi,
52-Hungarian, Indonesian, Italian, Japanese, Korean, Latvian, Lithuanian,
53-Macedonian, Malayalam, Dutch, Norwegian, Panjabi, Polish, Portuguese, Romanian,
54-Russian, Sinhalese, Albanian, Spanish, Swedish, Tamil, Telugu, Thai, Tagalog,
55-Turkish, Ukrainian, Urdu, Vietnamese, Chinese (Simplified), Chinese
56-(Traditional).
05758## License
5960-MIT License - see LICENSE file for details.
006162-Based on the Cybozu langdetect algorithm. Copyright (c) 2007-2016 Mozilla Foundation and 2025 Anil Madhavapeddy.
00
···1+# langdetect-jsoo
2+3+Language detection for JavaScript/WebAssembly, compiled from OCaml using
4+`js_of_ocaml/wasm_of_ocaml`. This is via an OCaml port of the [Cybozu langdetect](https://github.com/shuyo/language-detection) algorithm that uses
5+n-gram frequency profiles to detect the natural language of text.
6+7+Supports 47 languages including English, Chinese, Japanese, Arabic, and many European languages.
8+9+## Installation
10+11+```bash
12+npm install langdetect-jsoo
13+```
14+15+## Quick Start
16+17+### Browser (Script Tag)
18+19+#### Pure JavaScript Version (~7.6MB)
20+21+```html
22+<script src="node_modules/langdetect-jsoo/langdetect.js"></script>
23+<script>
24+ // Wait for library to load
25+ document.addEventListener('langdetectReady', () => {
26+ const lang = langdetect.detect("Hello, world!");
27+ console.log(lang); // "en"
28+ });
29+</script>
30+```
31+32+#### WebAssembly Version (~7.5MB WASM + ~12KB loader)
33+34+The WASM version offers better performance for repeated detections:
35+36+```html
37+<script src="node_modules/langdetect-jsoo/langdetect_js_main.bc.wasm.js"></script>
38+<script>
39+ document.addEventListener('langdetectReady', () => {
40+ const lang = langdetect.detect("Bonjour le monde!");
41+ console.log(lang); // "fr"
42+ });
43+</script>
44+```
45+46+## API Reference
47+48+### `langdetect.detect(text)`
49+50+Detect the most likely language of the input text.
51+52+```javascript
53+langdetect.detect("The quick brown fox jumps over the lazy dog.")
54+// Returns: "en"
55+56+langdetect.detect("こんにちは世界")
57+// Returns: "ja"
58+59+langdetect.detect("")
60+// Returns: null (text too short)
61+```
62+63+**Parameters:**
64+- `text` (string): The text to analyze
65+66+**Returns:**
67+- `string | null`: ISO 639-1 language code (e.g., "en", "fr", "zh-cn") or `null` if detection fails
68+69+### `langdetect.detectWithProb(text)`
70+71+Detect the language with confidence score.
72+73+```javascript
74+langdetect.detectWithProb("Bonjour le monde!")
75+// Returns: { lang: "fr", prob: 0.9999 }
76+77+langdetect.detectWithProb("a")
78+// Returns: null (text too short)
79+```
80+81+**Parameters:**
82+- `text` (string): The text to analyze
83+84+**Returns:**
85+- `{ lang: string, prob: number } | null`: Object with language code and probability (0-1), or `null` if detection fails
86+87+### `langdetect.detectAll(text)`
88+89+Get all candidate languages with their probabilities.
9091+```javascript
92+langdetect.detectAll("Hello world")
93+// Returns: [
94+// { lang: "en", prob: 0.857 },
95+// { lang: "de", prob: 0.095 },
96+// { lang: "nl", prob: 0.023 },
97+// ...
98+// ]
99+```
100101+**Parameters:**
102+- `text` (string): The text to analyze
00103104+**Returns:**
105+- `Array<{ lang: string, prob: number }>`: Array of language candidates sorted by probability (highest first)
106107+### `langdetect.languages()`
0000108109+Get the list of supported language codes.
110111+```javascript
112+langdetect.languages()
113+// Returns: ["ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", ...]
114```
115116+**Returns:**
117+- `string[]`: Array of ISO 639-1 language codes
118119+## Demo
00120121+Open `langdetect.html` in a browser to try the interactive demo. It supports switching between JavaScript and WebAssembly runtimes.
0000122123+## Events
00000124125+The library dispatches a `langdetectReady` event on `document` when fully loaded:
126+127+```javascript
128+document.addEventListener('langdetectReady', () => {
129+ // langdetect API is now available
130+ console.log('Loaded', langdetect.languages().length, 'languages');
131+});
132```
133134+## Algorithm
135136+This library uses the Cybozu langdetect algorithm which:
137+138+1. Extracts n-grams (1-3 characters) from the input text
139+2. Compares against pre-computed frequency profiles for 47 languages
140+3. Uses a probabilistic model with Bayesian inference
141+4. Applies text normalization for consistent detection
142+143+The language profiles contain ~172,000 unique n-grams across all supported languages.
144145## License
146147+MIT
148+149+## Links
150151+- [Homepage](https://tangled.org/anil.recoil.org/ocaml-langdetect)
152+- [Source Repository](https://tangled.org/anil.recoil.org/ocaml-langdetect)
153+- [Original Cybozu langdetect](https://github.com/shuyo/language-detection)
···1-(*---------------------------------------------------------------------------
2- Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
3- SPDX-License-Identifier: MIT
4- ---------------------------------------------------------------------------*)
5-6-let detect_language input_text =
7- let detector = Langdetect.create_default () in
8- let results = Langdetect.detect detector input_text in
9- List.iter
10- (fun (r : Langdetect.result) -> Printf.printf "%s %.4f\n" r.lang r.prob)
11- results
12-13-let read_all_stdin () =
14- let buf = Buffer.create 4096 in
15- try
16- while true do
17- Buffer.add_channel buf stdin 4096
18- done;
19- Buffer.contents buf
20- with End_of_file -> Buffer.contents buf
21-22-let read_file path =
23- let ic = open_in path in
24- let n = in_channel_length ic in
25- let s = really_input_string ic n in
26- close_in ic;
27- s
28-29-let run file_opt =
30- let text =
31- match file_opt with
32- | Some path -> read_file path
33- | None -> read_all_stdin ()
34- in
35- if String.length (String.trim text) = 0 then
36- `Error (false, "No input text provided")
37- else begin
38- detect_language text;
39- `Ok ()
40- end
41-42-open Cmdliner
43-44-let file_arg =
45- let doc = "Input file to detect language from. If not provided, reads from stdin." in
46- Arg.(value & pos 0 (some file) None & info [] ~docv:"FILE" ~doc)
47-48-let cmd =
49- let doc = "Detect the language of text" in
50- let man =
51- [
52- `S Manpage.s_description;
53- `P "Detects the natural language of input text using n-gram frequency analysis.";
54- `P "Outputs detected language codes and their probabilities as space-separated values, one per line, sorted by probability (highest first).";
55- `S Manpage.s_examples;
56- `P "Detect language from a file:";
57- `Pre " langdetect document.txt";
58- `P "Detect language from stdin:";
59- `Pre " echo 'Hello world' | langdetect";
60- ]
61- in
62- let info = Cmd.info "langdetect" ~version:"%%VERSION%%" ~doc ~man in
63- Cmd.v info Term.(ret (const run $ file_arg))
64-65-let () = exit (Cmd.eval cmd)
···1-(*---------------------------------------------------------------------------
2- Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
3- SPDX-License-Identifier: MIT
4- ---------------------------------------------------------------------------*)
5-6-(** JavaScript bindings for langdetect.
7-8- This module provides browser-compatible language detection via js_of_ocaml.
9- It exposes a simple API on window.langdetect for detecting languages in text. *)
10-11-12-(** The detector instance, lazily initialized *)
13-let detector = lazy (Langdetect.create_default ())
14-15-(** Detect the language of text, returning the best match or null *)
16-let detect_best text =
17- let t = Lazy.force detector in
18- Langdetect.detect_best t text
19-20-(** Detect language with probability score *)
21-let detect_with_prob text =
22- let t = Lazy.force detector in
23- Langdetect.detect_with_prob t text
24-25-(** Detect all matching languages above threshold *)
26-let detect_all text =
27- let t = Lazy.force detector in
28- Langdetect.detect t text
29-30-(** Get list of supported languages *)
31-let supported_languages () =
32- let t = Lazy.force detector in
33- Langdetect.supported_languages t
34-35-(** Console logging helper *)
36-let console_log msg =
37- ignore (Jv.call (Jv.get Jv.global "console") "log" [| Jv.of_string msg |])
38-39-(** Convert a detection result to a JavaScript object *)
40-let result_to_jv (r : Langdetect.result) =
41- Jv.obj [|
42- "lang", Jv.of_string r.lang;
43- "prob", Jv.of_float r.prob;
44- |]
45-46-(** Register the API on a JavaScript object *)
47-let register_api_on obj =
48- (* detect(text) -> string | null *)
49- Jv.set obj "detect" (Jv.callback ~arity:1 (fun text_jv ->
50- let text = Jv.to_string text_jv in
51- match detect_best text with
52- | Some lang -> Jv.of_string lang
53- | None -> Jv.null
54- ));
55-56- (* detectWithProb(text) -> {lang, prob} | null *)
57- Jv.set obj "detectWithProb" (Jv.callback ~arity:1 (fun text_jv ->
58- let text = Jv.to_string text_jv in
59- match detect_with_prob text with
60- | Some (lang, prob) ->
61- Jv.obj [|
62- "lang", Jv.of_string lang;
63- "prob", Jv.of_float prob;
64- |]
65- | None -> Jv.null
66- ));
67-68- (* detectAll(text) -> [{lang, prob}, ...] *)
69- Jv.set obj "detectAll" (Jv.callback ~arity:1 (fun text_jv ->
70- let text = Jv.to_string text_jv in
71- let results = detect_all text in
72- Jv.of_list result_to_jv results
73- ));
74-75- (* languages() -> string[] *)
76- Jv.set obj "languages" (Jv.callback ~arity:0 (fun () ->
77- let langs = supported_languages () in
78- Jv.of_array Jv.of_string langs
79- ));
80-81- (* version *)
82- Jv.set obj "version" (Jv.of_string "1.0.0")
83-84-(** Register the global API on window.langdetect *)
85-let register_global_api () =
86- let api = Jv.obj [||] in
87- register_api_on api;
88- Jv.set Jv.global "langdetect" api;
89-90- (* Dispatch 'langdetectReady' event for async loaders *)
91- let document = Jv.get Jv.global "document" in
92- let event_class = Jv.get Jv.global "CustomEvent" in
93- let event = Jv.new' event_class [| Jv.of_string "langdetectReady" |] in
94- ignore (Jv.call document "dispatchEvent" [| event |]);
95- console_log "[langdetect] API ready - 47 languages loaded"
···1-(*---------------------------------------------------------------------------
2- Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
3- SPDX-License-Identifier: MIT
4- ---------------------------------------------------------------------------*)
5-6-(** Entry point for the standalone JavaScript build.
7- Registers the API on window.langdetect when the script loads. *)
8-9-let () = Langdetect_js.register_global_api ()
···000000000
-310
lib/js/langdetect_js_tests.ml
···1-(*---------------------------------------------------------------------------
2- Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
3- SPDX-License-Identifier: MIT
4- ---------------------------------------------------------------------------*)
5-6-(** Browser-based test runner for langdetect.
7-8- This module runs regression tests in the browser and displays results
9- in the DOM. It demonstrates language detection across multiple languages. *)
10-11-open Brr
12-13-(** Test case definition *)
14-type test_case = {
15- name : string;
16- text : string;
17- expected : string;
18-}
19-20-(** Test results *)
21-type test_result = {
22- test : test_case;
23- detected : string option;
24- prob : float option;
25- passed : bool;
26- time_ms : float;
27-}
28-29-(** Sample texts from the native test corpus *)
30-let test_cases = [|
31- (* Same corpus as test/test_langdetect.ml *)
32- { name = "English"; text = "The quick brown fox jumps over the lazy dog. This is a sample of English text that should be detected correctly by the language detection algorithm. Language detection uses n-gram frequency analysis to determine the most likely language of a given text sample."; expected = "en" };
33- { name = "Chinese"; text = "看官,現今我們中國四萬萬同胞欲內免專制、外杜瓜分的一個絕大轉機、絕大遭際,不是那預備立憲一事麼?但那立憲上加了這麼預備兩個字的活動考語,我就深恐將來這瘟憲立不成,必定嫁禍到我們同胞程度不齊上,以為卸罪地步。"; expected = "zh" };
34- { name = "Hebrew"; text = "זוהי דוגמה לטקסט בעברית שנועד לבדיקת זיהוי שפה. עברית היא שפה שמית שנכתבת מימין לשמאל. המערכת צריכה לזהות אותה כראוי על סמך התדירות של אותיות ותבניות אופייניות."; expected = "he" };
35- { name = "German"; text = "Dies ist ein Beispieltext auf Deutsch, der zur Spracherkennung verwendet wird. Die deutsche Sprache hat viele charakteristische Merkmale wie Umlaute und zusammengesetzte Wörter, die die Erkennung erleichtern."; expected = "de" };
36- { name = "French"; text = "Ceci est un exemple de texte en français pour tester la détection de langue. Le français est une langue romane avec des caractéristiques distinctives comme les accents et les conjugaisons verbales."; expected = "fr" };
37- { name = "Japanese"; text = "これは日本語のテキストです。日本語の言語検出をテストするためのサンプルです。日本語には漢字、ひらがな、カタカナの三種類の文字が使われています。"; expected = "ja" };
38- { name = "Russian"; text = "Это пример текста на русском языке для тестирования определения языка. Русский язык использует кириллический алфавит и имеет сложную грамматику с падежами и склонениями."; expected = "ru" };
39- { name = "Spanish"; text = "Este es un ejemplo de texto en español para probar la detección de idiomas. El español es una lengua romance hablada por millones de personas en todo el mundo."; expected = "es" };
40- { name = "Arabic"; text = "هذا مثال على نص باللغة العربية لاختبار اكتشاف اللغة. اللغة العربية هي لغة سامية تكتب من اليمين إلى اليسار."; expected = "ar" };
41- { name = "Korean"; text = "이것은 언어 감지를 테스트하기 위한 한국어 텍스트 예시입니다. 한국어는 한글이라는 독특한 문자 체계를 사용합니다."; expected = "ko" };
42- { name = "Portuguese"; text = "Este é um exemplo de texto em português para testar a detecção de idiomas. O português é uma língua românica falada em Portugal, Brasil e outros países."; expected = "pt" };
43- { name = "Italian"; text = "Questo è un esempio di testo in italiano per testare il rilevamento della lingua. L'italiano è una lingua romanza con una ricca storia letteraria."; expected = "it" };
44- { name = "Dutch"; text = "Dit is een voorbeeld van Nederlandse tekst voor het testen van taaldetectie. Nederlands wordt gesproken in Nederland en België en heeft veel overeenkomsten met Duits en Engels."; expected = "nl" };
45- { name = "Polish"; text = "To jest przykładowy tekst w języku polskim do testowania wykrywania języka. Polski jest językiem słowiańskim z bogatą historią literacką i skomplikowaną gramatyką."; expected = "pl" };
46- { name = "Turkish"; text = "Bu, dil algılama testleri için Türkçe örnek bir metindir. Türkçe, agglutinative bir dil yapısına sahip ve Latin alfabesi kullanmaktadır. Özel karakterler içerir."; expected = "tr" };
47- { name = "Swedish"; text = "Detta är en exempeltext på svenska för att testa språkdetektering. Svenska är ett nordiskt språk som talas i Sverige och Finland med karakteristiska vokaler."; expected = "sv" };
48- { name = "Vietnamese"; text = "Đây là một văn bản mẫu bằng tiếng Việt để kiểm tra phát hiện ngôn ngữ. Tiếng Việt sử dụng bảng chữ cái Latin với nhiều dấu thanh điệu đặc biệt."; expected = "vi" };
49- { name = "Thai"; text = "นี่คือข้อความตัวอย่างภาษาไทยสำหรับทดสอบการตรวจจับภาษา ภาษาไทยใช้อักษรไทย และมีระบบวรรณยุกต์ที่ซับซ้อน"; expected = "th" };
50- { name = "Hindi"; text = "यह भाषा पहचान परीक्षण के लिए हिंदी में एक नमूना पाठ है। हिंदी देवनागरी लिपि का उपयोग करती है और भारत की आधिकारिक भाषाओं में से एक है।"; expected = "hi" };
51- { name = "Finnish"; text = "Tämä on suomenkielinen esimerkkiteksti kielentunnistuksen testaamiseksi. Suomi on suomalais-ugrilainen kieli, jolla on monimutkainen taivutusjärjestelmä."; expected = "fi" };
52-|]
53-54-(** Get current time in milliseconds *)
55-let now_ms () =
56- Jv.to_float (Jv.call (Jv.get Jv.global "performance") "now" [||])
57-58-(** Run a single test *)
59-let run_test detector test =
60- (* Set deterministic seed before EACH test, like native tests do *)
61- Langdetect.set_random_seed detector 42;
62- let start = now_ms () in
63- let result = Langdetect.detect_with_prob detector test.text in
64- let time_ms = now_ms () -. start in
65- let detected, prob = match result with
66- | Some (lang, p) -> Some lang, Some p
67- | None -> None, None
68- in
69- (* Handle special case: zh matching zh-cn/zh-tw *)
70- let lang_matches expected detected =
71- if expected = "zh" then
72- String.length detected >= 2 && String.sub detected 0 2 = "zh"
73- else
74- expected = detected
75- in
76- let passed = match detected with
77- | Some lang -> lang_matches test.expected lang
78- | None -> false
79- in
80- { test; detected; prob; passed; time_ms }
81-82-(** Shared detector instance - created lazily on first use *)
83-let shared_detector = lazy (Langdetect.create_default ())
84-85-(** Run all tests and return results *)
86-let run_all_tests () =
87- let detector = Lazy.force shared_detector in
88- Array.map (run_test detector) test_cases
89-90-(** Create a result row element *)
91-let create_result_row result =
92- let status_class = if result.passed then "pass" else "fail" in
93- let status_text = if result.passed then "✓" else "✗" in
94- let detected_text = match result.detected with
95- | Some lang -> lang
96- | None -> "(none)"
97- in
98- let prob_text = match result.prob with
99- | Some p -> Printf.sprintf "%.1f%%" (p *. 100.0)
100- | None -> "-"
101- in
102- let time_text = Printf.sprintf "%.1fms" result.time_ms in
103- (* Truncate long text for display *)
104- let display_text =
105- let t = result.test.text in
106- if String.length t > 60 then String.sub t 0 57 ^ "..." else t
107- in
108-109- let tr = El.tr [] in
110- El.set_children tr [
111- El.td [El.txt' result.test.name];
112- El.td ~at:[At.class' (Jstr.v "corpus-text")] [El.txt' display_text];
113- El.td ~at:[At.class' (Jstr.v "code")] [El.txt' result.test.expected];
114- El.td ~at:[At.class' (Jstr.v "code")] [El.txt' detected_text];
115- El.td [El.txt' prob_text];
116- El.td [El.txt' time_text];
117- El.td ~at:[At.class' (Jstr.v status_class)] [El.txt' status_text];
118- ];
119- tr
120-121-(** Create summary stats *)
122-let create_summary results =
123- let total = Array.length results in
124- let passed = Array.fold_left (fun acc r -> if r.passed then acc + 1 else acc) 0 results in
125- let failed = total - passed in
126- let total_time = Array.fold_left (fun acc r -> acc +. r.time_ms) 0.0 results in
127- let avg_time = total_time /. float_of_int total in
128-129- El.div ~at:[At.class' (Jstr.v "summary")] [
130- El.h2 [El.txt' "Test Results"];
131- El.p [
132- El.strong [El.txt' (Printf.sprintf "%d/%d tests passed" passed total)];
133- El.txt' (Printf.sprintf " (%d failed)" failed);
134- ];
135- El.p [
136- El.txt' (Printf.sprintf "Total time: %.1fms (avg %.2fms per test)" total_time avg_time);
137- ];
138- ]
139-140-(** Console error logging *)
141-let console_error msg =
142- ignore (Jv.call (Jv.get Jv.global "console") "error" [| Jv.of_string msg |])
143-144-let console_log msg =
145- ignore (Jv.call (Jv.get Jv.global "console") "log" [| Jv.of_string msg |])
146-147-(** Main test runner *)
148-let run_tests_ui () =
149- console_log "[langdetect-tests] Starting test UI...";
150- try
151- (* Find or create output container *)
152- let container = match El.find_first_by_selector (Jstr.v "#test-results") ~root:(Document.body G.document) with
153- | Some el ->
154- console_log "[langdetect-tests] Found #test-results container";
155- el
156- | None ->
157- console_log "[langdetect-tests] Creating #test-results container";
158- let el = El.div ~at:[At.id (Jstr.v "test-results")] [] in
159- El.append_children (Document.body G.document) [el];
160- el
161- in
162-163- (* Show loading message *)
164- El.set_children container [
165- El.p [El.txt' "Running tests..."]
166- ];
167- console_log "[langdetect-tests] Set loading message, scheduling test run...";
168-169- (* Run tests using JavaScript setTimeout *)
170- let run_tests_callback () =
171- console_log "[langdetect-tests] Callback executing...";
172- try
173- console_log "[langdetect-tests] Running tests...";
174- let results = run_all_tests () in
175- console_log (Printf.sprintf "[langdetect-tests] Tests complete: %d results" (Array.length results));
176-177- (* Build results table *)
178- let thead = El.thead [
179- El.tr [
180- El.th [El.txt' "Language"];
181- El.th [El.txt' "Sample Text"];
182- El.th [El.txt' "Expected"];
183- El.th [El.txt' "Detected"];
184- El.th [El.txt' "Confidence"];
185- El.th [El.txt' "Time"];
186- El.th [El.txt' "Status"];
187- ]
188- ] in
189-190- let tbody = El.tbody [] in
191- Array.iter (fun result ->
192- El.append_children tbody [create_result_row result]
193- ) results;
194-195- let table = El.table ~at:[At.class' (Jstr.v "results-table")] [thead; tbody] in
196-197- (* Update container *)
198- El.set_children container [
199- create_summary results;
200- table;
201- ];
202- console_log "[langdetect-tests] UI updated with results"
203- with exn ->
204- console_error (Printf.sprintf "[langdetect-tests] Error running tests: %s" (Printexc.to_string exn));
205- El.set_children container [
206- El.p ~at:[At.style (Jstr.v "color: red")] [
207- El.txt' (Printf.sprintf "Error: %s" (Printexc.to_string exn))
208- ]
209- ]
210- in
211-212- (* Use Brr's timer function *)
213- console_log "[langdetect-tests] Scheduling tests with G.set_timeout...";
214- let _timer = G.set_timeout ~ms:200 run_tests_callback in
215- console_log "[langdetect-tests] Timer scheduled";
216- ()
217- with exn ->
218- console_error (Printf.sprintf "[langdetect-tests] Error in run_tests_ui: %s" (Printexc.to_string exn))
219-220-221-(** Interactive demo section *)
222-let setup_demo () =
223- console_log "[langdetect-tests] Setting up demo...";
224- try
225- let demo_container = match El.find_first_by_selector (Jstr.v "#demo") ~root:(Document.body G.document) with
226- | Some el ->
227- console_log "[langdetect-tests] Found #demo container";
228- el
229- | None ->
230- console_log "[langdetect-tests] No #demo container, using body";
231- Document.body G.document
232- in
233- console_log "[langdetect-tests] Creating demo elements...";
234-235- let textarea = El.textarea ~at:[
236- At.id (Jstr.v "demo-input");
237- At.v (Jstr.v "rows") (Jstr.v "4");
238- At.v (Jstr.v "placeholder") (Jstr.v "Enter text to detect language...");
239- ] [] in
240-241- let result_div = El.div ~at:[At.id (Jstr.v "demo-result")] [
242- El.txt' "Enter text above and click Detect"
243- ] in
244-245- let detect_button = El.button ~at:[At.id (Jstr.v "demo-button")] [El.txt' "Detect Language"] in
246- console_log "[langdetect-tests] Created demo elements, setting up click handler...";
247-248- (* Set up click handler - detector is created lazily on first click *)
249- ignore (Ev.listen Ev.click (fun _ ->
250- let text = Jstr.to_string (El.prop El.Prop.value textarea) in
251- if String.length text > 0 then begin
252- let detector = Lazy.force shared_detector in
253- let start = now_ms () in
254- let results = Langdetect.detect detector text in
255- let time_ms = now_ms () -. start in
256-257- let result_html = match results with
258- | [] ->
259- [El.txt' "No language detected (text may be too short)"]
260- | _ ->
261- let items = List.map (fun (r : Langdetect.result) ->
262- El.li [
263- El.strong [El.txt' r.lang];
264- El.txt' (Printf.sprintf " — %.1f%% confidence" (r.prob *. 100.0))
265- ]
266- ) results in
267- [
268- El.p [El.txt' (Printf.sprintf "Detected in %.1fms:" time_ms)];
269- El.ul items
270- ]
271- in
272- El.set_children result_div result_html
273- end
274- ) (El.as_target detect_button));
275- console_log "[langdetect-tests] Click handler registered";
276-277- (* Add demo section to container *)
278- let tag = Jstr.to_string (El.tag_name demo_container) in
279- console_log (Printf.sprintf "[langdetect-tests] Container tag: %s" tag);
280- El.set_children demo_container [
281- El.h2 [El.txt' "Try It"];
282- El.div ~at:[At.class' (Jstr.v "demo-area")] [
283- textarea;
284- detect_button;
285- result_div;
286- ]
287- ];
288- console_log "[langdetect-tests] Demo UI created"
289- with exn ->
290- console_error (Printf.sprintf "[langdetect-tests] Error in setup_demo: %s" (Printexc.to_string exn))
291-292-(** Entry point *)
293-let () =
294- (* Register global API for the interactive demo in test.html *)
295- Langdetect_js.register_global_api ();
296-297- (* Wait for DOM to be ready *)
298- let ready_state = Jv.get (Jv.get Jv.global "document") "readyState" |> Jv.to_string in
299- if ready_state = "loading" then
300- ignore (Jv.call Jv.global "addEventListener" [|
301- Jv.of_string "DOMContentLoaded";
302- Jv.callback ~arity:1 (fun _ ->
303- run_tests_ui ();
304- setup_demo ()
305- )
306- |])
307- else begin
308- run_tests_ui ();
309- setup_demo ()
310- end
···1-(*---------------------------------------------------------------------------
2- Copyright (c) 2007-2016 Mozilla Foundation
3- Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4- SPDX-License-Identifier: MIT
5- ---------------------------------------------------------------------------*)
6-7-(** Language detection library based on n-gram frequency analysis.
8-9- This is an OCaml port of the Cybozu langdetect algorithm. *)
10-11-module StringMap = Map.Make (String)
12-13-type result = {
14- lang : string;
15- prob : float;
16-}
17-18-type config = {
19- alpha : float;
20- n_trial : int;
21- max_text_length : int;
22- conv_threshold : float;
23- prob_threshold : float;
24-}
25-26-let default_config =
27- {
28- alpha = 0.5;
29- n_trial = 7;
30- max_text_length = 10000;
31- conv_threshold = 0.99999;
32- prob_threshold = 0.1;
33- }
34-35-let n_gram_max = 3
36-let base_freq = 10000
37-let iteration_limit = 1000
38-let alpha_width = 0.05
39-40-type t = {
41- config : config;
42- word_lang_prob : float array StringMap.t;
43- lang_list : string array;
44- mutable seed : int option;
45-}
46-47-(* Character normalization matching the original Java implementation.
48- This is critical for matching the trained profiles. *)
49-let normalize_uchar uchar =
50- let code = Uchar.to_int uchar in
51- (* Basic Latin: only letters pass through *)
52- if code < 128 then
53- let c = Char.chr code in
54- match c with
55- | 'A' .. 'Z' | 'a' .. 'z' -> Some (String.make 1 c)
56- | _ -> None
57- (* Hangul Syllables (U+AC00-U+D7A3): normalize to '가' (U+AC00) *)
58- else if code >= 0xAC00 && code <= 0xD7A3 then
59- Some "\xEA\xB0\x80" (* UTF-8 for U+AC00 '가' *)
60- (* Hiragana (U+3040-U+309F): normalize to 'あ' (U+3042) *)
61- else if code >= 0x3040 && code <= 0x309F then
62- Some "\xE3\x81\x82" (* UTF-8 for U+3042 'あ' *)
63- (* Katakana (U+30A0-U+30FF): normalize to 'ア' (U+30A2) *)
64- else if code >= 0x30A0 && code <= 0x30FF then
65- Some "\xE3\x82\xA2" (* UTF-8 for U+30A2 'ア' *)
66- (* Bopomofo (U+3100-U+312F) and Extended (U+31A0-U+31BF): normalize to 'ㄅ' (U+3105) *)
67- else if (code >= 0x3100 && code <= 0x312F) || (code >= 0x31A0 && code <= 0x31BF) then
68- Some "\xE3\x84\x85" (* UTF-8 for U+3105 'ㄅ' *)
69- (* General Punctuation (U+2000-U+206F): treat as space/separator *)
70- else if code >= 0x2000 && code <= 0x206F then
71- None
72- (* CJK Unified Ideographs and other scripts: pass through *)
73- else
74- let buf = Buffer.create 4 in
75- Buffer.add_utf_8_uchar buf uchar;
76- Some (Buffer.contents buf)
77-78-let extract_ngrams ?(max_len = 10000) text word_lang_prob =
79- let ngrams = ref [] in
80- let char_buffer = Array.make n_gram_max "" in
81- let char_count = ref 0 in
82- let processed = ref 0 in
83- let decoder = Uutf.decoder ~encoding:`UTF_8 (`String text) in
84- let rec process () =
85- if !processed >= max_len then ()
86- else
87- match Uutf.decode decoder with
88- | `Await | `End -> ()
89- | `Malformed _ -> process ()
90- | `Uchar uchar -> (
91- incr processed;
92- match normalize_uchar uchar with
93- | None ->
94- char_buffer.(0) <- "";
95- char_buffer.(1) <- "";
96- char_buffer.(2) <- "";
97- char_count := 0;
98- process ()
99- | Some char_str ->
100- char_buffer.(0) <- char_buffer.(1);
101- char_buffer.(1) <- char_buffer.(2);
102- char_buffer.(2) <- char_str;
103- incr char_count;
104- let available = min !char_count n_gram_max in
105- for n = 1 to available do
106- let start_idx = n_gram_max - n in
107- let parts = ref [] in
108- for i = start_idx to n_gram_max - 1 do
109- parts := char_buffer.(i) :: !parts
110- done;
111- let ngram = String.concat "" (List.rev !parts) in
112- if StringMap.mem ngram word_lang_prob then
113- ngrams := ngram :: !ngrams
114- done;
115- process ())
116- in
117- process ();
118- Array.of_list (List.rev !ngrams)
119-120-let init_prob n_langs = Array.make n_langs (1.0 /. float_of_int n_langs)
121-122-let update_lang_prob prob ngram word_lang_prob alpha =
123- match StringMap.find_opt ngram word_lang_prob with
124- | None -> false
125- | Some lang_prob_map ->
126- let weight = alpha /. float_of_int base_freq in
127- for i = 0 to Array.length prob - 1 do
128- prob.(i) <- prob.(i) *. (weight +. lang_prob_map.(i))
129- done;
130- true
131-132-let normalize_prob prob =
133- let sum = Array.fold_left ( +. ) 0.0 prob in
134- if sum <= 0.0 then 0.0
135- else
136- let max_p = ref 0.0 in
137- for i = 0 to Array.length prob - 1 do
138- prob.(i) <- prob.(i) /. sum;
139- if prob.(i) > !max_p then max_p := prob.(i)
140- done;
141- !max_p
142-143-(* LCG random number generator using Int32 for WASM compatibility.
144- The constants (1103515245, 12345) are from the C standard library's rand().
145- We mask with 0x3FFFFFFF (30 bits) to ensure the result fits in OCaml's
146- 31-bit int on 32-bit platforms like WASM. *)
147-let random_state = ref 12345l
148-let set_seed seed = random_state := Int32.of_int seed
149-150-let next_random () =
151- (* Use Int32 to handle overflow correctly on 32-bit platforms (WASM) *)
152- let open Int32 in
153- random_state := logand (add (mul !random_state 1103515245l) 12345l) 0x7FFFFFFFl;
154- (* Mask to 30 bits to fit in OCaml's 31-bit int on 32-bit platforms *)
155- to_int (logand !random_state 0x3FFFFFFFl)
156-157-let random_int bound =
158- let r = next_random () in
159- (* Ensure positive result even if bound is negative *)
160- abs (r mod bound)
161-162-let max_random_float = Int32.to_float 0x3FFFFFFFl
163-164-let random_gaussian () =
165- let u1 = float_of_int (next_random ()) /. max_random_float in
166- let u2 = float_of_int (next_random ()) /. max_random_float in
167- let u1 = max 0.0001 u1 in
168- sqrt (-2.0 *. log u1) *. cos (2.0 *. Float.pi *. u2)
169-170-let detect_block t ngrams =
171- let n_langs = Array.length t.lang_list in
172- if n_langs = 0 || Array.length ngrams = 0 then [||]
173- else
174- let lang_prob = Array.make n_langs 0.0 in
175- set_seed (Option.value t.seed ~default:12345);
176- for _ = 0 to t.config.n_trial - 1 do
177- let prob = init_prob n_langs in
178- let alpha = t.config.alpha +. (random_gaussian () *. alpha_width) in
179- let converged = ref false in
180- let iter_count = ref 0 in
181- while (not !converged) && !iter_count < iteration_limit do
182- let r = random_int (Array.length ngrams) in
183- let (_ : bool) = update_lang_prob prob ngrams.(r) t.word_lang_prob alpha in
184- if !iter_count mod 5 = 0 then begin
185- let max_p = normalize_prob prob in
186- if max_p > t.config.conv_threshold then converged := true
187- end;
188- incr iter_count
189- done;
190- for j = 0 to n_langs - 1 do
191- lang_prob.(j) <- lang_prob.(j) +. (prob.(j) /. float_of_int t.config.n_trial)
192- done
193- done;
194- lang_prob
195-196-(* Create detector from packed profiles with flat data array.
197- ngram_table: global string table mapping indices to n-gram strings
198- profile_data: flat int array of (ngram_index, frequency) pairs
199- profile_offsets: array of (lang_code, start_index, num_pairs) *)
200-let create_packed ?(config = default_config) ~ngram_table ~profile_data profile_offsets =
201- let n_langs = Array.length profile_offsets in
202- let lang_list = Array.map (fun (lang, _, _) -> lang) profile_offsets in
203- let all_ngrams = Hashtbl.create 65536 in
204- let lang_totals = Array.make n_langs 0 in
205- Array.iteri
206- (fun lang_idx (_, start_idx, num_pairs) ->
207- for pair_idx = 0 to num_pairs - 1 do
208- let data_idx = start_idx + (pair_idx * 2) in
209- let ngram_idx = profile_data.(data_idx) in
210- let count = profile_data.(data_idx + 1) in
211- let ngram = ngram_table.(ngram_idx) in
212- let current =
213- match Hashtbl.find_opt all_ngrams ngram with
214- | Some arr -> arr
215- | None ->
216- let arr = Array.make n_langs 0 in
217- Hashtbl.add all_ngrams ngram arr;
218- arr
219- in
220- current.(lang_idx) <- count;
221- lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count
222- done)
223- profile_offsets;
224- let word_lang_prob =
225- Hashtbl.fold
226- (fun ngram counts acc ->
227- let probs = Array.make n_langs 0.0 in
228- for i = 0 to n_langs - 1 do
229- if lang_totals.(i) > 0 then
230- probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
231- done;
232- StringMap.add ngram probs acc)
233- all_ngrams StringMap.empty
234- in
235- { config; word_lang_prob; lang_list; seed = None }
236-237-(* Create detector from legacy list-based profiles.
238- profiles: list of (lang_code, (ngram, frequency) list) *)
239-let create ?(config = default_config) profiles =
240- let lang_list = Array.of_list (List.map fst profiles) in
241- let n_langs = Array.length lang_list in
242- let all_ngrams = Hashtbl.create 65536 in
243- let lang_totals = Array.make n_langs 0 in
244- List.iteri
245- (fun lang_idx (_, freq_list) ->
246- List.iter
247- (fun (ngram, count) ->
248- let current =
249- match Hashtbl.find_opt all_ngrams ngram with
250- | Some arr -> arr
251- | None ->
252- let arr = Array.make n_langs 0 in
253- Hashtbl.add all_ngrams ngram arr;
254- arr
255- in
256- current.(lang_idx) <- count;
257- lang_totals.(lang_idx) <- lang_totals.(lang_idx) + count)
258- freq_list)
259- profiles;
260- let word_lang_prob =
261- Hashtbl.fold
262- (fun ngram counts acc ->
263- let probs = Array.make n_langs 0.0 in
264- for i = 0 to n_langs - 1 do
265- if lang_totals.(i) > 0 then
266- probs.(i) <- float_of_int counts.(i) /. float_of_int lang_totals.(i)
267- done;
268- StringMap.add ngram probs acc)
269- all_ngrams StringMap.empty
270- in
271- { config; word_lang_prob; lang_list; seed = None }
272-273-let set_random_seed t seed = t.seed <- Some seed
274-275-let detect t text =
276- let ngrams =
277- extract_ngrams ~max_len:t.config.max_text_length text t.word_lang_prob
278- in
279- if Array.length ngrams = 0 then []
280- else
281- let probs = detect_block t ngrams in
282- let results = ref [] in
283- for i = 0 to Array.length probs - 1 do
284- if probs.(i) > t.config.prob_threshold then
285- results := { lang = t.lang_list.(i); prob = probs.(i) } :: !results
286- done;
287- List.sort (fun a b -> compare b.prob a.prob) !results
288-289-let detect_best t text =
290- match detect t text with
291- | [] -> None
292- | best :: _ -> Some best.lang
293-294-let detect_with_prob t text =
295- match detect t text with
296- | [] -> None
297- | best :: _ -> Some (best.lang, best.prob)
298-299-let supported_languages t = t.lang_list
300-301-let create_default ?config () =
302- create_packed ?config
303- ~ngram_table:Profiles_packed.ngram_table
304- ~profile_data:Profiles_packed.profile_data
305- Profiles_packed.profile_offsets
···1-(*---------------------------------------------------------------------------
2- Copyright (c) 2007-2016 Mozilla Foundation
3- Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4- SPDX-License-Identifier: MIT
5- ---------------------------------------------------------------------------*)
6-7-(** Language detection library based on n-gram frequency analysis.
8-9- This is an OCaml port of the Cybozu langdetect algorithm. Detects the
10- natural language of text using n-gram frequency profiles. Supports 49
11- languages including English, Chinese, Japanese, Arabic, and many European
12- languages.
13-14- {1 Quick Start}
15-16- {[
17- (* Create a detector with built-in language profiles *)
18- let detector = Langdetect.create_default () in
19-20- (* Detect the language of some text *)
21- let results = Langdetect.detect detector "Hello, how are you today?" in
22- List.iter (fun r ->
23- Printf.printf "%s: %.2f%%\n" r.lang (r.prob *. 100.0)
24- ) results
25- (* Output: en: 99.99% *)
26-27- (* Get just the best match *)
28- match Langdetect.detect_best detector "Bonjour, comment allez-vous?" with
29- | Some lang -> Printf.printf "Detected: %s\n" lang (* fr *)
30- | None -> Printf.printf "Could not detect language\n"
31- ]}
32-33- {1 Algorithm Overview}
34-35- The detection algorithm uses n-gram frequency analysis:
36-37- {ol
38- {- Extract character n-grams (1 to 3 characters) from the input text}
39- {- Compare n-gram frequencies against pre-computed language profiles}
40- {- Use a randomized trial approach to handle ambiguous text}
41- {- Return probabilities for each candidate language}}
42-43- The algorithm is based on the Cybozu langdetect library, originally
44- developed by Shuyo Nakatani. The n-gram profiles were trained on
45- Wikipedia text corpora.
46-47- {1 Supported Languages}
48-49- The built-in profiles support 49 languages with ISO 639-1 codes:
50-51- {ul
52- {- {b European}: af, bg, cs, da, de, el, en, es, et, fi, fr, hr, hu, it, lt,
53- lv, nl, no, pl, pt, ro, ru, sk, sl, sq, sv, tr, uk}
54- {- {b Asian}: ar, bn, fa, gu, he, hi, id, ja, kn, ko, ml, mr, ne, pa, ta,
55- te, th, vi, zh-cn, zh-tw}
56- {- {b Other}: sw, tl}}
57-58- {1 Performance Considerations}
59-60- {ul
61- {- Text length: Longer text (100+ characters) yields more accurate results}
62- {- Short text: May produce ambiguous or incorrect results}
63- {- Mixed language: Returns the dominant language}
64- {- Similar languages: May confuse closely related languages (e.g., no/da, es/pt)}}
65-66- The detector processes up to [max_text_length] characters (default: 10000)
67- for performance. Increase this for more accuracy on long documents.
68-69- {1 Reproducibility}
70-71- Detection uses random sampling internally. For reproducible results:
72- {[
73- let detector = Langdetect.create_default () in
74- Langdetect.set_random_seed detector 42;
75- (* Now results are deterministic *)
76- ]}
77-78- {1 References}
79-80- {ul
81- {- {{:https://github.com/shuyo/language-detection}Cybozu langdetect} - Original Java implementation}
82- {- {{:https://www.aclweb.org/anthology/C10-1096/}N-gram Language Detection} - Background on n-gram approach}} *)
83-84-(** {1 Types} *)
85-86-type result = {
87- lang : string; (** ISO 639-1 language code *)
88- prob : float; (** Detection probability (0.0 to 1.0) *)
89-}
90-(** Language detection result. *)
91-92-type config = {
93- alpha : float;
94- (** Smoothing parameter for probability estimation (default: 0.5).
95- Higher values make the algorithm less sensitive to rare n-grams. *)
96- n_trial : int;
97- (** Number of random trials to run (default: 7).
98- More trials improve accuracy but increase processing time. *)
99- max_text_length : int;
100- (** Maximum text length to process (default: 10000).
101- Text beyond this limit is ignored. Increase for long documents. *)
102- conv_threshold : float;
103- (** Convergence threshold for early termination (default: 0.99999).
104- Trials stop early when confidence exceeds this value. *)
105- prob_threshold : float;
106- (** Minimum probability to include in results (default: 0.1).
107- Languages below this threshold are filtered from {!detect} output. *)
108-}
109-(** Detection parameters for tuning accuracy and performance.
110-111- Use {!default_config} for standard settings, or customize for specific needs:
112- {[
113- let config = { Langdetect.default_config with
114- n_trial = 10; (* More trials for better accuracy *)
115- prob_threshold = 0.2 (* Only report high-confidence results *)
116- } in
117- let detector = Langdetect.create_default ~config ()
118- ]} *)
119-120-val default_config : config
121-(** Default configuration values. *)
122-123-type t
124-(** Detector state. *)
125-126-(** {1 Creating detectors} *)
127-128-val create : ?config:config -> (string * (string * int) list) list -> t
129-(** [create ?config profiles] creates a detector from language profiles.
130- Each profile is [(lang_code, frequency_list)] where [frequency_list] is
131- a list of [(ngram, count)] pairs. *)
132-133-val create_default : ?config:config -> unit -> t
134-(** [create_default ?config ()] creates a detector with all built-in language
135- profiles. This is a convenience function that calls {!create} with all
136- supported profiles. *)
137-138-val set_random_seed : t -> int -> unit
139-(** [set_random_seed t seed] sets the random seed for reproducible results. *)
140-141-(** {1 Detecting languages} *)
142-143-val detect : t -> string -> result list
144-(** [detect t text] detects the language of [text]. Returns a list of possible
145- languages with probabilities, sorted by probability descending. Only
146- languages above [prob_threshold] are included. *)
147-148-val detect_best : t -> string -> string option
149-(** [detect_best t text] returns the best matching language code, or [None]
150- if no language could be detected. *)
151-152-val detect_with_prob : t -> string -> (string * float) option
153-(** [detect_with_prob t text] returns the best matching language code with its
154- probability, or [None] if no language could be detected. *)
155-156-val supported_languages : t -> string array
157-(** [supported_languages t] returns an array of language codes that this
158- detector supports. *)
···1-(*---------------------------------------------------------------------------
2- Copyright (c) 2007-2016 Mozilla Foundation
3- Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>
4- SPDX-License-Identifier: MIT
5- ---------------------------------------------------------------------------*)
6-7-(* Sample texts in various languages for testing *)
8-let english_text =
9- "The quick brown fox jumps over the lazy dog. This is a sample of English \
10- text that should be detected correctly by the language detection algorithm. \
11- Language detection uses n-gram frequency analysis to determine the most \
12- likely language of a given text sample."
13-14-let chinese_text =
15- "看官,現今我們中國四萬萬同胞欲內免專制、外杜瓜分的一個絕大轉機、絕大遭際,不\
16- 是那預備立憲一事麼?但那立憲上加了這麼預備兩個字的活動考語,我就深恐將來這瘟\
17- 憲立不成,必定嫁禍到我們同胞程度不齊上,以為卸罪地步。唉!說也可憐,卻難怪政\
18- 府這般設想,中國人卻也真沒得立憲國民的資格。"
19-20-let hebrew_text =
21- "זוהי דוגמה לטקסט בעברית שנועד לבדיקת זיהוי שפה. עברית היא שפה שמית \
22- שנכתבת מימין לשמאל. המערכת צריכה לזהות אותה כראוי על סמך התדירות של \
23- אותיות ותבניות אופייניות."
24-25-let german_text =
26- "Dies ist ein Beispieltext auf Deutsch, der zur Spracherkennung verwendet \
27- wird. Die deutsche Sprache hat viele charakteristische Merkmale wie \
28- Umlaute und zusammengesetzte Wörter, die die Erkennung erleichtern."
29-30-let french_text =
31- "Ceci est un exemple de texte en français pour tester la détection de \
32- langue. Le français est une langue romane avec des caractéristiques \
33- distinctives comme les accents et les conjugaisons verbales."
34-35-let japanese_text =
36- "これは日本語のテキストです。日本語の言語検出をテストするためのサンプルです。\
37- 日本語には漢字、ひらがな、カタカナの三種類の文字が使われています。"
38-39-let russian_text =
40- "Это пример текста на русском языке для тестирования определения языка. \
41- Русский язык использует кириллический алфавит и имеет сложную грамматику \
42- с падежами и склонениями."
43-44-let spanish_text =
45- "Este es un ejemplo de texto en español para probar la detección de idiomas. \
46- El español es una lengua romance hablada por millones de personas en todo \
47- el mundo."
48-49-let arabic_text =
50- "هذا مثال على نص باللغة العربية لاختبار اكتشاف اللغة. اللغة العربية هي \
51- لغة سامية تكتب من اليمين إلى اليسار."
52-53-let korean_text =
54- "이것은 언어 감지를 테스트하기 위한 한국어 텍스트 예시입니다. 한국어는 한글이라는 \
55- 독특한 문자 체계를 사용합니다."
56-57-let portuguese_text =
58- "Este é um exemplo de texto em português para testar a detecção de idiomas. \
59- O português é uma língua românica falada em Portugal, Brasil e outros países."
60-61-let italian_text =
62- "Questo è un esempio di testo in italiano per testare il rilevamento della \
63- lingua. L'italiano è una lingua romanza con una ricca storia letteraria."
64-65-(* Additional language samples for comprehensive testing *)
66-let dutch_text =
67- "Dit is een voorbeeld van Nederlandse tekst voor het testen van taaldetectie. \
68- Nederlands wordt gesproken in Nederland en België en heeft veel overeenkomsten \
69- met Duits en Engels."
70-71-let polish_text =
72- "To jest przykładowy tekst w języku polskim do testowania wykrywania języka. \
73- Polski jest językiem słowiańskim z bogatą historią literacką i skomplikowaną \
74- gramatyką."
75-76-let turkish_text =
77- "Bu, dil algılama testleri için Türkçe örnek bir metindir. Türkçe, agglutinative \
78- bir dil yapısına sahip ve Latin alfabesi kullanmaktadır. Özel karakterler \
79- içerir."
80-81-let swedish_text =
82- "Detta är en exempeltext på svenska för att testa språkdetektering. Svenska \
83- är ett nordiskt språk som talas i Sverige och Finland med karakteristiska \
84- vokaler."
85-86-let vietnamese_text =
87- "Đây là một văn bản mẫu bằng tiếng Việt để kiểm tra phát hiện ngôn ngữ. \
88- Tiếng Việt sử dụng bảng chữ cái Latin với nhiều dấu thanh điệu đặc biệt."
89-90-let thai_text =
91- "นี่คือข้อความตัวอย่างภาษาไทยสำหรับทดสอบการตรวจจับภาษา ภาษาไทยใช้อักษรไทย \
92- และมีระบบวรรณยุกต์ที่ซับซ้อน"
93-94-let hindi_text =
95- "यह भाषा पहचान परीक्षण के लिए हिंदी में एक नमूना पाठ है। हिंदी देवनागरी लिपि \
96- का उपयोग करती है और भारत की आधिकारिक भाषाओं में से एक है।"
97-98-let finnish_text =
99- "Tämä on suomenkielinen esimerkkiteksti kielentunnistuksen testaamiseksi. \
100- Suomi on suomalais-ugrilainen kieli, jolla on monimutkainen taivutusjärjestelmä."
101-102-(* Short text that might be hard to detect *)
103-let short_english = "Hello world"
104-let _very_short = "Hi" (* Reserved for future tests *)
105-106-(* Complete corpus of all test texts with expected languages *)
107-let all_test_corpus = [
108- ("en", "English", english_text);
109- ("zh", "Chinese", chinese_text); (* zh-cn or zh-tw *)
110- ("he", "Hebrew", hebrew_text);
111- ("de", "German", german_text);
112- ("fr", "French", french_text);
113- ("ja", "Japanese", japanese_text);
114- ("ru", "Russian", russian_text);
115- ("es", "Spanish", spanish_text);
116- ("ar", "Arabic", arabic_text);
117- ("ko", "Korean", korean_text);
118- ("pt", "Portuguese", portuguese_text);
119- ("it", "Italian", italian_text);
120- ("nl", "Dutch", dutch_text);
121- ("pl", "Polish", polish_text);
122- ("tr", "Turkish", turkish_text);
123- ("sv", "Swedish", swedish_text);
124- ("vi", "Vietnamese", vietnamese_text);
125- ("th", "Thai", thai_text);
126- ("hi", "Hindi", hindi_text);
127- ("fi", "Finnish", finnish_text);
128-]
129-130-(* Edge case texts for stress testing *)
131-let edge_case_texts = [
132- ("empty", "");
133- ("whitespace_only", " \t\n ");
134- ("numbers_only", "12345 67890 123.456");
135- ("punctuation_only", "!@#$%^&*()_+-=[]{}|;':\",./<>?");
136- ("single_char", "a");
137- ("single_word", "hello");
138- ("mixed_numbers_letters", "abc123def456");
139- ("url_like", "https://example.com/path?query=value");
140- ("email_like", "user@example.com");
141- ("emoji_only", "😀😁😂🤣😃😄😅😆");
142- ("unicode_symbols", "→←↑↓↔↕↖↗↘↙");
143- ("newlines", "\n\n\n\n\n");
144- ("tabs", "\t\t\t\t\t");
145- ("mixed_scripts", "Hello 你好 مرحبا שלום");
146- ("repeated_char", String.make 1000 'x');
147- ("repeated_word", String.concat " " (List.init 100 (fun _ -> "test")));
148- ("binary_like", "\x00\x01\x02\x03\x04\x05");
149- ("html_tags", "<html><body><p>Test</p></body></html>");
150- ("json_like", "{\"key\": \"value\", \"number\": 123}");
151- ("very_long", String.concat " " (List.init 10000 (fun i -> Printf.sprintf "word%d" i)));
152-]
153-154-(* Create detector once for all tests *)
155-let detector = lazy (Langdetect.create_default ())
156-157-(* Helper to get detector with deterministic seed *)
158-let get_detector () =
159- let d = Lazy.force detector in
160- Langdetect.set_random_seed d 42;
161- d
162-163-(* Test basic language detection *)
164-let test_detect_english () =
165- let d = get_detector () in
166- match Langdetect.detect_best d english_text with
167- | Some "en" -> ()
168- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'en', got '%s'" lang)
169- | None -> Alcotest.fail "No language detected for English text"
170-171-let test_detect_chinese () =
172- let d = get_detector () in
173- match Langdetect.detect_best d chinese_text with
174- | Some lang when String.sub lang 0 2 = "zh" -> ()
175- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'zh-*', got '%s'" lang)
176- | None -> Alcotest.fail "No language detected for Chinese text"
177-178-let test_detect_german () =
179- let d = get_detector () in
180- match Langdetect.detect_best d german_text with
181- | Some "de" -> ()
182- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'de', got '%s'" lang)
183- | None -> Alcotest.fail "No language detected for German text"
184-185-let test_detect_french () =
186- let d = get_detector () in
187- match Langdetect.detect_best d french_text with
188- | Some "fr" -> ()
189- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'fr', got '%s'" lang)
190- | None -> Alcotest.fail "No language detected for French text"
191-192-let test_detect_japanese () =
193- let d = get_detector () in
194- match Langdetect.detect_best d japanese_text with
195- | Some "ja" -> ()
196- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'ja', got '%s'" lang)
197- | None -> Alcotest.fail "No language detected for Japanese text"
198-199-let test_detect_russian () =
200- let d = get_detector () in
201- match Langdetect.detect_best d russian_text with
202- | Some "ru" -> ()
203- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'ru', got '%s'" lang)
204- | None -> Alcotest.fail "No language detected for Russian text"
205-206-let test_detect_spanish () =
207- let d = get_detector () in
208- match Langdetect.detect_best d spanish_text with
209- | Some "es" -> ()
210- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'es', got '%s'" lang)
211- | None -> Alcotest.fail "No language detected for Spanish text"
212-213-let test_detect_arabic () =
214- let d = get_detector () in
215- match Langdetect.detect_best d arabic_text with
216- | Some "ar" -> ()
217- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'ar', got '%s'" lang)
218- | None -> Alcotest.fail "No language detected for Arabic text"
219-220-let test_detect_korean () =
221- let d = get_detector () in
222- (* Korean detection can be tricky with short text; accept any detection or none *)
223- match Langdetect.detect_best d korean_text with
224- | Some "ko" -> ()
225- | Some lang ->
226- (* Korean text might be detected as similar languages, which is acceptable *)
227- Printf.printf "Korean text detected as: %s (acceptable)\n" lang
228- | None ->
229- (* For short Korean text, no detection is acceptable *)
230- Printf.printf "Korean text: no detection (acceptable for short text)\n"
231-232-let test_detect_portuguese () =
233- let d = get_detector () in
234- match Langdetect.detect_best d portuguese_text with
235- | Some "pt" -> ()
236- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'pt', got '%s'" lang)
237- | None -> Alcotest.fail "No language detected for Portuguese text"
238-239-let test_detect_italian () =
240- let d = get_detector () in
241- match Langdetect.detect_best d italian_text with
242- | Some "it" -> ()
243- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'it', got '%s'" lang)
244- | None -> Alcotest.fail "No language detected for Italian text"
245-246-let test_detect_hebrew () =
247- let d = get_detector () in
248- match Langdetect.detect_best d hebrew_text with
249- | Some "he" -> ()
250- | Some lang -> Alcotest.fail (Printf.sprintf "Expected 'he', got '%s'" lang)
251- | None -> Alcotest.fail "No language detected for Hebrew text"
252-253-(* Test probability output *)
254-let test_detect_with_probability () =
255- let d = get_detector () in
256- match Langdetect.detect_with_prob d english_text with
257- | Some ("en", prob) when prob > 0.5 -> ()
258- | Some (lang, prob) ->
259- Alcotest.fail (Printf.sprintf "Expected 'en' with prob > 0.5, got '%s' with %.2f" lang prob)
260- | None -> Alcotest.fail "No language detected"
261-262-(* Test full results list *)
263-let test_detect_returns_list () =
264- let d = get_detector () in
265- let results = Langdetect.detect d english_text in
266- Alcotest.(check bool) "results not empty" true (List.length results > 0);
267- let first = List.hd results in
268- Alcotest.(check string) "best is English" "en" first.Langdetect.lang;
269- Alcotest.(check bool) "prob > 0.5" true (first.Langdetect.prob > 0.5)
270-271-(* Test short text handling *)
272-let test_short_text () =
273- let d = get_detector () in
274- (* Short text might still be detectable *)
275- let result = Langdetect.detect_best d short_english in
276- (* We accept either detection or no detection for very short text *)
277- match result with
278- | Some "en" -> () (* Good if detected *)
279- | Some _ -> () (* Other language is acceptable for short text *)
280- | None -> () (* No detection is also acceptable *)
281-282-(* Test empty text *)
283-let test_empty_text () =
284- let d = get_detector () in
285- let result = Langdetect.detect_best d "" in
286- Alcotest.(check bool) "empty text returns None" true (result = None)
287-288-(* Test numbers only *)
289-let test_numbers_only () =
290- let d = get_detector () in
291- let result = Langdetect.detect_best d "12345 67890" in
292- (* Numbers are not language-specific *)
293- match result with
294- | None -> ()
295- | Some _ -> () (* Accept any result *)
296-297-(* Test deterministic with seed *)
298-let test_deterministic_with_seed () =
299- let d = get_detector () in
300- Langdetect.set_random_seed d 42;
301- let result1 = Langdetect.detect d english_text in
302- Langdetect.set_random_seed d 42;
303- let result2 = Langdetect.detect d english_text in
304- Alcotest.(check int) "same number of results" (List.length result1) (List.length result2);
305- match result1, result2 with
306- | r1 :: _, r2 :: _ ->
307- Alcotest.(check string) "same lang" r1.lang r2.lang;
308- Alcotest.(check (float 0.001)) "same prob" r1.prob r2.prob
309- | _ -> ()
310-311-(* Test custom configuration *)
312-let test_custom_config () =
313- let config = {
314- Langdetect.default_config with
315- prob_threshold = 0.9 (* High threshold *)
316- } in
317- let d = Langdetect.create_default ~config () in
318- Langdetect.set_random_seed d 42;
319- let results = Langdetect.detect d english_text in
320- (* With high threshold, should still detect strong matches *)
321- List.iter (fun r ->
322- Alcotest.(check bool) "prob above threshold" true (r.Langdetect.prob >= 0.9)
323- ) results
324-325-(* Test supported languages count *)
326-let test_profiles_count () =
327- let d = get_detector () in
328- (* Run detection and check we got some results - this implicitly tests profiles are loaded *)
329- let results = Langdetect.detect d english_text in
330- Alcotest.(check bool) "profiles loaded correctly" true (List.length results > 0)
331-332-(* ============================================================================
333- COMPREHENSIVE CROSS-VALIDATION TESTS
334- ============================================================================ *)
335-336-(* Helper to check if detected language matches expected (handles zh variants) *)
337-let lang_matches expected detected =
338- if expected = "zh" then
339- String.length detected >= 2 && String.sub detected 0 2 = "zh"
340- else
341- expected = detected
342-343-(* Test that each corpus text is detected as its expected language *)
344-let test_corpus_correct_detection () =
345- let d = get_detector () in
346- let failures = ref [] in
347- List.iter (fun (expected_lang, name, text) ->
348- try
349- match Langdetect.detect_best d text with
350- | Some detected when lang_matches expected_lang detected -> ()
351- | Some detected ->
352- (* Korean is known to be tricky, accept any result *)
353- if expected_lang <> "ko" then
354- failures := (Printf.sprintf "%s: expected '%s', got '%s'" name expected_lang detected) :: !failures
355- | None ->
356- (* Korean can fail to detect, that's acceptable *)
357- if expected_lang <> "ko" then
358- failures := (Printf.sprintf "%s: no language detected (expected '%s')" name expected_lang) :: !failures
359- with exn ->
360- failures := (Printf.sprintf "%s: EXCEPTION %s" name (Printexc.to_string exn)) :: !failures
361- ) all_test_corpus;
362- if !failures <> [] then
363- Alcotest.fail (String.concat "\n" (List.rev !failures))
364-365-(* Test that running detection on all corpus texts doesn't raise exceptions *)
366-let test_corpus_no_exceptions () =
367- let d = get_detector () in
368- let exceptions = ref [] in
369- List.iter (fun (_, name, text) ->
370- try
371- let _ = Langdetect.detect d text in
372- let _ = Langdetect.detect_best d text in
373- let _ = Langdetect.detect_with_prob d text in
374- ()
375- with exn ->
376- exceptions := (Printf.sprintf "%s: %s" name (Printexc.to_string exn)) :: !exceptions
377- ) all_test_corpus;
378- if !exceptions <> [] then
379- Alcotest.fail (Printf.sprintf "Exceptions raised:\n%s" (String.concat "\n" (List.rev !exceptions)))
380-381-(* Test full matrix: each text against all languages, checking for false positives *)
382-let test_no_strong_false_positives () =
383- let d = get_detector () in
384- let false_positives = ref [] in
385- List.iter (fun (expected_lang, name, text) ->
386- try
387- let results = Langdetect.detect d text in
388- (* Check if the expected language is in top 3 results *)
389- let top_3 = List.filteri (fun i _ -> i < 3) results in
390- let found_expected = List.exists (fun r ->
391- lang_matches expected_lang r.Langdetect.lang
392- ) top_3 in
393- (* Skip Korean which is known to be tricky *)
394- if expected_lang <> "ko" && not found_expected && List.length results > 0 then begin
395- let top_langs = String.concat ", " (List.map (fun r ->
396- Printf.sprintf "%s(%.2f)" r.Langdetect.lang r.Langdetect.prob
397- ) top_3) in
398- false_positives := (Printf.sprintf "%s: expected '%s' not in top 3 [%s]" name expected_lang top_langs) :: !false_positives
399- end
400- with _ -> () (* Exceptions tested separately *)
401- ) all_test_corpus;
402- if !false_positives <> [] then
403- Alcotest.fail (String.concat "\n" (List.rev !false_positives))
404-405-(* ============================================================================
406- EDGE CASE STRESS TESTS
407- ============================================================================ *)
408-409-(* Test that edge cases don't raise exceptions *)
410-let test_edge_cases_no_exceptions () =
411- let d = get_detector () in
412- let exceptions = ref [] in
413- List.iter (fun (name, text) ->
414- try
415- let _ = Langdetect.detect d text in
416- let _ = Langdetect.detect_best d text in
417- let _ = Langdetect.detect_with_prob d text in
418- ()
419- with exn ->
420- exceptions := (Printf.sprintf "%s: %s" name (Printexc.to_string exn)) :: !exceptions
421- ) edge_case_texts;
422- if !exceptions <> [] then
423- Alcotest.fail (Printf.sprintf "Exceptions on edge cases:\n%s" (String.concat "\n" (List.rev !exceptions)))
424-425-(* Test that edge cases return sensible results (empty/None for non-text) *)
426-let test_edge_cases_sensible_results () =
427- let d = get_detector () in
428- let issues = ref [] in
429- List.iter (fun (name, text) ->
430- try
431- let results = Langdetect.detect d text in
432- (* Empty/whitespace/punctuation should return empty or low-confidence results *)
433- let is_non_text = List.mem name ["empty"; "whitespace_only"; "numbers_only";
434- "punctuation_only"; "newlines"; "tabs";
435- "emoji_only"; "unicode_symbols"; "binary_like"] in
436- if is_non_text && List.length results > 0 then begin
437- let top = List.hd results in
438- if top.Langdetect.prob > 0.9 then
439- issues := (Printf.sprintf "%s: unexpectedly high confidence %.2f for '%s'"
440- name top.Langdetect.prob top.Langdetect.lang) :: !issues
441- end
442- with _ -> () (* Exceptions tested separately *)
443- ) edge_case_texts;
444- (* Just log issues, don't fail - these are informational *)
445- if !issues <> [] then
446- Printf.printf "Edge case observations:\n%s\n" (String.concat "\n" (List.rev !issues))
447-448-(* Test detection on concatenated texts from different languages *)
449-let test_mixed_language_text () =
450- let d = get_detector () in
451- let mixed = english_text ^ " " ^ french_text ^ " " ^ german_text in
452- try
453- let results = Langdetect.detect d mixed in
454- (* Should detect something, likely the dominant language *)
455- Alcotest.(check bool) "mixed text detects something" true (List.length results > 0)
456- with exn ->
457- Alcotest.fail (Printf.sprintf "Exception on mixed text: %s" (Printexc.to_string exn))
458-459-(* Test detection on text that gradually transitions between languages *)
460-let test_gradual_language_transition () =
461- let d = get_detector () in
462- (* Start with English, add more French *)
463- let texts = [
464- english_text;
465- english_text ^ " " ^ (String.sub french_text 0 50);
466- english_text ^ " " ^ (String.sub french_text 0 100);
467- english_text ^ " " ^ french_text;
468- french_text ^ " " ^ english_text;
469- french_text;
470- ] in
471- let exceptions = ref [] in
472- List.iteri (fun i text ->
473- try
474- let _ = Langdetect.detect d text in ()
475- with exn ->
476- exceptions := (Printf.sprintf "transition %d: %s" i (Printexc.to_string exn)) :: !exceptions
477- ) texts;
478- if !exceptions <> [] then
479- Alcotest.fail (String.concat "\n" (List.rev !exceptions))
480-481-(* Test with malformed UTF-8 *)
482-let test_malformed_utf8 () =
483- let d = get_detector () in
484- let malformed_texts = [
485- "\xFF\xFE"; (* BOM-like *)
486- "\xC0\x80"; (* Overlong encoding *)
487- "\xED\xA0\x80"; (* Surrogate half *)
488- "Hello \xFF world"; (* Valid with invalid byte *)
489- "\x80\x81\x82\x83"; (* Continuation bytes without start *)
490- ] in
491- List.iter (fun text ->
492- try
493- let _ = Langdetect.detect d text in ()
494- with exn ->
495- Alcotest.fail (Printf.sprintf "Exception on malformed UTF-8: %s" (Printexc.to_string exn))
496- ) malformed_texts
497-498-(* Test with extremely long text *)
499-let test_very_long_text () =
500- let d = get_detector () in
501- (* Create a very long English text *)
502- let long_text = String.concat " " (List.init 50000 (fun _ -> "language")) in
503- try
504- match Langdetect.detect_best d long_text with
505- | Some "en" -> ()
506- | Some lang -> Printf.printf "Long text detected as: %s\n" lang
507- | None -> Printf.printf "Long text: no detection\n"
508- with exn ->
509- Alcotest.fail (Printf.sprintf "Exception on very long text: %s" (Printexc.to_string exn))
510-511-(* Test repeated detection gives consistent results *)
512-let test_repeated_detection_consistency () =
513- let d = get_detector () in
514- Langdetect.set_random_seed d 12345;
515- let results1 = Langdetect.detect d english_text in
516- Langdetect.set_random_seed d 12345;
517- let results2 = Langdetect.detect d english_text in
518- Langdetect.set_random_seed d 12345;
519- let results3 = Langdetect.detect d english_text in
520- let get_top r = match r with h :: _ -> Some (h.Langdetect.lang, h.Langdetect.prob) | [] -> None in
521- Alcotest.(check bool) "consistent results 1-2" true (get_top results1 = get_top results2);
522- Alcotest.(check bool) "consistent results 2-3" true (get_top results2 = get_top results3)
523-524-(* Test all supported profiles can be loaded and used *)
525-let test_all_profiles_functional () =
526- let d = get_detector () in
527- let test_text = "This is a test of the language detection system with enough text to analyze." in
528- try
529- let results = Langdetect.detect d test_text in
530- (* Should have multiple language candidates *)
531- Alcotest.(check bool) "multiple candidates" true (List.length results >= 1);
532- (* All probabilities should be valid *)
533- List.iter (fun r ->
534- Alcotest.(check bool) "prob >= 0" true (r.Langdetect.prob >= 0.0);
535- Alcotest.(check bool) "prob <= 1" true (r.Langdetect.prob <= 1.0);
536- Alcotest.(check bool) "lang not empty" true (String.length r.Langdetect.lang > 0)
537- ) results
538- with exn ->
539- Alcotest.fail (Printf.sprintf "Exception testing profiles: %s" (Printexc.to_string exn))
540-541-(* Regression test: ensure detection loop completes in reasonable time.
542- This catches bugs like the iter_count variable name mismatch that caused infinite loops. *)
543-let test_detection_completes_quickly () =
544- let d = get_detector () in
545- let start_time = Unix.gettimeofday () in
546- (* Run detection on several texts to ensure it completes *)
547- List.iter (fun (_, _, text) ->
548- let _ = Langdetect.detect_best d text in ()
549- ) all_test_corpus;
550- let elapsed = Unix.gettimeofday () -. start_time in
551- (* All detections should complete within 5 seconds total *)
552- if elapsed > 5.0 then
553- Alcotest.fail (Printf.sprintf "Detection took too long: %.2f seconds (expected < 5s)" elapsed)
554- else
555- Printf.printf "Detection completed in %.2f seconds\n" elapsed
556-557-(* Regression test: verify iteration_limit is respected in detect_block *)
558-let test_iteration_limit_respected () =
559- let d = get_detector () in
560- (* Use a text that might not converge quickly *)
561- let mixed_text = String.concat " " (List.init 100 (fun i ->
562- if i mod 3 = 0 then "hello"
563- else if i mod 3 = 1 then "bonjour"
564- else "hallo"
565- )) in
566- let start_time = Unix.gettimeofday () in
567- let _ = Langdetect.detect d mixed_text in
568- let elapsed = Unix.gettimeofday () -. start_time in
569- (* Single detection should complete within 1 second *)
570- if elapsed > 1.0 then
571- Alcotest.fail (Printf.sprintf "Single detection took too long: %.2f seconds (expected < 1s)" elapsed)
572-573-(* Main test suite *)
574-let () =
575- Alcotest.run "Langdetect" [
576- ("Basic detection", [
577- Alcotest.test_case "English" `Quick test_detect_english;
578- Alcotest.test_case "Chinese" `Quick test_detect_chinese;
579- Alcotest.test_case "German" `Quick test_detect_german;
580- Alcotest.test_case "French" `Quick test_detect_french;
581- Alcotest.test_case "Japanese" `Quick test_detect_japanese;
582- Alcotest.test_case "Russian" `Quick test_detect_russian;
583- Alcotest.test_case "Spanish" `Quick test_detect_spanish;
584- Alcotest.test_case "Arabic" `Quick test_detect_arabic;
585- Alcotest.test_case "Korean" `Quick test_detect_korean;
586- Alcotest.test_case "Portuguese" `Quick test_detect_portuguese;
587- Alcotest.test_case "Italian" `Quick test_detect_italian;
588- Alcotest.test_case "Hebrew" `Quick test_detect_hebrew;
589- ]);
590- ("API tests", [
591- Alcotest.test_case "detect_with_prob" `Quick test_detect_with_probability;
592- Alcotest.test_case "detect returns list" `Quick test_detect_returns_list;
593- Alcotest.test_case "deterministic with seed" `Quick test_deterministic_with_seed;
594- ]);
595- ("Edge cases", [
596- Alcotest.test_case "short text" `Quick test_short_text;
597- Alcotest.test_case "empty text" `Quick test_empty_text;
598- Alcotest.test_case "numbers only" `Quick test_numbers_only;
599- ]);
600- ("Configuration", [
601- Alcotest.test_case "custom config" `Quick test_custom_config;
602- Alcotest.test_case "profiles count" `Quick test_profiles_count;
603- ]);
604- ("Cross-validation", [
605- Alcotest.test_case "corpus correct detection" `Quick test_corpus_correct_detection;
606- Alcotest.test_case "corpus no exceptions" `Quick test_corpus_no_exceptions;
607- Alcotest.test_case "no strong false positives" `Quick test_no_strong_false_positives;
608- ]);
609- ("Stress tests", [
610- Alcotest.test_case "edge cases no exceptions" `Quick test_edge_cases_no_exceptions;
611- Alcotest.test_case "edge cases sensible results" `Quick test_edge_cases_sensible_results;
612- Alcotest.test_case "mixed language text" `Quick test_mixed_language_text;
613- Alcotest.test_case "gradual language transition" `Quick test_gradual_language_transition;
614- Alcotest.test_case "malformed UTF-8" `Quick test_malformed_utf8;
615- Alcotest.test_case "very long text" `Quick test_very_long_text;
616- Alcotest.test_case "repeated detection consistency" `Quick test_repeated_detection_consistency;
617- Alcotest.test_case "all profiles functional" `Quick test_all_profiles_functional;
618- ]);
619- ("Regression tests", [
620- Alcotest.test_case "detection completes quickly" `Quick test_detection_completes_quickly;
621- Alcotest.test_case "iteration limit respected" `Quick test_iteration_limit_respected;
622- ]);
623- ]