Serenity Operating System
/*
 * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
6
7#include <LibTest/TestCase.h>
8
9#include <AK/StringView.h>
10#include <LibUnicode/CharacterTypes.h>
11#include <ctype.h>
12
13static void compare_to_ascii(auto& old_function, auto& new_function)
14{
15 i64 result1 = 0;
16 i64 result2 = 0;
17
18 for (u32 i = 0; i < 0x80; ++i) {
19 EXPECT_EQ(result1 = old_function(i), result2 = new_function(i));
20 if (result1 != result2)
21 dbgln("Function input value was {}.", i);
22 }
23}
24
25TEST_CASE(to_unicode_lowercase)
26{
27 compare_to_ascii(tolower, Unicode::to_unicode_lowercase);
28
29 EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω"
30 EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω"
31
32 // Code points encoded by ranges in UnicodeData.txt
33 EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u);
34 EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u);
35 EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u);
36 EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu);
37}
38
39TEST_CASE(to_unicode_uppercase)
40{
41 compare_to_ascii(toupper, Unicode::to_unicode_uppercase);
42
43 EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω"
44 EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
45
46 // Code points encoded by ranges in UnicodeData.txt
47 EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u);
48 EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u);
49 EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
50 EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
51
52 // Code points whose uppercase and titlecase mappings actually differ.
53 EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c6u), 0x01c4u); // "dž" to "DŽ"
54 EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c9u), 0x01c7u); // "lj" to "LJ"
55 EXPECT_EQ(Unicode::to_unicode_uppercase(0x01ccu), 0x01cau); // "nj" to "NJ"
56 EXPECT_EQ(Unicode::to_unicode_uppercase(0x01f3u), 0x01f1u); // "dz" to "DZ"
57}
58
59TEST_CASE(to_unicode_titlecase)
60{
61 compare_to_ascii(toupper, Unicode::to_unicode_titlecase);
62
63 EXPECT_EQ(Unicode::to_unicode_titlecase(0x03c9u), 0x03a9u); // "ω" to "Ω"
64 EXPECT_EQ(Unicode::to_unicode_titlecase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
65
66 // Code points encoded by ranges in UnicodeData.txt
67 EXPECT_EQ(Unicode::to_unicode_titlecase(0x3400u), 0x3400u);
68 EXPECT_EQ(Unicode::to_unicode_titlecase(0x3401u), 0x3401u);
69 EXPECT_EQ(Unicode::to_unicode_titlecase(0x3402u), 0x3402u);
70 EXPECT_EQ(Unicode::to_unicode_titlecase(0x4dbfu), 0x4dbfu);
71
72 // Code points whose uppercase and titlecase mappings actually differ.
73 EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c6u), 0x01c5u); // "dž" to "Dž"
74 EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj"
75 EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj"
76 EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz"
77
78 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(""sv)), ""sv);
79 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" "sv)), " "sv);
80 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" - "sv)), " - "sv);
81
82 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a"sv)), "A"sv);
83 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("A"sv)), "A"sv);
84 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" a"sv)), " A"sv);
85 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a "sv)), "A "sv);
86
87 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab"sv)), "Ab"sv);
88 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("Ab"sv)), "Ab"sv);
89 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("aB"sv)), "Ab"sv);
90 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("AB"sv)), "Ab"sv);
91 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" ab"sv)), " Ab"sv);
92 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab "sv)), "Ab "sv);
93
94 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv);
95 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv);
96 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'ar'"sv);
97 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv);
98}
99
100TEST_CASE(to_unicode_casefold)
101{
102 for (u8 code_point = 0; code_point < 0x80; ++code_point) {
103 auto ascii = tolower(code_point);
104 auto unicode = MUST(Unicode::to_unicode_casefold_full({ reinterpret_cast<char const*>(&code_point), 1 }));
105
106 EXPECT_EQ(unicode.bytes_as_string_view().length(), 1u);
107 EXPECT_EQ(unicode.bytes_as_string_view()[0], ascii);
108 }
109
110 // LATIN SMALL LETTER SHARP S
111 auto result = MUST(Unicode::to_unicode_casefold_full("\u00DF"sv));
112 EXPECT_EQ(result, "\u0073\u0073"sv);
113
114 // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
115 result = MUST(Unicode::to_unicode_casefold_full("\u1FB3"sv));
116 EXPECT_EQ(result, "\u03B1\u03B9"sv);
117
118 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI
119 result = MUST(Unicode::to_unicode_casefold_full("\u1FB6"sv));
120 EXPECT_EQ(result, "\u03B1\u0342"sv);
121
122 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
123 result = MUST(Unicode::to_unicode_casefold_full("\u1FB7"sv));
124 EXPECT_EQ(result, "\u03B1\u0342\u03B9"sv);
125}
126
127TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
128{
129 // LATIN SMALL LETTER SHARP S
130 auto result = MUST(Unicode::to_unicode_lowercase_full("\u00DF"sv));
131 EXPECT_EQ(result, "\u00DF");
132
133 // LATIN CAPITAL LETTER I WITH DOT ABOVE
134 result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv));
135 EXPECT_EQ(result, "\u0069\u0307");
136
137 // LATIN SMALL LIGATURE FF
138 result = MUST(Unicode::to_unicode_lowercase_full("\uFB00"sv));
139 EXPECT_EQ(result, "\uFB00");
140
141 // LATIN SMALL LIGATURE FI
142 result = MUST(Unicode::to_unicode_lowercase_full("\uFB01"sv));
143 EXPECT_EQ(result, "\uFB01");
144
145 // LATIN SMALL LIGATURE FL
146 result = MUST(Unicode::to_unicode_lowercase_full("\uFB02"sv));
147 EXPECT_EQ(result, "\uFB02");
148
149 // LATIN SMALL LIGATURE FFI
150 result = MUST(Unicode::to_unicode_lowercase_full("\uFB03"sv));
151 EXPECT_EQ(result, "\uFB03");
152
153 // LATIN SMALL LIGATURE FFL
154 result = MUST(Unicode::to_unicode_lowercase_full("\uFB04"sv));
155 EXPECT_EQ(result, "\uFB04");
156
157 // LATIN SMALL LIGATURE LONG S T
158 result = MUST(Unicode::to_unicode_lowercase_full("\uFB05"sv));
159 EXPECT_EQ(result, "\uFB05");
160
161 // LATIN SMALL LIGATURE ST
162 result = MUST(Unicode::to_unicode_lowercase_full("\uFB06"sv));
163 EXPECT_EQ(result, "\uFB06");
164
165 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
166 result = MUST(Unicode::to_unicode_lowercase_full("\u1FB7"sv));
167 EXPECT_EQ(result, "\u1FB7");
168
169 // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
170 result = MUST(Unicode::to_unicode_lowercase_full("\u1FC7"sv));
171 EXPECT_EQ(result, "\u1FC7");
172
173 // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
174 result = MUST(Unicode::to_unicode_lowercase_full("\u1FF7"sv));
175 EXPECT_EQ(result, "\u1FF7");
176}
177
178TEST_CASE(to_unicode_lowercase_special_casing_sigma)
179{
180 auto result = MUST(Unicode::to_unicode_lowercase_full("ABCI"sv));
181 EXPECT_EQ(result, "abci");
182
183 // Sigma preceded by A
184 result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3"sv));
185 EXPECT_EQ(result, "a\u03C2");
186
187 // Sigma preceded by FEMININE ORDINAL INDICATOR
188 result = MUST(Unicode::to_unicode_lowercase_full("\u00AA\u03A3"sv));
189 EXPECT_EQ(result, "\u00AA\u03C2");
190
191 // Sigma preceded by ROMAN NUMERAL ONE
192 result = MUST(Unicode::to_unicode_lowercase_full("\u2160\u03A3"sv));
193 EXPECT_EQ(result, "\u2170\u03C2");
194
195 // Sigma preceded by COMBINING GREEK YPOGEGRAMMENI
196 result = MUST(Unicode::to_unicode_lowercase_full("\u0345\u03A3"sv));
197 EXPECT_EQ(result, "\u0345\u03C3");
198
199 // Sigma preceded by A and FULL STOP
200 result = MUST(Unicode::to_unicode_lowercase_full("A.\u03A3"sv));
201 EXPECT_EQ(result, "a.\u03C2");
202
203 // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR
204 result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3"sv));
205 EXPECT_EQ(result, "a\u180E\u03C2");
206
207 // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B
208 result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3B"sv));
209 EXPECT_EQ(result, "a\u180E\u03C3b");
210
211 // Sigma followed by A
212 result = MUST(Unicode::to_unicode_lowercase_full("\u03A3A"sv));
213 EXPECT_EQ(result, "\u03C3a");
214
215 // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR
216 result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3\u180E"sv));
217 EXPECT_EQ(result, "a\u03C2\u180E");
218
219 // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B
220 result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3\u180EB"sv));
221 EXPECT_EQ(result, "a\u03C3\u180Eb");
222
223 // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR
224 result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180E"sv));
225 EXPECT_EQ(result, "a\u180E\u03C2\u180E");
226
227 // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B
228 result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180EB"sv));
229 EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
230}
231
232TEST_CASE(to_unicode_lowercase_special_casing_i)
233{
234 // LATIN CAPITAL LETTER I
235 auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv));
236 EXPECT_EQ(result, "i"sv);
237
238 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "az"sv));
239 EXPECT_EQ(result, "\u0131"sv);
240
241 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv));
242 EXPECT_EQ(result, "\u0131"sv);
243
244 // LATIN CAPITAL LETTER I WITH DOT ABOVE
245 result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "en"sv));
246 EXPECT_EQ(result, "\u0069\u0307"sv);
247
248 result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "az"sv));
249 EXPECT_EQ(result, "i"sv);
250
251 result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "tr"sv));
252 EXPECT_EQ(result, "i"sv);
253
254 // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
255 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv));
256 EXPECT_EQ(result, "i\u0307"sv);
257
258 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv));
259 EXPECT_EQ(result, "i"sv);
260
261 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv));
262 EXPECT_EQ(result, "i"sv);
263
264 // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE
265 result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "en"sv));
266 EXPECT_EQ(result, "ia\u0307"sv);
267
268 result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "az"sv));
269 EXPECT_EQ(result, "\u0131a\u0307"sv);
270
271 result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "tr"sv));
272 EXPECT_EQ(result, "\u0131a\u0307"sv);
273}
274
275TEST_CASE(to_unicode_lowercase_special_casing_more_above)
276{
277 // LATIN CAPITAL LETTER I
278 auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv));
279 EXPECT_EQ(result, "i"sv);
280
281 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "lt"sv));
282 EXPECT_EQ(result, "i"sv);
283
284 // LATIN CAPITAL LETTER J
285 result = MUST(Unicode::to_unicode_lowercase_full("J"sv, "en"sv));
286 EXPECT_EQ(result, "j"sv);
287
288 result = MUST(Unicode::to_unicode_lowercase_full("J"sv, "lt"sv));
289 EXPECT_EQ(result, "j"sv);
290
291 // LATIN CAPITAL LETTER I WITH OGONEK
292 result = MUST(Unicode::to_unicode_lowercase_full("\u012e"sv, "en"sv));
293 EXPECT_EQ(result, "\u012f"sv);
294
295 result = MUST(Unicode::to_unicode_lowercase_full("\u012e"sv, "lt"sv));
296 EXPECT_EQ(result, "\u012f"sv);
297
298 // LATIN CAPITAL LETTER I followed by COMBINING GRAVE ACCENT
299 result = MUST(Unicode::to_unicode_lowercase_full("I\u0300"sv, "en"sv));
300 EXPECT_EQ(result, "i\u0300"sv);
301
302 result = MUST(Unicode::to_unicode_lowercase_full("I\u0300"sv, "lt"sv));
303 EXPECT_EQ(result, "i\u0307\u0300"sv);
304
305 // LATIN CAPITAL LETTER J followed by COMBINING GRAVE ACCENT
306 result = MUST(Unicode::to_unicode_lowercase_full("J\u0300"sv, "en"sv));
307 EXPECT_EQ(result, "j\u0300"sv);
308
309 result = MUST(Unicode::to_unicode_lowercase_full("J\u0300"sv, "lt"sv));
310 EXPECT_EQ(result, "j\u0307\u0300"sv);
311
312 // LATIN CAPITAL LETTER I WITH OGONEK followed by COMBINING GRAVE ACCENT
313 result = MUST(Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "en"sv));
314 EXPECT_EQ(result, "\u012f\u0300"sv);
315
316 result = MUST(Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "lt"sv));
317 EXPECT_EQ(result, "\u012f\u0307\u0300"sv);
318}
319
320TEST_CASE(to_unicode_lowercase_special_casing_not_before_dot)
321{
322 // LATIN CAPITAL LETTER I
323 auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv));
324 EXPECT_EQ(result, "i"sv);
325
326 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "az"sv));
327 EXPECT_EQ(result, "\u0131"sv);
328
329 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv));
330 EXPECT_EQ(result, "\u0131"sv);
331
332 // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
333 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv));
334 EXPECT_EQ(result, "i\u0307"sv);
335
336 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv));
337 EXPECT_EQ(result, "i"sv);
338
339 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv));
340 EXPECT_EQ(result, "i"sv);
341}
342
343TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
344{
345 // LATIN SMALL LETTER SHARP S
346 auto result = MUST(Unicode::to_unicode_uppercase_full("\u00DF"sv));
347 EXPECT_EQ(result, "\u0053\u0053");
348
349 // LATIN CAPITAL LETTER I WITH DOT ABOVE
350 result = MUST(Unicode::to_unicode_uppercase_full("\u0130"sv));
351 EXPECT_EQ(result, "\u0130");
352
353 // LATIN SMALL LIGATURE FF
354 result = MUST(Unicode::to_unicode_uppercase_full("\uFB00"sv));
355 EXPECT_EQ(result, "\u0046\u0046");
356
357 // LATIN SMALL LIGATURE FI
358 result = MUST(Unicode::to_unicode_uppercase_full("\uFB01"sv));
359 EXPECT_EQ(result, "\u0046\u0049");
360
361 // LATIN SMALL LIGATURE FL
362 result = MUST(Unicode::to_unicode_uppercase_full("\uFB02"sv));
363 EXPECT_EQ(result, "\u0046\u004C");
364
365 // LATIN SMALL LIGATURE FFI
366 result = MUST(Unicode::to_unicode_uppercase_full("\uFB03"sv));
367 EXPECT_EQ(result, "\u0046\u0046\u0049");
368
369 // LATIN SMALL LIGATURE FFL
370 result = MUST(Unicode::to_unicode_uppercase_full("\uFB04"sv));
371 EXPECT_EQ(result, "\u0046\u0046\u004C");
372
373 // LATIN SMALL LIGATURE LONG S T
374 result = MUST(Unicode::to_unicode_uppercase_full("\uFB05"sv));
375 EXPECT_EQ(result, "\u0053\u0054");
376
377 // LATIN SMALL LIGATURE ST
378 result = MUST(Unicode::to_unicode_uppercase_full("\uFB06"sv));
379 EXPECT_EQ(result, "\u0053\u0054");
380
381 // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
382 result = MUST(Unicode::to_unicode_uppercase_full("\u0390"sv));
383 EXPECT_EQ(result, "\u0399\u0308\u0301");
384
385 // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
386 result = MUST(Unicode::to_unicode_uppercase_full("\u03B0"sv));
387 EXPECT_EQ(result, "\u03A5\u0308\u0301");
388
389 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
390 result = MUST(Unicode::to_unicode_uppercase_full("\u1FB7"sv));
391 EXPECT_EQ(result, "\u0391\u0342\u0399");
392
393 // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
394 result = MUST(Unicode::to_unicode_uppercase_full("\u1FC7"sv));
395 EXPECT_EQ(result, "\u0397\u0342\u0399");
396
397 // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
398 result = MUST(Unicode::to_unicode_uppercase_full("\u1FF7"sv));
399 EXPECT_EQ(result, "\u03A9\u0342\u0399");
400}
401
402TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted)
403{
404 // LATIN SMALL LETTER I
405 auto result = MUST(Unicode::to_unicode_uppercase_full("i"sv, "en"sv));
406 EXPECT_EQ(result, "I"sv);
407
408 result = MUST(Unicode::to_unicode_uppercase_full("i"sv, "lt"sv));
409 EXPECT_EQ(result, "I"sv);
410
411 // LATIN SMALL LETTER J
412 result = MUST(Unicode::to_unicode_uppercase_full("j"sv, "en"sv));
413 EXPECT_EQ(result, "J"sv);
414
415 result = MUST(Unicode::to_unicode_uppercase_full("j"sv, "lt"sv));
416 EXPECT_EQ(result, "J"sv);
417
418 // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE
419 result = MUST(Unicode::to_unicode_uppercase_full("i\u0307"sv, "en"sv));
420 EXPECT_EQ(result, "I\u0307"sv);
421
422 result = MUST(Unicode::to_unicode_uppercase_full("i\u0307"sv, "lt"sv));
423 EXPECT_EQ(result, "I"sv);
424
425 // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE
426 result = MUST(Unicode::to_unicode_uppercase_full("j\u0307"sv, "en"sv));
427 EXPECT_EQ(result, "J\u0307"sv);
428
429 result = MUST(Unicode::to_unicode_uppercase_full("j\u0307"sv, "lt"sv));
430 EXPECT_EQ(result, "J"sv);
431}
432
433TEST_CASE(to_unicode_titlecase_unconditional_special_casing)
434{
435 // LATIN SMALL LETTER SHARP S
436 auto result = MUST(Unicode::to_unicode_titlecase_full("\u00DF"sv));
437 EXPECT_EQ(result, "\u0053\u0073"sv);
438
439 // LATIN CAPITAL LETTER I WITH DOT ABOVE
440 result = MUST(Unicode::to_unicode_titlecase_full("\u0130"sv));
441 EXPECT_EQ(result, "\u0130"sv);
442
443 // LATIN SMALL LIGATURE FF
444 result = MUST(Unicode::to_unicode_titlecase_full("\uFB00"sv));
445 EXPECT_EQ(result, "\u0046\u0066"sv);
446
447 // LATIN SMALL LIGATURE FI
448 result = MUST(Unicode::to_unicode_titlecase_full("\uFB01"sv));
449 EXPECT_EQ(result, "\u0046\u0069"sv);
450
451 // LATIN SMALL LIGATURE FL
452 result = MUST(Unicode::to_unicode_titlecase_full("\uFB02"sv));
453 EXPECT_EQ(result, "\u0046\u006C"sv);
454
455 // LATIN SMALL LIGATURE FFI
456 result = MUST(Unicode::to_unicode_titlecase_full("\uFB03"sv));
457 EXPECT_EQ(result, "\u0046\u0066\u0069"sv);
458
459 // LATIN SMALL LIGATURE FFL
460 result = MUST(Unicode::to_unicode_titlecase_full("\uFB04"sv));
461 EXPECT_EQ(result, "\u0046\u0066\u006C"sv);
462
463 // LATIN SMALL LIGATURE LONG S T
464 result = MUST(Unicode::to_unicode_titlecase_full("\uFB05"sv));
465 EXPECT_EQ(result, "\u0053\u0074"sv);
466
467 // LATIN SMALL LIGATURE ST
468 result = MUST(Unicode::to_unicode_titlecase_full("\uFB06"sv));
469 EXPECT_EQ(result, "\u0053\u0074"sv);
470
471 // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
472 result = MUST(Unicode::to_unicode_titlecase_full("\u0390"sv));
473 EXPECT_EQ(result, "\u0399\u0308\u0301"sv);
474
475 // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
476 result = MUST(Unicode::to_unicode_titlecase_full("\u03B0"sv));
477 EXPECT_EQ(result, "\u03A5\u0308\u0301"sv);
478
479 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
480 result = MUST(Unicode::to_unicode_titlecase_full("\u1FB7"sv));
481 EXPECT_EQ(result, "\u0391\u0342\u0345"sv);
482
483 // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
484 result = MUST(Unicode::to_unicode_titlecase_full("\u1FC7"sv));
485 EXPECT_EQ(result, "\u0397\u0342\u0345"sv);
486
487 // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
488 result = MUST(Unicode::to_unicode_titlecase_full("\u1FF7"sv));
489 EXPECT_EQ(result, "\u03A9\u0342\u0345"sv);
490}
491
492TEST_CASE(to_unicode_titlecase_special_casing_i)
493{
494 // LATIN SMALL LETTER I
495 auto result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "en"sv));
496 EXPECT_EQ(result, "I"sv);
497
498 result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "az"sv));
499 EXPECT_EQ(result, "\u0130"sv);
500
501 result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "tr"sv));
502 EXPECT_EQ(result, "\u0130"sv);
503}
504
505TEST_CASE(general_category)
506{
507 auto general_category = [](StringView name) {
508 auto general_category = Unicode::general_category_from_string(name);
509 VERIFY(general_category.has_value());
510 return *general_category;
511 };
512
513 auto general_category_c = general_category("C"sv);
514 auto general_category_other = general_category("Other"sv);
515 EXPECT_EQ(general_category_c, general_category_other);
516
517 auto general_category_cc = general_category("Cc"sv);
518 auto general_category_control = general_category("Control"sv);
519 EXPECT_EQ(general_category_cc, general_category_control);
520
521 auto general_category_co = general_category("Co"sv);
522 auto general_category_private_use = general_category("Private_Use"sv);
523 EXPECT_EQ(general_category_co, general_category_private_use);
524
525 auto general_category_cn = general_category("Cn"sv);
526 auto general_category_unassigned = general_category("Unassigned"sv);
527 EXPECT_EQ(general_category_cn, general_category_unassigned);
528
529 auto general_category_lc = general_category("LC"sv);
530 auto general_category_cased_letter = general_category("Cased_Letter"sv);
531 EXPECT_EQ(general_category_lc, general_category_cased_letter);
532
533 auto general_category_ll = general_category("Ll"sv);
534 auto general_category_lowercase_letter = general_category("Lowercase_Letter"sv);
535 EXPECT_EQ(general_category_ll, general_category_lowercase_letter);
536
537 auto general_category_lu = general_category("Lu"sv);
538 auto general_category_uppercase_letter = general_category("Uppercase_Letter"sv);
539 EXPECT_EQ(general_category_lu, general_category_uppercase_letter);
540
541 for (u32 code_point = 0; code_point <= 0x1f; ++code_point) {
542 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
543 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cc));
544
545 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
546 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
547 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
548 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
549 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
550 }
551
552 for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) {
553 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
554 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_co));
555
556 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
557 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
558 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
559 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
560 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
561 }
562
563 for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
564 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
565 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cn));
566
567 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
568 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
569 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
570 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
571 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
572 }
573
574 for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
575 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
576 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_ll));
577
578 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
579 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
580 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
581 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
582 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
583 }
584
585 for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
586 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
587 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lu));
588
589 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
590 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
591 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
592 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
593 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
594 }
595}
596
597TEST_CASE(property)
598{
599 auto property = [](StringView name) {
600 auto property = Unicode::property_from_string(name);
601 VERIFY(property.has_value());
602 return *property;
603 };
604
605 auto property_any = property("Any"sv);
606 auto property_assigned = property("Assigned"sv);
607 auto property_ascii = property("ASCII"sv);
608
609 auto property_white_space = property("White_Space"sv);
610 auto property_wspace = property("WSpace"sv);
611 auto property_space = property("space"sv);
612 EXPECT_EQ(property_white_space, property_wspace);
613 EXPECT_EQ(property_white_space, property_space);
614
615 auto property_emoji_presentation = property("Emoji_Presentation"sv);
616 auto property_epres = property("EPres"sv);
617 EXPECT_EQ(property_emoji_presentation, property_epres);
618
619 for (u32 code_point = 0; code_point <= 0x10ffff; code_point += 1000)
620 EXPECT(Unicode::code_point_has_property(code_point, property_any));
621
622 for (u32 code_point = 0x101d0; code_point <= 0x101fd; ++code_point) {
623 EXPECT(Unicode::code_point_has_property(code_point, property_any));
624 EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
625
626 EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
627 EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
628 EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
629 }
630
631 for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
632 EXPECT(Unicode::code_point_has_property(code_point, property_any));
633
634 EXPECT(!Unicode::code_point_has_property(code_point, property_assigned));
635 EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
636 EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
637 EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
638 }
639
640 for (u32 code_point = 0; code_point <= 0x7f; ++code_point) {
641 EXPECT(Unicode::code_point_has_property(code_point, property_any));
642 EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
643 EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
644
645 EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
646 }
647
648 for (u32 code_point = 0x9; code_point <= 0xd; ++code_point) {
649 EXPECT(Unicode::code_point_has_property(code_point, property_any));
650 EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
651 EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
652 EXPECT(Unicode::code_point_has_property(code_point, property_white_space));
653
654 EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
655 }
656
657 for (u32 code_point = 0x1f3e5; code_point <= 0x1f3f0; ++code_point) {
658 EXPECT(Unicode::code_point_has_property(code_point, property_any));
659 EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
660 EXPECT(Unicode::code_point_has_property(code_point, property_emoji_presentation));
661
662 EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
663 EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
664 }
665}
666
667TEST_CASE(script)
668{
669 auto script = [](StringView name) {
670 auto script = Unicode::script_from_string(name);
671 VERIFY(script.has_value());
672 return *script;
673 };
674
675 auto script_latin = script("Latin"sv);
676 auto script_latn = script("Latn"sv);
677 EXPECT_EQ(script_latin, script_latn);
678
679 auto script_cyrillic = script("Cyrillic"sv);
680 auto script_cyrl = script("Cyrl"sv);
681 EXPECT_EQ(script_cyrillic, script_cyrl);
682
683 auto script_greek = script("Greek"sv);
684 auto script_grek = script("Grek"sv);
685 EXPECT_EQ(script_greek, script_grek);
686
687 for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
688 EXPECT(Unicode::code_point_has_script(code_point, script_latin));
689 EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
690
691 EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
692 EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
693 }
694
695 for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
696 EXPECT(Unicode::code_point_has_script(code_point, script_latin));
697 EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
698
699 EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
700 EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
701 }
702
703 for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
704 EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
705 EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
706
707 EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
708 EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
709 }
710
711 for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
712 EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
713 EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
714
715 EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
716 EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
717 }
718
719 for (u32 code_point = 0x1f80; code_point <= 0x1fb4; ++code_point) {
720 EXPECT(Unicode::code_point_has_script(code_point, script_greek));
721 EXPECT(Unicode::code_point_has_script_extension(code_point, script_greek));
722
723 EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
724 EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
725 }
726}
727
728TEST_CASE(block)
729{
730 auto block = [](StringView name) {
731 auto block = Unicode::block_from_string(name);
732 VERIFY(block.has_value());
733 return *block;
734 };
735
736 auto no_block = block("No_Block"sv);
737 auto block_nb = block("NB"sv);
738 EXPECT_EQ(no_block, block_nb);
739
740 auto block_basic_latin = block("Basic_Latin"sv);
741 auto block_ascii = block("ASCII"sv);
742 EXPECT_EQ(block_basic_latin, block_ascii);
743
744 auto block_greek_coptic = block("Greek_And_Coptic"sv);
745 auto block_greek = block("Greek"sv);
746 EXPECT_EQ(block_greek_coptic, block_greek);
747
748 auto block_variation = block("Variation_Selectors_Supplement"sv);
749 auto block_vs_sup = block("VS_Sup"sv);
750 EXPECT_EQ(block_variation, block_vs_sup);
751
752 for (u32 code_point = 0x0000; code_point <= 0x007F; ++code_point)
753 EXPECT(Unicode::code_point_has_block(code_point, block_basic_latin));
754
755 for (u32 code_point = 0xE0100; code_point <= 0xE01EF; ++code_point)
756 EXPECT(Unicode::code_point_has_block(code_point, block_variation));
757
758 for (u32 code_point = 0x0000; code_point <= 0x007F; ++code_point)
759 EXPECT_EQ("Basic Latin"sv, Unicode::code_point_block_display_name(code_point).value());
760
761 for (u32 code_point = 0x0370; code_point <= 0x03FF; ++code_point)
762 EXPECT_EQ("Greek and Coptic"sv, Unicode::code_point_block_display_name(code_point).value());
763}
764
765TEST_CASE(script_extension)
766{
767 auto script = [](StringView name) {
768 auto script = Unicode::script_from_string(name);
769 VERIFY(script.has_value());
770 return *script;
771 };
772
773 auto script_latin = script("Latin"sv);
774 auto script_greek = script("Greek"sv);
775
776 for (u32 code_point = 0x363; code_point <= 0x36f; ++code_point) {
777 EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
778 EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
779 }
780
781 EXPECT(!Unicode::code_point_has_script(0x342, script_greek));
782 EXPECT(Unicode::code_point_has_script_extension(0x342, script_greek));
783
784 EXPECT(!Unicode::code_point_has_script(0x345, script_greek));
785 EXPECT(Unicode::code_point_has_script_extension(0x345, script_greek));
786
787 EXPECT(!Unicode::code_point_has_script(0x1dc0, script_greek));
788 EXPECT(Unicode::code_point_has_script_extension(0x1dc0, script_greek));
789
790 EXPECT(!Unicode::code_point_has_script(0x1dc1, script_greek));
791 EXPECT(Unicode::code_point_has_script_extension(0x1dc1, script_greek));
792
793 auto script_common = script("Common"sv);
794 auto script_zyyy = script("Zyyy"sv);
795 EXPECT_EQ(script_common, script_zyyy);
796
797 EXPECT(Unicode::code_point_has_script(0x202f, script_common));
798 EXPECT(!Unicode::code_point_has_script_extension(0x202f, script_common));
799
800 EXPECT(Unicode::code_point_has_script(0x3000, script_common));
801 EXPECT(Unicode::code_point_has_script_extension(0x3000, script_common));
802
803 auto script_inherited = script("Inherited"sv);
804 auto script_qaai = script("Qaai"sv);
805 auto script_zinh = script("Zinh"sv);
806 EXPECT_EQ(script_inherited, script_qaai);
807 EXPECT_EQ(script_inherited, script_zinh);
808
809 EXPECT(Unicode::code_point_has_script(0x1ced, script_inherited));
810 EXPECT(!Unicode::code_point_has_script_extension(0x1ced, script_inherited));
811
812 EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited));
813 EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
814}
815
816TEST_CASE(code_point_display_name)
817{
818 auto code_point_display_name = [](u32 code_point) {
819 auto name = Unicode::code_point_display_name(code_point);
820 VERIFY(name.has_value());
821 return name.release_value();
822 };
823
824 // Control code points.
825 EXPECT_EQ(code_point_display_name(0), "NULL"sv);
826 EXPECT_EQ(code_point_display_name(1), "START OF HEADING"sv);
827 EXPECT_EQ(code_point_display_name(0xa), "LINE FEED"sv);
828
829 // Ideographic code points (which already appeared in a range in UnicodeData.txt).
830 EXPECT_EQ(code_point_display_name(0x3400), "CJK UNIFIED IDEOGRAPH-3400"sv);
831 EXPECT_EQ(code_point_display_name(0x3401), "CJK UNIFIED IDEOGRAPH-3401"sv);
832 EXPECT_EQ(code_point_display_name(0x3402), "CJK UNIFIED IDEOGRAPH-3402"sv);
833 EXPECT_EQ(code_point_display_name(0x4dbf), "CJK UNIFIED IDEOGRAPH-4DBF"sv);
834
835 EXPECT_EQ(code_point_display_name(0x20000), "CJK UNIFIED IDEOGRAPH-20000"sv);
836 EXPECT_EQ(code_point_display_name(0x20001), "CJK UNIFIED IDEOGRAPH-20001"sv);
837 EXPECT_EQ(code_point_display_name(0x20002), "CJK UNIFIED IDEOGRAPH-20002"sv);
838 EXPECT_EQ(code_point_display_name(0x2a6df), "CJK UNIFIED IDEOGRAPH-2A6DF"sv);
839 EXPECT(!Unicode::code_point_display_name(0x2a6e0).has_value());
840
841 // Ideographic code points (which appeared individually in UnicodeData.txt and were coalesced into a range).
842 EXPECT_EQ(code_point_display_name(0x2f800), "CJK COMPATIBILITY IDEOGRAPH-2F800"sv);
843 EXPECT_EQ(code_point_display_name(0x2f801), "CJK COMPATIBILITY IDEOGRAPH-2F801"sv);
844 EXPECT_EQ(code_point_display_name(0x2f802), "CJK COMPATIBILITY IDEOGRAPH-2F802"sv);
845 EXPECT_EQ(code_point_display_name(0x2fa1d), "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv);
846}