Serenity Operating System
at master 846 lines 36 kB view raw
1/* 2 * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <LibTest/TestCase.h> 8 9#include <AK/StringView.h> 10#include <LibUnicode/CharacterTypes.h> 11#include <ctype.h> 12 13static void compare_to_ascii(auto& old_function, auto& new_function) 14{ 15 i64 result1 = 0; 16 i64 result2 = 0; 17 18 for (u32 i = 0; i < 0x80; ++i) { 19 EXPECT_EQ(result1 = old_function(i), result2 = new_function(i)); 20 if (result1 != result2) 21 dbgln("Function input value was {}.", i); 22 } 23} 24 25TEST_CASE(to_unicode_lowercase) 26{ 27 compare_to_ascii(tolower, Unicode::to_unicode_lowercase); 28 29 EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω" 30 EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω" 31 32 // Code points encoded by ranges in UnicodeData.txt 33 EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u); 34 EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u); 35 EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u); 36 EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu); 37} 38 39TEST_CASE(to_unicode_uppercase) 40{ 41 compare_to_ascii(toupper, Unicode::to_unicode_uppercase); 42 43 EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω" 44 EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω" 45 46 // Code points encoded by ranges in UnicodeData.txt 47 EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u); 48 EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u); 49 EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u); 50 EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu); 51 52 // Code points whose uppercase and titlecase mappings actually differ. 53 EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c6u), 0x01c4u); // "dž" to "DŽ" 54 EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c9u), 0x01c7u); // "lj" to "LJ" 55 EXPECT_EQ(Unicode::to_unicode_uppercase(0x01ccu), 0x01cau); // "nj" to "NJ" 56 EXPECT_EQ(Unicode::to_unicode_uppercase(0x01f3u), 0x01f1u); // "dz" to "DZ" 57} 58 59TEST_CASE(to_unicode_titlecase) 60{ 61 compare_to_ascii(toupper, Unicode::to_unicode_titlecase); 62 63 EXPECT_EQ(Unicode::to_unicode_titlecase(0x03c9u), 0x03a9u); // "ω" to "Ω" 64 EXPECT_EQ(Unicode::to_unicode_titlecase(0x03a9u), 0x03a9u); // "Ω" to "Ω" 65 66 // Code points encoded by ranges in UnicodeData.txt 67 EXPECT_EQ(Unicode::to_unicode_titlecase(0x3400u), 0x3400u); 68 EXPECT_EQ(Unicode::to_unicode_titlecase(0x3401u), 0x3401u); 69 EXPECT_EQ(Unicode::to_unicode_titlecase(0x3402u), 0x3402u); 70 EXPECT_EQ(Unicode::to_unicode_titlecase(0x4dbfu), 0x4dbfu); 71 72 // Code points whose uppercase and titlecase mappings actually differ. 73 EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c6u), 0x01c5u); // "dž" to "Dž" 74 EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj" 75 EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj" 76 EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz" 77 78 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(""sv)), ""sv); 79 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" "sv)), " "sv); 80 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" - "sv)), " - "sv); 81 82 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a"sv)), "A"sv); 83 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("A"sv)), "A"sv); 84 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" a"sv)), " A"sv); 85 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a "sv)), "A "sv); 86 87 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab"sv)), "Ab"sv); 88 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("Ab"sv)), "Ab"sv); 89 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("aB"sv)), "Ab"sv); 90 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("AB"sv)), "Ab"sv); 91 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" ab"sv)), " Ab"sv); 92 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab "sv)), "Ab "sv); 93 94 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv); 95 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv); 96 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'ar'"sv); 97 EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv); 98} 99 100TEST_CASE(to_unicode_casefold) 101{ 102 for (u8 code_point = 0; code_point < 0x80; ++code_point) { 103 auto ascii = tolower(code_point); 104 auto unicode = MUST(Unicode::to_unicode_casefold_full({ reinterpret_cast<char const*>(&code_point), 1 })); 105 106 EXPECT_EQ(unicode.bytes_as_string_view().length(), 1u); 107 EXPECT_EQ(unicode.bytes_as_string_view()[0], ascii); 108 } 109 110 // LATIN SMALL LETTER SHARP S 111 auto result = MUST(Unicode::to_unicode_casefold_full("\u00DF"sv)); 112 EXPECT_EQ(result, "\u0073\u0073"sv); 113 114 // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI 115 result = MUST(Unicode::to_unicode_casefold_full("\u1FB3"sv)); 116 EXPECT_EQ(result, "\u03B1\u03B9"sv); 117 118 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI 119 result = MUST(Unicode::to_unicode_casefold_full("\u1FB6"sv)); 120 EXPECT_EQ(result, "\u03B1\u0342"sv); 121 122 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 123 result = MUST(Unicode::to_unicode_casefold_full("\u1FB7"sv)); 124 EXPECT_EQ(result, "\u03B1\u0342\u03B9"sv); 125} 126 127TEST_CASE(to_unicode_lowercase_unconditional_special_casing) 128{ 129 // LATIN SMALL LETTER SHARP S 130 auto result = MUST(Unicode::to_unicode_lowercase_full("\u00DF"sv)); 131 EXPECT_EQ(result, "\u00DF"); 132 133 // LATIN CAPITAL LETTER I WITH DOT ABOVE 134 result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv)); 135 EXPECT_EQ(result, "\u0069\u0307"); 136 137 // LATIN SMALL LIGATURE FF 138 result = MUST(Unicode::to_unicode_lowercase_full("\uFB00"sv)); 139 EXPECT_EQ(result, "\uFB00"); 140 141 // LATIN SMALL LIGATURE FI 142 result = MUST(Unicode::to_unicode_lowercase_full("\uFB01"sv)); 143 EXPECT_EQ(result, "\uFB01"); 144 145 // LATIN SMALL LIGATURE FL 146 result = MUST(Unicode::to_unicode_lowercase_full("\uFB02"sv)); 147 EXPECT_EQ(result, "\uFB02"); 148 149 // LATIN SMALL LIGATURE FFI 150 result = MUST(Unicode::to_unicode_lowercase_full("\uFB03"sv)); 151 EXPECT_EQ(result, "\uFB03"); 152 153 // LATIN SMALL LIGATURE FFL 154 result = MUST(Unicode::to_unicode_lowercase_full("\uFB04"sv)); 155 EXPECT_EQ(result, "\uFB04"); 156 157 // LATIN SMALL LIGATURE LONG S T 158 result = MUST(Unicode::to_unicode_lowercase_full("\uFB05"sv)); 159 EXPECT_EQ(result, "\uFB05"); 160 161 // LATIN SMALL LIGATURE ST 162 result = MUST(Unicode::to_unicode_lowercase_full("\uFB06"sv)); 163 EXPECT_EQ(result, "\uFB06"); 164 165 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 166 result = MUST(Unicode::to_unicode_lowercase_full("\u1FB7"sv)); 167 EXPECT_EQ(result, "\u1FB7"); 168 169 // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 170 result = MUST(Unicode::to_unicode_lowercase_full("\u1FC7"sv)); 171 EXPECT_EQ(result, "\u1FC7"); 172 173 // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 174 result = MUST(Unicode::to_unicode_lowercase_full("\u1FF7"sv)); 175 EXPECT_EQ(result, "\u1FF7"); 176} 177 178TEST_CASE(to_unicode_lowercase_special_casing_sigma) 179{ 180 auto result = MUST(Unicode::to_unicode_lowercase_full("ABCI"sv)); 181 EXPECT_EQ(result, "abci"); 182 183 // Sigma preceded by A 184 result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3"sv)); 185 EXPECT_EQ(result, "a\u03C2"); 186 187 // Sigma preceded by FEMININE ORDINAL INDICATOR 188 result = MUST(Unicode::to_unicode_lowercase_full("\u00AA\u03A3"sv)); 189 EXPECT_EQ(result, "\u00AA\u03C2"); 190 191 // Sigma preceded by ROMAN NUMERAL ONE 192 result = MUST(Unicode::to_unicode_lowercase_full("\u2160\u03A3"sv)); 193 EXPECT_EQ(result, "\u2170\u03C2"); 194 195 // Sigma preceded by COMBINING GREEK YPOGEGRAMMENI 196 result = MUST(Unicode::to_unicode_lowercase_full("\u0345\u03A3"sv)); 197 EXPECT_EQ(result, "\u0345\u03C3"); 198 199 // Sigma preceded by A and FULL STOP 200 result = MUST(Unicode::to_unicode_lowercase_full("A.\u03A3"sv)); 201 EXPECT_EQ(result, "a.\u03C2"); 202 203 // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR 204 result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3"sv)); 205 EXPECT_EQ(result, "a\u180E\u03C2"); 206 207 // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B 208 result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3B"sv)); 209 EXPECT_EQ(result, "a\u180E\u03C3b"); 210 211 // Sigma followed by A 212 result = MUST(Unicode::to_unicode_lowercase_full("\u03A3A"sv)); 213 EXPECT_EQ(result, "\u03C3a"); 214 215 // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR 216 result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3\u180E"sv)); 217 EXPECT_EQ(result, "a\u03C2\u180E"); 218 219 // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B 220 result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3\u180EB"sv)); 221 EXPECT_EQ(result, "a\u03C3\u180Eb"); 222 223 // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR 224 result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180E"sv)); 225 EXPECT_EQ(result, "a\u180E\u03C2\u180E"); 226 227 // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B 228 result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180EB"sv)); 229 EXPECT_EQ(result, "a\u180E\u03C3\u180Eb"); 230} 231 232TEST_CASE(to_unicode_lowercase_special_casing_i) 233{ 234 // LATIN CAPITAL LETTER I 235 auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv)); 236 EXPECT_EQ(result, "i"sv); 237 238 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "az"sv)); 239 EXPECT_EQ(result, "\u0131"sv); 240 241 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv)); 242 EXPECT_EQ(result, "\u0131"sv); 243 244 // LATIN CAPITAL LETTER I WITH DOT ABOVE 245 result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "en"sv)); 246 EXPECT_EQ(result, "\u0069\u0307"sv); 247 248 result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "az"sv)); 249 EXPECT_EQ(result, "i"sv); 250 251 result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "tr"sv)); 252 EXPECT_EQ(result, "i"sv); 253 254 // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE 255 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv)); 256 EXPECT_EQ(result, "i\u0307"sv); 257 258 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv)); 259 EXPECT_EQ(result, "i"sv); 260 261 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv)); 262 EXPECT_EQ(result, "i"sv); 263 264 // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE 265 result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "en"sv)); 266 EXPECT_EQ(result, "ia\u0307"sv); 267 268 result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "az"sv)); 269 EXPECT_EQ(result, "\u0131a\u0307"sv); 270 271 result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "tr"sv)); 272 EXPECT_EQ(result, "\u0131a\u0307"sv); 273} 274 275TEST_CASE(to_unicode_lowercase_special_casing_more_above) 276{ 277 // LATIN CAPITAL LETTER I 278 auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv)); 279 EXPECT_EQ(result, "i"sv); 280 281 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "lt"sv)); 282 EXPECT_EQ(result, "i"sv); 283 284 // LATIN CAPITAL LETTER J 285 result = MUST(Unicode::to_unicode_lowercase_full("J"sv, "en"sv)); 286 EXPECT_EQ(result, "j"sv); 287 288 result = MUST(Unicode::to_unicode_lowercase_full("J"sv, "lt"sv)); 289 EXPECT_EQ(result, "j"sv); 290 291 // LATIN CAPITAL LETTER I WITH OGONEK 292 result = MUST(Unicode::to_unicode_lowercase_full("\u012e"sv, "en"sv)); 293 EXPECT_EQ(result, "\u012f"sv); 294 295 result = MUST(Unicode::to_unicode_lowercase_full("\u012e"sv, "lt"sv)); 296 EXPECT_EQ(result, "\u012f"sv); 297 298 // LATIN CAPITAL LETTER I followed by COMBINING GRAVE ACCENT 299 result = MUST(Unicode::to_unicode_lowercase_full("I\u0300"sv, "en"sv)); 300 EXPECT_EQ(result, "i\u0300"sv); 301 302 result = MUST(Unicode::to_unicode_lowercase_full("I\u0300"sv, "lt"sv)); 303 EXPECT_EQ(result, "i\u0307\u0300"sv); 304 305 // LATIN CAPITAL LETTER J followed by COMBINING GRAVE ACCENT 306 result = MUST(Unicode::to_unicode_lowercase_full("J\u0300"sv, "en"sv)); 307 EXPECT_EQ(result, "j\u0300"sv); 308 309 result = MUST(Unicode::to_unicode_lowercase_full("J\u0300"sv, "lt"sv)); 310 EXPECT_EQ(result, "j\u0307\u0300"sv); 311 312 // LATIN CAPITAL LETTER I WITH OGONEK followed by COMBINING GRAVE ACCENT 313 result = MUST(Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "en"sv)); 314 EXPECT_EQ(result, "\u012f\u0300"sv); 315 316 result = MUST(Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "lt"sv)); 317 EXPECT_EQ(result, "\u012f\u0307\u0300"sv); 318} 319 320TEST_CASE(to_unicode_lowercase_special_casing_not_before_dot) 321{ 322 // LATIN CAPITAL LETTER I 323 auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv)); 324 EXPECT_EQ(result, "i"sv); 325 326 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "az"sv)); 327 EXPECT_EQ(result, "\u0131"sv); 328 329 result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv)); 330 EXPECT_EQ(result, "\u0131"sv); 331 332 // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE 333 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv)); 334 EXPECT_EQ(result, "i\u0307"sv); 335 336 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv)); 337 EXPECT_EQ(result, "i"sv); 338 339 result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv)); 340 EXPECT_EQ(result, "i"sv); 341} 342 343TEST_CASE(to_unicode_uppercase_unconditional_special_casing) 344{ 345 // LATIN SMALL LETTER SHARP S 346 auto result = MUST(Unicode::to_unicode_uppercase_full("\u00DF"sv)); 347 EXPECT_EQ(result, "\u0053\u0053"); 348 349 // LATIN CAPITAL LETTER I WITH DOT ABOVE 350 result = MUST(Unicode::to_unicode_uppercase_full("\u0130"sv)); 351 EXPECT_EQ(result, "\u0130"); 352 353 // LATIN SMALL LIGATURE FF 354 result = MUST(Unicode::to_unicode_uppercase_full("\uFB00"sv)); 355 EXPECT_EQ(result, "\u0046\u0046"); 356 357 // LATIN SMALL LIGATURE FI 358 result = MUST(Unicode::to_unicode_uppercase_full("\uFB01"sv)); 359 EXPECT_EQ(result, "\u0046\u0049"); 360 361 // LATIN SMALL LIGATURE FL 362 result = MUST(Unicode::to_unicode_uppercase_full("\uFB02"sv)); 363 EXPECT_EQ(result, "\u0046\u004C"); 364 365 // LATIN SMALL LIGATURE FFI 366 result = MUST(Unicode::to_unicode_uppercase_full("\uFB03"sv)); 367 EXPECT_EQ(result, "\u0046\u0046\u0049"); 368 369 // LATIN SMALL LIGATURE FFL 370 result = MUST(Unicode::to_unicode_uppercase_full("\uFB04"sv)); 371 EXPECT_EQ(result, "\u0046\u0046\u004C"); 372 373 // LATIN SMALL LIGATURE LONG S T 374 result = MUST(Unicode::to_unicode_uppercase_full("\uFB05"sv)); 375 EXPECT_EQ(result, "\u0053\u0054"); 376 377 // LATIN SMALL LIGATURE ST 378 result = MUST(Unicode::to_unicode_uppercase_full("\uFB06"sv)); 379 EXPECT_EQ(result, "\u0053\u0054"); 380 381 // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 382 result = MUST(Unicode::to_unicode_uppercase_full("\u0390"sv)); 383 EXPECT_EQ(result, "\u0399\u0308\u0301"); 384 385 // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 386 result = MUST(Unicode::to_unicode_uppercase_full("\u03B0"sv)); 387 EXPECT_EQ(result, "\u03A5\u0308\u0301"); 388 389 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 390 result = MUST(Unicode::to_unicode_uppercase_full("\u1FB7"sv)); 391 EXPECT_EQ(result, "\u0391\u0342\u0399"); 392 393 // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 394 result = MUST(Unicode::to_unicode_uppercase_full("\u1FC7"sv)); 395 EXPECT_EQ(result, "\u0397\u0342\u0399"); 396 397 // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 398 result = MUST(Unicode::to_unicode_uppercase_full("\u1FF7"sv)); 399 EXPECT_EQ(result, "\u03A9\u0342\u0399"); 400} 401 402TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted) 403{ 404 // LATIN SMALL LETTER I 405 auto result = MUST(Unicode::to_unicode_uppercase_full("i"sv, "en"sv)); 406 EXPECT_EQ(result, "I"sv); 407 408 result = MUST(Unicode::to_unicode_uppercase_full("i"sv, "lt"sv)); 409 EXPECT_EQ(result, "I"sv); 410 411 // LATIN SMALL LETTER J 412 result = MUST(Unicode::to_unicode_uppercase_full("j"sv, "en"sv)); 413 EXPECT_EQ(result, "J"sv); 414 415 result = MUST(Unicode::to_unicode_uppercase_full("j"sv, "lt"sv)); 416 EXPECT_EQ(result, "J"sv); 417 418 // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE 419 result = MUST(Unicode::to_unicode_uppercase_full("i\u0307"sv, "en"sv)); 420 EXPECT_EQ(result, "I\u0307"sv); 421 422 result = MUST(Unicode::to_unicode_uppercase_full("i\u0307"sv, "lt"sv)); 423 EXPECT_EQ(result, "I"sv); 424 425 // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE 426 result = MUST(Unicode::to_unicode_uppercase_full("j\u0307"sv, "en"sv)); 427 EXPECT_EQ(result, "J\u0307"sv); 428 429 result = MUST(Unicode::to_unicode_uppercase_full("j\u0307"sv, "lt"sv)); 430 EXPECT_EQ(result, "J"sv); 431} 432 433TEST_CASE(to_unicode_titlecase_unconditional_special_casing) 434{ 435 // LATIN SMALL LETTER SHARP S 436 auto result = MUST(Unicode::to_unicode_titlecase_full("\u00DF"sv)); 437 EXPECT_EQ(result, "\u0053\u0073"sv); 438 439 // LATIN CAPITAL LETTER I WITH DOT ABOVE 440 result = MUST(Unicode::to_unicode_titlecase_full("\u0130"sv)); 441 EXPECT_EQ(result, "\u0130"sv); 442 443 // LATIN SMALL LIGATURE FF 444 result = MUST(Unicode::to_unicode_titlecase_full("\uFB00"sv)); 445 EXPECT_EQ(result, "\u0046\u0066"sv); 446 447 // LATIN SMALL LIGATURE FI 448 result = MUST(Unicode::to_unicode_titlecase_full("\uFB01"sv)); 449 EXPECT_EQ(result, "\u0046\u0069"sv); 450 451 // LATIN SMALL LIGATURE FL 452 result = MUST(Unicode::to_unicode_titlecase_full("\uFB02"sv)); 453 EXPECT_EQ(result, "\u0046\u006C"sv); 454 455 // LATIN SMALL LIGATURE FFI 456 result = MUST(Unicode::to_unicode_titlecase_full("\uFB03"sv)); 457 EXPECT_EQ(result, "\u0046\u0066\u0069"sv); 458 459 // LATIN SMALL LIGATURE FFL 460 result = MUST(Unicode::to_unicode_titlecase_full("\uFB04"sv)); 461 EXPECT_EQ(result, "\u0046\u0066\u006C"sv); 462 463 // LATIN SMALL LIGATURE LONG S T 464 result = MUST(Unicode::to_unicode_titlecase_full("\uFB05"sv)); 465 EXPECT_EQ(result, "\u0053\u0074"sv); 466 467 // LATIN SMALL LIGATURE ST 468 result = MUST(Unicode::to_unicode_titlecase_full("\uFB06"sv)); 469 EXPECT_EQ(result, "\u0053\u0074"sv); 470 471 // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS 472 result = MUST(Unicode::to_unicode_titlecase_full("\u0390"sv)); 473 EXPECT_EQ(result, "\u0399\u0308\u0301"sv); 474 475 // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS 476 result = MUST(Unicode::to_unicode_titlecase_full("\u03B0"sv)); 477 EXPECT_EQ(result, "\u03A5\u0308\u0301"sv); 478 479 // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI 480 result = MUST(Unicode::to_unicode_titlecase_full("\u1FB7"sv)); 481 EXPECT_EQ(result, "\u0391\u0342\u0345"sv); 482 483 // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI 484 result = MUST(Unicode::to_unicode_titlecase_full("\u1FC7"sv)); 485 EXPECT_EQ(result, "\u0397\u0342\u0345"sv); 486 487 // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI 488 result = MUST(Unicode::to_unicode_titlecase_full("\u1FF7"sv)); 489 EXPECT_EQ(result, "\u03A9\u0342\u0345"sv); 490} 491 492TEST_CASE(to_unicode_titlecase_special_casing_i) 493{ 494 // LATIN SMALL LETTER I 495 auto result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "en"sv)); 496 EXPECT_EQ(result, "I"sv); 497 498 result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "az"sv)); 499 EXPECT_EQ(result, "\u0130"sv); 500 501 result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "tr"sv)); 502 EXPECT_EQ(result, "\u0130"sv); 503} 504 505TEST_CASE(general_category) 506{ 507 auto general_category = [](StringView name) { 508 auto general_category = Unicode::general_category_from_string(name); 509 VERIFY(general_category.has_value()); 510 return *general_category; 511 }; 512 513 auto general_category_c = general_category("C"sv); 514 auto general_category_other = general_category("Other"sv); 515 EXPECT_EQ(general_category_c, general_category_other); 516 517 auto general_category_cc = general_category("Cc"sv); 518 auto general_category_control = general_category("Control"sv); 519 EXPECT_EQ(general_category_cc, general_category_control); 520 521 auto general_category_co = general_category("Co"sv); 522 auto general_category_private_use = general_category("Private_Use"sv); 523 EXPECT_EQ(general_category_co, general_category_private_use); 524 525 auto general_category_cn = general_category("Cn"sv); 526 auto general_category_unassigned = general_category("Unassigned"sv); 527 EXPECT_EQ(general_category_cn, general_category_unassigned); 528 529 auto general_category_lc = general_category("LC"sv); 530 auto general_category_cased_letter = general_category("Cased_Letter"sv); 531 EXPECT_EQ(general_category_lc, general_category_cased_letter); 532 533 auto general_category_ll = general_category("Ll"sv); 534 auto general_category_lowercase_letter = general_category("Lowercase_Letter"sv); 535 EXPECT_EQ(general_category_ll, general_category_lowercase_letter); 536 537 auto general_category_lu = general_category("Lu"sv); 538 auto general_category_uppercase_letter = general_category("Uppercase_Letter"sv); 539 EXPECT_EQ(general_category_lu, general_category_uppercase_letter); 540 541 for (u32 code_point = 0; code_point <= 0x1f; ++code_point) { 542 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c)); 543 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cc)); 544 545 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); 546 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn)); 547 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc)); 548 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); 549 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); 550 } 551 552 for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) { 553 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c)); 554 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_co)); 555 556 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); 557 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn)); 558 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc)); 559 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); 560 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); 561 } 562 563 for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) { 564 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c)); 565 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cn)); 566 567 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); 568 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); 569 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc)); 570 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); 571 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); 572 } 573 574 for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) { 575 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc)); 576 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_ll)); 577 578 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c)); 579 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); 580 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); 581 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn)); 582 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu)); 583 } 584 585 for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) { 586 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc)); 587 EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lu)); 588 589 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c)); 590 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc)); 591 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co)); 592 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn)); 593 EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll)); 594 } 595} 596 597TEST_CASE(property) 598{ 599 auto property = [](StringView name) { 600 auto property = Unicode::property_from_string(name); 601 VERIFY(property.has_value()); 602 return *property; 603 }; 604 605 auto property_any = property("Any"sv); 606 auto property_assigned = property("Assigned"sv); 607 auto property_ascii = property("ASCII"sv); 608 609 auto property_white_space = property("White_Space"sv); 610 auto property_wspace = property("WSpace"sv); 611 auto property_space = property("space"sv); 612 EXPECT_EQ(property_white_space, property_wspace); 613 EXPECT_EQ(property_white_space, property_space); 614 615 auto property_emoji_presentation = property("Emoji_Presentation"sv); 616 auto property_epres = property("EPres"sv); 617 EXPECT_EQ(property_emoji_presentation, property_epres); 618 619 for (u32 code_point = 0; code_point <= 0x10ffff; code_point += 1000) 620 EXPECT(Unicode::code_point_has_property(code_point, property_any)); 621 622 for (u32 code_point = 0x101d0; code_point <= 0x101fd; ++code_point) { 623 EXPECT(Unicode::code_point_has_property(code_point, property_any)); 624 EXPECT(Unicode::code_point_has_property(code_point, property_assigned)); 625 626 EXPECT(!Unicode::code_point_has_property(code_point, property_ascii)); 627 EXPECT(!Unicode::code_point_has_property(code_point, property_white_space)); 628 EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation)); 629 } 630 631 for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) { 632 EXPECT(Unicode::code_point_has_property(code_point, property_any)); 633 634 EXPECT(!Unicode::code_point_has_property(code_point, property_assigned)); 635 EXPECT(!Unicode::code_point_has_property(code_point, property_ascii)); 636 EXPECT(!Unicode::code_point_has_property(code_point, property_white_space)); 637 EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation)); 638 } 639 640 for (u32 code_point = 0; code_point <= 0x7f; ++code_point) { 641 EXPECT(Unicode::code_point_has_property(code_point, property_any)); 642 EXPECT(Unicode::code_point_has_property(code_point, property_assigned)); 643 EXPECT(Unicode::code_point_has_property(code_point, property_ascii)); 644 645 EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation)); 646 } 647 648 for (u32 code_point = 0x9; code_point <= 0xd; ++code_point) { 649 EXPECT(Unicode::code_point_has_property(code_point, property_any)); 650 EXPECT(Unicode::code_point_has_property(code_point, property_assigned)); 651 EXPECT(Unicode::code_point_has_property(code_point, property_ascii)); 652 EXPECT(Unicode::code_point_has_property(code_point, property_white_space)); 653 654 EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation)); 655 } 656 657 for (u32 code_point = 0x1f3e5; code_point <= 0x1f3f0; ++code_point) { 658 EXPECT(Unicode::code_point_has_property(code_point, property_any)); 659 EXPECT(Unicode::code_point_has_property(code_point, property_assigned)); 660 EXPECT(Unicode::code_point_has_property(code_point, property_emoji_presentation)); 661 662 EXPECT(!Unicode::code_point_has_property(code_point, property_ascii)); 663 EXPECT(!Unicode::code_point_has_property(code_point, property_white_space)); 664 } 665} 666 667TEST_CASE(script) 668{ 669 auto script = [](StringView name) { 670 auto script = Unicode::script_from_string(name); 671 VERIFY(script.has_value()); 672 return *script; 673 }; 674 675 auto script_latin = script("Latin"sv); 676 auto script_latn = script("Latn"sv); 677 EXPECT_EQ(script_latin, script_latn); 678 679 auto script_cyrillic = script("Cyrillic"sv); 680 auto script_cyrl = script("Cyrl"sv); 681 EXPECT_EQ(script_cyrillic, script_cyrl); 682 683 auto script_greek = script("Greek"sv); 684 auto script_grek = script("Grek"sv); 685 EXPECT_EQ(script_greek, script_grek); 686 687 for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) { 688 EXPECT(Unicode::code_point_has_script(code_point, script_latin)); 689 EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin)); 690 691 EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic)); 692 EXPECT(!Unicode::code_point_has_script(code_point, script_greek)); 693 } 694 695 for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) { 696 EXPECT(Unicode::code_point_has_script(code_point, script_latin)); 697 EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin)); 698 699 EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic)); 700 EXPECT(!Unicode::code_point_has_script(code_point, script_greek)); 701 } 702 703 for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) { 704 EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic)); 705 EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic)); 706 707 EXPECT(!Unicode::code_point_has_script(code_point, script_latin)); 708 EXPECT(!Unicode::code_point_has_script(code_point, script_greek)); 709 } 710 711 for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) { 712 EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic)); 713 EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic)); 714 715 EXPECT(!Unicode::code_point_has_script(code_point, script_latin)); 716 EXPECT(!Unicode::code_point_has_script(code_point, script_greek)); 717 } 718 719 for (u32 code_point = 0x1f80; code_point <= 0x1fb4; ++code_point) { 720 EXPECT(Unicode::code_point_has_script(code_point, script_greek)); 721 EXPECT(Unicode::code_point_has_script_extension(code_point, script_greek)); 722 723 EXPECT(!Unicode::code_point_has_script(code_point, script_latin)); 724 EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic)); 725 } 726} 727 728TEST_CASE(block) 729{ 730 auto block = [](StringView name) { 731 auto block = Unicode::block_from_string(name); 732 VERIFY(block.has_value()); 733 return *block; 734 }; 735 736 auto no_block = block("No_Block"sv); 737 auto block_nb = block("NB"sv); 738 EXPECT_EQ(no_block, block_nb); 739 740 auto block_basic_latin = block("Basic_Latin"sv); 741 auto block_ascii = block("ASCII"sv); 742 EXPECT_EQ(block_basic_latin, block_ascii); 743 744 auto block_greek_coptic = block("Greek_And_Coptic"sv); 745 auto block_greek = block("Greek"sv); 746 EXPECT_EQ(block_greek_coptic, block_greek); 747 748 auto block_variation = block("Variation_Selectors_Supplement"sv); 749 auto block_vs_sup = block("VS_Sup"sv); 750 EXPECT_EQ(block_variation, block_vs_sup); 751 752 for (u32 code_point = 0x0000; code_point <= 0x007F; ++code_point) 753 EXPECT(Unicode::code_point_has_block(code_point, block_basic_latin)); 754 755 for (u32 code_point = 0xE0100; code_point <= 0xE01EF; ++code_point) 756 EXPECT(Unicode::code_point_has_block(code_point, block_variation)); 757 758 for (u32 code_point = 0x0000; code_point <= 0x007F; ++code_point) 759 EXPECT_EQ("Basic Latin"sv, Unicode::code_point_block_display_name(code_point).value()); 760 761 for (u32 code_point = 0x0370; code_point <= 0x03FF; ++code_point) 762 EXPECT_EQ("Greek and Coptic"sv, Unicode::code_point_block_display_name(code_point).value()); 763} 764 765TEST_CASE(script_extension) 766{ 767 auto script = [](StringView name) { 768 auto script = Unicode::script_from_string(name); 769 VERIFY(script.has_value()); 770 return *script; 771 }; 772 773 auto script_latin = script("Latin"sv); 774 auto script_greek = script("Greek"sv); 775 776 for (u32 code_point = 0x363; code_point <= 0x36f; ++code_point) { 777 EXPECT(!Unicode::code_point_has_script(code_point, script_latin)); 778 EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin)); 779 } 780 781 EXPECT(!Unicode::code_point_has_script(0x342, script_greek)); 782 EXPECT(Unicode::code_point_has_script_extension(0x342, script_greek)); 783 784 EXPECT(!Unicode::code_point_has_script(0x345, script_greek)); 785 EXPECT(Unicode::code_point_has_script_extension(0x345, script_greek)); 786 787 EXPECT(!Unicode::code_point_has_script(0x1dc0, script_greek)); 788 EXPECT(Unicode::code_point_has_script_extension(0x1dc0, script_greek)); 789 790 EXPECT(!Unicode::code_point_has_script(0x1dc1, script_greek)); 791 EXPECT(Unicode::code_point_has_script_extension(0x1dc1, script_greek)); 792 793 auto script_common = script("Common"sv); 794 auto script_zyyy = script("Zyyy"sv); 795 EXPECT_EQ(script_common, script_zyyy); 796 797 EXPECT(Unicode::code_point_has_script(0x202f, script_common)); 798 EXPECT(!Unicode::code_point_has_script_extension(0x202f, script_common)); 799 800 EXPECT(Unicode::code_point_has_script(0x3000, script_common)); 801 EXPECT(Unicode::code_point_has_script_extension(0x3000, script_common)); 802 803 auto script_inherited = script("Inherited"sv); 804 auto script_qaai = script("Qaai"sv); 805 auto script_zinh = script("Zinh"sv); 806 EXPECT_EQ(script_inherited, script_qaai); 807 EXPECT_EQ(script_inherited, script_zinh); 808 809 EXPECT(Unicode::code_point_has_script(0x1ced, script_inherited)); 810 EXPECT(!Unicode::code_point_has_script_extension(0x1ced, script_inherited)); 811 812 EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited)); 813 EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited)); 814} 815 816TEST_CASE(code_point_display_name) 817{ 818 auto code_point_display_name = [](u32 code_point) { 819 auto name = Unicode::code_point_display_name(code_point); 820 VERIFY(name.has_value()); 821 return name.release_value(); 822 }; 823 824 // Control code points. 825 EXPECT_EQ(code_point_display_name(0), "NULL"sv); 826 EXPECT_EQ(code_point_display_name(1), "START OF HEADING"sv); 827 EXPECT_EQ(code_point_display_name(0xa), "LINE FEED"sv); 828 829 // Ideographic code points (which already appeared in a range in UnicodeData.txt). 830 EXPECT_EQ(code_point_display_name(0x3400), "CJK UNIFIED IDEOGRAPH-3400"sv); 831 EXPECT_EQ(code_point_display_name(0x3401), "CJK UNIFIED IDEOGRAPH-3401"sv); 832 EXPECT_EQ(code_point_display_name(0x3402), "CJK UNIFIED IDEOGRAPH-3402"sv); 833 EXPECT_EQ(code_point_display_name(0x4dbf), "CJK UNIFIED IDEOGRAPH-4DBF"sv); 834 835 EXPECT_EQ(code_point_display_name(0x20000), "CJK UNIFIED IDEOGRAPH-20000"sv); 836 EXPECT_EQ(code_point_display_name(0x20001), "CJK UNIFIED IDEOGRAPH-20001"sv); 837 EXPECT_EQ(code_point_display_name(0x20002), "CJK UNIFIED IDEOGRAPH-20002"sv); 838 EXPECT_EQ(code_point_display_name(0x2a6df), "CJK UNIFIED IDEOGRAPH-2A6DF"sv); 839 EXPECT(!Unicode::code_point_display_name(0x2a6e0).has_value()); 840 841 // Ideographic code points (which appeared individually in UnicodeData.txt and were coalesced into a range). 842 EXPECT_EQ(code_point_display_name(0x2f800), "CJK COMPATIBILITY IDEOGRAPH-2F800"sv); 843 EXPECT_EQ(code_point_display_name(0x2f801), "CJK COMPATIBILITY IDEOGRAPH-2F801"sv); 844 EXPECT_EQ(code_point_display_name(0x2f802), "CJK COMPATIBILITY IDEOGRAPH-2F802"sv); 845 EXPECT_EQ(code_point_display_name(0x2fa1d), "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv); 846}