Serenity Operating System
at master 361 lines 13 kB view raw
1/* 2 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/Platform.h> 8#include <AK/String.h> 9#include <AK/StringBuilder.h> 10#include <AK/Types.h> 11#include <LibUnicode/CharacterTypes.h> 12#include <LibUnicode/Segmentation.h> 13#include <LibUnicode/UnicodeUtils.h> 14 15#if ENABLE_UNICODE_DATA 16# include <LibUnicode/UnicodeData.h> 17#endif 18 19// For details on the algorithms used here, see Section 3.13 Default Case Algorithms 20// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf 21 22namespace Unicode::Detail { 23 24#if ENABLE_UNICODE_DATA 25 26static bool is_after_uppercase_i(Utf8View const& string, size_t index) 27{ 28 // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0. 29 auto preceding_view = string.substring_view(0, index); 30 bool found_uppercase_i = false; 31 32 // FIXME: Would be better if Utf8View supported reverse iteration. 33 for (auto code_point : preceding_view) { 34 if (code_point == 'I') { 35 found_uppercase_i = true; 36 continue; 37 } 38 39 auto combining_class = canonical_combining_class(code_point); 40 if (combining_class == 0 || combining_class == 230) 41 found_uppercase_i = false; 42 } 43 44 return found_uppercase_i; 45} 46 47static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index) 48{ 49 // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above). 50 auto preceding_view = string.substring_view(0, index); 51 bool found_soft_dotted_code_point = false; 52 53 // FIXME: Would be better if Utf8View supported reverse iteration. 54 for (auto code_point : preceding_view) { 55 if (code_point_has_property(code_point, Property::Soft_Dotted)) { 56 found_soft_dotted_code_point = true; 57 continue; 58 } 59 60 auto combining_class = canonical_combining_class(code_point); 61 if (combining_class == 0 || combining_class == 230) 62 found_soft_dotted_code_point = false; 63 } 64 65 return found_soft_dotted_code_point; 66} 67 68static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length) 69{ 70 // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable 71 // characters, and C is not followed by a sequence consisting of zero or more case-ignorable 72 // characters and then a cased letter. 73 auto preceding_view = string.substring_view(0, index); 74 auto following_view = ((index + byte_length) < string.byte_length()) 75 ? string.substring_view(index + byte_length) 76 : Utf8View {}; 77 78 size_t cased_letter_count = 0; 79 80 for (auto code_point : preceding_view) { 81 bool is_cased = code_point_has_property(code_point, Property::Cased); 82 bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); 83 84 if (is_cased && !is_case_ignorable) 85 ++cased_letter_count; 86 else if (!is_case_ignorable) 87 cased_letter_count = 0; 88 } 89 90 if (cased_letter_count == 0) 91 return false; 92 93 for (auto code_point : following_view) { 94 bool is_cased = code_point_has_property(code_point, Property::Cased); 95 bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); 96 97 if (is_case_ignorable) 98 continue; 99 if (is_cased) 100 return false; 101 102 break; 103 } 104 105 return true; 106} 107 108static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length) 109{ 110 // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above). 111 auto following_view = ((index + byte_length) < string.byte_length()) 112 ? string.substring_view(index + byte_length) 113 : Utf8View {}; 114 115 for (auto code_point : following_view) { 116 u32 combining_class = canonical_combining_class(code_point); 117 118 if (combining_class == 0) 119 return false; 120 if (combining_class == 230) 121 return true; 122 } 123 124 return false; 125} 126 127static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length) 128{ 129 // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may 130 // intervene between the current character and the combining dot above. 131 auto following_view = ((index + byte_length) < string.byte_length()) 132 ? string.substring_view(index + byte_length) 133 : Utf8View {}; 134 135 for (auto code_point : following_view) { 136 if (code_point == 0x307) 137 return true; 138 139 u32 combining_class = canonical_combining_class(code_point); 140 141 if (combining_class == 0) 142 return false; 143 if (combining_class == 230) 144 return false; 145 } 146 147 return false; 148} 149 150static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length) 151{ 152 auto requested_locale = Locale::None; 153 154 if (locale.has_value()) { 155 if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value()) 156 requested_locale = *maybe_locale; 157 } 158 159 auto special_casings = special_case_mapping(code_point); 160 161 for (auto const* special_casing : special_casings) { 162 if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) 163 continue; 164 165 switch (special_casing->condition) { 166 case Condition::None: 167 return special_casing; 168 169 case Condition::AfterI: 170 if (is_after_uppercase_i(string, index)) 171 return special_casing; 172 break; 173 174 case Condition::AfterSoftDotted: 175 if (is_after_soft_dotted_code_point(string, index)) 176 return special_casing; 177 break; 178 179 case Condition::FinalSigma: 180 if (is_final_code_point(string, index, byte_length)) 181 return special_casing; 182 break; 183 184 case Condition::MoreAbove: 185 if (is_followed_by_combining_class_above(string, index, byte_length)) 186 return special_casing; 187 break; 188 189 case Condition::NotBeforeDot: 190 if (!is_followed_by_combining_dot_above(string, index, byte_length)) 191 return special_casing; 192 break; 193 } 194 } 195 196 return nullptr; 197} 198 199template<CaseFoldingStatus... StatusFilter> 200static CaseFolding const* find_matching_case_folding(u32 code_point) 201{ 202 auto case_foldings = case_folding_mapping(code_point); 203 204 for (auto const* case_folding : case_foldings) { 205 if (((case_folding->status == StatusFilter) || ...)) 206 return case_folding; 207 } 208 209 return nullptr; 210} 211 212#endif 213 214// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 215ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale) 216{ 217#if ENABLE_UNICODE_DATA 218 size_t index = 0; 219 size_t byte_length = 0; 220 221 for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) { 222 u32 code_point = *it; 223 byte_length = it.underlying_code_point_length_in_bytes(); 224 225 auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); 226 if (!special_casing) { 227 TRY(builder.try_append_code_point(to_unicode_lowercase(code_point))); 228 continue; 229 } 230 231 for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i) 232 TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i])); 233 } 234 235 return {}; 236#else 237 return Error::from_string_literal("Unicode data has been disabled"); 238#endif 239} 240 241// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 242ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale) 243{ 244#if ENABLE_UNICODE_DATA 245 size_t index = 0; 246 size_t byte_length = 0; 247 248 for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) { 249 u32 code_point = *it; 250 byte_length = it.underlying_code_point_length_in_bytes(); 251 252 auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); 253 if (!special_casing) { 254 TRY(builder.try_append_code_point(to_unicode_uppercase(code_point))); 255 continue; 256 } 257 258 for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i) 259 TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i])); 260 } 261 262 return {}; 263#else 264 return Error::from_string_literal("Unicode data has been disabled"); 265#endif 266} 267 268// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 269ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale) 270{ 271#if ENABLE_UNICODE_DATA 272 // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29, 273 // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following 274 // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between 275 // F and the following word boundary to Lowercase_Mapping(C). 276 277 auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> { 278 auto it = code_points.iterator_at_byte_offset_without_validation(boundary); 279 auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary); 280 281 for (; it != end; ++it) { 282 if (code_point_has_property(*it, Property::Cased)) 283 return it; 284 } 285 286 return {}; 287 }; 288 289 auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> { 290 auto const* special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length); 291 if (!special_casing) { 292 TRY(builder.try_append_code_point(to_unicode_titlecase(code_point))); 293 return {}; 294 } 295 296 for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i) 297 TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i])); 298 return {}; 299 }; 300 301 size_t boundary = 0; 302 303 while (true) { 304 auto next_boundary = next_word_segmentation_boundary(code_points, boundary); 305 if (!next_boundary.has_value()) 306 break; 307 308 if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) { 309 auto code_point = *it.value(); 310 auto code_point_offset = code_points.byte_offset_of(*it); 311 auto code_point_length = it->underlying_code_point_length_in_bytes(); 312 313 auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary); 314 TRY(builder.try_append(caseless_code_points.as_string())); 315 316 TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length)); 317 boundary = code_point_offset + code_point_length; 318 } 319 320 auto substring_to_lowercase = code_points.substring_view(boundary, *next_boundary - boundary); 321 TRY(build_lowercase_string(substring_to_lowercase, builder, locale)); 322 323 boundary = *next_boundary; 324 } 325 326 return {}; 327#else 328 return Error::from_string_literal("Unicode data has been disabled"); 329#endif 330} 331 332// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 333ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder) 334{ 335 // toCasefold(X): Map each character C in X to Case_Folding(C). 336 for (auto code_point : code_points) { 337 auto case_folding = casefold_code_point(code_point); 338 TRY(builder.try_append(case_folding)); 339 } 340 341 return {}; 342} 343 344// https://www.unicode.org/reports/tr44/#CaseFolding.txt 345// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 346Utf32View casefold_code_point(u32 const& code_point) 347{ 348#if ENABLE_UNICODE_DATA 349 // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file 350 // CaseFolding.txt in the Unicode Character Database. 351 using enum CaseFoldingStatus; 352 353 if (auto const* case_folding = find_matching_case_folding<Common, Full>(code_point)) 354 return Utf32View { case_folding->mapping, case_folding->mapping_size }; 355#endif 356 357 // The case foldings are omitted in the data file if they are the same as the code point itself. 358 return Utf32View { &code_point, 1 }; 359} 360 361}