Serenity Operating System
1/*
2 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/Platform.h>
8#include <AK/String.h>
9#include <AK/StringBuilder.h>
10#include <AK/Types.h>
11#include <LibUnicode/CharacterTypes.h>
12#include <LibUnicode/Segmentation.h>
13#include <LibUnicode/UnicodeUtils.h>
14
15#if ENABLE_UNICODE_DATA
16# include <LibUnicode/UnicodeData.h>
17#endif
18
19// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
20// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
21
22namespace Unicode::Detail {
23
24#if ENABLE_UNICODE_DATA
25
26static bool is_after_uppercase_i(Utf8View const& string, size_t index)
27{
28 // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
29 auto preceding_view = string.substring_view(0, index);
30 bool found_uppercase_i = false;
31
32 // FIXME: Would be better if Utf8View supported reverse iteration.
33 for (auto code_point : preceding_view) {
34 if (code_point == 'I') {
35 found_uppercase_i = true;
36 continue;
37 }
38
39 auto combining_class = canonical_combining_class(code_point);
40 if (combining_class == 0 || combining_class == 230)
41 found_uppercase_i = false;
42 }
43
44 return found_uppercase_i;
45}
46
47static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
48{
49 // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
50 auto preceding_view = string.substring_view(0, index);
51 bool found_soft_dotted_code_point = false;
52
53 // FIXME: Would be better if Utf8View supported reverse iteration.
54 for (auto code_point : preceding_view) {
55 if (code_point_has_property(code_point, Property::Soft_Dotted)) {
56 found_soft_dotted_code_point = true;
57 continue;
58 }
59
60 auto combining_class = canonical_combining_class(code_point);
61 if (combining_class == 0 || combining_class == 230)
62 found_soft_dotted_code_point = false;
63 }
64
65 return found_soft_dotted_code_point;
66}
67
68static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
69{
70 // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
71 // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
72 // characters and then a cased letter.
73 auto preceding_view = string.substring_view(0, index);
74 auto following_view = ((index + byte_length) < string.byte_length())
75 ? string.substring_view(index + byte_length)
76 : Utf8View {};
77
78 size_t cased_letter_count = 0;
79
80 for (auto code_point : preceding_view) {
81 bool is_cased = code_point_has_property(code_point, Property::Cased);
82 bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
83
84 if (is_cased && !is_case_ignorable)
85 ++cased_letter_count;
86 else if (!is_case_ignorable)
87 cased_letter_count = 0;
88 }
89
90 if (cased_letter_count == 0)
91 return false;
92
93 for (auto code_point : following_view) {
94 bool is_cased = code_point_has_property(code_point, Property::Cased);
95 bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
96
97 if (is_case_ignorable)
98 continue;
99 if (is_cased)
100 return false;
101
102 break;
103 }
104
105 return true;
106}
107
108static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
109{
110 // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
111 auto following_view = ((index + byte_length) < string.byte_length())
112 ? string.substring_view(index + byte_length)
113 : Utf8View {};
114
115 for (auto code_point : following_view) {
116 u32 combining_class = canonical_combining_class(code_point);
117
118 if (combining_class == 0)
119 return false;
120 if (combining_class == 230)
121 return true;
122 }
123
124 return false;
125}
126
127static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
128{
129 // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
130 // intervene between the current character and the combining dot above.
131 auto following_view = ((index + byte_length) < string.byte_length())
132 ? string.substring_view(index + byte_length)
133 : Utf8View {};
134
135 for (auto code_point : following_view) {
136 if (code_point == 0x307)
137 return true;
138
139 u32 combining_class = canonical_combining_class(code_point);
140
141 if (combining_class == 0)
142 return false;
143 if (combining_class == 230)
144 return false;
145 }
146
147 return false;
148}
149
150static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
151{
152 auto requested_locale = Locale::None;
153
154 if (locale.has_value()) {
155 if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
156 requested_locale = *maybe_locale;
157 }
158
159 auto special_casings = special_case_mapping(code_point);
160
161 for (auto const* special_casing : special_casings) {
162 if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
163 continue;
164
165 switch (special_casing->condition) {
166 case Condition::None:
167 return special_casing;
168
169 case Condition::AfterI:
170 if (is_after_uppercase_i(string, index))
171 return special_casing;
172 break;
173
174 case Condition::AfterSoftDotted:
175 if (is_after_soft_dotted_code_point(string, index))
176 return special_casing;
177 break;
178
179 case Condition::FinalSigma:
180 if (is_final_code_point(string, index, byte_length))
181 return special_casing;
182 break;
183
184 case Condition::MoreAbove:
185 if (is_followed_by_combining_class_above(string, index, byte_length))
186 return special_casing;
187 break;
188
189 case Condition::NotBeforeDot:
190 if (!is_followed_by_combining_dot_above(string, index, byte_length))
191 return special_casing;
192 break;
193 }
194 }
195
196 return nullptr;
197}
198
199template<CaseFoldingStatus... StatusFilter>
200static CaseFolding const* find_matching_case_folding(u32 code_point)
201{
202 auto case_foldings = case_folding_mapping(code_point);
203
204 for (auto const* case_folding : case_foldings) {
205 if (((case_folding->status == StatusFilter) || ...))
206 return case_folding;
207 }
208
209 return nullptr;
210}
211
212#endif
213
214// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
215ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
216{
217#if ENABLE_UNICODE_DATA
218 size_t index = 0;
219 size_t byte_length = 0;
220
221 for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
222 u32 code_point = *it;
223 byte_length = it.underlying_code_point_length_in_bytes();
224
225 auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
226 if (!special_casing) {
227 TRY(builder.try_append_code_point(to_unicode_lowercase(code_point)));
228 continue;
229 }
230
231 for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
232 TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i]));
233 }
234
235 return {};
236#else
237 return Error::from_string_literal("Unicode data has been disabled");
238#endif
239}
240
241// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
242ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
243{
244#if ENABLE_UNICODE_DATA
245 size_t index = 0;
246 size_t byte_length = 0;
247
248 for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
249 u32 code_point = *it;
250 byte_length = it.underlying_code_point_length_in_bytes();
251
252 auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
253 if (!special_casing) {
254 TRY(builder.try_append_code_point(to_unicode_uppercase(code_point)));
255 continue;
256 }
257
258 for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
259 TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i]));
260 }
261
262 return {};
263#else
264 return Error::from_string_literal("Unicode data has been disabled");
265#endif
266}
267
268// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
269ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
270{
271#if ENABLE_UNICODE_DATA
272 // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29,
273 // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following
274 // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between
275 // F and the following word boundary to Lowercase_Mapping(C).
276
277 auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> {
278 auto it = code_points.iterator_at_byte_offset_without_validation(boundary);
279 auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary);
280
281 for (; it != end; ++it) {
282 if (code_point_has_property(*it, Property::Cased))
283 return it;
284 }
285
286 return {};
287 };
288
289 auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> {
290 auto const* special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length);
291 if (!special_casing) {
292 TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
293 return {};
294 }
295
296 for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i)
297 TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i]));
298 return {};
299 };
300
301 size_t boundary = 0;
302
303 while (true) {
304 auto next_boundary = next_word_segmentation_boundary(code_points, boundary);
305 if (!next_boundary.has_value())
306 break;
307
308 if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) {
309 auto code_point = *it.value();
310 auto code_point_offset = code_points.byte_offset_of(*it);
311 auto code_point_length = it->underlying_code_point_length_in_bytes();
312
313 auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary);
314 TRY(builder.try_append(caseless_code_points.as_string()));
315
316 TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length));
317 boundary = code_point_offset + code_point_length;
318 }
319
320 auto substring_to_lowercase = code_points.substring_view(boundary, *next_boundary - boundary);
321 TRY(build_lowercase_string(substring_to_lowercase, builder, locale));
322
323 boundary = *next_boundary;
324 }
325
326 return {};
327#else
328 return Error::from_string_literal("Unicode data has been disabled");
329#endif
330}
331
332// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
333ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder)
334{
335 // toCasefold(X): Map each character C in X to Case_Folding(C).
336 for (auto code_point : code_points) {
337 auto case_folding = casefold_code_point(code_point);
338 TRY(builder.try_append(case_folding));
339 }
340
341 return {};
342}
343
344// https://www.unicode.org/reports/tr44/#CaseFolding.txt
345// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
346Utf32View casefold_code_point(u32 const& code_point)
347{
348#if ENABLE_UNICODE_DATA
349 // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file
350 // CaseFolding.txt in the Unicode Character Database.
351 using enum CaseFoldingStatus;
352
353 if (auto const* case_folding = find_matching_case_folding<Common, Full>(code_point))
354 return Utf32View { case_folding->mapping, case_folding->mapping_size };
355#endif
356
357 // The case foldings are omitted in the data file if they are the same as the code point itself.
358 return Utf32View { &code_point, 1 };
359}
360
361}