Userland/Libraries/LibUnicode/UnicodeUtils.cpp at master

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / Userland / Libraries / LibUnicode / UnicodeUtils.cpp
at master 361 lines 13 kB view raw
wrap content
Timothy Flynn AK+LibUnicode: Implement String::equals_ignoring_case without allocating 3y ago
1393ed20
  1/*
  2 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
  3 *
  4 * SPDX-License-Identifier: BSD-2-Clause
  5 */
  6
  7#include <AK/Platform.h>
  8#include <AK/String.h>
  9#include <AK/StringBuilder.h>
 10#include <AK/Types.h>
 11#include <LibUnicode/CharacterTypes.h>
 12#include <LibUnicode/Segmentation.h>
 13#include <LibUnicode/UnicodeUtils.h>
 14
 15#if ENABLE_UNICODE_DATA
 16#    include <LibUnicode/UnicodeData.h>
 17#endif
 18
 19// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
 20// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
 21
 22namespace Unicode::Detail {
 23
 24#if ENABLE_UNICODE_DATA
 25
 26static bool is_after_uppercase_i(Utf8View const& string, size_t index)
 27{
 28    // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
 29    auto preceding_view = string.substring_view(0, index);
 30    bool found_uppercase_i = false;
 31
 32    // FIXME: Would be better if Utf8View supported reverse iteration.
 33    for (auto code_point : preceding_view) {
 34        if (code_point == 'I') {
 35            found_uppercase_i = true;
 36            continue;
 37        }
 38
 39        auto combining_class = canonical_combining_class(code_point);
 40        if (combining_class == 0 || combining_class == 230)
 41            found_uppercase_i = false;
 42    }
 43
 44    return found_uppercase_i;
 45}
 46
 47static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
 48{
 49    // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
 50    auto preceding_view = string.substring_view(0, index);
 51    bool found_soft_dotted_code_point = false;
 52
 53    // FIXME: Would be better if Utf8View supported reverse iteration.
 54    for (auto code_point : preceding_view) {
 55        if (code_point_has_property(code_point, Property::Soft_Dotted)) {
 56            found_soft_dotted_code_point = true;
 57            continue;
 58        }
 59
 60        auto combining_class = canonical_combining_class(code_point);
 61        if (combining_class == 0 || combining_class == 230)
 62            found_soft_dotted_code_point = false;
 63    }
 64
 65    return found_soft_dotted_code_point;
 66}
 67
 68static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
 69{
 70    // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
 71    // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
 72    // characters and then a cased letter.
 73    auto preceding_view = string.substring_view(0, index);
 74    auto following_view = ((index + byte_length) < string.byte_length())
 75        ? string.substring_view(index + byte_length)
 76        : Utf8View {};
 77
 78    size_t cased_letter_count = 0;
 79
 80    for (auto code_point : preceding_view) {
 81        bool is_cased = code_point_has_property(code_point, Property::Cased);
 82        bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
 83
 84        if (is_cased && !is_case_ignorable)
 85            ++cased_letter_count;
 86        else if (!is_case_ignorable)
 87            cased_letter_count = 0;
 88    }
 89
 90    if (cased_letter_count == 0)
 91        return false;
 92
 93    for (auto code_point : following_view) {
 94        bool is_cased = code_point_has_property(code_point, Property::Cased);
 95        bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
 96
 97        if (is_case_ignorable)
 98            continue;
 99        if (is_cased)
100            return false;
101
102        break;
103    }
104
105    return true;
106}
107
108static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
109{
110    // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
111    auto following_view = ((index + byte_length) < string.byte_length())
112        ? string.substring_view(index + byte_length)
113        : Utf8View {};
114
115    for (auto code_point : following_view) {
116        u32 combining_class = canonical_combining_class(code_point);
117
118        if (combining_class == 0)
119            return false;
120        if (combining_class == 230)
121            return true;
122    }
123
124    return false;
125}
126
127static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
128{
129    // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
130    // intervene between the current character and the combining dot above.
131    auto following_view = ((index + byte_length) < string.byte_length())
132        ? string.substring_view(index + byte_length)
133        : Utf8View {};
134
135    for (auto code_point : following_view) {
136        if (code_point == 0x307)
137            return true;
138
139        u32 combining_class = canonical_combining_class(code_point);
140
141        if (combining_class == 0)
142            return false;
143        if (combining_class == 230)
144            return false;
145    }
146
147    return false;
148}
149
150static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
151{
152    auto requested_locale = Locale::None;
153
154    if (locale.has_value()) {
155        if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
156            requested_locale = *maybe_locale;
157    }
158
159    auto special_casings = special_case_mapping(code_point);
160
161    for (auto const* special_casing : special_casings) {
162        if (special_casing->locale != Locale::None && special_casing->locale != requested_locale)
163            continue;
164
165        switch (special_casing->condition) {
166        case Condition::None:
167            return special_casing;
168
169        case Condition::AfterI:
170            if (is_after_uppercase_i(string, index))
171                return special_casing;
172            break;
173
174        case Condition::AfterSoftDotted:
175            if (is_after_soft_dotted_code_point(string, index))
176                return special_casing;
177            break;
178
179        case Condition::FinalSigma:
180            if (is_final_code_point(string, index, byte_length))
181                return special_casing;
182            break;
183
184        case Condition::MoreAbove:
185            if (is_followed_by_combining_class_above(string, index, byte_length))
186                return special_casing;
187            break;
188
189        case Condition::NotBeforeDot:
190            if (!is_followed_by_combining_dot_above(string, index, byte_length))
191                return special_casing;
192            break;
193        }
194    }
195
196    return nullptr;
197}
198
199template<CaseFoldingStatus... StatusFilter>
200static CaseFolding const* find_matching_case_folding(u32 code_point)
201{
202    auto case_foldings = case_folding_mapping(code_point);
203
204    for (auto const* case_folding : case_foldings) {
205        if (((case_folding->status == StatusFilter) || ...))
206            return case_folding;
207    }
208
209    return nullptr;
210}
211
212#endif
213
214// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
215ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
216{
217#if ENABLE_UNICODE_DATA
218    size_t index = 0;
219    size_t byte_length = 0;
220
221    for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
222        u32 code_point = *it;
223        byte_length = it.underlying_code_point_length_in_bytes();
224
225        auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
226        if (!special_casing) {
227            TRY(builder.try_append_code_point(to_unicode_lowercase(code_point)));
228            continue;
229        }
230
231        for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
232            TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i]));
233    }
234
235    return {};
236#else
237    return Error::from_string_literal("Unicode data has been disabled");
238#endif
239}
240
241// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
242ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
243{
244#if ENABLE_UNICODE_DATA
245    size_t index = 0;
246    size_t byte_length = 0;
247
248    for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
249        u32 code_point = *it;
250        byte_length = it.underlying_code_point_length_in_bytes();
251
252        auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
253        if (!special_casing) {
254            TRY(builder.try_append_code_point(to_unicode_uppercase(code_point)));
255            continue;
256        }
257
258        for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
259            TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i]));
260    }
261
262    return {};
263#else
264    return Error::from_string_literal("Unicode data has been disabled");
265#endif
266}
267
268// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
269ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
270{
271#if ENABLE_UNICODE_DATA
272    // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29,
273    // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following
274    // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between
275    // F and the following word boundary to Lowercase_Mapping(C).
276
277    auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> {
278        auto it = code_points.iterator_at_byte_offset_without_validation(boundary);
279        auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary);
280
281        for (; it != end; ++it) {
282            if (code_point_has_property(*it, Property::Cased))
283                return it;
284        }
285
286        return {};
287    };
288
289    auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> {
290        auto const* special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length);
291        if (!special_casing) {
292            TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
293            return {};
294        }
295
296        for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i)
297            TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i]));
298        return {};
299    };
300
301    size_t boundary = 0;
302
303    while (true) {
304        auto next_boundary = next_word_segmentation_boundary(code_points, boundary);
305        if (!next_boundary.has_value())
306            break;
307
308        if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) {
309            auto code_point = *it.value();
310            auto code_point_offset = code_points.byte_offset_of(*it);
311            auto code_point_length = it->underlying_code_point_length_in_bytes();
312
313            auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary);
314            TRY(builder.try_append(caseless_code_points.as_string()));
315
316            TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length));
317            boundary = code_point_offset + code_point_length;
318        }
319
320        auto substring_to_lowercase = code_points.substring_view(boundary, *next_boundary - boundary);
321        TRY(build_lowercase_string(substring_to_lowercase, builder, locale));
322
323        boundary = *next_boundary;
324    }
325
326    return {};
327#else
328    return Error::from_string_literal("Unicode data has been disabled");
329#endif
330}
331
332// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
333ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder)
334{
335    // toCasefold(X): Map each character C in X to Case_Folding(C).
336    for (auto code_point : code_points) {
337        auto case_folding = casefold_code_point(code_point);
338        TRY(builder.try_append(case_folding));
339    }
340
341    return {};
342}
343
344// https://www.unicode.org/reports/tr44/#CaseFolding.txt
345// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
346Utf32View casefold_code_point(u32 const& code_point)
347{
348#if ENABLE_UNICODE_DATA
349    // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file
350    // CaseFolding.txt in the Unicode Character Database.
351    using enum CaseFoldingStatus;
352
353    if (auto const* case_folding = find_matching_case_folding<Common, Full>(code_point))
354        return Utf32View { case_folding->mapping, case_folding->mapping_size };
355#endif
356
357    // The case foldings are omitted in the data file if they are the same as the code point itself.
358    return Utf32View { &code_point, 1 };
359}
360
361}