AK/Utf16View.cpp at master · jcs.org/serenity

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / AK / Utf16View.cpp
at master 319 lines 9.7 kB view raw
wrap content
Nico Weber AK: Add spec comments to Utf16CodePointIterator::operator*() 3y ago
aa9037ee
  1/*
  2 * Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
  3 *
  4 * SPDX-License-Identifier: BSD-2-Clause
  5 */
  6
  7#include <AK/CharacterTypes.h>
  8#include <AK/Concepts.h>
  9#include <AK/StringBuilder.h>
 10#include <AK/StringView.h>
 11#include <AK/Utf16View.h>
 12#include <AK/Utf32View.h>
 13#include <AK/Utf8View.h>
 14
 15namespace AK {
 16
 17static constexpr u16 high_surrogate_min = 0xd800;
 18static constexpr u16 high_surrogate_max = 0xdbff;
 19static constexpr u16 low_surrogate_min = 0xdc00;
 20static constexpr u16 low_surrogate_max = 0xdfff;
 21static constexpr u32 replacement_code_point = 0xfffd;
 22static constexpr u32 first_supplementary_plane_code_point = 0x10000;
 23
 24template<OneOf<Utf8View, Utf32View> UtfViewType>
 25static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view)
 26{
 27    Utf16Data utf16_data;
 28    TRY(utf16_data.try_ensure_capacity(view.length()));
 29
 30    for (auto code_point : view)
 31        TRY(code_point_to_utf16(utf16_data, code_point));
 32
 33    return utf16_data;
 34}
 35
 36ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view)
 37{
 38    return to_utf16_impl(Utf8View { utf8_view });
 39}
 40
 41ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view)
 42{
 43    return to_utf16_impl(utf8_view);
 44}
 45
 46ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view)
 47{
 48    return to_utf16_impl(utf32_view);
 49}
 50
 51ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point)
 52{
 53    VERIFY(is_unicode(code_point));
 54
 55    if (code_point < first_supplementary_plane_code_point) {
 56        TRY(string.try_append(static_cast<u16>(code_point)));
 57    } else {
 58        code_point -= first_supplementary_plane_code_point;
 59        TRY(string.try_append(static_cast<u16>(high_surrogate_min | (code_point >> 10))));
 60        TRY(string.try_append(static_cast<u16>(low_surrogate_min | (code_point & 0x3ff))));
 61    }
 62
 63    return {};
 64}
 65
 66bool Utf16View::is_high_surrogate(u16 code_unit)
 67{
 68    return (code_unit >= high_surrogate_min) && (code_unit <= high_surrogate_max);
 69}
 70
 71bool Utf16View::is_low_surrogate(u16 code_unit)
 72{
 73    return (code_unit >= low_surrogate_min) && (code_unit <= low_surrogate_max);
 74}
 75
 76u32 Utf16View::decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate)
 77{
 78    VERIFY(is_high_surrogate(high_surrogate));
 79    VERIFY(is_low_surrogate(low_surrogate));
 80
 81    return ((high_surrogate - high_surrogate_min) << 10) + (low_surrogate - low_surrogate_min) + first_supplementary_plane_code_point;
 82}
 83
 84ErrorOr<DeprecatedString> Utf16View::to_deprecated_string(AllowInvalidCodeUnits allow_invalid_code_units) const
 85{
 86    return TRY(to_utf8(allow_invalid_code_units)).to_deprecated_string();
 87}
 88
 89ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
 90{
 91    StringBuilder builder;
 92
 93    if (allow_invalid_code_units == AllowInvalidCodeUnits::Yes) {
 94        for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
 95            if (is_high_surrogate(*ptr)) {
 96                auto const* next = ptr + 1;
 97
 98                if ((next < end_ptr()) && is_low_surrogate(*next)) {
 99                    auto code_point = decode_surrogate_pair(*ptr, *next);
100                    TRY(builder.try_append_code_point(code_point));
101                    ++ptr;
102                    continue;
103                }
104            }
105
106            TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
107        }
108    } else {
109        for (auto code_point : *this)
110            TRY(builder.try_append_code_point(code_point));
111    }
112
113    return builder.to_string();
114}
115
116size_t Utf16View::length_in_code_points() const
117{
118    if (!m_length_in_code_points.has_value())
119        m_length_in_code_points = calculate_length_in_code_points();
120    return *m_length_in_code_points;
121}
122
123u16 Utf16View::code_unit_at(size_t index) const
124{
125    VERIFY(index < length_in_code_units());
126    return m_code_units[index];
127}
128
129u32 Utf16View::code_point_at(size_t index) const
130{
131    VERIFY(index < length_in_code_units());
132
133    u32 code_point = code_unit_at(index);
134    if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point))
135        return code_point;
136    if (is_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
137        return code_point;
138
139    auto second = code_unit_at(index + 1);
140    if (!is_low_surrogate(second))
141        return code_point;
142
143    return decode_surrogate_pair(code_point, second);
144}
145
146size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
147{
148    size_t code_point_offset = 0;
149
150    for (auto it = begin(); it != end(); ++it) {
151        if (code_unit_offset == 0)
152            return code_point_offset;
153
154        code_unit_offset -= it.length_in_code_units();
155        ++code_point_offset;
156    }
157
158    return code_point_offset;
159}
160
161size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
162{
163    size_t code_unit_offset = 0;
164
165    for (auto it = begin(); it != end(); ++it) {
166        if (code_point_offset == 0)
167            return code_unit_offset;
168
169        code_unit_offset += it.length_in_code_units();
170        --code_point_offset;
171    }
172
173    return code_unit_offset;
174}
175
176size_t Utf16View::code_unit_offset_of(Utf16CodePointIterator const& it) const
177{
178    VERIFY(it.m_ptr >= begin_ptr());
179    VERIFY(it.m_ptr <= end_ptr());
180
181    return it.m_ptr - begin_ptr();
182}
183
184Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const
185{
186    VERIFY(!Checked<size_t>::addition_would_overflow(code_unit_offset, code_unit_length));
187    VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());
188
189    return Utf16View { m_code_units.slice(code_unit_offset, code_unit_length) };
190}
191
192Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
193{
194    if (code_point_length == 0)
195        return {};
196
197    auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); };
198    size_t code_point_index = 0;
199    size_t code_unit_offset = 0;
200
201    for (auto it = begin(); it != end(); ++it) {
202        if (code_point_index == code_point_offset)
203            code_unit_offset = code_unit_offset_of(it);
204
205        if (code_point_index == (code_point_offset + code_point_length - 1)) {
206            size_t code_unit_length = code_unit_offset_of(++it) - code_unit_offset;
207            return substring_view(code_unit_offset, code_unit_length);
208        }
209
210        ++code_point_index;
211    }
212
213    VERIFY_NOT_REACHED();
214}
215
216bool Utf16View::validate(size_t& valid_code_units) const
217{
218    valid_code_units = 0;
219
220    for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
221        if (is_high_surrogate(*ptr)) {
222            if ((++ptr >= end_ptr()) || !is_low_surrogate(*ptr))
223                return false;
224            ++valid_code_units;
225        } else if (is_low_surrogate(*ptr)) {
226            return false;
227        }
228
229        ++valid_code_units;
230    }
231
232    return true;
233}
234
235size_t Utf16View::calculate_length_in_code_points() const
236{
237    size_t code_points = 0;
238    for ([[maybe_unused]] auto code_point : *this)
239        ++code_points;
240    return code_points;
241}
242
243bool Utf16View::equals_ignoring_case(Utf16View const& other) const
244{
245    if (length_in_code_units() == 0)
246        return other.length_in_code_units() == 0;
247    if (length_in_code_units() != other.length_in_code_units())
248        return false;
249
250    for (size_t i = 0; i < length_in_code_units(); ++i) {
251        // FIXME: Handle non-ASCII case insensitive comparisons.
252        if (to_ascii_lowercase(m_code_units[i]) != to_ascii_lowercase(other.m_code_units[i]))
253            return false;
254    }
255
256    return true;
257}
258
259Utf16CodePointIterator& Utf16CodePointIterator::operator++()
260{
261    size_t code_units = length_in_code_units();
262
263    if (code_units > m_remaining_code_units) {
264        // If there aren't enough code units remaining, skip to the end.
265        m_ptr += m_remaining_code_units;
266        m_remaining_code_units = 0;
267    } else {
268        m_ptr += code_units;
269        m_remaining_code_units -= code_units;
270    }
271
272    return *this;
273}
274
275u32 Utf16CodePointIterator::operator*() const
276{
277    VERIFY(m_remaining_code_units > 0);
278
279    // rfc2781, 2.2 Decoding UTF-16
280    // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
281    //    of W1. Terminate.
282    // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
283    //    is in error and no valid character can be obtained using W1.
284    //    Terminate.
285    // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
286    //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
287    //    Terminate.
288    // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
289    //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
290    //    W2 as its 10 low-order bits.
291    // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
292
293    if (Utf16View::is_high_surrogate(*m_ptr)) {
294        if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1)))
295            return Utf16View::decode_surrogate_pair(*m_ptr, *(m_ptr + 1));
296        return replacement_code_point;
297    } else if (Utf16View::is_low_surrogate(*m_ptr)) {
298        return replacement_code_point;
299    }
300
301    return static_cast<u32>(*m_ptr);
302}
303
304size_t Utf16CodePointIterator::length_in_code_units() const
305{
306    VERIFY(m_remaining_code_units > 0);
307
308    if (Utf16View::is_high_surrogate(*m_ptr)) {
309        if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1)))
310            return 2;
311    }
312
313    // If this return is reached, either the encoded code point is a valid single code unit, or that
314    // code point is invalid (e.g. began with a low surrogate, or a low surrogate did not follow a
315    // high surrogate). In the latter case, a single replacement code unit will be used.
316    return 1;
317}
318
319}