Serenity Operating System
1/*
2 * Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
3 *
4 * SPDX-License-Identifier: BSD-2-Clause
5 */
6
7#include <AK/CharacterTypes.h>
8#include <AK/Concepts.h>
9#include <AK/StringBuilder.h>
10#include <AK/StringView.h>
11#include <AK/Utf16View.h>
12#include <AK/Utf32View.h>
13#include <AK/Utf8View.h>
14
15namespace AK {
16
17static constexpr u16 high_surrogate_min = 0xd800;
18static constexpr u16 high_surrogate_max = 0xdbff;
19static constexpr u16 low_surrogate_min = 0xdc00;
20static constexpr u16 low_surrogate_max = 0xdfff;
21static constexpr u32 replacement_code_point = 0xfffd;
22static constexpr u32 first_supplementary_plane_code_point = 0x10000;
23
24template<OneOf<Utf8View, Utf32View> UtfViewType>
25static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view)
26{
27 Utf16Data utf16_data;
28 TRY(utf16_data.try_ensure_capacity(view.length()));
29
30 for (auto code_point : view)
31 TRY(code_point_to_utf16(utf16_data, code_point));
32
33 return utf16_data;
34}
35
36ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view)
37{
38 return to_utf16_impl(Utf8View { utf8_view });
39}
40
41ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view)
42{
43 return to_utf16_impl(utf8_view);
44}
45
46ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view)
47{
48 return to_utf16_impl(utf32_view);
49}
50
51ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point)
52{
53 VERIFY(is_unicode(code_point));
54
55 if (code_point < first_supplementary_plane_code_point) {
56 TRY(string.try_append(static_cast<u16>(code_point)));
57 } else {
58 code_point -= first_supplementary_plane_code_point;
59 TRY(string.try_append(static_cast<u16>(high_surrogate_min | (code_point >> 10))));
60 TRY(string.try_append(static_cast<u16>(low_surrogate_min | (code_point & 0x3ff))));
61 }
62
63 return {};
64}
65
66bool Utf16View::is_high_surrogate(u16 code_unit)
67{
68 return (code_unit >= high_surrogate_min) && (code_unit <= high_surrogate_max);
69}
70
71bool Utf16View::is_low_surrogate(u16 code_unit)
72{
73 return (code_unit >= low_surrogate_min) && (code_unit <= low_surrogate_max);
74}
75
76u32 Utf16View::decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate)
77{
78 VERIFY(is_high_surrogate(high_surrogate));
79 VERIFY(is_low_surrogate(low_surrogate));
80
81 return ((high_surrogate - high_surrogate_min) << 10) + (low_surrogate - low_surrogate_min) + first_supplementary_plane_code_point;
82}
83
84ErrorOr<DeprecatedString> Utf16View::to_deprecated_string(AllowInvalidCodeUnits allow_invalid_code_units) const
85{
86 return TRY(to_utf8(allow_invalid_code_units)).to_deprecated_string();
87}
88
89ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
90{
91 StringBuilder builder;
92
93 if (allow_invalid_code_units == AllowInvalidCodeUnits::Yes) {
94 for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
95 if (is_high_surrogate(*ptr)) {
96 auto const* next = ptr + 1;
97
98 if ((next < end_ptr()) && is_low_surrogate(*next)) {
99 auto code_point = decode_surrogate_pair(*ptr, *next);
100 TRY(builder.try_append_code_point(code_point));
101 ++ptr;
102 continue;
103 }
104 }
105
106 TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
107 }
108 } else {
109 for (auto code_point : *this)
110 TRY(builder.try_append_code_point(code_point));
111 }
112
113 return builder.to_string();
114}
115
116size_t Utf16View::length_in_code_points() const
117{
118 if (!m_length_in_code_points.has_value())
119 m_length_in_code_points = calculate_length_in_code_points();
120 return *m_length_in_code_points;
121}
122
123u16 Utf16View::code_unit_at(size_t index) const
124{
125 VERIFY(index < length_in_code_units());
126 return m_code_units[index];
127}
128
129u32 Utf16View::code_point_at(size_t index) const
130{
131 VERIFY(index < length_in_code_units());
132
133 u32 code_point = code_unit_at(index);
134 if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point))
135 return code_point;
136 if (is_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
137 return code_point;
138
139 auto second = code_unit_at(index + 1);
140 if (!is_low_surrogate(second))
141 return code_point;
142
143 return decode_surrogate_pair(code_point, second);
144}
145
146size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
147{
148 size_t code_point_offset = 0;
149
150 for (auto it = begin(); it != end(); ++it) {
151 if (code_unit_offset == 0)
152 return code_point_offset;
153
154 code_unit_offset -= it.length_in_code_units();
155 ++code_point_offset;
156 }
157
158 return code_point_offset;
159}
160
161size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
162{
163 size_t code_unit_offset = 0;
164
165 for (auto it = begin(); it != end(); ++it) {
166 if (code_point_offset == 0)
167 return code_unit_offset;
168
169 code_unit_offset += it.length_in_code_units();
170 --code_point_offset;
171 }
172
173 return code_unit_offset;
174}
175
176size_t Utf16View::code_unit_offset_of(Utf16CodePointIterator const& it) const
177{
178 VERIFY(it.m_ptr >= begin_ptr());
179 VERIFY(it.m_ptr <= end_ptr());
180
181 return it.m_ptr - begin_ptr();
182}
183
184Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const
185{
186 VERIFY(!Checked<size_t>::addition_would_overflow(code_unit_offset, code_unit_length));
187 VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());
188
189 return Utf16View { m_code_units.slice(code_unit_offset, code_unit_length) };
190}
191
192Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
193{
194 if (code_point_length == 0)
195 return {};
196
197 auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); };
198 size_t code_point_index = 0;
199 size_t code_unit_offset = 0;
200
201 for (auto it = begin(); it != end(); ++it) {
202 if (code_point_index == code_point_offset)
203 code_unit_offset = code_unit_offset_of(it);
204
205 if (code_point_index == (code_point_offset + code_point_length - 1)) {
206 size_t code_unit_length = code_unit_offset_of(++it) - code_unit_offset;
207 return substring_view(code_unit_offset, code_unit_length);
208 }
209
210 ++code_point_index;
211 }
212
213 VERIFY_NOT_REACHED();
214}
215
216bool Utf16View::validate(size_t& valid_code_units) const
217{
218 valid_code_units = 0;
219
220 for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) {
221 if (is_high_surrogate(*ptr)) {
222 if ((++ptr >= end_ptr()) || !is_low_surrogate(*ptr))
223 return false;
224 ++valid_code_units;
225 } else if (is_low_surrogate(*ptr)) {
226 return false;
227 }
228
229 ++valid_code_units;
230 }
231
232 return true;
233}
234
235size_t Utf16View::calculate_length_in_code_points() const
236{
237 size_t code_points = 0;
238 for ([[maybe_unused]] auto code_point : *this)
239 ++code_points;
240 return code_points;
241}
242
243bool Utf16View::equals_ignoring_case(Utf16View const& other) const
244{
245 if (length_in_code_units() == 0)
246 return other.length_in_code_units() == 0;
247 if (length_in_code_units() != other.length_in_code_units())
248 return false;
249
250 for (size_t i = 0; i < length_in_code_units(); ++i) {
251 // FIXME: Handle non-ASCII case insensitive comparisons.
252 if (to_ascii_lowercase(m_code_units[i]) != to_ascii_lowercase(other.m_code_units[i]))
253 return false;
254 }
255
256 return true;
257}
258
259Utf16CodePointIterator& Utf16CodePointIterator::operator++()
260{
261 size_t code_units = length_in_code_units();
262
263 if (code_units > m_remaining_code_units) {
264 // If there aren't enough code units remaining, skip to the end.
265 m_ptr += m_remaining_code_units;
266 m_remaining_code_units = 0;
267 } else {
268 m_ptr += code_units;
269 m_remaining_code_units -= code_units;
270 }
271
272 return *this;
273}
274
275u32 Utf16CodePointIterator::operator*() const
276{
277 VERIFY(m_remaining_code_units > 0);
278
279 // rfc2781, 2.2 Decoding UTF-16
280 // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
281 // of W1. Terminate.
282 // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
283 // is in error and no valid character can be obtained using W1.
284 // Terminate.
285 // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
286 // is not between 0xDC00 and 0xDFFF, the sequence is in error.
287 // Terminate.
288 // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
289 // bits of W1 as its 10 high-order bits and the 10 low-order bits of
290 // W2 as its 10 low-order bits.
291 // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
292
293 if (Utf16View::is_high_surrogate(*m_ptr)) {
294 if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1)))
295 return Utf16View::decode_surrogate_pair(*m_ptr, *(m_ptr + 1));
296 return replacement_code_point;
297 } else if (Utf16View::is_low_surrogate(*m_ptr)) {
298 return replacement_code_point;
299 }
300
301 return static_cast<u32>(*m_ptr);
302}
303
304size_t Utf16CodePointIterator::length_in_code_units() const
305{
306 VERIFY(m_remaining_code_units > 0);
307
308 if (Utf16View::is_high_surrogate(*m_ptr)) {
309 if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1)))
310 return 2;
311 }
312
313 // If this return is reached, either the encoded code point is a valid single code unit, or that
314 // code point is invalid (e.g. began with a low surrogate, or a low surrogate did not follow a
315 // high surrogate). In the latter case, a single replacement code unit will be used.
316 return 1;
317}
318
319}