// Serenity Operating System
// Source listing at master (319 lines, 9.7 kB).
1/* 2 * Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <AK/CharacterTypes.h> 8#include <AK/Concepts.h> 9#include <AK/StringBuilder.h> 10#include <AK/StringView.h> 11#include <AK/Utf16View.h> 12#include <AK/Utf32View.h> 13#include <AK/Utf8View.h> 14 15namespace AK { 16 17static constexpr u16 high_surrogate_min = 0xd800; 18static constexpr u16 high_surrogate_max = 0xdbff; 19static constexpr u16 low_surrogate_min = 0xdc00; 20static constexpr u16 low_surrogate_max = 0xdfff; 21static constexpr u32 replacement_code_point = 0xfffd; 22static constexpr u32 first_supplementary_plane_code_point = 0x10000; 23 24template<OneOf<Utf8View, Utf32View> UtfViewType> 25static ErrorOr<Utf16Data> to_utf16_impl(UtfViewType const& view) 26{ 27 Utf16Data utf16_data; 28 TRY(utf16_data.try_ensure_capacity(view.length())); 29 30 for (auto code_point : view) 31 TRY(code_point_to_utf16(utf16_data, code_point)); 32 33 return utf16_data; 34} 35 36ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view) 37{ 38 return to_utf16_impl(Utf8View { utf8_view }); 39} 40 41ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view) 42{ 43 return to_utf16_impl(utf8_view); 44} 45 46ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view) 47{ 48 return to_utf16_impl(utf32_view); 49} 50 51ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point) 52{ 53 VERIFY(is_unicode(code_point)); 54 55 if (code_point < first_supplementary_plane_code_point) { 56 TRY(string.try_append(static_cast<u16>(code_point))); 57 } else { 58 code_point -= first_supplementary_plane_code_point; 59 TRY(string.try_append(static_cast<u16>(high_surrogate_min | (code_point >> 10)))); 60 TRY(string.try_append(static_cast<u16>(low_surrogate_min | (code_point & 0x3ff)))); 61 } 62 63 return {}; 64} 65 66bool Utf16View::is_high_surrogate(u16 code_unit) 67{ 68 return (code_unit >= high_surrogate_min) && (code_unit <= high_surrogate_max); 69} 70 71bool 
Utf16View::is_low_surrogate(u16 code_unit) 72{ 73 return (code_unit >= low_surrogate_min) && (code_unit <= low_surrogate_max); 74} 75 76u32 Utf16View::decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate) 77{ 78 VERIFY(is_high_surrogate(high_surrogate)); 79 VERIFY(is_low_surrogate(low_surrogate)); 80 81 return ((high_surrogate - high_surrogate_min) << 10) + (low_surrogate - low_surrogate_min) + first_supplementary_plane_code_point; 82} 83 84ErrorOr<DeprecatedString> Utf16View::to_deprecated_string(AllowInvalidCodeUnits allow_invalid_code_units) const 85{ 86 return TRY(to_utf8(allow_invalid_code_units)).to_deprecated_string(); 87} 88 89ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const 90{ 91 StringBuilder builder; 92 93 if (allow_invalid_code_units == AllowInvalidCodeUnits::Yes) { 94 for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) { 95 if (is_high_surrogate(*ptr)) { 96 auto const* next = ptr + 1; 97 98 if ((next < end_ptr()) && is_low_surrogate(*next)) { 99 auto code_point = decode_surrogate_pair(*ptr, *next); 100 TRY(builder.try_append_code_point(code_point)); 101 ++ptr; 102 continue; 103 } 104 } 105 106 TRY(builder.try_append_code_point(static_cast<u32>(*ptr))); 107 } 108 } else { 109 for (auto code_point : *this) 110 TRY(builder.try_append_code_point(code_point)); 111 } 112 113 return builder.to_string(); 114} 115 116size_t Utf16View::length_in_code_points() const 117{ 118 if (!m_length_in_code_points.has_value()) 119 m_length_in_code_points = calculate_length_in_code_points(); 120 return *m_length_in_code_points; 121} 122 123u16 Utf16View::code_unit_at(size_t index) const 124{ 125 VERIFY(index < length_in_code_units()); 126 return m_code_units[index]; 127} 128 129u32 Utf16View::code_point_at(size_t index) const 130{ 131 VERIFY(index < length_in_code_units()); 132 133 u32 code_point = code_unit_at(index); 134 if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point)) 135 return code_point; 136 if 
(is_low_surrogate(code_point) || (index + 1 == length_in_code_units())) 137 return code_point; 138 139 auto second = code_unit_at(index + 1); 140 if (!is_low_surrogate(second)) 141 return code_point; 142 143 return decode_surrogate_pair(code_point, second); 144} 145 146size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const 147{ 148 size_t code_point_offset = 0; 149 150 for (auto it = begin(); it != end(); ++it) { 151 if (code_unit_offset == 0) 152 return code_point_offset; 153 154 code_unit_offset -= it.length_in_code_units(); 155 ++code_point_offset; 156 } 157 158 return code_point_offset; 159} 160 161size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const 162{ 163 size_t code_unit_offset = 0; 164 165 for (auto it = begin(); it != end(); ++it) { 166 if (code_point_offset == 0) 167 return code_unit_offset; 168 169 code_unit_offset += it.length_in_code_units(); 170 --code_point_offset; 171 } 172 173 return code_unit_offset; 174} 175 176size_t Utf16View::code_unit_offset_of(Utf16CodePointIterator const& it) const 177{ 178 VERIFY(it.m_ptr >= begin_ptr()); 179 VERIFY(it.m_ptr <= end_ptr()); 180 181 return it.m_ptr - begin_ptr(); 182} 183 184Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const 185{ 186 VERIFY(!Checked<size_t>::addition_would_overflow(code_unit_offset, code_unit_length)); 187 VERIFY(code_unit_offset + code_unit_length <= length_in_code_units()); 188 189 return Utf16View { m_code_units.slice(code_unit_offset, code_unit_length) }; 190} 191 192Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const 193{ 194 if (code_point_length == 0) 195 return {}; 196 197 auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); }; 198 size_t code_point_index = 0; 199 size_t code_unit_offset = 0; 200 201 for (auto it = begin(); it != end(); ++it) { 202 if (code_point_index == code_point_offset) 203 code_unit_offset = 
code_unit_offset_of(it); 204 205 if (code_point_index == (code_point_offset + code_point_length - 1)) { 206 size_t code_unit_length = code_unit_offset_of(++it) - code_unit_offset; 207 return substring_view(code_unit_offset, code_unit_length); 208 } 209 210 ++code_point_index; 211 } 212 213 VERIFY_NOT_REACHED(); 214} 215 216bool Utf16View::validate(size_t& valid_code_units) const 217{ 218 valid_code_units = 0; 219 220 for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) { 221 if (is_high_surrogate(*ptr)) { 222 if ((++ptr >= end_ptr()) || !is_low_surrogate(*ptr)) 223 return false; 224 ++valid_code_units; 225 } else if (is_low_surrogate(*ptr)) { 226 return false; 227 } 228 229 ++valid_code_units; 230 } 231 232 return true; 233} 234 235size_t Utf16View::calculate_length_in_code_points() const 236{ 237 size_t code_points = 0; 238 for ([[maybe_unused]] auto code_point : *this) 239 ++code_points; 240 return code_points; 241} 242 243bool Utf16View::equals_ignoring_case(Utf16View const& other) const 244{ 245 if (length_in_code_units() == 0) 246 return other.length_in_code_units() == 0; 247 if (length_in_code_units() != other.length_in_code_units()) 248 return false; 249 250 for (size_t i = 0; i < length_in_code_units(); ++i) { 251 // FIXME: Handle non-ASCII case insensitive comparisons. 252 if (to_ascii_lowercase(m_code_units[i]) != to_ascii_lowercase(other.m_code_units[i])) 253 return false; 254 } 255 256 return true; 257} 258 259Utf16CodePointIterator& Utf16CodePointIterator::operator++() 260{ 261 size_t code_units = length_in_code_units(); 262 263 if (code_units > m_remaining_code_units) { 264 // If there aren't enough code units remaining, skip to the end. 
265 m_ptr += m_remaining_code_units; 266 m_remaining_code_units = 0; 267 } else { 268 m_ptr += code_units; 269 m_remaining_code_units -= code_units; 270 } 271 272 return *this; 273} 274 275u32 Utf16CodePointIterator::operator*() const 276{ 277 VERIFY(m_remaining_code_units > 0); 278 279 // rfc2781, 2.2 Decoding UTF-16 280 // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value 281 // of W1. Terminate. 282 // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence 283 // is in error and no valid character can be obtained using W1. 284 // Terminate. 285 // 3) If there is no W2 (that is, the sequence ends with W1), or if W2 286 // is not between 0xDC00 and 0xDFFF, the sequence is in error. 287 // Terminate. 288 // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order 289 // bits of W1 as its 10 high-order bits and the 10 low-order bits of 290 // W2 as its 10 low-order bits. 291 // 5) Add 0x10000 to U' to obtain the character value U. Terminate. 292 293 if (Utf16View::is_high_surrogate(*m_ptr)) { 294 if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1))) 295 return Utf16View::decode_surrogate_pair(*m_ptr, *(m_ptr + 1)); 296 return replacement_code_point; 297 } else if (Utf16View::is_low_surrogate(*m_ptr)) { 298 return replacement_code_point; 299 } 300 301 return static_cast<u32>(*m_ptr); 302} 303 304size_t Utf16CodePointIterator::length_in_code_units() const 305{ 306 VERIFY(m_remaining_code_units > 0); 307 308 if (Utf16View::is_high_surrogate(*m_ptr)) { 309 if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1))) 310 return 2; 311 } 312 313 // If this return is reached, either the encoded code point is a valid single code unit, or that 314 // code point is invalid (e.g. began with a low surrogate, or a low surrogate did not follow a 315 // high surrogate). In the latter case, a single replacement code unit will be used. 316 return 1; 317} 318 319}