Serenity Operating System
at master 247 lines 8.0 kB view raw
1/* 2 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org> 3 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> 4 * 5 * SPDX-License-Identifier: BSD-2-Clause 6 */ 7 8#include <AK/Assertions.h> 9#include <AK/Debug.h> 10#include <AK/Format.h> 11#include <AK/Utf8View.h> 12 13namespace AK { 14 15Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const 16{ 17 size_t current_offset = 0; 18 for (auto iterator = begin(); !iterator.done(); ++iterator) { 19 if (current_offset >= byte_offset) 20 return iterator; 21 current_offset += iterator.underlying_code_point_length_in_bytes(); 22 } 23 return end(); 24} 25 26Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_t byte_offset) const 27{ 28 return Utf8CodePointIterator { reinterpret_cast<u8 const*>(m_string.characters_without_null_termination()) + byte_offset, m_string.length() - byte_offset }; 29} 30 31size_t Utf8View::byte_offset_of(Utf8CodePointIterator const& it) const 32{ 33 VERIFY(it.m_ptr >= begin_ptr()); 34 VERIFY(it.m_ptr <= end_ptr()); 35 36 return it.m_ptr - begin_ptr(); 37} 38 39size_t Utf8View::byte_offset_of(size_t code_point_offset) const 40{ 41 size_t byte_offset = 0; 42 43 for (auto it = begin(); !it.done(); ++it) { 44 if (code_point_offset == 0) 45 return byte_offset; 46 47 byte_offset += it.underlying_code_point_length_in_bytes(); 48 --code_point_offset; 49 } 50 51 return byte_offset; 52} 53 54Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const 55{ 56 if (code_point_length == 0) 57 return {}; 58 59 size_t code_point_index = 0, offset_in_bytes = 0; 60 for (auto iterator = begin(); !iterator.done(); ++iterator) { 61 if (code_point_index == code_point_offset) 62 offset_in_bytes = byte_offset_of(iterator); 63 if (code_point_index == code_point_offset + code_point_length - 1) { 64 size_t length_in_bytes = byte_offset_of(++iterator) - offset_in_bytes; 65 return substring_view(offset_in_bytes, length_in_bytes); 66 } 67 ++code_point_index; 68 } 69 70 VERIFY_NOT_REACHED(); 71} 72 73size_t Utf8View::calculate_length() const 74{ 75 size_t length = 0; 76 77 for (size_t i = 0; i < m_string.length(); ++length) { 78 auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(m_string[i])); 79 80 // Similar to Utf8CodePointIterator::operator++, if the byte is invalid, try the next byte. 81 i += is_valid ? byte_length : 1; 82 } 83 84 return length; 85} 86 87bool Utf8View::starts_with(Utf8View const& start) const 88{ 89 if (start.is_empty()) 90 return true; 91 if (is_empty()) 92 return false; 93 if (start.length() > length()) 94 return false; 95 if (begin_ptr() == start.begin_ptr()) 96 return true; 97 98 for (auto k = begin(), l = start.begin(); l != start.end(); ++k, ++l) { 99 if (*k != *l) 100 return false; 101 } 102 return true; 103} 104 105bool Utf8View::contains(u32 needle) const 106{ 107 for (u32 code_point : *this) { 108 if (code_point == needle) 109 return true; 110 } 111 return false; 112} 113 114Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const 115{ 116 size_t substring_start = 0; 117 size_t substring_length = byte_length(); 118 119 if (mode == TrimMode::Left || mode == TrimMode::Both) { 120 for (auto code_point = begin(); code_point != end(); ++code_point) { 121 if (substring_length == 0) 122 return {}; 123 if (!characters.contains(*code_point)) 124 break; 125 substring_start += code_point.underlying_code_point_length_in_bytes(); 126 substring_length -= code_point.underlying_code_point_length_in_bytes(); 127 } 128 } 129 130 if (mode == TrimMode::Right || mode == TrimMode::Both) { 131 size_t seen_whitespace_length = 0; 132 for (auto code_point = begin(); code_point != end(); ++code_point) { 133 if (characters.contains(*code_point)) 134 seen_whitespace_length += code_point.underlying_code_point_length_in_bytes(); 135 else 136 seen_whitespace_length = 0; 137 } 138 if (seen_whitespace_length >= substring_length) 139 return {}; 140 substring_length -= seen_whitespace_length; 141 } 142 143 return substring_view(substring_start, substring_length); 144} 145 146Utf8CodePointIterator& Utf8CodePointIterator::operator++() 147{ 148 VERIFY(m_length > 0); 149 150 size_t code_point_length_in_bytes = underlying_code_point_length_in_bytes(); 151 if (code_point_length_in_bytes > m_length) { 152 // We don't have enough data for the next code point. Skip one character and try again. 153 // The rest of the code will output replacement characters as needed for any eventual extension bytes we might encounter afterwards. 154 dbgln_if(UTF8_DEBUG, "Expected code point size {} is too big for the remaining length {}. Moving forward one byte.", code_point_length_in_bytes, m_length); 155 m_ptr += 1; 156 m_length -= 1; 157 return *this; 158 } 159 160 m_ptr += code_point_length_in_bytes; 161 m_length -= code_point_length_in_bytes; 162 return *this; 163} 164 165size_t Utf8CodePointIterator::underlying_code_point_length_in_bytes() const 166{ 167 VERIFY(m_length > 0); 168 auto [code_point_length_in_bytes, value, first_byte_makes_sense] = Utf8View::decode_leading_byte(*m_ptr); 169 170 // If any of these tests fail, we will output a replacement character for this byte and treat it as a code point of size 1. 171 if (!first_byte_makes_sense) 172 return 1; 173 174 if (code_point_length_in_bytes > m_length) 175 return 1; 176 177 for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) { 178 if (m_ptr[offset] >> 6 != 2) 179 return 1; 180 } 181 182 return code_point_length_in_bytes; 183} 184 185ReadonlyBytes Utf8CodePointIterator::underlying_code_point_bytes() const 186{ 187 return { m_ptr, underlying_code_point_length_in_bytes() }; 188} 189 190u32 Utf8CodePointIterator::operator*() const 191{ 192 VERIFY(m_length > 0); 193 auto [code_point_length_in_bytes, code_point_value_so_far, first_byte_makes_sense] = Utf8View::decode_leading_byte(*m_ptr); 194 195 if (!first_byte_makes_sense) { 196 // The first byte of the code point doesn't make sense: output a replacement character 197 dbgln_if(UTF8_DEBUG, "First byte doesn't make sense: {:#02x}.", m_ptr[0]); 198 return 0xFFFD; 199 } 200 201 if (code_point_length_in_bytes > m_length) { 202 // There is not enough data left for the full code point: output a replacement character 203 dbgln_if(UTF8_DEBUG, "Not enough bytes (need {}, have {}), first byte is: {:#02x}.", code_point_length_in_bytes, m_length, m_ptr[0]); 204 return 0xFFFD; 205 } 206 207 for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) { 208 if (m_ptr[offset] >> 6 != 2) { 209 // One of the extension bytes of the code point doesn't make sense: output a replacement character 210 dbgln_if(UTF8_DEBUG, "Extension byte {:#02x} in {} position after first byte {:#02x} doesn't make sense.", m_ptr[offset], offset, m_ptr[0]); 211 return 0xFFFD; 212 } 213 214 code_point_value_so_far <<= 6; 215 code_point_value_so_far |= m_ptr[offset] & 63; 216 } 217 218 if (code_point_value_so_far > 0x10FFFF) { 219 dbgln_if(UTF8_DEBUG, "Multi-byte sequence is otherwise valid, but code point {:#x} is not permissible.", code_point_value_so_far); 220 return 0xFFFD; 221 } 222 return code_point_value_so_far; 223} 224 225Optional<u32> Utf8CodePointIterator::peek(size_t offset) const 226{ 227 if (offset == 0) { 228 if (this->done()) 229 return {}; 230 return this->operator*(); 231 } 232 233 auto new_iterator = *this; 234 for (size_t index = 0; index < offset; ++index) { 235 ++new_iterator; 236 if (new_iterator.done()) 237 return {}; 238 } 239 return *new_iterator; 240} 241 242ErrorOr<void> Formatter<Utf8View>::format(FormatBuilder& builder, Utf8View const& string) 243{ 244 return Formatter<StringView>::format(builder, string.as_string()); 245} 246 247}