Serenity Operating System
at hosted 205 lines 5.9 kB view raw
1/* 2 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include <AK/Assertions.h> 28#include <AK/LogStream.h> 29#include <AK/Utf8View.h> 30 31namespace AK { 32 33Utf8View::Utf8View(const String& string) 34 : m_string(string) 35{ 36} 37 38Utf8View::Utf8View(const StringView& string) 39 : m_string(string) 40{ 41} 42 43Utf8View::Utf8View(const char* string) 44 : m_string(string) 45{ 46} 47 48const unsigned char* Utf8View::begin_ptr() const 49{ 50 return (const unsigned char*)m_string.characters_without_null_termination(); 51} 52 53const unsigned char* Utf8View::end_ptr() const 54{ 55 return begin_ptr() + m_string.length(); 56} 57 58Utf8CodepointIterator Utf8View::begin() const 59{ 60 return { begin_ptr(), (int)m_string.length() }; 61} 62 63Utf8CodepointIterator Utf8View::end() const 64{ 65 return { end_ptr(), 0 }; 66} 67 68int Utf8View::byte_offset_of(const Utf8CodepointIterator& it) const 69{ 70 ASSERT(it.m_ptr >= begin_ptr()); 71 ASSERT(it.m_ptr <= end_ptr()); 72 73 return it.m_ptr - begin_ptr(); 74} 75 76Utf8View Utf8View::substring_view(int byte_offset, int byte_length) const 77{ 78 StringView string = m_string.substring_view(byte_offset, byte_length); 79 return Utf8View { string }; 80} 81 82static inline bool decode_first_byte( 83 unsigned char byte, 84 int& out_codepoint_length_in_bytes, 85 u32& out_value) 86{ 87 if ((byte & 128) == 0) { 88 out_value = byte; 89 out_codepoint_length_in_bytes = 1; 90 return true; 91 } 92 if ((byte & 64) == 0) { 93 return false; 94 } 95 if ((byte & 32) == 0) { 96 out_value = byte & 31; 97 out_codepoint_length_in_bytes = 2; 98 return true; 99 } 100 if ((byte & 16) == 0) { 101 out_value = byte & 15; 102 out_codepoint_length_in_bytes = 3; 103 return true; 104 } 105 if ((byte & 8) == 0) { 106 out_value = byte & 7; 107 out_codepoint_length_in_bytes = 4; 108 return true; 109 } 110 111 return false; 112} 113 114bool Utf8View::validate() const 115{ 116 for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) { 117 int codepoint_length_in_bytes; 118 u32 value; 119 bool first_byte_makes_sense = decode_first_byte(*ptr, codepoint_length_in_bytes, value); 120 if (!first_byte_makes_sense) 121 return false; 122 123 for (int i = 1; i < codepoint_length_in_bytes; i++) { 124 ptr++; 125 if (ptr >= end_ptr()) 126 return false; 127 if (*ptr >> 6 != 2) 128 return false; 129 } 130 } 131 132 return true; 133} 134 135Utf8CodepointIterator::Utf8CodepointIterator(const unsigned char* ptr, int length) 136 : m_ptr(ptr) 137 , m_length(length) 138{ 139} 140 141bool Utf8CodepointIterator::operator==(const Utf8CodepointIterator& other) const 142{ 143 return m_ptr == other.m_ptr && m_length == other.m_length; 144} 145 146bool Utf8CodepointIterator::operator!=(const Utf8CodepointIterator& other) const 147{ 148 return !(*this == other); 149} 150 151Utf8CodepointIterator& Utf8CodepointIterator::operator++() 152{ 153 ASSERT(m_length > 0); 154 155 int codepoint_length_in_bytes = 0; 156 u32 value; 157 bool first_byte_makes_sense = decode_first_byte(*m_ptr, codepoint_length_in_bytes, value); 158 159 ASSERT(first_byte_makes_sense); 160 (void)value; 161 162 ASSERT(codepoint_length_in_bytes <= m_length); 163 m_ptr += codepoint_length_in_bytes; 164 m_length -= codepoint_length_in_bytes; 165 166 return *this; 167} 168 169int Utf8CodepointIterator::codepoint_length_in_bytes() const 170{ 171 ASSERT(m_length > 0); 172 int codepoint_length_in_bytes = 0; 173 u32 value; 174 bool first_byte_makes_sense = decode_first_byte(*m_ptr, codepoint_length_in_bytes, value); 175 ASSERT(first_byte_makes_sense); 176 return codepoint_length_in_bytes; 177} 178 179u32 Utf8CodepointIterator::operator*() const 180{ 181 ASSERT(m_length > 0); 182 183 u32 codepoint_value_so_far = 0; 184 int codepoint_length_in_bytes = 0; 185 186 bool first_byte_makes_sense = decode_first_byte(m_ptr[0], codepoint_length_in_bytes, codepoint_value_so_far); 187 if (!first_byte_makes_sense) { 188 dbg() << "First byte doesn't make sense, bytes: " << StringView((const char*)m_ptr, m_length); 189 } 190 ASSERT(first_byte_makes_sense); 191 if (codepoint_length_in_bytes > m_length) { 192 dbg() << "Not enough bytes (need " << codepoint_length_in_bytes << ", have " << m_length << "), first byte is: " << m_ptr[0] << " " << (const char*)m_ptr; 193 } 194 ASSERT(codepoint_length_in_bytes <= m_length); 195 196 for (int offset = 1; offset < codepoint_length_in_bytes; offset++) { 197 ASSERT(m_ptr[offset] >> 6 == 2); 198 codepoint_value_so_far <<= 6; 199 codepoint_value_so_far |= m_ptr[offset] & 63; 200 } 201 202 return codepoint_value_so_far; 203} 204 205}