AK/Utf8View.cpp at hosted · jcs.org/serenity

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / AK / Utf8View.cpp
at hosted 205 lines 5.9 kB view raw
wrap content
Emanuel Sprung AK, LibGfx, LibGUI: Initialize various variables to zero. 6y ago
074d935c
  1/*
  2 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
  3 * All rights reserved.
  4 *
  5 * Redistribution and use in source and binary forms, with or without
  6 * modification, are permitted provided that the following conditions are met:
  7 *
  8 * 1. Redistributions of source code must retain the above copyright notice, this
  9 *    list of conditions and the following disclaimer.
 10 *
 11 * 2. Redistributions in binary form must reproduce the above copyright notice,
 12 *    this list of conditions and the following disclaimer in the documentation
 13 *    and/or other materials provided with the distribution.
 14 *
 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 */
 26
 27#include <AK/Assertions.h>
 28#include <AK/LogStream.h>
 29#include <AK/Utf8View.h>
 30
 31namespace AK {
 32
 33Utf8View::Utf8View(const String& string)
 34    : m_string(string)
 35{
 36}
 37
 38Utf8View::Utf8View(const StringView& string)
 39    : m_string(string)
 40{
 41}
 42
 43Utf8View::Utf8View(const char* string)
 44    : m_string(string)
 45{
 46}
 47
 48const unsigned char* Utf8View::begin_ptr() const
 49{
 50    return (const unsigned char*)m_string.characters_without_null_termination();
 51}
 52
 53const unsigned char* Utf8View::end_ptr() const
 54{
 55    return begin_ptr() + m_string.length();
 56}
 57
 58Utf8CodepointIterator Utf8View::begin() const
 59{
 60    return { begin_ptr(), (int)m_string.length() };
 61}
 62
 63Utf8CodepointIterator Utf8View::end() const
 64{
 65    return { end_ptr(), 0 };
 66}
 67
 68int Utf8View::byte_offset_of(const Utf8CodepointIterator& it) const
 69{
 70    ASSERT(it.m_ptr >= begin_ptr());
 71    ASSERT(it.m_ptr <= end_ptr());
 72
 73    return it.m_ptr - begin_ptr();
 74}
 75
 76Utf8View Utf8View::substring_view(int byte_offset, int byte_length) const
 77{
 78    StringView string = m_string.substring_view(byte_offset, byte_length);
 79    return Utf8View { string };
 80}
 81
 82static inline bool decode_first_byte(
 83    unsigned char byte,
 84    int& out_codepoint_length_in_bytes,
 85    u32& out_value)
 86{
 87    if ((byte & 128) == 0) {
 88        out_value = byte;
 89        out_codepoint_length_in_bytes = 1;
 90        return true;
 91    }
 92    if ((byte & 64) == 0) {
 93        return false;
 94    }
 95    if ((byte & 32) == 0) {
 96        out_value = byte & 31;
 97        out_codepoint_length_in_bytes = 2;
 98        return true;
 99    }
100    if ((byte & 16) == 0) {
101        out_value = byte & 15;
102        out_codepoint_length_in_bytes = 3;
103        return true;
104    }
105    if ((byte & 8) == 0) {
106        out_value = byte & 7;
107        out_codepoint_length_in_bytes = 4;
108        return true;
109    }
110
111    return false;
112}
113
114bool Utf8View::validate() const
115{
116    for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
117        int codepoint_length_in_bytes;
118        u32 value;
119        bool first_byte_makes_sense = decode_first_byte(*ptr, codepoint_length_in_bytes, value);
120        if (!first_byte_makes_sense)
121            return false;
122
123        for (int i = 1; i < codepoint_length_in_bytes; i++) {
124            ptr++;
125            if (ptr >= end_ptr())
126                return false;
127            if (*ptr >> 6 != 2)
128                return false;
129        }
130    }
131
132    return true;
133}
134
135Utf8CodepointIterator::Utf8CodepointIterator(const unsigned char* ptr, int length)
136    : m_ptr(ptr)
137    , m_length(length)
138{
139}
140
141bool Utf8CodepointIterator::operator==(const Utf8CodepointIterator& other) const
142{
143    return m_ptr == other.m_ptr && m_length == other.m_length;
144}
145
146bool Utf8CodepointIterator::operator!=(const Utf8CodepointIterator& other) const
147{
148    return !(*this == other);
149}
150
151Utf8CodepointIterator& Utf8CodepointIterator::operator++()
152{
153    ASSERT(m_length > 0);
154
155    int codepoint_length_in_bytes = 0;
156    u32 value;
157    bool first_byte_makes_sense = decode_first_byte(*m_ptr, codepoint_length_in_bytes, value);
158
159    ASSERT(first_byte_makes_sense);
160    (void)value;
161
162    ASSERT(codepoint_length_in_bytes <= m_length);
163    m_ptr += codepoint_length_in_bytes;
164    m_length -= codepoint_length_in_bytes;
165
166    return *this;
167}
168
169int Utf8CodepointIterator::codepoint_length_in_bytes() const
170{
171    ASSERT(m_length > 0);
172    int codepoint_length_in_bytes = 0;
173    u32 value;
174    bool first_byte_makes_sense = decode_first_byte(*m_ptr, codepoint_length_in_bytes, value);
175    ASSERT(first_byte_makes_sense);
176    return codepoint_length_in_bytes;
177}
178
179u32 Utf8CodepointIterator::operator*() const
180{
181    ASSERT(m_length > 0);
182
183    u32 codepoint_value_so_far = 0;
184    int codepoint_length_in_bytes = 0;
185
186    bool first_byte_makes_sense = decode_first_byte(m_ptr[0], codepoint_length_in_bytes, codepoint_value_so_far);
187    if (!first_byte_makes_sense) {
188        dbg() << "First byte doesn't make sense, bytes: " << StringView((const char*)m_ptr, m_length);
189    }
190    ASSERT(first_byte_makes_sense);
191    if (codepoint_length_in_bytes > m_length) {
192        dbg() << "Not enough bytes (need " << codepoint_length_in_bytes << ", have " << m_length << "), first byte is: " << m_ptr[0] << " " << (const char*)m_ptr;
193    }
194    ASSERT(codepoint_length_in_bytes <= m_length);
195
196    for (int offset = 1; offset < codepoint_length_in_bytes; offset++) {
197        ASSERT(m_ptr[offset] >> 6 == 2);
198        codepoint_value_so_far <<= 6;
199        codepoint_value_so_far |= m_ptr[offset] & 63;
200    }
201
202    return codepoint_value_so_far;
203}
204
205}