Serenity Operating System
1/*
2 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this
9 * list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <AK/Assertions.h>
28#include <AK/LogStream.h>
29#include <AK/Utf8View.h>
30
31namespace AK {
32
33Utf8View::Utf8View(const String& string)
34 : m_string(string)
35{
36}
37
38Utf8View::Utf8View(const StringView& string)
39 : m_string(string)
40{
41}
42
43Utf8View::Utf8View(const char* string)
44 : m_string(string)
45{
46}
47
48const unsigned char* Utf8View::begin_ptr() const
49{
50 return (const unsigned char*)m_string.characters_without_null_termination();
51}
52
53const unsigned char* Utf8View::end_ptr() const
54{
55 return begin_ptr() + m_string.length();
56}
57
58Utf8CodepointIterator Utf8View::begin() const
59{
60 return { begin_ptr(), (int)m_string.length() };
61}
62
63Utf8CodepointIterator Utf8View::end() const
64{
65 return { end_ptr(), 0 };
66}
67
68int Utf8View::byte_offset_of(const Utf8CodepointIterator& it) const
69{
70 ASSERT(it.m_ptr >= begin_ptr());
71 ASSERT(it.m_ptr <= end_ptr());
72
73 return it.m_ptr - begin_ptr();
74}
75
76Utf8View Utf8View::substring_view(int byte_offset, int byte_length) const
77{
78 StringView string = m_string.substring_view(byte_offset, byte_length);
79 return Utf8View { string };
80}
81
// Classifies the leading byte of a UTF-8 sequence.
//
// On success, stores the total sequence length (1 to 4 bytes) in
// out_codepoint_length_in_bytes and the payload bits carried by the leading
// byte in out_value, then returns true.
//
// Returns false, leaving both outputs untouched, when `byte` cannot start a
// sequence: a continuation byte (10xxxxxx) or a byte of the form 11111xxx.
static inline bool decode_first_byte(
    unsigned char byte,
    int& out_codepoint_length_in_bytes,
    u32& out_value)
{
    // 0xxxxxxx: a single-byte sequence; all seven low bits are payload.
    if (byte < 0x80) {
        out_codepoint_length_in_bytes = 1;
        out_value = byte;
        return true;
    }

    // 110xxxxx: leads a two-byte sequence, five payload bits.
    if ((byte & 0xe0) == 0xc0) {
        out_codepoint_length_in_bytes = 2;
        out_value = byte & 0x1f;
        return true;
    }

    // 1110xxxx: leads a three-byte sequence, four payload bits.
    if ((byte & 0xf0) == 0xe0) {
        out_codepoint_length_in_bytes = 3;
        out_value = byte & 0x0f;
        return true;
    }

    // 11110xxx: leads a four-byte sequence, three payload bits.
    if ((byte & 0xf8) == 0xf0) {
        out_codepoint_length_in_bytes = 4;
        out_value = byte & 0x07;
        return true;
    }

    // 10xxxxxx (continuation byte) or 11111xxx: not a valid leading byte.
    return false;
}
113
114bool Utf8View::validate() const
115{
116 for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
117 int codepoint_length_in_bytes;
118 u32 value;
119 bool first_byte_makes_sense = decode_first_byte(*ptr, codepoint_length_in_bytes, value);
120 if (!first_byte_makes_sense)
121 return false;
122
123 for (int i = 1; i < codepoint_length_in_bytes; i++) {
124 ptr++;
125 if (ptr >= end_ptr())
126 return false;
127 if (*ptr >> 6 != 2)
128 return false;
129 }
130 }
131
132 return true;
133}
134
135Utf8CodepointIterator::Utf8CodepointIterator(const unsigned char* ptr, int length)
136 : m_ptr(ptr)
137 , m_length(length)
138{
139}
140
141bool Utf8CodepointIterator::operator==(const Utf8CodepointIterator& other) const
142{
143 return m_ptr == other.m_ptr && m_length == other.m_length;
144}
145
146bool Utf8CodepointIterator::operator!=(const Utf8CodepointIterator& other) const
147{
148 return !(*this == other);
149}
150
151Utf8CodepointIterator& Utf8CodepointIterator::operator++()
152{
153 ASSERT(m_length > 0);
154
155 int codepoint_length_in_bytes = 0;
156 u32 value;
157 bool first_byte_makes_sense = decode_first_byte(*m_ptr, codepoint_length_in_bytes, value);
158
159 ASSERT(first_byte_makes_sense);
160 (void)value;
161
162 ASSERT(codepoint_length_in_bytes <= m_length);
163 m_ptr += codepoint_length_in_bytes;
164 m_length -= codepoint_length_in_bytes;
165
166 return *this;
167}
168
169int Utf8CodepointIterator::codepoint_length_in_bytes() const
170{
171 ASSERT(m_length > 0);
172 int codepoint_length_in_bytes = 0;
173 u32 value;
174 bool first_byte_makes_sense = decode_first_byte(*m_ptr, codepoint_length_in_bytes, value);
175 ASSERT(first_byte_makes_sense);
176 return codepoint_length_in_bytes;
177}
178
179u32 Utf8CodepointIterator::operator*() const
180{
181 ASSERT(m_length > 0);
182
183 u32 codepoint_value_so_far = 0;
184 int codepoint_length_in_bytes = 0;
185
186 bool first_byte_makes_sense = decode_first_byte(m_ptr[0], codepoint_length_in_bytes, codepoint_value_so_far);
187 if (!first_byte_makes_sense) {
188 dbg() << "First byte doesn't make sense, bytes: " << StringView((const char*)m_ptr, m_length);
189 }
190 ASSERT(first_byte_makes_sense);
191 if (codepoint_length_in_bytes > m_length) {
192 dbg() << "Not enough bytes (need " << codepoint_length_in_bytes << ", have " << m_length << "), first byte is: " << m_ptr[0] << " " << (const char*)m_ptr;
193 }
194 ASSERT(codepoint_length_in_bytes <= m_length);
195
196 for (int offset = 1; offset < codepoint_length_in_bytes; offset++) {
197 ASSERT(m_ptr[offset] >> 6 == 2);
198 codepoint_value_so_far <<= 6;
199 codepoint_value_so_far |= m_ptr[offset] & 63;
200 }
201
202 return codepoint_value_so_far;
203}
204
205}