Tests/AK/TestUtf8.cpp at master

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / Tests / AK / TestUtf8.cpp
at master 297 lines 13 kB view raw
wrap content
Timothy Flynn AK: Invalidate overlong UTF-8 code point encodings 3y ago
c4d78c29
  1/*
  2 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
  3 *
  4 * SPDX-License-Identifier: BSD-2-Clause
  5 */
  6
  7#include <LibTest/TestCase.h>
  8
  9#include <AK/ByteBuffer.h>
 10#include <AK/Utf8View.h>
 11
 12TEST_CASE(decode_ascii)
 13{
 14    Utf8View utf8 { "Hello World!11"sv };
 15    EXPECT(utf8.validate());
 16
 17    u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 };
 18    size_t expected_size = sizeof(expected) / sizeof(expected[0]);
 19
 20    size_t i = 0;
 21    for (u32 code_point : utf8) {
 22        VERIFY(i < expected_size);
 23        EXPECT_EQ(code_point, expected[i]);
 24        i++;
 25    }
 26    EXPECT_EQ(i, expected_size);
 27}
 28
 29TEST_CASE(decode_utf8)
 30{
 31    Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv };
 32    size_t valid_bytes;
 33    EXPECT(utf8.validate(valid_bytes));
 34    EXPECT(valid_bytes == (size_t)utf8.byte_length());
 35
 36    u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
 37    DeprecatedString expected_underlying_bytes[] = { "П", "р", "и", "в", "е", "т", ",", " ", "м", "и", "р", "!", " ", "😀", " ", "γ", "ε", "ι", "ά", " ", "σ", "ο", "υ", " ", "κ", "ό", "σ", "μ", "ο", "ς", " ", "こ", "ん", "に", "ち", "は", "世", "界" };
 38    size_t expected_size = sizeof(expected) / sizeof(expected[0]);
 39
 40    size_t i = 0;
 41    for (auto it = utf8.begin(); it != utf8.end(); ++it) {
 42        u32 code_point = *it;
 43        VERIFY(i < expected_size);
 44        EXPECT_EQ(code_point, expected[i]);
 45        EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
 46        i++;
 47    }
 48    EXPECT_EQ(i, expected_size);
 49}
 50
 51TEST_CASE(validate_invalid_ut8)
 52{
 53    size_t valid_bytes;
 54    char invalid_utf8_1[] = { 42, 35, (char)182, 9 };
 55    Utf8View utf8_1 { StringView { invalid_utf8_1, 4 } };
 56    EXPECT(!utf8_1.validate(valid_bytes));
 57    EXPECT(valid_bytes == 2);
 58
 59    char invalid_utf8_2[] = { 42, 35, (char)208, (char)208 };
 60    Utf8View utf8_2 { StringView { invalid_utf8_2, 4 } };
 61    EXPECT(!utf8_2.validate(valid_bytes));
 62    EXPECT(valid_bytes == 2);
 63
 64    char invalid_utf8_3[] = { (char)208 };
 65    Utf8View utf8_3 { StringView { invalid_utf8_3, 1 } };
 66    EXPECT(!utf8_3.validate(valid_bytes));
 67    EXPECT(valid_bytes == 0);
 68
 69    char invalid_utf8_4[] = { (char)208, 35 };
 70    Utf8View utf8_4 { StringView { invalid_utf8_4, 2 } };
 71    EXPECT(!utf8_4.validate(valid_bytes));
 72    EXPECT(valid_bytes == 0);
 73
 74    char invalid_utf8_5[] = { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xc0 }; // U+110000
 75    Utf8View utf8_5 { StringView { invalid_utf8_5, 4 } };
 76    EXPECT(!utf8_5.validate(valid_bytes));
 77    EXPECT(valid_bytes == 0);
 78
 79    char invalid_utf8_6[] = { (char)0xf4, (char)0xa1, (char)0xb0, (char)0xbd }; // U+121c3d
 80    Utf8View utf8_6 { StringView { invalid_utf8_6, 4 } };
 81    EXPECT(!utf8_6.validate(valid_bytes));
 82    EXPECT(valid_bytes == 0);
 83}
 84
 85TEST_CASE(validate_overlong_utf8)
 86{
 87    size_t valid_bytes = 0;
 88
 89    // Overlong 2-byte encoding of U+002F
 90    char invalid_utf8_1[] = { 42, 35, static_cast<char>(0xc0), static_cast<char>(0xaf) };
 91    Utf8View utf8_1 { StringView { invalid_utf8_1, sizeof(invalid_utf8_1) } };
 92    EXPECT(!utf8_1.validate(valid_bytes));
 93    EXPECT(valid_bytes == 2);
 94
 95    // Overlong 3-byte encoding of U+002F
 96    char invalid_utf8_2[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x80), static_cast<char>(0xaf) };
 97    Utf8View utf8_2 { StringView { invalid_utf8_2, sizeof(invalid_utf8_2) } };
 98    EXPECT(!utf8_2.validate(valid_bytes));
 99    EXPECT(valid_bytes == 2);
100
101    // Overlong 4-byte encoding of U+002F
102    char invalid_utf8_3[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x80), static_cast<char>(0xaf) };
103    Utf8View utf8_3 { StringView { invalid_utf8_3, sizeof(invalid_utf8_3) } };
104    EXPECT(!utf8_3.validate(valid_bytes));
105    EXPECT(valid_bytes == 2);
106
107    // Overlong 3-byte encoding of U+00FF
108    char invalid_utf8_4[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x83), static_cast<char>(0xbf) };
109    Utf8View utf8_4 { StringView { invalid_utf8_4, sizeof(invalid_utf8_4) } };
110    EXPECT(!utf8_4.validate(valid_bytes));
111    EXPECT(valid_bytes == 2);
112
113    // Overlong 4-byte encoding of U+00FF
114    char invalid_utf8_5[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x83), static_cast<char>(0xbf) };
115    Utf8View utf8_5 { StringView { invalid_utf8_5, sizeof(invalid_utf8_5) } };
116    EXPECT(!utf8_5.validate(valid_bytes));
117    EXPECT(valid_bytes == 2);
118
119    // Overlong 4-byte encoding of U+0FFF
120    char invalid_utf8_6[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x8f), static_cast<char>(0xbf), static_cast<char>(0xbf) };
121    Utf8View utf8_6 { StringView { invalid_utf8_6, sizeof(invalid_utf8_6) } };
122    EXPECT(!utf8_6.validate(valid_bytes));
123    EXPECT(valid_bytes == 2);
124}
125
126TEST_CASE(iterate_utf8)
127{
128    Utf8View view("Some weird characters \u00A9\u266A\uA755"sv);
129    Utf8CodePointIterator iterator = view.begin();
130
131    EXPECT(*iterator == 'S');
132    EXPECT(iterator.peek().has_value() && iterator.peek().value() == 'S');
133    EXPECT(iterator.peek(0).has_value() && iterator.peek(0).value() == 'S');
134    EXPECT(iterator.peek(1).has_value() && iterator.peek(1).value() == 'o');
135    EXPECT(iterator.peek(22).has_value() && iterator.peek(22).value() == 0x00A9);
136    EXPECT(iterator.peek(24).has_value() && iterator.peek(24).value() == 0xA755);
137    EXPECT(!iterator.peek(25).has_value());
138
139    ++iterator;
140
141    EXPECT(*iterator == 'o');
142    EXPECT(iterator.peek(23).has_value() && iterator.peek(23).value() == 0xA755);
143
144    for (size_t i = 0; i < 23; ++i)
145        ++iterator;
146
147    EXPECT(!iterator.done());
148    EXPECT(*iterator == 0xA755);
149    EXPECT(iterator.peek().has_value() && iterator.peek().value() == 0xA755);
150    EXPECT(!iterator.peek(1).has_value());
151
152    ++iterator;
153
154    EXPECT(iterator.done());
155    EXPECT(!iterator.peek(0).has_value());
156    EXPECT_CRASH("Dereferencing Utf8CodePointIterator which is already done.", [&iterator] {
157        *iterator;
158        return Test::Crash::Failure::DidNotCrash;
159    });
160}
161
162TEST_CASE(decode_invalid_ut8)
163{
164    // Test case 1 : Getting an extension byte as first byte of the code point
165    {
166        char raw_data[] = { 'a', 'b', (char)0xA0, 'd' };
167        Utf8View view { StringView { raw_data, 4 } };
168        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' };
169        DeprecatedString expected_underlying_bytes[] = { "a", "b", "\xA0", "d" };
170        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
171        size_t i = 0;
172        for (auto it = view.begin(); it != view.end(); ++it) {
173            u32 code_point = *it;
174            VERIFY(i < expected_size);
175            EXPECT_EQ(code_point, expected_characters[i]);
176            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
177            i++;
178        }
179        VERIFY(i == expected_size);
180    }
181
182    // Test case 2 : Getting a non-extension byte when an extension byte is expected
183    {
184        char raw_data[] = { 'a', 'b', (char)0xC0, 'd', 'e' };
185        Utf8View view { StringView { raw_data, 5 } };
186        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd', 'e' };
187        DeprecatedString expected_underlying_bytes[] = { "a", "b", "\xC0", "d", "e" };
188        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
189        size_t i = 0;
190        for (auto it = view.begin(); it != view.end(); ++it) {
191            u32 code_point = *it;
192            VERIFY(i < expected_size);
193            EXPECT_EQ(code_point, expected_characters[i]);
194            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
195            i++;
196        }
197        VERIFY(i == expected_size);
198    }
199
200    // Test case 3 : Not enough bytes before the end of the string
201    {
202        char raw_data[] = { 'a', 'b', (char)0x90, 'd' };
203        Utf8View view { StringView { raw_data, 4 } };
204        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' };
205        DeprecatedString expected_underlying_bytes[] = { "a", "b", "\x90", "d" };
206        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
207        size_t i = 0;
208        for (auto it = view.begin(); it != view.end(); ++it) {
209            u32 code_point = *it;
210            VERIFY(i < expected_size);
211            EXPECT_EQ(code_point, expected_characters[i]);
212            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
213            i++;
214        }
215        VERIFY(i == expected_size);
216    }
217
218    // Test case 4 : Not enough bytes at the end of the string
219    {
220        char raw_data[] = { 'a', 'b', 'c', (char)0x90 };
221        Utf8View view { StringView { raw_data, 4 } };
222        u32 expected_characters[] = { 'a', 'b', 'c', 0xFFFD };
223        DeprecatedString expected_underlying_bytes[] = { "a", "b", "c", "\x90" };
224        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
225        size_t i = 0;
226        for (auto it = view.begin(); it != view.end(); ++it) {
227            u32 code_point = *it;
228            VERIFY(i < expected_size);
229            EXPECT_EQ(code_point, expected_characters[i]);
230            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
231            i++;
232        }
233        VERIFY(i == expected_size);
234    }
235
236    // Test case 5 : Oversized four-byte sequence (e.g. U+123456)
237    {
238        // Want to encode: (000)1 0010 0011 0100 0101 0110
239        // Into mask: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
240        // Shifted:        100   100011   010001   010110
241        // Result:    11110100 10100011 10010001 10010110
242        char raw_data[] = { 'a', (char)0xF4, (char)0xA3, (char)0x91, (char)0x96, 'b' };
243        Utf8View view { StringView { raw_data, 6 } };
244        // This definition seems to suggest that we should instead output multiple replacement characters:
245        // https://encoding.spec.whatwg.org/#ref-for-concept-stream-prepend②
246        // This is supported by the plaintext description and example collection, which annoyingly does not give an example of how to deal with this:
247        // https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf , section "U+FFFD Substitution of Maximal Subparts"
248        // However, that would go against how we deal with several other kinds of errors, so we stick to emitting only one U+FFFD.
249        u32 expected_characters[] = { 'a', 0xFFFD, 'b' };
250        DeprecatedString expected_underlying_bytes[] = { "a", "\xF4\xA3\x91\x96", "b" };
251        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
252        size_t i = 0;
253        for (auto it = view.begin(); it != view.end(); ++it) {
254            u32 code_point = *it;
255            VERIFY(i < expected_size);
256            EXPECT_EQ(code_point, expected_characters[i]);
257            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
258            i++;
259        }
260        VERIFY(i == expected_size);
261    }
262}
263
264TEST_CASE(trim)
265{
266    Utf8View whitespace { " "sv };
267    {
268        Utf8View view { "word"sv };
269        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
270        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
271        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
272    }
273    {
274        Utf8View view { "   word"sv };
275        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
276        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
277        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "   word");
278    }
279    {
280        Utf8View view { "word   "sv };
281        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
282        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word   ");
283        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
284    }
285    {
286        Utf8View view { "   word   "sv };
287        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
288        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word   ");
289        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "   word");
290    }
291    {
292        Utf8View view { "\u180E"sv };
293        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "\u180E");
294        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "\u180E");
295        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "\u180E");
296    }
297}