Serenity Operating System
at master 297 lines 13 kB view raw
1/* 2 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org> 3 * 4 * SPDX-License-Identifier: BSD-2-Clause 5 */ 6 7#include <LibTest/TestCase.h> 8 9#include <AK/ByteBuffer.h> 10#include <AK/Utf8View.h> 11 12TEST_CASE(decode_ascii) 13{ 14 Utf8View utf8 { "Hello World!11"sv }; 15 EXPECT(utf8.validate()); 16 17 u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 }; 18 size_t expected_size = sizeof(expected) / sizeof(expected[0]); 19 20 size_t i = 0; 21 for (u32 code_point : utf8) { 22 VERIFY(i < expected_size); 23 EXPECT_EQ(code_point, expected[i]); 24 i++; 25 } 26 EXPECT_EQ(i, expected_size); 27} 28 29TEST_CASE(decode_utf8) 30{ 31 Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv }; 32 size_t valid_bytes; 33 EXPECT(utf8.validate(valid_bytes)); 34 EXPECT(valid_bytes == (size_t)utf8.byte_length()); 35 36 u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 }; 37 DeprecatedString expected_underlying_bytes[] = { "П", "р", "и", "в", "е", "т", ",", " ", "м", "и", "р", "!", " ", "😀", " ", "γ", "ε", "ι", "ά", " ", "σ", "ο", "υ", " ", "κ", "ό", "σ", "μ", "ο", "ς", " ", "", "", "", "", "", "", "" }; 38 size_t expected_size = sizeof(expected) / sizeof(expected[0]); 39 40 size_t i = 0; 41 for (auto it = utf8.begin(); it != utf8.end(); ++it) { 42 u32 code_point = *it; 43 VERIFY(i < expected_size); 44 EXPECT_EQ(code_point, expected[i]); 45 EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); 46 i++; 47 } 48 EXPECT_EQ(i, expected_size); 49} 50 51TEST_CASE(validate_invalid_ut8) 52{ 53 size_t valid_bytes; 54 char invalid_utf8_1[] = { 42, 35, (char)182, 9 }; 55 Utf8View utf8_1 { StringView { invalid_utf8_1, 4 } }; 56 EXPECT(!utf8_1.validate(valid_bytes)); 57 EXPECT(valid_bytes == 2); 58 59 char invalid_utf8_2[] = { 42, 35, (char)208, (char)208 }; 60 Utf8View utf8_2 { StringView { invalid_utf8_2, 4 } }; 61 EXPECT(!utf8_2.validate(valid_bytes)); 62 EXPECT(valid_bytes == 2); 63 64 char invalid_utf8_3[] = { (char)208 }; 65 Utf8View utf8_3 { StringView { invalid_utf8_3, 1 } }; 66 EXPECT(!utf8_3.validate(valid_bytes)); 67 EXPECT(valid_bytes == 0); 68 69 char invalid_utf8_4[] = { (char)208, 35 }; 70 Utf8View utf8_4 { StringView { invalid_utf8_4, 2 } }; 71 EXPECT(!utf8_4.validate(valid_bytes)); 72 EXPECT(valid_bytes == 0); 73 74 char invalid_utf8_5[] = { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xc0 }; // U+110000 75 Utf8View utf8_5 { StringView { invalid_utf8_5, 4 } }; 76 EXPECT(!utf8_5.validate(valid_bytes)); 77 EXPECT(valid_bytes == 0); 78 79 char invalid_utf8_6[] = { (char)0xf4, (char)0xa1, (char)0xb0, (char)0xbd }; // U+121c3d 80 Utf8View utf8_6 { StringView { invalid_utf8_6, 4 } }; 81 EXPECT(!utf8_6.validate(valid_bytes)); 82 EXPECT(valid_bytes == 0); 83} 84 85TEST_CASE(validate_overlong_utf8) 86{ 87 size_t valid_bytes = 0; 88 89 // Overlong 2-byte encoding of U+002F 90 char invalid_utf8_1[] = { 42, 35, static_cast<char>(0xc0), static_cast<char>(0xaf) }; 91 Utf8View utf8_1 { StringView { invalid_utf8_1, sizeof(invalid_utf8_1) } }; 92 EXPECT(!utf8_1.validate(valid_bytes)); 93 EXPECT(valid_bytes == 2); 94 95 // Overlong 3-byte encoding of U+002F 96 char invalid_utf8_2[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x80), static_cast<char>(0xaf) }; 97 Utf8View utf8_2 { StringView { invalid_utf8_2, sizeof(invalid_utf8_2) } }; 98 EXPECT(!utf8_2.validate(valid_bytes)); 99 EXPECT(valid_bytes == 2); 100 101 // Overlong 4-byte encoding of U+002F 102 char invalid_utf8_3[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x80), static_cast<char>(0xaf) }; 103 Utf8View utf8_3 { StringView { invalid_utf8_3, sizeof(invalid_utf8_3) } }; 104 EXPECT(!utf8_3.validate(valid_bytes)); 105 EXPECT(valid_bytes == 2); 106 107 // Overlong 3-byte encoding of U+00FF 108 char invalid_utf8_4[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x83), static_cast<char>(0xbf) }; 109 Utf8View utf8_4 { StringView { invalid_utf8_4, sizeof(invalid_utf8_4) } }; 110 EXPECT(!utf8_4.validate(valid_bytes)); 111 EXPECT(valid_bytes == 2); 112 113 // Overlong 4-byte encoding of U+00FF 114 char invalid_utf8_5[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x83), static_cast<char>(0xbf) }; 115 Utf8View utf8_5 { StringView { invalid_utf8_5, sizeof(invalid_utf8_5) } }; 116 EXPECT(!utf8_5.validate(valid_bytes)); 117 EXPECT(valid_bytes == 2); 118 119 // Overlong 4-byte encoding of U+0FFF 120 char invalid_utf8_6[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x8f), static_cast<char>(0xbf), static_cast<char>(0xbf) }; 121 Utf8View utf8_6 { StringView { invalid_utf8_6, sizeof(invalid_utf8_6) } }; 122 EXPECT(!utf8_6.validate(valid_bytes)); 123 EXPECT(valid_bytes == 2); 124} 125 126TEST_CASE(iterate_utf8) 127{ 128 Utf8View view("Some weird characters \u00A9\u266A\uA755"sv); 129 Utf8CodePointIterator iterator = view.begin(); 130 131 EXPECT(*iterator == 'S'); 132 EXPECT(iterator.peek().has_value() && iterator.peek().value() == 'S'); 133 EXPECT(iterator.peek(0).has_value() && iterator.peek(0).value() == 'S'); 134 EXPECT(iterator.peek(1).has_value() && iterator.peek(1).value() == 'o'); 135 EXPECT(iterator.peek(22).has_value() && iterator.peek(22).value() == 0x00A9); 136 EXPECT(iterator.peek(24).has_value() && iterator.peek(24).value() == 0xA755); 137 EXPECT(!iterator.peek(25).has_value()); 138 139 ++iterator; 140 141 EXPECT(*iterator == 'o'); 142 EXPECT(iterator.peek(23).has_value() && iterator.peek(23).value() == 0xA755); 143 144 for (size_t i = 0; i < 23; ++i) 145 ++iterator; 146 147 EXPECT(!iterator.done()); 148 EXPECT(*iterator == 0xA755); 149 EXPECT(iterator.peek().has_value() && iterator.peek().value() == 0xA755); 150 EXPECT(!iterator.peek(1).has_value()); 151 152 ++iterator; 153 154 EXPECT(iterator.done()); 155 EXPECT(!iterator.peek(0).has_value()); 156 EXPECT_CRASH("Dereferencing Utf8CodePointIterator which is already done.", [&iterator] { 157 *iterator; 158 return Test::Crash::Failure::DidNotCrash; 159 }); 160} 161 162TEST_CASE(decode_invalid_ut8) 163{ 164 // Test case 1 : Getting an extension byte as first byte of the code point 165 { 166 char raw_data[] = { 'a', 'b', (char)0xA0, 'd' }; 167 Utf8View view { StringView { raw_data, 4 } }; 168 u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' }; 169 DeprecatedString expected_underlying_bytes[] = { "a", "b", "\xA0", "d" }; 170 size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); 171 size_t i = 0; 172 for (auto it = view.begin(); it != view.end(); ++it) { 173 u32 code_point = *it; 174 VERIFY(i < expected_size); 175 EXPECT_EQ(code_point, expected_characters[i]); 176 EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); 177 i++; 178 } 179 VERIFY(i == expected_size); 180 } 181 182 // Test case 2 : Getting a non-extension byte when an extension byte is expected 183 { 184 char raw_data[] = { 'a', 'b', (char)0xC0, 'd', 'e' }; 185 Utf8View view { StringView { raw_data, 5 } }; 186 u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd', 'e' }; 187 DeprecatedString expected_underlying_bytes[] = { "a", "b", "\xC0", "d", "e" }; 188 size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); 189 size_t i = 0; 190 for (auto it = view.begin(); it != view.end(); ++it) { 191 u32 code_point = *it; 192 VERIFY(i < expected_size); 193 EXPECT_EQ(code_point, expected_characters[i]); 194 EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); 195 i++; 196 } 197 VERIFY(i == expected_size); 198 } 199 200 // Test case 3 : Not enough bytes before the end of the string 201 { 202 char raw_data[] = { 'a', 'b', (char)0x90, 'd' }; 203 Utf8View view { StringView { raw_data, 4 } }; 204 u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' }; 205 DeprecatedString expected_underlying_bytes[] = { "a", "b", "\x90", "d" }; 206 size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); 207 size_t i = 0; 208 for (auto it = view.begin(); it != view.end(); ++it) { 209 u32 code_point = *it; 210 VERIFY(i < expected_size); 211 EXPECT_EQ(code_point, expected_characters[i]); 212 EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); 213 i++; 214 } 215 VERIFY(i == expected_size); 216 } 217 218 // Test case 4 : Not enough bytes at the end of the string 219 { 220 char raw_data[] = { 'a', 'b', 'c', (char)0x90 }; 221 Utf8View view { StringView { raw_data, 4 } }; 222 u32 expected_characters[] = { 'a', 'b', 'c', 0xFFFD }; 223 DeprecatedString expected_underlying_bytes[] = { "a", "b", "c", "\x90" }; 224 size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); 225 size_t i = 0; 226 for (auto it = view.begin(); it != view.end(); ++it) { 227 u32 code_point = *it; 228 VERIFY(i < expected_size); 229 EXPECT_EQ(code_point, expected_characters[i]); 230 EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); 231 i++; 232 } 233 VERIFY(i == expected_size); 234 } 235 236 // Test case 5 : Oversized four-byte sequence (e.g. U+123456) 237 { 238 // Want to encode: (000)1 0010 0011 0100 0101 0110 239 // Into mask: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 240 // Shifted: 100 100011 010001 010110 241 // Result: 11110100 10100011 10010001 10010110 242 char raw_data[] = { 'a', (char)0xF4, (char)0xA3, (char)0x91, (char)0x96, 'b' }; 243 Utf8View view { StringView { raw_data, 6 } }; 244 // This definition seems to suggest that we should instead output multiple replacement characters: 245 // https://encoding.spec.whatwg.org/#ref-for-concept-stream-prepend② 246 // This is supported by the plaintext description and example collection, which annoyingly does not give an example of how to deal with this: 247 // https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf , section "U+FFFD Substitution of Maximal Subparts" 248 // However, that would go against how we deal with several other kinds of errors, so we stick to emitting only one U+FFFD. 249 u32 expected_characters[] = { 'a', 0xFFFD, 'b' }; 250 DeprecatedString expected_underlying_bytes[] = { "a", "\xF4\xA3\x91\x96", "b" }; 251 size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); 252 size_t i = 0; 253 for (auto it = view.begin(); it != view.end(); ++it) { 254 u32 code_point = *it; 255 VERIFY(i < expected_size); 256 EXPECT_EQ(code_point, expected_characters[i]); 257 EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); 258 i++; 259 } 260 VERIFY(i == expected_size); 261 } 262} 263 264TEST_CASE(trim) 265{ 266 Utf8View whitespace { " "sv }; 267 { 268 Utf8View view { "word"sv }; 269 EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word"); 270 EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word"); 271 EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word"); 272 } 273 { 274 Utf8View view { " word"sv }; 275 EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word"); 276 EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word"); 277 EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), " word"); 278 } 279 { 280 Utf8View view { "word "sv }; 281 EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word"); 282 EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word "); 283 EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word"); 284 } 285 { 286 Utf8View view { " word "sv }; 287 EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word"); 288 EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word "); 289 EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), " word"); 290 } 291 { 292 Utf8View view { "\u180E"sv }; 293 EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "\u180E"); 294 EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "\u180E"); 295 EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "\u180E"); 296 } 297}