Serenity Operating System
at master 637 lines 27 kB view raw
1/* 2 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org> 3 * Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl> 4 * Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org> 5 * 6 * SPDX-License-Identifier: BSD-2-Clause 7 */ 8 9#include <AK/StringBuilder.h> 10#include <AK/Utf16View.h> 11#include <AK/Utf8View.h> 12#include <LibTextCodec/Decoder.h> 13 14namespace TextCodec { 15 16static constexpr u32 replacement_code_point = 0xfffd; 17 18namespace { 19Latin1Decoder s_latin1_decoder; 20UTF8Decoder s_utf8_decoder; 21UTF16BEDecoder s_utf16be_decoder; 22UTF16LEDecoder s_utf16le_decoder; 23Latin2Decoder s_latin2_decoder; 24HebrewDecoder s_hebrew_decoder; 25CyrillicDecoder s_cyrillic_decoder; 26Koi8RDecoder s_koi8r_decoder; 27Latin9Decoder s_latin9_decoder; 28MacRomanDecoder s_mac_roman_decoder; 29TurkishDecoder s_turkish_decoder; 30XUserDefinedDecoder s_x_user_defined_decoder; 31} 32 33Optional<Decoder&> decoder_for(StringView a_encoding) 34{ 35 auto encoding = get_standardized_encoding(a_encoding); 36 if (encoding.has_value()) { 37 if (encoding.value().equals_ignoring_ascii_case("windows-1252"sv)) 38 return s_latin1_decoder; 39 if (encoding.value().equals_ignoring_ascii_case("utf-8"sv)) 40 return s_utf8_decoder; 41 if (encoding.value().equals_ignoring_ascii_case("utf-16be"sv)) 42 return s_utf16be_decoder; 43 if (encoding.value().equals_ignoring_ascii_case("utf-16le"sv)) 44 return s_utf16le_decoder; 45 if (encoding.value().equals_ignoring_ascii_case("iso-8859-2"sv)) 46 return s_latin2_decoder; 47 if (encoding.value().equals_ignoring_ascii_case("windows-1255"sv)) 48 return s_hebrew_decoder; 49 if (encoding.value().equals_ignoring_ascii_case("windows-1251"sv)) 50 return s_cyrillic_decoder; 51 if (encoding.value().equals_ignoring_ascii_case("koi8-r"sv)) 52 return s_koi8r_decoder; 53 if (encoding.value().equals_ignoring_ascii_case("iso-8859-15"sv)) 54 return s_latin9_decoder; 55 if (encoding.value().equals_ignoring_ascii_case("macintosh"sv)) 56 return s_mac_roman_decoder; 57 if (encoding.value().equals_ignoring_ascii_case("windows-1254"sv)) 58 return s_turkish_decoder; 59 if (encoding.value().equals_ignoring_ascii_case("x-user-defined"sv)) 60 return s_x_user_defined_decoder; 61 } 62 dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding); 63 return {}; 64} 65 66// https://encoding.spec.whatwg.org/#concept-encoding-get 67Optional<StringView> get_standardized_encoding(StringView encoding) 68{ 69 encoding = encoding.trim_whitespace(); 70 71 if (encoding.is_one_of_ignoring_ascii_case("unicode-1-1-utf-8"sv, "unicode11utf8"sv, "unicode20utf8"sv, "utf-8"sv, "utf8"sv, "x-unicode20utf8"sv)) 72 return "UTF-8"sv; 73 if (encoding.is_one_of_ignoring_ascii_case("866"sv, "cp866"sv, "csibm866"sv, "ibm866"sv)) 74 return "IBM866"sv; 75 if (encoding.is_one_of_ignoring_ascii_case("csisolatin2"sv, "iso-8859-2"sv, "iso-ir-101"sv, "iso8859-2"sv, "iso88592"sv, "iso_8859-2"sv, "iso_8859-2:1987"sv, "l2"sv, "latin2"sv)) 76 return "ISO-8859-2"sv; 77 if (encoding.is_one_of_ignoring_ascii_case("csisolatin3"sv, "iso-8859-3"sv, "iso-ir-109"sv, "iso8859-3"sv, "iso88593"sv, "iso_8859-3"sv, "iso_8859-3:1988"sv, "l3"sv, "latin3"sv)) 78 return "ISO-8859-3"sv; 79 if (encoding.is_one_of_ignoring_ascii_case("csisolatin4"sv, "iso-8859-4"sv, "iso-ir-110"sv, "iso8859-4"sv, "iso88594"sv, "iso_8859-4"sv, "iso_8859-4:1989"sv, "l4"sv, "latin4"sv)) 80 return "ISO-8859-4"sv; 81 if (encoding.is_one_of_ignoring_ascii_case("csisolatincyrillic"sv, "cyrillic"sv, "iso-8859-5"sv, "iso-ir-144"sv, "iso8859-5"sv, "iso88595"sv, "iso_8859-5"sv, "iso_8859-5:1988"sv)) 82 return "ISO-8859-5"sv; 83 if (encoding.is_one_of_ignoring_ascii_case("arabic"sv, "asmo-708"sv, "csiso88596e"sv, "csiso88596i"sv, "csisolatinarabic"sv, "ecma-114"sv, "iso-8859-6"sv, "iso-8859-6-e"sv, "iso-8859-6-i"sv, "iso-ir-127"sv, "iso8859-6"sv, "iso88596"sv, "iso_8859-6"sv, "iso_8859-6:1987"sv)) 84 return "ISO-8859-6"sv; 85 if (encoding.is_one_of_ignoring_ascii_case("csisolatingreek"sv, "ecma-118"sv, "elot_928"sv, "greek"sv, "greek8"sv, "iso-8859-7"sv, "iso-ir-126"sv, "iso8859-7"sv, "iso88597"sv, "iso_8859-7"sv, "iso_8859-7:1987"sv, "sun_eu_greek"sv)) 86 return "ISO-8859-7"sv; 87 if (encoding.is_one_of_ignoring_ascii_case("csiso88598e"sv, "csisolatinhebrew"sv, "hebrew"sv, "iso-8859-8"sv, "iso-8859-8-e"sv, "iso-ir-138"sv, "iso8859-8"sv, "iso88598"sv, "iso_8859-8"sv, "iso_8859-8:1988"sv, "visual"sv)) 88 return "ISO-8859-8"sv; 89 if (encoding.is_one_of_ignoring_ascii_case("csiso88598i"sv, "iso-8859-8-i"sv, "logical"sv)) 90 return "ISO-8859-8-I"sv; 91 if (encoding.is_one_of_ignoring_ascii_case("csisolatin6"sv, "iso8859-10"sv, "iso-ir-157"sv, "iso8859-10"sv, "iso885910"sv, "l6"sv, "latin6"sv)) 92 return "ISO-8859-10"sv; 93 if (encoding.is_one_of_ignoring_ascii_case("iso-8859-13"sv, "iso8859-13"sv, "iso885913"sv)) 94 return "ISO-8859-13"sv; 95 if (encoding.is_one_of_ignoring_ascii_case("iso-8859-14"sv, "iso8859-14"sv, "iso885914"sv)) 96 return "ISO-8859-14"sv; 97 if (encoding.is_one_of_ignoring_ascii_case("csisolatin9"sv, "iso-8859-15"sv, "iso8859-15"sv, "iso885915"sv, "iso_8859-15"sv, "l9"sv)) 98 return "ISO-8859-15"sv; 99 if (encoding.is_one_of_ignoring_ascii_case("iso-8859-16"sv)) 100 return "ISO-8859-16"sv; 101 if (encoding.is_one_of_ignoring_ascii_case("cskoi8r"sv, "koi"sv, "koi8"sv, "koi8-r"sv, "koi8_r"sv)) 102 return "KOI8-R"sv; 103 if (encoding.is_one_of_ignoring_ascii_case("koi8-ru"sv, "koi8-u"sv)) 104 return "KOI8-U"sv; 105 if (encoding.is_one_of_ignoring_ascii_case("csmacintosh"sv, "mac"sv, "macintosh"sv, "x-mac-roman"sv)) 106 return "macintosh"sv; 107 if (encoding.is_one_of_ignoring_ascii_case("dos-874"sv, "iso-8859-11"sv, "iso8859-11"sv, "iso885911"sv, "tis-620"sv, "windows-874"sv)) 108 return "windows-874"sv; 109 if (encoding.is_one_of_ignoring_ascii_case("cp1250"sv, "windows-1250"sv, "x-cp1250"sv)) 110 return "windows-1250"sv; 111 if (encoding.is_one_of_ignoring_ascii_case("cp1251"sv, "windows-1251"sv, "x-cp1251"sv)) 112 return "windows-1251"sv; 113 if (encoding.is_one_of_ignoring_ascii_case("ansi_x3.4-1968"sv, "ascii"sv, "cp1252"sv, "cp819"sv, "csisolatin1"sv, "ibm819"sv, "iso-8859-1"sv, "iso-ir-100"sv, "iso8859-1"sv, "iso88591"sv, "iso_8859-1"sv, "iso_8859-1:1987"sv, "l1"sv, "latin1"sv, "us-ascii"sv, "windows-1252"sv, "x-cp1252"sv)) 114 return "windows-1252"sv; 115 if (encoding.is_one_of_ignoring_ascii_case("cp1253"sv, "windows-1253"sv, "x-cp1253"sv)) 116 return "windows-1253"sv; 117 if (encoding.is_one_of_ignoring_ascii_case("cp1254"sv, "csisolatin5"sv, "iso-8859-9"sv, "iso-ir-148"sv, "iso-8859-9"sv, "iso-88599"sv, "iso_8859-9"sv, "iso_8859-9:1989"sv, "l5"sv, "latin5"sv, "windows-1254"sv, "x-cp1254"sv)) 118 return "windows-1254"sv; 119 if (encoding.is_one_of_ignoring_ascii_case("cp1255"sv, "windows-1255"sv, "x-cp1255"sv)) 120 return "windows-1255"sv; 121 if (encoding.is_one_of_ignoring_ascii_case("cp1256"sv, "windows-1256"sv, "x-cp1256"sv)) 122 return "windows-1256"sv; 123 if (encoding.is_one_of_ignoring_ascii_case("cp1257"sv, "windows-1257"sv, "x-cp1257"sv)) 124 return "windows-1257"sv; 125 if (encoding.is_one_of_ignoring_ascii_case("cp1258"sv, "windows-1258"sv, "x-cp1258"sv)) 126 return "windows-1258"sv; 127 if (encoding.is_one_of_ignoring_ascii_case("x-mac-cyrillic"sv, "x-mac-ukrainian"sv)) 128 return "x-mac-cyrillic"sv; 129 if (encoding.is_one_of_ignoring_ascii_case("koi8-r"sv, "koi8r"sv)) 130 return "koi8-r"sv; 131 if (encoding.is_one_of_ignoring_ascii_case("chinese"sv, "csgb2312"sv, "csiso58gb231280"sv, "gb2312"sv, "gb_2312"sv, "gb_2312-80"sv, "gbk"sv, "iso-ir-58"sv, "x-gbk"sv)) 132 return "GBK"sv; 133 if (encoding.is_one_of_ignoring_ascii_case("gb18030"sv)) 134 return "gb18030"sv; 135 if (encoding.is_one_of_ignoring_ascii_case("big5"sv, "big5-hkscs"sv, "cn-big5"sv, "csbig5"sv, "x-x-big5"sv)) 136 return "Big5"sv; 137 if (encoding.is_one_of_ignoring_ascii_case("cseucpkdfmtjapanese"sv, "euc-jp"sv, "x-euc-jp"sv)) 138 return "EUC-JP"sv; 139 if (encoding.is_one_of_ignoring_ascii_case("csiso2022jp"sv, "iso-2022-jp"sv)) 140 return "ISO-2022-JP"sv; 141 if (encoding.is_one_of_ignoring_ascii_case("csshiftjis"sv, "ms932"sv, "ms_kanji"sv, "shift-jis"sv, "shift_jis"sv, "sjis"sv, "windows-31j"sv, "x-sjis"sv)) 142 return "Shift_JIS"sv; 143 if (encoding.is_one_of_ignoring_ascii_case("cseuckr"sv, "csksc56011987"sv, "euc-kr"sv, "iso-ir-149"sv, "korean"sv, "ks_c_5601-1987"sv, "ks_c_5601-1989"sv, "ksc5601"sv, "ksc_5601"sv, "windows-949"sv)) 144 return "EUC-KR"sv; 145 if (encoding.is_one_of_ignoring_ascii_case("csiso2022kr"sv, "hz-gb-2312"sv, "iso-2022-cn"sv, "iso-2022-cn-ext"sv, "iso-2022-kr"sv, "replacement"sv)) 146 return "replacement"sv; 147 if (encoding.is_one_of_ignoring_ascii_case("unicodefffe"sv, "utf-16be"sv)) 148 return "UTF-16BE"sv; 149 if (encoding.is_one_of_ignoring_ascii_case("csunicode"sv, "iso-10646-ucs-2"sv, "ucs-2"sv, "unicode"sv, "unicodefeff"sv, "utf-16"sv, "utf-16le"sv)) 150 return "UTF-16LE"sv; 151 if (encoding.is_one_of_ignoring_ascii_case("x-user-defined"sv)) 152 return "x-user-defined"sv; 153 154 dbgln("TextCodec: Unrecognized encoding: {}", encoding); 155 return {}; 156} 157 158// https://encoding.spec.whatwg.org/#bom-sniff 159Optional<Decoder&> bom_sniff_to_decoder(StringView input) 160{ 161 // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence. 162 // 2. For each of the rows in the table below, starting with the first one and going down, 163 // if BOM starts with the bytes given in the first column, then return the encoding given 164 // in the cell in the second column of that row. Otherwise, return null. 165 166 // Byte Order Mark | Encoding 167 // -------------------------- 168 // 0xEF 0xBB 0xBF | UTF-8 169 // 0xFE 0xFF | UTF-16BE 170 // 0xFF 0xFE | UTF-16LE 171 172 auto bytes = input.bytes(); 173 if (bytes.size() < 2) 174 return {}; 175 176 auto first_byte = bytes[0]; 177 178 switch (first_byte) { 179 case 0xEF: // UTF-8 180 if (bytes.size() < 3) 181 return {}; 182 if (bytes[1] == 0xBB && bytes[2] == 0xBF) 183 return s_utf8_decoder; 184 return {}; 185 case 0xFE: // UTF-16BE 186 if (bytes[1] == 0xFF) 187 return s_utf16be_decoder; 188 return {}; 189 case 0xFF: // UTF-16LE 190 if (bytes[1] == 0xFE) 191 return s_utf16le_decoder; 192 return {}; 193 } 194 195 return {}; 196} 197 198// https://encoding.spec.whatwg.org/#decode 199ErrorOr<String> convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(Decoder& fallback_decoder, StringView input) 200{ 201 Decoder* actual_decoder = &fallback_decoder; 202 203 // 1. Let BOMEncoding be the result of BOM sniffing ioQueue. 204 // 2. If BOMEncoding is non-null: 205 if (auto unicode_decoder = bom_sniff_to_decoder(input); unicode_decoder.has_value()) { 206 // 1. Set encoding to BOMEncoding. 207 actual_decoder = &unicode_decoder.value(); 208 209 // 2. Read three bytes from ioQueue, if BOMEncoding is UTF-8; otherwise read two bytes. (Do nothing with those bytes.) 210 // FIXME: I imagine this will be pretty slow for large inputs, as it's regenerating the input without the first 2/3 bytes. 211 input = input.substring_view(&unicode_decoder.value() == &s_utf8_decoder ? 3 : 2); 212 } 213 214 VERIFY(actual_decoder); 215 216 // FIXME: 3. Process a queue with an instance of encoding’s decoder, ioQueue, output, and "replacement". 217 // This isn't the exact same as the spec, especially the error mode of "replacement", which we don't have the concept of yet. 218 // 4. Return output. 219 return actual_decoder->to_utf8(input); 220} 221 222ErrorOr<String> Decoder::to_utf8(StringView input) 223{ 224 StringBuilder builder(input.length()); 225 TRY(process(input, [&builder](u32 c) { return builder.try_append_code_point(c); })); 226 return builder.to_string(); 227} 228 229ErrorOr<void> UTF8Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 230{ 231 for (auto c : Utf8View(input)) { 232 TRY(on_code_point(c)); 233 } 234 return {}; 235} 236 237ErrorOr<String> UTF8Decoder::to_utf8(StringView input) 238{ 239 // Discard the BOM 240 auto bomless_input = input; 241 if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) { 242 bomless_input = input.substring_view(3); 243 } 244 245 return String::from_utf8(bomless_input); 246} 247 248ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 249{ 250 // rfc2781, 2.2 Decoding UTF-16 251 size_t utf16_length = input.length() - (input.length() % 2); 252 for (size_t i = 0; i < utf16_length; i += 2) { 253 // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value 254 // of W1. Terminate. 255 u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]); 256 if (!is_unicode_surrogate(w1)) { 257 TRY(on_code_point(w1)); 258 continue; 259 } 260 261 // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence 262 // is in error and no valid character can be obtained using W1. 263 // Terminate. 264 // 3) If there is no W2 (that is, the sequence ends with W1), or if W2 265 // is not between 0xDC00 and 0xDFFF, the sequence is in error. 266 // Terminate. 267 if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) { 268 TRY(on_code_point(replacement_code_point)); 269 continue; 270 } 271 272 u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]); 273 if (!Utf16View::is_low_surrogate(w2)) { 274 TRY(on_code_point(replacement_code_point)); 275 continue; 276 } 277 278 // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order 279 // bits of W1 as its 10 high-order bits and the 10 low-order bits of 280 // W2 as its 10 low-order bits. 281 // 5) Add 0x10000 to U' to obtain the character value U. Terminate. 282 TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2))); 283 i += 2; 284 } 285 286 return {}; 287} 288 289ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input) 290{ 291 // Discard the BOM 292 auto bomless_input = input; 293 if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) 294 bomless_input = input.substring_view(2); 295 296 StringBuilder builder(bomless_input.length() / 2); 297 TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); })); 298 return builder.to_string(); 299} 300 301ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 302{ 303 // rfc2781, 2.2 Decoding UTF-16 304 size_t utf16_length = input.length() - (input.length() % 2); 305 for (size_t i = 0; i < utf16_length; i += 2) { 306 // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value 307 // of W1. Terminate. 308 u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8); 309 if (!is_unicode_surrogate(w1)) { 310 TRY(on_code_point(w1)); 311 continue; 312 } 313 314 // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence 315 // is in error and no valid character can be obtained using W1. 316 // Terminate. 317 // 3) If there is no W2 (that is, the sequence ends with W1), or if W2 318 // is not between 0xDC00 and 0xDFFF, the sequence is in error. 319 // Terminate. 320 if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) { 321 TRY(on_code_point(replacement_code_point)); 322 continue; 323 } 324 325 u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8); 326 if (!Utf16View::is_low_surrogate(w2)) { 327 TRY(on_code_point(replacement_code_point)); 328 continue; 329 } 330 331 // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order 332 // bits of W1 as its 10 high-order bits and the 10 low-order bits of 333 // W2 as its 10 low-order bits. 334 // 5) Add 0x10000 to U' to obtain the character value U. Terminate. 335 TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2))); 336 i += 2; 337 } 338 339 return {}; 340} 341 342ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input) 343{ 344 // Discard the BOM 345 auto bomless_input = input; 346 if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE) 347 bomless_input = input.substring_view(2); 348 349 StringBuilder builder(bomless_input.length() / 2); 350 TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); })); 351 return builder.to_string(); 352} 353 354ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 355{ 356 for (u8 ch : input) { 357 // Latin1 is the same as the first 256 Unicode code_points, so no mapping is needed, just utf-8 encoding. 358 TRY(on_code_point(ch)); 359 } 360 361 return {}; 362} 363 364namespace { 365u32 convert_latin2_to_utf8(u8 in) 366{ 367 switch (in) { 368 369#define MAP(X, Y) \ 370 case X: \ 371 return Y 372 373 MAP(0xA1, 0x104); 374 MAP(0xA2, 0x2D8); 375 MAP(0xA3, 0x141); 376 MAP(0xA5, 0x13D); 377 MAP(0xA6, 0x15A); 378 MAP(0xA9, 0x160); 379 MAP(0xAA, 0x15E); 380 MAP(0xAB, 0x164); 381 MAP(0xAC, 0x179); 382 MAP(0xAE, 0x17D); 383 MAP(0xAF, 0x17B); 384 385 MAP(0xB1, 0x105); 386 MAP(0xB2, 0x2DB); 387 MAP(0xB3, 0x142); 388 MAP(0xB5, 0x13E); 389 MAP(0xB6, 0x15B); 390 MAP(0xB7, 0x2C7); 391 MAP(0xB9, 0x161); 392 MAP(0xBA, 0x15F); 393 MAP(0xBB, 0x165); 394 MAP(0xBC, 0x17A); 395 MAP(0xBD, 0x2DD); 396 MAP(0xBE, 0x17E); 397 MAP(0xBF, 0x17C); 398 399 MAP(0xC0, 0x154); 400 MAP(0xC3, 0x102); 401 MAP(0xC5, 0x139); 402 MAP(0xC6, 0x106); 403 MAP(0xC8, 0x10C); 404 MAP(0xCA, 0x118); 405 MAP(0xCC, 0x11A); 406 MAP(0xCF, 0x10E); 407 408 MAP(0xD0, 0x110); 409 MAP(0xD1, 0x143); 410 MAP(0xD2, 0x147); 411 MAP(0xD5, 0x150); 412 MAP(0xD8, 0x158); 413 MAP(0xD9, 0x16E); 414 MAP(0xDB, 0x170); 415 MAP(0xDE, 0x162); 416 417 MAP(0xE0, 0x155); 418 MAP(0xE3, 0x103); 419 MAP(0xE5, 0x13A); 420 MAP(0xE6, 0x107); 421 MAP(0xE8, 0x10D); 422 MAP(0xEA, 0x119); 423 MAP(0xEC, 0x11B); 424 MAP(0xEF, 0x10F); 425 426 MAP(0xF0, 0x111); 427 MAP(0xF1, 0x144); 428 MAP(0xF2, 0x148); 429 MAP(0xF5, 0x151); 430 MAP(0xF8, 0x159); 431 MAP(0xF9, 0x16F); 432 MAP(0xFB, 0x171); 433 MAP(0xFE, 0x163); 434 MAP(0xFF, 0x2D9); 435#undef MAP 436 437 default: 438 return in; 439 } 440} 441} 442 443ErrorOr<void> Latin2Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 444{ 445 for (auto c : input) { 446 TRY(on_code_point(convert_latin2_to_utf8(c))); 447 } 448 449 return {}; 450} 451 452ErrorOr<void> HebrewDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 453{ 454 static constexpr Array<u32, 128> translation_table = { 455 0x20AC, 0xFFFD, 0x201A, 0x192, 0x201E, 0x2026, 0x2020, 0x2021, 0x2C6, 0x2030, 0xFFFD, 0x2039, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 456 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x2DC, 0x2122, 0xFFFD, 0x203A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 457 0xA0, 0xA1, 0xA2, 0xA3, 0x20AA, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 458 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, 459 0x5B0, 0x5B1, 0x5B2, 0x5B3, 0x5B4, 0x5B5, 0x5B6, 0x5B7, 0x5B8, 0x5B9, 0x5BA, 0x5BB, 0x5BC, 0x5BD, 0x5BE, 0x5BF, 460 0x5C0, 0x5C1, 0x5C2, 0x5C3, 0x5F0, 0x5F1, 0x5F2, 0x5F3, 0x5F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 461 0x5D0, 0x5D1, 0x5D2, 0x5D3, 0x5D4, 0x5D5, 0x5D6, 0x5D7, 0x5D8, 0x5D9, 0x5DA, 0x5DB, 0x5DC, 0x5DD, 0x5DE, 0x5DF, 462 0x5E0, 0x5E1, 0x5E2, 0x5E3, 0x5E4, 0x5E5, 0x5E6, 0x5E7, 0x5E8, 0x5E9, 0x5EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD 463 }; 464 for (unsigned char ch : input) { 465 if (ch < 0x80) { // Superset of ASCII 466 TRY(on_code_point(ch)); 467 } else { 468 TRY(on_code_point(translation_table[ch - 0x80])); 469 } 470 } 471 472 return {}; 473} 474 475ErrorOr<void> CyrillicDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 476{ 477 static constexpr Array<u32, 128> translation_table = { 478 0x402, 0x403, 0x201A, 0x453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x409, 0x2039, 0x40A, 0x40C, 0x40B, 0x40F, 479 0x452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x459, 0x203A, 0x45A, 0x45C, 0x45B, 0x45F, 480 0xA0, 0x40E, 0x45E, 0x408, 0xA4, 0x490, 0xA6, 0xA7, 0x401, 0xA9, 0x404, 0xAB, 0xAC, 0xAD, 0xAE, 0x407, 481 0xB0, 0xB1, 0x406, 0x456, 0x491, 0xB5, 0xB6, 0xB7, 0x451, 0x2116, 0x454, 0xBB, 0x458, 0x405, 0x455, 0x457, 482 0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D, 0x41E, 0x41F, 483 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F, 484 0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B, 0x43C, 0x43D, 0x43E, 0x43F, 485 0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x44A, 0x44B, 0x44C, 0x44D, 0x44E, 0x44F 486 }; 487 for (unsigned char ch : input) { 488 if (ch < 0x80) { // Superset of ASCII 489 TRY(on_code_point(ch)); 490 } else { 491 TRY(on_code_point(translation_table[ch - 0x80])); 492 } 493 } 494 495 return {}; 496} 497 498ErrorOr<void> Koi8RDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 499{ 500 // clang-format off 501 static constexpr Array<u32, 128> translation_table = { 502 0x2500,0x2502,0x250c,0x2510,0x2514,0x2518,0x251c,0x2524,0x252c,0x2534,0x253c,0x2580,0x2584,0x2588,0x258c,0x2590, 503 0x2591,0x2592,0x2593,0x2320,0x25a0,0x2219,0x221a,0x2248,0x2264,0x2265,0xA0,0x2321,0xb0,0xb2,0xb7,0xf7, 504 0x2550,0x2551,0x2552,0xd191,0x2553,0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255a,0x255b,0x255c,0x255d,0x255e, 505 0x255f,0x2560,0x2561,0xd081,0x2562,0x2563,0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256a,0x256b,0x256c,0xa9, 506 0x44e,0x430,0x431,0x446,0x434,0x435,0x444,0x433,0x445,0x438,0x439,0x43a,0x43b,0x43c,0x43d,0x43e, 507 0x43f,0x44f,0x440,0x441,0x442,0x443,0x436,0x432,0x44c,0x44b,0x437,0x448,0x44d,0x449,0x447,0x44a, 508 0x42e,0x410,0x441,0x426,0x414,0x415,0x424,0x413,0x425,0x418,0x419,0x41a,0x41b,0x41c,0x41d,0x41e, 509 0x41f,0x42f,0x420,0x421,0x422,0x423,0x416,0x412,0x42c,0x42b,0x417,0x428,0x42d,0x429,0x427,0x42a, 510 }; 511 // clang-format on 512 513 for (unsigned char ch : input) { 514 if (ch < 0x80) { // Superset of ASCII 515 TRY(on_code_point(ch)); 516 } else { 517 TRY(on_code_point(translation_table[ch - 0x80])); 518 } 519 } 520 521 return {}; 522} 523 524ErrorOr<void> Latin9Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 525{ 526 auto convert_latin9_to_utf8 = [](u8 ch) -> u32 { 527 // Latin9 is the same as the first 256 Unicode code points, except for 8 characters. 528 switch (ch) { 529 case 0xA4: 530 return 0x20AC; 531 case 0xA6: 532 return 0x160; 533 case 0xA8: 534 return 0x161; 535 case 0xB4: 536 return 0x17D; 537 case 0xB8: 538 return 0x17E; 539 case 0xBC: 540 return 0x152; 541 case 0xBD: 542 return 0x153; 543 case 0xBE: 544 return 0x178; 545 default: 546 return ch; 547 } 548 }; 549 550 for (auto ch : input) { 551 TRY(on_code_point(convert_latin9_to_utf8(ch))); 552 } 553 554 return {}; 555} 556 557ErrorOr<void> MacRomanDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 558{ 559 // https://encoding.spec.whatwg.org/index-macintosh.txt 560 // clang-format off 561 static constexpr Array<u32, 128> translation_table = { 562 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, 563 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, 564 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, 565 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, 566 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, 567 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, 568 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, 569 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7, 570 }; 571 // clang-format on 572 573 for (u8 ch : input) { 574 if (ch < 0x80) { // Superset of ASCII 575 TRY(on_code_point(ch)); 576 } else { 577 TRY(on_code_point(translation_table[ch - 0x80])); 578 } 579 } 580 581 return {}; 582} 583 584ErrorOr<void> TurkishDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 585{ 586 auto convert_turkish_to_utf8 = [](u8 ch) -> u32 { 587 // Turkish (aka ISO-8859-9, Windows-1254) is the same as the first 256 Unicode code points, except for 6 characters. 588 switch (ch) { 589 case 0xD0: 590 return 0x11E; 591 case 0xDD: 592 return 0x130; 593 case 0xDE: 594 return 0x15E; 595 case 0xF0: 596 return 0x11F; 597 case 0xFD: 598 return 0x131; 599 case 0xFE: 600 return 0x15F; 601 default: 602 return ch; 603 } 604 }; 605 606 for (auto ch : input) { 607 TRY(on_code_point(convert_turkish_to_utf8(ch))); 608 } 609 610 return {}; 611} 612 613// https://encoding.spec.whatwg.org/#x-user-defined-decoder 614ErrorOr<void> XUserDefinedDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) 615{ 616 auto convert_x_user_defined_to_utf8 = [](u8 ch) -> u32 { 617 // 2. If byte is an ASCII byte, return a code point whose value is byte. 618 // https://infra.spec.whatwg.org/#ascii-byte 619 // An ASCII byte is a byte in the range 0x00 (NUL) to 0x7F (DEL), inclusive. 620 // NOTE: This doesn't check for ch >= 0x00, as that would always be true due to being unsigned. 621 if (ch <= 0x7f) 622 return ch; 623 624 // 3. Return a code point whose value is 0xF780 + byte − 0x80. 625 return 0xF780 + ch - 0x80; 626 }; 627 628 for (auto ch : input) { 629 TRY(on_code_point(convert_x_user_defined_to_utf8(ch))); 630 } 631 632 // 1. If byte is end-of-queue, return finished. 633 634 return {}; 635} 636 637}