Userland/Libraries/LibTextCodec/Decoder.cpp at master

jcs.org / serenity
fork atom
Serenity Operating System
fork atom
serenity / Userland / Libraries / LibTextCodec / Decoder.cpp
at master 637 lines 27 kB view raw
wrap content
Andreas Kling Everywhere: Rename equals_ignoring_case => equals_ignoring_ascii_case 3y ago
a504ac3e
  1/*
  2 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3 * Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
  4 * Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
  5 *
  6 * SPDX-License-Identifier: BSD-2-Clause
  7 */
  8
  9#include <AK/StringBuilder.h>
 10#include <AK/Utf16View.h>
 11#include <AK/Utf8View.h>
 12#include <LibTextCodec/Decoder.h>
 13
 14namespace TextCodec {
 15
 16static constexpr u32 replacement_code_point = 0xfffd;
 17
 18namespace {
 19Latin1Decoder s_latin1_decoder;
 20UTF8Decoder s_utf8_decoder;
 21UTF16BEDecoder s_utf16be_decoder;
 22UTF16LEDecoder s_utf16le_decoder;
 23Latin2Decoder s_latin2_decoder;
 24HebrewDecoder s_hebrew_decoder;
 25CyrillicDecoder s_cyrillic_decoder;
 26Koi8RDecoder s_koi8r_decoder;
 27Latin9Decoder s_latin9_decoder;
 28MacRomanDecoder s_mac_roman_decoder;
 29TurkishDecoder s_turkish_decoder;
 30XUserDefinedDecoder s_x_user_defined_decoder;
 31}
 32
 33Optional<Decoder&> decoder_for(StringView a_encoding)
 34{
 35    auto encoding = get_standardized_encoding(a_encoding);
 36    if (encoding.has_value()) {
 37        if (encoding.value().equals_ignoring_ascii_case("windows-1252"sv))
 38            return s_latin1_decoder;
 39        if (encoding.value().equals_ignoring_ascii_case("utf-8"sv))
 40            return s_utf8_decoder;
 41        if (encoding.value().equals_ignoring_ascii_case("utf-16be"sv))
 42            return s_utf16be_decoder;
 43        if (encoding.value().equals_ignoring_ascii_case("utf-16le"sv))
 44            return s_utf16le_decoder;
 45        if (encoding.value().equals_ignoring_ascii_case("iso-8859-2"sv))
 46            return s_latin2_decoder;
 47        if (encoding.value().equals_ignoring_ascii_case("windows-1255"sv))
 48            return s_hebrew_decoder;
 49        if (encoding.value().equals_ignoring_ascii_case("windows-1251"sv))
 50            return s_cyrillic_decoder;
 51        if (encoding.value().equals_ignoring_ascii_case("koi8-r"sv))
 52            return s_koi8r_decoder;
 53        if (encoding.value().equals_ignoring_ascii_case("iso-8859-15"sv))
 54            return s_latin9_decoder;
 55        if (encoding.value().equals_ignoring_ascii_case("macintosh"sv))
 56            return s_mac_roman_decoder;
 57        if (encoding.value().equals_ignoring_ascii_case("windows-1254"sv))
 58            return s_turkish_decoder;
 59        if (encoding.value().equals_ignoring_ascii_case("x-user-defined"sv))
 60            return s_x_user_defined_decoder;
 61    }
 62    dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
 63    return {};
 64}
 65
 66// https://encoding.spec.whatwg.org/#concept-encoding-get
 67Optional<StringView> get_standardized_encoding(StringView encoding)
 68{
 69    encoding = encoding.trim_whitespace();
 70
 71    if (encoding.is_one_of_ignoring_ascii_case("unicode-1-1-utf-8"sv, "unicode11utf8"sv, "unicode20utf8"sv, "utf-8"sv, "utf8"sv, "x-unicode20utf8"sv))
 72        return "UTF-8"sv;
 73    if (encoding.is_one_of_ignoring_ascii_case("866"sv, "cp866"sv, "csibm866"sv, "ibm866"sv))
 74        return "IBM866"sv;
 75    if (encoding.is_one_of_ignoring_ascii_case("csisolatin2"sv, "iso-8859-2"sv, "iso-ir-101"sv, "iso8859-2"sv, "iso88592"sv, "iso_8859-2"sv, "iso_8859-2:1987"sv, "l2"sv, "latin2"sv))
 76        return "ISO-8859-2"sv;
 77    if (encoding.is_one_of_ignoring_ascii_case("csisolatin3"sv, "iso-8859-3"sv, "iso-ir-109"sv, "iso8859-3"sv, "iso88593"sv, "iso_8859-3"sv, "iso_8859-3:1988"sv, "l3"sv, "latin3"sv))
 78        return "ISO-8859-3"sv;
 79    if (encoding.is_one_of_ignoring_ascii_case("csisolatin4"sv, "iso-8859-4"sv, "iso-ir-110"sv, "iso8859-4"sv, "iso88594"sv, "iso_8859-4"sv, "iso_8859-4:1989"sv, "l4"sv, "latin4"sv))
 80        return "ISO-8859-4"sv;
 81    if (encoding.is_one_of_ignoring_ascii_case("csisolatincyrillic"sv, "cyrillic"sv, "iso-8859-5"sv, "iso-ir-144"sv, "iso8859-5"sv, "iso88595"sv, "iso_8859-5"sv, "iso_8859-5:1988"sv))
 82        return "ISO-8859-5"sv;
 83    if (encoding.is_one_of_ignoring_ascii_case("arabic"sv, "asmo-708"sv, "csiso88596e"sv, "csiso88596i"sv, "csisolatinarabic"sv, "ecma-114"sv, "iso-8859-6"sv, "iso-8859-6-e"sv, "iso-8859-6-i"sv, "iso-ir-127"sv, "iso8859-6"sv, "iso88596"sv, "iso_8859-6"sv, "iso_8859-6:1987"sv))
 84        return "ISO-8859-6"sv;
 85    if (encoding.is_one_of_ignoring_ascii_case("csisolatingreek"sv, "ecma-118"sv, "elot_928"sv, "greek"sv, "greek8"sv, "iso-8859-7"sv, "iso-ir-126"sv, "iso8859-7"sv, "iso88597"sv, "iso_8859-7"sv, "iso_8859-7:1987"sv, "sun_eu_greek"sv))
 86        return "ISO-8859-7"sv;
 87    if (encoding.is_one_of_ignoring_ascii_case("csiso88598e"sv, "csisolatinhebrew"sv, "hebrew"sv, "iso-8859-8"sv, "iso-8859-8-e"sv, "iso-ir-138"sv, "iso8859-8"sv, "iso88598"sv, "iso_8859-8"sv, "iso_8859-8:1988"sv, "visual"sv))
 88        return "ISO-8859-8"sv;
 89    if (encoding.is_one_of_ignoring_ascii_case("csiso88598i"sv, "iso-8859-8-i"sv, "logical"sv))
 90        return "ISO-8859-8-I"sv;
 91    if (encoding.is_one_of_ignoring_ascii_case("csisolatin6"sv, "iso8859-10"sv, "iso-ir-157"sv, "iso8859-10"sv, "iso885910"sv, "l6"sv, "latin6"sv))
 92        return "ISO-8859-10"sv;
 93    if (encoding.is_one_of_ignoring_ascii_case("iso-8859-13"sv, "iso8859-13"sv, "iso885913"sv))
 94        return "ISO-8859-13"sv;
 95    if (encoding.is_one_of_ignoring_ascii_case("iso-8859-14"sv, "iso8859-14"sv, "iso885914"sv))
 96        return "ISO-8859-14"sv;
 97    if (encoding.is_one_of_ignoring_ascii_case("csisolatin9"sv, "iso-8859-15"sv, "iso8859-15"sv, "iso885915"sv, "iso_8859-15"sv, "l9"sv))
 98        return "ISO-8859-15"sv;
 99    if (encoding.is_one_of_ignoring_ascii_case("iso-8859-16"sv))
100        return "ISO-8859-16"sv;
101    if (encoding.is_one_of_ignoring_ascii_case("cskoi8r"sv, "koi"sv, "koi8"sv, "koi8-r"sv, "koi8_r"sv))
102        return "KOI8-R"sv;
103    if (encoding.is_one_of_ignoring_ascii_case("koi8-ru"sv, "koi8-u"sv))
104        return "KOI8-U"sv;
105    if (encoding.is_one_of_ignoring_ascii_case("csmacintosh"sv, "mac"sv, "macintosh"sv, "x-mac-roman"sv))
106        return "macintosh"sv;
107    if (encoding.is_one_of_ignoring_ascii_case("dos-874"sv, "iso-8859-11"sv, "iso8859-11"sv, "iso885911"sv, "tis-620"sv, "windows-874"sv))
108        return "windows-874"sv;
109    if (encoding.is_one_of_ignoring_ascii_case("cp1250"sv, "windows-1250"sv, "x-cp1250"sv))
110        return "windows-1250"sv;
111    if (encoding.is_one_of_ignoring_ascii_case("cp1251"sv, "windows-1251"sv, "x-cp1251"sv))
112        return "windows-1251"sv;
113    if (encoding.is_one_of_ignoring_ascii_case("ansi_x3.4-1968"sv, "ascii"sv, "cp1252"sv, "cp819"sv, "csisolatin1"sv, "ibm819"sv, "iso-8859-1"sv, "iso-ir-100"sv, "iso8859-1"sv, "iso88591"sv, "iso_8859-1"sv, "iso_8859-1:1987"sv, "l1"sv, "latin1"sv, "us-ascii"sv, "windows-1252"sv, "x-cp1252"sv))
114        return "windows-1252"sv;
115    if (encoding.is_one_of_ignoring_ascii_case("cp1253"sv, "windows-1253"sv, "x-cp1253"sv))
116        return "windows-1253"sv;
117    if (encoding.is_one_of_ignoring_ascii_case("cp1254"sv, "csisolatin5"sv, "iso-8859-9"sv, "iso-ir-148"sv, "iso-8859-9"sv, "iso-88599"sv, "iso_8859-9"sv, "iso_8859-9:1989"sv, "l5"sv, "latin5"sv, "windows-1254"sv, "x-cp1254"sv))
118        return "windows-1254"sv;
119    if (encoding.is_one_of_ignoring_ascii_case("cp1255"sv, "windows-1255"sv, "x-cp1255"sv))
120        return "windows-1255"sv;
121    if (encoding.is_one_of_ignoring_ascii_case("cp1256"sv, "windows-1256"sv, "x-cp1256"sv))
122        return "windows-1256"sv;
123    if (encoding.is_one_of_ignoring_ascii_case("cp1257"sv, "windows-1257"sv, "x-cp1257"sv))
124        return "windows-1257"sv;
125    if (encoding.is_one_of_ignoring_ascii_case("cp1258"sv, "windows-1258"sv, "x-cp1258"sv))
126        return "windows-1258"sv;
127    if (encoding.is_one_of_ignoring_ascii_case("x-mac-cyrillic"sv, "x-mac-ukrainian"sv))
128        return "x-mac-cyrillic"sv;
129    if (encoding.is_one_of_ignoring_ascii_case("koi8-r"sv, "koi8r"sv))
130        return "koi8-r"sv;
131    if (encoding.is_one_of_ignoring_ascii_case("chinese"sv, "csgb2312"sv, "csiso58gb231280"sv, "gb2312"sv, "gb_2312"sv, "gb_2312-80"sv, "gbk"sv, "iso-ir-58"sv, "x-gbk"sv))
132        return "GBK"sv;
133    if (encoding.is_one_of_ignoring_ascii_case("gb18030"sv))
134        return "gb18030"sv;
135    if (encoding.is_one_of_ignoring_ascii_case("big5"sv, "big5-hkscs"sv, "cn-big5"sv, "csbig5"sv, "x-x-big5"sv))
136        return "Big5"sv;
137    if (encoding.is_one_of_ignoring_ascii_case("cseucpkdfmtjapanese"sv, "euc-jp"sv, "x-euc-jp"sv))
138        return "EUC-JP"sv;
139    if (encoding.is_one_of_ignoring_ascii_case("csiso2022jp"sv, "iso-2022-jp"sv))
140        return "ISO-2022-JP"sv;
141    if (encoding.is_one_of_ignoring_ascii_case("csshiftjis"sv, "ms932"sv, "ms_kanji"sv, "shift-jis"sv, "shift_jis"sv, "sjis"sv, "windows-31j"sv, "x-sjis"sv))
142        return "Shift_JIS"sv;
143    if (encoding.is_one_of_ignoring_ascii_case("cseuckr"sv, "csksc56011987"sv, "euc-kr"sv, "iso-ir-149"sv, "korean"sv, "ks_c_5601-1987"sv, "ks_c_5601-1989"sv, "ksc5601"sv, "ksc_5601"sv, "windows-949"sv))
144        return "EUC-KR"sv;
145    if (encoding.is_one_of_ignoring_ascii_case("csiso2022kr"sv, "hz-gb-2312"sv, "iso-2022-cn"sv, "iso-2022-cn-ext"sv, "iso-2022-kr"sv, "replacement"sv))
146        return "replacement"sv;
147    if (encoding.is_one_of_ignoring_ascii_case("unicodefffe"sv, "utf-16be"sv))
148        return "UTF-16BE"sv;
149    if (encoding.is_one_of_ignoring_ascii_case("csunicode"sv, "iso-10646-ucs-2"sv, "ucs-2"sv, "unicode"sv, "unicodefeff"sv, "utf-16"sv, "utf-16le"sv))
150        return "UTF-16LE"sv;
151    if (encoding.is_one_of_ignoring_ascii_case("x-user-defined"sv))
152        return "x-user-defined"sv;
153
154    dbgln("TextCodec: Unrecognized encoding: {}", encoding);
155    return {};
156}
157
158// https://encoding.spec.whatwg.org/#bom-sniff
159Optional<Decoder&> bom_sniff_to_decoder(StringView input)
160{
161    // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence.
162    // 2. For each of the rows in the table below, starting with the first one and going down,
163    //    if BOM starts with the bytes given in the first column, then return the encoding given
164    //    in the cell in the second column of that row. Otherwise, return null.
165
166    // Byte Order Mark | Encoding
167    // --------------------------
168    // 0xEF 0xBB 0xBF  | UTF-8
169    // 0xFE 0xFF       | UTF-16BE
170    // 0xFF 0xFE       | UTF-16LE
171
172    auto bytes = input.bytes();
173    if (bytes.size() < 2)
174        return {};
175
176    auto first_byte = bytes[0];
177
178    switch (first_byte) {
179    case 0xEF: // UTF-8
180        if (bytes.size() < 3)
181            return {};
182        if (bytes[1] == 0xBB && bytes[2] == 0xBF)
183            return s_utf8_decoder;
184        return {};
185    case 0xFE: // UTF-16BE
186        if (bytes[1] == 0xFF)
187            return s_utf16be_decoder;
188        return {};
189    case 0xFF: // UTF-16LE
190        if (bytes[1] == 0xFE)
191            return s_utf16le_decoder;
192        return {};
193    }
194
195    return {};
196}
197
198// https://encoding.spec.whatwg.org/#decode
199ErrorOr<String> convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(Decoder& fallback_decoder, StringView input)
200{
201    Decoder* actual_decoder = &fallback_decoder;
202
203    // 1. Let BOMEncoding be the result of BOM sniffing ioQueue.
204    // 2. If BOMEncoding is non-null:
205    if (auto unicode_decoder = bom_sniff_to_decoder(input); unicode_decoder.has_value()) {
206        // 1. Set encoding to BOMEncoding.
207        actual_decoder = &unicode_decoder.value();
208
209        // 2. Read three bytes from ioQueue, if BOMEncoding is UTF-8; otherwise read two bytes. (Do nothing with those bytes.)
210        // FIXME: I imagine this will be pretty slow for large inputs, as it's regenerating the input without the first 2/3 bytes.
211        input = input.substring_view(&unicode_decoder.value() == &s_utf8_decoder ? 3 : 2);
212    }
213
214    VERIFY(actual_decoder);
215
216    // FIXME: 3. Process a queue with an instance of encoding’s decoder, ioQueue, output, and "replacement".
217    //        This isn't the exact same as the spec, especially the error mode of "replacement", which we don't have the concept of yet.
218    // 4. Return output.
219    return actual_decoder->to_utf8(input);
220}
221
222ErrorOr<String> Decoder::to_utf8(StringView input)
223{
224    StringBuilder builder(input.length());
225    TRY(process(input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
226    return builder.to_string();
227}
228
229ErrorOr<void> UTF8Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
230{
231    for (auto c : Utf8View(input)) {
232        TRY(on_code_point(c));
233    }
234    return {};
235}
236
237ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
238{
239    // Discard the BOM
240    auto bomless_input = input;
241    if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
242        bomless_input = input.substring_view(3);
243    }
244
245    return String::from_utf8(bomless_input);
246}
247
248ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
249{
250    // rfc2781, 2.2 Decoding UTF-16
251    size_t utf16_length = input.length() - (input.length() % 2);
252    for (size_t i = 0; i < utf16_length; i += 2) {
253        // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
254        //    of W1. Terminate.
255        u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
256        if (!is_unicode_surrogate(w1)) {
257            TRY(on_code_point(w1));
258            continue;
259        }
260
261        // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
262        //    is in error and no valid character can be obtained using W1.
263        //    Terminate.
264        // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
265        //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
266        //    Terminate.
267        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
268            TRY(on_code_point(replacement_code_point));
269            continue;
270        }
271
272        u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
273        if (!Utf16View::is_low_surrogate(w2)) {
274            TRY(on_code_point(replacement_code_point));
275            continue;
276        }
277
278        // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
279        //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
280        //    W2 as its 10 low-order bits.
281        // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
282        TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
283        i += 2;
284    }
285
286    return {};
287}
288
289ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
290{
291    // Discard the BOM
292    auto bomless_input = input;
293    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
294        bomless_input = input.substring_view(2);
295
296    StringBuilder builder(bomless_input.length() / 2);
297    TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
298    return builder.to_string();
299}
300
301ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
302{
303    // rfc2781, 2.2 Decoding UTF-16
304    size_t utf16_length = input.length() - (input.length() % 2);
305    for (size_t i = 0; i < utf16_length; i += 2) {
306        // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
307        //    of W1. Terminate.
308        u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
309        if (!is_unicode_surrogate(w1)) {
310            TRY(on_code_point(w1));
311            continue;
312        }
313
314        // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
315        //    is in error and no valid character can be obtained using W1.
316        //    Terminate.
317        // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
318        //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
319        //    Terminate.
320        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
321            TRY(on_code_point(replacement_code_point));
322            continue;
323        }
324
325        u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
326        if (!Utf16View::is_low_surrogate(w2)) {
327            TRY(on_code_point(replacement_code_point));
328            continue;
329        }
330
331        // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
332        //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
333        //    W2 as its 10 low-order bits.
334        // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
335        TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
336        i += 2;
337    }
338
339    return {};
340}
341
342ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
343{
344    // Discard the BOM
345    auto bomless_input = input;
346    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
347        bomless_input = input.substring_view(2);
348
349    StringBuilder builder(bomless_input.length() / 2);
350    TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
351    return builder.to_string();
352}
353
354ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
355{
356    for (u8 ch : input) {
357        // Latin1 is the same as the first 256 Unicode code_points, so no mapping is needed, just utf-8 encoding.
358        TRY(on_code_point(ch));
359    }
360
361    return {};
362}
363
364namespace {
365u32 convert_latin2_to_utf8(u8 in)
366{
367    switch (in) {
368
369#define MAP(X, Y) \
370    case X:       \
371        return Y
372
373        MAP(0xA1, 0x104);
374        MAP(0xA2, 0x2D8);
375        MAP(0xA3, 0x141);
376        MAP(0xA5, 0x13D);
377        MAP(0xA6, 0x15A);
378        MAP(0xA9, 0x160);
379        MAP(0xAA, 0x15E);
380        MAP(0xAB, 0x164);
381        MAP(0xAC, 0x179);
382        MAP(0xAE, 0x17D);
383        MAP(0xAF, 0x17B);
384
385        MAP(0xB1, 0x105);
386        MAP(0xB2, 0x2DB);
387        MAP(0xB3, 0x142);
388        MAP(0xB5, 0x13E);
389        MAP(0xB6, 0x15B);
390        MAP(0xB7, 0x2C7);
391        MAP(0xB9, 0x161);
392        MAP(0xBA, 0x15F);
393        MAP(0xBB, 0x165);
394        MAP(0xBC, 0x17A);
395        MAP(0xBD, 0x2DD);
396        MAP(0xBE, 0x17E);
397        MAP(0xBF, 0x17C);
398
399        MAP(0xC0, 0x154);
400        MAP(0xC3, 0x102);
401        MAP(0xC5, 0x139);
402        MAP(0xC6, 0x106);
403        MAP(0xC8, 0x10C);
404        MAP(0xCA, 0x118);
405        MAP(0xCC, 0x11A);
406        MAP(0xCF, 0x10E);
407
408        MAP(0xD0, 0x110);
409        MAP(0xD1, 0x143);
410        MAP(0xD2, 0x147);
411        MAP(0xD5, 0x150);
412        MAP(0xD8, 0x158);
413        MAP(0xD9, 0x16E);
414        MAP(0xDB, 0x170);
415        MAP(0xDE, 0x162);
416
417        MAP(0xE0, 0x155);
418        MAP(0xE3, 0x103);
419        MAP(0xE5, 0x13A);
420        MAP(0xE6, 0x107);
421        MAP(0xE8, 0x10D);
422        MAP(0xEA, 0x119);
423        MAP(0xEC, 0x11B);
424        MAP(0xEF, 0x10F);
425
426        MAP(0xF0, 0x111);
427        MAP(0xF1, 0x144);
428        MAP(0xF2, 0x148);
429        MAP(0xF5, 0x151);
430        MAP(0xF8, 0x159);
431        MAP(0xF9, 0x16F);
432        MAP(0xFB, 0x171);
433        MAP(0xFE, 0x163);
434        MAP(0xFF, 0x2D9);
435#undef MAP
436
437    default:
438        return in;
439    }
440}
441}
442
443ErrorOr<void> Latin2Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
444{
445    for (auto c : input) {
446        TRY(on_code_point(convert_latin2_to_utf8(c)));
447    }
448
449    return {};
450}
451
452ErrorOr<void> HebrewDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
453{
454    static constexpr Array<u32, 128> translation_table = {
455        0x20AC, 0xFFFD, 0x201A, 0x192, 0x201E, 0x2026, 0x2020, 0x2021, 0x2C6, 0x2030, 0xFFFD, 0x2039, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
456        0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x2DC, 0x2122, 0xFFFD, 0x203A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
457        0xA0, 0xA1, 0xA2, 0xA3, 0x20AA, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
458        0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
459        0x5B0, 0x5B1, 0x5B2, 0x5B3, 0x5B4, 0x5B5, 0x5B6, 0x5B7, 0x5B8, 0x5B9, 0x5BA, 0x5BB, 0x5BC, 0x5BD, 0x5BE, 0x5BF,
460        0x5C0, 0x5C1, 0x5C2, 0x5C3, 0x5F0, 0x5F1, 0x5F2, 0x5F3, 0x5F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
461        0x5D0, 0x5D1, 0x5D2, 0x5D3, 0x5D4, 0x5D5, 0x5D6, 0x5D7, 0x5D8, 0x5D9, 0x5DA, 0x5DB, 0x5DC, 0x5DD, 0x5DE, 0x5DF,
462        0x5E0, 0x5E1, 0x5E2, 0x5E3, 0x5E4, 0x5E5, 0x5E6, 0x5E7, 0x5E8, 0x5E9, 0x5EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
463    };
464    for (unsigned char ch : input) {
465        if (ch < 0x80) { // Superset of ASCII
466            TRY(on_code_point(ch));
467        } else {
468            TRY(on_code_point(translation_table[ch - 0x80]));
469        }
470    }
471
472    return {};
473}
474
475ErrorOr<void> CyrillicDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
476{
477    static constexpr Array<u32, 128> translation_table = {
478        0x402, 0x403, 0x201A, 0x453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x409, 0x2039, 0x40A, 0x40C, 0x40B, 0x40F,
479        0x452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x459, 0x203A, 0x45A, 0x45C, 0x45B, 0x45F,
480        0xA0, 0x40E, 0x45E, 0x408, 0xA4, 0x490, 0xA6, 0xA7, 0x401, 0xA9, 0x404, 0xAB, 0xAC, 0xAD, 0xAE, 0x407,
481        0xB0, 0xB1, 0x406, 0x456, 0x491, 0xB5, 0xB6, 0xB7, 0x451, 0x2116, 0x454, 0xBB, 0x458, 0x405, 0x455, 0x457,
482        0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D, 0x41E, 0x41F,
483        0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F,
484        0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B, 0x43C, 0x43D, 0x43E, 0x43F,
485        0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x44A, 0x44B, 0x44C, 0x44D, 0x44E, 0x44F
486    };
487    for (unsigned char ch : input) {
488        if (ch < 0x80) { // Superset of ASCII
489            TRY(on_code_point(ch));
490        } else {
491            TRY(on_code_point(translation_table[ch - 0x80]));
492        }
493    }
494
495    return {};
496}
497
498ErrorOr<void> Koi8RDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
499{
500    // clang-format off
501    static constexpr Array<u32, 128> translation_table = {
502        0x2500,0x2502,0x250c,0x2510,0x2514,0x2518,0x251c,0x2524,0x252c,0x2534,0x253c,0x2580,0x2584,0x2588,0x258c,0x2590,
503        0x2591,0x2592,0x2593,0x2320,0x25a0,0x2219,0x221a,0x2248,0x2264,0x2265,0xA0,0x2321,0xb0,0xb2,0xb7,0xf7,
504        0x2550,0x2551,0x2552,0xd191,0x2553,0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255a,0x255b,0x255c,0x255d,0x255e,
505        0x255f,0x2560,0x2561,0xd081,0x2562,0x2563,0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256a,0x256b,0x256c,0xa9,
506        0x44e,0x430,0x431,0x446,0x434,0x435,0x444,0x433,0x445,0x438,0x439,0x43a,0x43b,0x43c,0x43d,0x43e,
507        0x43f,0x44f,0x440,0x441,0x442,0x443,0x436,0x432,0x44c,0x44b,0x437,0x448,0x44d,0x449,0x447,0x44a,
508        0x42e,0x410,0x441,0x426,0x414,0x415,0x424,0x413,0x425,0x418,0x419,0x41a,0x41b,0x41c,0x41d,0x41e,
509        0x41f,0x42f,0x420,0x421,0x422,0x423,0x416,0x412,0x42c,0x42b,0x417,0x428,0x42d,0x429,0x427,0x42a,
510    };
511    // clang-format on
512
513    for (unsigned char ch : input) {
514        if (ch < 0x80) { // Superset of ASCII
515            TRY(on_code_point(ch));
516        } else {
517            TRY(on_code_point(translation_table[ch - 0x80]));
518        }
519    }
520
521    return {};
522}
523
524ErrorOr<void> Latin9Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
525{
526    auto convert_latin9_to_utf8 = [](u8 ch) -> u32 {
527        // Latin9 is the same as the first 256 Unicode code points, except for 8 characters.
528        switch (ch) {
529        case 0xA4:
530            return 0x20AC;
531        case 0xA6:
532            return 0x160;
533        case 0xA8:
534            return 0x161;
535        case 0xB4:
536            return 0x17D;
537        case 0xB8:
538            return 0x17E;
539        case 0xBC:
540            return 0x152;
541        case 0xBD:
542            return 0x153;
543        case 0xBE:
544            return 0x178;
545        default:
546            return ch;
547        }
548    };
549
550    for (auto ch : input) {
551        TRY(on_code_point(convert_latin9_to_utf8(ch)));
552    }
553
554    return {};
555}
556
557ErrorOr<void> MacRomanDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
558{
559    // https://encoding.spec.whatwg.org/index-macintosh.txt
560    // clang-format off
561    static constexpr Array<u32, 128> translation_table = {
562        0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
563        0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
564        0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
565        0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
566        0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
567        0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
568        0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
569        0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
570    };
571    // clang-format on
572
573    for (u8 ch : input) {
574        if (ch < 0x80) { // Superset of ASCII
575            TRY(on_code_point(ch));
576        } else {
577            TRY(on_code_point(translation_table[ch - 0x80]));
578        }
579    }
580
581    return {};
582}
583
584ErrorOr<void> TurkishDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
585{
586    auto convert_turkish_to_utf8 = [](u8 ch) -> u32 {
587        // Turkish (aka ISO-8859-9, Windows-1254) is the same as the first 256 Unicode code points, except for 6 characters.
588        switch (ch) {
589        case 0xD0:
590            return 0x11E;
591        case 0xDD:
592            return 0x130;
593        case 0xDE:
594            return 0x15E;
595        case 0xF0:
596            return 0x11F;
597        case 0xFD:
598            return 0x131;
599        case 0xFE:
600            return 0x15F;
601        default:
602            return ch;
603        }
604    };
605
606    for (auto ch : input) {
607        TRY(on_code_point(convert_turkish_to_utf8(ch)));
608    }
609
610    return {};
611}
612
613// https://encoding.spec.whatwg.org/#x-user-defined-decoder
614ErrorOr<void> XUserDefinedDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
615{
616    auto convert_x_user_defined_to_utf8 = [](u8 ch) -> u32 {
617        // 2. If byte is an ASCII byte, return a code point whose value is byte.
618        // https://infra.spec.whatwg.org/#ascii-byte
619        // An ASCII byte is a byte in the range 0x00 (NUL) to 0x7F (DEL), inclusive.
620        // NOTE: This doesn't check for ch >= 0x00, as that would always be true due to being unsigned.
621        if (ch <= 0x7f)
622            return ch;
623
624        // 3. Return a code point whose value is 0xF780 + byte − 0x80.
625        return 0xF780 + ch - 0x80;
626    };
627
628    for (auto ch : input) {
629        TRY(on_code_point(convert_x_user_defined_to_utf8(ch)));
630    }
631
632    // 1. If byte is end-of-queue, return finished.
633
634    return {};
635}
636
637}