Serenity Operating System
1/*
2 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
3 * Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
4 * Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
5 *
6 * SPDX-License-Identifier: BSD-2-Clause
7 */
8
9#include <AK/StringBuilder.h>
10#include <AK/Utf16View.h>
11#include <AK/Utf8View.h>
12#include <LibTextCodec/Decoder.h>
13
14namespace TextCodec {
15
16static constexpr u32 replacement_code_point = 0xfffd;
17
18namespace {
19Latin1Decoder s_latin1_decoder;
20UTF8Decoder s_utf8_decoder;
21UTF16BEDecoder s_utf16be_decoder;
22UTF16LEDecoder s_utf16le_decoder;
23Latin2Decoder s_latin2_decoder;
24HebrewDecoder s_hebrew_decoder;
25CyrillicDecoder s_cyrillic_decoder;
26Koi8RDecoder s_koi8r_decoder;
27Latin9Decoder s_latin9_decoder;
28MacRomanDecoder s_mac_roman_decoder;
29TurkishDecoder s_turkish_decoder;
30XUserDefinedDecoder s_x_user_defined_decoder;
31}
32
33Optional<Decoder&> decoder_for(StringView a_encoding)
34{
35 auto encoding = get_standardized_encoding(a_encoding);
36 if (encoding.has_value()) {
37 if (encoding.value().equals_ignoring_ascii_case("windows-1252"sv))
38 return s_latin1_decoder;
39 if (encoding.value().equals_ignoring_ascii_case("utf-8"sv))
40 return s_utf8_decoder;
41 if (encoding.value().equals_ignoring_ascii_case("utf-16be"sv))
42 return s_utf16be_decoder;
43 if (encoding.value().equals_ignoring_ascii_case("utf-16le"sv))
44 return s_utf16le_decoder;
45 if (encoding.value().equals_ignoring_ascii_case("iso-8859-2"sv))
46 return s_latin2_decoder;
47 if (encoding.value().equals_ignoring_ascii_case("windows-1255"sv))
48 return s_hebrew_decoder;
49 if (encoding.value().equals_ignoring_ascii_case("windows-1251"sv))
50 return s_cyrillic_decoder;
51 if (encoding.value().equals_ignoring_ascii_case("koi8-r"sv))
52 return s_koi8r_decoder;
53 if (encoding.value().equals_ignoring_ascii_case("iso-8859-15"sv))
54 return s_latin9_decoder;
55 if (encoding.value().equals_ignoring_ascii_case("macintosh"sv))
56 return s_mac_roman_decoder;
57 if (encoding.value().equals_ignoring_ascii_case("windows-1254"sv))
58 return s_turkish_decoder;
59 if (encoding.value().equals_ignoring_ascii_case("x-user-defined"sv))
60 return s_x_user_defined_decoder;
61 }
62 dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
63 return {};
64}
65
66// https://encoding.spec.whatwg.org/#concept-encoding-get
67Optional<StringView> get_standardized_encoding(StringView encoding)
68{
69 encoding = encoding.trim_whitespace();
70
71 if (encoding.is_one_of_ignoring_ascii_case("unicode-1-1-utf-8"sv, "unicode11utf8"sv, "unicode20utf8"sv, "utf-8"sv, "utf8"sv, "x-unicode20utf8"sv))
72 return "UTF-8"sv;
73 if (encoding.is_one_of_ignoring_ascii_case("866"sv, "cp866"sv, "csibm866"sv, "ibm866"sv))
74 return "IBM866"sv;
75 if (encoding.is_one_of_ignoring_ascii_case("csisolatin2"sv, "iso-8859-2"sv, "iso-ir-101"sv, "iso8859-2"sv, "iso88592"sv, "iso_8859-2"sv, "iso_8859-2:1987"sv, "l2"sv, "latin2"sv))
76 return "ISO-8859-2"sv;
77 if (encoding.is_one_of_ignoring_ascii_case("csisolatin3"sv, "iso-8859-3"sv, "iso-ir-109"sv, "iso8859-3"sv, "iso88593"sv, "iso_8859-3"sv, "iso_8859-3:1988"sv, "l3"sv, "latin3"sv))
78 return "ISO-8859-3"sv;
79 if (encoding.is_one_of_ignoring_ascii_case("csisolatin4"sv, "iso-8859-4"sv, "iso-ir-110"sv, "iso8859-4"sv, "iso88594"sv, "iso_8859-4"sv, "iso_8859-4:1989"sv, "l4"sv, "latin4"sv))
80 return "ISO-8859-4"sv;
81 if (encoding.is_one_of_ignoring_ascii_case("csisolatincyrillic"sv, "cyrillic"sv, "iso-8859-5"sv, "iso-ir-144"sv, "iso8859-5"sv, "iso88595"sv, "iso_8859-5"sv, "iso_8859-5:1988"sv))
82 return "ISO-8859-5"sv;
83 if (encoding.is_one_of_ignoring_ascii_case("arabic"sv, "asmo-708"sv, "csiso88596e"sv, "csiso88596i"sv, "csisolatinarabic"sv, "ecma-114"sv, "iso-8859-6"sv, "iso-8859-6-e"sv, "iso-8859-6-i"sv, "iso-ir-127"sv, "iso8859-6"sv, "iso88596"sv, "iso_8859-6"sv, "iso_8859-6:1987"sv))
84 return "ISO-8859-6"sv;
85 if (encoding.is_one_of_ignoring_ascii_case("csisolatingreek"sv, "ecma-118"sv, "elot_928"sv, "greek"sv, "greek8"sv, "iso-8859-7"sv, "iso-ir-126"sv, "iso8859-7"sv, "iso88597"sv, "iso_8859-7"sv, "iso_8859-7:1987"sv, "sun_eu_greek"sv))
86 return "ISO-8859-7"sv;
87 if (encoding.is_one_of_ignoring_ascii_case("csiso88598e"sv, "csisolatinhebrew"sv, "hebrew"sv, "iso-8859-8"sv, "iso-8859-8-e"sv, "iso-ir-138"sv, "iso8859-8"sv, "iso88598"sv, "iso_8859-8"sv, "iso_8859-8:1988"sv, "visual"sv))
88 return "ISO-8859-8"sv;
89 if (encoding.is_one_of_ignoring_ascii_case("csiso88598i"sv, "iso-8859-8-i"sv, "logical"sv))
90 return "ISO-8859-8-I"sv;
91 if (encoding.is_one_of_ignoring_ascii_case("csisolatin6"sv, "iso8859-10"sv, "iso-ir-157"sv, "iso8859-10"sv, "iso885910"sv, "l6"sv, "latin6"sv))
92 return "ISO-8859-10"sv;
93 if (encoding.is_one_of_ignoring_ascii_case("iso-8859-13"sv, "iso8859-13"sv, "iso885913"sv))
94 return "ISO-8859-13"sv;
95 if (encoding.is_one_of_ignoring_ascii_case("iso-8859-14"sv, "iso8859-14"sv, "iso885914"sv))
96 return "ISO-8859-14"sv;
97 if (encoding.is_one_of_ignoring_ascii_case("csisolatin9"sv, "iso-8859-15"sv, "iso8859-15"sv, "iso885915"sv, "iso_8859-15"sv, "l9"sv))
98 return "ISO-8859-15"sv;
99 if (encoding.is_one_of_ignoring_ascii_case("iso-8859-16"sv))
100 return "ISO-8859-16"sv;
101 if (encoding.is_one_of_ignoring_ascii_case("cskoi8r"sv, "koi"sv, "koi8"sv, "koi8-r"sv, "koi8_r"sv))
102 return "KOI8-R"sv;
103 if (encoding.is_one_of_ignoring_ascii_case("koi8-ru"sv, "koi8-u"sv))
104 return "KOI8-U"sv;
105 if (encoding.is_one_of_ignoring_ascii_case("csmacintosh"sv, "mac"sv, "macintosh"sv, "x-mac-roman"sv))
106 return "macintosh"sv;
107 if (encoding.is_one_of_ignoring_ascii_case("dos-874"sv, "iso-8859-11"sv, "iso8859-11"sv, "iso885911"sv, "tis-620"sv, "windows-874"sv))
108 return "windows-874"sv;
109 if (encoding.is_one_of_ignoring_ascii_case("cp1250"sv, "windows-1250"sv, "x-cp1250"sv))
110 return "windows-1250"sv;
111 if (encoding.is_one_of_ignoring_ascii_case("cp1251"sv, "windows-1251"sv, "x-cp1251"sv))
112 return "windows-1251"sv;
113 if (encoding.is_one_of_ignoring_ascii_case("ansi_x3.4-1968"sv, "ascii"sv, "cp1252"sv, "cp819"sv, "csisolatin1"sv, "ibm819"sv, "iso-8859-1"sv, "iso-ir-100"sv, "iso8859-1"sv, "iso88591"sv, "iso_8859-1"sv, "iso_8859-1:1987"sv, "l1"sv, "latin1"sv, "us-ascii"sv, "windows-1252"sv, "x-cp1252"sv))
114 return "windows-1252"sv;
115 if (encoding.is_one_of_ignoring_ascii_case("cp1253"sv, "windows-1253"sv, "x-cp1253"sv))
116 return "windows-1253"sv;
117 if (encoding.is_one_of_ignoring_ascii_case("cp1254"sv, "csisolatin5"sv, "iso-8859-9"sv, "iso-ir-148"sv, "iso-8859-9"sv, "iso-88599"sv, "iso_8859-9"sv, "iso_8859-9:1989"sv, "l5"sv, "latin5"sv, "windows-1254"sv, "x-cp1254"sv))
118 return "windows-1254"sv;
119 if (encoding.is_one_of_ignoring_ascii_case("cp1255"sv, "windows-1255"sv, "x-cp1255"sv))
120 return "windows-1255"sv;
121 if (encoding.is_one_of_ignoring_ascii_case("cp1256"sv, "windows-1256"sv, "x-cp1256"sv))
122 return "windows-1256"sv;
123 if (encoding.is_one_of_ignoring_ascii_case("cp1257"sv, "windows-1257"sv, "x-cp1257"sv))
124 return "windows-1257"sv;
125 if (encoding.is_one_of_ignoring_ascii_case("cp1258"sv, "windows-1258"sv, "x-cp1258"sv))
126 return "windows-1258"sv;
127 if (encoding.is_one_of_ignoring_ascii_case("x-mac-cyrillic"sv, "x-mac-ukrainian"sv))
128 return "x-mac-cyrillic"sv;
129 if (encoding.is_one_of_ignoring_ascii_case("koi8-r"sv, "koi8r"sv))
130 return "koi8-r"sv;
131 if (encoding.is_one_of_ignoring_ascii_case("chinese"sv, "csgb2312"sv, "csiso58gb231280"sv, "gb2312"sv, "gb_2312"sv, "gb_2312-80"sv, "gbk"sv, "iso-ir-58"sv, "x-gbk"sv))
132 return "GBK"sv;
133 if (encoding.is_one_of_ignoring_ascii_case("gb18030"sv))
134 return "gb18030"sv;
135 if (encoding.is_one_of_ignoring_ascii_case("big5"sv, "big5-hkscs"sv, "cn-big5"sv, "csbig5"sv, "x-x-big5"sv))
136 return "Big5"sv;
137 if (encoding.is_one_of_ignoring_ascii_case("cseucpkdfmtjapanese"sv, "euc-jp"sv, "x-euc-jp"sv))
138 return "EUC-JP"sv;
139 if (encoding.is_one_of_ignoring_ascii_case("csiso2022jp"sv, "iso-2022-jp"sv))
140 return "ISO-2022-JP"sv;
141 if (encoding.is_one_of_ignoring_ascii_case("csshiftjis"sv, "ms932"sv, "ms_kanji"sv, "shift-jis"sv, "shift_jis"sv, "sjis"sv, "windows-31j"sv, "x-sjis"sv))
142 return "Shift_JIS"sv;
143 if (encoding.is_one_of_ignoring_ascii_case("cseuckr"sv, "csksc56011987"sv, "euc-kr"sv, "iso-ir-149"sv, "korean"sv, "ks_c_5601-1987"sv, "ks_c_5601-1989"sv, "ksc5601"sv, "ksc_5601"sv, "windows-949"sv))
144 return "EUC-KR"sv;
145 if (encoding.is_one_of_ignoring_ascii_case("csiso2022kr"sv, "hz-gb-2312"sv, "iso-2022-cn"sv, "iso-2022-cn-ext"sv, "iso-2022-kr"sv, "replacement"sv))
146 return "replacement"sv;
147 if (encoding.is_one_of_ignoring_ascii_case("unicodefffe"sv, "utf-16be"sv))
148 return "UTF-16BE"sv;
149 if (encoding.is_one_of_ignoring_ascii_case("csunicode"sv, "iso-10646-ucs-2"sv, "ucs-2"sv, "unicode"sv, "unicodefeff"sv, "utf-16"sv, "utf-16le"sv))
150 return "UTF-16LE"sv;
151 if (encoding.is_one_of_ignoring_ascii_case("x-user-defined"sv))
152 return "x-user-defined"sv;
153
154 dbgln("TextCodec: Unrecognized encoding: {}", encoding);
155 return {};
156}
157
158// https://encoding.spec.whatwg.org/#bom-sniff
159Optional<Decoder&> bom_sniff_to_decoder(StringView input)
160{
161 // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence.
162 // 2. For each of the rows in the table below, starting with the first one and going down,
163 // if BOM starts with the bytes given in the first column, then return the encoding given
164 // in the cell in the second column of that row. Otherwise, return null.
165
166 // Byte Order Mark | Encoding
167 // --------------------------
168 // 0xEF 0xBB 0xBF | UTF-8
169 // 0xFE 0xFF | UTF-16BE
170 // 0xFF 0xFE | UTF-16LE
171
172 auto bytes = input.bytes();
173 if (bytes.size() < 2)
174 return {};
175
176 auto first_byte = bytes[0];
177
178 switch (first_byte) {
179 case 0xEF: // UTF-8
180 if (bytes.size() < 3)
181 return {};
182 if (bytes[1] == 0xBB && bytes[2] == 0xBF)
183 return s_utf8_decoder;
184 return {};
185 case 0xFE: // UTF-16BE
186 if (bytes[1] == 0xFF)
187 return s_utf16be_decoder;
188 return {};
189 case 0xFF: // UTF-16LE
190 if (bytes[1] == 0xFE)
191 return s_utf16le_decoder;
192 return {};
193 }
194
195 return {};
196}
197
198// https://encoding.spec.whatwg.org/#decode
199ErrorOr<String> convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(Decoder& fallback_decoder, StringView input)
200{
201 Decoder* actual_decoder = &fallback_decoder;
202
203 // 1. Let BOMEncoding be the result of BOM sniffing ioQueue.
204 // 2. If BOMEncoding is non-null:
205 if (auto unicode_decoder = bom_sniff_to_decoder(input); unicode_decoder.has_value()) {
206 // 1. Set encoding to BOMEncoding.
207 actual_decoder = &unicode_decoder.value();
208
209 // 2. Read three bytes from ioQueue, if BOMEncoding is UTF-8; otherwise read two bytes. (Do nothing with those bytes.)
210 // FIXME: I imagine this will be pretty slow for large inputs, as it's regenerating the input without the first 2/3 bytes.
211 input = input.substring_view(&unicode_decoder.value() == &s_utf8_decoder ? 3 : 2);
212 }
213
214 VERIFY(actual_decoder);
215
216 // FIXME: 3. Process a queue with an instance of encoding’s decoder, ioQueue, output, and "replacement".
217 // This isn't the exact same as the spec, especially the error mode of "replacement", which we don't have the concept of yet.
218 // 4. Return output.
219 return actual_decoder->to_utf8(input);
220}
221
222ErrorOr<String> Decoder::to_utf8(StringView input)
223{
224 StringBuilder builder(input.length());
225 TRY(process(input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
226 return builder.to_string();
227}
228
229ErrorOr<void> UTF8Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
230{
231 for (auto c : Utf8View(input)) {
232 TRY(on_code_point(c));
233 }
234 return {};
235}
236
237ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
238{
239 // Discard the BOM
240 auto bomless_input = input;
241 if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
242 bomless_input = input.substring_view(3);
243 }
244
245 return String::from_utf8(bomless_input);
246}
247
248ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
249{
250 // rfc2781, 2.2 Decoding UTF-16
251 size_t utf16_length = input.length() - (input.length() % 2);
252 for (size_t i = 0; i < utf16_length; i += 2) {
253 // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
254 // of W1. Terminate.
255 u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
256 if (!is_unicode_surrogate(w1)) {
257 TRY(on_code_point(w1));
258 continue;
259 }
260
261 // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
262 // is in error and no valid character can be obtained using W1.
263 // Terminate.
264 // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
265 // is not between 0xDC00 and 0xDFFF, the sequence is in error.
266 // Terminate.
267 if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
268 TRY(on_code_point(replacement_code_point));
269 continue;
270 }
271
272 u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
273 if (!Utf16View::is_low_surrogate(w2)) {
274 TRY(on_code_point(replacement_code_point));
275 continue;
276 }
277
278 // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
279 // bits of W1 as its 10 high-order bits and the 10 low-order bits of
280 // W2 as its 10 low-order bits.
281 // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
282 TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
283 i += 2;
284 }
285
286 return {};
287}
288
289ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
290{
291 // Discard the BOM
292 auto bomless_input = input;
293 if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
294 bomless_input = input.substring_view(2);
295
296 StringBuilder builder(bomless_input.length() / 2);
297 TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
298 return builder.to_string();
299}
300
301ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
302{
303 // rfc2781, 2.2 Decoding UTF-16
304 size_t utf16_length = input.length() - (input.length() % 2);
305 for (size_t i = 0; i < utf16_length; i += 2) {
306 // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
307 // of W1. Terminate.
308 u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
309 if (!is_unicode_surrogate(w1)) {
310 TRY(on_code_point(w1));
311 continue;
312 }
313
314 // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
315 // is in error and no valid character can be obtained using W1.
316 // Terminate.
317 // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
318 // is not between 0xDC00 and 0xDFFF, the sequence is in error.
319 // Terminate.
320 if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) {
321 TRY(on_code_point(replacement_code_point));
322 continue;
323 }
324
325 u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
326 if (!Utf16View::is_low_surrogate(w2)) {
327 TRY(on_code_point(replacement_code_point));
328 continue;
329 }
330
331 // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
332 // bits of W1 as its 10 high-order bits and the 10 low-order bits of
333 // W2 as its 10 low-order bits.
334 // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
335 TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2)));
336 i += 2;
337 }
338
339 return {};
340}
341
342ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
343{
344 // Discard the BOM
345 auto bomless_input = input;
346 if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
347 bomless_input = input.substring_view(2);
348
349 StringBuilder builder(bomless_input.length() / 2);
350 TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
351 return builder.to_string();
352}
353
354ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
355{
356 for (u8 ch : input) {
357 // Latin1 is the same as the first 256 Unicode code_points, so no mapping is needed, just utf-8 encoding.
358 TRY(on_code_point(ch));
359 }
360
361 return {};
362}
363
364namespace {
365u32 convert_latin2_to_utf8(u8 in)
366{
367 switch (in) {
368
369#define MAP(X, Y) \
370 case X: \
371 return Y
372
373 MAP(0xA1, 0x104);
374 MAP(0xA2, 0x2D8);
375 MAP(0xA3, 0x141);
376 MAP(0xA5, 0x13D);
377 MAP(0xA6, 0x15A);
378 MAP(0xA9, 0x160);
379 MAP(0xAA, 0x15E);
380 MAP(0xAB, 0x164);
381 MAP(0xAC, 0x179);
382 MAP(0xAE, 0x17D);
383 MAP(0xAF, 0x17B);
384
385 MAP(0xB1, 0x105);
386 MAP(0xB2, 0x2DB);
387 MAP(0xB3, 0x142);
388 MAP(0xB5, 0x13E);
389 MAP(0xB6, 0x15B);
390 MAP(0xB7, 0x2C7);
391 MAP(0xB9, 0x161);
392 MAP(0xBA, 0x15F);
393 MAP(0xBB, 0x165);
394 MAP(0xBC, 0x17A);
395 MAP(0xBD, 0x2DD);
396 MAP(0xBE, 0x17E);
397 MAP(0xBF, 0x17C);
398
399 MAP(0xC0, 0x154);
400 MAP(0xC3, 0x102);
401 MAP(0xC5, 0x139);
402 MAP(0xC6, 0x106);
403 MAP(0xC8, 0x10C);
404 MAP(0xCA, 0x118);
405 MAP(0xCC, 0x11A);
406 MAP(0xCF, 0x10E);
407
408 MAP(0xD0, 0x110);
409 MAP(0xD1, 0x143);
410 MAP(0xD2, 0x147);
411 MAP(0xD5, 0x150);
412 MAP(0xD8, 0x158);
413 MAP(0xD9, 0x16E);
414 MAP(0xDB, 0x170);
415 MAP(0xDE, 0x162);
416
417 MAP(0xE0, 0x155);
418 MAP(0xE3, 0x103);
419 MAP(0xE5, 0x13A);
420 MAP(0xE6, 0x107);
421 MAP(0xE8, 0x10D);
422 MAP(0xEA, 0x119);
423 MAP(0xEC, 0x11B);
424 MAP(0xEF, 0x10F);
425
426 MAP(0xF0, 0x111);
427 MAP(0xF1, 0x144);
428 MAP(0xF2, 0x148);
429 MAP(0xF5, 0x151);
430 MAP(0xF8, 0x159);
431 MAP(0xF9, 0x16F);
432 MAP(0xFB, 0x171);
433 MAP(0xFE, 0x163);
434 MAP(0xFF, 0x2D9);
435#undef MAP
436
437 default:
438 return in;
439 }
440}
441}
442
443ErrorOr<void> Latin2Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
444{
445 for (auto c : input) {
446 TRY(on_code_point(convert_latin2_to_utf8(c)));
447 }
448
449 return {};
450}
451
452ErrorOr<void> HebrewDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
453{
454 static constexpr Array<u32, 128> translation_table = {
455 0x20AC, 0xFFFD, 0x201A, 0x192, 0x201E, 0x2026, 0x2020, 0x2021, 0x2C6, 0x2030, 0xFFFD, 0x2039, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
456 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x2DC, 0x2122, 0xFFFD, 0x203A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
457 0xA0, 0xA1, 0xA2, 0xA3, 0x20AA, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
458 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
459 0x5B0, 0x5B1, 0x5B2, 0x5B3, 0x5B4, 0x5B5, 0x5B6, 0x5B7, 0x5B8, 0x5B9, 0x5BA, 0x5BB, 0x5BC, 0x5BD, 0x5BE, 0x5BF,
460 0x5C0, 0x5C1, 0x5C2, 0x5C3, 0x5F0, 0x5F1, 0x5F2, 0x5F3, 0x5F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
461 0x5D0, 0x5D1, 0x5D2, 0x5D3, 0x5D4, 0x5D5, 0x5D6, 0x5D7, 0x5D8, 0x5D9, 0x5DA, 0x5DB, 0x5DC, 0x5DD, 0x5DE, 0x5DF,
462 0x5E0, 0x5E1, 0x5E2, 0x5E3, 0x5E4, 0x5E5, 0x5E6, 0x5E7, 0x5E8, 0x5E9, 0x5EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
463 };
464 for (unsigned char ch : input) {
465 if (ch < 0x80) { // Superset of ASCII
466 TRY(on_code_point(ch));
467 } else {
468 TRY(on_code_point(translation_table[ch - 0x80]));
469 }
470 }
471
472 return {};
473}
474
475ErrorOr<void> CyrillicDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
476{
477 static constexpr Array<u32, 128> translation_table = {
478 0x402, 0x403, 0x201A, 0x453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x409, 0x2039, 0x40A, 0x40C, 0x40B, 0x40F,
479 0x452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x459, 0x203A, 0x45A, 0x45C, 0x45B, 0x45F,
480 0xA0, 0x40E, 0x45E, 0x408, 0xA4, 0x490, 0xA6, 0xA7, 0x401, 0xA9, 0x404, 0xAB, 0xAC, 0xAD, 0xAE, 0x407,
481 0xB0, 0xB1, 0x406, 0x456, 0x491, 0xB5, 0xB6, 0xB7, 0x451, 0x2116, 0x454, 0xBB, 0x458, 0x405, 0x455, 0x457,
482 0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D, 0x41E, 0x41F,
483 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F,
484 0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B, 0x43C, 0x43D, 0x43E, 0x43F,
485 0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x44A, 0x44B, 0x44C, 0x44D, 0x44E, 0x44F
486 };
487 for (unsigned char ch : input) {
488 if (ch < 0x80) { // Superset of ASCII
489 TRY(on_code_point(ch));
490 } else {
491 TRY(on_code_point(translation_table[ch - 0x80]));
492 }
493 }
494
495 return {};
496}
497
498ErrorOr<void> Koi8RDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
499{
500 // clang-format off
501 static constexpr Array<u32, 128> translation_table = {
502 0x2500,0x2502,0x250c,0x2510,0x2514,0x2518,0x251c,0x2524,0x252c,0x2534,0x253c,0x2580,0x2584,0x2588,0x258c,0x2590,
503 0x2591,0x2592,0x2593,0x2320,0x25a0,0x2219,0x221a,0x2248,0x2264,0x2265,0xA0,0x2321,0xb0,0xb2,0xb7,0xf7,
504 0x2550,0x2551,0x2552,0xd191,0x2553,0x2554,0x2555,0x2556,0x2557,0x2558,0x2559,0x255a,0x255b,0x255c,0x255d,0x255e,
505 0x255f,0x2560,0x2561,0xd081,0x2562,0x2563,0x2564,0x2565,0x2566,0x2567,0x2568,0x2569,0x256a,0x256b,0x256c,0xa9,
506 0x44e,0x430,0x431,0x446,0x434,0x435,0x444,0x433,0x445,0x438,0x439,0x43a,0x43b,0x43c,0x43d,0x43e,
507 0x43f,0x44f,0x440,0x441,0x442,0x443,0x436,0x432,0x44c,0x44b,0x437,0x448,0x44d,0x449,0x447,0x44a,
508 0x42e,0x410,0x441,0x426,0x414,0x415,0x424,0x413,0x425,0x418,0x419,0x41a,0x41b,0x41c,0x41d,0x41e,
509 0x41f,0x42f,0x420,0x421,0x422,0x423,0x416,0x412,0x42c,0x42b,0x417,0x428,0x42d,0x429,0x427,0x42a,
510 };
511 // clang-format on
512
513 for (unsigned char ch : input) {
514 if (ch < 0x80) { // Superset of ASCII
515 TRY(on_code_point(ch));
516 } else {
517 TRY(on_code_point(translation_table[ch - 0x80]));
518 }
519 }
520
521 return {};
522}
523
524ErrorOr<void> Latin9Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
525{
526 auto convert_latin9_to_utf8 = [](u8 ch) -> u32 {
527 // Latin9 is the same as the first 256 Unicode code points, except for 8 characters.
528 switch (ch) {
529 case 0xA4:
530 return 0x20AC;
531 case 0xA6:
532 return 0x160;
533 case 0xA8:
534 return 0x161;
535 case 0xB4:
536 return 0x17D;
537 case 0xB8:
538 return 0x17E;
539 case 0xBC:
540 return 0x152;
541 case 0xBD:
542 return 0x153;
543 case 0xBE:
544 return 0x178;
545 default:
546 return ch;
547 }
548 };
549
550 for (auto ch : input) {
551 TRY(on_code_point(convert_latin9_to_utf8(ch)));
552 }
553
554 return {};
555}
556
557ErrorOr<void> MacRomanDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
558{
559 // https://encoding.spec.whatwg.org/index-macintosh.txt
560 // clang-format off
561 static constexpr Array<u32, 128> translation_table = {
562 0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, 0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8,
563 0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, 0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC,
564 0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, 0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8,
565 0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, 0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8,
566 0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, 0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153,
567 0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, 0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02,
568 0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, 0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4,
569 0xF8FF, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, 0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7,
570 };
571 // clang-format on
572
573 for (u8 ch : input) {
574 if (ch < 0x80) { // Superset of ASCII
575 TRY(on_code_point(ch));
576 } else {
577 TRY(on_code_point(translation_table[ch - 0x80]));
578 }
579 }
580
581 return {};
582}
583
584ErrorOr<void> TurkishDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
585{
586 auto convert_turkish_to_utf8 = [](u8 ch) -> u32 {
587 // Turkish (aka ISO-8859-9, Windows-1254) is the same as the first 256 Unicode code points, except for 6 characters.
588 switch (ch) {
589 case 0xD0:
590 return 0x11E;
591 case 0xDD:
592 return 0x130;
593 case 0xDE:
594 return 0x15E;
595 case 0xF0:
596 return 0x11F;
597 case 0xFD:
598 return 0x131;
599 case 0xFE:
600 return 0x15F;
601 default:
602 return ch;
603 }
604 };
605
606 for (auto ch : input) {
607 TRY(on_code_point(convert_turkish_to_utf8(ch)));
608 }
609
610 return {};
611}
612
613// https://encoding.spec.whatwg.org/#x-user-defined-decoder
614ErrorOr<void> XUserDefinedDecoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
615{
616 auto convert_x_user_defined_to_utf8 = [](u8 ch) -> u32 {
617 // 2. If byte is an ASCII byte, return a code point whose value is byte.
618 // https://infra.spec.whatwg.org/#ascii-byte
619 // An ASCII byte is a byte in the range 0x00 (NUL) to 0x7F (DEL), inclusive.
620 // NOTE: This doesn't check for ch >= 0x00, as that would always be true due to being unsigned.
621 if (ch <= 0x7f)
622 return ch;
623
624 // 3. Return a code point whose value is 0xF780 + byte − 0x80.
625 return 0xF780 + ch - 0x80;
626 };
627
628 for (auto ch : input) {
629 TRY(on_code_point(convert_x_user_defined_to_utf8(ch)));
630 }
631
632 // 1. If byte is end-of-queue, return finished.
633
634 return {};
635}
636
637}