this repo has no description
1#!/usr/bin/env python3
2# Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
3import _codecs
4import unittest
5from array import array
6
7from test_support import pyro_only
8
9
10try:
11 from builtins import _str_array
12except ImportError:
13 pass
14
15
class CharmapTests(unittest.TestCase):
    """Checks `_codecs.charmap_decode` against str and dict character maps."""

    def test_charmap_decode_with_empty_bytes_input_returns_tuple(self):
        # Empty input decodes to an empty string for a str map ...
        from_str_map = _codecs.charmap_decode(b"", "strict", "abc")
        self.assertEqual(from_str_map, ("", 0))
        # ... and for an int -> str dict map.
        from_dict_map = _codecs.charmap_decode(
            b"", "strict", {0: "a", 1: "b", 2: "c"}
        )
        self.assertEqual(from_dict_map, ("", 0))

    def test_charmap_decode_with_string_map_raises_unicode_decode_error(self):
        # Byte 0x02 has no entry in the two-character map.
        with self.assertRaises(UnicodeDecodeError):
            _codecs.charmap_decode(b"\x00\x01\x02", "strict", "ab")

    def test_charmap_decode_with_string_map_with_bom_raises_unicode_decode_error(self):
        # Byte 0x02 maps to U+FFFE in the str map.
        with self.assertRaises(UnicodeDecodeError):
            _codecs.charmap_decode(b"\x00\x01\x02", "strict", "ab\ufffe")

    def test_charmap_decode_with_string_map_returns_tuple(self):
        outcome = _codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc")
        self.assertEqual(outcome, ("abc", 3))

    def test_charmap_decode_with_int2str_map_returns_tuple(self):
        # (mapping, expected decoded text); three bytes are always consumed.
        cases = (
            ({0: "a", 1: "b", 2: "c"}, "abc"),
            ({0: "Aa", 1: "Bb", 2: "Cc"}, "AaBbCc"),
            ({0: "\U0010FFFF", 1: "b", 2: "c"}, "\U0010FFFFbc"),
            ({0: "a", 1: "b", 2: ""}, "ab"),
            ({0: "A", 1: "Bb", 2: 37}, "ABb%"),
        )
        for mapping, expected_text in cases:
            self.assertEqual(
                _codecs.charmap_decode(b"\x00\x01\x02", "strict", mapping),
                (expected_text, 3),
            )

    def test_charmap_decode_with_int2str_map_raises_unicode_decode_error(self):
        # Byte 0x02 is missing from the dict.
        with self.assertRaises(UnicodeDecodeError):
            _codecs.charmap_decode(b"\x00\x01\x02", "strict", {0: "a", 1: "b"})

    def test_charmap_decode_with_int2str_map_with_none_raises_unicode_decode_error(
        self,
    ):
        with self.assertRaises(UnicodeDecodeError):
            _codecs.charmap_decode(
                b"\x00\x01\x02", "strict", {0: "a", 1: "b", 2: None}
            )

    def test_charmap_decode_with_int2str_map_with_bom_raises_unicode_decode_error(self):
        with self.assertRaises(UnicodeDecodeError):
            _codecs.charmap_decode(
                b"\x00\x01\x02", "strict", {0: "a", 1: "b", 2: "\ufffe"}
            )

    def test_charmap_decode_with_mapped_value_tuple_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.charmap_decode(
                b"\x00\x01\x02", "strict", {0: "a", 1: "b", 2: ("c",)}
            )

    def test_charmap_decode_with_mapped_value_list_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.charmap_decode(
                b"\x00\x01\x02", "strict", {0: "a", 1: "b", 2: ["c"]}
            )

    def test_charmap_decode_with_mapped_value_out_of_range_min_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.charmap_decode(b"\x00\x01\x02", "strict", {0: "a", 1: "b", 2: -2})

    def test_charmap_decode_with_mapped_value_out_of_range_max_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.charmap_decode(
                b"\x00\x01\x02", "strict", {0: "a", 1: "b", 2: 9999999}
            )
132
133
class CodecsTests(unittest.TestCase):
    """Checks registration and lookup of codecs and error handlers, plus the
    generic `_codecs.encode` / `_codecs.decode` entry points.

    Search functions registered here stay registered for the rest of the
    process, so every test uses its own unique encoding name.
    """

    def test_register_error_with_non_string_first_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.register_error, [], [])

    def test_register_error_with_non_callable_second_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.register_error, "", [])

    def test_lookup_error_with_non_string_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.lookup_error, [])

    def test_lookup_error_with_unknown_name_raises_lookup_error(self):
        self.assertRaises(LookupError, _codecs.lookup_error, "not-an-error")

    def test_lookup_error_with_ignore_returns_ignore_function(self):
        handler = _codecs.lookup_error("ignore")
        self.assertEqual(handler.__name__, "ignore_errors")

    def test_lookup_error_with_strict_returns_strict_function(self):
        handler = _codecs.lookup_error("strict")
        self.assertEqual(handler.__name__, "strict_errors")

    def test_lookup_error_with_registered_error_returns_function(self):
        # The function name is asserted below, so it must stay `test_func`.
        def test_func():
            pass

        _codecs.register_error("test", test_func)
        handler = _codecs.lookup_error("test")
        self.assertEqual(handler.__name__, "test_func")

    def test_register_with_non_callable_fails(self):
        self.assertRaises(TypeError, _codecs.register, "not-a-callable")

    def test_lookup_unknown_codec_raises_lookup_error(self):
        self.assertRaises(LookupError, _codecs.lookup, "not-the-name-of-a-codec")

    def test_lookup_that_doesnt_return_tuple_raises_type_error(self):
        def search(name):
            return "not-a-tuple" if name == "lookup_that_doesnt_return_tuple" else None

        _codecs.register(search)
        self.assertRaises(TypeError, _codecs.lookup, "lookup_that_doesnt_return_tuple")

    def test_lookup_that_doesnt_return_four_tuple_raises_type_error(self):
        def search(name):
            if name != "lookup_that_doesnt_return_four_tuple":
                return None
            return "one", "two", "three"

        _codecs.register(search)
        self.assertRaises(
            TypeError, _codecs.lookup, "lookup_that_doesnt_return_four_tuple"
        )

    def test_lookup_returns_first_registered_codec(self):
        def search_first(name):
            if name != "lookup_return_of_registered_codec":
                return None
            return 1, 2, 3, 4

        def search_second(name):
            if name != "lookup_return_of_registered_codec":
                return None
            return 5, 6, 7, 8

        _codecs.register(search_first)
        _codecs.register(search_second)
        found = _codecs.lookup("lookup_return_of_registered_codec")
        self.assertEqual(found, (1, 2, 3, 4))

    def test_lookup_properly_caches_encodings(self):
        calls = []

        def counting_search(name):
            # Each real search returns a different tuple, so a repeated
            # lookup only compares equal if the result was cached.
            if name in ("incrementing_state_one", "incrementing_state_two"):
                calls.append(1)
                return len(calls), 0, 0, 0
            return None

        _codecs.register(counting_search)
        one_initial = _codecs.lookup("incrementing_state_one")
        two_initial = _codecs.lookup("incrementing_state_two")
        one_repeat = _codecs.lookup("incrementing_state_one")
        self.assertNotEqual(one_initial, two_initial)
        self.assertEqual(one_initial, one_repeat)

    def test_lookup_properly_normalizes_encoding_string(self):
        def search(name):
            # The space in the requested name is expected to arrive as "-".
            return (0, 0, 0, 0) if name == "normalized-string" else None

        _codecs.register(search)
        found = _codecs.lookup("normalized string")
        self.assertEqual(found, (0, 0, 0, 0))

    def test_decode_with_unknown_codec_raises_lookup_error(self):
        with self.assertRaises(LookupError) as caught:
            _codecs.decode(b"bytes", "not-a-codec")
        message = str(caught.exception)
        self.assertEqual(message, "unknown encoding: not-a-codec")

    def test_decode_with_function_with_int_codec_raises_type_error(self):
        def search(name):
            if name != "decode-with-function-with-int-codec":
                return None
            return 0, 0, 0, 0

        _codecs.register(search)
        with self.assertRaises(TypeError) as caught:
            _codecs.decode(b"bytes", "decode-with-function-with-int-codec")
        message = str(caught.exception)
        self.assertIn("'int' object is not callable", message)

    def test_decode_with_function_with_non_tuple_return_raises_type_error(self):
        def faulty_decoder(uni):
            return 0

        def search(name):
            if name != "decode-with-function-with-faulty-codec":
                return None
            # The decoder sits in slot 1 of the codec tuple.
            return 0, faulty_decoder, 0, 0

        _codecs.register(search)
        with self.assertRaises(TypeError) as caught:
            _codecs.decode(b"bytes", "decode-with-function-with-faulty-codec")
        message = str(caught.exception)
        self.assertEqual(message, "decoder must return a tuple (object,integer)")

    def test_decode_with_function_with_tuple_return_returns_first_element(self):
        def two_tuple_decoder(data):
            return ("one", "two")

        def search(name):
            if name != "decode-with-function-with-two-tuple-codec":
                return None
            return 0, two_tuple_decoder, 0, 0

        _codecs.register(search)
        decoded = _codecs.decode(b"bytes", "decode-with-function-with-two-tuple-codec")
        self.assertEqual(decoded, "one")

    def test_decode_with_errors_passes_multiple_arguments(self):
        def echo_decoder(data, errors):
            return (data, errors)

        def search(name):
            if name != "decode-with-function-with-two-arguments":
                return None
            return 0, echo_decoder, 0, 0

        _codecs.register(search)
        decoded = _codecs.decode(
            b"bytes", "decode-with-function-with-two-arguments", "error"
        )
        self.assertEqual(decoded, b"bytes")

    def test_encode_with_unknown_codec_raises_lookup_error(self):
        with self.assertRaises(LookupError) as caught:
            _codecs.encode("str", "not-a-codec")
        message = str(caught.exception)
        self.assertEqual(message, "unknown encoding: not-a-codec")

    def test_encode_with_function_with_int_codec_raises_type_error(self):
        def search(name):
            if name != "encode-with-function-with-int-codec":
                return None
            return 0, 0, 0, 0

        _codecs.register(search)
        with self.assertRaises(TypeError) as caught:
            _codecs.encode("str", "encode-with-function-with-int-codec")
        message = str(caught.exception)
        self.assertIn("'int' object is not callable", message)

    def test_encode_with_function_with_non_tuple_return_raises_type_error(self):
        def faulty_encoder(uni):
            return 0

        def search(name):
            if name != "encode-with-function-with-faulty-codec":
                return None
            # The encoder sits in slot 0 of the codec tuple.
            return faulty_encoder, 0, 0, 0

        _codecs.register(search)
        with self.assertRaises(TypeError) as caught:
            _codecs.encode("str", "encode-with-function-with-faulty-codec")
        message = str(caught.exception)
        self.assertEqual(message, "encoder must return a tuple (object, integer)")

    def test_encode_with_function_with_tuple_return_returns_first_element(self):
        def two_tuple_encoder(text):
            return ("one", "two")

        def search(name):
            if name != "encode-with-function-with-two-tuple-codec":
                return None
            return two_tuple_encoder, 0, 0, 0

        _codecs.register(search)
        encoded = _codecs.encode("str", "encode-with-function-with-two-tuple-codec")
        self.assertEqual(encoded, "one")

    def test_encode_with_errors_passes_multiple_arguments(self):
        def echo_encoder(text, errors):
            return (text, errors)

        def search(name):
            if name != "encode-with-function-with-two-arguments":
                return None
            return echo_encoder, 0, 0, 0

        _codecs.register(search)
        encoded = _codecs.encode(
            "str", "encode-with-function-with-two-arguments", "error"
        )
        self.assertEqual(encoded, "str")

    @pyro_only
    def test_getincrementaldecoder_with_utf_8_returns_utf_8_incremental_decoder(self):
        decoder_type = _codecs.getincrementaldecoder("utf-8")
        self.assertIs(decoder_type, _codecs.UTF8IncrementalDecoder)

    @pyro_only
    def test_getincrementalencoder_with_utf_8_returns_utf_8_incremental_encoder(self):
        encoder_type = _codecs.getincrementalencoder("utf-8")
        self.assertIs(encoder_type, _codecs.UTF8IncrementalEncoder)
353
354
class DecodeASCIITests(unittest.TestCase):
    """Checks `_codecs.ascii_decode` across the supported bytes-like inputs."""

    def test_decode_ascii_with_non_bytes_first_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.ascii_decode, [])

    def test_decode_ascii_with_non_string_second_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.ascii_decode, b"", [])

    def test_decode_ascii_with_zero_length_returns_empty_string(self):
        text, length = _codecs.ascii_decode(b"")
        self.assertEqual(text, "")
        self.assertEqual(length, 0)

    def test_decode_ascii_with_well_formed_ascii_returns_string(self):
        text, length = _codecs.ascii_decode(b"hello")
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_ascii_with_well_formed_ascii_array_returns_string(self):
        source = array("B", b"hello")
        text, length = _codecs.ascii_decode(source)
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_ascii_with_well_formed_ascii_bytearray_returns_string(self):
        source = bytearray(b"hello")
        text, length = _codecs.ascii_decode(source)
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_ascii_with_well_formed_ascii_bytearray_subclass_returns_string(
        self,
    ):
        class B(bytearray):
            pass

        source = B(b"hello")
        text, length = _codecs.ascii_decode(source)
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_ascii_with_well_formed_ascii_memoryview_returns_string(self):
        source = memoryview(b"hello")
        text, length = _codecs.ascii_decode(source)
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_ascii_with_custom_error_handler_returns_string(self):
        def substitute(err):
            # Replace the offending span and resume right after it.
            return ("-testing-", err.end)

        _codecs.register_error("test", substitute)
        text, length = _codecs.ascii_decode(b"ab\x90c", "test")
        self.assertEqual(text, "ab-testing-c")
        self.assertEqual(length, 4)
404
405
class DecodeEscapeTests(unittest.TestCase):
    """Checks `_codecs.escape_decode` and the `_escape_decode_stateful` helper."""

    def test_decode_escape_with_non_bytes_first_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.escape_decode, [])

    def test_decode_escape_with_non_string_second_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.escape_decode, b"", [])

    def test_decode_escape_with_zero_length_returns_empty_string(self):
        data, length = _codecs.escape_decode(b"")
        self.assertEqual(data, b"")
        self.assertEqual(length, 0)

    # TODO(atalaba): This should not need @pyro_only
    @pyro_only
    def test_decode_escape_with_well_formed_latin_1_returns_string(self):
        data, length = _codecs.escape_decode(b"hello\x95")
        self.assertEqual(data, b"hello\xC2\x95")
        self.assertEqual(length, 6)

    def test_decode_escape_with_end_of_string_slash_raises_value_error(self):
        with self.assertRaises(ValueError) as caught:
            _codecs.escape_decode(b"ab\\")
        message = str(caught.exception)
        self.assertEqual(message, "Trailing \\ in string")

    def test_decode_escape_with_truncated_hex_raises_value_error(self):
        with self.assertRaises(ValueError) as caught:
            _codecs.escape_decode(b"ab\\x1h")
        message = str(caught.exception)
        self.assertEqual(message, "invalid \\x escape at position 2")

    def test_decode_escape_with_truncated_hex_unknown_error_raises_value_error(self):
        with self.assertRaises(ValueError) as caught:
            _codecs.escape_decode(b"ab\\x1h", "unknown")
        message = str(caught.exception)
        self.assertEqual(
            message, "decoding error; unknown error handling code: unknown"
        )

    @pyro_only
    def test_decode_escape_stateful_returns_first_invalid_escape(self):
        outcome = _codecs._escape_decode_stateful(b"ab\\yc")
        data, length, first_bad_escape = outcome
        self.assertEqual(data, b"ab\\yc")
        self.assertEqual(length, 5)
        self.assertEqual(first_bad_escape, 3)
451
452
class DecodeLatin1Tests(unittest.TestCase):
    """Checks `_codecs.latin_1_decode` across the supported bytes-like inputs."""

    def test_decode_latin_1_with_non_bytes_first_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.latin_1_decode, [])

    def test_decode_latin_1_with_non_string_second_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.latin_1_decode, b"", [])

    def test_decode_latin_1_with_zero_length_returns_empty_string(self):
        text, length = _codecs.latin_1_decode(b"")
        self.assertEqual(text, "")
        self.assertEqual(length, 0)

    def test_decode_latin_1_with_ascii_returns_string(self):
        text, length = _codecs.latin_1_decode(b"hello")
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_latin_1_with_ascii_array_returns_string(self):
        source = array("B", b"hello")
        text, length = _codecs.latin_1_decode(source)
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_latin_1_with_ascii_bytearray_returns_string(self):
        source = bytearray(b"hello")
        text, length = _codecs.latin_1_decode(source)
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_latin_1_with_ascii_bytearray_subclass_returns_string(self):
        class B(bytearray):
            pass

        source = B(b"hello")
        text, length = _codecs.latin_1_decode(source)
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_latin_1_with_ascii_memoryview_returns_string(self):
        source = memoryview(b"hello")
        text, length = _codecs.latin_1_decode(source)
        self.assertEqual(text, "hello")
        self.assertEqual(length, 5)

    def test_decode_latin_1_with_latin_1_returns_string(self):
        # Input straddles the ASCII / non-ASCII boundary at 0x7F / 0x80.
        text, length = _codecs.latin_1_decode(b"\x7D\x7E\x7F\x80\x81\x82")
        self.assertEqual(text, "\x7D\x7E\x7F\x80\x81\x82")
        self.assertEqual(length, 6)
499
500
class DecodeUnicodeEscapeTests(unittest.TestCase):
    """Checks `_codecs.unicode_escape_decode`, including `\\N{...}` name lookups."""

    def test_decode_unicode_escape_with_non_bytes_first_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.unicode_escape_decode, [])

    def test_decode_unicode_escape_with_non_string_second_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.unicode_escape_decode, b"", [])

    def test_decode_unicode_escape_with_zero_length_returns_empty_string(self):
        text, length = _codecs.unicode_escape_decode(b"")
        self.assertEqual(text, "")
        self.assertEqual(length, 0)

    def test_decode_unicode_escape_with_well_formed_latin_1_returns_string(self):
        text, length = _codecs.unicode_escape_decode(b"hello\x95")
        self.assertEqual(text, "hello\x95")
        self.assertEqual(length, 6)

    def test_decode_unicode_escape_with_well_formed_latin_1_array_returns_string(
        self,
    ):
        source = array("B", b"hello\x95")
        text, length = _codecs.unicode_escape_decode(source)
        self.assertEqual(text, "hello\x95")
        self.assertEqual(length, 6)

    def test_decode_unicode_escape_with_well_formed_latin_1_bytearray_returns_string(
        self,
    ):
        source = bytearray(b"hello\x95")
        text, length = _codecs.unicode_escape_decode(source)
        self.assertEqual(text, "hello\x95")
        self.assertEqual(length, 6)

    def test_decode_unicode_escape_with_latin_1_bytearray_subclass_returns_string(self):
        class B(bytearray):
            pass

        source = B(b"hello\x95")
        text, length = _codecs.unicode_escape_decode(source)
        self.assertEqual(text, "hello\x95")
        self.assertEqual(length, 6)

    def test_decode_unicode_escape_with_well_formed_latin_1_memoryview_returns_string(
        self,
    ):
        source = memoryview(b"hello\x95")
        text, length = _codecs.unicode_escape_decode(source)
        self.assertEqual(text, "hello\x95")
        self.assertEqual(length, 6)

    def test_decode_unicode_escape_with_escaped_back_slash_returns_string(self):
        # The input holds a literal backslash followed by "x95" (9 bytes).
        text, length = _codecs.unicode_escape_decode(b"hello\\x95")
        self.assertEqual(text, "hello\x95")
        self.assertEqual(length, 9)

    def test_decode_unicode_escape_with_valid_hangul_returns_string(self):
        source = b"\\N{HANGUL SYLLABLE BBYAENG}"
        text, length = _codecs.unicode_escape_decode(source)
        self.assertEqual(text, "\uBEC9")
        self.assertEqual(length, 27)

    def test_decode_unicode_escape_with_lowercase_hangul_raises_exception(self):
        self.assertRaises(
            UnicodeDecodeError,
            _codecs.unicode_escape_decode,
            b"\\N{HANGUL SYLLABLE ddalg}",
        )

    def test_decode_unicode_escape_with_invalid_leading_raises_exception(self):
        self.assertRaises(
            UnicodeDecodeError,
            _codecs.unicode_escape_decode,
            b"\\N{HANGUL SYLLABLE BLANJ}",
        )

    def test_decode_unicode_escape_with_invalid_vowel_raises_exception(self):
        self.assertRaises(
            UnicodeDecodeError,
            _codecs.unicode_escape_decode,
            b"\\N{HANGUL SYLLABLE CAOGS}",
        )

    def test_decode_unicode_escape_with_invalid_trailing_raises_exception(self):
        self.assertRaises(
            UnicodeDecodeError,
            _codecs.unicode_escape_decode,
            b"\\N{HANGUL SYLLABLE PYOLL}",
        )

    def test_decode_unicode_escape_with_valid_cjk_ideograph_returns_string(self):
        # Four-hex-digit ideograph name.
        text, length = _codecs.unicode_escape_decode(
            b"\\N{CJK UNIFIED IDEOGRAPH-4DB0}"
        )
        self.assertEqual(text, "\u4DB0")
        self.assertEqual(length, 30)

        # Five-hex-digit ideograph name.
        text, length = _codecs.unicode_escape_decode(
            b"\\N{CJK UNIFIED IDEOGRAPH-2B75A}"
        )
        self.assertEqual(text, "\U0002B75A")
        self.assertEqual(length, 31)

    def test_decode_unicode_escape_with_lowercase_cjk_ideograph_returns_string(self):
        # NOTE: despite the "returns_string" name, this asserts a decode error.
        self.assertRaises(
            UnicodeDecodeError,
            _codecs.unicode_escape_decode,
            b"\\N{CJK UNIFIED IDEOGRAPH-4db0}",
        )

    def test_decode_unicode_escape_with_invalid_cjk_ideograph_returns_string(self):
        # NOTE: despite the "returns_string" name, this asserts a decode error.
        self.assertRaises(
            UnicodeDecodeError,
            _codecs.unicode_escape_decode,
            b"\\N{CJK UNIFIED IDEOGRAPH-4DB6}",
        )

    def test_decode_unicode_escape_with_valid_name_escape_returns_string(self):
        source = b"\\N{LATIN SMALL LETTER A WITH MACRON}"
        text, length = _codecs.unicode_escape_decode(source)
        self.assertEqual(text, "\u0101")
        self.assertEqual(length, 36)

    def test_decode_unicode_escape_with_invalid_word_raises_unicode_decode_error(self):
        self.assertRaises(
            UnicodeDecodeError, _codecs.unicode_escape_decode, b"\\N{INVALID}"
        )

    def test_decode_unicode_escape_with_invalid_name_raises_unicode_decode_error(self):
        self.assertRaises(
            UnicodeDecodeError,
            _codecs.unicode_escape_decode,
            b"\\N{LATIN S LETTER CAPITAL}",
        )

    def test_decode_unicode_escape_with_custom_error_handler_returns_string(self):
        def substitute(err):
            # Replace the offending span and resume right after it.
            return ("-testing-", err.end)

        _codecs.register_error("test", substitute)
        text, length = _codecs.unicode_escape_decode(b"ab\\U90gc", "test")
        self.assertEqual(text, "ab-testing-gc")
        self.assertEqual(length, 8)

    @pyro_only
    def test_decode_unicode_escape_stateful_returns_first_invalid_escape(self):
        outcome = _codecs._unicode_escape_decode_stateful(b"ab\\yc")
        text, length, first_bad_escape = outcome
        self.assertEqual(text, "ab\\yc")
        self.assertEqual(length, 5)
        self.assertEqual(first_bad_escape, 3)
627
628
class DecodeRawUnicodeEscapeTests(unittest.TestCase):
    """Checks `_codecs.raw_unicode_escape_decode`.

    Every test feeds a bytes-like object (optionally with an error-handler
    name) to the decoder and checks either the `(text, consumed)` result or
    the attributes of the raised `UnicodeDecodeError`.
    """

    def test_decode_raw_unicode_escape_with_non_bytes_first_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.raw_unicode_escape_decode([])

    def test_decode_raw_unicode_escape_with_non_string_second_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.raw_unicode_escape_decode(b"", [])

    def test_decode_raw_unicode_escape_with_zero_length_returns_empty_string(self):
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"")
        self.assertEqual(decoded, "")
        self.assertEqual(consumed, 0)

    def test_decode_raw_unicode_escape_with_well_formed_latin_1_returns_string(self):
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"hello\x95")
        self.assertEqual(decoded, "hello\x95")
        self.assertEqual(consumed, 6)

    def test_decode_raw_unicode_escape_with_escaped_back_slash_returns_string(self):
        # The expected text keeps the backslash and "x95" unchanged.
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"hello\\x95")
        self.assertEqual(decoded, "hello\\x95")
        self.assertEqual(consumed, 9)

    def test_decode_raw_unicode_escape_with_well_formed_latin_1_array_returns_string(
        self,
    ):
        decoded, consumed = _codecs.raw_unicode_escape_decode(array("B", b"hello\x95"))
        self.assertEqual(decoded, "hello\x95")
        self.assertEqual(consumed, 6)

    def test_decode_raw_unicode_escape_with_well_formed_latin_1_bytearray_returns_string(
        self,
    ):
        decoded, consumed = _codecs.raw_unicode_escape_decode(bytearray(b"hello\x95"))
        self.assertEqual(decoded, "hello\x95")
        self.assertEqual(consumed, 6)

    def test_decode_raw_unicode_escape_with_well_formed_latin_1_memoryview_returns_string(
        self,
    ):
        decoded, consumed = _codecs.raw_unicode_escape_decode(memoryview(b"hello\x95"))
        self.assertEqual(decoded, "hello\x95")
        self.assertEqual(consumed, 6)

    def test_decode_raw_unicode_escape_with_latin_1_bytearray_subclass_returns_string(
        self,
    ):
        class B(bytearray):
            pass

        decoded, consumed = _codecs.raw_unicode_escape_decode(B(b"hello\x95"))
        self.assertEqual(decoded, "hello\x95")
        self.assertEqual(consumed, 6)

    def test_decode_raw_unicode_escape_with_out_of_range_32_bit_unicode_raises_error(
        self,
    ):
        with self.assertRaises(UnicodeDecodeError) as context:
            _codecs.raw_unicode_escape_decode(b"\\U00FFFFFF")
        exc = context.exception
        self.assertEqual(exc.encoding, "rawunicodeescape")
        self.assertEqual(exc.reason, "\\Uxxxxxxxx out of range")
        self.assertEqual(exc.object, b"\\U00FFFFFF")
        self.assertEqual(exc.start, 0)
        self.assertEqual(exc.end, 10)

    def test_decode_raw_unicode_escape_with_truncated_16_bit_unicode_raises_error(
        self,
    ):
        with self.assertRaises(UnicodeDecodeError) as context:
            _codecs.raw_unicode_escape_decode(b"\\u123")
        exc = context.exception
        self.assertEqual(exc.encoding, "rawunicodeescape")
        self.assertEqual(exc.reason, "truncated \\uXXXX escape")
        self.assertEqual(exc.object, b"\\u123")
        self.assertEqual(exc.start, 0)
        self.assertEqual(exc.end, 5)

    def test_decode_raw_unicode_escape_with_truncated_32_bit_unicode_raises_error(
        self,
    ):
        with self.assertRaises(UnicodeDecodeError) as context:
            _codecs.raw_unicode_escape_decode(b"\\U001234")
        exc = context.exception
        self.assertEqual(exc.encoding, "rawunicodeescape")
        self.assertEqual(exc.reason, "truncated \\UXXXXXXXX escape")
        self.assertEqual(exc.object, b"\\U001234")
        self.assertEqual(exc.start, 0)
        self.assertEqual(exc.end, 8)

    def test_decode_raw_unicode_escape_with_valid_unicode_returns_string(self):
        # The backslash is written explicitly: "\u" is not a recognised
        # escape in a bytes literal, so b"\u26f7" only produced these same
        # six bytes by way of an invalid-escape warning.
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"\\u26f7")
        self.assertEqual(decoded, "\u26F7")
        self.assertEqual(consumed, 6)

    def test_decode_raw_unicode_escape_with_valid_cjk_ideograph_returns_string(self):
        # Both escapes go through the raw codec; the 16-bit case previously
        # called unicode_escape_decode and so never exercised this decoder.
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"\\u4DB0")
        self.assertEqual(decoded, "\u4DB0")
        self.assertEqual(consumed, 6)

        decoded, consumed = _codecs.raw_unicode_escape_decode(b"\\U0002B75A")
        self.assertEqual(decoded, "\U0002B75A")
        self.assertEqual(consumed, 10)

    def test_decode_raw_unicode_escape_with_valid_name_escape_returns_string(self):
        # The expected text is the \N{...} sequence itself, unresolved.
        decoded, consumed = _codecs.raw_unicode_escape_decode(
            b"\\N{HANGUL SYLLABLE BBYAENG}"
        )
        self.assertEqual(decoded, "\\N{HANGUL SYLLABLE BBYAENG}")
        self.assertEqual(consumed, 27)

    def test_decode_raw_unicode_escape_with_invalid_word_returns_string(self):
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"\\N{INVALID}")
        self.assertEqual(decoded, "\\N{INVALID}")
        self.assertEqual(consumed, 11)

    def test_decode_raw_unicode_escape_with_ignore_error_handler_returns_string(self):
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"ab\\U90gc", "ignore")
        self.assertEqual(decoded, "abgc")
        self.assertEqual(consumed, 8)

    def test_decode_raw_unicode_escape_with_custom_error_handler_returns_string(self):
        # The handler substitutes a marker and resumes at the end of the
        # span reported on the exception.
        _codecs.register_error("test", lambda x: ("-testing-", x.end))
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"ab\\U90gc", "test")
        self.assertEqual(decoded, "ab-testing-gc")
        self.assertEqual(consumed, 8)

    def test_decode_raw_unicode_escape_with_replace_error_handler_returns_string(self):
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"ab\\U90gc", "replace")
        self.assertEqual(decoded, "ab\uFFFDgc")
        self.assertEqual(consumed, 8)

    def test_decode_raw_unicode_escape_with_trailing_back_slash_returns_string(
        self,
    ):
        decoded, consumed = _codecs.raw_unicode_escape_decode(b"porcupine\\")
        self.assertEqual(decoded, "porcupine\\")
        self.assertEqual(consumed, 10)
768
769
class DecodeUTF8Tests(unittest.TestCase):
    """Checks `_codecs.utf_8_decode` across the supported bytes-like inputs."""

    def test_decode_utf_8_with_non_bytes_first_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.utf_8_decode, [])

    def test_decode_utf_8_with_non_string_second_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.utf_8_decode, b"", [])

    def test_decode_utf_8_with_zero_length_returns_empty_string(self):
        text, length = _codecs.utf_8_decode(b"")
        self.assertEqual(text, "")
        self.assertEqual(length, 0)

    def test_decode_utf_8_with_well_formed_utf_8_returns_string(self):
        # Mixes 4-, 1-, 2- and 3-byte sequences (11 bytes in total).
        source = b"\xf0\x9f\x86\x92h\xc3\xa4l\xe2\xb3\x80"
        text, length = _codecs.utf_8_decode(source)
        self.assertEqual(text, "\U0001f192h\xe4l\u2cc0")
        self.assertEqual(length, 11)

    def test_decode_utf_8_with_well_formed_utf8_array_returns_string(self):
        source = array("B", b"\xf0\x9f\x86\x92h\xc3\xa4l\xe2\xb3\x80")
        text, length = _codecs.utf_8_decode(source)
        self.assertEqual(text, "\U0001f192h\xe4l\u2cc0")
        self.assertEqual(length, 11)

    def test_decode_utf_8_with_well_formed_utf8_bytearray_returns_string(self):
        source = bytearray(b"\xf0\x9f\x86\x92h\xc3\xa4l\xe2\xb3\x80")
        text, length = _codecs.utf_8_decode(source)
        self.assertEqual(text, "\U0001f192h\xe4l\u2cc0")
        self.assertEqual(length, 11)

    def test_decode_utf_8_with_well_formed_utf8_bytearray_subclass_returns_string(self):
        class B(bytearray):
            pass

        source = B(b"\xf0\x9f\x86\x92h\xc3\xa4l\xe2\xb3\x80")
        text, length = _codecs.utf_8_decode(source)
        self.assertEqual(text, "\U0001f192h\xe4l\u2cc0")
        self.assertEqual(length, 11)

    def test_decode_utf_8_with_well_formed_utf8_memoryview_returns_string(self):
        source = memoryview(b"\xf0\x9f\x86\x92h\xc3\xa4l\xe2\xb3\x80")
        text, length = _codecs.utf_8_decode(source)
        self.assertEqual(text, "\U0001f192h\xe4l\u2cc0")
        self.assertEqual(length, 11)

    def test_decode_utf_8_with_custom_error_handler_returns_string(self):
        def substitute(err):
            # Replace the offending span and resume right after it.
            return ("-testing-", err.end)

        _codecs.register_error("test", substitute)
        text, length = _codecs.utf_8_decode(b"ab\x90c", "test")
        self.assertEqual(text, "ab-testing-c")
        self.assertEqual(length, 4)

    def test_decode_utf_8_with_invalid_start_byte_raises_decode_error(self):
        with self.assertRaises(UnicodeDecodeError) as caught:
            _codecs.utf_8_decode(b"ab\x90c")
        reason = str(caught.exception.reason)
        self.assertEqual(reason, "invalid start byte")
832
833
class EncodeASCIITests(unittest.TestCase):
    """Checks `_codecs.ascii_encode`, including custom error handlers."""

    def test_encode_ascii_with_non_str_first_argument_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.ascii_encode, [])

    def test_encode_ascii_with_non_str_second_argument_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.ascii_encode, "", [])

    def test_encode_ascii_with_zero_length_returns_empty_bytes(self):
        data, length = _codecs.ascii_encode("")
        self.assertEqual(data, b"")
        self.assertEqual(length, 0)

    def test_encode_ascii_with_well_formed_ascii_returns_bytes(self):
        data, length = _codecs.ascii_encode("hello")
        self.assertEqual(data, b"hello")
        self.assertEqual(length, 5)

    def test_encode_ascii_with_well_formed_latin_1_raises_encode_error(self):
        self.assertRaises(UnicodeEncodeError, _codecs.ascii_encode, "hell\xe5")

    def test_encode_ascii_with_custom_error_handler_mid_bytes_error_returns_bytes(self):
        def substitute(err):
            # Hand back replacement bytes and resume right after the span.
            return (b"-testing-", err.end)

        _codecs.register_error("test", substitute)
        data, length = _codecs.ascii_encode("ab\udc80c", "test")
        self.assertEqual(data, b"ab-testing-c")
        self.assertEqual(length, 4)

    def test_encode_ascii_with_custom_error_handler_end_bytes_error_returns_bytes(self):
        def substitute(err):
            return (b"-testing-", err.end)

        _codecs.register_error("test", substitute)
        data, length = _codecs.ascii_encode("ab\x80", "test")
        self.assertEqual(data, b"ab-testing-")
        self.assertEqual(length, 3)

    def test_encode_ascii_with_non_ascii_error_handler_raises_encode_error(self):
        def substitute_non_ascii(err):
            # The replacement text is itself outside the ASCII range.
            return ("\x80", err.end)

        _codecs.register_error("test", substitute_non_ascii)
        with self.assertRaises(UnicodeEncodeError) as caught:
            _codecs.ascii_encode("ab\x80", "test")
        error = caught.exception
        self.assertEqual(error.encoding, "ascii")
        self.assertEqual(error.reason, "ordinal not in range(128)")
        self.assertEqual(error.object, "ab\x80")
        self.assertEqual(error.start, 2)
        self.assertEqual(error.end, 3)
879
880
class EncodeLatin1Tests(unittest.TestCase):
    """Tests for ``_codecs.latin_1_encode``: argument checks, output, error handlers."""

    def test_encode_latin_1_with_non_str_first_argument_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.latin_1_encode([])

    def test_encode_latin_1_with_non_str_second_argument_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.latin_1_encode("", [])

    def test_encode_latin_1_with_zero_length_returns_empty_bytes(self):
        encoded, consumed = _codecs.latin_1_encode("")
        self.assertEqual(encoded, b"")
        self.assertEqual(consumed, 0)

    def test_encode_latin_1_with_well_formed_latin_1_returns_bytes(self):
        encoded, consumed = _codecs.latin_1_encode("hell\xe5")
        self.assertEqual(encoded, b"hell\xe5")
        self.assertEqual(consumed, 5)

    def test_encode_latin_1_with_well_formed_non_latin_1_raises_encode_error(self):
        # U+01FF is above 0xFF; with no error handler given, the Latin-1
        # encoder itself (not the ASCII one) must reject it.
        with self.assertRaises(UnicodeEncodeError):
            _codecs.latin_1_encode("hell\u01ff")

    def test_encode_latin_1_with_custom_error_handler_mid_bytes_error_returns_bytes(
        self,
    ):
        # The handler's bytes replace the unencodable character at index 2.
        _codecs.register_error("test", lambda x: (b"-testing-", x.end))
        encoded, consumed = _codecs.latin_1_encode("ab\udc80c", "test")
        self.assertEqual(encoded, b"ab-testing-c")
        self.assertEqual(consumed, 4)

    def test_encode_latin_1_with_custom_error_handler_end_bytes_error_returns_bytes(
        self,
    ):
        _codecs.register_error("test", lambda x: (b"-testing-", x.end))
        encoded, consumed = _codecs.latin_1_encode("ab\u0180", "test")
        self.assertEqual(encoded, b"ab-testing-")
        self.assertEqual(consumed, 3)

    def test_encode_latin_1_with_non_ascii_error_handler_returns_bytes(self):
        # "\x80" is not ASCII but is still below 0x100, so the str replacement
        # is expected to be encoded rather than rejected.
        _codecs.register_error("test", lambda x: ("\x80", x.end))
        encoded, consumed = _codecs.latin_1_encode("ab\u0180", "test")
        self.assertEqual(encoded, b"ab\x80")
        self.assertEqual(consumed, 3)

    def test_encode_latin_1_with_non_latin_1_error_handler_raises_encode_error(self):
        # The replacement text is itself above 0xFF, so encoding must fail and
        # report the original offending span.
        _codecs.register_error("test", lambda x: ("\u0180", x.end))
        with self.assertRaises(UnicodeEncodeError) as context:
            _codecs.latin_1_encode("ab\u0f80", "test")
        exc = context.exception
        self.assertEqual(exc.encoding, "latin-1")
        self.assertEqual(exc.reason, "ordinal not in range(256)")
        self.assertEqual(exc.object, "ab\u0f80")
        self.assertEqual(exc.start, 2)
        self.assertEqual(exc.end, 3)
936
937
class EncodeUTF16Tests(unittest.TestCase):
    """Covers ``_codecs.utf_16_encode`` and its ``_le`` / ``_be`` variants.

    The expected values for ``utf_16_encode`` start with the byte-order mark
    FF FE followed by little-endian code units.
    """

    def test_encode_utf_16_with_non_str_first_argument_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.utf_16_encode, [])

    def test_encode_utf_16_with_non_str_second_argument_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.utf_16_encode, "", [])

    def test_encode_utf_16_with_zero_length_returns_bom(self):
        self.assertEqual(_codecs.utf_16_encode(""), (b"\xff\xfe", 0))

    def test_encode_utf_16_with_ascii_returns_bytes(self):
        self.assertEqual(_codecs.utf_16_encode("hi"), (b"\xff\xfeh\x00i\x00", 2))

    def test_encode_utf_16_with_latin_1_returns_bytes(self):
        outcome = _codecs.utf_16_encode("h\xe5")
        self.assertEqual(outcome, (b"\xff\xfeh\x00\xe5\x00", 2))

    def test_encode_utf_16_with_bmp_returns_bytes(self):
        outcome = _codecs.utf_16_encode("h\u1005")
        self.assertEqual(outcome, (b"\xff\xfeh\x00\x05\x10", 2))

    def test_encode_utf_16_with_supplementary_plane_returns_bytes(self):
        # The supplementary character becomes a surrogate pair, yet still
        # counts as a single consumed character.
        outcome = _codecs.utf_16_encode("h\U0001d1f0i")
        self.assertEqual(outcome, (b"\xff\xfeh\x004\xd8\xf0\xddi\x00", 3))

    def test_encode_utf_16_le_with_supplementary_plane_returns_bytes(self):
        outcome = _codecs.utf_16_le_encode("h\U0001d1f0i")
        self.assertEqual(outcome, (b"h\x004\xd8\xf0\xddi\x00", 3))

    def test_encode_utf_16_be_with_supplementary_plane_returns_bytes(self):
        outcome = _codecs.utf_16_be_encode("h\U0001d1f0i")
        self.assertEqual(outcome, (b"\x00h\xd84\xdd\xf0\x00i", 3))

    def test_encode_utf_16_with_custom_error_handler_mid_bytes_error_returns_bytes(
        self,
    ):
        def substitute(error):
            return b"--", error.end

        _codecs.register_error("test", substitute)
        # The handler's bytes appear verbatim in place of the lone surrogate.
        outcome = _codecs.utf_16_encode("ab\udc80c", "test")
        self.assertEqual(outcome, (b"\xff\xfea\x00b\x00--c\x00", 4))

    def test_encode_utf_16_with_custom_error_handler_end_bytes_error_returns_bytes(
        self,
    ):
        def substitute(error):
            return b"--", error.end

        _codecs.register_error("test", substitute)
        outcome = _codecs.utf_16_encode("ab\udc80", "test")
        self.assertEqual(outcome, (b"\xff\xfea\x00b\x00--", 3))

    def test_encode_utf_16_with_string_returning_error_handler_returns_bytes(self):
        def substitute(error):
            return "h", error.end

        _codecs.register_error("test", substitute)
        # A str replacement is expected to come out as a UTF-16 code unit.
        outcome = _codecs.utf_16_encode("ab\udc80", "test")
        self.assertEqual(outcome, (b"\xff\xfea\x00b\x00h\x00", 3))

    def test_encode_utf_16_with_non_ascii_error_handler_raises_encode_error(self):
        def substitute(error):
            return "\x80", error.end

        _codecs.register_error("test", substitute)
        with self.assertRaises(UnicodeEncodeError) as caught:
            _codecs.utf_16_encode("ab\udc80", "test")
        error = caught.exception
        observed = (
            error.encoding,
            error.reason,
            error.object,
            error.start,
            error.end,
        )
        self.assertEqual(
            observed, ("utf-16", "surrogates not allowed", "ab\udc80", 2, 3)
        )
1014
1015
class EncodeUTF32Tests(unittest.TestCase):
    """Covers ``_codecs.utf_32_encode`` and its ``_le`` / ``_be`` variants.

    The expected values for ``utf_32_encode`` start with the byte-order mark
    FF FE 00 00 followed by little-endian code units.
    """

    def test_encode_utf_32_with_non_str_first_argument_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.utf_32_encode, [])

    def test_encode_utf_32_with_non_str_second_argument_raises_type_error(self):
        self.assertRaises(TypeError, _codecs.utf_32_encode, "", [])

    def test_encode_utf_32_with_zero_length_returns_bom(self):
        self.assertEqual(_codecs.utf_32_encode(""), (b"\xff\xfe\x00\x00", 0))

    def test_encode_utf_32_with_ascii_returns_bytes(self):
        expected = b"\xff\xfe\x00\x00h\x00\x00\x00i\x00\x00\x00"
        self.assertEqual(_codecs.utf_32_encode("hi"), (expected, 2))

    def test_encode_utf_32_with_latin_1_returns_bytes(self):
        expected = b"\xff\xfe\x00\x00h\x00\x00\x00\xe5\x00\x00\x00"
        self.assertEqual(_codecs.utf_32_encode("h\xe5"), (expected, 2))

    def test_encode_utf_32_with_bmp_returns_bytes(self):
        expected = b"\xff\xfe\x00\x00h\x00\x00\x00\x05\x10\x00\x00"
        self.assertEqual(_codecs.utf_32_encode("h\u1005"), (expected, 2))

    def test_encode_utf_32_with_supplementary_plane_returns_bytes(self):
        expected = b"\xff\xfe\x00\x00h\x00\x00\x00\xf0\xd1\x01\x00i\x00\x00\x00"
        self.assertEqual(_codecs.utf_32_encode("h\U0001d1f0i"), (expected, 3))

    def test_encode_utf_32_le_with_supplementary_plane_returns_bytes(self):
        expected = b"h\x00\x00\x00\xf0\xd1\x01\x00i\x00\x00\x00"
        self.assertEqual(_codecs.utf_32_le_encode("h\U0001d1f0i"), (expected, 3))

    def test_encode_utf_32_be_with_supplementary_plane_returns_bytes(self):
        expected = b"\x00\x00\x00h\x00\x01\xd1\xf0\x00\x00\x00i"
        self.assertEqual(_codecs.utf_32_be_encode("h\U0001d1f0i"), (expected, 3))

    def test_encode_utf_32_with_custom_error_handler_mid_bytes_error_returns_bytes(
        self,
    ):
        def substitute(error):
            return b"----", error.end

        _codecs.register_error("test", substitute)
        # The handler's bytes appear verbatim in place of the lone surrogate.
        expected = b"\xff\xfe\x00\x00a\x00\x00\x00b\x00\x00\x00----c\x00\x00\x00"
        self.assertEqual(_codecs.utf_32_encode("ab\udc80c", "test"), (expected, 4))

    def test_encode_utf_32_with_custom_error_handler_end_bytes_error_returns_bytes(
        self,
    ):
        def substitute(error):
            return b"----", error.end

        _codecs.register_error("test", substitute)
        expected = b"\xff\xfe\x00\x00a\x00\x00\x00b\x00\x00\x00----"
        self.assertEqual(_codecs.utf_32_encode("ab\udc80", "test"), (expected, 3))

    def test_encode_utf_32_with_string_returning_error_handler_returns_bytes(self):
        def substitute(error):
            return "h", error.end

        _codecs.register_error("test", substitute)
        # A str replacement is expected to come out as a UTF-32 code unit.
        expected = b"\xff\xfe\x00\x00a\x00\x00\x00b\x00\x00\x00h\x00\x00\x00"
        self.assertEqual(_codecs.utf_32_encode("ab\udc80", "test"), (expected, 3))

    def test_encode_utf_32_with_non_ascii_error_handler_raises_encode_error(self):
        def substitute(error):
            return "\x80", error.end

        _codecs.register_error("test", substitute)
        with self.assertRaises(UnicodeEncodeError) as caught:
            _codecs.utf_32_encode("ab\udc80", "test")
        error = caught.exception
        observed = (
            error.encoding,
            error.reason,
            error.object,
            error.start,
            error.end,
        )
        self.assertEqual(
            observed, ("utf-32", "surrogates not allowed", "ab\udc80", 2, 3)
        )
1098
1099
class EncodeUTF8Tests(unittest.TestCase):
    """Tests for ``_codecs.utf_8_encode``: argument checks, surrogates, error handlers."""

    def test_encode_utf_8_with_non_str_first_argument_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.utf_8_encode([])

    def test_encode_utf_8_with_non_str_second_argument_raises_type_error(self):
        with self.assertRaises(TypeError):
            _codecs.utf_8_encode("", [])

    def test_encode_utf_8_with_zero_length_returns_empty_bytes(self):
        encoded, consumed = _codecs.utf_8_encode("")
        self.assertEqual(encoded, b"")
        self.assertEqual(consumed, 0)

    def test_encode_utf_8_with_well_formed_ascii_returns_bytes(self):
        encoded, consumed = _codecs.utf_8_encode("hello")
        self.assertEqual(encoded, b"hello")
        self.assertEqual(consumed, 5)

    def test_encode_utf_8_with_surrogatepass_passes_surrogate(self):
        # low (trail) surrogate: U+DC80 lies in U+DC00..U+DFFF
        encoded, consumed = _codecs.utf_8_encode("ab\udc80c", "surrogatepass")
        self.assertEqual(encoded, b"ab\xed\xb2\x80c")
        self.assertEqual(consumed, 4)

        # high (lead) surrogate: U+D9A0 lies in U+D800..U+DBFF
        encoded, consumed = _codecs.utf_8_encode("ab\ud9a0c", "surrogatepass")
        self.assertEqual(encoded, b"ab\xed\xa6\xa0c")
        self.assertEqual(consumed, 4)

    def test_encode_utf_8_without_surrogatepass_raises_on_surrogate(self):
        with self.assertRaises(UnicodeEncodeError):
            _codecs.utf_8_encode("ab\udc80c")

    def test_encode_utf_8_with_custom_error_handler_mid_bytes_error_returns_bytes(self):
        # The handler's bytes replace the lone surrogate at index 2.
        _codecs.register_error("test", lambda x: (b"-testing-", x.end))
        encoded, consumed = _codecs.utf_8_encode("ab\udc80c", "test")
        self.assertEqual(encoded, b"ab-testing-c")
        self.assertEqual(consumed, 4)

    def test_encode_utf_8_with_custom_error_handler_end_bytes_error_returns_bytes(self):
        _codecs.register_error("test", lambda x: (b"-testing-", x.end))
        encoded, consumed = _codecs.utf_8_encode("ab\udc80", "test")
        self.assertEqual(encoded, b"ab-testing-")
        self.assertEqual(consumed, 3)

    def test_encode_utf_8_with_non_ascii_error_handler_raises_encode_error(self):
        # A non-ASCII str replacement is expected to be rejected, with the
        # error reporting the original surrogate's span.
        _codecs.register_error("test", lambda x: ("\x80", x.end))
        with self.assertRaises(UnicodeEncodeError) as context:
            _codecs.utf_8_encode("ab\udc80", "test")
        exc = context.exception
        self.assertEqual(exc.encoding, "utf-8")
        self.assertEqual(exc.reason, "surrogates not allowed")
        self.assertEqual(exc.object, "ab\udc80")
        self.assertEqual(exc.start, 2)
        self.assertEqual(exc.end, 3)
1156
1157
class EncodeRawUnicodeEscapeTests(unittest.TestCase):
    """Covers ``_codecs.raw_unicode_escape_encode`` for bad arguments and plain input."""

    def test_encode_raw_unicode_escape_with_non_str_first_argument_raises_type_error(
        self,
    ):
        self.assertRaises(TypeError, _codecs.raw_unicode_escape_encode, [])

    def test_encode_raw_unicode_escape_with_non_str_second_argument_raises_type_error(
        self,
    ):
        self.assertRaises(TypeError, _codecs.raw_unicode_escape_encode, "", [])

    def test_encode_raw_unicode_escape_with_zero_length_returns_bytes(self):
        self.assertEqual(_codecs.raw_unicode_escape_encode(""), (b"", 0))

    def test_encode_raw_unicode_escape_with_ascii_returns_bytes(self):
        self.assertEqual(_codecs.raw_unicode_escape_encode("hi"), (b"hi", 2))

    def test_encode_raw_unicode_escape_with_latin_1_returns_bytes(self):
        # Latin-1 characters are expected as single raw bytes, not escapes.
        outcome = _codecs.raw_unicode_escape_encode("h\xe5")
        self.assertEqual(outcome, (b"h\xe5", 2))

    def test_encode_raw_unicode_escape_with_bmp_returns_bytes(self):
        # A BMP character above Latin-1 is expected as a \uXXXX escape.
        outcome = _codecs.raw_unicode_escape_encode("h\u1005")
        self.assertEqual(outcome, (b"h\\u1005", 2))

    def test_encode_raw_unicode_escape_with_supplementary_plane_returns_bytes(self):
        # A supplementary-plane character is expected as a \UXXXXXXXX escape.
        outcome = _codecs.raw_unicode_escape_encode("h\U0001d1f0i")
        self.assertEqual(outcome, (b"h\\U0001d1f0i", 3))
1195
1196
class ErrorHandlerTests(unittest.TestCase):
    """Checks the "backslashreplace", "ignore" and "strict" handlers from lookup_error."""

    def test_backslashreplace_with_non_unicode_error_raises_type_error(self):
        backslashreplace = _codecs.lookup_error("backslashreplace")
        warning = UserWarning()
        message = "don't know how to handle UserWarning in error callback"
        with self.assertRaisesRegex(TypeError, message):
            backslashreplace(warning)

    def test_backslashreplace_with_unicode_decode_error_returns_tuple(self):
        backslashreplace = _codecs.lookup_error("backslashreplace")
        cases = (
            (UnicodeDecodeError("foo", b"a", 0, 1, "baz"), ("\\x61", 1)),
            (
                UnicodeDecodeError("foo", b"i need \xca\xfe now!", 6, 11, "baz"),
                ("\\x20\\xca\\xfe\\x20\\x6e", 11),
            ),
        )
        for error, expected in cases:
            outcome = backslashreplace(error)
            self.assertIs(type(outcome), tuple)
            self.assertEqual(outcome, expected)

    def test_backslashreplace_with_unicode_encode_error_returns_tuple(self):
        backslashreplace = _codecs.lookup_error("backslashreplace")

        def escape(text, start, end):
            return backslashreplace(UnicodeEncodeError("foo", text, start, end, "baz"))

        outcome = escape("a", 0, 1)
        self.assertIs(type(outcome), tuple)
        self.assertEqual(outcome, ("\\x61", 1))

        # One character from each escape width: \x, \u and \U.
        single_chars = (
            ("\xdc", "\\xdc"),
            ("\u1234", "\\u1234"),
            ("\U00012345", "\\U00012345"),
        )
        for char, escaped in single_chars:
            self.assertEqual(escape(char, 0, 1), (escaped, 1))

        outcome = escape("hello\xac\u4213\U0001f40dbaz!", 4, 9)
        self.assertIs(type(outcome), tuple)
        self.assertEqual(outcome, ("\\x6f\\xac\\u4213\\U0001f40d\\x62", 9))

    def test_backslashreplace_with_unicode_translate_error_returns_tuple(self):
        backslashreplace = _codecs.lookup_error("backslashreplace")

        def escape(text, start, end):
            return backslashreplace(UnicodeTranslateError(text, start, end, "baz"))

        outcome = escape("a", 0, 1)
        self.assertIs(type(outcome), tuple)
        self.assertEqual(outcome, ("\\x61", 1))

        # One character from each escape width: \x, \u and \U.
        single_chars = (
            ("\xdc", "\\xdc"),
            ("\u1234", "\\u1234"),
            ("\U00012345", "\\U00012345"),
        )
        for char, escaped in single_chars:
            self.assertEqual(escape(char, 0, 1), (escaped, 1))

        outcome = escape("hello\xac\u4213\U0001f40dbaz!", 4, 9)
        self.assertIs(type(outcome), tuple)
        self.assertEqual(outcome, ("\\x6f\\xac\\u4213\\U0001f40d\\x62", 9))

    def test_ignore_with_unicode_encode_error_returns_tuple(self):
        ignore = _codecs.lookup_error("ignore")
        error = UnicodeEncodeError("foo", "bar", 44, 2, "baz")
        self.assertEqual(ignore(error), ("", 2))

    def test_ignore_with_unicode_decode_error_returns_tuple(self):
        ignore = _codecs.lookup_error("ignore")
        error = UnicodeDecodeError("foo", b"barbam", 99, 3, "baz")
        self.assertEqual(ignore(error), ("", 3))

    def test_ignore_with_unicode_translate_error_returns_tuple(self):
        ignore = _codecs.lookup_error("ignore")
        error = UnicodeTranslateError("barbazbam", 99, 5, "baz")
        self.assertEqual(ignore(error), ("", 5))

    def test_ignore_with_non_unicode_error_raises_type_error(self):
        ignore = _codecs.lookup_error("ignore")
        message = "don't know how to handle int in error callback"
        self.assertRaisesRegex(TypeError, message, ignore, 42)

    def test_strict_with_exception_raises(self):
        strict = _codecs.lookup_error("strict")
        warning = UserWarning()
        with self.assertRaises(UserWarning) as caught:
            strict(warning)
        # The very same instance must be re-raised, not a copy.
        self.assertIs(caught.exception, warning)

    def test_strict_without_exception_raises_type_error(self):
        strict = _codecs.lookup_error("strict")
        message = "codec must pass exception instance"
        self.assertRaisesRegex(TypeError, message, strict, 42)
1286
1287
@pyro_only
class GeneralizedErrorHandlerTests(unittest.TestCase):
    """Calls ``_codecs._call_decode_errorhandler`` / ``_call_encode_errorhandler`` directly.

    Each test registers (or looks up) an error handler by name and checks how
    the helper reacts to what that handler returns.
    """

    def _decode(self, errors, output=None):
        """Run the decode helper on b"bad_input" with positions 1 and 2."""
        if output is None:
            output = _str_array()
        return _codecs._call_decode_errorhandler(
            errors, b"bad_input", output, "reason", "encoding", 1, 2
        )

    def _encode(self, errors):
        """Run the encode helper on "bad_input" with positions 1 and 2."""
        return _codecs._call_encode_errorhandler(
            errors, "bad_input", "reason", "encoding", 1, 2
        )

    def test_call_decode_error_with_strict_raises_unicode_decode_error(self):
        with self.assertRaises(UnicodeDecodeError):
            _codecs._call_decode_errorhandler(
                "strict",
                b"bad input",
                _str_array(),
                "reason",
                "encoding",
                0,
                0,
            )

    def test_call_decode_error_with_ignore_returns_tuple(self):
        returned_input, resume_at = self._decode("ignore")
        self.assertEqual((returned_input, resume_at), (b"bad_input", 2))

    def test_call_decode_error_with_non_tuple_return_raises_type_error(self):
        _codecs.register_error("not-a-tuple", lambda exc: "not-a-tuple")
        with self.assertRaises(TypeError):
            self._decode("not-a-tuple")

    def test_call_decode_error_with_small_tuple_return_raises_type_error(self):
        _codecs.register_error("small-tuple", lambda exc: ("one",))
        with self.assertRaises(TypeError):
            self._decode("small-tuple")

    def test_call_decode_error_with_int_first_tuple_return_raises_type_error(self):
        _codecs.register_error("int-first", lambda exc: (1, 1))
        with self.assertRaises(TypeError):
            self._decode("int-first")

    def test_call_decode_error_with_string_second_tuple_return_raises_type_error(self):
        _codecs.register_error("str-second", lambda exc: ("str_to_append", "new_pos"))
        with self.assertRaises(TypeError):
            self._decode("str-second")

    def test_call_decode_error_with_non_bytes_changed_input_returns_error(self):
        def swap_input_for_int(error):
            # Replace the object being decoded with something that is not bytes.
            error.object = 1
            return "str_to_append", error.end

        _codecs.register_error("change-input-to-int", swap_input_for_int)
        with self.assertRaises(TypeError):
            self._decode("change-input-to-int")

    def test_call_decode_error_with_improper_index_returns_error(self):
        # 10 is past the end of the 9-byte input.
        _codecs.register_error("out-of-bounds-pos", lambda exc: ("str_to_append", 10))
        with self.assertRaises(IndexError):
            self._decode("out-of-bounds-pos")

    def test_call_decode_error_with_negative_index_return_returns_proper_index(self):
        # -1 is expected to be reported as index 8 of the 9-byte input.
        _codecs.register_error("negative-pos", lambda exc: ("str_to_append", -1))
        returned_input, resume_at = self._decode("negative-pos")
        self.assertEqual((returned_input, resume_at), (b"bad_input", 8))

    def test_call_decode_appends_string_to_output(self):
        _codecs.register_error(
            "well-behaved-test", lambda exc: ("str_to_append", exc.end)
        )
        output = _str_array()
        self._decode("well-behaved-test", output)
        self.assertEqual(str(output), "str_to_append")

    def test_call_encode_error_with_strict_calls_function(self):
        with self.assertRaises(UnicodeEncodeError):
            _codecs._call_encode_errorhandler(
                "strict",
                "bad_input",
                "reason",
                "encoding",
                0,
                0,
            )

    def test_call_encode_error_with_ignore_calls_function(self):
        replacement, resume_at = self._encode("ignore")
        self.assertEqual((replacement, resume_at), ("", 2))

    def test_call_encode_error_with_non_tuple_return_raises_type_error(self):
        _codecs.register_error("not-a-tuple", lambda exc: "not-a-tuple")
        with self.assertRaises(TypeError):
            self._encode("not-a-tuple")

    def test_call_encode_error_with_small_tuple_return_raises_type_error(self):
        _codecs.register_error("small-tuple", lambda exc: ("one",))
        with self.assertRaises(TypeError):
            self._encode("small-tuple")

    def test_call_encode_error_with_int_first_tuple_return_raises_type_error(self):
        _codecs.register_error("int-first", lambda exc: (1, 1))
        with self.assertRaises(TypeError):
            self._encode("int-first")

    def test_call_encode_error_with_str_second_tuple_return_raises_type_error(self):
        _codecs.register_error("str-second", lambda exc: ("str_to_append", "new_pos"))
        with self.assertRaises(TypeError):
            self._encode("str-second")

    def test_call_encode_error_with_changed_input_ignores_change(self):
        def swap_input_for_int(error):
            # The encode helper is expected to ignore this reassignment.
            error.object = 1
            return "str_to_append", error.end

        _codecs.register_error("change-input-to-int", swap_input_for_int)
        replacement, resume_at = self._encode("change-input-to-int")
        self.assertEqual((replacement, resume_at), ("str_to_append", 2))

    def test_call_encode_error_with_out_of_bounds_index_raises_index_error(self):
        # 10 is past the end of the 9-character input.
        _codecs.register_error("out-of-bounds-pos", lambda exc: ("str_to_append", 10))
        with self.assertRaises(IndexError):
            self._encode("out-of-bounds-pos")

    def test_call_encode_error_with_negative_index_returns_proper_index(self):
        # -1 is expected to be reported as index 8 of the 9-character input.
        _codecs.register_error("negative-pos", lambda exc: ("str_to_append", -1))
        replacement, resume_at = self._encode("negative-pos")
        self.assertEqual((replacement, resume_at), ("str_to_append", 8))
1483
1484
# Run every test case defined in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()