this repo has no description
1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
2#include "builtins.h"
3#include "bytearray-builtins.h"
4#include "bytes-builtins.h"
5#include "byteslike.h"
6#include "formatter-utils.h"
7#include "frame.h"
8#include "int-builtins.h"
9#include "modules.h"
10#include "runtime.h"
11#include "str-builtins.h"
12#include "unicode-db.h"
13#include "unicode.h"
14#include "utils.h"
15
16namespace py {
17
// Byte written in place of an unencodable code point by the "replace" error
// handler of the encoders in this file.
const char kASCIIReplacement{'?'};
19
20static SymbolId lookupSymbolForErrorHandler(const Str& error) {
21 if (error.equalsCStr("strict")) {
22 return ID(strict);
23 }
24 if (error.equalsCStr("ignore")) {
25 return ID(ignore);
26 }
27 if (error.equalsCStr("replace")) {
28 return ID(replace);
29 }
30 if (error.equalsCStr("surrogateescape")) {
31 return ID(surrogateescape);
32 }
33 if (error.equalsCStr("surrogatepass")) {
34 return ID(surrogatepass);
35 }
36 return SymbolId::kInvalid;
37}
38
39static int asciiDecode(Thread* thread, const StrArray& dst,
40 const Byteslike& src, word start, word end) {
41 // TODO(T41032331): Implement a fastpass to read longs instead of chars
42 Runtime* runtime = thread->runtime();
43 for (word i = start; i < end; i++) {
44 byte ch = src.byteAt(i);
45 if (ch > kMaxASCII) {
46 return i;
47 }
48 runtime->strArrayAddASCII(thread, dst, ch);
49 }
50 return end;
51}
52
53RawObject FUNC(_codecs, _ascii_decode)(Thread* thread, Arguments args) {
54 Runtime* runtime = thread->runtime();
55 HandleScope scope(thread);
56 Object data(&scope, args.get(0));
57 Str errors(&scope, strUnderlying(args.get(1)));
58 word index = intUnderlying(args.get(2)).asWord();
59 StrArray dst(&scope, args.get(3));
60
61 Byteslike bytes(&scope, thread, *data);
62 word length = bytes.length();
63 runtime->strArrayEnsureCapacity(thread, dst, length);
64 word outpos = asciiDecode(thread, dst, bytes, index, length);
65 if (outpos == length) {
66 Object dst_obj(&scope, runtime->strFromStrArray(dst));
67 Object length_obj(&scope, runtime->newInt(length));
68 return runtime->newTupleWith2(dst_obj, length_obj);
69 }
70
71 SymbolId error_id = lookupSymbolForErrorHandler(errors);
72 while (outpos < length) {
73 byte c = bytes.byteAt(outpos);
74 if (c < 128) {
75 runtime->strArrayAddASCII(thread, dst, c);
76 ++outpos;
77 continue;
78 }
79 switch (error_id) {
80 case ID(replace): {
81 Str temp(&scope, SmallStr::fromCodePoint(0xFFFD));
82 runtime->strArrayAddStr(thread, dst, temp);
83 ++outpos;
84 break;
85 }
86 case ID(surrogateescape): {
87 Str temp(&scope,
88 SmallStr::fromCodePoint(Unicode::kLowSurrogateStart + c));
89 runtime->strArrayAddStr(thread, dst, temp);
90 ++outpos;
91 break;
92 }
93 case ID(ignore):
94 ++outpos;
95 break;
96 default: {
97 Object outpos1(&scope, runtime->newIntFromUnsigned(outpos));
98 Object outpos2(&scope, runtime->newIntFromUnsigned(outpos + 1));
99 return runtime->newTupleWith2(outpos1, outpos2);
100 }
101 }
102 }
103 Object dst_obj(&scope, runtime->strFromStrArray(dst));
104 Object length_obj(&scope, runtime->newInt(length));
105 return runtime->newTupleWith2(dst_obj, length_obj);
106}
107
108// CPython encodes latin1 codepoints into the low-surrogate range, and is able
109// to recover the original codepoints from those decodable surrogate points.
110static bool isEscapedLatin1Surrogate(int32_t codepoint) {
111 return (Unicode::kLowSurrogateStart + kMaxASCII) < codepoint &&
112 codepoint <= (Unicode::kLowSurrogateStart + kMaxByte);
113}
114
115RawObject FUNC(_codecs, _ascii_encode)(Thread* thread, Arguments args) {
116 Runtime* runtime = thread->runtime();
117 HandleScope scope(thread);
118 Object output_obj(&scope, args.get(3));
119 DCHECK(runtime->isInstanceOfBytearray(*output_obj),
120 "Fourth arg to _ascii_encode must be bytearray");
121 Str data(&scope, strUnderlying(args.get(0)));
122 Str errors(&scope, strUnderlying(args.get(1)));
123 word i = intUnderlying(args.get(2)).asWord();
124 Bytearray output(&scope, *output_obj);
125
126 SymbolId error_symbol = lookupSymbolForErrorHandler(errors);
127 // TODO(T43252439): Optimize this by first checking whether the entire string
128 // is ASCII, and just memcpy into a string if so
129 for (word byte_offset = thread->strOffset(data, i);
130 byte_offset < data.length(); i++) {
131 word num_bytes;
132 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
133 byte_offset += num_bytes;
134 if (codepoint <= kMaxASCII) {
135 bytearrayAdd(thread, runtime, output, codepoint);
136 } else {
137 switch (error_symbol) {
138 case ID(ignore):
139 continue;
140 case ID(replace):
141 bytearrayAdd(thread, runtime, output, kASCIIReplacement);
142 continue;
143 case ID(surrogateescape):
144 if (isEscapedLatin1Surrogate(codepoint)) {
145 bytearrayAdd(thread, runtime, output,
146 codepoint - Unicode::kLowSurrogateStart);
147 continue;
148 }
149 break;
150 default:
151 break;
152 }
153 Object outpos1(&scope, runtime->newInt(i));
154 while (byte_offset < data.length() &&
155 data.codePointAt(byte_offset, &num_bytes) > kMaxASCII) {
156 byte_offset += num_bytes;
157 i++;
158 }
159 Object outpos2(&scope, runtime->newInt(i + 1));
160 return runtime->newTupleWith2(outpos1, outpos2);
161 }
162 }
163 Object output_bytes(&scope, bytearrayAsBytes(thread, output));
164 Object outpos_obj(&scope, runtime->newInt(i));
165 return runtime->newTupleWith2(output_bytes, outpos_obj);
166}
167
168// Decodes a sequence of unicode encoded bytes into a codepoint, returns
169// -1 if no value should be written, and -2 if an error occurred. Sets the
170// iterating variable to where decoding should continue, and sets
171// invalid_escape_index if it doesn't recognize the escape sequence.
172static int32_t decodeEscaped(const Byteslike& bytes, word* i,
173 word* invalid_escape_index) {
174 word length = bytes.length();
175 switch (byte ch = bytes.byteAt((*i)++)) {
176 // \x escapes
177 case '\n':
178 return -1;
179 case '\\':
180 case '\'':
181 case '\"':
182 return ch;
183 case 'b':
184 return '\b';
185 case 't':
186 return '\t';
187 case 'n':
188 return '\n';
189 case 'r':
190 return '\r';
191 // BEL,
192 case 'a':
193 return '\x07';
194 // VT
195 case 'v':
196 return '\x0B';
197 // FF
198 case 'f':
199 return '\x0C';
200
201 // \OOO (octal) escapes
202 case '0':
203 case '1':
204 case '2':
205 case '3':
206 case '4':
207 case '5':
208 case '6':
209 case '7': {
210 word escaped = ch - '0';
211 word octal_index = *i;
212 if (octal_index < length) {
213 word ch2 = bytes.byteAt(octal_index);
214 if ('0' <= ch2 && ch2 <= '7') {
215 escaped = (escaped << 3) + ch2 - '0';
216 if (++octal_index < length) {
217 word ch3 = bytes.byteAt(octal_index);
218 if ('0' <= ch3 && ch3 <= '7') {
219 octal_index++;
220 escaped = (escaped << 3) + ch3 - '0';
221 }
222 }
223 }
224 }
225 *i = octal_index;
226 return escaped;
227 }
228
229 // hex escapes
230 // \xXX
231 case 'x': {
232 word hex_index = *i;
233 if (hex_index + 1 < length) {
234 int digit1, digit2;
235 digit1 = _PyLong_DigitValue[bytes.byteAt(hex_index)];
236 digit2 = _PyLong_DigitValue[bytes.byteAt(hex_index + 1)];
237 if (digit1 < 16 && digit2 < 16) {
238 *i += 2;
239 return (digit1 << 4) + digit2;
240 }
241 }
242 return -2;
243 }
244 default:
245 *invalid_escape_index = *i - 1;
246 return ch;
247 }
248}
249
250RawObject FUNC(_codecs, _escape_decode)(Thread* thread, Arguments args) {
251 HandleScope scope(thread);
252 Object bytes_obj(&scope, args.get(0));
253 Runtime* runtime = thread->runtime();
254 if (runtime->isInstanceOfStr(*bytes_obj)) {
255 // TODO(T44739505): Make sure we can decode a str
256 UNIMPLEMENTED("_codecs.escape_decode with a str");
257 }
258 DCHECK(runtime->isInstanceOfStr(args.get(2)),
259 "Third arg to _escape_decode must be str");
260 Byteslike bytes(&scope, thread, *bytes_obj);
261 Str errors(&scope, strUnderlying(args.get(1)));
262
263 Bytearray dst(&scope, runtime->newBytearray());
264 word length = bytes.length();
265 runtime->bytearrayEnsureCapacity(thread, dst, length);
266 word first_invalid_escape_index = -1;
267 for (word i = 0; i < length;) {
268 byte ch = bytes.byteAt(i++);
269 if (ch != '\\') {
270 // TODO(T45134397): Support the recode_encoding parameter
271 if (ch <= kMaxASCII) {
272 bytearrayAdd(thread, runtime, dst, ch);
273 continue;
274 }
275 Str temp(&scope, SmallStr::fromCodePoint(ch));
276 bytearrayAdd(thread, runtime, dst, temp.byteAt(0));
277 bytearrayAdd(thread, runtime, dst, temp.byteAt(1));
278 continue;
279 }
280 if (i >= length) {
281 return runtime->newStrFromCStr("Trailing \\ in string");
282 }
283 word invalid_escape_index = -1;
284 int32_t decoded = decodeEscaped(bytes, &i, &invalid_escape_index);
285 if (invalid_escape_index != -1) {
286 bytearrayAdd(thread, runtime, dst, '\\');
287 if (first_invalid_escape_index == -1) {
288 first_invalid_escape_index = invalid_escape_index;
289 }
290 }
291 if (decoded >= 0) {
292 bytearrayAdd(thread, runtime, dst, decoded);
293 continue;
294 }
295 if (decoded == -1) {
296 continue;
297 }
298 SymbolId error_id = lookupSymbolForErrorHandler(errors);
299 switch (error_id) {
300 case ID(strict):
301 return runtime->newStrFromFmt("invalid \\x escape at position %d",
302 i - 2);
303 case ID(replace): {
304 bytearrayAdd(thread, runtime, dst, '?');
305 break;
306 }
307 case ID(ignore):
308 break;
309 default:
310 return runtime->newStrFromFmt(
311 "decoding error; unknown error handling code: %S", &errors);
312 }
313 if (i < length && Byte::isHexDigit(bytes.byteAt(i))) {
314 i++;
315 }
316 }
317 Object dst_obj(&scope, bytearrayAsBytes(thread, dst));
318 Object length_obj(&scope, runtime->newInt(length));
319 Object escape_obj(&scope, runtime->newInt(first_invalid_escape_index));
320 return runtime->newTupleWith3(dst_obj, length_obj, escape_obj);
321}
322
323RawObject FUNC(_codecs, _latin_1_decode)(Thread* thread, Arguments args) {
324 Runtime* runtime = thread->runtime();
325 HandleScope scope(thread);
326 Object data(&scope, args.get(0));
327 StrArray array(&scope, runtime->newStrArray());
328 word length;
329 Byteslike bytes(&scope, thread, *data);
330 length = bytes.length();
331 runtime->strArrayEnsureCapacity(thread, array, length);
332 // First, try a quick ASCII decoding
333 word num_bytes = asciiDecode(thread, array, bytes, 0, length);
334 if (num_bytes != length) {
335 // A non-ASCII character was found; switch to a Latin-1 decoding for the
336 // remainder of the input sequence
337 for (word i = num_bytes; i < length; ++i) {
338 byte code_point = bytes.byteAt(i);
339 if (code_point <= kMaxASCII) {
340 runtime->strArrayAddASCII(thread, array, code_point);
341 } else {
342 runtime->strArrayAddCodePoint(thread, array, code_point);
343 }
344 }
345 }
346 Object array_str(&scope, runtime->strFromStrArray(array));
347 Object length_obj(&scope, runtime->newInt(length));
348 return runtime->newTupleWith2(array_str, length_obj);
349}
350
351RawObject FUNC(_codecs, _latin_1_encode)(Thread* thread, Arguments args) {
352 Runtime* runtime = thread->runtime();
353 HandleScope scope(thread);
354 Object output_obj(&scope, args.get(3));
355 DCHECK(runtime->isInstanceOfBytearray(*output_obj),
356 "Fourth arg to _latin_1_encode must be bytearray");
357 Str data(&scope, strUnderlying(args.get(0)));
358 Str errors(&scope, strUnderlying(args.get(1)));
359 word i = intUnderlying(args.get(2)).asWord();
360 Bytearray output(&scope, *output_obj);
361
362 SymbolId error_symbol = lookupSymbolForErrorHandler(errors);
363 for (word byte_offset = thread->strOffset(data, i);
364 byte_offset < data.length(); i++) {
365 word num_bytes;
366 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
367 byte_offset += num_bytes;
368 if (codepoint <= kMaxByte) {
369 bytearrayAdd(thread, runtime, output, codepoint);
370 } else {
371 switch (error_symbol) {
372 case ID(ignore):
373 continue;
374 case ID(replace):
375 bytearrayAdd(thread, runtime, output, kASCIIReplacement);
376 continue;
377 case ID(surrogateescape):
378 if (isEscapedLatin1Surrogate(codepoint)) {
379 bytearrayAdd(thread, runtime, output,
380 codepoint - Unicode::kLowSurrogateStart);
381 continue;
382 }
383 break;
384 default:
385 break;
386 }
387 Object outpos1(&scope, runtime->newInt(i));
388 while (byte_offset < data.length() &&
389 data.codePointAt(byte_offset, &num_bytes) > kMaxByte) {
390 byte_offset += num_bytes;
391 i++;
392 }
393 Object outpos2(&scope, runtime->newInt(i + 1));
394 return runtime->newTupleWith2(outpos1, outpos2);
395 }
396 }
397 Object output_bytes(&scope, bytearrayAsBytes(thread, output));
398 Object outpos(&scope, runtime->newInt(i));
399 return runtime->newTupleWith2(output_bytes, outpos);
400}
401
402// Decodes a sequence of hexadecimal encoded bytes into a codepoint or returns
403// a negative value if the value could not be decoded. Sets the start variable
404// to where decoding should continue.
405static int32_t decodeHexEscaped(const Byteslike& bytes, word* start,
406 word count) {
407 DCHECK_BOUND(count, 8);
408 word result = 0;
409 word i = *start;
410 for (word len = bytes.length(); i < len && count != 0; i++, count--) {
411 byte ch = bytes.byteAt(i);
412 result <<= 4;
413 if (ch >= '0' && ch <= '9') {
414 result += ch - '0';
415 } else if (ch >= 'a' && ch <= 'f') {
416 result += ch - ('a' - 10);
417 } else if (ch >= 'A' && ch <= 'F') {
418 result += ch - ('A' - 10);
419 } else {
420 break; // not a hexadecimal digit, stop reading
421 }
422 }
423 *start = i;
424 if (count != 0) {
425 return -1;
426 }
427 // if count is 4, result could be a 32-bit unicode character
428 if (result > kMaxUnicode) {
429 return -2;
430 }
431 return result;
432}
433
434// Decodes a sequence of unicode encoded bytes into a codepoint or returns
435// a negative value if no value should be written. Sets the iterating variable
436// to where decoding should continue, sets invalid_escape_index if it doesn't
437// recognize the escape sequence, and sets error_message if an error occurred.
438static int32_t decodeUnicodeEscaped(const Byteslike& bytes, word* i,
439 word* invalid_escape_index,
440 const char** error_message) {
441 switch (byte ch = bytes.byteAt((*i)++)) {
442 // \x escapes
443 case '\n':
444 return -1;
445 case '\\':
446 case '\'':
447 case '\"':
448 return ch;
449 case 'b':
450 return '\b';
451 case 't':
452 return '\t';
453 case 'n':
454 return '\n';
455 case 'r':
456 return '\r';
457 // BEL
458 case 'a':
459 return '\007';
460 // FF
461 case 'f':
462 return '\014';
463 // VT
464 case 'v':
465 return '\013';
466
467 // \OOO (octal) escapes
468 case '0':
469 case '1':
470 case '2':
471 case '3':
472 case '4':
473 case '5':
474 case '6':
475 case '7': {
476 word escaped = ch - '0';
477 word octal_index = *i;
478 word length = bytes.length();
479 if (octal_index < length) {
480 word ch2 = bytes.byteAt(octal_index);
481 if ('0' <= ch2 && ch2 <= '7') {
482 escaped = (escaped << 3) + ch2 - '0';
483 if (++octal_index < length) {
484 word ch3 = bytes.byteAt(octal_index);
485 if ('0' <= ch3 && ch3 <= '7') {
486 octal_index++;
487 escaped = (escaped << 3) + ch3 - '0';
488 }
489 }
490 }
491 }
492 *i = octal_index;
493 return escaped;
494 }
495
496 // hex escapes
497 // \xXX
498 case 'x': {
499 word escaped;
500 if ((escaped = decodeHexEscaped(bytes, i, 2)) < 0) {
501 *error_message = (escaped == -1 ? "truncated \\xXX escape"
502 : "illegal Unicode character");
503 return -1;
504 }
505 return escaped;
506 }
507
508 // \uXXXX
509 case 'u': {
510 word escaped;
511 if ((escaped = decodeHexEscaped(bytes, i, 4)) < 0) {
512 *error_message = (escaped == -1 ? "truncated \\uXXXX escape"
513 : "illegal Unicode character");
514 return -1;
515 }
516 return escaped;
517 }
518
519 // \UXXXXXXXX
520 case 'U': {
521 word escaped;
522 if ((escaped = decodeHexEscaped(bytes, i, 8)) < 0) {
523 *error_message = (escaped == -1 ? "truncated \\uXXXXXXXX escape"
524 : "illegal Unicode character");
525 return -1;
526 }
527 return escaped;
528 }
529
530 // \N{name}
531 case 'N': {
532 *error_message = "malformed \\N character escape";
533 word length = bytes.length();
534 if (*i >= length || bytes.byteAt(*i) != '{') {
535 return -1;
536 }
537 word start = ++(*i);
538 while (*i < length && bytes.byteAt(*i) != '}') {
539 *i += 1;
540 }
541 word size = *i - start;
542 if (size == 0 || *i == length) {
543 return -1;
544 }
545 *i += 1;
546 *error_message = "unknown Unicode character name";
547
548 unique_c_ptr<byte> buffer(reinterpret_cast<byte*>(std::malloc(size)));
549 bytes.copyToStartAt(buffer.get(), size, start);
550 return codePointFromName(buffer.get(), size);
551 }
552
553 default: {
554 *invalid_escape_index = *i - 1;
555 return ch;
556 }
557 }
558}
559
560RawObject FUNC(_codecs, _unicode_escape_decode)(Thread* thread,
561 Arguments args) {
562 HandleScope scope(thread);
563 Runtime* runtime = thread->runtime();
564 Object data(&scope, args.get(0));
565 Str errors(&scope, strUnderlying(args.get(1)));
566 word index = intUnderlying(args.get(2)).asWord();
567 StrArray dst(&scope, args.get(3));
568
569 Byteslike bytes(&scope, thread, *data);
570 word length = bytes.length();
571 runtime->strArrayEnsureCapacity(thread, dst, length);
572 word first_invalid_escape_index = -1;
573 for (word i = index; i < length;) {
574 const char* message = nullptr;
575 word start_pos = i;
576 byte ch = bytes.byteAt(i++);
577 if (ch != '\\') {
578 if (ch <= kMaxASCII) {
579 runtime->strArrayAddASCII(thread, dst, ch);
580 continue;
581 }
582 Str temp(&scope, SmallStr::fromCodePoint(ch));
583 runtime->strArrayAddStr(thread, dst, temp);
584 continue;
585 }
586 if (i >= length) {
587 message = "\\ at end of string";
588 } else {
589 word invalid_escape_index = -1;
590 int32_t decoded =
591 decodeUnicodeEscaped(bytes, &i, &invalid_escape_index, &message);
592 if (invalid_escape_index != -1) {
593 runtime->strArrayAddASCII(thread, dst, '\\');
594 if (first_invalid_escape_index == -1) {
595 first_invalid_escape_index = invalid_escape_index;
596 }
597 }
598 if (decoded != -1) {
599 if (decoded <= kMaxASCII) {
600 runtime->strArrayAddASCII(thread, dst, decoded);
601 continue;
602 }
603 Str temp(&scope, SmallStr::fromCodePoint(decoded));
604 runtime->strArrayAddStr(thread, dst, temp);
605 continue;
606 }
607 }
608 if (message != nullptr) {
609 SymbolId error_id = lookupSymbolForErrorHandler(errors);
610 switch (error_id) {
611 case ID(replace): {
612 Str temp(&scope, SmallStr::fromCodePoint(0xFFFD));
613 runtime->strArrayAddStr(thread, dst, temp);
614 break;
615 }
616 case ID(ignore):
617 break;
618 default: {
619 Object start_pos_obj(&scope, runtime->newInt(start_pos));
620 Object outpos_obj(&scope, runtime->newInt(i));
621 Object message_obj(&scope, runtime->newStrFromCStr(message));
622 Object escape_obj(&scope,
623 runtime->newInt(first_invalid_escape_index));
624 return runtime->newTupleWith4(start_pos_obj, outpos_obj, message_obj,
625 escape_obj);
626 }
627 }
628 }
629 }
630 Object dst_obj(&scope, runtime->strFromStrArray(dst));
631 Object length_obj(&scope, runtime->newInt(length));
632 Object message_obj(&scope, runtime->newStrFromCStr(""));
633 Object escape_obj(&scope, runtime->newInt(first_invalid_escape_index));
634 return runtime->newTupleWith4(dst_obj, length_obj, message_obj, escape_obj);
635}
636
// Outcome of validating one UTF-8 encoded code point.
// Positive values are the number of bytes of a well-formed sequence. Zero and
// negative values are errors; kInvalidContinuationN is the negated offset of
// the offending continuation byte relative to the lead byte.
enum Utf8DecoderResult {
  kUnexpectedEndOfData = -4,
  kInvalidContinuation3 = -3,
  kInvalidContinuation2 = -2,
  kInvalidContinuation1 = -1,
  kInvalidStart = 0,
  k1Byte = 1,
  k2Byte = 2,
  k3Byte = 3,
  k4Byte = 4,
};
648
649// This functionality is taken mostly from CPython:
650// Objects/stringlib/codecs.h::utf8_decode
651// This does error checking to ensure well-formedness of the passed in UTF-8
652// bytes, and returns the number of bytes of the codepoint at `index` as a
653// Utf8DecoderResult enum value.
654// Since this is supposed to work as an incremental decoder as well, this
655// function returns specific values for errors to determine whether they could
656// be caused by incremental decoding, or if they would be an error no matter
657// what other bytes might be streamed in later.
658static Utf8DecoderResult isValidUtf8Codepoint(const Byteslike& bytes,
659 word index) {
660 word length = bytes.length();
661 byte ch = bytes.byteAt(index);
662 if (ch <= kMaxASCII) {
663 return k1Byte;
664 }
665 if (ch < 0xE0) {
666 // \xC2\x80-\xDF\xBF -- 0080-07FF
667 if (ch < 0xC2) {
668 // invalid sequence
669 // \x80-\xBF -- continuation byte
670 // \xC0-\xC1 -- fake 0000-007F
671 return kInvalidStart;
672 }
673 if (index + 1 >= length) {
674 return kUnexpectedEndOfData;
675 }
676 if (!UTF8::isTrailByte(bytes.byteAt(index + 1))) {
677 return kInvalidContinuation1;
678 }
679 return k2Byte;
680 }
681 if (ch < 0xF0) {
682 // \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF
683 if (index + 2 >= length) {
684 if (index + 1 >= length) {
685 return kUnexpectedEndOfData;
686 }
687 byte ch2 = bytes.byteAt(index + 1);
688 if (!UTF8::isTrailByte(ch2) || (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) {
689 return kInvalidContinuation1;
690 }
691 return kUnexpectedEndOfData;
692 }
693 byte ch2 = bytes.byteAt(index + 1);
694 if (!UTF8::isTrailByte(ch2)) {
695 return kInvalidContinuation1;
696 }
697 if (ch == 0xE0) {
698 if (ch2 < 0xA0) {
699 // invalid sequence
700 // \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800
701 return kInvalidContinuation1;
702 }
703 } else if (ch == 0xED && ch2 >= 0xA0) {
704 // Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
705 // will result in surrogates in range D800-DFFF. Surrogates are
706 // not valid UTF-8 so they are rejected.
707 // See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
708 // (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
709 return kInvalidContinuation1;
710 }
711 if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
712 return kInvalidContinuation2;
713 }
714 return k3Byte;
715 }
716 if (ch < 0xF5) {
717 // \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF
718 if (index + 3 >= length) {
719 if (index + 1 >= length) {
720 return kUnexpectedEndOfData;
721 }
722 byte ch2 = bytes.byteAt(index + 1);
723 if (!UTF8::isTrailByte(ch2) || (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) {
724 return kInvalidContinuation1;
725 }
726 if (index + 2 >= length) {
727 return kUnexpectedEndOfData;
728 }
729 if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
730 return kInvalidContinuation2;
731 }
732 return kUnexpectedEndOfData;
733 }
734 byte ch2 = bytes.byteAt(index + 1);
735 if (!UTF8::isTrailByte(ch2)) {
736 return kInvalidContinuation1;
737 }
738 if (ch == 0xF0) {
739 if (ch2 < 0x90) {
740 // invalid sequence
741 // \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
742 return kInvalidContinuation1;
743 }
744 } else if (ch == 0xF4 && ch2 >= 0x90) {
745 // invalid sequence
746 // \xF4\x90\x80\80- -- 110000- overflow
747 return kInvalidContinuation1;
748 }
749 if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
750 return kInvalidContinuation2;
751 }
752 if (!UTF8::isTrailByte(bytes.byteAt(index + 3))) {
753 return kInvalidContinuation3;
754 }
755 return k4Byte;
756 }
757 return kInvalidStart;
758}
759
760RawObject FUNC(_codecs, _utf_8_decode)(Thread* thread, Arguments args) {
761 Runtime* runtime = thread->runtime();
762 HandleScope scope(thread);
763 Object final_obj(&scope, args.get(4));
764 DCHECK(final_obj.isBool(), "Fifth arg to _utf_8_decode must be bool");
765 Object data(&scope, args.get(0));
766 Str errors(&scope, strUnderlying(args.get(1)));
767 word index = intUnderlying(args.get(2)).asWord();
768 StrArray dst(&scope, args.get(3));
769
770 word length;
771 Byteslike bytes(&scope, thread, *data);
772 length = bytes.length();
773 runtime->strArrayEnsureCapacity(thread, dst, length);
774 word i = asciiDecode(thread, dst, bytes, index, length);
775 if (i == length) {
776 Object dst_obj(&scope, runtime->strFromStrArray(dst));
777 Object length_obj(&scope, runtime->newInt(length));
778 Object message_obj(&scope, runtime->newStrFromCStr(""));
779 return runtime->newTupleWith3(dst_obj, length_obj, message_obj);
780 }
781
782 SymbolId error_id = lookupSymbolForErrorHandler(errors);
783 bool is_final = Bool::cast(*final_obj).value();
784 while (i < length) {
785 // TODO(T41032331): Scan for non-ASCII characters by words instead of chars
786 Utf8DecoderResult validator_result = isValidUtf8Codepoint(bytes, i);
787 if (validator_result >= k1Byte) {
788 byte codepoint[4] = {0};
789 for (int codeunit = 0; codeunit + 1 <= validator_result; codeunit++) {
790 codepoint[codeunit] = bytes.byteAt(i + codeunit);
791 }
792 i += validator_result;
793 Str temp(&scope,
794 runtime->newStrWithAll(View<byte>{codepoint, validator_result}));
795 runtime->strArrayAddStr(thread, dst, temp);
796 continue;
797 }
798 if (validator_result != kInvalidStart && !is_final) {
799 break;
800 }
801 word error_end = i;
802 const char* error_message = nullptr;
803 switch (validator_result) {
804 case kInvalidStart:
805 error_end += 1;
806 error_message = "invalid start byte";
807 break;
808 case kInvalidContinuation1:
809 case kInvalidContinuation2:
810 case kInvalidContinuation3:
811 error_end -= validator_result;
812 error_message = "invalid continuation byte";
813 break;
814 case kUnexpectedEndOfData:
815 error_end = length;
816 error_message = "unexpected end of data";
817 break;
818 default:
819 UNREACHABLE(
820 "valid utf-8 codepoints should have been decoded by this point");
821 }
822 switch (error_id) {
823 case ID(replace): {
824 Str temp(&scope, SmallStr::fromCodePoint(kReplacementCharacter));
825 runtime->strArrayAddStr(thread, dst, temp);
826 i = error_end;
827 break;
828 }
829 case ID(surrogateescape): {
830 for (; i < error_end; ++i) {
831 Str temp(&scope, SmallStr::fromCodePoint(Unicode::kLowSurrogateStart +
832 bytes.byteAt(i)));
833 runtime->strArrayAddStr(thread, dst, temp);
834 }
835 break;
836 }
837 case ID(ignore):
838 i = error_end;
839 break;
840 default: {
841 Object outpos_obj(&scope, runtime->newInt(i));
842 Object error_end_obj(&scope, runtime->newInt(error_end));
843 Object message_obj(&scope, runtime->newStrFromCStr(error_message));
844 return runtime->newTupleWith3(outpos_obj, error_end_obj, message_obj);
845 }
846 }
847 }
848 Object dst_obj(&scope, runtime->strFromStrArray(dst));
849 Object outpos_obj(&scope, runtime->newInt(i));
850 Object message_obj(&scope, Str::empty());
851 return runtime->newTupleWith3(dst_obj, outpos_obj, message_obj);
852}
853
854RawObject FUNC(_codecs, _utf_8_encode)(Thread* thread, Arguments args) {
855 Runtime* runtime = thread->runtime();
856 HandleScope scope(thread);
857 Object output_obj(&scope, args.get(3));
858 DCHECK(runtime->isInstanceOfBytearray(*output_obj),
859 "Fourth arg to _utf_8_encode must be bytearray");
860 Str data(&scope, strUnderlying(args.get(0)));
861 Str errors(&scope, strUnderlying(args.get(1)));
862 word index = intUnderlying(args.get(2)).asWord();
863 Bytearray output(&scope, *output_obj);
864
865 SymbolId error_symbol = lookupSymbolForErrorHandler(errors);
866 for (word byte_offset = thread->strOffset(data, index);
867 byte_offset < data.length(); index++) {
868 word num_bytes;
869 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
870 byte_offset += num_bytes;
871 if (!Unicode::isSurrogate(codepoint)) {
872 for (word j = byte_offset - num_bytes; j < byte_offset; j++) {
873 bytearrayAdd(thread, runtime, output, data.byteAt(j));
874 }
875 } else {
876 switch (error_symbol) {
877 case ID(ignore):
878 continue;
879 case ID(replace):
880 bytearrayAdd(thread, runtime, output, kASCIIReplacement);
881 continue;
882 case ID(surrogateescape):
883 if (isEscapedLatin1Surrogate(codepoint)) {
884 bytearrayAdd(thread, runtime, output,
885 codepoint - Unicode::kLowSurrogateStart);
886 continue;
887 }
888 break;
889 case ID(surrogatepass):
890 if (Unicode::isSurrogate(codepoint)) {
891 bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 3));
892 bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 2));
893 bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 1));
894 continue;
895 }
896 break;
897 default:
898 break;
899 }
900 Object outpos1(&scope, runtime->newInt(index));
901 while (byte_offset < data.length() &&
902 Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) {
903 byte_offset += num_bytes;
904 index++;
905 }
906 Object outpos2(&scope, runtime->newInt(index + 1));
907 return runtime->newTupleWith2(outpos1, outpos2);
908 }
909 }
910 Object output_bytes(&scope, bytearrayAsBytes(thread, output));
911 Object index_obj(&scope, runtime->newInt(index));
912 return runtime->newTupleWith2(output_bytes, index_obj);
913}
914
915static void appendUtf16ToBytearray(Thread* thread, Runtime* runtime,
916 const Bytearray& writer, int32_t codepoint,
917 endian endianness) {
918 if (endianness == endian::little) {
919 bytearrayAdd(thread, runtime, writer, codepoint);
920 bytearrayAdd(thread, runtime, writer, codepoint >> kBitsPerByte);
921 } else {
922 bytearrayAdd(thread, runtime, writer, codepoint >> kBitsPerByte);
923 bytearrayAdd(thread, runtime, writer, codepoint);
924 }
925}
926
927RawObject FUNC(_codecs, _utf_16_encode)(Thread* thread, Arguments args) {
928 Runtime* runtime = thread->runtime();
929 HandleScope scope(thread);
930 Object output_obj(&scope, args.get(3));
931 DCHECK(runtime->isInstanceOfBytearray(*output_obj),
932 "Fourth arg to _utf_16_encode must be bytearray");
933 Str data(&scope, strUnderlying(args.get(0)));
934 Str errors(&scope, strUnderlying(args.get(1)));
935 word index = intUnderlying(args.get(2)).asWord();
936 Bytearray output(&scope, *output_obj);
937 OptInt<int32_t> byteorder = intUnderlying(args.get(4)).asInt<int32_t>();
938 if (byteorder.error != CastError::None) {
939 return thread->raiseWithFmt(LayoutId::kOverflowError,
940 "Python int too large to convert to C int");
941 }
942
943 SymbolId error_id = lookupSymbolForErrorHandler(errors);
944 for (word byte_offset = thread->strOffset(data, index);
945 byte_offset < data.length(); index++) {
946 endian endianness = byteorder.value <= 0 ? endian::little : endian::big;
947 word num_bytes;
948 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
949 byte_offset += num_bytes;
950 if (!Unicode::isSurrogate(codepoint)) {
951 if (codepoint < Unicode::kHighSurrogateStart) {
952 appendUtf16ToBytearray(thread, runtime, output, codepoint, endianness);
953 } else {
954 appendUtf16ToBytearray(thread, runtime, output,
955 Unicode::highSurrogateFor(codepoint),
956 endianness);
957 appendUtf16ToBytearray(thread, runtime, output,
958 Unicode::lowSurrogateFor(codepoint), endianness);
959 }
960 } else {
961 switch (error_id) {
962 case ID(ignore):
963 continue;
964 case ID(replace):
965 appendUtf16ToBytearray(thread, runtime, output, kASCIIReplacement,
966 endianness);
967 continue;
968 case ID(surrogateescape):
969 if (isEscapedLatin1Surrogate(codepoint)) {
970 appendUtf16ToBytearray(thread, runtime, output,
971 codepoint - Unicode::kLowSurrogateStart,
972 endianness);
973 continue;
974 }
975 break;
976 default:
977 break;
978 }
979 Object outpos1(&scope, runtime->newInt(index));
980 while (byte_offset < data.length() &&
981 Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) {
982 byte_offset += num_bytes;
983 index++;
984 }
985 Object outpos2(&scope, runtime->newInt(index + 1));
986 return runtime->newTupleWith2(outpos1, outpos2);
987 }
988 }
989 Object output_bytes(&scope, bytearrayAsBytes(thread, output));
990 Object index_obj(&scope, runtime->newInt(index));
991 return runtime->newTupleWith2(output_bytes, index_obj);
992}
993
994static void appendUtf32ToBytearray(Thread* thread, Runtime* runtime,
995 const Bytearray& writer, int32_t codepoint,
996 endian endianness) {
997 if (endianness == endian::little) {
998 bytearrayAdd(thread, runtime, writer, codepoint);
999 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte));
1000 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 2));
1001 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 3));
1002 } else {
1003 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 3));
1004 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 2));
1005 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte));
1006 bytearrayAdd(thread, runtime, writer, codepoint);
1007 }
1008}
1009
1010RawObject FUNC(_codecs, _utf_32_encode)(Thread* thread, Arguments args) {
1011 Runtime* runtime = thread->runtime();
1012 HandleScope scope(thread);
1013 Object output_obj(&scope, args.get(3));
1014 DCHECK(runtime->isInstanceOfBytearray(*output_obj),
1015 "Fourth arg to _utf_32_encode must be bytearray");
1016 Str data(&scope, strUnderlying(args.get(0)));
1017 Str errors(&scope, strUnderlying(args.get(1)));
1018 word index = intUnderlying(args.get(2)).asWord();
1019 Bytearray output(&scope, *output_obj);
1020 OptInt<int32_t> byteorder = intUnderlying(args.get(4)).asInt<int32_t>();
1021 if (byteorder.error != CastError::None) {
1022 return thread->raiseWithFmt(LayoutId::kOverflowError,
1023 "Python int too large to convert to C int");
1024 }
1025
1026 SymbolId error_id = lookupSymbolForErrorHandler(errors);
1027 for (word byte_offset = thread->strOffset(data, index);
1028 byte_offset < data.length(); index++) {
1029 endian endianness = byteorder.value <= 0 ? endian::little : endian::big;
1030 word num_bytes;
1031 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
1032 byte_offset += num_bytes;
1033 if (!Unicode::isSurrogate(codepoint)) {
1034 appendUtf32ToBytearray(thread, runtime, output, codepoint, endianness);
1035 } else {
1036 switch (error_id) {
1037 case ID(ignore):
1038 continue;
1039 case ID(replace):
1040 appendUtf32ToBytearray(thread, runtime, output, kASCIIReplacement,
1041 endianness);
1042 continue;
1043 case ID(surrogateescape):
1044 if (isEscapedLatin1Surrogate(codepoint)) {
1045 appendUtf32ToBytearray(thread, runtime, output,
1046 codepoint - Unicode::kLowSurrogateStart,
1047 endianness);
1048 continue;
1049 }
1050 break;
1051 default:
1052 break;
1053 }
1054 Object outpos1(&scope, runtime->newInt(index));
1055 while (byte_offset < data.length() &&
1056 Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) {
1057 byte_offset += num_bytes;
1058 index++;
1059 }
1060 Object outpos2(&scope, runtime->newInt(index + 1));
1061 return runtime->newTupleWith2(outpos1, outpos2);
1062 }
1063 }
1064 Object output_bytes(&scope, bytearrayAsBytes(thread, output));
1065 Object index_obj(&scope, runtime->newInt(index));
1066 return runtime->newTupleWith2(output_bytes, index_obj);
1067}
1068
1069// Takes a Bytearray and a Str object, and appends each byte in the Str to the
1070// Bytearray one by one
1071RawObject FUNC(_codecs, _bytearray_string_append)(Thread* thread,
1072 Arguments args) {
1073 HandleScope scope(thread);
1074 Bytearray dst(&scope, args.get(0));
1075 Str data(&scope, args.get(1));
1076 for (word i = 0; i < data.length(); ++i) {
1077 bytearrayAdd(thread, thread->runtime(), dst, data.byteAt(i));
1078 }
1079 return NoneType::object();
1080}
1081
1082RawObject FUNC(_codecs, _raw_unicode_escape_encode)(Thread* thread,
1083 Arguments args) {
1084 HandleScope scope(thread);
1085 Runtime* runtime = thread->runtime();
1086 Str data(&scope, strUnderlying(args.get(0)));
1087 word size = data.codePointLength();
1088 Bytearray dst(&scope, runtime->newBytearray());
1089 word length = data.length();
1090
1091 // 2 byte codepoints can be expanded to 4 bytes + 2 escape characters
1092 // 4 byte codepoints well be expanded to 8 bytes + 2 escape characters
1093 // To be safe we double the bytecount and add space for 2 escape characters
1094 // per codepoint.
1095 word expanded_size = length * 2 + size * 2;
1096 runtime->bytearrayEnsureCapacity(thread, dst, expanded_size);
1097 word num_bytes;
1098 for (word index = 0, byte_offset = thread->strOffset(data, index);
1099 byte_offset < data.length(); index++) {
1100 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
1101 byte_offset += num_bytes;
1102 // U+0000-U+00ff range: Copy 8-bit characters as-is
1103 if (codepoint <= kMaxByte) {
1104 bytearrayAdd(thread, runtime, dst, codepoint);
1105 }
1106 // U+0100-U+ffff range: Map 16-bit characters to '\uHHHH'
1107 else if (codepoint <= kMaxUint16) {
1108 bytearrayAdd(thread, runtime, dst, '\\');
1109 bytearrayAdd(thread, runtime, dst, 'u');
1110 bytearrayAdd(thread, runtime, dst,
1111 lowerCaseHexDigit((codepoint >> 12) & 0xf));
1112 bytearrayAdd(thread, runtime, dst,
1113 lowerCaseHexDigit((codepoint >> 8) & 0xf));
1114 bytearrayAdd(thread, runtime, dst,
1115 lowerCaseHexDigit((codepoint >> 4) & 0xf));
1116 bytearrayAdd(thread, runtime, dst, lowerCaseHexDigit(codepoint & 15));
1117 }
1118 // U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH'
1119 else {
1120 CHECK(codepoint <= kMaxUnicode, "expected a valid unicode code point");
1121 bytearrayAdd(thread, runtime, dst, '\\');
1122 bytearrayAdd(thread, runtime, dst, 'U');
1123 bytearrayAdd(thread, runtime, dst, '0');
1124 bytearrayAdd(thread, runtime, dst, '0');
1125 bytearrayAdd(thread, runtime, dst,
1126 lowerCaseHexDigit((codepoint >> 20) & 0xf));
1127 bytearrayAdd(thread, runtime, dst,
1128 lowerCaseHexDigit((codepoint >> 16) & 0xf));
1129 bytearrayAdd(thread, runtime, dst,
1130 lowerCaseHexDigit((codepoint >> 12) & 0xf));
1131 bytearrayAdd(thread, runtime, dst,
1132 lowerCaseHexDigit((codepoint >> 8) & 0xf));
1133 bytearrayAdd(thread, runtime, dst,
1134 lowerCaseHexDigit((codepoint >> 4) & 0xf));
1135 bytearrayAdd(thread, runtime, dst, lowerCaseHexDigit(codepoint & 15));
1136 }
1137 }
1138 Object output_bytes(&scope, bytearrayAsBytes(thread, dst));
1139 Object size_obj(&scope, runtime->newInt(size));
1140 return runtime->newTupleWith2(output_bytes, size_obj);
1141}
1142
1143RawObject FUNC(_codecs, _raw_unicode_escape_decode)(Thread* thread,
1144 Arguments args) {
1145 HandleScope scope(thread);
1146 Runtime* runtime = thread->runtime();
1147 Object data(&scope, args.get(0));
1148 Str errors(&scope, strUnderlying(args.get(1)));
1149 word index = intUnderlying(args.get(2)).asWord();
1150 StrArray dst(&scope, args.get(3));
1151
1152 Byteslike bytes(&scope, thread, *data);
1153 word length = bytes.length();
1154 runtime->strArrayEnsureCapacity(thread, dst, length);
1155 for (word i = index; i < length;) {
1156 const char* message = nullptr;
1157 word start_pos = i;
1158 byte ch = bytes.byteAt(i);
1159 i++;
1160 if (ch != '\\') {
1161 if (ch <= kMaxASCII) {
1162 runtime->strArrayAddASCII(thread, dst, ch);
1163 continue;
1164 }
1165 Str temp(&scope, SmallStr::fromCodePoint(ch));
1166 runtime->strArrayAddStr(thread, dst, temp);
1167 continue;
1168 }
1169 if (i >= length) {
1170 // \\ at end of string
1171 runtime->strArrayAddASCII(thread, dst, '\\');
1172 } else {
1173 int32_t decoded;
1174 ch = bytes.byteAt(i);
1175 i++;
1176 // Only care about \uXXXX and \UXXXXXXXX when decoding raw unicode.
1177 switch (ch) {
1178 // \uXXXX
1179 case 'u': {
1180 if ((decoded = decodeHexEscaped(bytes, &i, 4)) < 0) {
1181 message = (decoded == -1 ? "truncated \\uXXXX escape"
1182 : "illegal Unicode character");
1183 }
1184 break;
1185 }
1186 // \UXXXXXXXX
1187 case 'U': {
1188 if ((decoded = decodeHexEscaped(bytes, &i, 8)) < 0) {
1189 if (decoded == -1) {
1190 message = "truncated \\UXXXXXXXX escape";
1191 } else if (decoded == -2) {
1192 message = "\\Uxxxxxxxx out of range";
1193 } else {
1194 message = "illegal Unicode character";
1195 }
1196 }
1197 break;
1198 }
1199 default: {
1200 runtime->strArrayAddASCII(thread, dst, '\\');
1201 decoded = ch;
1202 }
1203 }
1204 if (decoded >= 0) {
1205 if (decoded <= kMaxASCII) {
1206 runtime->strArrayAddASCII(thread, dst, decoded);
1207 continue;
1208 }
1209 Str temp(&scope, SmallStr::fromCodePoint(decoded));
1210 runtime->strArrayAddStr(thread, dst, temp);
1211 continue;
1212 }
1213 }
1214 if (message != nullptr) {
1215 SymbolId error_id = lookupSymbolForErrorHandler(errors);
1216 switch (error_id) {
1217 case ID(replace): {
1218 Str temp(&scope, SmallStr::fromCodePoint(0xFFFD));
1219 runtime->strArrayAddStr(thread, dst, temp);
1220 break;
1221 }
1222 case ID(ignore):
1223 break;
1224 default: {
1225 Object start_pos_obj(&scope, runtime->newInt(start_pos));
1226 Object outpos_obj(&scope, runtime->newInt(i));
1227 Object message_obj(&scope, runtime->newStrFromCStr(message));
1228 return runtime->newTupleWith3(start_pos_obj, outpos_obj, message_obj);
1229 }
1230 }
1231 }
1232 }
1233 Object dst_obj(&scope, runtime->strFromStrArray(dst));
1234 Object length_obj(&scope, runtime->newInt(length));
1235 Object message_obj(&scope, runtime->newStrFromCStr(""));
1236 return runtime->newTupleWith3(dst_obj, length_obj, message_obj);
1237}
1238
1239RawObject FUNC(_codecs, backslashreplace_errors)(Thread* thread,
1240 Arguments args) {
1241 HandleScope scope(thread);
1242 Runtime* runtime = thread->runtime();
1243 Object error(&scope, args.get(0));
1244 Object object(&scope, NoneType::object());
1245 word start;
1246 word end;
1247 if (runtime->isInstanceOfUnicodeDecodeError(*error)) {
1248 UnicodeErrorBase unicode_error(&scope, *error);
1249 start = SmallInt::cast(unicode_error.start()).value();
1250 end = SmallInt::cast(unicode_error.end()).value();
1251 object = unicode_error.object();
1252 if (!runtime->isInstanceOfBytes(*object)) {
1253 return thread->raiseWithFmt(LayoutId::kTypeError,
1254 "object attribute must be bytes");
1255 }
1256 Bytes bytes(&scope, bytesUnderlying(*object));
1257 word length = bytes.length();
1258 if (start >= length) start = length - 1;
1259 if (start < 0) start = 0;
1260 if (end >= length) end = length;
1261 if (end < 1) end = 1;
1262 word result_size = end - start;
1263 if (result_size < 0) {
1264 return thread->raiseWithFmt(LayoutId::kValueError, "end before start");
1265 }
1266 result_size *= 4;
1267 MutableBytes result(&scope,
1268 runtime->newMutableBytesUninitialized(result_size));
1269 word pos = 0;
1270 for (word i = start; i < end; i++) {
1271 byte b = bytes.byteAt(i);
1272 result.byteAtPut(pos++, '\\');
1273 result.byteAtPut(pos++, 'x');
1274 uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/2, b);
1275 pos += 2;
1276 }
1277 DCHECK(pos == result.length(), "size mismatch");
1278 Object result_str(&scope, result.becomeStr());
1279 Object end_obj(&scope, SmallInt::fromWord(end));
1280 return runtime->newTupleWith2(result_str, end_obj);
1281 }
1282
1283 if (runtime->isInstanceOfUnicodeEncodeError(*error) ||
1284 runtime->isInstanceOfUnicodeTranslateError(*error)) {
1285 UnicodeErrorBase unicode_error(&scope, *error);
1286 start = SmallInt::cast(unicode_error.start()).value();
1287 end = SmallInt::cast(unicode_error.end()).value();
1288 object = unicode_error.object();
1289 if (!runtime->isInstanceOfStr(*object)) {
1290 return thread->raiseWithFmt(LayoutId::kTypeError,
1291 "object attribute must be unicode");
1292 }
1293 Str str(&scope, strUnderlying(*object));
1294
1295 if (start < 0) start = 0;
1296 if (end < 1) end = 1;
1297 if (end < start) {
1298 return thread->raiseWithFmt(LayoutId::kValueError, "end before start");
1299 }
1300 word start_byte = str.offsetByCodePoints(0, start);
1301 word end_byte = str.offsetByCodePoints(start_byte, end - start);
1302 word result_size = 0;
1303 for (word i = start_byte; i < end_byte;) {
1304 word num_bytes;
1305 int32_t cp = str.codePointAt(i, &num_bytes);
1306 i += num_bytes;
1307 if (cp > kMaxUint16) {
1308 result_size += 10; // Will replace with `\Uxxxxxxxx`
1309 } else if (cp > kMaxByte) {
1310 result_size += 6; // Will replace with `\uxxxx`
1311 } else {
1312 result_size += 4; // Will replace with `\xyy`
1313 }
1314 }
1315 MutableBytes result(&scope,
1316 runtime->newMutableBytesUninitialized(result_size));
1317 word pos = 0;
1318 for (word i = start_byte; i < end_byte;) {
1319 word num_bytes;
1320 int32_t cp = str.codePointAt(i, &num_bytes);
1321 i += num_bytes;
1322 result.byteAtPut(pos++, '\\');
1323 if (cp > kMaxUint16) {
1324 result.byteAtPut(pos++, 'U');
1325 uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/8, cp);
1326 pos += 8;
1327 } else if (cp > kMaxByte) {
1328 result.byteAtPut(pos++, 'u');
1329 uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/4, cp);
1330 pos += 4;
1331 } else {
1332 result.byteAtPut(pos++, 'x');
1333 uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/2, cp);
1334 pos += 2;
1335 }
1336 }
1337 DCHECK(pos == result.length(), "size mismatch");
1338 Object result_bytes(&scope, result.becomeStr());
1339 Object end_obj(&scope, SmallInt::fromWord(end));
1340 return runtime->newTupleWith2(result_bytes, end_obj);
1341 }
1342 return thread->raiseWithFmt(LayoutId::kTypeError,
1343 "don't know how to handle %T in error callback",
1344 &error);
1345}
1346
1347} // namespace py