this repo has no description
at trunk 1347 lines 47 kB view raw
1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com) 2#include "builtins.h" 3#include "bytearray-builtins.h" 4#include "bytes-builtins.h" 5#include "byteslike.h" 6#include "formatter-utils.h" 7#include "frame.h" 8#include "int-builtins.h" 9#include "modules.h" 10#include "runtime.h" 11#include "str-builtins.h" 12#include "unicode-db.h" 13#include "unicode.h" 14#include "utils.h" 15 16namespace py { 17 18const char kASCIIReplacement = '?'; 19 20static SymbolId lookupSymbolForErrorHandler(const Str& error) { 21 if (error.equalsCStr("strict")) { 22 return ID(strict); 23 } 24 if (error.equalsCStr("ignore")) { 25 return ID(ignore); 26 } 27 if (error.equalsCStr("replace")) { 28 return ID(replace); 29 } 30 if (error.equalsCStr("surrogateescape")) { 31 return ID(surrogateescape); 32 } 33 if (error.equalsCStr("surrogatepass")) { 34 return ID(surrogatepass); 35 } 36 return SymbolId::kInvalid; 37} 38 39static int asciiDecode(Thread* thread, const StrArray& dst, 40 const Byteslike& src, word start, word end) { 41 // TODO(T41032331): Implement a fastpass to read longs instead of chars 42 Runtime* runtime = thread->runtime(); 43 for (word i = start; i < end; i++) { 44 byte ch = src.byteAt(i); 45 if (ch > kMaxASCII) { 46 return i; 47 } 48 runtime->strArrayAddASCII(thread, dst, ch); 49 } 50 return end; 51} 52 53RawObject FUNC(_codecs, _ascii_decode)(Thread* thread, Arguments args) { 54 Runtime* runtime = thread->runtime(); 55 HandleScope scope(thread); 56 Object data(&scope, args.get(0)); 57 Str errors(&scope, strUnderlying(args.get(1))); 58 word index = intUnderlying(args.get(2)).asWord(); 59 StrArray dst(&scope, args.get(3)); 60 61 Byteslike bytes(&scope, thread, *data); 62 word length = bytes.length(); 63 runtime->strArrayEnsureCapacity(thread, dst, length); 64 word outpos = asciiDecode(thread, dst, bytes, index, length); 65 if (outpos == length) { 66 Object dst_obj(&scope, runtime->strFromStrArray(dst)); 67 Object length_obj(&scope, runtime->newInt(length)); 68 return runtime->newTupleWith2(dst_obj, length_obj); 69 } 70 71 SymbolId error_id = lookupSymbolForErrorHandler(errors); 72 while (outpos < length) { 73 byte c = bytes.byteAt(outpos); 74 if (c < 128) { 75 runtime->strArrayAddASCII(thread, dst, c); 76 ++outpos; 77 continue; 78 } 79 switch (error_id) { 80 case ID(replace): { 81 Str temp(&scope, SmallStr::fromCodePoint(0xFFFD)); 82 runtime->strArrayAddStr(thread, dst, temp); 83 ++outpos; 84 break; 85 } 86 case ID(surrogateescape): { 87 Str temp(&scope, 88 SmallStr::fromCodePoint(Unicode::kLowSurrogateStart + c)); 89 runtime->strArrayAddStr(thread, dst, temp); 90 ++outpos; 91 break; 92 } 93 case ID(ignore): 94 ++outpos; 95 break; 96 default: { 97 Object outpos1(&scope, runtime->newIntFromUnsigned(outpos)); 98 Object outpos2(&scope, runtime->newIntFromUnsigned(outpos + 1)); 99 return runtime->newTupleWith2(outpos1, outpos2); 100 } 101 } 102 } 103 Object dst_obj(&scope, runtime->strFromStrArray(dst)); 104 Object length_obj(&scope, runtime->newInt(length)); 105 return runtime->newTupleWith2(dst_obj, length_obj); 106} 107 108// CPython encodes latin1 codepoints into the low-surrogate range, and is able 109// to recover the original codepoints from those decodable surrogate points. 110static bool isEscapedLatin1Surrogate(int32_t codepoint) { 111 return (Unicode::kLowSurrogateStart + kMaxASCII) < codepoint && 112 codepoint <= (Unicode::kLowSurrogateStart + kMaxByte); 113} 114 115RawObject FUNC(_codecs, _ascii_encode)(Thread* thread, Arguments args) { 116 Runtime* runtime = thread->runtime(); 117 HandleScope scope(thread); 118 Object output_obj(&scope, args.get(3)); 119 DCHECK(runtime->isInstanceOfBytearray(*output_obj), 120 "Fourth arg to _ascii_encode must be bytearray"); 121 Str data(&scope, strUnderlying(args.get(0))); 122 Str errors(&scope, strUnderlying(args.get(1))); 123 word i = intUnderlying(args.get(2)).asWord(); 124 Bytearray output(&scope, *output_obj); 125 126 SymbolId error_symbol = lookupSymbolForErrorHandler(errors); 127 // TODO(T43252439): Optimize this by first checking whether the entire string 128 // is ASCII, and just memcpy into a string if so 129 for (word byte_offset = thread->strOffset(data, i); 130 byte_offset < data.length(); i++) { 131 word num_bytes; 132 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes); 133 byte_offset += num_bytes; 134 if (codepoint <= kMaxASCII) { 135 bytearrayAdd(thread, runtime, output, codepoint); 136 } else { 137 switch (error_symbol) { 138 case ID(ignore): 139 continue; 140 case ID(replace): 141 bytearrayAdd(thread, runtime, output, kASCIIReplacement); 142 continue; 143 case ID(surrogateescape): 144 if (isEscapedLatin1Surrogate(codepoint)) { 145 bytearrayAdd(thread, runtime, output, 146 codepoint - Unicode::kLowSurrogateStart); 147 continue; 148 } 149 break; 150 default: 151 break; 152 } 153 Object outpos1(&scope, runtime->newInt(i)); 154 while (byte_offset < data.length() && 155 data.codePointAt(byte_offset, &num_bytes) > kMaxASCII) { 156 byte_offset += num_bytes; 157 i++; 158 } 159 Object outpos2(&scope, runtime->newInt(i + 1)); 160 return runtime->newTupleWith2(outpos1, outpos2); 161 } 162 } 163 Object output_bytes(&scope, bytearrayAsBytes(thread, output)); 164 Object outpos_obj(&scope, runtime->newInt(i)); 165 return runtime->newTupleWith2(output_bytes, outpos_obj); 166} 167 168// Decodes a sequence of unicode encoded bytes into a codepoint, returns 169// -1 if no value should be written, and -2 if an error occurred. Sets the 170// iterating variable to where decoding should continue, and sets 171// invalid_escape_index if it doesn't recognize the escape sequence. 172static int32_t decodeEscaped(const Byteslike& bytes, word* i, 173 word* invalid_escape_index) { 174 word length = bytes.length(); 175 switch (byte ch = bytes.byteAt((*i)++)) { 176 // \x escapes 177 case '\n': 178 return -1; 179 case '\\': 180 case '\'': 181 case '\"': 182 return ch; 183 case 'b': 184 return '\b'; 185 case 't': 186 return '\t'; 187 case 'n': 188 return '\n'; 189 case 'r': 190 return '\r'; 191 // BEL, 192 case 'a': 193 return '\x07'; 194 // VT 195 case 'v': 196 return '\x0B'; 197 // FF 198 case 'f': 199 return '\x0C'; 200 201 // \OOO (octal) escapes 202 case '0': 203 case '1': 204 case '2': 205 case '3': 206 case '4': 207 case '5': 208 case '6': 209 case '7': { 210 word escaped = ch - '0'; 211 word octal_index = *i; 212 if (octal_index < length) { 213 word ch2 = bytes.byteAt(octal_index); 214 if ('0' <= ch2 && ch2 <= '7') { 215 escaped = (escaped << 3) + ch2 - '0'; 216 if (++octal_index < length) { 217 word ch3 = bytes.byteAt(octal_index); 218 if ('0' <= ch3 && ch3 <= '7') { 219 octal_index++; 220 escaped = (escaped << 3) + ch3 - '0'; 221 } 222 } 223 } 224 } 225 *i = octal_index; 226 return escaped; 227 } 228 229 // hex escapes 230 // \xXX 231 case 'x': { 232 word hex_index = *i; 233 if (hex_index + 1 < length) { 234 int digit1, digit2; 235 digit1 = _PyLong_DigitValue[bytes.byteAt(hex_index)]; 236 digit2 = _PyLong_DigitValue[bytes.byteAt(hex_index + 1)]; 237 if (digit1 < 16 && digit2 < 16) { 238 *i += 2; 239 return (digit1 << 4) + digit2; 240 } 241 } 242 return -2; 243 } 244 default: 245 *invalid_escape_index = *i - 1; 246 return ch; 247 } 248} 249 250RawObject FUNC(_codecs, _escape_decode)(Thread* thread, Arguments args) { 251 HandleScope scope(thread); 252 Object bytes_obj(&scope, args.get(0)); 253 Runtime* runtime = thread->runtime(); 254 if (runtime->isInstanceOfStr(*bytes_obj)) { 255 // TODO(T44739505): Make sure we can decode a str 256 UNIMPLEMENTED("_codecs.escape_decode with a str"); 257 } 258 DCHECK(runtime->isInstanceOfStr(args.get(2)), 259 "Third arg to _escape_decode must be str"); 260 Byteslike bytes(&scope, thread, *bytes_obj); 261 Str errors(&scope, strUnderlying(args.get(1))); 262 263 Bytearray dst(&scope, runtime->newBytearray()); 264 word length = bytes.length(); 265 runtime->bytearrayEnsureCapacity(thread, dst, length); 266 word first_invalid_escape_index = -1; 267 for (word i = 0; i < length;) { 268 byte ch = bytes.byteAt(i++); 269 if (ch != '\\') { 270 // TODO(T45134397): Support the recode_encoding parameter 271 if (ch <= kMaxASCII) { 272 bytearrayAdd(thread, runtime, dst, ch); 273 continue; 274 } 275 Str temp(&scope, SmallStr::fromCodePoint(ch)); 276 bytearrayAdd(thread, runtime, dst, temp.byteAt(0)); 277 bytearrayAdd(thread, runtime, dst, temp.byteAt(1)); 278 continue; 279 } 280 if (i >= length) { 281 return runtime->newStrFromCStr("Trailing \\ in string"); 282 } 283 word invalid_escape_index = -1; 284 int32_t decoded = decodeEscaped(bytes, &i, &invalid_escape_index); 285 if (invalid_escape_index != -1) { 286 bytearrayAdd(thread, runtime, dst, '\\'); 287 if (first_invalid_escape_index == -1) { 288 first_invalid_escape_index = invalid_escape_index; 289 } 290 } 291 if (decoded >= 0) { 292 bytearrayAdd(thread, runtime, dst, decoded); 293 continue; 294 } 295 if (decoded == -1) { 296 continue; 297 } 298 SymbolId error_id = lookupSymbolForErrorHandler(errors); 299 switch (error_id) { 300 case ID(strict): 301 return runtime->newStrFromFmt("invalid \\x escape at position %d", 302 i - 2); 303 case ID(replace): { 304 bytearrayAdd(thread, runtime, dst, '?'); 305 break; 306 } 307 case ID(ignore): 308 break; 309 default: 310 return runtime->newStrFromFmt( 311 "decoding error; unknown error handling code: %S", &errors); 312 } 313 if (i < length && Byte::isHexDigit(bytes.byteAt(i))) { 314 i++; 315 } 316 } 317 Object dst_obj(&scope, bytearrayAsBytes(thread, dst)); 318 Object length_obj(&scope, runtime->newInt(length)); 319 Object escape_obj(&scope, runtime->newInt(first_invalid_escape_index)); 320 return runtime->newTupleWith3(dst_obj, length_obj, escape_obj); 321} 322 323RawObject FUNC(_codecs, _latin_1_decode)(Thread* thread, Arguments args) { 324 Runtime* runtime = thread->runtime(); 325 HandleScope scope(thread); 326 Object data(&scope, args.get(0)); 327 StrArray array(&scope, runtime->newStrArray()); 328 word length; 329 Byteslike bytes(&scope, thread, *data); 330 length = bytes.length(); 331 runtime->strArrayEnsureCapacity(thread, array, length); 332 // First, try a quick ASCII decoding 333 word num_bytes = asciiDecode(thread, array, bytes, 0, length); 334 if (num_bytes != length) { 335 // A non-ASCII character was found; switch to a Latin-1 decoding for the 336 // remainder of the input sequence 337 for (word i = num_bytes; i < length; ++i) { 338 byte code_point = bytes.byteAt(i); 339 if (code_point <= kMaxASCII) { 340 runtime->strArrayAddASCII(thread, array, code_point); 341 } else { 342 runtime->strArrayAddCodePoint(thread, array, code_point); 343 } 344 } 345 } 346 Object array_str(&scope, runtime->strFromStrArray(array)); 347 Object length_obj(&scope, runtime->newInt(length)); 348 return runtime->newTupleWith2(array_str, length_obj); 349} 350 351RawObject FUNC(_codecs, _latin_1_encode)(Thread* thread, Arguments args) { 352 Runtime* runtime = thread->runtime(); 353 HandleScope scope(thread); 354 Object output_obj(&scope, args.get(3)); 355 DCHECK(runtime->isInstanceOfBytearray(*output_obj), 356 "Fourth arg to _latin_1_encode must be bytearray"); 357 Str data(&scope, strUnderlying(args.get(0))); 358 Str errors(&scope, strUnderlying(args.get(1))); 359 word i = intUnderlying(args.get(2)).asWord(); 360 Bytearray output(&scope, *output_obj); 361 362 SymbolId error_symbol = lookupSymbolForErrorHandler(errors); 363 for (word byte_offset = thread->strOffset(data, i); 364 byte_offset < data.length(); i++) { 365 word num_bytes; 366 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes); 367 byte_offset += num_bytes; 368 if (codepoint <= kMaxByte) { 369 bytearrayAdd(thread, runtime, output, codepoint); 370 } else { 371 switch (error_symbol) { 372 case ID(ignore): 373 continue; 374 case ID(replace): 375 bytearrayAdd(thread, runtime, output, kASCIIReplacement); 376 continue; 377 case ID(surrogateescape): 378 if (isEscapedLatin1Surrogate(codepoint)) { 379 bytearrayAdd(thread, runtime, output, 380 codepoint - Unicode::kLowSurrogateStart); 381 continue; 382 } 383 break; 384 default: 385 break; 386 } 387 Object outpos1(&scope, runtime->newInt(i)); 388 while (byte_offset < data.length() && 389 data.codePointAt(byte_offset, &num_bytes) > kMaxByte) { 390 byte_offset += num_bytes; 391 i++; 392 } 393 Object outpos2(&scope, runtime->newInt(i + 1)); 394 return runtime->newTupleWith2(outpos1, outpos2); 395 } 396 } 397 Object output_bytes(&scope, bytearrayAsBytes(thread, output)); 398 Object outpos(&scope, runtime->newInt(i)); 399 return runtime->newTupleWith2(output_bytes, outpos); 400} 401 402// Decodes a sequence of hexadecimal encoded bytes into a codepoint or returns 403// a negative value if the value could not be decoded. Sets the start variable 404// to where decoding should continue. 405static int32_t decodeHexEscaped(const Byteslike& bytes, word* start, 406 word count) { 407 DCHECK_BOUND(count, 8); 408 word result = 0; 409 word i = *start; 410 for (word len = bytes.length(); i < len && count != 0; i++, count--) { 411 byte ch = bytes.byteAt(i); 412 result <<= 4; 413 if (ch >= '0' && ch <= '9') { 414 result += ch - '0'; 415 } else if (ch >= 'a' && ch <= 'f') { 416 result += ch - ('a' - 10); 417 } else if (ch >= 'A' && ch <= 'F') { 418 result += ch - ('A' - 10); 419 } else { 420 break; // not a hexadecimal digit, stop reading 421 } 422 } 423 *start = i; 424 if (count != 0) { 425 return -1; 426 } 427 // if count is 4, result could be a 32-bit unicode character 428 if (result > kMaxUnicode) { 429 return -2; 430 } 431 return result; 432} 433 434// Decodes a sequence of unicode encoded bytes into a codepoint or returns 435// a negative value if no value should be written. Sets the iterating variable 436// to where decoding should continue, sets invalid_escape_index if it doesn't 437// recognize the escape sequence, and sets error_message if an error occurred. 438static int32_t decodeUnicodeEscaped(const Byteslike& bytes, word* i, 439 word* invalid_escape_index, 440 const char** error_message) { 441 switch (byte ch = bytes.byteAt((*i)++)) { 442 // \x escapes 443 case '\n': 444 return -1; 445 case '\\': 446 case '\'': 447 case '\"': 448 return ch; 449 case 'b': 450 return '\b'; 451 case 't': 452 return '\t'; 453 case 'n': 454 return '\n'; 455 case 'r': 456 return '\r'; 457 // BEL 458 case 'a': 459 return '\007'; 460 // FF 461 case 'f': 462 return '\014'; 463 // VT 464 case 'v': 465 return '\013'; 466 467 // \OOO (octal) escapes 468 case '0': 469 case '1': 470 case '2': 471 case '3': 472 case '4': 473 case '5': 474 case '6': 475 case '7': { 476 word escaped = ch - '0'; 477 word octal_index = *i; 478 word length = bytes.length(); 479 if (octal_index < length) { 480 word ch2 = bytes.byteAt(octal_index); 481 if ('0' <= ch2 && ch2 <= '7') { 482 escaped = (escaped << 3) + ch2 - '0'; 483 if (++octal_index < length) { 484 word ch3 = bytes.byteAt(octal_index); 485 if ('0' <= ch3 && ch3 <= '7') { 486 octal_index++; 487 escaped = (escaped << 3) + ch3 - '0'; 488 } 489 } 490 } 491 } 492 *i = octal_index; 493 return escaped; 494 } 495 496 // hex escapes 497 // \xXX 498 case 'x': { 499 word escaped; 500 if ((escaped = decodeHexEscaped(bytes, i, 2)) < 0) { 501 *error_message = (escaped == -1 ? "truncated \\xXX escape" 502 : "illegal Unicode character"); 503 return -1; 504 } 505 return escaped; 506 } 507 508 // \uXXXX 509 case 'u': { 510 word escaped; 511 if ((escaped = decodeHexEscaped(bytes, i, 4)) < 0) { 512 *error_message = (escaped == -1 ? "truncated \\uXXXX escape" 513 : "illegal Unicode character"); 514 return -1; 515 } 516 return escaped; 517 } 518 519 // \UXXXXXXXX 520 case 'U': { 521 word escaped; 522 if ((escaped = decodeHexEscaped(bytes, i, 8)) < 0) { 523 *error_message = (escaped == -1 ? "truncated \\uXXXXXXXX escape" 524 : "illegal Unicode character"); 525 return -1; 526 } 527 return escaped; 528 } 529 530 // \N{name} 531 case 'N': { 532 *error_message = "malformed \\N character escape"; 533 word length = bytes.length(); 534 if (*i >= length || bytes.byteAt(*i) != '{') { 535 return -1; 536 } 537 word start = ++(*i); 538 while (*i < length && bytes.byteAt(*i) != '}') { 539 *i += 1; 540 } 541 word size = *i - start; 542 if (size == 0 || *i == length) { 543 return -1; 544 } 545 *i += 1; 546 *error_message = "unknown Unicode character name"; 547 548 unique_c_ptr<byte> buffer(reinterpret_cast<byte*>(std::malloc(size))); 549 bytes.copyToStartAt(buffer.get(), size, start); 550 return codePointFromName(buffer.get(), size); 551 } 552 553 default: { 554 *invalid_escape_index = *i - 1; 555 return ch; 556 } 557 } 558} 559 560RawObject FUNC(_codecs, _unicode_escape_decode)(Thread* thread, 561 Arguments args) { 562 HandleScope scope(thread); 563 Runtime* runtime = thread->runtime(); 564 Object data(&scope, args.get(0)); 565 Str errors(&scope, strUnderlying(args.get(1))); 566 word index = intUnderlying(args.get(2)).asWord(); 567 StrArray dst(&scope, args.get(3)); 568 569 Byteslike bytes(&scope, thread, *data); 570 word length = bytes.length(); 571 runtime->strArrayEnsureCapacity(thread, dst, length); 572 word first_invalid_escape_index = -1; 573 for (word i = index; i < length;) { 574 const char* message = nullptr; 575 word start_pos = i; 576 byte ch = bytes.byteAt(i++); 577 if (ch != '\\') { 578 if (ch <= kMaxASCII) { 579 runtime->strArrayAddASCII(thread, dst, ch); 580 continue; 581 } 582 Str temp(&scope, SmallStr::fromCodePoint(ch)); 583 runtime->strArrayAddStr(thread, dst, temp); 584 continue; 585 } 586 if (i >= length) { 587 message = "\\ at end of string"; 588 } else { 589 word invalid_escape_index = -1; 590 int32_t decoded = 591 decodeUnicodeEscaped(bytes, &i, &invalid_escape_index, &message); 592 if (invalid_escape_index != -1) { 593 runtime->strArrayAddASCII(thread, dst, '\\'); 594 if (first_invalid_escape_index == -1) { 595 first_invalid_escape_index = invalid_escape_index; 596 } 597 } 598 if (decoded != -1) { 599 if (decoded <= kMaxASCII) { 600 runtime->strArrayAddASCII(thread, dst, decoded); 601 continue; 602 } 603 Str temp(&scope, SmallStr::fromCodePoint(decoded)); 604 runtime->strArrayAddStr(thread, dst, temp); 605 continue; 606 } 607 } 608 if (message != nullptr) { 609 SymbolId error_id = lookupSymbolForErrorHandler(errors); 610 switch (error_id) { 611 case ID(replace): { 612 Str temp(&scope, SmallStr::fromCodePoint(0xFFFD)); 613 runtime->strArrayAddStr(thread, dst, temp); 614 break; 615 } 616 case ID(ignore): 617 break; 618 default: { 619 Object start_pos_obj(&scope, runtime->newInt(start_pos)); 620 Object outpos_obj(&scope, runtime->newInt(i)); 621 Object message_obj(&scope, runtime->newStrFromCStr(message)); 622 Object escape_obj(&scope, 623 runtime->newInt(first_invalid_escape_index)); 624 return runtime->newTupleWith4(start_pos_obj, outpos_obj, message_obj, 625 escape_obj); 626 } 627 } 628 } 629 } 630 Object dst_obj(&scope, runtime->strFromStrArray(dst)); 631 Object length_obj(&scope, runtime->newInt(length)); 632 Object message_obj(&scope, runtime->newStrFromCStr("")); 633 Object escape_obj(&scope, runtime->newInt(first_invalid_escape_index)); 634 return runtime->newTupleWith4(dst_obj, length_obj, message_obj, escape_obj); 635} 636 637enum Utf8DecoderResult { 638 k1Byte = 1, 639 k2Byte = 2, 640 k3Byte = 3, 641 k4Byte = 4, 642 kInvalidStart = 0, 643 kInvalidContinuation1 = -1, 644 kInvalidContinuation2 = -2, 645 kInvalidContinuation3 = -3, 646 kUnexpectedEndOfData = -4, 647}; 648 649// This functionality is taken mostly from CPython: 650// Objects/stringlib/codecs.h::utf8_decode 651// This does error checking to ensure well-formedness of the passed in UTF-8 652// bytes, and returns the number of bytes of the codepoint at `index` as a 653// Utf8DecoderResult enum value. 654// Since this is supposed to work as an incremental decoder as well, this 655// function returns specific values for errors to determine whether they could 656// be caused by incremental decoding, or if they would be an error no matter 657// what other bytes might be streamed in later. 658static Utf8DecoderResult isValidUtf8Codepoint(const Byteslike& bytes, 659 word index) { 660 word length = bytes.length(); 661 byte ch = bytes.byteAt(index); 662 if (ch <= kMaxASCII) { 663 return k1Byte; 664 } 665 if (ch < 0xE0) { 666 // \xC2\x80-\xDF\xBF -- 0080-07FF 667 if (ch < 0xC2) { 668 // invalid sequence 669 // \x80-\xBF -- continuation byte 670 // \xC0-\xC1 -- fake 0000-007F 671 return kInvalidStart; 672 } 673 if (index + 1 >= length) { 674 return kUnexpectedEndOfData; 675 } 676 if (!UTF8::isTrailByte(bytes.byteAt(index + 1))) { 677 return kInvalidContinuation1; 678 } 679 return k2Byte; 680 } 681 if (ch < 0xF0) { 682 // \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF 683 if (index + 2 >= length) { 684 if (index + 1 >= length) { 685 return kUnexpectedEndOfData; 686 } 687 byte ch2 = bytes.byteAt(index + 1); 688 if (!UTF8::isTrailByte(ch2) || (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) { 689 return kInvalidContinuation1; 690 } 691 return kUnexpectedEndOfData; 692 } 693 byte ch2 = bytes.byteAt(index + 1); 694 if (!UTF8::isTrailByte(ch2)) { 695 return kInvalidContinuation1; 696 } 697 if (ch == 0xE0) { 698 if (ch2 < 0xA0) { 699 // invalid sequence 700 // \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 701 return kInvalidContinuation1; 702 } 703 } else if (ch == 0xED && ch2 >= 0xA0) { 704 // Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF 705 // will result in surrogates in range D800-DFFF. Surrogates are 706 // not valid UTF-8 so they are rejected. 707 // See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf 708 // (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt 709 return kInvalidContinuation1; 710 } 711 if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) { 712 return kInvalidContinuation2; 713 } 714 return k3Byte; 715 } 716 if (ch < 0xF5) { 717 // \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF 718 if (index + 3 >= length) { 719 if (index + 1 >= length) { 720 return kUnexpectedEndOfData; 721 } 722 byte ch2 = bytes.byteAt(index + 1); 723 if (!UTF8::isTrailByte(ch2) || (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) { 724 return kInvalidContinuation1; 725 } 726 if (index + 2 >= length) { 727 return kUnexpectedEndOfData; 728 } 729 if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) { 730 return kInvalidContinuation2; 731 } 732 return kUnexpectedEndOfData; 733 } 734 byte ch2 = bytes.byteAt(index + 1); 735 if (!UTF8::isTrailByte(ch2)) { 736 return kInvalidContinuation1; 737 } 738 if (ch == 0xF0) { 739 if (ch2 < 0x90) { 740 // invalid sequence 741 // \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF 742 return kInvalidContinuation1; 743 } 744 } else if (ch == 0xF4 && ch2 >= 0x90) { 745 // invalid sequence 746 // \xF4\x90\x80\80- -- 110000- overflow 747 return kInvalidContinuation1; 748 } 749 if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) { 750 return kInvalidContinuation2; 751 } 752 if (!UTF8::isTrailByte(bytes.byteAt(index + 3))) { 753 return kInvalidContinuation3; 754 } 755 return k4Byte; 756 } 757 return kInvalidStart; 758} 759 760RawObject FUNC(_codecs, _utf_8_decode)(Thread* thread, Arguments args) { 761 Runtime* runtime = thread->runtime(); 762 HandleScope scope(thread); 763 Object final_obj(&scope, args.get(4)); 764 DCHECK(final_obj.isBool(), "Fifth arg to _utf_8_decode must be bool"); 765 Object data(&scope, args.get(0)); 766 Str errors(&scope, strUnderlying(args.get(1))); 767 word index = intUnderlying(args.get(2)).asWord(); 768 StrArray dst(&scope, args.get(3)); 769 770 word length; 771 Byteslike bytes(&scope, thread, *data); 772 length = bytes.length(); 773 runtime->strArrayEnsureCapacity(thread, dst, length); 774 word i = asciiDecode(thread, dst, bytes, index, length); 775 if (i == length) { 776 Object dst_obj(&scope, runtime->strFromStrArray(dst)); 777 Object length_obj(&scope, runtime->newInt(length)); 778 Object message_obj(&scope, runtime->newStrFromCStr("")); 779 return runtime->newTupleWith3(dst_obj, length_obj, message_obj); 780 } 781 782 SymbolId error_id = lookupSymbolForErrorHandler(errors); 783 bool is_final = Bool::cast(*final_obj).value(); 784 while (i < length) { 785 // TODO(T41032331): Scan for non-ASCII characters by words instead of chars 786 Utf8DecoderResult validator_result = isValidUtf8Codepoint(bytes, i); 787 if (validator_result >= k1Byte) { 788 byte codepoint[4] = {0}; 789 for (int codeunit = 0; codeunit + 1 <= validator_result; codeunit++) { 790 codepoint[codeunit] = bytes.byteAt(i + codeunit); 791 } 792 i += validator_result; 793 Str temp(&scope, 794 runtime->newStrWithAll(View<byte>{codepoint, validator_result})); 795 runtime->strArrayAddStr(thread, dst, temp); 796 continue; 797 } 798 if (validator_result != kInvalidStart && !is_final) { 799 break; 800 } 801 word error_end = i; 802 const char* error_message = nullptr; 803 switch (validator_result) { 804 case kInvalidStart: 805 error_end += 1; 806 error_message = "invalid start byte"; 807 break; 808 case kInvalidContinuation1: 809 case kInvalidContinuation2: 810 case kInvalidContinuation3: 811 error_end -= validator_result; 812 error_message = "invalid continuation byte"; 813 break; 814 case kUnexpectedEndOfData: 815 error_end = length; 816 error_message = "unexpected end of data"; 817 break; 818 default: 819 UNREACHABLE( 820 "valid utf-8 codepoints should have been decoded by this point"); 821 } 822 switch (error_id) { 823 case ID(replace): { 824 Str temp(&scope, SmallStr::fromCodePoint(kReplacementCharacter)); 825 runtime->strArrayAddStr(thread, dst, temp); 826 i = error_end; 827 break; 828 } 829 case ID(surrogateescape): { 830 for (; i < error_end; ++i) { 831 Str temp(&scope, SmallStr::fromCodePoint(Unicode::kLowSurrogateStart + 832 bytes.byteAt(i))); 833 runtime->strArrayAddStr(thread, dst, temp); 834 } 835 break; 836 } 837 case ID(ignore): 838 i = error_end; 839 break; 840 default: { 841 Object outpos_obj(&scope, runtime->newInt(i)); 842 Object error_end_obj(&scope, runtime->newInt(error_end)); 843 Object message_obj(&scope, runtime->newStrFromCStr(error_message)); 844 return runtime->newTupleWith3(outpos_obj, error_end_obj, message_obj); 845 } 846 } 847 } 848 Object dst_obj(&scope, runtime->strFromStrArray(dst)); 849 Object outpos_obj(&scope, runtime->newInt(i)); 850 Object message_obj(&scope, Str::empty()); 851 return runtime->newTupleWith3(dst_obj, outpos_obj, message_obj); 852} 853 854RawObject FUNC(_codecs, _utf_8_encode)(Thread* thread, Arguments args) { 855 Runtime* runtime = thread->runtime(); 856 HandleScope scope(thread); 857 Object output_obj(&scope, args.get(3)); 858 DCHECK(runtime->isInstanceOfBytearray(*output_obj), 859 "Fourth arg to _utf_8_encode must be bytearray"); 860 Str data(&scope, strUnderlying(args.get(0))); 861 Str errors(&scope, strUnderlying(args.get(1))); 862 word index = intUnderlying(args.get(2)).asWord(); 863 Bytearray output(&scope, *output_obj); 864 865 SymbolId error_symbol = lookupSymbolForErrorHandler(errors); 866 for (word byte_offset = thread->strOffset(data, index); 867 byte_offset < data.length(); index++) { 868 word num_bytes; 869 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes); 870 byte_offset += num_bytes; 871 if (!Unicode::isSurrogate(codepoint)) { 872 for (word j = byte_offset - num_bytes; j < byte_offset; j++) { 873 bytearrayAdd(thread, runtime, output, data.byteAt(j)); 874 } 875 } else { 876 switch (error_symbol) { 877 case ID(ignore): 878 continue; 879 case ID(replace): 880 bytearrayAdd(thread, runtime, output, kASCIIReplacement); 881 continue; 882 case ID(surrogateescape): 883 if (isEscapedLatin1Surrogate(codepoint)) { 884 bytearrayAdd(thread, runtime, output, 885 codepoint - Unicode::kLowSurrogateStart); 886 continue; 887 } 888 break; 889 case ID(surrogatepass): 890 if (Unicode::isSurrogate(codepoint)) { 891 bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 3)); 892 bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 2)); 893 bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 1)); 894 continue; 895 } 896 break; 897 default: 898 break; 899 } 900 Object outpos1(&scope, runtime->newInt(index)); 901 while (byte_offset < data.length() && 902 Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) { 903 byte_offset += num_bytes; 904 index++; 905 } 906 Object outpos2(&scope, runtime->newInt(index + 1)); 907 return runtime->newTupleWith2(outpos1, outpos2); 908 } 909 } 910 Object output_bytes(&scope, bytearrayAsBytes(thread, output)); 911 Object index_obj(&scope, runtime->newInt(index)); 912 return runtime->newTupleWith2(output_bytes, index_obj); 913} 914 915static void appendUtf16ToBytearray(Thread* thread, Runtime* runtime, 916 const Bytearray& writer, int32_t codepoint, 917 endian endianness) { 918 if (endianness == endian::little) { 919 bytearrayAdd(thread, runtime, writer, codepoint); 920 bytearrayAdd(thread, runtime, writer, codepoint >> kBitsPerByte); 921 } else { 922 bytearrayAdd(thread, runtime, writer, codepoint >> kBitsPerByte); 923 bytearrayAdd(thread, runtime, writer, codepoint); 924 } 925} 926 927RawObject FUNC(_codecs, _utf_16_encode)(Thread* thread, Arguments args) { 928 Runtime* runtime = thread->runtime(); 929 HandleScope scope(thread); 930 Object output_obj(&scope, args.get(3)); 931 DCHECK(runtime->isInstanceOfBytearray(*output_obj), 932 "Fourth arg to _utf_16_encode must be bytearray"); 933 Str data(&scope, strUnderlying(args.get(0))); 934 Str errors(&scope, strUnderlying(args.get(1))); 935 word index = intUnderlying(args.get(2)).asWord(); 936 Bytearray output(&scope, *output_obj); 937 OptInt<int32_t> byteorder = intUnderlying(args.get(4)).asInt<int32_t>(); 938 if (byteorder.error != CastError::None) { 939 return thread->raiseWithFmt(LayoutId::kOverflowError, 940 "Python int too large to convert to C int"); 941 } 942 943 SymbolId error_id = lookupSymbolForErrorHandler(errors); 944 for (word byte_offset = thread->strOffset(data, index); 945 byte_offset < data.length(); index++) { 946 endian endianness = byteorder.value <= 0 ? endian::little : endian::big; 947 word num_bytes; 948 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes); 949 byte_offset += num_bytes; 950 if (!Unicode::isSurrogate(codepoint)) { 951 if (codepoint < Unicode::kHighSurrogateStart) { 952 appendUtf16ToBytearray(thread, runtime, output, codepoint, endianness); 953 } else { 954 appendUtf16ToBytearray(thread, runtime, output, 955 Unicode::highSurrogateFor(codepoint), 956 endianness); 957 appendUtf16ToBytearray(thread, runtime, output, 958 Unicode::lowSurrogateFor(codepoint), endianness); 959 } 960 } else { 961 switch (error_id) { 962 case ID(ignore): 963 continue; 964 case ID(replace): 965 appendUtf16ToBytearray(thread, runtime, output, kASCIIReplacement, 966 endianness); 967 continue; 968 case ID(surrogateescape): 969 if (isEscapedLatin1Surrogate(codepoint)) { 970 appendUtf16ToBytearray(thread, runtime, output, 971 codepoint - Unicode::kLowSurrogateStart, 972 endianness); 973 continue; 974 } 975 break; 976 default: 977 break; 978 } 979 Object outpos1(&scope, runtime->newInt(index)); 980 while (byte_offset < data.length() && 981 Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) { 982 byte_offset += num_bytes; 983 index++; 984 } 985 Object outpos2(&scope, runtime->newInt(index + 1)); 986 return runtime->newTupleWith2(outpos1, outpos2); 987 } 988 } 989 Object output_bytes(&scope, bytearrayAsBytes(thread, output)); 990 Object index_obj(&scope, runtime->newInt(index)); 991 return runtime->newTupleWith2(output_bytes, index_obj); 992} 993 994static void appendUtf32ToBytearray(Thread* thread, Runtime* runtime, 995 const Bytearray& writer, int32_t codepoint, 996 endian endianness) { 997 if (endianness == endian::little) { 998 bytearrayAdd(thread, runtime, writer, codepoint); 999 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte)); 1000 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 2)); 1001 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 3)); 1002 } else { 1003 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 3)); 1004 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 2)); 1005 bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte)); 1006 bytearrayAdd(thread, runtime, writer, codepoint); 1007 } 1008} 1009 1010RawObject FUNC(_codecs, _utf_32_encode)(Thread* thread, Arguments args) { 1011 Runtime* runtime = thread->runtime(); 1012 HandleScope scope(thread); 1013 Object output_obj(&scope, args.get(3)); 1014 DCHECK(runtime->isInstanceOfBytearray(*output_obj), 1015 "Fourth arg to _utf_32_encode must be bytearray"); 1016 Str data(&scope, strUnderlying(args.get(0))); 1017 Str errors(&scope, strUnderlying(args.get(1))); 1018 word index = intUnderlying(args.get(2)).asWord(); 1019 Bytearray output(&scope, *output_obj); 1020 OptInt<int32_t> byteorder = intUnderlying(args.get(4)).asInt<int32_t>(); 1021 if (byteorder.error != CastError::None) { 1022 return thread->raiseWithFmt(LayoutId::kOverflowError, 1023 "Python int too large to convert to C int"); 1024 } 1025 1026 SymbolId error_id = lookupSymbolForErrorHandler(errors); 1027 for (word byte_offset = thread->strOffset(data, index); 1028 byte_offset < data.length(); index++) { 1029 endian endianness = byteorder.value <= 0 ? endian::little : endian::big; 1030 word num_bytes; 1031 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes); 1032 byte_offset += num_bytes; 1033 if (!Unicode::isSurrogate(codepoint)) { 1034 appendUtf32ToBytearray(thread, runtime, output, codepoint, endianness); 1035 } else { 1036 switch (error_id) { 1037 case ID(ignore): 1038 continue; 1039 case ID(replace): 1040 appendUtf32ToBytearray(thread, runtime, output, kASCIIReplacement, 1041 endianness); 1042 continue; 1043 case ID(surrogateescape): 1044 if (isEscapedLatin1Surrogate(codepoint)) { 1045 appendUtf32ToBytearray(thread, runtime, output, 1046 codepoint - Unicode::kLowSurrogateStart, 1047 endianness); 1048 continue; 1049 } 1050 break; 1051 default: 1052 break; 1053 } 1054 Object outpos1(&scope, runtime->newInt(index)); 1055 while (byte_offset < data.length() && 1056 Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) { 1057 byte_offset += num_bytes; 1058 index++; 1059 } 1060 Object outpos2(&scope, runtime->newInt(index + 1)); 1061 return runtime->newTupleWith2(outpos1, outpos2); 1062 } 1063 } 1064 Object output_bytes(&scope, bytearrayAsBytes(thread, output)); 1065 Object index_obj(&scope, runtime->newInt(index)); 1066 return runtime->newTupleWith2(output_bytes, index_obj); 1067} 1068 1069// Takes a Bytearray and a Str object, and appends each byte in the Str to the 1070// Bytearray one by one 1071RawObject FUNC(_codecs, _bytearray_string_append)(Thread* thread, 1072 Arguments args) { 1073 HandleScope scope(thread); 1074 Bytearray dst(&scope, args.get(0)); 1075 Str data(&scope, args.get(1)); 1076 for (word i = 0; i < data.length(); ++i) { 1077 bytearrayAdd(thread, thread->runtime(), dst, data.byteAt(i)); 1078 } 1079 return NoneType::object(); 1080} 1081 1082RawObject FUNC(_codecs, _raw_unicode_escape_encode)(Thread* thread, 1083 Arguments args) { 1084 HandleScope scope(thread); 1085 Runtime* runtime = thread->runtime(); 1086 Str data(&scope, strUnderlying(args.get(0))); 1087 word size = data.codePointLength(); 1088 Bytearray dst(&scope, runtime->newBytearray()); 1089 word length = data.length(); 1090 1091 // 2 byte codepoints can be expanded to 4 bytes + 2 escape characters 1092 // 4 byte codepoints well be expanded to 8 bytes + 2 escape characters 1093 // To be safe we double the bytecount and add space for 2 escape characters 1094 // per codepoint. 1095 word expanded_size = length * 2 + size * 2; 1096 runtime->bytearrayEnsureCapacity(thread, dst, expanded_size); 1097 word num_bytes; 1098 for (word index = 0, byte_offset = thread->strOffset(data, index); 1099 byte_offset < data.length(); index++) { 1100 int32_t codepoint = data.codePointAt(byte_offset, &num_bytes); 1101 byte_offset += num_bytes; 1102 // U+0000-U+00ff range: Copy 8-bit characters as-is 1103 if (codepoint <= kMaxByte) { 1104 bytearrayAdd(thread, runtime, dst, codepoint); 1105 } 1106 // U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' 1107 else if (codepoint <= kMaxUint16) { 1108 bytearrayAdd(thread, runtime, dst, '\\'); 1109 bytearrayAdd(thread, runtime, dst, 'u'); 1110 bytearrayAdd(thread, runtime, dst, 1111 lowerCaseHexDigit((codepoint >> 12) & 0xf)); 1112 bytearrayAdd(thread, runtime, dst, 1113 lowerCaseHexDigit((codepoint >> 8) & 0xf)); 1114 bytearrayAdd(thread, runtime, dst, 1115 lowerCaseHexDigit((codepoint >> 4) & 0xf)); 1116 bytearrayAdd(thread, runtime, dst, lowerCaseHexDigit(codepoint & 15)); 1117 } 1118 // U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' 1119 else { 1120 CHECK(codepoint <= kMaxUnicode, "expected a valid unicode code point"); 1121 bytearrayAdd(thread, runtime, dst, '\\'); 1122 bytearrayAdd(thread, runtime, dst, 'U'); 1123 bytearrayAdd(thread, runtime, dst, '0'); 1124 bytearrayAdd(thread, runtime, dst, '0'); 1125 bytearrayAdd(thread, runtime, dst, 1126 lowerCaseHexDigit((codepoint >> 20) & 0xf)); 1127 bytearrayAdd(thread, runtime, dst, 1128 lowerCaseHexDigit((codepoint >> 16) & 0xf)); 1129 bytearrayAdd(thread, runtime, dst, 1130 lowerCaseHexDigit((codepoint >> 12) & 0xf)); 1131 bytearrayAdd(thread, runtime, dst, 1132 lowerCaseHexDigit((codepoint >> 8) & 0xf)); 1133 bytearrayAdd(thread, runtime, dst, 1134 lowerCaseHexDigit((codepoint >> 4) & 0xf)); 1135 bytearrayAdd(thread, runtime, dst, lowerCaseHexDigit(codepoint & 15)); 1136 } 1137 } 1138 Object output_bytes(&scope, bytearrayAsBytes(thread, dst)); 1139 Object size_obj(&scope, runtime->newInt(size)); 1140 return runtime->newTupleWith2(output_bytes, size_obj); 1141} 1142 1143RawObject FUNC(_codecs, _raw_unicode_escape_decode)(Thread* thread, 1144 Arguments args) { 1145 HandleScope scope(thread); 1146 Runtime* runtime = thread->runtime(); 1147 Object data(&scope, args.get(0)); 1148 Str errors(&scope, strUnderlying(args.get(1))); 1149 word index = intUnderlying(args.get(2)).asWord(); 1150 StrArray dst(&scope, args.get(3)); 1151 1152 Byteslike bytes(&scope, thread, *data); 1153 word length = bytes.length(); 1154 runtime->strArrayEnsureCapacity(thread, dst, length); 1155 for (word i = index; i < length;) { 1156 const char* message = nullptr; 1157 word start_pos = i; 1158 byte ch = bytes.byteAt(i); 1159 i++; 1160 if (ch != '\\') { 1161 if (ch <= kMaxASCII) { 1162 runtime->strArrayAddASCII(thread, dst, ch); 1163 continue; 1164 } 1165 Str temp(&scope, SmallStr::fromCodePoint(ch)); 1166 runtime->strArrayAddStr(thread, dst, temp); 1167 continue; 1168 } 1169 if (i >= length) { 1170 // \\ at end of string 1171 runtime->strArrayAddASCII(thread, dst, '\\'); 1172 } else { 1173 int32_t decoded; 1174 ch = bytes.byteAt(i); 1175 i++; 1176 // Only care about \uXXXX and \UXXXXXXXX when decoding raw unicode. 1177 switch (ch) { 1178 // \uXXXX 1179 case 'u': { 1180 if ((decoded = decodeHexEscaped(bytes, &i, 4)) < 0) { 1181 message = (decoded == -1 ? "truncated \\uXXXX escape" 1182 : "illegal Unicode character"); 1183 } 1184 break; 1185 } 1186 // \UXXXXXXXX 1187 case 'U': { 1188 if ((decoded = decodeHexEscaped(bytes, &i, 8)) < 0) { 1189 if (decoded == -1) { 1190 message = "truncated \\UXXXXXXXX escape"; 1191 } else if (decoded == -2) { 1192 message = "\\Uxxxxxxxx out of range"; 1193 } else { 1194 message = "illegal Unicode character"; 1195 } 1196 } 1197 break; 1198 } 1199 default: { 1200 runtime->strArrayAddASCII(thread, dst, '\\'); 1201 decoded = ch; 1202 } 1203 } 1204 if (decoded >= 0) { 1205 if (decoded <= kMaxASCII) { 1206 runtime->strArrayAddASCII(thread, dst, decoded); 1207 continue; 1208 } 1209 Str temp(&scope, SmallStr::fromCodePoint(decoded)); 1210 runtime->strArrayAddStr(thread, dst, temp); 1211 continue; 1212 } 1213 } 1214 if (message != nullptr) { 1215 SymbolId error_id = lookupSymbolForErrorHandler(errors); 1216 switch (error_id) { 1217 case ID(replace): { 1218 Str temp(&scope, SmallStr::fromCodePoint(0xFFFD)); 1219 runtime->strArrayAddStr(thread, dst, temp); 1220 break; 1221 } 1222 case ID(ignore): 1223 break; 1224 default: { 1225 Object start_pos_obj(&scope, runtime->newInt(start_pos)); 1226 Object outpos_obj(&scope, runtime->newInt(i)); 1227 Object message_obj(&scope, runtime->newStrFromCStr(message)); 1228 return runtime->newTupleWith3(start_pos_obj, outpos_obj, message_obj); 1229 } 1230 } 1231 } 1232 } 1233 Object dst_obj(&scope, runtime->strFromStrArray(dst)); 1234 Object length_obj(&scope, runtime->newInt(length)); 1235 Object message_obj(&scope, runtime->newStrFromCStr("")); 1236 return runtime->newTupleWith3(dst_obj, length_obj, message_obj); 1237} 1238 1239RawObject FUNC(_codecs, backslashreplace_errors)(Thread* thread, 1240 Arguments args) { 1241 HandleScope scope(thread); 1242 Runtime* runtime = thread->runtime(); 1243 Object error(&scope, args.get(0)); 1244 Object object(&scope, NoneType::object()); 1245 word start; 1246 word end; 1247 if (runtime->isInstanceOfUnicodeDecodeError(*error)) { 1248 UnicodeErrorBase unicode_error(&scope, *error); 1249 start = SmallInt::cast(unicode_error.start()).value(); 1250 end = SmallInt::cast(unicode_error.end()).value(); 1251 object = unicode_error.object(); 1252 if (!runtime->isInstanceOfBytes(*object)) { 1253 return thread->raiseWithFmt(LayoutId::kTypeError, 1254 "object attribute must be bytes"); 1255 } 1256 Bytes bytes(&scope, bytesUnderlying(*object)); 1257 word length = bytes.length(); 1258 if (start >= length) start = length - 1; 1259 if (start < 0) start = 0; 1260 if (end >= length) end = length; 1261 if (end < 1) end = 1; 1262 word result_size = end - start; 1263 if (result_size < 0) { 1264 return thread->raiseWithFmt(LayoutId::kValueError, "end before start"); 1265 } 1266 result_size *= 4; 1267 MutableBytes result(&scope, 1268 runtime->newMutableBytesUninitialized(result_size)); 1269 word pos = 0; 1270 for (word i = start; i < end; i++) { 1271 byte b = bytes.byteAt(i); 1272 result.byteAtPut(pos++, '\\'); 1273 result.byteAtPut(pos++, 'x'); 1274 uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/2, b); 1275 pos += 2; 1276 } 1277 DCHECK(pos == result.length(), "size mismatch"); 1278 Object result_str(&scope, result.becomeStr()); 1279 Object end_obj(&scope, SmallInt::fromWord(end)); 1280 return runtime->newTupleWith2(result_str, end_obj); 1281 } 1282 1283 if (runtime->isInstanceOfUnicodeEncodeError(*error) || 1284 runtime->isInstanceOfUnicodeTranslateError(*error)) { 1285 UnicodeErrorBase unicode_error(&scope, *error); 1286 start = SmallInt::cast(unicode_error.start()).value(); 1287 end = SmallInt::cast(unicode_error.end()).value(); 1288 object = unicode_error.object(); 1289 if (!runtime->isInstanceOfStr(*object)) { 1290 return thread->raiseWithFmt(LayoutId::kTypeError, 1291 "object attribute must be unicode"); 1292 } 1293 Str str(&scope, strUnderlying(*object)); 1294 1295 if (start < 0) start = 0; 1296 if (end < 1) end = 1; 1297 if (end < start) { 1298 return thread->raiseWithFmt(LayoutId::kValueError, "end before start"); 1299 } 1300 word start_byte = str.offsetByCodePoints(0, start); 1301 word end_byte = str.offsetByCodePoints(start_byte, end - start); 1302 word result_size = 0; 1303 for (word i = start_byte; i < end_byte;) { 1304 word num_bytes; 1305 int32_t cp = str.codePointAt(i, &num_bytes); 1306 i += num_bytes; 1307 if (cp > kMaxUint16) { 1308 result_size += 10; // Will replace with `\Uxxxxxxxx` 1309 } else if (cp > kMaxByte) { 1310 result_size += 6; // Will replace with `\uxxxx` 1311 } else { 1312 result_size += 4; // Will replace with `\xyy` 1313 } 1314 } 1315 MutableBytes result(&scope, 1316 runtime->newMutableBytesUninitialized(result_size)); 1317 word pos = 0; 1318 for (word i = start_byte; i < end_byte;) { 1319 word num_bytes; 1320 int32_t cp = str.codePointAt(i, &num_bytes); 1321 i += num_bytes; 1322 result.byteAtPut(pos++, '\\'); 1323 if (cp > kMaxUint16) { 1324 result.byteAtPut(pos++, 'U'); 1325 uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/8, cp); 1326 pos += 8; 1327 } else if (cp > kMaxByte) { 1328 result.byteAtPut(pos++, 'u'); 1329 uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/4, cp); 1330 pos += 4; 1331 } else { 1332 result.byteAtPut(pos++, 'x'); 1333 uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/2, cp); 1334 pos += 2; 1335 } 1336 } 1337 DCHECK(pos == result.length(), "size mismatch"); 1338 Object result_bytes(&scope, result.becomeStr()); 1339 Object end_obj(&scope, SmallInt::fromWord(end)); 1340 return runtime->newTupleWith2(result_bytes, end_obj); 1341 } 1342 return thread->raiseWithFmt(LayoutId::kTypeError, 1343 "don't know how to handle %T in error callback", 1344 &error); 1345} 1346 1347} // namespace py