runtime/under-codecs-module.cpp at trunk · bernsteinbear.com/skybison

bernsteinbear.com / skybison
fork atom
this repo has no description
fork atom
skybison / runtime / under-codecs-module.cpp
at trunk 1347 lines 47 kB view raw
wrap content
Max Bernstein Add license headers 4y ago
29d072a3
   1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
   2#include "builtins.h"
   3#include "bytearray-builtins.h"
   4#include "bytes-builtins.h"
   5#include "byteslike.h"
   6#include "formatter-utils.h"
   7#include "frame.h"
   8#include "int-builtins.h"
   9#include "modules.h"
  10#include "runtime.h"
  11#include "str-builtins.h"
  12#include "unicode-db.h"
  13#include "unicode.h"
  14#include "utils.h"
  15
  16namespace py {
  17
  18const char kASCIIReplacement = '?';
  19
  20static SymbolId lookupSymbolForErrorHandler(const Str& error) {
  21  if (error.equalsCStr("strict")) {
  22    return ID(strict);
  23  }
  24  if (error.equalsCStr("ignore")) {
  25    return ID(ignore);
  26  }
  27  if (error.equalsCStr("replace")) {
  28    return ID(replace);
  29  }
  30  if (error.equalsCStr("surrogateescape")) {
  31    return ID(surrogateescape);
  32  }
  33  if (error.equalsCStr("surrogatepass")) {
  34    return ID(surrogatepass);
  35  }
  36  return SymbolId::kInvalid;
  37}
  38
  39static int asciiDecode(Thread* thread, const StrArray& dst,
  40                       const Byteslike& src, word start, word end) {
  41  // TODO(T41032331): Implement a fastpass to read longs instead of chars
  42  Runtime* runtime = thread->runtime();
  43  for (word i = start; i < end; i++) {
  44    byte ch = src.byteAt(i);
  45    if (ch > kMaxASCII) {
  46      return i;
  47    }
  48    runtime->strArrayAddASCII(thread, dst, ch);
  49  }
  50  return end;
  51}
  52
  53RawObject FUNC(_codecs, _ascii_decode)(Thread* thread, Arguments args) {
  54  Runtime* runtime = thread->runtime();
  55  HandleScope scope(thread);
  56  Object data(&scope, args.get(0));
  57  Str errors(&scope, strUnderlying(args.get(1)));
  58  word index = intUnderlying(args.get(2)).asWord();
  59  StrArray dst(&scope, args.get(3));
  60
  61  Byteslike bytes(&scope, thread, *data);
  62  word length = bytes.length();
  63  runtime->strArrayEnsureCapacity(thread, dst, length);
  64  word outpos = asciiDecode(thread, dst, bytes, index, length);
  65  if (outpos == length) {
  66    Object dst_obj(&scope, runtime->strFromStrArray(dst));
  67    Object length_obj(&scope, runtime->newInt(length));
  68    return runtime->newTupleWith2(dst_obj, length_obj);
  69  }
  70
  71  SymbolId error_id = lookupSymbolForErrorHandler(errors);
  72  while (outpos < length) {
  73    byte c = bytes.byteAt(outpos);
  74    if (c < 128) {
  75      runtime->strArrayAddASCII(thread, dst, c);
  76      ++outpos;
  77      continue;
  78    }
  79    switch (error_id) {
  80      case ID(replace): {
  81        Str temp(&scope, SmallStr::fromCodePoint(0xFFFD));
  82        runtime->strArrayAddStr(thread, dst, temp);
  83        ++outpos;
  84        break;
  85      }
  86      case ID(surrogateescape): {
  87        Str temp(&scope,
  88                 SmallStr::fromCodePoint(Unicode::kLowSurrogateStart + c));
  89        runtime->strArrayAddStr(thread, dst, temp);
  90        ++outpos;
  91        break;
  92      }
  93      case ID(ignore):
  94        ++outpos;
  95        break;
  96      default: {
  97        Object outpos1(&scope, runtime->newIntFromUnsigned(outpos));
  98        Object outpos2(&scope, runtime->newIntFromUnsigned(outpos + 1));
  99        return runtime->newTupleWith2(outpos1, outpos2);
 100      }
 101    }
 102  }
 103  Object dst_obj(&scope, runtime->strFromStrArray(dst));
 104  Object length_obj(&scope, runtime->newInt(length));
 105  return runtime->newTupleWith2(dst_obj, length_obj);
 106}
 107
 108// CPython encodes latin1 codepoints into the low-surrogate range, and is able
 109// to recover the original codepoints from those decodable surrogate points.
 110static bool isEscapedLatin1Surrogate(int32_t codepoint) {
 111  return (Unicode::kLowSurrogateStart + kMaxASCII) < codepoint &&
 112         codepoint <= (Unicode::kLowSurrogateStart + kMaxByte);
 113}
 114
 115RawObject FUNC(_codecs, _ascii_encode)(Thread* thread, Arguments args) {
 116  Runtime* runtime = thread->runtime();
 117  HandleScope scope(thread);
 118  Object output_obj(&scope, args.get(3));
 119  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
 120         "Fourth arg to _ascii_encode must be bytearray");
 121  Str data(&scope, strUnderlying(args.get(0)));
 122  Str errors(&scope, strUnderlying(args.get(1)));
 123  word i = intUnderlying(args.get(2)).asWord();
 124  Bytearray output(&scope, *output_obj);
 125
 126  SymbolId error_symbol = lookupSymbolForErrorHandler(errors);
 127  // TODO(T43252439): Optimize this by first checking whether the entire string
 128  // is ASCII, and just memcpy into a string if so
 129  for (word byte_offset = thread->strOffset(data, i);
 130       byte_offset < data.length(); i++) {
 131    word num_bytes;
 132    int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
 133    byte_offset += num_bytes;
 134    if (codepoint <= kMaxASCII) {
 135      bytearrayAdd(thread, runtime, output, codepoint);
 136    } else {
 137      switch (error_symbol) {
 138        case ID(ignore):
 139          continue;
 140        case ID(replace):
 141          bytearrayAdd(thread, runtime, output, kASCIIReplacement);
 142          continue;
 143        case ID(surrogateescape):
 144          if (isEscapedLatin1Surrogate(codepoint)) {
 145            bytearrayAdd(thread, runtime, output,
 146                         codepoint - Unicode::kLowSurrogateStart);
 147            continue;
 148          }
 149          break;
 150        default:
 151          break;
 152      }
 153      Object outpos1(&scope, runtime->newInt(i));
 154      while (byte_offset < data.length() &&
 155             data.codePointAt(byte_offset, &num_bytes) > kMaxASCII) {
 156        byte_offset += num_bytes;
 157        i++;
 158      }
 159      Object outpos2(&scope, runtime->newInt(i + 1));
 160      return runtime->newTupleWith2(outpos1, outpos2);
 161    }
 162  }
 163  Object output_bytes(&scope, bytearrayAsBytes(thread, output));
 164  Object outpos_obj(&scope, runtime->newInt(i));
 165  return runtime->newTupleWith2(output_bytes, outpos_obj);
 166}
 167
 168// Decodes a sequence of unicode encoded bytes into a codepoint, returns
 169// -1 if no value should be written, and -2 if an error occurred. Sets the
 170// iterating variable to where decoding should continue, and sets
 171// invalid_escape_index if it doesn't recognize the escape sequence.
 172static int32_t decodeEscaped(const Byteslike& bytes, word* i,
 173                             word* invalid_escape_index) {
 174  word length = bytes.length();
 175  switch (byte ch = bytes.byteAt((*i)++)) {
 176      // \x escapes
 177    case '\n':
 178      return -1;
 179    case '\\':
 180    case '\'':
 181    case '\"':
 182      return ch;
 183    case 'b':
 184      return '\b';
 185    case 't':
 186      return '\t';
 187    case 'n':
 188      return '\n';
 189    case 'r':
 190      return '\r';
 191    // BEL,
 192    case 'a':
 193      return '\x07';
 194    // VT
 195    case 'v':
 196      return '\x0B';
 197    // FF
 198    case 'f':
 199      return '\x0C';
 200
 201    // \OOO (octal) escapes
 202    case '0':
 203    case '1':
 204    case '2':
 205    case '3':
 206    case '4':
 207    case '5':
 208    case '6':
 209    case '7': {
 210      word escaped = ch - '0';
 211      word octal_index = *i;
 212      if (octal_index < length) {
 213        word ch2 = bytes.byteAt(octal_index);
 214        if ('0' <= ch2 && ch2 <= '7') {
 215          escaped = (escaped << 3) + ch2 - '0';
 216          if (++octal_index < length) {
 217            word ch3 = bytes.byteAt(octal_index);
 218            if ('0' <= ch3 && ch3 <= '7') {
 219              octal_index++;
 220              escaped = (escaped << 3) + ch3 - '0';
 221            }
 222          }
 223        }
 224      }
 225      *i = octal_index;
 226      return escaped;
 227    }
 228
 229    // hex escapes
 230    // \xXX
 231    case 'x': {
 232      word hex_index = *i;
 233      if (hex_index + 1 < length) {
 234        int digit1, digit2;
 235        digit1 = _PyLong_DigitValue[bytes.byteAt(hex_index)];
 236        digit2 = _PyLong_DigitValue[bytes.byteAt(hex_index + 1)];
 237        if (digit1 < 16 && digit2 < 16) {
 238          *i += 2;
 239          return (digit1 << 4) + digit2;
 240        }
 241      }
 242      return -2;
 243    }
 244    default:
 245      *invalid_escape_index = *i - 1;
 246      return ch;
 247  }
 248}
 249
 250RawObject FUNC(_codecs, _escape_decode)(Thread* thread, Arguments args) {
 251  HandleScope scope(thread);
 252  Object bytes_obj(&scope, args.get(0));
 253  Runtime* runtime = thread->runtime();
 254  if (runtime->isInstanceOfStr(*bytes_obj)) {
 255    // TODO(T44739505): Make sure we can decode a str
 256    UNIMPLEMENTED("_codecs.escape_decode with a str");
 257  }
 258  DCHECK(runtime->isInstanceOfStr(args.get(2)),
 259         "Third arg to _escape_decode must be str");
 260  Byteslike bytes(&scope, thread, *bytes_obj);
 261  Str errors(&scope, strUnderlying(args.get(1)));
 262
 263  Bytearray dst(&scope, runtime->newBytearray());
 264  word length = bytes.length();
 265  runtime->bytearrayEnsureCapacity(thread, dst, length);
 266  word first_invalid_escape_index = -1;
 267  for (word i = 0; i < length;) {
 268    byte ch = bytes.byteAt(i++);
 269    if (ch != '\\') {
 270      // TODO(T45134397): Support the recode_encoding parameter
 271      if (ch <= kMaxASCII) {
 272        bytearrayAdd(thread, runtime, dst, ch);
 273        continue;
 274      }
 275      Str temp(&scope, SmallStr::fromCodePoint(ch));
 276      bytearrayAdd(thread, runtime, dst, temp.byteAt(0));
 277      bytearrayAdd(thread, runtime, dst, temp.byteAt(1));
 278      continue;
 279    }
 280    if (i >= length) {
 281      return runtime->newStrFromCStr("Trailing \\ in string");
 282    }
 283    word invalid_escape_index = -1;
 284    int32_t decoded = decodeEscaped(bytes, &i, &invalid_escape_index);
 285    if (invalid_escape_index != -1) {
 286      bytearrayAdd(thread, runtime, dst, '\\');
 287      if (first_invalid_escape_index == -1) {
 288        first_invalid_escape_index = invalid_escape_index;
 289      }
 290    }
 291    if (decoded >= 0) {
 292      bytearrayAdd(thread, runtime, dst, decoded);
 293      continue;
 294    }
 295    if (decoded == -1) {
 296      continue;
 297    }
 298    SymbolId error_id = lookupSymbolForErrorHandler(errors);
 299    switch (error_id) {
 300      case ID(strict):
 301        return runtime->newStrFromFmt("invalid \\x escape at position %d",
 302                                      i - 2);
 303      case ID(replace): {
 304        bytearrayAdd(thread, runtime, dst, '?');
 305        break;
 306      }
 307      case ID(ignore):
 308        break;
 309      default:
 310        return runtime->newStrFromFmt(
 311            "decoding error; unknown error handling code: %S", &errors);
 312    }
 313    if (i < length && Byte::isHexDigit(bytes.byteAt(i))) {
 314      i++;
 315    }
 316  }
 317  Object dst_obj(&scope, bytearrayAsBytes(thread, dst));
 318  Object length_obj(&scope, runtime->newInt(length));
 319  Object escape_obj(&scope, runtime->newInt(first_invalid_escape_index));
 320  return runtime->newTupleWith3(dst_obj, length_obj, escape_obj);
 321}
 322
 323RawObject FUNC(_codecs, _latin_1_decode)(Thread* thread, Arguments args) {
 324  Runtime* runtime = thread->runtime();
 325  HandleScope scope(thread);
 326  Object data(&scope, args.get(0));
 327  StrArray array(&scope, runtime->newStrArray());
 328  word length;
 329  Byteslike bytes(&scope, thread, *data);
 330  length = bytes.length();
 331  runtime->strArrayEnsureCapacity(thread, array, length);
 332  // First, try a quick ASCII decoding
 333  word num_bytes = asciiDecode(thread, array, bytes, 0, length);
 334  if (num_bytes != length) {
 335    // A non-ASCII character was found; switch to a Latin-1 decoding for the
 336    // remainder of the input sequence
 337    for (word i = num_bytes; i < length; ++i) {
 338      byte code_point = bytes.byteAt(i);
 339      if (code_point <= kMaxASCII) {
 340        runtime->strArrayAddASCII(thread, array, code_point);
 341      } else {
 342        runtime->strArrayAddCodePoint(thread, array, code_point);
 343      }
 344    }
 345  }
 346  Object array_str(&scope, runtime->strFromStrArray(array));
 347  Object length_obj(&scope, runtime->newInt(length));
 348  return runtime->newTupleWith2(array_str, length_obj);
 349}
 350
 351RawObject FUNC(_codecs, _latin_1_encode)(Thread* thread, Arguments args) {
 352  Runtime* runtime = thread->runtime();
 353  HandleScope scope(thread);
 354  Object output_obj(&scope, args.get(3));
 355  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
 356         "Fourth arg to _latin_1_encode must be bytearray");
 357  Str data(&scope, strUnderlying(args.get(0)));
 358  Str errors(&scope, strUnderlying(args.get(1)));
 359  word i = intUnderlying(args.get(2)).asWord();
 360  Bytearray output(&scope, *output_obj);
 361
 362  SymbolId error_symbol = lookupSymbolForErrorHandler(errors);
 363  for (word byte_offset = thread->strOffset(data, i);
 364       byte_offset < data.length(); i++) {
 365    word num_bytes;
 366    int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
 367    byte_offset += num_bytes;
 368    if (codepoint <= kMaxByte) {
 369      bytearrayAdd(thread, runtime, output, codepoint);
 370    } else {
 371      switch (error_symbol) {
 372        case ID(ignore):
 373          continue;
 374        case ID(replace):
 375          bytearrayAdd(thread, runtime, output, kASCIIReplacement);
 376          continue;
 377        case ID(surrogateescape):
 378          if (isEscapedLatin1Surrogate(codepoint)) {
 379            bytearrayAdd(thread, runtime, output,
 380                         codepoint - Unicode::kLowSurrogateStart);
 381            continue;
 382          }
 383          break;
 384        default:
 385          break;
 386      }
 387      Object outpos1(&scope, runtime->newInt(i));
 388      while (byte_offset < data.length() &&
 389             data.codePointAt(byte_offset, &num_bytes) > kMaxByte) {
 390        byte_offset += num_bytes;
 391        i++;
 392      }
 393      Object outpos2(&scope, runtime->newInt(i + 1));
 394      return runtime->newTupleWith2(outpos1, outpos2);
 395    }
 396  }
 397  Object output_bytes(&scope, bytearrayAsBytes(thread, output));
 398  Object outpos(&scope, runtime->newInt(i));
 399  return runtime->newTupleWith2(output_bytes, outpos);
 400}
 401
 402// Decodes a sequence of hexadecimal encoded bytes into a codepoint or returns
 403// a negative value if the value could not be decoded. Sets the start variable
 404// to where decoding should continue.
 405static int32_t decodeHexEscaped(const Byteslike& bytes, word* start,
 406                                word count) {
 407  DCHECK_BOUND(count, 8);
 408  word result = 0;
 409  word i = *start;
 410  for (word len = bytes.length(); i < len && count != 0; i++, count--) {
 411    byte ch = bytes.byteAt(i);
 412    result <<= 4;
 413    if (ch >= '0' && ch <= '9') {
 414      result += ch - '0';
 415    } else if (ch >= 'a' && ch <= 'f') {
 416      result += ch - ('a' - 10);
 417    } else if (ch >= 'A' && ch <= 'F') {
 418      result += ch - ('A' - 10);
 419    } else {
 420      break;  // not a hexadecimal digit, stop reading
 421    }
 422  }
 423  *start = i;
 424  if (count != 0) {
 425    return -1;
 426  }
 427  // if count is 4, result could be a 32-bit unicode character
 428  if (result > kMaxUnicode) {
 429    return -2;
 430  }
 431  return result;
 432}
 433
 434// Decodes a sequence of unicode encoded bytes into a codepoint or returns
 435// a negative value if no value should be written. Sets the iterating variable
 436// to where decoding should continue, sets invalid_escape_index if it doesn't
 437// recognize the escape sequence, and sets error_message if an error occurred.
 438static int32_t decodeUnicodeEscaped(const Byteslike& bytes, word* i,
 439                                    word* invalid_escape_index,
 440                                    const char** error_message) {
 441  switch (byte ch = bytes.byteAt((*i)++)) {
 442    // \x escapes
 443    case '\n':
 444      return -1;
 445    case '\\':
 446    case '\'':
 447    case '\"':
 448      return ch;
 449    case 'b':
 450      return '\b';
 451    case 't':
 452      return '\t';
 453    case 'n':
 454      return '\n';
 455    case 'r':
 456      return '\r';
 457    // BEL
 458    case 'a':
 459      return '\007';
 460    // FF
 461    case 'f':
 462      return '\014';
 463    // VT
 464    case 'v':
 465      return '\013';
 466
 467    // \OOO (octal) escapes
 468    case '0':
 469    case '1':
 470    case '2':
 471    case '3':
 472    case '4':
 473    case '5':
 474    case '6':
 475    case '7': {
 476      word escaped = ch - '0';
 477      word octal_index = *i;
 478      word length = bytes.length();
 479      if (octal_index < length) {
 480        word ch2 = bytes.byteAt(octal_index);
 481        if ('0' <= ch2 && ch2 <= '7') {
 482          escaped = (escaped << 3) + ch2 - '0';
 483          if (++octal_index < length) {
 484            word ch3 = bytes.byteAt(octal_index);
 485            if ('0' <= ch3 && ch3 <= '7') {
 486              octal_index++;
 487              escaped = (escaped << 3) + ch3 - '0';
 488            }
 489          }
 490        }
 491      }
 492      *i = octal_index;
 493      return escaped;
 494    }
 495
 496    // hex escapes
 497    // \xXX
 498    case 'x': {
 499      word escaped;
 500      if ((escaped = decodeHexEscaped(bytes, i, 2)) < 0) {
 501        *error_message = (escaped == -1 ? "truncated \\xXX escape"
 502                                        : "illegal Unicode character");
 503        return -1;
 504      }
 505      return escaped;
 506    }
 507
 508    // \uXXXX
 509    case 'u': {
 510      word escaped;
 511      if ((escaped = decodeHexEscaped(bytes, i, 4)) < 0) {
 512        *error_message = (escaped == -1 ? "truncated \\uXXXX escape"
 513                                        : "illegal Unicode character");
 514        return -1;
 515      }
 516      return escaped;
 517    }
 518
 519    // \UXXXXXXXX
 520    case 'U': {
 521      word escaped;
 522      if ((escaped = decodeHexEscaped(bytes, i, 8)) < 0) {
 523        *error_message = (escaped == -1 ? "truncated \\uXXXXXXXX escape"
 524                                        : "illegal Unicode character");
 525        return -1;
 526      }
 527      return escaped;
 528    }
 529
 530    // \N{name}
 531    case 'N': {
 532      *error_message = "malformed \\N character escape";
 533      word length = bytes.length();
 534      if (*i >= length || bytes.byteAt(*i) != '{') {
 535        return -1;
 536      }
 537      word start = ++(*i);
 538      while (*i < length && bytes.byteAt(*i) != '}') {
 539        *i += 1;
 540      }
 541      word size = *i - start;
 542      if (size == 0 || *i == length) {
 543        return -1;
 544      }
 545      *i += 1;
 546      *error_message = "unknown Unicode character name";
 547
 548      unique_c_ptr<byte> buffer(reinterpret_cast<byte*>(std::malloc(size)));
 549      bytes.copyToStartAt(buffer.get(), size, start);
 550      return codePointFromName(buffer.get(), size);
 551    }
 552
 553    default: {
 554      *invalid_escape_index = *i - 1;
 555      return ch;
 556    }
 557  }
 558}
 559
 560RawObject FUNC(_codecs, _unicode_escape_decode)(Thread* thread,
 561                                                Arguments args) {
 562  HandleScope scope(thread);
 563  Runtime* runtime = thread->runtime();
 564  Object data(&scope, args.get(0));
 565  Str errors(&scope, strUnderlying(args.get(1)));
 566  word index = intUnderlying(args.get(2)).asWord();
 567  StrArray dst(&scope, args.get(3));
 568
 569  Byteslike bytes(&scope, thread, *data);
 570  word length = bytes.length();
 571  runtime->strArrayEnsureCapacity(thread, dst, length);
 572  word first_invalid_escape_index = -1;
 573  for (word i = index; i < length;) {
 574    const char* message = nullptr;
 575    word start_pos = i;
 576    byte ch = bytes.byteAt(i++);
 577    if (ch != '\\') {
 578      if (ch <= kMaxASCII) {
 579        runtime->strArrayAddASCII(thread, dst, ch);
 580        continue;
 581      }
 582      Str temp(&scope, SmallStr::fromCodePoint(ch));
 583      runtime->strArrayAddStr(thread, dst, temp);
 584      continue;
 585    }
 586    if (i >= length) {
 587      message = "\\ at end of string";
 588    } else {
 589      word invalid_escape_index = -1;
 590      int32_t decoded =
 591          decodeUnicodeEscaped(bytes, &i, &invalid_escape_index, &message);
 592      if (invalid_escape_index != -1) {
 593        runtime->strArrayAddASCII(thread, dst, '\\');
 594        if (first_invalid_escape_index == -1) {
 595          first_invalid_escape_index = invalid_escape_index;
 596        }
 597      }
 598      if (decoded != -1) {
 599        if (decoded <= kMaxASCII) {
 600          runtime->strArrayAddASCII(thread, dst, decoded);
 601          continue;
 602        }
 603        Str temp(&scope, SmallStr::fromCodePoint(decoded));
 604        runtime->strArrayAddStr(thread, dst, temp);
 605        continue;
 606      }
 607    }
 608    if (message != nullptr) {
 609      SymbolId error_id = lookupSymbolForErrorHandler(errors);
 610      switch (error_id) {
 611        case ID(replace): {
 612          Str temp(&scope, SmallStr::fromCodePoint(0xFFFD));
 613          runtime->strArrayAddStr(thread, dst, temp);
 614          break;
 615        }
 616        case ID(ignore):
 617          break;
 618        default: {
 619          Object start_pos_obj(&scope, runtime->newInt(start_pos));
 620          Object outpos_obj(&scope, runtime->newInt(i));
 621          Object message_obj(&scope, runtime->newStrFromCStr(message));
 622          Object escape_obj(&scope,
 623                            runtime->newInt(first_invalid_escape_index));
 624          return runtime->newTupleWith4(start_pos_obj, outpos_obj, message_obj,
 625                                        escape_obj);
 626        }
 627      }
 628    }
 629  }
 630  Object dst_obj(&scope, runtime->strFromStrArray(dst));
 631  Object length_obj(&scope, runtime->newInt(length));
 632  Object message_obj(&scope, runtime->newStrFromCStr(""));
 633  Object escape_obj(&scope, runtime->newInt(first_invalid_escape_index));
 634  return runtime->newTupleWith4(dst_obj, length_obj, message_obj, escape_obj);
 635}
 636
 637enum Utf8DecoderResult {
 638  k1Byte = 1,
 639  k2Byte = 2,
 640  k3Byte = 3,
 641  k4Byte = 4,
 642  kInvalidStart = 0,
 643  kInvalidContinuation1 = -1,
 644  kInvalidContinuation2 = -2,
 645  kInvalidContinuation3 = -3,
 646  kUnexpectedEndOfData = -4,
 647};
 648
 649// This functionality is taken mostly from CPython:
 650//   Objects/stringlib/codecs.h::utf8_decode
 651// This does error checking to ensure well-formedness of the passed in UTF-8
 652// bytes, and returns the number of bytes of the codepoint at `index` as a
 653// Utf8DecoderResult enum value.
 654// Since this is supposed to work as an incremental decoder as well, this
 655// function returns specific values for errors to determine whether they could
 656// be caused by incremental decoding, or if they would be an error no matter
 657// what other bytes might be streamed in later.
 658static Utf8DecoderResult isValidUtf8Codepoint(const Byteslike& bytes,
 659                                              word index) {
 660  word length = bytes.length();
 661  byte ch = bytes.byteAt(index);
 662  if (ch <= kMaxASCII) {
 663    return k1Byte;
 664  }
 665  if (ch < 0xE0) {
 666    // \xC2\x80-\xDF\xBF -- 0080-07FF
 667    if (ch < 0xC2) {
 668      // invalid sequence
 669      // \x80-\xBF -- continuation byte
 670      // \xC0-\xC1 -- fake 0000-007F
 671      return kInvalidStart;
 672    }
 673    if (index + 1 >= length) {
 674      return kUnexpectedEndOfData;
 675    }
 676    if (!UTF8::isTrailByte(bytes.byteAt(index + 1))) {
 677      return kInvalidContinuation1;
 678    }
 679    return k2Byte;
 680  }
 681  if (ch < 0xF0) {
 682    // \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF
 683    if (index + 2 >= length) {
 684      if (index + 1 >= length) {
 685        return kUnexpectedEndOfData;
 686      }
 687      byte ch2 = bytes.byteAt(index + 1);
 688      if (!UTF8::isTrailByte(ch2) || (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED)) {
 689        return kInvalidContinuation1;
 690      }
 691      return kUnexpectedEndOfData;
 692    }
 693    byte ch2 = bytes.byteAt(index + 1);
 694    if (!UTF8::isTrailByte(ch2)) {
 695      return kInvalidContinuation1;
 696    }
 697    if (ch == 0xE0) {
 698      if (ch2 < 0xA0) {
 699        // invalid sequence
 700        // \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800
 701        return kInvalidContinuation1;
 702      }
 703    } else if (ch == 0xED && ch2 >= 0xA0) {
 704      // Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
 705      // will result in surrogates in range D800-DFFF. Surrogates are
 706      // not valid UTF-8 so they are rejected.
 707      // See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
 708      // (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
 709      return kInvalidContinuation1;
 710    }
 711    if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
 712      return kInvalidContinuation2;
 713    }
 714    return k3Byte;
 715  }
 716  if (ch < 0xF5) {
 717    // \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF
 718    if (index + 3 >= length) {
 719      if (index + 1 >= length) {
 720        return kUnexpectedEndOfData;
 721      }
 722      byte ch2 = bytes.byteAt(index + 1);
 723      if (!UTF8::isTrailByte(ch2) || (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4)) {
 724        return kInvalidContinuation1;
 725      }
 726      if (index + 2 >= length) {
 727        return kUnexpectedEndOfData;
 728      }
 729      if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
 730        return kInvalidContinuation2;
 731      }
 732      return kUnexpectedEndOfData;
 733    }
 734    byte ch2 = bytes.byteAt(index + 1);
 735    if (!UTF8::isTrailByte(ch2)) {
 736      return kInvalidContinuation1;
 737    }
 738    if (ch == 0xF0) {
 739      if (ch2 < 0x90) {
 740        // invalid sequence
 741        // \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
 742        return kInvalidContinuation1;
 743      }
 744    } else if (ch == 0xF4 && ch2 >= 0x90) {
 745      // invalid sequence
 746      // \xF4\x90\x80\80- -- 110000- overflow
 747      return kInvalidContinuation1;
 748    }
 749    if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
 750      return kInvalidContinuation2;
 751    }
 752    if (!UTF8::isTrailByte(bytes.byteAt(index + 3))) {
 753      return kInvalidContinuation3;
 754    }
 755    return k4Byte;
 756  }
 757  return kInvalidStart;
 758}
 759
 760RawObject FUNC(_codecs, _utf_8_decode)(Thread* thread, Arguments args) {
 761  Runtime* runtime = thread->runtime();
 762  HandleScope scope(thread);
 763  Object final_obj(&scope, args.get(4));
 764  DCHECK(final_obj.isBool(), "Fifth arg to _utf_8_decode must be bool");
 765  Object data(&scope, args.get(0));
 766  Str errors(&scope, strUnderlying(args.get(1)));
 767  word index = intUnderlying(args.get(2)).asWord();
 768  StrArray dst(&scope, args.get(3));
 769
 770  word length;
 771  Byteslike bytes(&scope, thread, *data);
 772  length = bytes.length();
 773  runtime->strArrayEnsureCapacity(thread, dst, length);
 774  word i = asciiDecode(thread, dst, bytes, index, length);
 775  if (i == length) {
 776    Object dst_obj(&scope, runtime->strFromStrArray(dst));
 777    Object length_obj(&scope, runtime->newInt(length));
 778    Object message_obj(&scope, runtime->newStrFromCStr(""));
 779    return runtime->newTupleWith3(dst_obj, length_obj, message_obj);
 780  }
 781
 782  SymbolId error_id = lookupSymbolForErrorHandler(errors);
 783  bool is_final = Bool::cast(*final_obj).value();
 784  while (i < length) {
 785    // TODO(T41032331): Scan for non-ASCII characters by words instead of chars
 786    Utf8DecoderResult validator_result = isValidUtf8Codepoint(bytes, i);
 787    if (validator_result >= k1Byte) {
 788      byte codepoint[4] = {0};
 789      for (int codeunit = 0; codeunit + 1 <= validator_result; codeunit++) {
 790        codepoint[codeunit] = bytes.byteAt(i + codeunit);
 791      }
 792      i += validator_result;
 793      Str temp(&scope,
 794               runtime->newStrWithAll(View<byte>{codepoint, validator_result}));
 795      runtime->strArrayAddStr(thread, dst, temp);
 796      continue;
 797    }
 798    if (validator_result != kInvalidStart && !is_final) {
 799      break;
 800    }
 801    word error_end = i;
 802    const char* error_message = nullptr;
 803    switch (validator_result) {
 804      case kInvalidStart:
 805        error_end += 1;
 806        error_message = "invalid start byte";
 807        break;
 808      case kInvalidContinuation1:
 809      case kInvalidContinuation2:
 810      case kInvalidContinuation3:
 811        error_end -= validator_result;
 812        error_message = "invalid continuation byte";
 813        break;
 814      case kUnexpectedEndOfData:
 815        error_end = length;
 816        error_message = "unexpected end of data";
 817        break;
 818      default:
 819        UNREACHABLE(
 820            "valid utf-8 codepoints should have been decoded by this point");
 821    }
 822    switch (error_id) {
 823      case ID(replace): {
 824        Str temp(&scope, SmallStr::fromCodePoint(kReplacementCharacter));
 825        runtime->strArrayAddStr(thread, dst, temp);
 826        i = error_end;
 827        break;
 828      }
 829      case ID(surrogateescape): {
 830        for (; i < error_end; ++i) {
 831          Str temp(&scope, SmallStr::fromCodePoint(Unicode::kLowSurrogateStart +
 832                                                   bytes.byteAt(i)));
 833          runtime->strArrayAddStr(thread, dst, temp);
 834        }
 835        break;
 836      }
 837      case ID(ignore):
 838        i = error_end;
 839        break;
 840      default: {
 841        Object outpos_obj(&scope, runtime->newInt(i));
 842        Object error_end_obj(&scope, runtime->newInt(error_end));
 843        Object message_obj(&scope, runtime->newStrFromCStr(error_message));
 844        return runtime->newTupleWith3(outpos_obj, error_end_obj, message_obj);
 845      }
 846    }
 847  }
 848  Object dst_obj(&scope, runtime->strFromStrArray(dst));
 849  Object outpos_obj(&scope, runtime->newInt(i));
 850  Object message_obj(&scope, Str::empty());
 851  return runtime->newTupleWith3(dst_obj, outpos_obj, message_obj);
 852}
 853
 854RawObject FUNC(_codecs, _utf_8_encode)(Thread* thread, Arguments args) {
 855  Runtime* runtime = thread->runtime();
 856  HandleScope scope(thread);
 857  Object output_obj(&scope, args.get(3));
 858  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
 859         "Fourth arg to _utf_8_encode must be bytearray");
 860  Str data(&scope, strUnderlying(args.get(0)));
 861  Str errors(&scope, strUnderlying(args.get(1)));
 862  word index = intUnderlying(args.get(2)).asWord();
 863  Bytearray output(&scope, *output_obj);
 864
 865  SymbolId error_symbol = lookupSymbolForErrorHandler(errors);
 866  for (word byte_offset = thread->strOffset(data, index);
 867       byte_offset < data.length(); index++) {
 868    word num_bytes;
 869    int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
 870    byte_offset += num_bytes;
 871    if (!Unicode::isSurrogate(codepoint)) {
 872      for (word j = byte_offset - num_bytes; j < byte_offset; j++) {
 873        bytearrayAdd(thread, runtime, output, data.byteAt(j));
 874      }
 875    } else {
 876      switch (error_symbol) {
 877        case ID(ignore):
 878          continue;
 879        case ID(replace):
 880          bytearrayAdd(thread, runtime, output, kASCIIReplacement);
 881          continue;
 882        case ID(surrogateescape):
 883          if (isEscapedLatin1Surrogate(codepoint)) {
 884            bytearrayAdd(thread, runtime, output,
 885                         codepoint - Unicode::kLowSurrogateStart);
 886            continue;
 887          }
 888          break;
 889        case ID(surrogatepass):
 890          if (Unicode::isSurrogate(codepoint)) {
 891            bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 3));
 892            bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 2));
 893            bytearrayAdd(thread, runtime, output, data.byteAt(byte_offset - 1));
 894            continue;
 895          }
 896          break;
 897        default:
 898          break;
 899      }
 900      Object outpos1(&scope, runtime->newInt(index));
 901      while (byte_offset < data.length() &&
 902             Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) {
 903        byte_offset += num_bytes;
 904        index++;
 905      }
 906      Object outpos2(&scope, runtime->newInt(index + 1));
 907      return runtime->newTupleWith2(outpos1, outpos2);
 908    }
 909  }
 910  Object output_bytes(&scope, bytearrayAsBytes(thread, output));
 911  Object index_obj(&scope, runtime->newInt(index));
 912  return runtime->newTupleWith2(output_bytes, index_obj);
 913}
 914
 915static void appendUtf16ToBytearray(Thread* thread, Runtime* runtime,
 916                                   const Bytearray& writer, int32_t codepoint,
 917                                   endian endianness) {
 918  if (endianness == endian::little) {
 919    bytearrayAdd(thread, runtime, writer, codepoint);
 920    bytearrayAdd(thread, runtime, writer, codepoint >> kBitsPerByte);
 921  } else {
 922    bytearrayAdd(thread, runtime, writer, codepoint >> kBitsPerByte);
 923    bytearrayAdd(thread, runtime, writer, codepoint);
 924  }
 925}
 926
 927RawObject FUNC(_codecs, _utf_16_encode)(Thread* thread, Arguments args) {
 928  Runtime* runtime = thread->runtime();
 929  HandleScope scope(thread);
 930  Object output_obj(&scope, args.get(3));
 931  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
 932         "Fourth arg to _utf_16_encode must be bytearray");
 933  Str data(&scope, strUnderlying(args.get(0)));
 934  Str errors(&scope, strUnderlying(args.get(1)));
 935  word index = intUnderlying(args.get(2)).asWord();
 936  Bytearray output(&scope, *output_obj);
 937  OptInt<int32_t> byteorder = intUnderlying(args.get(4)).asInt<int32_t>();
 938  if (byteorder.error != CastError::None) {
 939    return thread->raiseWithFmt(LayoutId::kOverflowError,
 940                                "Python int too large to convert to C int");
 941  }
 942
 943  SymbolId error_id = lookupSymbolForErrorHandler(errors);
 944  for (word byte_offset = thread->strOffset(data, index);
 945       byte_offset < data.length(); index++) {
 946    endian endianness = byteorder.value <= 0 ? endian::little : endian::big;
 947    word num_bytes;
 948    int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
 949    byte_offset += num_bytes;
 950    if (!Unicode::isSurrogate(codepoint)) {
 951      if (codepoint < Unicode::kHighSurrogateStart) {
 952        appendUtf16ToBytearray(thread, runtime, output, codepoint, endianness);
 953      } else {
 954        appendUtf16ToBytearray(thread, runtime, output,
 955                               Unicode::highSurrogateFor(codepoint),
 956                               endianness);
 957        appendUtf16ToBytearray(thread, runtime, output,
 958                               Unicode::lowSurrogateFor(codepoint), endianness);
 959      }
 960    } else {
 961      switch (error_id) {
 962        case ID(ignore):
 963          continue;
 964        case ID(replace):
 965          appendUtf16ToBytearray(thread, runtime, output, kASCIIReplacement,
 966                                 endianness);
 967          continue;
 968        case ID(surrogateescape):
 969          if (isEscapedLatin1Surrogate(codepoint)) {
 970            appendUtf16ToBytearray(thread, runtime, output,
 971                                   codepoint - Unicode::kLowSurrogateStart,
 972                                   endianness);
 973            continue;
 974          }
 975          break;
 976        default:
 977          break;
 978      }
 979      Object outpos1(&scope, runtime->newInt(index));
 980      while (byte_offset < data.length() &&
 981             Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) {
 982        byte_offset += num_bytes;
 983        index++;
 984      }
 985      Object outpos2(&scope, runtime->newInt(index + 1));
 986      return runtime->newTupleWith2(outpos1, outpos2);
 987    }
 988  }
 989  Object output_bytes(&scope, bytearrayAsBytes(thread, output));
 990  Object index_obj(&scope, runtime->newInt(index));
 991  return runtime->newTupleWith2(output_bytes, index_obj);
 992}
 993
 994static void appendUtf32ToBytearray(Thread* thread, Runtime* runtime,
 995                                   const Bytearray& writer, int32_t codepoint,
 996                                   endian endianness) {
 997  if (endianness == endian::little) {
 998    bytearrayAdd(thread, runtime, writer, codepoint);
 999    bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte));
1000    bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 2));
1001    bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 3));
1002  } else {
1003    bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 3));
1004    bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte * 2));
1005    bytearrayAdd(thread, runtime, writer, codepoint >> (kBitsPerByte));
1006    bytearrayAdd(thread, runtime, writer, codepoint);
1007  }
1008}
1009
1010RawObject FUNC(_codecs, _utf_32_encode)(Thread* thread, Arguments args) {
1011  Runtime* runtime = thread->runtime();
1012  HandleScope scope(thread);
1013  Object output_obj(&scope, args.get(3));
1014  DCHECK(runtime->isInstanceOfBytearray(*output_obj),
1015         "Fourth arg to _utf_32_encode must be bytearray");
1016  Str data(&scope, strUnderlying(args.get(0)));
1017  Str errors(&scope, strUnderlying(args.get(1)));
1018  word index = intUnderlying(args.get(2)).asWord();
1019  Bytearray output(&scope, *output_obj);
1020  OptInt<int32_t> byteorder = intUnderlying(args.get(4)).asInt<int32_t>();
1021  if (byteorder.error != CastError::None) {
1022    return thread->raiseWithFmt(LayoutId::kOverflowError,
1023                                "Python int too large to convert to C int");
1024  }
1025
1026  SymbolId error_id = lookupSymbolForErrorHandler(errors);
1027  for (word byte_offset = thread->strOffset(data, index);
1028       byte_offset < data.length(); index++) {
1029    endian endianness = byteorder.value <= 0 ? endian::little : endian::big;
1030    word num_bytes;
1031    int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
1032    byte_offset += num_bytes;
1033    if (!Unicode::isSurrogate(codepoint)) {
1034      appendUtf32ToBytearray(thread, runtime, output, codepoint, endianness);
1035    } else {
1036      switch (error_id) {
1037        case ID(ignore):
1038          continue;
1039        case ID(replace):
1040          appendUtf32ToBytearray(thread, runtime, output, kASCIIReplacement,
1041                                 endianness);
1042          continue;
1043        case ID(surrogateescape):
1044          if (isEscapedLatin1Surrogate(codepoint)) {
1045            appendUtf32ToBytearray(thread, runtime, output,
1046                                   codepoint - Unicode::kLowSurrogateStart,
1047                                   endianness);
1048            continue;
1049          }
1050          break;
1051        default:
1052          break;
1053      }
1054      Object outpos1(&scope, runtime->newInt(index));
1055      while (byte_offset < data.length() &&
1056             Unicode::isSurrogate(data.codePointAt(byte_offset, &num_bytes))) {
1057        byte_offset += num_bytes;
1058        index++;
1059      }
1060      Object outpos2(&scope, runtime->newInt(index + 1));
1061      return runtime->newTupleWith2(outpos1, outpos2);
1062    }
1063  }
1064  Object output_bytes(&scope, bytearrayAsBytes(thread, output));
1065  Object index_obj(&scope, runtime->newInt(index));
1066  return runtime->newTupleWith2(output_bytes, index_obj);
1067}
1068
1069// Takes a Bytearray and a Str object, and appends each byte in the Str to the
1070// Bytearray one by one
1071RawObject FUNC(_codecs, _bytearray_string_append)(Thread* thread,
1072                                                  Arguments args) {
1073  HandleScope scope(thread);
1074  Bytearray dst(&scope, args.get(0));
1075  Str data(&scope, args.get(1));
1076  for (word i = 0; i < data.length(); ++i) {
1077    bytearrayAdd(thread, thread->runtime(), dst, data.byteAt(i));
1078  }
1079  return NoneType::object();
1080}
1081
1082RawObject FUNC(_codecs, _raw_unicode_escape_encode)(Thread* thread,
1083                                                    Arguments args) {
1084  HandleScope scope(thread);
1085  Runtime* runtime = thread->runtime();
1086  Str data(&scope, strUnderlying(args.get(0)));
1087  word size = data.codePointLength();
1088  Bytearray dst(&scope, runtime->newBytearray());
1089  word length = data.length();
1090
1091  // 2 byte codepoints can be expanded to 4 bytes + 2 escape characters
1092  // 4 byte codepoints well be expanded to 8 bytes + 2 escape characters
1093  // To be safe we double the bytecount and add space for 2 escape characters
1094  // per codepoint.
1095  word expanded_size = length * 2 + size * 2;
1096  runtime->bytearrayEnsureCapacity(thread, dst, expanded_size);
1097  word num_bytes;
1098  for (word index = 0, byte_offset = thread->strOffset(data, index);
1099       byte_offset < data.length(); index++) {
1100    int32_t codepoint = data.codePointAt(byte_offset, &num_bytes);
1101    byte_offset += num_bytes;
1102    // U+0000-U+00ff range: Copy 8-bit characters as-is
1103    if (codepoint <= kMaxByte) {
1104      bytearrayAdd(thread, runtime, dst, codepoint);
1105    }
1106    // U+0100-U+ffff range: Map 16-bit characters to '\uHHHH'
1107    else if (codepoint <= kMaxUint16) {
1108      bytearrayAdd(thread, runtime, dst, '\\');
1109      bytearrayAdd(thread, runtime, dst, 'u');
1110      bytearrayAdd(thread, runtime, dst,
1111                   lowerCaseHexDigit((codepoint >> 12) & 0xf));
1112      bytearrayAdd(thread, runtime, dst,
1113                   lowerCaseHexDigit((codepoint >> 8) & 0xf));
1114      bytearrayAdd(thread, runtime, dst,
1115                   lowerCaseHexDigit((codepoint >> 4) & 0xf));
1116      bytearrayAdd(thread, runtime, dst, lowerCaseHexDigit(codepoint & 15));
1117    }
1118    // U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH'
1119    else {
1120      CHECK(codepoint <= kMaxUnicode, "expected a valid unicode code point");
1121      bytearrayAdd(thread, runtime, dst, '\\');
1122      bytearrayAdd(thread, runtime, dst, 'U');
1123      bytearrayAdd(thread, runtime, dst, '0');
1124      bytearrayAdd(thread, runtime, dst, '0');
1125      bytearrayAdd(thread, runtime, dst,
1126                   lowerCaseHexDigit((codepoint >> 20) & 0xf));
1127      bytearrayAdd(thread, runtime, dst,
1128                   lowerCaseHexDigit((codepoint >> 16) & 0xf));
1129      bytearrayAdd(thread, runtime, dst,
1130                   lowerCaseHexDigit((codepoint >> 12) & 0xf));
1131      bytearrayAdd(thread, runtime, dst,
1132                   lowerCaseHexDigit((codepoint >> 8) & 0xf));
1133      bytearrayAdd(thread, runtime, dst,
1134                   lowerCaseHexDigit((codepoint >> 4) & 0xf));
1135      bytearrayAdd(thread, runtime, dst, lowerCaseHexDigit(codepoint & 15));
1136    }
1137  }
1138  Object output_bytes(&scope, bytearrayAsBytes(thread, dst));
1139  Object size_obj(&scope, runtime->newInt(size));
1140  return runtime->newTupleWith2(output_bytes, size_obj);
1141}
1142
1143RawObject FUNC(_codecs, _raw_unicode_escape_decode)(Thread* thread,
1144                                                    Arguments args) {
1145  HandleScope scope(thread);
1146  Runtime* runtime = thread->runtime();
1147  Object data(&scope, args.get(0));
1148  Str errors(&scope, strUnderlying(args.get(1)));
1149  word index = intUnderlying(args.get(2)).asWord();
1150  StrArray dst(&scope, args.get(3));
1151
1152  Byteslike bytes(&scope, thread, *data);
1153  word length = bytes.length();
1154  runtime->strArrayEnsureCapacity(thread, dst, length);
1155  for (word i = index; i < length;) {
1156    const char* message = nullptr;
1157    word start_pos = i;
1158    byte ch = bytes.byteAt(i);
1159    i++;
1160    if (ch != '\\') {
1161      if (ch <= kMaxASCII) {
1162        runtime->strArrayAddASCII(thread, dst, ch);
1163        continue;
1164      }
1165      Str temp(&scope, SmallStr::fromCodePoint(ch));
1166      runtime->strArrayAddStr(thread, dst, temp);
1167      continue;
1168    }
1169    if (i >= length) {
1170      // \\ at end of string
1171      runtime->strArrayAddASCII(thread, dst, '\\');
1172    } else {
1173      int32_t decoded;
1174      ch = bytes.byteAt(i);
1175      i++;
1176      // Only care about \uXXXX and \UXXXXXXXX when decoding raw unicode.
1177      switch (ch) {
1178        // \uXXXX
1179        case 'u': {
1180          if ((decoded = decodeHexEscaped(bytes, &i, 4)) < 0) {
1181            message = (decoded == -1 ? "truncated \\uXXXX escape"
1182                                     : "illegal Unicode character");
1183          }
1184          break;
1185        }
1186        // \UXXXXXXXX
1187        case 'U': {
1188          if ((decoded = decodeHexEscaped(bytes, &i, 8)) < 0) {
1189            if (decoded == -1) {
1190              message = "truncated \\UXXXXXXXX escape";
1191            } else if (decoded == -2) {
1192              message = "\\Uxxxxxxxx out of range";
1193            } else {
1194              message = "illegal Unicode character";
1195            }
1196          }
1197          break;
1198        }
1199        default: {
1200          runtime->strArrayAddASCII(thread, dst, '\\');
1201          decoded = ch;
1202        }
1203      }
1204      if (decoded >= 0) {
1205        if (decoded <= kMaxASCII) {
1206          runtime->strArrayAddASCII(thread, dst, decoded);
1207          continue;
1208        }
1209        Str temp(&scope, SmallStr::fromCodePoint(decoded));
1210        runtime->strArrayAddStr(thread, dst, temp);
1211        continue;
1212      }
1213    }
1214    if (message != nullptr) {
1215      SymbolId error_id = lookupSymbolForErrorHandler(errors);
1216      switch (error_id) {
1217        case ID(replace): {
1218          Str temp(&scope, SmallStr::fromCodePoint(0xFFFD));
1219          runtime->strArrayAddStr(thread, dst, temp);
1220          break;
1221        }
1222        case ID(ignore):
1223          break;
1224        default: {
1225          Object start_pos_obj(&scope, runtime->newInt(start_pos));
1226          Object outpos_obj(&scope, runtime->newInt(i));
1227          Object message_obj(&scope, runtime->newStrFromCStr(message));
1228          return runtime->newTupleWith3(start_pos_obj, outpos_obj, message_obj);
1229        }
1230      }
1231    }
1232  }
1233  Object dst_obj(&scope, runtime->strFromStrArray(dst));
1234  Object length_obj(&scope, runtime->newInt(length));
1235  Object message_obj(&scope, runtime->newStrFromCStr(""));
1236  return runtime->newTupleWith3(dst_obj, length_obj, message_obj);
1237}
1238
1239RawObject FUNC(_codecs, backslashreplace_errors)(Thread* thread,
1240                                                 Arguments args) {
1241  HandleScope scope(thread);
1242  Runtime* runtime = thread->runtime();
1243  Object error(&scope, args.get(0));
1244  Object object(&scope, NoneType::object());
1245  word start;
1246  word end;
1247  if (runtime->isInstanceOfUnicodeDecodeError(*error)) {
1248    UnicodeErrorBase unicode_error(&scope, *error);
1249    start = SmallInt::cast(unicode_error.start()).value();
1250    end = SmallInt::cast(unicode_error.end()).value();
1251    object = unicode_error.object();
1252    if (!runtime->isInstanceOfBytes(*object)) {
1253      return thread->raiseWithFmt(LayoutId::kTypeError,
1254                                  "object attribute must be bytes");
1255    }
1256    Bytes bytes(&scope, bytesUnderlying(*object));
1257    word length = bytes.length();
1258    if (start >= length) start = length - 1;
1259    if (start < 0) start = 0;
1260    if (end >= length) end = length;
1261    if (end < 1) end = 1;
1262    word result_size = end - start;
1263    if (result_size < 0) {
1264      return thread->raiseWithFmt(LayoutId::kValueError, "end before start");
1265    }
1266    result_size *= 4;
1267    MutableBytes result(&scope,
1268                        runtime->newMutableBytesUninitialized(result_size));
1269    word pos = 0;
1270    for (word i = start; i < end; i++) {
1271      byte b = bytes.byteAt(i);
1272      result.byteAtPut(pos++, '\\');
1273      result.byteAtPut(pos++, 'x');
1274      uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/2, b);
1275      pos += 2;
1276    }
1277    DCHECK(pos == result.length(), "size mismatch");
1278    Object result_str(&scope, result.becomeStr());
1279    Object end_obj(&scope, SmallInt::fromWord(end));
1280    return runtime->newTupleWith2(result_str, end_obj);
1281  }
1282
1283  if (runtime->isInstanceOfUnicodeEncodeError(*error) ||
1284      runtime->isInstanceOfUnicodeTranslateError(*error)) {
1285    UnicodeErrorBase unicode_error(&scope, *error);
1286    start = SmallInt::cast(unicode_error.start()).value();
1287    end = SmallInt::cast(unicode_error.end()).value();
1288    object = unicode_error.object();
1289    if (!runtime->isInstanceOfStr(*object)) {
1290      return thread->raiseWithFmt(LayoutId::kTypeError,
1291                                  "object attribute must be unicode");
1292    }
1293    Str str(&scope, strUnderlying(*object));
1294
1295    if (start < 0) start = 0;
1296    if (end < 1) end = 1;
1297    if (end < start) {
1298      return thread->raiseWithFmt(LayoutId::kValueError, "end before start");
1299    }
1300    word start_byte = str.offsetByCodePoints(0, start);
1301    word end_byte = str.offsetByCodePoints(start_byte, end - start);
1302    word result_size = 0;
1303    for (word i = start_byte; i < end_byte;) {
1304      word num_bytes;
1305      int32_t cp = str.codePointAt(i, &num_bytes);
1306      i += num_bytes;
1307      if (cp > kMaxUint16) {
1308        result_size += 10;  // Will replace with `\Uxxxxxxxx`
1309      } else if (cp > kMaxByte) {
1310        result_size += 6;  // Will replace with `\uxxxx`
1311      } else {
1312        result_size += 4;  // Will replace with `\xyy`
1313      }
1314    }
1315    MutableBytes result(&scope,
1316                        runtime->newMutableBytesUninitialized(result_size));
1317    word pos = 0;
1318    for (word i = start_byte; i < end_byte;) {
1319      word num_bytes;
1320      int32_t cp = str.codePointAt(i, &num_bytes);
1321      i += num_bytes;
1322      result.byteAtPut(pos++, '\\');
1323      if (cp > kMaxUint16) {
1324        result.byteAtPut(pos++, 'U');
1325        uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/8, cp);
1326        pos += 8;
1327      } else if (cp > kMaxByte) {
1328        result.byteAtPut(pos++, 'u');
1329        uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/4, cp);
1330        pos += 4;
1331      } else {
1332        result.byteAtPut(pos++, 'x');
1333        uwordToHexadecimalWithMutableBytes(*result, pos, /*num_digits=*/2, cp);
1334        pos += 2;
1335      }
1336    }
1337    DCHECK(pos == result.length(), "size mismatch");
1338    Object result_bytes(&scope, result.becomeStr());
1339    Object end_obj(&scope, SmallInt::fromWord(end));
1340    return runtime->newTupleWith2(result_bytes, end_obj);
1341  }
1342  return thread->raiseWithFmt(LayoutId::kTypeError,
1343                              "don't know how to handle %T in error callback",
1344                              &error);
1345}
1346
1347}  // namespace py