ext/Objects/unicodeobject.cpp at trunk · bernsteinbear.com/skybison

bernsteinbear.com / skybison
fork atom
this repo has no description
fork atom
skybison / ext / Objects / unicodeobject.cpp
at trunk 2610 lines 92 kB view raw
wrap content
bernsteinbear.com Make all methods on ApiHandle static 3y ago
2d91172a
   1// Copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
   2// unicodeobject.c implementation
   3#include <cerrno>
   4#include <cstdarg>
   5#include <cstring>
   6#include <cwchar>
   7
   8#include "cpython-data.h"
   9#include "cpython-func.h"
  10
  11#include "api-handle.h"
  12#include "bytearray-builtins.h"
  13#include "bytes-builtins.h"
  14#include "handles.h"
  15#include "modules.h"
  16#include "objects.h"
  17#include "runtime.h"
  18#include "str-builtins.h"
  19#include "unicode.h"
  20#include "utils.h"
  21
  22const char* Py_FileSystemDefaultEncoding = "utf-8";
  23int Py_HasFileSystemDefaultEncoding = 1;
  24const char* Py_FileSystemDefaultEncodeErrors = "surrogatepass";
  25
  26namespace py {
  27
  28typedef byte Py_UCS1;
  29typedef uint16_t Py_UCS2;
  30
  31static const int kMaxLongLongChars = 19;  // len(str(2**63-1))
  32static const int kOverallocateFactor = 4;
  33
  34PY_EXPORT PyTypeObject* PyUnicodeIter_Type_Ptr() {
  35  Runtime* runtime = Thread::current()->runtime();
  36  return reinterpret_cast<PyTypeObject*>(ApiHandle::borrowedReference(
  37      runtime, runtime->typeAt(LayoutId::kStrIterator)));
  38}
  39
  40static RawObject symbolFromError(Thread* thread, const char* error) {
  41  Runtime* runtime = thread->runtime();
  42  Symbols* symbols = runtime->symbols();
  43  if (error == nullptr || std::strcmp(error, "strict") == 0) {
  44    return symbols->at(ID(strict));
  45  }
  46  if (std::strcmp(error, "ignore") == 0) {
  47    return symbols->at(ID(ignore));
  48  }
  49  if (std::strcmp(error, "replace") == 0) {
  50    return symbols->at(ID(replace));
  51  }
  52  return Runtime::internStrFromCStr(thread, error);
  53}
  54
  55PY_EXPORT void PyUnicode_WRITE_Func(enum PyUnicode_Kind kind, void* data,
  56                                    Py_ssize_t index, Py_UCS4 value) {
  57  if (kind == PyUnicode_1BYTE_KIND) {
  58    static_cast<Py_UCS1*>(data)[index] = static_cast<Py_UCS1>(value);
  59  } else if (kind == PyUnicode_2BYTE_KIND) {
  60    static_cast<Py_UCS2*>(data)[index] = static_cast<Py_UCS2>(value);
  61  } else {
  62    DCHECK(kind == PyUnicode_4BYTE_KIND, "kind must be PyUnicode_4BYTE_KIND");
  63    static_cast<Py_UCS4*>(data)[index] = static_cast<Py_UCS4>(value);
  64  }
  65}
  66
  67PY_EXPORT void _PyUnicodeWriter_Dealloc(_PyUnicodeWriter* writer) {
  68  PyMem_Free(writer->data);
  69}
  70
  71PY_EXPORT PyObject* _PyUnicodeWriter_Finish(_PyUnicodeWriter* writer) {
  72  Thread* thread = Thread::current();
  73  HandleScope scope(thread);
  74  Runtime* runtime = thread->runtime();
  75  Str str(&scope, runtime->newStrFromUTF32(View<int32_t>(
  76                      static_cast<int32_t*>(writer->data), writer->pos)));
  77  PyMem_Free(writer->data);
  78  return ApiHandle::newReference(runtime, *str);
  79}
  80
  81PY_EXPORT void _PyUnicodeWriter_Init(_PyUnicodeWriter* writer) {
  82  std::memset(writer, 0, sizeof(*writer));
  83  writer->kind = PyUnicode_4BYTE_KIND;
  84}
  85
  86static int _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter* writer,
  87                                            Py_ssize_t length,
  88                                            Py_UCS4 /* maxchar */) {
  89  writer->maxchar = kMaxUnicode;
  90  if (length > kMaxWord - writer->pos) {
  91    Thread::current()->raiseMemoryError();
  92    return -1;
  93  }
  94  Py_ssize_t newlen = writer->pos + length;
  95  if (writer->data == nullptr) {
  96    if (writer->overallocate &&
  97        newlen <= (kMaxWord - newlen / kOverallocateFactor)) {
  98      // overallocate to limit the number of realloc()
  99      newlen += newlen / kOverallocateFactor;
 100    }
 101    writer->data = PyMem_Malloc(newlen * sizeof(int32_t));
 102    if (writer->data == nullptr) return -1;
 103  } else if (newlen > writer->size) {
 104    if (writer->overallocate &&
 105        newlen <= (kMaxWord - newlen / kOverallocateFactor)) {
 106      // overallocate to limit the number of realloc()
 107      newlen += newlen / kOverallocateFactor;
 108    }
 109    writer->data = PyMem_Realloc(writer->data, newlen * sizeof(int32_t));
 110    if (writer->data == nullptr) return -1;
 111  }
 112  writer->size = newlen;
 113  return 0;
 114}
 115
 116PY_EXPORT int _PyUnicodeWriter_Prepare(_PyUnicodeWriter* writer,
 117                                       Py_ssize_t length, Py_UCS4 maxchar) {
 118  if (length <= writer->size - writer->pos || length == 0) return 0;
 119  return _PyUnicodeWriter_PrepareInternal(writer, length, maxchar);
 120}
 121
 122PY_EXPORT int _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter* writer,
 123                                                const char* ascii,
 124                                                Py_ssize_t len) {
 125  if (len == -1) len = std::strlen(ascii);
 126  if (writer->data == nullptr && !writer->overallocate) {
 127    writer->data = PyMem_Malloc(len * sizeof(int32_t));
 128    writer->size = len;
 129  }
 130
 131  if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) == -1) return -1;
 132  Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
 133  for (Py_ssize_t i = 0; i < len; ++i) {
 134    CHECK(ascii[i] >= 0, "_PyUnicodeWriter_WriteASCIIString only takes ASCII");
 135    data[writer->pos++] = static_cast<uint8_t>(ascii[i]);
 136  }
 137  return 0;
 138}
 139
 140PY_EXPORT int _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter* writer,
 141                                               Py_UCS4 ch) {
 142  if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0) return -1;
 143  PyUnicode_WRITE(PyUnicode_4BYTE_KIND, writer->data, writer->pos, ch);
 144  writer->pos++;
 145  return 0;
 146}
 147
 148PY_EXPORT int _PyUnicodeWriter_WriteChar(_PyUnicodeWriter* writer, Py_UCS4 ch) {
 149  return _PyUnicodeWriter_WriteCharInline(writer, ch);
 150}
 151
 152PY_EXPORT int _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter* writer,
 153                                                 const char* str,
 154                                                 Py_ssize_t len) {
 155  if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) == -1) return -1;
 156  Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
 157  for (Py_ssize_t i = 0; i < len; ++i) {
 158    data[writer->pos++] = static_cast<uint8_t>(str[i]);
 159  }
 160  return 0;
 161}
 162
 163PY_EXPORT int _PyUnicodeWriter_WriteStr(_PyUnicodeWriter* writer,
 164                                        PyObject* str) {
 165  Thread* thread = Thread::current();
 166  HandleScope scope(thread);
 167  Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
 168  Str src(&scope, strUnderlying(*obj));
 169  Py_ssize_t codepoints = src.codePointLength();
 170  if (_PyUnicodeWriter_Prepare(writer, codepoints, kMaxUnicode) == -1) {
 171    return -1;
 172  }
 173  Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
 174  for (word i = 0, len = src.length(), cp_len; i < len; i += cp_len) {
 175    int32_t cp = src.codePointAt(i, &cp_len);
 176    data[writer->pos++] = cp;
 177  }
 178  return 0;
 179}
 180
 181PY_EXPORT int _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter* writer,
 182                                              PyObject* str, Py_ssize_t start,
 183                                              Py_ssize_t end) {
 184  if (end == 0) return 0;
 185  Py_ssize_t len = end - start;
 186  if (_PyUnicodeWriter_Prepare(writer, len, kMaxUnicode) < 0) return -1;
 187
 188  Thread* thread = Thread::current();
 189  HandleScope scope(thread);
 190  Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
 191  Str src(&scope, strUnderlying(*obj));
 192  word start_index = thread->strOffset(src, start);
 193  DCHECK_BOUND(start_index, src.length());
 194  word end_index = thread->strOffset(src, end);
 195  DCHECK_BOUND(end_index, src.length());
 196  Py_UCS4* data = static_cast<Py_UCS4*>(writer->data);
 197  for (word i = start_index, cp_len; i < end_index; i += cp_len) {
 198    int32_t cp = src.codePointAt(i, &cp_len);
 199    data[writer->pos++] = cp;
 200  }
 201  return 0;
 202}
 203
 204// Facebook: D13491655
 205// Most of the following helper functions, along with PyUnicode_FromFormat and
 206// PyUnicode_FromFormatV are directly imported from CPython. The following
 207// modifications have been made:
 208//
 209// - Since our internal strings are always UTF-8, we don't need maxchar or any
 210// of the helper functions required to calculate it
 211//
 212// - Since our strings are immutable, we can't use PyUnicode_Fill. However,
 213// since the helper functions always use it to append to strings, we can get
 214// away with just writing characters in a loop.
 215//
 216// - Since our internal strings are always UTF-8, there is no need to check
 217// a character's 'Kind' before writing it to a string
 218static int writeStr(_PyUnicodeWriter* writer, PyObject* str, Py_ssize_t width,
 219                    Py_ssize_t precision) {
 220  if (PyUnicode_READY(str) == -1) return -1;
 221
 222  Py_ssize_t length = PyUnicode_GET_LENGTH(str);
 223  if ((precision == -1 || precision >= length) && width <= length) {
 224    return _PyUnicodeWriter_WriteStr(writer, str);
 225  }
 226
 227  if (precision != -1) length = Py_MIN(precision, length);
 228
 229  Py_ssize_t arglen = Py_MAX(length, width);
 230  // Facebook: Our internal strings are always UTF-8, don't need maxchar
 231  // (D13491655)
 232  if (_PyUnicodeWriter_Prepare(writer, arglen, 0) == -1) return -1;
 233
 234  if (width > length) {
 235    Py_ssize_t fill = width - length;
 236    // Facebook: Our internal strings are immutable, can't use PyUnicode_Fill
 237    // (D13491655)
 238    for (Py_ssize_t i = 0; i < fill; ++i) {
 239      if (_PyUnicodeWriter_WriteCharInline(writer, ' ') == -1) return -1;
 240    }
 241  }
 242  // Facebook: Since we only have one internal representation, we don't have
 243  // to worry about changing a string's 'Kind' (D13491655)
 244  return _PyUnicodeWriter_WriteSubstring(writer, str, 0, length);
 245}
 246
 247static int writeCStr(_PyUnicodeWriter* writer, const char* str,
 248                     Py_ssize_t width, Py_ssize_t precision) {
 249  Py_ssize_t length = std::strlen(str);
 250  if (precision != -1) length = Py_MIN(length, precision);
 251  PyObject* unicode =
 252      PyUnicode_DecodeUTF8Stateful(str, length, "replace", nullptr);
 253  if (unicode == nullptr) return -1;
 254
 255  int res = writeStr(writer, unicode, width, -1);
 256  Py_DECREF(unicode);
 257  return res;
 258}
 259
 260static const char* writeArg(_PyUnicodeWriter* writer, const char* f,
 261                            va_list* vargs) {
 262  const char* p = f;
 263  f++;
 264  int zeropad = 0;
 265  if (*f == '0') {
 266    zeropad = 1;
 267    f++;
 268  }
 269
 270  // parse the width.precision part, e.g. "%2.5s" => width=2, precision=5
 271  Py_ssize_t width = -1;
 272  if (Py_ISDIGIT(static_cast<unsigned>(*f))) {
 273    width = *f - '0';
 274    f++;
 275    while (Py_ISDIGIT(static_cast<unsigned>(*f))) {
 276      if (width > (kMaxWord - (static_cast<int>(*f) - '0')) / 10) {
 277        Thread::current()->raiseWithFmt(LayoutId::kValueError, "width too big");
 278        return nullptr;
 279      }
 280      width = (width * 10) + (*f - '0');
 281      f++;
 282    }
 283  }
 284  Py_ssize_t precision = -1;
 285  if (*f == '.') {
 286    f++;
 287    if (Py_ISDIGIT(static_cast<unsigned>(*f))) {
 288      precision = (*f - '0');
 289      f++;
 290      while (Py_ISDIGIT(static_cast<unsigned>(*f))) {
 291        if (precision > (kMaxWord - (static_cast<int>(*f) - '0')) / 10) {
 292          Thread::current()->raiseWithFmt(LayoutId::kValueError,
 293                                          "precision too big");
 294          return nullptr;
 295        }
 296        precision = (precision * 10) + (*f - '0');
 297        f++;
 298      }
 299    }
 300    if (*f == '%') {
 301      // "%.3%s" => f points to "3"
 302      f--;
 303    }
 304  }
 305  if (*f == '\0') {
 306    // bogus format "%.123" => go backward, f points to "3"
 307    f--;
 308  }
 309
 310  // Handle %ld, %lu, %lld and %llu.
 311  int longflag = 0;
 312  int longlongflag = 0;
 313  int size_tflag = 0;
 314  if (*f == 'l') {
 315    if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
 316      longflag = 1;
 317      ++f;
 318    } else if (f[1] == 'l' && (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
 319      longlongflag = 1;
 320      f += 2;
 321    }
 322  }
 323  // handle the size_t flag.
 324  else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
 325    size_tflag = 1;
 326    ++f;
 327  }
 328
 329  if (f[1] == '\0') writer->overallocate = 0;
 330
 331  switch (*f) {
 332    case 'c': {
 333      int ordinal = va_arg(*vargs, int);
 334      if (ordinal < 0 || ordinal > kMaxUnicode) {
 335        Thread::current()->raiseWithFmt(
 336            LayoutId::kOverflowError,
 337            "character argument not in range(0x110000)");
 338        return nullptr;
 339      }
 340      if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0) return nullptr;
 341      break;
 342    }
 343
 344    case 'i':
 345    case 'd':
 346    case 'u':
 347    case 'x': {
 348      // used by sprintf
 349      char buffer[kMaxLongLongChars];
 350      Py_ssize_t len;
 351
 352      if (*f == 'u') {
 353        if (longflag) {
 354          len = std::sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
 355        } else if (longlongflag) {
 356          len =
 357              std::sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
 358        } else if (size_tflag) {
 359          len = std::sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
 360                             va_arg(*vargs, size_t));
 361        } else {
 362          len = std::sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
 363        }
 364      } else if (*f == 'x') {
 365        len = std::sprintf(buffer, "%x", va_arg(*vargs, int));
 366      } else {
 367        if (longflag) {
 368          len = std::sprintf(buffer, "%li", va_arg(*vargs, long));
 369        } else if (longlongflag) {
 370          len = std::sprintf(buffer, "%lli", va_arg(*vargs, long long));
 371        } else if (size_tflag) {
 372          len = std::sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
 373                             va_arg(*vargs, Py_ssize_t));
 374        } else {
 375          len = std::sprintf(buffer, "%i", va_arg(*vargs, int));
 376        }
 377      }
 378      DCHECK(len >= 0, "len must be >= 0");
 379
 380      if (precision < len) precision = len;
 381
 382      Py_ssize_t arglen = Py_MAX(precision, width);
 383      if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1) return nullptr;
 384
 385      if (width > precision) {
 386        Py_ssize_t fill = width - precision;
 387        Py_UCS4 fillchar = zeropad ? '0' : ' ';
 388        // Facebook: Our internal strings are immutable, can't use
 389        // PyUnicode_Fill (D13491655)
 390        for (Py_ssize_t i = 0; i < fill; ++i) {
 391          if (_PyUnicodeWriter_WriteCharInline(writer, fillchar) == -1) {
 392            return nullptr;
 393          }
 394        }
 395      }
 396      if (precision > len) {
 397        Py_ssize_t fill = precision - len;
 398        // Facebook: Our internal strings are immutable, can't use
 399        // PyUnicode_Fill (D13491655)
 400        for (Py_ssize_t i = 0; i < fill; ++i) {
 401          if (_PyUnicodeWriter_WriteCharInline(writer, '0') == -1) {
 402            return nullptr;
 403          }
 404        }
 405      }
 406
 407      if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0) {
 408        return nullptr;
 409      }
 410      break;
 411    }
 412
 413    case 'p': {
 414      char number[kMaxLongLongChars];
 415
 416      Py_ssize_t len = std::sprintf(number, "%p", va_arg(*vargs, void*));
 417      DCHECK(len >= 0, "len must be >= 0");
 418
 419      // %p is ill-defined:  ensure leading 0x.
 420      if (number[1] == 'X') {
 421        number[1] = 'x';
 422      } else if (number[1] != 'x') {
 423        std::memmove(number + 2, number, std::strlen(number) + 1);
 424        number[0] = '0';
 425        number[1] = 'x';
 426        len += 2;
 427      }
 428
 429      if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0) {
 430        return nullptr;
 431      }
 432      break;
 433    }
 434
 435    case 's': {
 436      // UTF-8
 437      const char* s = va_arg(*vargs, const char*);
 438      if (writeCStr(writer, s, width, precision) < 0) {
 439        return nullptr;
 440      }
 441      break;
 442    }
 443
 444    case 'U': {
 445      PyObject* obj = va_arg(*vargs, PyObject*);
 446      // This used to call _PyUnicode_CHECK, which is deprecated, and which we
 447      // have not imported.
 448      DCHECK(obj, "obj must not be null");
 449
 450      if (writeStr(writer, obj, width, precision) == -1) {
 451        return nullptr;
 452      }
 453      break;
 454    }
 455
 456    case 'V': {
 457      PyObject* obj = va_arg(*vargs, PyObject*);
 458      const char* str = va_arg(*vargs, const char*);
 459      if (obj) {
 460        // This used to DCHECK _PyUnicode_CHECK, which is deprecated, and which
 461        // we have not imported.
 462        if (writeStr(writer, obj, width, precision) == -1) {
 463          return nullptr;
 464        }
 465      } else {
 466        DCHECK(str != nullptr, "str must not be null");
 467        if (writeCStr(writer, str, width, precision) < 0) {
 468          return nullptr;
 469        }
 470      }
 471      break;
 472    }
 473
 474    case 'S': {
 475      PyObject* obj = va_arg(*vargs, PyObject*);
 476      DCHECK(obj, "obj must not be null");
 477      PyObject* str = PyObject_Str(obj);
 478      if (!str) return nullptr;
 479      if (writeStr(writer, str, width, precision) == -1) {
 480        Py_DECREF(str);
 481        return nullptr;
 482      }
 483      Py_DECREF(str);
 484      break;
 485    }
 486
 487    case 'R': {
 488      PyObject* obj = va_arg(*vargs, PyObject*);
 489      DCHECK(obj, "obj must not be null");
 490      PyObject* repr = PyObject_Repr(obj);
 491      if (!repr) return nullptr;
 492      if (writeStr(writer, repr, width, precision) == -1) {
 493        Py_DECREF(repr);
 494        return nullptr;
 495      }
 496      Py_DECREF(repr);
 497      break;
 498    }
 499
 500    case 'A': {
 501      PyObject* obj = va_arg(*vargs, PyObject*);
 502      DCHECK(obj, "obj must not be null");
 503      PyObject* ascii = PyObject_ASCII(obj);
 504      if (!ascii) return nullptr;
 505      if (writeStr(writer, ascii, width, precision) == -1) {
 506        Py_DECREF(ascii);
 507        return nullptr;
 508      }
 509      Py_DECREF(ascii);
 510      break;
 511    }
 512
 513    case '%':
 514      if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) return nullptr;
 515      break;
 516
 517    default: {
 518      // if we stumble upon an unknown formatting code, copy the rest
 519      // of the format string to the output string. (we cannot just
 520      // skip the code, since there's no way to know what's in the
 521      // argument list)
 522      Py_ssize_t len = std::strlen(p);
 523      if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) {
 524        return nullptr;
 525      }
 526      f = p + len;
 527      return f;
 528    }
 529  }
 530
 531  f++;
 532  return f;
 533}
 534
 535PY_EXPORT int _PyUnicode_EqualToASCIIString(PyObject* unicode,
 536                                            const char* c_str) {
 537  DCHECK(unicode, "nullptr argument");
 538  DCHECK(c_str, "nullptr argument");
 539  RawObject obj = ApiHandle::asObject(ApiHandle::fromPyObject(unicode));
 540  DCHECK(Thread::current()->runtime()->isInstanceOfStr(obj),
 541         "non-str argument");
 542  return strUnderlying(obj).equalsCStr(c_str);
 543}
 544
 545PY_EXPORT int _PyUnicode_EQ(PyObject* aa, PyObject* bb) {
 546  Thread* thread = Thread::current();
 547  HandleScope scope(thread);
 548  Object obj_aa(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(aa)));
 549  Object obj_bb(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(bb)));
 550  Str lhs(&scope, strUnderlying(*obj_aa));
 551  Str rhs(&scope, strUnderlying(*obj_bb));
 552  return lhs.equals(*rhs);
 553}
 554
 555PY_EXPORT size_t Py_UNICODE_strlen(const Py_UNICODE* u) {
 556  DCHECK(u != nullptr, "u should not be null");
 557  return std::wcslen(u);
 558}
 559
 560PY_EXPORT int _PyUnicode_Ready(PyObject* /* unicode */) { return 0; }
 561
 562PY_EXPORT int PyUnicode_CheckExact_Func(PyObject* obj) {
 563  return ApiHandle::asObject(ApiHandle::fromPyObject(obj)).isStr();
 564}
 565
 566PY_EXPORT int PyUnicode_Check_Func(PyObject* obj) {
 567  return Thread::current()->runtime()->isInstanceOfStr(
 568      ApiHandle::asObject(ApiHandle::fromPyObject(obj)));
 569}
 570
 571PY_EXPORT PyObject* PyUnicode_FromString(const char* c_string) {
 572  Runtime* runtime = Thread::current()->runtime();
 573  return ApiHandle::newReference(runtime, runtime->newStrFromCStr(c_string));
 574}
 575
 576// Look for a surrogate codepoint in str[start:]. Note that start is a byte
 577// offset. Return the first index found in that range, or -1 if not found.
 578static word strFindSurrogateCodepoint(const Str& str, word start) {
 579  word length = str.length();
 580  word byte_index = start;
 581  while (byte_index < length) {
 582    word num_bytes;
 583    int32_t codepoint = str.codePointAt(byte_index, &num_bytes);
 584    if (Unicode::isSurrogate(codepoint)) {
 585      return byte_index;
 586    }
 587    byte_index += num_bytes;
 588  }
 589  return -1;
 590}
 591
 592PY_EXPORT const char* PyUnicode_AsUTF8AndSize(PyObject* pyunicode,
 593                                              Py_ssize_t* size) {
 594  Thread* thread = Thread::current();
 595  if (pyunicode == nullptr) {
 596    thread->raiseBadArgument();
 597    return nullptr;
 598  }
 599
 600  HandleScope scope(thread);
 601  ApiHandle* handle = ApiHandle::fromPyObject(pyunicode);
 602  Object obj(&scope, ApiHandle::asObject(handle));
 603  Runtime* runtime = thread->runtime();
 604  if (!runtime->isInstanceOfStr(*obj)) {
 605    thread->raiseBadInternalCall();
 606    return nullptr;
 607  }
 608
 609  Str str(&scope, strUnderlying(*obj));
 610  word length = str.length();
 611  if (size != nullptr) *size = length;
 612  if (void* cache = ApiHandle::cache(runtime, handle)) {
 613    return static_cast<char*>(cache);
 614  }
 615
 616  word surr_index = strFindSurrogateCodepoint(str, 0);
 617  if (surr_index != -1) {
 618    Object encoding(&scope, SmallStr::fromCStr("utf-8"));
 619    Object start(&scope, SmallInt::fromWord(surr_index));
 620    Object end(&scope, SmallInt::fromWord(surr_index + 1));
 621    Object reason(&scope, runtime->newStrFromCStr("surrogates not allowed"));
 622    Object exc(&scope,
 623               thread->invokeFunction5(ID(builtins), ID(UnicodeEncodeError),
 624                                       encoding, str, start, end, reason));
 625    Object err(&scope,
 626               thread->invokeFunction1(ID(_codecs), ID(strict_errors), exc));
 627    DCHECK(err.isErrorException(),
 628           "_codecs.strict_errors should raise an exception");
 629    return nullptr;
 630  }
 631
 632  byte* result = static_cast<byte*>(std::malloc(length + 1));
 633  str.copyTo(result, length);
 634  result[length] = '\0';
 635  ApiHandle::setCache(runtime, handle, result);
 636  ApiHandle::setBorrowedNoImmediate(handle);
 637  return reinterpret_cast<char*>(result);
 638}
 639
 640PY_EXPORT const char* PyUnicode_AsUTF8(PyObject* unicode) {
 641  return PyUnicode_AsUTF8AndSize(unicode, nullptr);
 642}
 643
 644PY_EXPORT PyObject* PyUnicode_FromStringAndSize(const char* u,
 645                                                Py_ssize_t size) {
 646  Thread* thread = Thread::current();
 647
 648  if (size < 0) {
 649    thread->raiseWithFmt(LayoutId::kSystemError,
 650                         "Negative size passed to PyUnicode_FromStringAndSize");
 651    return nullptr;
 652  }
 653  if (u == nullptr && size != 0) {
 654    // TODO(T36562134): Implement _PyUnicode_New
 655    UNIMPLEMENTED("_PyUnicode_New");
 656  }
 657  const byte* data = reinterpret_cast<const byte*>(u);
 658  Runtime* runtime = thread->runtime();
 659  return ApiHandle::newReference(
 660      runtime, runtime->newStrWithAll(View<byte>(data, size)));
 661}
 662
 663PY_EXPORT PyObject* PyUnicode_EncodeFSDefault(PyObject* unicode) {
 664  // TODO(T40363016): Allow arbitrary encodings instead of defaulting to utf-8
 665  return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
 666}
 667
 668PY_EXPORT PyObject* PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) {
 669  Thread* thread = Thread::current();
 670  // Since CPython optimizes for empty string, we must do so as well to make
 671  // sure we don't fail if maxchar is invalid
 672  if (size == 0) {
 673    return ApiHandle::newReference(thread->runtime(), Str::empty());
 674  }
 675  if (maxchar > kMaxUnicode) {
 676    thread->raiseWithFmt(LayoutId::kSystemError,
 677                         "invalid maximum character passed to PyUnicode_New");
 678    return nullptr;
 679  }
 680  if (size < 0) {
 681    thread->raiseWithFmt(LayoutId::kSystemError,
 682                         "Negative size passed to PyUnicode_New");
 683    return nullptr;
 684  }
 685  // TODO(T41498010): Add modifiable string state
 686  UNIMPLEMENTED("Cannot create mutable strings yet");
 687}
 688
 689PY_EXPORT void PyUnicode_Append(PyObject** p_left, PyObject* right) {
 690  if (p_left == nullptr) {
 691    if (!PyErr_Occurred()) {
 692      PyErr_BadInternalCall();
 693    }
 694    return;
 695  }
 696
 697  PyObject* left = *p_left;
 698  if (left == nullptr || right == nullptr || !PyUnicode_Check(left) ||
 699      !PyUnicode_Check(right)) {
 700    if (!PyErr_Occurred()) {
 701      PyErr_BadInternalCall();
 702    }
 703    Py_CLEAR(*p_left);
 704    return;
 705  }
 706  *p_left = PyUnicode_Concat(left, right);
 707  Py_DECREF(left);
 708}
 709
 710PY_EXPORT void PyUnicode_AppendAndDel(PyObject** p_left, PyObject* right) {
 711  PyUnicode_Append(p_left, right);
 712  Py_XDECREF(right);
 713}
 714
 715PY_EXPORT PyObject* _PyUnicode_AsASCIIString(PyObject* unicode,
 716                                             const char* errors) {
 717  DCHECK(unicode != nullptr, "unicode cannot be null");
 718  Thread* thread = Thread::current();
 719  HandleScope scope(thread);
 720  Runtime* runtime = thread->runtime();
 721  Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
 722  if (!runtime->isInstanceOfStr(*str)) {
 723    thread->raiseBadArgument();
 724    return nullptr;
 725  }
 726  Object errors_obj(&scope, symbolFromError(thread, errors));
 727  Object tuple_obj(&scope, thread->invokeFunction2(
 728                               ID(_codecs), ID(ascii_encode), str, errors_obj));
 729  if (tuple_obj.isError()) {
 730    return nullptr;
 731  }
 732  Tuple tuple(&scope, *tuple_obj);
 733  return ApiHandle::newReference(runtime, tuple.at(0));
 734}
 735
 736PY_EXPORT PyObject* PyUnicode_AsASCIIString(PyObject* unicode) {
 737  return _PyUnicode_AsASCIIString(unicode, "strict");
 738}
 739
 740PY_EXPORT PyObject* PyUnicode_AsCharmapString(PyObject* /* e */,
 741                                              PyObject* /* g */) {
 742  UNIMPLEMENTED("PyUnicode_AsCharmapString");
 743}
 744
 745PY_EXPORT PyObject* PyUnicode_AsDecodedObject(PyObject* /* e */,
 746                                              const char* /* g */,
 747                                              const char* /* s */) {
 748  UNIMPLEMENTED("PyUnicode_AsDecodedObject");
 749}
 750
 751PY_EXPORT PyObject* PyUnicode_AsDecodedUnicode(PyObject* /* e */,
 752                                               const char* /* g */,
 753                                               const char* /* s */) {
 754  UNIMPLEMENTED("PyUnicode_AsDecodedUnicode");
 755}
 756
 757PY_EXPORT PyObject* PyUnicode_AsEncodedObject(PyObject* /* e */,
 758                                              const char* /* g */,
 759                                              const char* /* s */) {
 760  UNIMPLEMENTED("PyUnicode_AsEncodedObject");
 761}
 762
 763PY_EXPORT PyObject* PyUnicode_AsEncodedString(PyObject* unicode,
 764                                              const char* encoding,
 765                                              const char* errors) {
 766  DCHECK(unicode != nullptr, "unicode cannot be null");
 767  if (encoding == nullptr) {
 768    return _PyUnicode_AsUTF8String(unicode, errors);
 769  }
 770  Thread* thread = Thread::current();
 771  HandleScope scope(thread);
 772  Runtime* runtime = thread->runtime();
 773  Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
 774  if (!runtime->isInstanceOfStr(*str)) {
 775    thread->raiseBadArgument();
 776    return nullptr;
 777  }
 778  Object encoding_obj(&scope, runtime->newStrFromCStr(encoding));
 779  Object errors_obj(&scope, errors == nullptr
 780                                ? Unbound::object()
 781                                : symbolFromError(thread, errors));
 782  Object result(&scope, thread->invokeFunction3(ID(_codecs), ID(encode), str,
 783                                                encoding_obj, errors_obj));
 784  if (result.isError()) {
 785    return nullptr;
 786  }
 787  if (runtime->isInstanceOfBytes(*result)) {
 788    return ApiHandle::newReference(runtime, *result);
 789  }
 790  if (runtime->isInstanceOfBytearray(*result)) {
 791    // Equivalent to calling PyErr_WarnFormat
 792    if (!ensureBuiltinModuleById(thread, ID(warnings)).isErrorException()) {
 793      Object category(&scope, runtime->typeAt(LayoutId::kRuntimeWarning));
 794      Object message(&scope,
 795                     runtime->newStrFromFmt(
 796                         "encoder %s returned bytearray instead of bytes; "
 797                         "use codecs.encode() to encode to arbitrary types",
 798                         encoding));
 799      Object stack_level(&scope, runtime->newInt(1));
 800      Object source(&scope, NoneType::object());
 801      Object err(&scope,
 802                 thread->invokeFunction4(ID(warnings), ID(warn), message,
 803                                         category, stack_level, source));
 804      if (err.isErrorException()) {
 805        thread->clearPendingException();
 806      }
 807    }
 808    Bytearray result_bytearray(&scope, *result);
 809    return ApiHandle::newReference(runtime,
 810                                   bytearrayAsBytes(thread, result_bytearray));
 811  }
 812  thread->raiseWithFmt(LayoutId::kTypeError,
 813                       "'%s' encoder returned '%T' instead of 'bytes'; "
 814                       "use codecs.encode() to encode to arbitrary types",
 815                       encoding, *result);
 816  return nullptr;
 817}
 818
 819PY_EXPORT PyObject* PyUnicode_AsEncodedUnicode(PyObject* /* e */,
 820                                               const char* /* g */,
 821                                               const char* /* s */) {
 822  UNIMPLEMENTED("PyUnicode_AsEncodedUnicode");
 823}
 824
 825PY_EXPORT PyObject* _PyUnicode_AsLatin1String(PyObject* unicode,
 826                                              const char* errors) {
 827  DCHECK(unicode != nullptr, "unicode cannot be null");
 828  Thread* thread = Thread::current();
 829  HandleScope scope(thread);
 830  Runtime* runtime = thread->runtime();
 831  Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
 832  if (!runtime->isInstanceOfStr(*str)) {
 833    thread->raiseBadArgument();
 834    return nullptr;
 835  }
 836  Object errors_obj(&scope, symbolFromError(thread, errors));
 837  Object tuple_obj(&scope,
 838                   thread->invokeFunction2(ID(_codecs), ID(latin_1_encode), str,
 839                                           errors_obj));
 840  if (tuple_obj.isError()) {
 841    return nullptr;
 842  }
 843  Tuple tuple(&scope, *tuple_obj);
 844  return ApiHandle::newReference(runtime, tuple.at(0));
 845}
 846
 847PY_EXPORT PyObject* PyUnicode_AsLatin1String(PyObject* unicode) {
 848  return _PyUnicode_AsLatin1String(unicode, "strict");
 849}
 850
 851PY_EXPORT PyObject* PyUnicode_AsMBCSString(PyObject* /* e */) {
 852  UNIMPLEMENTED("PyUnicode_AsMBCSString");
 853}
 854
 855PY_EXPORT PyObject* PyUnicode_AsRawUnicodeEscapeString(PyObject* /* e */) {
 856  UNIMPLEMENTED("PyUnicode_AsRawUnicodeEscapeString");
 857}
 858
 859PY_EXPORT Py_UCS4* PyUnicode_AsUCS4(PyObject* u, Py_UCS4* buffer,
 860                                    Py_ssize_t buflen, int copy_null) {
 861  if (buffer == nullptr || buflen < 0) {
 862    PyErr_BadInternalCall();
 863    return nullptr;
 864  }
 865
 866  Thread* thread = Thread::current();
 867  HandleScope scope(thread);
 868  Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(u)));
 869  if (!thread->runtime()->isInstanceOfStr(*obj)) {
 870    thread->raiseBadArgument();
 871  }
 872
 873  Str str(&scope, strUnderlying(*obj));
 874  word num_codepoints = str.codePointLength();
 875  word target_buflen = copy_null ? num_codepoints + 1 : num_codepoints;
 876  if (buflen < target_buflen) {
 877    thread->raiseWithFmt(LayoutId::kSystemError,
 878                         "string is longer than the buffer");
 879    if (copy_null != 0 && 0 < buflen) {
 880      buffer[0] = 0;
 881    }
 882    return nullptr;
 883  }
 884
 885  for (word i = 0, offset = 0; i < num_codepoints; i++) {
 886    word num_bytes;
 887    buffer[i] = str.codePointAt(offset, &num_bytes);
 888    offset += num_bytes;
 889  }
 890  if (copy_null != 0) buffer[num_codepoints] = 0;
 891
 892  return buffer;
 893}
 894
 895PY_EXPORT Py_UCS4* PyUnicode_AsUCS4Copy(PyObject* str) {
 896  Py_ssize_t len = PyUnicode_GET_LENGTH(str) + 1;
 897  Py_UCS4* result = static_cast<Py_UCS4*>(PyMem_Malloc(len * sizeof(Py_UCS4)));
 898  if (result == nullptr) {
 899    PyErr_NoMemory();
 900    return nullptr;
 901  }
 902  return PyUnicode_AsUCS4(str, result, len, 1);
 903}
 904
 905PY_EXPORT PyObject* PyUnicode_AsUTF16String(PyObject* unicode) {
 906  return _PyUnicode_EncodeUTF16(unicode, nullptr, 0);
 907}
 908
 909PY_EXPORT PyObject* PyUnicode_AsUTF32String(PyObject* unicode) {
 910  return _PyUnicode_EncodeUTF32(unicode, nullptr, 0);
 911}
 912
 913PY_EXPORT PyObject* PyUnicode_AsUTF8String(PyObject* unicode) {
 914  return _PyUnicode_AsUTF8String(unicode, "strict");
 915}
 916
 917PY_EXPORT PyObject* PyUnicode_AsUnicodeEscapeString(PyObject* /* e */) {
 918  UNIMPLEMENTED("PyUnicode_AsUnicodeEscapeString");
 919}
 920
 921PY_EXPORT Py_ssize_t PyUnicode_AsWideChar(PyObject* str, wchar_t* result,
 922                                          Py_ssize_t size) {
 923  Thread* thread = Thread::current();
 924  if (str == nullptr) {
 925    thread->raiseBadInternalCall();
 926    return -1;
 927  }
 928  HandleScope scope(thread);
 929  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
 930  Runtime* runtime = thread->runtime();
 931  if (!runtime->isInstanceOfStr(*str_obj)) {
 932    thread->raiseWithFmt(
 933        LayoutId::kTypeError,
 934        "PyUnicode_AsWideChar requires 'str' object but received a '%T'",
 935        &str_obj);
 936    return -1;
 937  }
 938  Str str_str(&scope, strUnderlying(*str_obj));
 939  Py_ssize_t num_code_points = str_str.codePointLength();
 940  if (size > num_code_points) {
 941    size = num_code_points + 1;
 942  } else {
 943    num_code_points = size;
 944  }
 945
 946  {
 947    word byte_count = str_str.length();
 948    for (word byte_index = 0, wchar_index = 0, num_bytes = 0;
 949         byte_index < byte_count && wchar_index < size;
 950         byte_index += num_bytes, wchar_index += 1) {
 951      int32_t cp = str_str.codePointAt(byte_index, &num_bytes);
 952      static_assert(sizeof(wchar_t) == sizeof(cp), "Requires 32bit wchar_t");
 953      if (result != nullptr) {
 954        result[wchar_index] = static_cast<wchar_t>(cp);
 955      }
 956    }
 957    if (num_code_points < size) {
 958      result[num_code_points] = '\0';
 959    }
 960  }
 961
 962  return num_code_points;
 963}
 964
 965PY_EXPORT wchar_t* PyUnicode_AsWideCharString(PyObject* str,
 966                                              Py_ssize_t* result_len) {
 967  Thread* thread = Thread::current();
 968  if (str == nullptr) {
 969    thread->raiseBadInternalCall();
 970    return nullptr;
 971  }
 972  HandleScope scope(thread);
 973  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
 974  Runtime* runtime = thread->runtime();
 975  if (!runtime->isInstanceOfStr(*str_obj)) {
 976    thread->raiseWithFmt(
 977        LayoutId::kTypeError,
 978        "PyUnicode_AsWideChar requires 'str' object but received a '%T'",
 979        &str_obj);
 980    return nullptr;
 981  }
 982  Str str_str(&scope, strUnderlying(*str_obj));
 983  word length = str_str.codePointLength();
 984  wchar_t* result =
 985      static_cast<wchar_t*>(PyMem_Malloc((length + 1) * sizeof(wchar_t)));
 986  if (result == nullptr) {
 987    thread->raiseMemoryError();
 988    return nullptr;
 989  }
 990
 991  {
 992    word byte_count = str_str.length();
 993    for (word byte_index = 0, wchar_index = 0, num_bytes = 0;
 994         byte_index < byte_count && wchar_index < length + 1;
 995         byte_index += num_bytes, wchar_index += 1) {
 996      int32_t cp = str_str.codePointAt(byte_index, &num_bytes);
 997      if (cp == '\0') {
 998        PyMem_Free(result);
 999        thread->raiseWithFmt(LayoutId::kValueError, "embedded null character");
1000        return nullptr;
1001      }
1002      static_assert(sizeof(wchar_t) == sizeof(cp), "Requires 32bit wchar_t");
1003      result[wchar_index] = static_cast<wchar_t>(cp);
1004    }
1005    result[length] = '\0';
1006  }
1007
1008  if (result_len != nullptr) {
1009    *result_len = length;
1010  }
1011  return result;
1012}
1013
1014PY_EXPORT PyObject* PyUnicode_BuildEncodingMap(PyObject* /* g */) {
1015  UNIMPLEMENTED("PyUnicode_BuildEncodingMap");
1016}
1017
1018PY_EXPORT int PyUnicode_Compare(PyObject* left, PyObject* right) {
1019  Thread* thread = Thread::current();
1020  if (left == nullptr || right == nullptr) {
1021    thread->raiseBadInternalCall();
1022    return -1;
1023  }
1024
1025  Runtime* runtime = thread->runtime();
1026  HandleScope scope(thread);
1027  Object left_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(left)));
1028  Object right_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(right)));
1029  if (runtime->isInstanceOfStr(*left_obj) &&
1030      runtime->isInstanceOfStr(*right_obj)) {
1031    Str left_str(&scope, strUnderlying(*left_obj));
1032    Str right_str(&scope, strUnderlying(*right_obj));
1033    word result = left_str.compare(*right_str);
1034    return result > 0 ? 1 : (result < 0 ? -1 : 0);
1035  }
1036  thread->raiseWithFmt(LayoutId::kTypeError, "Can't compare %T and %T",
1037                       &left_obj, &right_obj);
1038  return -1;
1039}
1040
1041PY_EXPORT int PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str) {
1042  Thread* thread = Thread::current();
1043  HandleScope scope(thread);
1044  Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(uni)));
1045  Str str_obj(&scope, strUnderlying(*obj));
1046  // TODO(atalaba): Allow for proper comparison against Latin-1 strings. For
1047  // example, in CPython: "\xC3\xA9" (UTF-8) == "\xE9" (Latin-1), and
1048  // "\xE9 longer" > "\xC3\xA9".
1049  return str_obj.compareCStr(str);
1050}
1051
1052PY_EXPORT PyObject* PyUnicode_Concat(PyObject* left, PyObject* right) {
1053  Thread* thread = Thread::current();
1054  HandleScope scope(thread);
1055  Runtime* runtime = thread->runtime();
1056
1057  Object left_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(left)));
1058  Object right_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(right)));
1059  if (!runtime->isInstanceOfStr(*left_obj) ||
1060      !runtime->isInstanceOfStr(*right_obj)) {
1061    thread->raiseWithFmt(LayoutId::kTypeError,
1062                         "can only concatenate str to str");
1063    return nullptr;
1064  }
1065  Str left_str(&scope, strUnderlying(*left_obj));
1066  Str right_str(&scope, strUnderlying(*right_obj));
1067  word dummy;
1068  if (__builtin_add_overflow(left_str.length(), right_str.length(), &dummy)) {
1069    thread->raiseWithFmt(LayoutId::kOverflowError,
1070                         "strings are too large to concat");
1071    return nullptr;
1072  }
1073  return ApiHandle::newReference(
1074      runtime, runtime->strConcat(thread, left_str, right_str));
1075}
1076
1077PY_EXPORT int PyUnicode_Contains(PyObject* str, PyObject* substr) {
1078  DCHECK(str != nullptr, "str should not be null");
1079  DCHECK(substr != nullptr, "substr should not be null");
1080  Thread* thread = Thread::current();
1081  HandleScope scope(thread);
1082  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1083  Object substr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr)));
1084  Object result(&scope,
1085                thread->invokeMethodStatic2(LayoutId::kStr, ID(__contains__),
1086                                            str_obj, substr_obj));
1087  if (result.isError()) {
1088    if (result.isErrorNotFound()) {
1089      thread->raiseWithFmt(LayoutId::kTypeError,
1090                           "could not call str.__contains__");
1091    }
1092    return -1;
1093  }
1094  DCHECK(result.isBool(), "result of __contains__ should be bool");
1095  return Bool::cast(*result).value();
1096}
1097
1098PY_EXPORT Py_ssize_t PyUnicode_CopyCharacters(PyObject*, Py_ssize_t, PyObject*,
1099                                              Py_ssize_t, Py_ssize_t) {
1100  UNIMPLEMENTED("PyUnicode_CopyCharacters");
1101}
1102
1103PY_EXPORT Py_ssize_t PyUnicode_Count(PyObject* /* r */, PyObject* /* r */,
1104                                     Py_ssize_t /* t */, Py_ssize_t /* d */) {
1105  UNIMPLEMENTED("PyUnicode_Count");
1106}
1107
1108PY_EXPORT PyObject* PyUnicode_Decode(const char* c_str, Py_ssize_t size,
1109                                     const char* encoding, const char* errors) {
1110  DCHECK(c_str != nullptr, "c_str cannot be null");
1111  if (encoding == nullptr) {
1112    return PyUnicode_DecodeUTF8Stateful(c_str, size, errors, nullptr);
1113  }
1114
1115  Thread* thread = Thread::current();
1116  Runtime* runtime = thread->runtime();
1117  HandleScope scope(thread);
1118  Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>(
1119                          reinterpret_cast<const byte*>(c_str), size)));
1120  Object errors_obj(&scope, symbolFromError(thread, errors));
1121  Object encoding_obj(&scope, runtime->newStrFromCStr(encoding));
1122  Object result(&scope, thread->invokeFunction3(ID(_codecs), ID(decode), bytes,
1123                                                encoding_obj, errors_obj));
1124  if (result.isError()) {
1125    return nullptr;
1126  }
1127  return ApiHandle::newReference(runtime, *result);
1128}
1129
1130PY_EXPORT PyObject* PyUnicode_DecodeASCII(const char* c_str, Py_ssize_t size,
1131                                          const char* errors) {
1132  Thread* thread = Thread::current();
1133  Runtime* runtime = thread->runtime();
1134  HandleScope scope(thread);
1135  Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>(
1136                          reinterpret_cast<const byte*>(c_str), size)));
1137  Str errors_obj(&scope, symbolFromError(thread, errors));
1138  Object result_obj(
1139      &scope, thread->invokeFunction2(ID(_codecs), ID(ascii_decode), bytes,
1140                                      errors_obj));
1141  if (result_obj.isError()) {
1142    if (result_obj.isErrorNotFound()) {
1143      thread->raiseWithFmt(LayoutId::kSystemError,
1144                           "could not call _codecs.ascii_decode");
1145    }
1146    return nullptr;
1147  }
1148  Tuple result(&scope, *result_obj);
1149  return ApiHandle::newReference(runtime, result.at(0));
1150}
1151
1152PY_EXPORT PyObject* PyUnicode_DecodeCharmap(const char* /* s */,
1153                                            Py_ssize_t /* e */,
1154                                            PyObject* /* g */,
1155                                            const char* /* s */) {
1156  UNIMPLEMENTED("PyUnicode_DecodeCharmap");
1157}
1158
1159PY_EXPORT PyObject* PyUnicode_DecodeCodePageStateful(int /* e */,
1160                                                     const char* /* s */,
1161                                                     Py_ssize_t /* e */,
1162                                                     const char* /* s */,
1163                                                     Py_ssize_t* /* d */) {
1164  UNIMPLEMENTED("PyUnicode_DecodeCodePageStateful");
1165}
1166
1167PY_EXPORT PyObject* PyUnicode_DecodeFSDefault(const char* c_str) {
1168  Runtime* runtime = Thread::current()->runtime();
1169  return ApiHandle::newReference(runtime, runtime->newStrFromCStr(c_str));
1170}
1171
1172PY_EXPORT PyObject* PyUnicode_DecodeFSDefaultAndSize(const char* c_str,
1173                                                     Py_ssize_t size) {
1174  Runtime* runtime = Thread::current()->runtime();
1175  View<byte> str(reinterpret_cast<const byte*>(c_str), size);
1176  return ApiHandle::newReference(runtime, runtime->newStrWithAll(str));
1177}
1178
1179PY_EXPORT PyObject* PyUnicode_DecodeLatin1(const char* c_str, Py_ssize_t size,
1180                                           const char* /* errors */) {
1181  Thread* thread = Thread::current();
1182  Runtime* runtime = thread->runtime();
1183  HandleScope scope(thread);
1184  Bytes bytes(&scope, runtime->newBytesWithAll(View<byte>(
1185                          reinterpret_cast<const byte*>(c_str), size)));
1186  Object result_obj(
1187      &scope, thread->invokeFunction1(ID(_codecs), ID(latin_1_decode), bytes));
1188  if (result_obj.isError()) {
1189    if (result_obj.isErrorNotFound()) {
1190      thread->raiseWithFmt(LayoutId::kSystemError,
1191                           "could not call _codecs.latin_1_decode");
1192    }
1193    return nullptr;
1194  }
1195  Tuple result(&scope, *result_obj);
1196  return ApiHandle::newReference(runtime, result.at(0));
1197}
1198
1199PY_EXPORT PyObject* PyUnicode_DecodeLocale(const char* str,
1200                                           const char* errors) {
1201  return PyUnicode_DecodeLocaleAndSize(str, std::strlen(str), errors);
1202}
1203
1204PY_EXPORT PyObject* PyUnicode_DecodeLocaleAndSize(const char* str,
1205                                                  Py_ssize_t len,
1206                                                  const char* errors) {
1207  _Py_error_handler surrogateescape;
1208  if (errors == nullptr || std::strcmp(errors, "strict") == 0) {
1209    surrogateescape = _Py_ERROR_STRICT;
1210  } else if (std::strcmp(errors, "surrogateescape") == 0) {
1211    surrogateescape = _Py_ERROR_SURROGATEESCAPE;
1212  } else {
1213    Thread::current()->raiseWithFmt(
1214        LayoutId::kValueError,
1215        "only 'strict' and 'surrogateescape' error handlers "
1216        "are supported, not '%s'",
1217        errors);
1218    return nullptr;
1219  }
1220
1221  if (str[len] != '\0' || static_cast<size_t>(len) != std::strlen(str)) {
1222    Thread::current()->raiseWithFmt(LayoutId::kValueError,
1223                                    "embedded null byte");
1224    return nullptr;
1225  }
1226
1227  wchar_t* wstr;
1228  size_t wlen;
1229  const char* reason;
1230  int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason, 1, surrogateescape);
1231  if (res != 0) {
1232    if (res == -2) {
1233      PyObject* exc =
1234          PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns", "locale",
1235                                str, len, wlen, wlen + 1, reason);
1236      if (exc != nullptr) {
1237        PyCodec_StrictErrors(exc);
1238        Py_DECREF(exc);
1239      }
1240    } else {
1241      PyErr_NoMemory();
1242    }
1243    return nullptr;
1244  }
1245
1246  PyObject* unicode = PyUnicode_FromWideChar(wstr, wlen);
1247  PyMem_RawFree(wstr);
1248  return unicode;
1249}
1250
1251PY_EXPORT PyObject* PyUnicode_DecodeMBCS(const char* /* s */,
1252                                         Py_ssize_t /* e */,
1253                                         const char* /* s */) {
1254  UNIMPLEMENTED("PyUnicode_DecodeMBCS");
1255}
1256
1257PY_EXPORT PyObject* PyUnicode_DecodeMBCSStateful(const char* /* s */,
1258                                                 Py_ssize_t /* e */,
1259                                                 const char* /* s */,
1260                                                 Py_ssize_t* /* d */) {
1261  UNIMPLEMENTED("PyUnicode_DecodeMBCSStateful");
1262}
1263
1264PY_EXPORT PyObject* PyUnicode_DecodeRawUnicodeEscape(const char* /* s */,
1265                                                     Py_ssize_t /* e */,
1266                                                     const char* /* s */) {
1267  UNIMPLEMENTED("PyUnicode_DecodeRawUnicodeEscape");
1268}
1269
1270PY_EXPORT PyObject* PyUnicode_DecodeUTF16(const char* /* s */,
1271                                          Py_ssize_t /* e */,
1272                                          const char* /* s */, int* /* r */) {
1273  UNIMPLEMENTED("PyUnicode_DecodeUTF16");
1274}
1275
1276PY_EXPORT PyObject* PyUnicode_DecodeUTF16Stateful(const char* /* s */,
1277                                                  Py_ssize_t /* e */,
1278                                                  const char* /* s */,
1279                                                  int* /* r */,
1280                                                  Py_ssize_t* /* d */) {
1281  UNIMPLEMENTED("PyUnicode_DecodeUTF16Stateful");
1282}
1283
1284PY_EXPORT PyObject* PyUnicode_DecodeUTF32(const char* /* s */,
1285                                          Py_ssize_t /* e */,
1286                                          const char* /* s */, int* /* r */) {
1287  UNIMPLEMENTED("PyUnicode_DecodeUTF32");
1288}
1289
1290PY_EXPORT PyObject* PyUnicode_DecodeUTF32Stateful(const char* /* s */,
1291                                                  Py_ssize_t /* e */,
1292                                                  const char* /* s */,
1293                                                  int* /* r */,
1294                                                  Py_ssize_t* /* d */) {
1295  UNIMPLEMENTED("PyUnicode_DecodeUTF32Stateful");
1296}
1297
1298PY_EXPORT PyObject* PyUnicode_DecodeUTF7(const char* /* s */,
1299                                         Py_ssize_t /* e */,
1300                                         const char* /* s */) {
1301  UNIMPLEMENTED("PyUnicode_DecodeUTF7");
1302}
1303
1304PY_EXPORT PyObject* PyUnicode_DecodeUTF7Stateful(const char* /* s */,
1305                                                 Py_ssize_t /* e */,
1306                                                 const char* /* s */,
1307                                                 Py_ssize_t* /* d */) {
1308  UNIMPLEMENTED("PyUnicode_DecodeUTF7Stateful");
1309}
1310
1311PY_EXPORT PyObject* PyUnicode_DecodeUTF8(const char* c_str, Py_ssize_t size,
1312                                         const char* errors) {
1313  return PyUnicode_DecodeUTF8Stateful(c_str, size, errors, nullptr);
1314}
1315
1316PY_EXPORT PyObject* PyUnicode_DecodeUTF8Stateful(const char* c_str,
1317                                                 Py_ssize_t size,
1318                                                 const char* errors,
1319                                                 Py_ssize_t* consumed) {
1320  DCHECK(c_str != nullptr, "c_str cannot be null");
1321
1322  Thread* thread = Thread::current();
1323  HandleScope scope(thread);
1324  Runtime* runtime = thread->runtime();
1325  word i = 0;
1326  const byte* byte_str = reinterpret_cast<const byte*>(c_str);
1327  for (; i < size; ++i) {
1328    if (byte_str[i] > kMaxASCII) break;
1329  }
1330  if (i == size) {
1331    if (consumed != nullptr) {
1332      *consumed = size;
1333    }
1334    return ApiHandle::newReference(runtime,
1335                                   runtime->newStrWithAll({byte_str, size}));
1336  }
1337  Object bytes(&scope, runtime->newBytesWithAll(View<byte>({byte_str, size})));
1338  Object errors_obj(&scope, symbolFromError(thread, errors));
1339  Object is_final(&scope, Bool::fromBool(consumed == nullptr));
1340  Object result_obj(
1341      &scope, thread->invokeFunction3(ID(_codecs), ID(utf_8_decode), bytes,
1342                                      errors_obj, is_final));
1343  if (result_obj.isError()) {
1344    if (result_obj.isErrorNotFound()) {
1345      thread->raiseWithFmt(LayoutId::kSystemError,
1346                           "could not call _codecs._utf_8_decode_stateful");
1347    }
1348    return nullptr;
1349  }
1350  Tuple result(&scope, *result_obj);
1351  if (consumed != nullptr) {
1352    *consumed = Int::cast(result.at(1)).asWord();
1353  }
1354  return ApiHandle::newReference(runtime, result.at(0));
1355}
1356
1357PY_EXPORT PyObject* PyUnicode_DecodeUnicodeEscape(const char* c_str,
1358                                                  Py_ssize_t size,
1359                                                  const char* errors) {
1360  DCHECK(c_str != nullptr, "c_str cannot be null");
1361  const char* first_invalid_escape;
1362  PyObject* result = _PyUnicode_DecodeUnicodeEscape(c_str, size, errors,
1363                                                    &first_invalid_escape);
1364  if (result == nullptr) {
1365    return nullptr;
1366  }
1367  if (first_invalid_escape != nullptr) {
1368    if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
1369                         "invalid escape sequence '\\%c'",
1370                         static_cast<byte>(*first_invalid_escape)) < 0) {
1371      Py_DECREF(result);
1372      return nullptr;
1373    }
1374  }
1375  return result;
1376}
1377
1378PY_EXPORT PyObject* _PyUnicode_DecodeUnicodeEscape(
1379    const char* c_str, Py_ssize_t size, const char* errors,
1380    const char** first_invalid_escape) {
1381  DCHECK(c_str != nullptr, "c_str cannot be null");
1382  DCHECK(first_invalid_escape != nullptr,
1383         "first_invalid_escape cannot be null");
1384
1385  // So we can remember if we've seen an invalid escape char or not
1386  *first_invalid_escape = nullptr;
1387
1388  Thread* thread = Thread::current();
1389  HandleScope scope(thread);
1390  Runtime* runtime = thread->runtime();
1391  Object bytes(&scope, runtime->newBytesWithAll(View<byte>(
1392                           reinterpret_cast<const byte*>(c_str), size)));
1393  Object errors_obj(&scope, symbolFromError(thread, errors));
1394  Object result_obj(
1395      &scope,
1396      thread->invokeFunction2(ID(_codecs), ID(_unicode_escape_decode_stateful),
1397                              bytes, errors_obj));
1398  if (result_obj.isError()) {
1399    if (result_obj.isErrorNotFound()) {
1400      thread->raiseWithFmt(LayoutId::kSystemError,
1401                           "could not call _codecs.unicode_escape_decode");
1402    }
1403    return nullptr;
1404  }
1405  Tuple result(&scope, *result_obj);
1406  Int first_invalid_index(&scope, result.at(2));
1407  word invalid_index = first_invalid_index.asWord();
1408  if (invalid_index > -1) {
1409    *first_invalid_escape = c_str + invalid_index;
1410  }
1411  return ApiHandle::newReference(runtime, result.at(0));
1412}
1413
1414PY_EXPORT PyObject* PyUnicode_EncodeCodePage(int /* e */, PyObject* /* e */,
1415                                             const char* /* s */) {
1416  UNIMPLEMENTED("PyUnicode_EncodeCodePage");
1417}
1418
1419PY_EXPORT PyObject* PyUnicode_EncodeLocale(PyObject* unicode,
1420                                           const char* errors) {
1421  _Py_error_handler surrogateescape;
1422  if (errors == nullptr || std::strcmp(errors, "strict") == 0) {
1423    surrogateescape = _Py_ERROR_STRICT;
1424  } else if (std::strcmp(errors, "surrogateescape") == 0) {
1425    surrogateescape = _Py_ERROR_SURROGATEESCAPE;
1426  } else {
1427    Thread::current()->raiseWithFmt(
1428        LayoutId::kValueError,
1429        "only 'strict' and 'surrogateescape' error handlers "
1430        "are supported, not '%s'",
1431        errors);
1432    return nullptr;
1433  }
1434  Py_ssize_t wlen;
1435  wchar_t* wstr = PyUnicode_AsWideCharString(unicode, &wlen);
1436  if (wstr == nullptr) {
1437    return nullptr;
1438  }
1439
1440  if (static_cast<size_t>(wlen) != std::wcslen(wstr)) {
1441    Thread::current()->raiseWithFmt(LayoutId::kValueError,
1442                                    "embedded null character");
1443    PyMem_Free(wstr);
1444    return nullptr;
1445  }
1446
1447  char* str;
1448  size_t error_pos;
1449  const char* reason;
1450  int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
1451                               /*current_locale=*/1, surrogateescape);
1452  PyMem_Free(wstr);
1453
1454  if (res != 0) {
1455    if (res == -2) {
1456      PyObject* exc =
1457          PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns", "locale",
1458                                unicode, error_pos, error_pos + 1, reason);
1459      if (exc != nullptr) {
1460        PyCodec_StrictErrors(exc);
1461        Py_DECREF(exc);
1462      }
1463    } else {
1464      PyErr_NoMemory();
1465    }
1466    return nullptr;
1467  }
1468
1469  PyObject* bytes = PyBytes_FromString(str);
1470  PyMem_RawFree(str);
1471  return bytes;
1472}
1473
1474PY_EXPORT PyObject* _PyUnicode_EncodeUTF16(PyObject* unicode,
1475                                           const char* errors, int byteorder) {
1476  DCHECK(unicode != nullptr, "unicode cannot be null");
1477  Thread* thread = Thread::current();
1478  HandleScope scope(thread);
1479  Runtime* runtime = thread->runtime();
1480  Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
1481  if (!runtime->isInstanceOfStr(*str)) {
1482    thread->raiseBadArgument();
1483    return nullptr;
1484  }
1485  Object errors_obj(&scope, symbolFromError(thread, errors));
1486  Object byteorder_obj(&scope, runtime->newInt(byteorder));
1487  Object tuple_obj(&scope,
1488                   thread->invokeFunction3(ID(_codecs), ID(utf_16_encode), str,
1489                                           errors_obj, byteorder_obj));
1490  if (tuple_obj.isError()) {
1491    return nullptr;
1492  }
1493  Tuple tuple(&scope, *tuple_obj);
1494  return ApiHandle::newReference(runtime, tuple.at(0));
1495}
1496
1497PY_EXPORT PyObject* PyUnicode_EncodeUTF16(const Py_UNICODE* unicode,
1498                                          Py_ssize_t size, const char* errors,
1499                                          int byteorder) {
1500  PyObject* str = PyUnicode_FromUnicode(unicode, size);
1501  if (str == nullptr) return nullptr;
1502  PyObject* result = _PyUnicode_EncodeUTF16(str, errors, byteorder);
1503  Py_DECREF(str);
1504  return result;
1505}
1506
1507PY_EXPORT PyObject* _PyUnicode_EncodeUTF32(PyObject* unicode,
1508                                           const char* errors, int byteorder) {
1509  DCHECK(unicode != nullptr, "unicode cannot be null");
1510  Thread* thread = Thread::current();
1511  HandleScope scope(thread);
1512  Runtime* runtime = thread->runtime();
1513  Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
1514  if (!runtime->isInstanceOfStr(*str)) {
1515    thread->raiseBadArgument();
1516    return nullptr;
1517  }
1518  Object errors_obj(&scope, symbolFromError(thread, errors));
1519  Object byteorder_obj(&scope, runtime->newInt(byteorder));
1520  Object tuple_obj(&scope,
1521                   thread->invokeFunction3(ID(_codecs), ID(utf_32_encode), str,
1522                                           errors_obj, byteorder_obj));
1523  if (tuple_obj.isError()) {
1524    return nullptr;
1525  }
1526  Tuple tuple(&scope, *tuple_obj);
1527  return ApiHandle::newReference(runtime, tuple.at(0));
1528}
1529
1530PY_EXPORT PyObject* PyUnicode_EncodeUTF32(const Py_UNICODE* unicode,
1531                                          Py_ssize_t size, const char* errors,
1532                                          int byteorder) {
1533  PyObject* str = PyUnicode_FromUnicode(unicode, size);
1534  if (str == nullptr) return nullptr;
1535  PyObject* result = _PyUnicode_EncodeUTF32(str, errors, byteorder);
1536  Py_DECREF(str);
1537  return result;
1538}
1539
1540PY_EXPORT int PyUnicode_FSConverter(PyObject* arg, void* addr) {
1541  if (arg == nullptr) {
1542    Py_DECREF(*reinterpret_cast<PyObject**>(addr));
1543    *reinterpret_cast<PyObject**>(addr) = nullptr;
1544    return 1;
1545  }
1546  Thread* thread = Thread::current();
1547  HandleScope scope(thread);
1548  Object arg_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(arg)));
1549  Object path(&scope, NoneType::object());
1550  Runtime* runtime = thread->runtime();
1551  if (runtime->isInstanceOfStr(*arg_obj) ||
1552      runtime->isInstanceOfBytes(*arg_obj)) {
1553    path = *arg_obj;
1554  } else {
1555    path = thread->invokeFunction1(ID(_io), ID(_fspath), arg_obj);
1556    if (path.isErrorException()) {
1557      return 0;
1558    }
1559  }
1560  Object output(&scope, NoneType::object());
1561  if (runtime->isInstanceOfBytes(*path)) {
1562    output = *path;
1563  } else {
1564    CHECK(std::strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0, "");
1565    CHECK(std::strcmp(Py_FileSystemDefaultEncodeErrors, "surrogatepass") == 0,
1566          "");
1567    // PyOS_FSPath/_io._fspath guarantee their returned value is bytes or str.
1568    // This is an inlined PyUnicode_FSDecoder, which does a UTF-8 decode with
1569    // surrogatepass. Since our strings are UTF-8 with UTF-16 surrogates
1570    // (WTF-8), we can just copy the bytes out.
1571    Str path_str(&scope, strUnderlying(*path));
1572    word path_len = path_str.length();
1573    MutableBytes bytes(&scope, runtime->newMutableBytesUninitialized(path_len));
1574    bytes.replaceFromWithStr(0, *path_str, path_len);
1575    output = bytes.becomeImmutable();
1576  }
1577  Bytes underlying(&scope, bytesUnderlying(*output));
1578  if (underlying.findByte('\0', /*start=*/0, /*length=*/underlying.length()) !=
1579      -1) {
1580    thread->raiseWithFmt(LayoutId::kValueError, "embedded null byte");
1581    return 0;
1582  }
1583  *reinterpret_cast<PyObject**>(addr) =
1584      ApiHandle::newReference(runtime, *output);
1585  return Py_CLEANUP_SUPPORTED;
1586}
1587
1588PY_EXPORT int PyUnicode_FSDecoder(PyObject* arg, void* addr) {
1589  if (arg == nullptr) {
1590    Py_DECREF(*(PyObject**)addr);
1591    *reinterpret_cast<PyObject**>(addr) = nullptr;
1592    return 1;
1593  }
1594
1595  bool is_buffer = PyObject_CheckBuffer(arg);
1596  PyObject* path;
1597  if (!is_buffer) {
1598    path = PyOS_FSPath(arg);
1599    if (path == nullptr) return 0;
1600  } else {
1601    path = arg;
1602    Py_INCREF(arg);
1603  }
1604
1605  PyObject* output;
1606  if (PyUnicode_Check(path)) {
1607    output = path;
1608  } else if (PyBytes_Check(path) || is_buffer) {
1609    if (!PyBytes_Check(path) &&
1610        PyErr_WarnFormat(
1611            PyExc_DeprecationWarning, 1,
1612            "path should be string, bytes, or os.PathLike, not %.200s",
1613            PyObject_TypeName(arg))) {
1614      Py_DECREF(path);
1615      return 0;
1616    }
1617    PyObject* path_bytes = PyBytes_FromObject(path);
1618    Py_DECREF(path);
1619    if (!path_bytes) return 0;
1620    output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
1621                                              PyBytes_GET_SIZE(path_bytes));
1622    Py_DECREF(path_bytes);
1623    if (!output) return 0;
1624  } else {
1625    Thread::current()->raiseWithFmt(
1626        LayoutId::kTypeError,
1627        "path should be string, bytes, or os.PathLike, not %s",
1628        PyObject_TypeName(arg));
1629    Py_DECREF(path);
1630    return 0;
1631  }
1632
1633  Thread* thread = Thread::current();
1634  HandleScope scope(thread);
1635  Str output_str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(output)));
1636  if (strFindAsciiChar(output_str, '\0') >= 0) {
1637    thread->raiseWithFmt(LayoutId::kValueError, "embedded null character");
1638    Py_DECREF(output);
1639    return 0;
1640  }
1641  *reinterpret_cast<PyObject**>(addr) = output;
1642  return Py_CLEANUP_SUPPORTED;
1643}
1644
1645PY_EXPORT Py_ssize_t PyUnicode_Find(PyObject* str, PyObject* substr,
1646                                    Py_ssize_t start, Py_ssize_t end,
1647                                    int direction) {
1648  DCHECK(str != nullptr, "str must be non-null");
1649  DCHECK(substr != nullptr, "substr must be non-null");
1650  DCHECK(direction == -1 || direction == 1, "direction must be -1 or 1");
1651  Thread* thread = Thread::current();
1652  HandleScope scope(thread);
1653  Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1654  Object needle_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr)));
1655  Runtime* runtime = thread->runtime();
1656  if (!runtime->isInstanceOfStr(*haystack_obj)) {
1657    thread->raiseWithFmt(LayoutId::kTypeError,
1658                         "PyUnicode_Find requires a 'str' instance");
1659    return -2;
1660  }
1661  Str haystack(&scope, strUnderlying(*haystack_obj));
1662  if (!runtime->isInstanceOfStr(*needle_obj)) {
1663    thread->raiseWithFmt(LayoutId::kTypeError,
1664                         "PyUnicode_Find requires a 'str' instance");
1665    return -2;
1666  }
1667  Str needle(&scope, strUnderlying(*needle_obj));
1668  if (direction == 1) return strFindWithRange(haystack, needle, start, end);
1669  return strRFind(haystack, needle, start, end);
1670}
1671
1672PY_EXPORT Py_ssize_t PyUnicode_FindChar(PyObject* str, Py_UCS4 ch,
1673                                        Py_ssize_t start, Py_ssize_t end,
1674                                        int direction) {
1675  DCHECK(str != nullptr, "str must not be null");
1676  DCHECK(direction == 1 || direction == -1, "direction must be -1 or 1");
1677  Thread* thread = Thread::current();
1678  HandleScope scope(thread);
1679  Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1680  Runtime* runtime = thread->runtime();
1681  DCHECK(runtime->isInstanceOfStr(*haystack_obj),
1682         "PyUnicode_FindChar requires a 'str' instance");
1683  Str haystack(&scope, strUnderlying(*haystack_obj));
1684  Str needle(&scope, SmallStr::fromCodePoint(ch));
1685  if (direction == 1) return strFindWithRange(haystack, needle, start, end);
1686  return strRFind(haystack, needle, start, end);
1687}
1688
1689PY_EXPORT PyObject* PyUnicode_Format(PyObject* format, PyObject* args) {
1690  if (format == nullptr || args == nullptr) {
1691    PyErr_BadInternalCall();
1692    return nullptr;
1693  }
1694  if (!PyUnicode_Check(format)) {
1695    Thread::current()->raiseWithFmt(LayoutId::kTypeError, "must be str, not %s",
1696                                    _PyType_Name(Py_TYPE(format)));
1697    return nullptr;
1698  }
1699  return PyNumber_Remainder(format, args);
1700}
1701
// Stub: the body consists solely of UNIMPLEMENTED, so no decoding is
// performed. Parameter names in the comments follow the CPython C-API
// signature.
PY_EXPORT PyObject* PyUnicode_FromEncodedObject(PyObject* /* obj */,
                                                const char* /* encoding */,
                                                const char* /* errors */) {
  UNIMPLEMENTED("PyUnicode_FromEncodedObject");
}
1707
1708PY_EXPORT PyObject* PyUnicode_FromFormat(const char* format, ...) {
1709  va_list vargs;
1710
1711  va_start(vargs, format);
1712  PyObject* ret = PyUnicode_FromFormatV(format, vargs);
1713  va_end(vargs);
1714  return ret;
1715}
1716
1717PY_EXPORT PyObject* PyUnicode_FromFormatV(const char* format, va_list vargs) {
1718  va_list vargs2;
1719  _PyUnicodeWriter writer;
1720
1721  _PyUnicodeWriter_Init(&writer);
1722  writer.min_length = std::strlen(format) + 100;
1723  writer.overallocate = 1;
1724
1725  // This copy seems unnecessary but it may have been needed by CPython for
1726  // historical reasons.
1727  va_copy(vargs2, vargs);
1728
1729  for (const char* f = format; *f;) {
1730    if (*f == '%') {
1731      f = writeArg(&writer, f, &vargs2);
1732      if (f == nullptr) goto fail;
1733    } else {
1734      const char* p = f;
1735      do {
1736        if (static_cast<unsigned char>(*p) > 127) {
1737          PyErr_Format(
1738              PyExc_ValueError,
1739              "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1740              "string, got a non-ASCII byte: 0x%02x",
1741              static_cast<unsigned char>(*p));
1742          goto fail;
1743        }
1744        p++;
1745      } while (*p != '\0' && *p != '%');
1746      Py_ssize_t len = p - f;
1747
1748      if (*p == '\0') writer.overallocate = 0;
1749
1750      if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0) goto fail;
1751
1752      f = p;
1753    }
1754  }
1755  va_end(vargs2);
1756  return _PyUnicodeWriter_Finish(&writer);
1757
1758fail:
1759  va_end(vargs2);
1760  _PyUnicodeWriter_Dealloc(&writer);
1761  return nullptr;
1762}
1763
// Stub: the body consists solely of UNIMPLEMENTED.
PY_EXPORT PyObject* PyUnicode_FromObject(PyObject* /* obj */) {
  UNIMPLEMENTED("PyUnicode_FromObject");
}
1767
1768PY_EXPORT PyObject* PyUnicode_FromOrdinal(int ordinal) {
1769  Thread* thread = Thread::current();
1770  if (ordinal < 0 || ordinal > kMaxUnicode) {
1771    thread->raiseWithFmt(LayoutId::kValueError,
1772                         "chr() arg not in range(0x110000)");
1773    return nullptr;
1774  }
1775  return ApiHandle::newReference(thread->runtime(),
1776                                 SmallStr::fromCodePoint(ordinal));
1777}
1778
1779PY_EXPORT PyObject* PyUnicode_FromWideChar(const wchar_t* buffer,
1780                                           Py_ssize_t size) {
1781  Thread* thread = Thread::current();
1782  if (buffer == nullptr && size != 0) {
1783    thread->raiseBadInternalCall();
1784    return nullptr;
1785  }
1786
1787  RawObject result = size == -1
1788                         ? newStrFromWideChar(thread, buffer)
1789                         : newStrFromWideCharWithLength(thread, buffer, size);
1790  return result.isErrorException()
1791             ? nullptr
1792             : ApiHandle::newReference(thread->runtime(), result);
1793}
1794
1795PY_EXPORT Py_ssize_t PyUnicode_GET_LENGTH_Func(PyObject* pyobj) {
1796  RawObject obj = ApiHandle::asObjectNoImmediate(ApiHandle::fromPyObject(pyobj));
1797  DCHECK(Thread::current()->runtime()->isInstanceOfStr(obj),
1798         "non-str argument to PyUnicode_GET_LENGTH");
1799  return strUnderlying(obj).codePointLength();
1800}
1801
// Returns the Py_FileSystemDefaultEncoding global, which this file defines as
// "utf-8". The pointer refers to that global's string; callers must not free
// it.
PY_EXPORT const char* PyUnicode_GetDefaultEncoding() {
  return Py_FileSystemDefaultEncoding;
}
1805
1806PY_EXPORT Py_ssize_t PyUnicode_GetLength(PyObject* pyobj) {
1807  Thread* thread = Thread::current();
1808  RawObject obj = ApiHandle::asObject(ApiHandle::fromPyObject(pyobj));
1809  if (!thread->runtime()->isInstanceOfStr(obj)) {
1810    thread->raiseBadArgument();
1811    return -1;
1812  }
1813  return strUnderlying(obj).codePointLength();
1814}
1815
// Forwards to PyUnicode_GetLength, so the result (and the error behavior) is
// exactly that function's.
PY_EXPORT Py_ssize_t PyUnicode_GetSize(PyObject* pyobj) {
  // This function is meant to return the number of UTF-16 or UTF-32 code
  // units, depending on the size of wchar_t on the operating system. On the
  // machines currently used for testing that equals the number of Unicode
  // code points. This must be revisited when supporting operating systems
  // with a different wchar_t (e.g. Windows).
  return PyUnicode_GetLength(pyobj);
}
1824
1825PY_EXPORT PyObject* PyUnicode_InternFromString(const char* c_str) {
1826  DCHECK(c_str != nullptr, "c_str must not be nullptr");
1827  Thread* thread = Thread::current();
1828  return ApiHandle::newReference(thread->runtime(),
1829                                 Runtime::internStrFromCStr(thread, c_str));
1830}
1831
// Stub: the body consists solely of UNIMPLEMENTED.
PY_EXPORT void PyUnicode_InternImmortal(PyObject** /* p */) {
  UNIMPLEMENTED("PyUnicode_InternImmortal");
}
1835
1836PY_EXPORT void PyUnicode_InternInPlace(PyObject** obj_ptr) {
1837  PyObject* pobj = *obj_ptr;
1838  DCHECK(pobj != nullptr, "pobj should not be null");
1839  if (pobj == nullptr) {
1840    return;
1841  }
1842  Thread* thread = Thread::current();
1843  HandleScope scope(thread);
1844  Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(pobj)));
1845  if (!obj.isLargeStr()) {
1846    return;
1847  }
1848  Object result(&scope, Runtime::internStr(thread, obj));
1849  if (result != obj) {
1850    Py_DECREF(pobj);
1851    *obj_ptr = ApiHandle::newReference(thread->runtime(), *result);
1852  }
1853}
1854
1855PY_EXPORT int PyUnicode_IsIdentifier(PyObject* str) {
1856  DCHECK(str != nullptr, "str must not be null");
1857  Thread* thread = Thread::current();
1858  HandleScope scope(thread);
1859  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1860  if (str_obj == Str::empty()) {
1861    return false;
1862  }
1863  Object result(&scope, thread->invokeMethodStatic1(LayoutId::kStr,
1864                                                    ID(isidentifier), str_obj));
1865  DCHECK(!result.isErrorNotFound(), "could not call str.isidentifier");
1866  CHECK(!result.isError(), "this function should not error");
1867  return Bool::cast(*result).value();
1868}
1869
1870PY_EXPORT PyObject* PyUnicode_Join(PyObject* sep, PyObject* seq) {
1871  DCHECK(sep != nullptr, "sep should not be null");
1872  DCHECK(seq != nullptr, "seq should not be null");
1873  Thread* thread = Thread::current();
1874  HandleScope scope(thread);
1875  Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
1876  // An optimization to rule out non-str values here to use the further
1877  // optimization of `strJoinWithTupleOrList`.
1878  Runtime* runtime = thread->runtime();
1879  if (!runtime->isInstanceOfStr(*sep_obj)) {
1880    thread->raiseWithFmt(LayoutId::kTypeError,
1881                         "separator: expected str instance,"
1882                         "'%T' found",
1883                         &sep_obj);
1884    return nullptr;
1885  }
1886  Str sep_str(&scope, strUnderlying(*sep_obj));
1887  Object seq_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(seq)));
1888  // An ad-hoc optimization for the case `seq_obj` is a `tuple` or `list`,
1889  // that can be removed without changing the correctness of PyUnicode_Join.
1890  Object result(&scope, strJoinWithTupleOrList(thread, sep_str, seq_obj));
1891  if (result.isUnbound()) {
1892    result =
1893        thread->invokeMethodStatic2(LayoutId::kStr, ID(join), sep_str, seq_obj);
1894  }
1895  if (result.isError()) {
1896    if (result.isErrorNotFound()) {
1897      thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.join");
1898    }
1899    return nullptr;
1900  }
1901  return ApiHandle::newReference(runtime, *result);
1902}
1903
1904PY_EXPORT PyObject* PyUnicode_Partition(PyObject* str, PyObject* sep) {
1905  DCHECK(str != nullptr, "str should not be null");
1906  DCHECK(sep != nullptr, "sep should not be null");
1907  Thread* thread = Thread::current();
1908  HandleScope scope(thread);
1909  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1910  Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
1911  Object result(&scope, thread->invokeMethodStatic2(
1912                            LayoutId::kStr, ID(partition), str_obj, sep_obj));
1913  if (result.isError()) {
1914    if (result.isErrorNotFound()) {
1915      thread->raiseWithFmt(LayoutId::kTypeError,
1916                           "could not call str.partition");
1917    }
1918    return nullptr;
1919  }
1920  return ApiHandle::newReference(thread->runtime(), *result);
1921}
1922
1923PY_EXPORT PyObject* PyUnicode_RPartition(PyObject* str, PyObject* sep) {
1924  DCHECK(str != nullptr, "str should not be null");
1925  DCHECK(sep != nullptr, "sep should not be null");
1926  Thread* thread = Thread::current();
1927  HandleScope scope(thread);
1928  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1929  Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
1930  Object result(&scope, thread->invokeMethodStatic2(
1931                            LayoutId::kStr, ID(rpartition), str_obj, sep_obj));
1932  if (result.isError()) {
1933    if (result.isErrorNotFound()) {
1934      thread->raiseWithFmt(LayoutId::kTypeError,
1935                           "could not call str.rpartition");
1936    }
1937    return nullptr;
1938  }
1939  return ApiHandle::newReference(thread->runtime(), *result);
1940}
1941
1942PY_EXPORT PyObject* PyUnicode_RSplit(PyObject* str, PyObject* sep,
1943                                     Py_ssize_t maxsplit) {
1944  DCHECK(str != nullptr, "str must not be null");
1945  DCHECK(sep != nullptr, "sep must not be null");
1946  Thread* thread = Thread::current();
1947  HandleScope scope(thread);
1948  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1949  Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
1950  Runtime* runtime = thread->runtime();
1951  Object maxsplit_obj(&scope, runtime->newInt(maxsplit));
1952  Object result(&scope,
1953                thread->invokeMethodStatic3(LayoutId::kStr, ID(rsplit), str_obj,
1954                                            sep_obj, maxsplit_obj));
1955  if (result.isError()) {
1956    if (result.isErrorNotFound()) {
1957      thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.rsplit");
1958    }
1959    return nullptr;
1960  }
1961  return ApiHandle::newReference(runtime, *result);
1962}
1963
1964PY_EXPORT Py_UCS4 PyUnicode_ReadChar(PyObject* obj, Py_ssize_t index) {
1965  DCHECK(obj != nullptr, "obj must not be null");
1966  Thread* thread = Thread::current();
1967  HandleScope scope(thread);
1968  Runtime* runtime = thread->runtime();
1969  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj)));
1970  if (!runtime->isInstanceOfStr(*str_obj)) {
1971    thread->raiseBadArgument();
1972    return -1;
1973  }
1974  Str str(&scope, strUnderlying(*str_obj));
1975  word byte_offset;
1976  if (index < 0 ||
1977      (byte_offset = thread->strOffset(str, index)) >= str.length()) {
1978    thread->raiseWithFmt(LayoutId::kIndexError, "string index out of range");
1979    return -1;
1980  }
1981  word num_bytes;
1982  return str.codePointAt(byte_offset, &num_bytes);
1983}
1984
1985PY_EXPORT PyObject* PyUnicode_Replace(PyObject* str, PyObject* substr,
1986                                      PyObject* replstr, Py_ssize_t maxcount) {
1987  DCHECK(str != nullptr, "str must not be null");
1988  DCHECK(substr != nullptr, "substr must not be null");
1989  DCHECK(replstr != nullptr, "replstr must not be null");
1990  Thread* thread = Thread::current();
1991  HandleScope scope(thread);
1992  Runtime* runtime = thread->runtime();
1993  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
1994  if (!runtime->isInstanceOfStr(*str_obj)) {
1995    thread->raiseWithFmt(LayoutId::kTypeError, "str must be str");
1996    return nullptr;
1997  }
1998
1999  Object substr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr)));
2000  if (!runtime->isInstanceOfStr(*substr_obj)) {
2001    thread->raiseWithFmt(LayoutId::kTypeError, "substr must be str");
2002    return nullptr;
2003  }
2004
2005  Object replstr_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(replstr)));
2006  if (!runtime->isInstanceOfStr(*replstr_obj)) {
2007    thread->raiseWithFmt(LayoutId::kTypeError, "replstr must be str");
2008    return nullptr;
2009  }
2010
2011  Str str_str(&scope, strUnderlying(*str_obj));
2012  Str substr_str(&scope, strUnderlying(*substr_obj));
2013  Str replstr_str(&scope, strUnderlying(*replstr_obj));
2014  return ApiHandle::newReference(
2015      runtime,
2016      runtime->strReplace(thread, str_str, substr_str, replstr_str, maxcount));
2017}
2018
// Stub: the body consists solely of UNIMPLEMENTED. Parameter names in the
// comments follow the CPython C-API signature.
PY_EXPORT int PyUnicode_Resize(PyObject** /* p_unicode */, Py_ssize_t /* length */) {
  UNIMPLEMENTED("PyUnicode_Resize");
}
2022
// Stub: the body consists solely of UNIMPLEMENTED. Parameter names in the
// comments follow the CPython C-API signature.
PY_EXPORT PyObject* PyUnicode_RichCompare(PyObject* /* left */, PyObject* /* right */,
                                          int /* op */) {
  UNIMPLEMENTED("PyUnicode_RichCompare");
}
2027
2028PY_EXPORT PyObject* PyUnicode_Split(PyObject* str, PyObject* sep,
2029                                    Py_ssize_t maxsplit) {
2030  DCHECK(str != nullptr, "str must not be null");
2031  DCHECK(sep != nullptr, "sep must not be null");
2032  Thread* thread = Thread::current();
2033  HandleScope scope(thread);
2034  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
2035  Object sep_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(sep)));
2036  Runtime* runtime = thread->runtime();
2037  Object maxsplit_obj(&scope, runtime->newInt(maxsplit));
2038  Object result(&scope,
2039                thread->invokeMethodStatic3(LayoutId::kStr, ID(split), str_obj,
2040                                            sep_obj, maxsplit_obj));
2041  if (result.isError()) {
2042    if (result.isErrorNotFound()) {
2043      thread->raiseWithFmt(LayoutId::kTypeError, "could not call str.split");
2044    }
2045    return nullptr;
2046  }
2047  return ApiHandle::newReference(runtime, *result);
2048}
2049
2050PY_EXPORT PyObject* PyUnicode_Splitlines(PyObject* str, int keepends) {
2051  Thread* thread = Thread::current();
2052  HandleScope scope(thread);
2053  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
2054  Runtime* runtime = thread->runtime();
2055  if (!runtime->isInstanceOfStr(*str_obj)) {
2056    thread->raiseWithFmt(LayoutId::kTypeError, "must be str, not '%T'",
2057                         &str_obj);
2058    return nullptr;
2059  }
2060  Str str_str(&scope, strUnderlying(*str_obj));
2061  return ApiHandle::newReference(runtime,
2062                                 strSplitlines(thread, str_str, keepends));
2063}
2064
2065PY_EXPORT PyObject* PyUnicode_Substring(PyObject* pyobj, Py_ssize_t start,
2066                                        Py_ssize_t end) {
2067  DCHECK(pyobj != nullptr, "null argument to PyUnicode_Substring");
2068  Thread* thread = Thread::current();
2069  if (start < 0 || end < 0) {
2070    thread->raiseWithFmt(LayoutId::kIndexError, "string index out of range");
2071    return nullptr;
2072  }
2073  Runtime* runtime = thread->runtime();
2074  if (end <= start) {
2075    return ApiHandle::newReference(runtime, Str::empty());
2076  }
2077  HandleScope scope(thread);
2078  ApiHandle* handle = ApiHandle::fromPyObject(pyobj);
2079  Object obj(&scope, ApiHandle::asObject(handle));
2080  DCHECK(runtime->isInstanceOfStr(*obj),
2081         "PyUnicode_Substring requires a 'str' instance");
2082  Str self(&scope, strUnderlying(*obj));
2083  word len = self.length();
2084  word start_index = thread->strOffset(self, start);
2085  if (start_index == len) {
2086    return ApiHandle::newReference(runtime, Str::empty());
2087  }
2088  word end_index = thread->strOffset(self, end);
2089  if (end_index == len) {
2090    if (start_index == 0) {
2091      ApiHandle::incref(handle);
2092      return pyobj;
2093    }
2094  }
2095  return ApiHandle::newReference(
2096      runtime, strSubstr(thread, self, start_index, end_index - start_index));
2097}
2098
2099PY_EXPORT Py_ssize_t PyUnicode_Tailmatch(PyObject* str, PyObject* substr,
2100                                         Py_ssize_t start, Py_ssize_t end,
2101                                         int direction) {
2102  DCHECK(str != nullptr, "str must be non-null");
2103  DCHECK(substr != nullptr, "substr must be non-null");
2104  DCHECK(direction == -1 || direction == 1, "direction must be -1 or 1");
2105  Thread* thread = Thread::current();
2106  HandleScope scope(thread);
2107  Object haystack_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(str)));
2108  Object needle_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(substr)));
2109  Runtime* runtime = thread->runtime();
2110  if (!runtime->isInstanceOfStr(*haystack_obj) ||
2111      !runtime->isInstanceOfStr(*needle_obj)) {
2112    thread->raiseBadArgument();
2113    return -1;
2114  }
2115  Str haystack(&scope, strUnderlying(*haystack_obj));
2116  Str needle(&scope, strUnderlying(*needle_obj));
2117  word haystack_len = haystack.codePointLength();
2118  Slice::adjustSearchIndices(&start, &end, haystack_len);
2119  word needle_len = needle.codePointLength();
2120  if (start + needle_len > end) {
2121    return 0;
2122  }
2123  word start_offset;
2124  if (direction == 1) {
2125    start_offset = haystack.offsetByCodePoints(0, end - needle_len);
2126  } else {
2127    start_offset = haystack.offsetByCodePoints(0, start);
2128  }
2129  word needle_chars = needle.length();
2130  for (word i = start_offset, j = 0; j < needle_chars; i++, j++) {
2131    if (haystack.byteAt(i) != needle.byteAt(j)) {
2132      return 0;
2133    }
2134  }
2135  return 1;
2136}
2137
// Stub: the body consists solely of UNIMPLEMENTED. Parameter names in the
// comments follow the CPython C-API signature.
PY_EXPORT PyObject* PyUnicode_Translate(PyObject* /* str */, PyObject* /* mapping */,
                                        const char* /* errors */) {
  UNIMPLEMENTED("PyUnicode_Translate");
}
2142
2143PY_EXPORT PyTypeObject* PyUnicode_Type_Ptr() {
2144  Runtime* runtime = Thread::current()->runtime();
2145  return reinterpret_cast<PyTypeObject*>(
2146      ApiHandle::borrowedReference(runtime, runtime->typeAt(LayoutId::kStr)));
2147}
2148
// Stub: the body consists solely of UNIMPLEMENTED. Parameter names in the
// comments follow the CPython C-API signature.
PY_EXPORT int PyUnicode_WriteChar(PyObject* /* unicode */, Py_ssize_t /* index */,
                                  Py_UCS4 /* ch */) {
  UNIMPLEMENTED("PyUnicode_WriteChar");
}
2153
// Stub: the body consists solely of UNIMPLEMENTED.
PY_EXPORT Py_UNICODE* PyUnicode_AsUnicode(PyObject* /* unicode */) {
  UNIMPLEMENTED("PyUnicode_AsUnicode");
}
2157
// Stub: the body consists solely of UNIMPLEMENTED.
PY_EXPORT Py_UNICODE* PyUnicode_AsUnicodeAndSize(PyObject* /* unicode */,
                                                 Py_ssize_t* /* size */) {
  UNIMPLEMENTED("PyUnicode_AsUnicodeAndSize");
}
2162
2163template <typename T>
2164static PyObject* decodeUnicodeToString(Thread* thread, const void* src,
2165                                       word size) {
2166  Runtime* runtime = thread->runtime();
2167  DCHECK(src != nullptr, "Must pass in a non-null buffer");
2168  const T* cp = static_cast<const T*>(src);
2169  if (size == 1) {
2170    return ApiHandle::newReference(runtime, SmallStr::fromCodePoint(cp[0]));
2171  }
2172  HandleScope scope(thread);
2173  // TODO(T41785453): Remove the StrArray intermediary
2174  StrArray array(&scope, runtime->newStrArray());
2175  runtime->strArrayEnsureCapacity(thread, array, size);
2176  for (word i = 0; i < size; ++i) {
2177    runtime->strArrayAddCodePoint(thread, array, cp[i]);
2178  }
2179  return ApiHandle::newReference(runtime, runtime->strFromStrArray(array));
2180}
2181
2182PY_EXPORT PyObject* PyUnicode_FromKindAndData(int kind, const void* buffer,
2183                                              Py_ssize_t size) {
2184  Thread* thread = Thread::current();
2185  if (size < 0) {
2186    thread->raiseWithFmt(LayoutId::kValueError, "size must be positive");
2187    return nullptr;
2188  }
2189  if (size == 0) {
2190    return ApiHandle::newReference(thread->runtime(), Str::empty());
2191  }
2192  switch (kind) {
2193    case PyUnicode_1BYTE_KIND:
2194      return decodeUnicodeToString<Py_UCS1>(thread, buffer, size);
2195    case PyUnicode_2BYTE_KIND:
2196      return decodeUnicodeToString<Py_UCS2>(thread, buffer, size);
2197    case PyUnicode_4BYTE_KIND:
2198      return decodeUnicodeToString<Py_UCS4>(thread, buffer, size);
2199  }
2200  thread->raiseWithFmt(LayoutId::kSystemError, "invalid kind");
2201  return nullptr;
2202}
2203
2204PY_EXPORT PyObject* PyUnicode_FromUnicode(const Py_UNICODE* code_units,
2205                                          Py_ssize_t size) {
2206  if (code_units == nullptr) {
2207    // TODO(T36562134): Implement _PyUnicode_New
2208    UNIMPLEMENTED("_PyUnicode_New");
2209  }
2210
2211  Thread* thread = Thread::current();
2212  RawObject result = newStrFromWideCharWithLength(thread, code_units, size);
2213  return result.isErrorException()
2214             ? nullptr
2215             : ApiHandle::newReference(thread->runtime(), result);
2216}
2217
// Reports the storage kind of `obj`. Only ASCII strings are supported: the
// CHECK fails for anything else, so the answer is always
// PyUnicode_1BYTE_KIND.
PY_EXPORT int PyUnicode_KIND_Func(PyObject* obj) {
  // TODO(T47682853): Introduce new PyUnicode_VARBYTE_KIND
  CHECK(PyUnicode_IS_ASCII_Func(obj), "only ASCII allowed");
  return PyUnicode_1BYTE_KIND;
}
2223
2224// NOTE: This will return a cached and managed C-string buffer that is a copy
2225// of the Str internal buffer. It is NOT a direct pointer into the string
2226// object, so writing into this buffer will do nothing. This is different
2227// behavior from CPython, where changing the data in the buffer changes the
2228// string object.
2229PY_EXPORT void* PyUnicode_DATA_Func(PyObject* str) {
2230  Thread* thread = Thread::current();
2231  Runtime* runtime = thread->runtime();
2232  ApiHandle* handle = ApiHandle::fromPyObject(str);
2233  if (void* cache = ApiHandle::cache(runtime, handle)) {
2234    return static_cast<char*>(cache);
2235  }
2236  HandleScope scope(thread);
2237  Object obj(&scope, ApiHandle::asObject(handle));
2238  DCHECK(runtime->isInstanceOfStr(*obj), "str should be a str instance");
2239  Str str_obj(&scope, strUnderlying(*obj));
2240  word length = str_obj.length();
2241  byte* result = static_cast<byte*>(std::malloc(length + 1));
2242  str_obj.copyTo(result, length);
2243  result[length] = '\0';
2244  ApiHandle::setCache(runtime, handle, result);
2245  ApiHandle::setBorrowedNoImmediate(handle);
2246  return reinterpret_cast<char*>(result);
2247}
2248
2249PY_EXPORT Py_UCS4 PyUnicode_READ_Func(int kind, void* data, Py_ssize_t index) {
2250  if (kind == PyUnicode_1BYTE_KIND) return static_cast<Py_UCS1*>(data)[index];
2251  if (kind == PyUnicode_2BYTE_KIND) return static_cast<Py_UCS2*>(data)[index];
2252  DCHECK(kind == PyUnicode_4BYTE_KIND, "kind must be PyUnicode_4BYTE_KIND");
2253  return static_cast<Py_UCS4*>(data)[index];
2254}
2255
2256PY_EXPORT Py_UCS4 PyUnicode_READ_CHAR_Func(PyObject* obj, Py_ssize_t index) {
2257  Thread* thread = Thread::current();
2258  HandleScope scope(thread);
2259  Object str_obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj)));
2260  DCHECK(thread->runtime()->isInstanceOfStr(*str_obj),
2261         "PyUnicode_READ_CHAR must receive a unicode object");
2262  Str str(&scope, strUnderlying(*str_obj));
2263  word byte_offset = thread->strOffset(str, index);
2264  if (byte_offset == str.length()) return Py_UCS4{0};
2265  word num_bytes;
2266  return static_cast<Py_UCS4>(str.codePointAt(byte_offset, &num_bytes));
2267}
2268
2269PY_EXPORT int PyUnicode_IS_ASCII_Func(PyObject* obj) {
2270  Thread* thread = Thread::current();
2271  HandleScope scope(thread);
2272  Object str(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(obj)));
2273  DCHECK(thread->runtime()->isInstanceOfStr(*str),
2274         "strIsASCII must receive a unicode object");
2275  return strUnderlying(*str).isASCII() ? 1 : 0;
2276}
2277
2278PY_EXPORT int Py_UNICODE_ISALPHA_Func(Py_UCS4 code_point) {
2279  if (code_point > kMaxUnicode) {
2280    return 0;
2281  }
2282  return Unicode::isAlpha(static_cast<int32_t>(code_point)) ? 1 : 0;
2283}
2284
2285PY_EXPORT int Py_UNICODE_ISDECIMAL_Func(Py_UCS4 code_point) {
2286  if (code_point > kMaxUnicode) {
2287    return 0;
2288  }
2289  return Unicode::isDecimal(static_cast<int32_t>(code_point)) ? 1 : 0;
2290}
2291
2292PY_EXPORT int Py_UNICODE_ISDIGIT_Func(Py_UCS4 code_point) {
2293  if (code_point > kMaxUnicode) {
2294    return 0;
2295  }
2296  return Unicode::isDigit(static_cast<int32_t>(code_point)) ? 1 : 0;
2297}
2298
2299PY_EXPORT int Py_UNICODE_ISLINEBREAK_Func(Py_UCS4 code_point) {
2300  if (code_point > kMaxUnicode) {
2301    return 0;
2302  }
2303  return Unicode::isLinebreak(static_cast<int32_t>(code_point)) ? 1 : 0;
2304}
2305
2306PY_EXPORT int Py_UNICODE_ISLOWER_Func(Py_UCS4 code_point) {
2307  if (code_point > kMaxUnicode) {
2308    return 0;
2309  }
2310  return Unicode::isLower(static_cast<int32_t>(code_point)) ? 1 : 0;
2311}
2312
2313PY_EXPORT int Py_UNICODE_ISNUMERIC_Func(Py_UCS4 code_point) {
2314  if (code_point > kMaxUnicode) {
2315    return 0;
2316  }
2317  return Unicode::isNumeric(static_cast<int32_t>(code_point)) ? 1 : 0;
2318}
2319
2320PY_EXPORT int Py_UNICODE_ISPRINTABLE_Func(Py_UCS4 code_point) {
2321  if (code_point > kMaxUnicode) {
2322    return 0;
2323  }
2324  return Unicode::isPrintable(static_cast<int32_t>(code_point)) ? 1 : 0;
2325}
2326
2327PY_EXPORT int Py_UNICODE_ISSPACE_Func(Py_UCS4 code_point) {
2328  if (code_point > kMaxUnicode) {
2329    return 0;
2330  }
2331  return Unicode::isSpace(static_cast<int32_t>(code_point)) ? 1 : 0;
2332}
2333
2334PY_EXPORT int Py_UNICODE_ISTITLE_Func(Py_UCS4 code_point) {
2335  if (code_point > kMaxUnicode) {
2336    return 0;
2337  }
2338  return Unicode::isTitle(static_cast<int32_t>(code_point)) ? 1 : 0;
2339}
2340
2341PY_EXPORT int Py_UNICODE_ISUPPER_Func(Py_UCS4 code_point) {
2342  if (code_point > kMaxUnicode) {
2343    return 0;
2344  }
2345  return Unicode::isUpper(static_cast<int32_t>(code_point)) ? 1 : 0;
2346}
2347
2348PY_EXPORT int Py_UNICODE_TODECIMAL_Func(Py_UCS4 code_point) {
2349  if (code_point > kMaxUnicode) {
2350    return -1;
2351  }
2352  return Unicode::toDecimal(static_cast<int32_t>(code_point));
2353}
2354
2355PY_EXPORT int Py_UNICODE_TODIGIT_Func(Py_UCS4 code_point) {
2356  if (code_point > kMaxUnicode) {
2357    return -1;
2358  }
2359  return Unicode::toDigit(static_cast<int32_t>(code_point));
2360}
2361
2362PY_EXPORT Py_UCS4 Py_UNICODE_TOLOWER_Func(Py_UCS4 code_point) {
2363  if (code_point > kMaxUnicode) {
2364    return code_point;
2365  }
2366  FullCasing lower = Unicode::toLower(static_cast<int32_t>(code_point));
2367  return lower.code_points[0];
2368}
2369
2370PY_EXPORT double Py_UNICODE_TONUMERIC_Func(Py_UCS4 code_point) {
2371  if (code_point > kMaxUnicode) {
2372    return -1.0;
2373  }
2374  return Unicode::toNumeric(static_cast<int32_t>(code_point));
2375}
2376
2377PY_EXPORT Py_UCS4 Py_UNICODE_TOTITLE_Func(Py_UCS4 code_point) {
2378  if (code_point > kMaxUnicode) {
2379    return code_point;
2380  }
2381  FullCasing title = Unicode::toTitle(static_cast<int32_t>(code_point));
2382  return title.code_points[0];
2383}
2384
2385PY_EXPORT Py_UCS4 Py_UNICODE_TOUPPER_Func(Py_UCS4 code_point) {
2386  if (code_point > kMaxUnicode) {
2387    return code_point;
2388  }
2389  FullCasing upper = Unicode::toUpper(static_cast<int32_t>(code_point));
2390  return upper.code_points[0];
2391}
2392
2393PY_EXPORT int _Py_normalize_encoding(const char* encoding, char* lower,
2394                                     size_t lower_len) {
2395  char* buffer = lower;
2396  const char* lower_end = &lower[lower_len - 1];
2397  bool has_punct = false;
2398  for (char ch = *encoding; ch != '\0'; ch = *++encoding) {
2399    if (Py_ISALNUM(ch) || ch == '.') {
2400      if (has_punct && buffer != lower) {
2401        if (buffer == lower_end) {
2402          return 0;
2403        }
2404        *buffer++ = '_';
2405      }
2406      has_punct = false;
2407
2408      if (buffer == lower_end) {
2409        return 0;
2410      }
2411      *buffer++ = Py_TOLOWER(ch);
2412    } else {
2413      has_punct = true;
2414    }
2415  }
2416  *buffer = '\0';
2417  return 1;
2418}
2419
2420PY_EXPORT PyObject* _PyUnicode_AsUTF8String(PyObject* unicode,
2421                                            const char* errors) {
2422  DCHECK(unicode != nullptr, "unicode cannot be null");
2423  Thread* thread = Thread::current();
2424  HandleScope scope(thread);
2425  Runtime* runtime = thread->runtime();
2426  Object obj(&scope, ApiHandle::asObject(ApiHandle::fromPyObject(unicode)));
2427  if (!runtime->isInstanceOfStr(*obj)) {
2428    thread->raiseBadArgument();
2429    return nullptr;
2430  }
2431  Str str(&scope, strUnderlying(*obj));
2432  if (!strHasSurrogate(str)) {
2433    word length = str.length();
2434    MutableBytes result(&scope, runtime->newMutableBytesUninitialized(length));
2435    result.replaceFromWithStr(0, *str, length);
2436    return ApiHandle::newReference(runtime, result.becomeImmutable());
2437  }
2438  Object errors_obj(&scope, symbolFromError(thread, errors));
2439  Object tuple_obj(&scope, thread->invokeFunction2(
2440                               ID(_codecs), ID(utf_8_encode), str, errors_obj));
2441  if (tuple_obj.isError()) {
2442    return nullptr;
2443  }
2444  Tuple tuple(&scope, *tuple_obj);
2445  return ApiHandle::newReference(runtime, tuple.at(0));
2446}
2447
2448PY_EXPORT wchar_t* _Py_DecodeUTF8_surrogateescape(const char* c_str,
2449                                                  Py_ssize_t size,
2450                                                  size_t* wlen) {
2451  DCHECK(c_str != nullptr, "c_str cannot be null");
2452  wchar_t* wc_str =
2453      static_cast<wchar_t*>(PyMem_RawMalloc((size + 1) * sizeof(wchar_t)));
2454  for (Py_ssize_t i = 0; i < size; i++) {
2455    char ch = c_str[i];
2456    // TODO(T57811636): Support UTF-8 arguments on macOS.
2457    // We don't have UTF-8 decoding machinery that is decoupled from the
2458    // runtime
2459    if (ch & 0x80) {
2460      UNIMPLEMENTED("UTF-8 argument support unimplemented");
2461    }
2462    wc_str[i] = static_cast<wchar_t>(ch);
2463  }
2464  wc_str[size] = '\0';
2465  if (wlen != nullptr) {
2466    *wlen = size;
2467  }
2468  return wc_str;
2469}
2470
2471PY_EXPORT int _Py_DecodeUTF8Ex(const char* c_str, Py_ssize_t size,
2472                               wchar_t** result, size_t* wlen,
2473                               const char** /* reason */,
2474                               _Py_error_handler /* surrogateescape */) {
2475  wchar_t* wc_str =
2476      static_cast<wchar_t*>(PyMem_RawMalloc((size + 1) * sizeof(*wc_str)));
2477  if (wc_str == nullptr) {
2478    return -1;
2479  }
2480  for (Py_ssize_t i = 0; i < size; i++) {
2481    byte ch = c_str[i];
2482    // TODO(T57811636): Support UTF-8 decoding decoupled from the runtime.
2483    // We don't have UTF-8 decoding machinery that is decoupled from the
2484    // runtime
2485    if (ch > kMaxASCII) {
2486      UNIMPLEMENTED("UTF-8 argument support unimplemented");
2487    }
2488    wc_str[i] = ch;
2489  }
2490  wc_str[size] = '\0';
2491  *result = wc_str;
2492  if (wlen) {
2493    *wlen = size;
2494  }
2495  return 0;
2496}
2497
2498// UTF-8 encoder using the surrogateescape error handler .
2499//
2500// On success, return 0 and write the newly allocated character string (use
2501// PyMem_Free() to free the memory) into *str.
2502//
2503// On encoding failure, return -2 and write the position of the invalid
2504// surrogate character into *error_pos (if error_pos is set) and the decoding
2505// error message into *reason (if reason is set).
2506//
2507// On memory allocation failure, return -1.
2508PY_EXPORT int _Py_EncodeUTF8Ex(const wchar_t* text, char** str,
2509                               size_t* error_pos, const char** reason,
2510                               int raw_malloc, _Py_error_handler errors) {
2511  const Py_ssize_t max_char_size = 4;
2512  Py_ssize_t len = std::wcslen(text);
2513  DCHECK(len >= 0, "len must be non-negative");
2514
2515  bool surrogateescape = false;
2516  bool surrogatepass = false;
2517  switch (errors) {
2518    case _Py_ERROR_STRICT:
2519      break;
2520    case _Py_ERROR_SURROGATEESCAPE:
2521      surrogateescape = true;
2522      break;
2523    case _Py_ERROR_SURROGATEPASS:
2524      surrogatepass = true;
2525      break;
2526    default:
2527      return -3;
2528  }
2529
2530  if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
2531    return -1;
2532  }
2533  char* bytes;
2534  if (raw_malloc) {
2535    bytes = reinterpret_cast<char*>(PyMem_RawMalloc((len + 1) * max_char_size));
2536  } else {
2537    bytes = reinterpret_cast<char*>(PyMem_Malloc((len + 1) * max_char_size));
2538  }
2539  if (bytes == nullptr) {
2540    return -1;
2541  }
2542
2543  char* p = bytes;
2544  for (Py_ssize_t i = 0; i < len; i++) {
2545    Py_UCS4 ch = text[i];
2546
2547    if (ch < 0x80) {
2548      // Encode ASCII
2549      *p++ = (char)ch;
2550
2551    } else if (ch < 0x0800) {
2552      // Encode Latin-1
2553      *p++ = (char)(0xc0 | (ch >> 6));
2554      *p++ = (char)(0x80 | (ch & 0x3f));
2555    } else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
2556      // surrogateescape error handler
2557      if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
2558        if (error_pos != nullptr) {
2559          *error_pos = (size_t)i;
2560        }
2561        if (reason != nullptr) {
2562          *reason = "encoding error";
2563        }
2564        if (raw_malloc) {
2565          PyMem_RawFree(bytes);
2566        } else {
2567          PyMem_Free(bytes);
2568        }
2569        return -2;
2570      }
2571      *p++ = (char)(ch & 0xff);
2572    } else if (ch < 0x10000) {
2573      *p++ = (char)(0xe0 | (ch >> 12));
2574      *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2575      *p++ = (char)(0x80 | (ch & 0x3f));
2576    } else {
2577      // ch >= 0x10000
2578      DCHECK(ch <= kMaxUnicode, "ch must be a valid unicode code point");
2579      // Encode UCS4 Unicode ordinals
2580      *p++ = (char)(0xf0 | (ch >> 18));
2581      *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2582      *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2583      *p++ = (char)(0x80 | (ch & 0x3f));
2584    }
2585  }
2586  *p++ = '\0';
2587
2588  size_t final_size = (p - bytes);
2589  char* bytes2;
2590  if (raw_malloc) {
2591    bytes2 = reinterpret_cast<char*>(PyMem_RawRealloc(bytes, final_size));
2592  } else {
2593    bytes2 = reinterpret_cast<char*>(PyMem_Realloc(bytes, final_size));
2594  }
2595  if (bytes2 == nullptr) {
2596    if (error_pos != nullptr) {
2597      *error_pos = (size_t)-1;
2598    }
2599    if (raw_malloc) {
2600      PyMem_RawFree(bytes);
2601    } else {
2602      PyMem_Free(bytes);
2603    }
2604    return -1;
2605  }
2606  *str = bytes2;
2607  return 0;
2608}
2609
2610}  // namespace py